From 272b32fd8530cf1ddf56f508376c3120864a8a86 Mon Sep 17 00:00:00 2001 From: Li Min <11663212+limin2021@users.noreply.github.com> Date: Wed, 2 Mar 2022 23:21:34 +0800 Subject: [PATCH 001/261] Replacing dropout eval eigen usage by cuda kernel (#40053) * Replacing dropout eval eigen usage by cuda kernel --- paddle/fluid/operators/dropout_impl.cu.h | 28 +++++++++++++++++------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index 2fa956a2e65..cdcf683fb92 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -184,15 +184,15 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, bool is_fix_seed, int seed_val, const Tensor& x, const Tensor* seed, Tensor* mask, Tensor* y) { auto& place = *dev_ctx.eigen_device(); + int64_t x_numel = x.numel(); + auto stream = dev_ctx.stream(); + auto* x_data = x.data(); + auto* y_data = y->data(); if (!is_test) { - int64_t x_numel = x.numel(); - auto stream = dev_ctx.stream(); auto* mask_data = mask->data(); size_t size = phi::product(mask->dims()); - auto* x_data = x.data(); - auto* y_data = y->data(); if (dropout_prob == 1.0f) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( @@ -254,12 +254,24 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, } #endif } else { - auto X = EigenMatrix::Reshape(x, 1); - auto Y = EigenMatrix::Reshape(*y, 1); if (upscale_in_train) { - Y.device(place) = X; +// todo: can y share with data with x directly? +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemcpyAsync(y_data, x_data, sizeof(T) * x_numel, + hipMemcpyDeviceToDevice, stream)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemcpyAsync(y_data, x_data, sizeof(T) * x_numel, + cudaMemcpyDeviceToDevice, stream)); +#endif } else { - Y.device(place) = X * static_cast(1.0f - dropout_prob); + T factor = static_cast(1.0f - dropout_prob); + std::vector ins = {&x}; + std::vector outs = {y}; + auto functor = phi::funcs::ScaleFunctor(factor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } } } -- GitLab From c16f85f95d0c42989e22c5ebae709f60506111a0 Mon Sep 17 00:00:00 2001 From: lilong12 Date: Thu, 3 Mar 2022 01:24:26 +0800 Subject: [PATCH 002/261] Add the implementation of Gloo for ProcessGroup (#39892) * add pg_gloo --- .../distributed/collective/CMakeLists.txt | 3 + .../collective/ProcessGroupGloo.cc | 308 ++++++++++++++++++ .../distributed/collective/ProcessGroupGloo.h | 138 ++++++++ paddle/fluid/distributed/store/store.h | 2 + paddle/fluid/distributed/store/tcp_store.cc | 85 +++-- paddle/fluid/distributed/store/tcp_store.h | 12 +- paddle/fluid/distributed/store/tcp_utils.cc | 3 +- paddle/fluid/pybind/CMakeLists.txt | 3 + paddle/fluid/pybind/communication.cc | 12 +- paddle/fluid/pybind/distributed_py.cc | 54 ++- .../tests/unittests/process_group_gloo.py | 119 +++++++ .../test_collective_process_group.py | 3 + 12 files changed, 701 insertions(+), 41 deletions(-) create mode 100644 paddle/fluid/distributed/collective/ProcessGroupGloo.cc create mode 100644 paddle/fluid/distributed/collective/ProcessGroupGloo.h create mode 100644 python/paddle/fluid/tests/unittests/process_group_gloo.py diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index a5b40f8aa07..96bc4a710f8 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ 
b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -1,4 +1,7 @@ cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api) +if (WITH_DISTRIBUTE) + cc_library(processgroup_gloo SRCS ProcessGroupGloo.cc DEPS phi phi_api eager_api gloo_wrapper) +endif() cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup) if(WITH_NCCL) diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc new file mode 100644 index 00000000000..03ad48f560a --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc @@ -0,0 +1,308 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#ifdef _WIN32 +#include +#include +#include +#else +#include +#include +#include +#endif + +#include +#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h" +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +#ifdef _WIN32 +#define GENERATE_FUNC(type, func, ...) \ + switch (type) { \ + case experimental::DataType::FLOAT32: \ + func(__VA_ARGS__); \ + break; \ + case experimental::DataType::FLOAT64: \ + func(__VA_ARGS__); \ + break; \ + case experimental::DataType::FLOAT16: \ + func(__VA_ARGS__); \ + break; \ + case experimental::DataType::INT32: \ + func(__VA_ARGS__); \ + break; \ + case experimental::DataType::INT64: \ + func(__VA_ARGS__); \ + break; \ + default: \ + VLOG(0) << "Error: Unknown DataType."; \ + exit(-1); \ + } + +#define HOST_NAME_MAX 256 + +#else +#define GENERATE_FUNC(type, func, args...) 
\ + switch (type) { \ + case experimental::DataType::FLOAT32: \ + func(args); \ + break; \ + case experimental::DataType::FLOAT64: \ + func(args); \ + break; \ + case experimental::DataType::FLOAT16: \ + func(args); \ + break; \ + case experimental::DataType::INT32: \ + func(args); \ + break; \ + case experimental::DataType::INT64: \ + func(args); \ + break; \ + default: \ + VLOG(0) << "Error: Unknown DataType."; \ + exit(-1); \ + } +#endif + +typedef void (*reduce_func)(void*, const void*, const void*, size_t); + +template +reduce_func get_function(const ReduceOp& r) { + switch (r) { + case ReduceOp::SUM: + return reduce_func(&::gloo::sum); + case ReduceOp::PRODUCT: + return reduce_func(&::gloo::product); + case ReduceOp::MIN: + return reduce_func(&::gloo::min); + case ReduceOp::MAX: + return reduce_func(&::gloo::max); + case ReduceOp::AVG: + VLOG(0) << "Error: Unsupported ReduceOp::AVG."; + exit(-1); + } + + VLOG(0) << "Error: Unknown ReduceOp."; + exit(-1); +} + +bool CheckTensorsInCPUPlace(const std::vector& tensors) { + return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) { + return t.place() == PlaceType::kCPU; + }); +} + +template +T* get_data(const Tensor& tensor) { + auto raw_tensor = std::dynamic_pointer_cast(tensor.impl()); + return static_cast(raw_tensor->data()); +} + +template +std::vector get_multi_data(const std::vector& tensors) { + std::vector ret(tensors.size()); + for (size_t i = 0; i < tensors.size(); i++) { + ret[i] = get_data(tensors[i]); + } + return ret; +} + +template +void set_output(P& opts, const Tensor& tensor) { // NOLINT + opts.setOutput(get_data(tensor), tensor.numel()); +} + +template +void set_input(P& opts, const Tensor& tensor) { // NOLINT + opts.setInput(get_data(tensor), tensor.numel()); +} + +template +void set_outputs(P& opts, const std::vector& tensors) { // NOLINT + opts.setOutputs(get_multi_data(tensors), tensors[0].numel()); +} + +template +void set_inputs(P& opts, const std::vector& tensors) { // NOLINT + opts.setInputs(get_multi_data(tensors), tensors[0].numel()); +} + +ProcessGroupGloo::GlooTask::GlooTask(int rank, + const std::vector& inputs, + CommType comm_type) + : ProcessGroup::Task(rank, inputs, comm_type) { + PADDLE_ENFORCE_EQ(CheckTensorsInCPUPlace(inputs), true, + platform::errors::Fatal( + "Only CPU place is supported for ProcessGroupGloo.")); +} + +ProcessGroupGloo::ProcessGroupGloo(const std::shared_ptr& store, + int rank, int world_size, + const std::shared_ptr options) + : ProcessGroup(rank, world_size), _tag(0), _store(store) { + _context = std::make_shared(rank, world_size); + auto prefix_store = + ::gloo::rendezvous::PrefixStore(std::to_string(0), *_store); + _context->connectFullMesh(prefix_store, options->device); +} + +class BroadcastGlooTask : public ProcessGroupGloo::GlooTask { + public: + BroadcastGlooTask(const std::shared_ptr& context, + const std::vector& inputs, int rank, int root, + uint32_t tag) + : ProcessGroupGloo::GlooTask(rank, inputs, CommType::BROADCAST), + _context(context), + _root(root), + _inputs(inputs), + _tag(tag) {} + + void Run() override { _do_broadcast(_inputs[0]); } + + private: + std::shared_ptr _context; + const int _root; + std::vector _inputs{}; + const uint32_t _tag; + + void _do_broadcast(const Tensor& tensor) { + gloo::BroadcastOptions opts(_context); + const auto& dtype = tensor.type(); + GENERATE_FUNC(dtype, set_output, opts, tensor); + opts.setRoot(_root); + opts.setTag(_tag); + gloo::broadcast(opts); + } +}; + +std::shared_ptr ProcessGroupGloo::Broadcast( + 
std::vector& inputs, const BroadcastOptions& opts) { + auto root = opts.source_rank; + std::unique_ptr task; + auto tag = next_tag(); + auto context = get_context(); + task = std::make_unique(context, inputs, rank_, root, tag); + task->Run(); + return task; +} + +class AllreduceGlooTask : public ProcessGroupGloo::GlooTask { + public: + AllreduceGlooTask(int rank, const std::shared_ptr& context, + std::vector& inputs, ReduceOp reduce_op, // NOLINT + uint32_t tag) + : ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLREDUCE), + _context(context), + _inputs(inputs), + _reduce_op(reduce_op), + _tag(tag) {} + + void Run() override { _do_allreduce(_inputs); } + + private: + std::shared_ptr _context; + std::vector _inputs; + const ReduceOp _reduce_op; + uint32_t _tag; + + gloo::AllreduceOptions::Func _get_function(const experimental::DataType type, + const ReduceOp op) { + gloo::AllreduceOptions::Func fn; + GENERATE_FUNC(type, _get_function_impl, fn, op); + return fn; + } + + template + void _get_function_impl(gloo::AllreduceOptions::Func& fn, // NOLINT + const ReduceOp op) { + fn = get_function(op); + } + + void _do_allreduce(std::vector& tensors) { // NOLINT + const auto& dtype = tensors[0].type(); + gloo::AllreduceOptions opts(_context); + GENERATE_FUNC(dtype, set_inputs, opts, tensors); + GENERATE_FUNC(dtype, set_outputs, opts, tensors); + opts.setReduceFunction(_get_function(dtype, _reduce_op)); + opts.setTag(_tag); + gloo::allreduce(opts); + } +}; + +std::shared_ptr ProcessGroupGloo::AllReduce( + std::vector& inputs, const AllreduceOptions& opts) { + auto tag = next_tag(); + std::shared_ptr task; + auto context = get_context(); + task = std::make_shared(rank_, context, inputs, + opts.reduce_op, tag); + task->Run(); + return task; +} + +std::shared_ptr<::gloo::transport::Device> +ProcessGroupGloo::createDeviceForInterface(const std::string& ifname) { + ::gloo::transport::tcp::attr attr; + attr.iface = ifname; + return ::gloo::transport::tcp::CreateDevice(attr); +} + +std::shared_ptr<::gloo::transport::Device> +ProcessGroupGloo::createDeviceForHostname(const std::string& hostname) { + ::gloo::transport::tcp::attr attr; + attr.hostname = hostname; + return ::gloo::transport::tcp::CreateDevice(attr); +} + +std::shared_ptr<::gloo::transport::Device> +ProcessGroupGloo::createDefaultDevice() { + std::array hostname{}; + auto ret = ::gethostname(hostname.data(), HOST_NAME_MAX); + PADDLE_ENFORCE_EQ(ret, 0, platform::errors::Fatal( + "Get hostname error for createDefaultDevice.")); + ::addrinfo* result; + result = tcputils::get_addr_info(hostname.data(), "", 0, AF_UNSPEC); + ::addrinfo* cur; + for (cur = result; cur != nullptr; cur = cur->ai_next) { + SocketType socket = + ::socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol); + if (socket == -1) { + continue; + } + ret = ::bind(socket, cur->ai_addr, cur->ai_addrlen); +#ifdef _WIN32 + closesocket(socket); +#else + close(socket); +#endif + if (ret == -1) { + continue; + } + break; + } + freeaddrinfo(result); + if (cur != nullptr) { + return createDeviceForHostname(hostname.data()); + } + return createDeviceForHostname("127.0.0.1"); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.h b/paddle/fluid/distributed/collective/ProcessGroupGloo.h new file mode 100644 index 00000000000..d989939fcb8 --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h @@ -0,0 +1,138 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/distributed/collective/ProcessGroup.h" + +#ifdef PADDLE_WITH_GLOO +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#endif + +#include "paddle/fluid/distributed/store/store.h" +#include "paddle/fluid/distributed/store/tcp_store.h" + +constexpr const char* GLOO_BACKEND_NAME = "GLOO"; + +namespace paddle { +namespace distributed { + +class ProcessGroupGloo : public ProcessGroup { + public: + class GlooTask : public ProcessGroup::Task, + public std::enable_shared_from_this { + public: + explicit GlooTask(int rank, const std::vector& input_tensors, + CommType comm_type); + + ~GlooTask() = default; + + virtual void Run() = 0; + bool Wait(std::chrono::milliseconds timeout) override { return true; } + bool IsCompleted() override { return true; } + void Synchronize() override {} + + protected: + friend class ProcessGroupGloo; + }; + + class GlooStore : public ::gloo::rendezvous::Store { + public: + explicit GlooStore( + const std::shared_ptr& store) + : _store(store) {} + + ~GlooStore() = default; + + std::vector get(const std::string& key) override { + VLOG(3) << "GlooStore::get"; + auto value = _store->get(key); + return std::vector(value.begin(), value.end()); + } + + void wait(const std::vector& keys) override { + VLOG(3) << "GlooStore::wait"; + for (auto& key : keys) { + _store->wait(key); + } + } + + void set(const std::string& key, const std::vector& value) override { + VLOG(3) << "GlooStore::set"; + std::vector tmp(value.begin(), value.end()); + _store->set(key, tmp); + } + + void wait(const std::vector& keys, + const std::chrono::milliseconds& timeout) override { + VLOG(3) << "GlooStore::wait"; + for (auto& key : keys) { + _store->wait(key); + } + // wait(keys); + } + + protected: + std::shared_ptr _store; + }; + + class GlooOptions { + public: + GlooOptions() = default; + ~GlooOptions() = default; + static std::shared_ptr create() { + return std::make_shared(); + } + std::shared_ptr<::gloo::transport::Device> device; + }; + + explicit ProcessGroupGloo(const std::shared_ptr& store, int rank, + int world_size, + std::shared_ptr options); + + ~ProcessGroupGloo() = default; + + std::shared_ptr Broadcast( + std::vector& inputs, + const BroadcastOptions& = BroadcastOptions()) override; + + std::shared_ptr AllReduce( + std::vector& inputs, + const AllreduceOptions& opts = AllreduceOptions()) override; + + std::shared_ptr<::gloo::Context> get_context() { return _context; } + uint64_t next_tag() { return _tag++; } + + const std::string GetBackendName() const override { + return GLOO_BACKEND_NAME; + } + + // Helper functions for Gloo. 
+ static std::shared_ptr<::gloo::transport::Device> createDeviceForHostname( + const std::string& hostname); + static std::shared_ptr<::gloo::transport::Device> createDeviceForInterface( + const std::string& ifname); + static std::shared_ptr<::gloo::transport::Device> createDefaultDevice(); + + protected: + uint32_t _tag; + std::shared_ptr _context; + std::shared_ptr _store; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/store/store.h b/paddle/fluid/distributed/store/store.h index 2673314d222..2581a74d7e8 100644 --- a/paddle/fluid/distributed/store/store.h +++ b/paddle/fluid/distributed/store/store.h @@ -32,6 +32,8 @@ class Store { virtual int64_t add(const std::string& key, int64_t value) = 0; virtual std::vector get(const std::string& key) = 0; virtual void wait(const std::string& key) = 0; + virtual void set(const std::string& key, + const std::vector& value) = 0; virtual const std::chrono::seconds& timeout() const { return _timeout; } diff --git a/paddle/fluid/distributed/store/tcp_store.cc b/paddle/fluid/distributed/store/tcp_store.cc index de85ac0d910..8675981955d 100644 --- a/paddle/fluid/distributed/store/tcp_store.cc +++ b/paddle/fluid/distributed/store/tcp_store.cc @@ -27,11 +27,13 @@ namespace detail { constexpr int INFTIME = -1; -std::unique_ptr MasterDaemon::start(SocketType socket) { - return std::make_unique(socket); +std::unique_ptr MasterDaemon::start(SocketType socket, + int nranks) { + return std::make_unique(socket, nranks); } -MasterDaemon::MasterDaemon(SocketType socket) : _listen_socket(socket) { +MasterDaemon::MasterDaemon(SocketType socket, int nranks) + : _listen_socket(socket), _nranks(nranks) { _background_thread = std::thread{&MasterDaemon::run, this}; } @@ -64,6 +66,13 @@ void MasterDaemon::_do_add(SocketType socket) { tcputils::send_value(socket, new_value); } +void MasterDaemon::_do_set(SocketType socket) { + VLOG(3) << "MasterDaemon::_do_set"; + std::string key = tcputils::receive_string(socket); + auto value = tcputils::receive_vector(socket); + _store[key] = value; +} + void MasterDaemon::_do_get(SocketType socket) { std::string key = tcputils::receive_string(socket); auto iter = _store.find(key); @@ -71,16 +80,15 @@ void MasterDaemon::_do_get(SocketType socket) { iter, _store.end(), platform::errors::InvalidArgument("Key %s not found in TCPStore.", key)); std::vector value = iter->second; - VLOG(3) << "TCPStore: value (" - << std::stoll(std::string(reinterpret_cast(value.data()), - value.size())) - << ") for key (" << key << ")."; tcputils::send_vector(socket, value); } void MasterDaemon::_do_stop(SocketType socket) { + VLOG(3) << "MasterDaemon::_do_stop"; ReplyType value = ReplyType::STOP_WAIT; - _stop = true; + if (--_nranks == 0) { + _stop = true; + } tcputils::send_value(socket, value); } @@ -140,21 +148,27 @@ void MasterDaemon::run() { case Command::GET: _do_get(fds[i].fd); break; + case Command::SET: + _do_set(fds[i].fd); + break; case Command::WAIT: _do_wait(fds[i].fd); break; case Command::STOP: _do_stop(fds[i].fd); break; + default: + VLOG(0) << "Unknow command: " << static_cast(command); + exit(-1); } } } } -std::unique_ptr TCPServer::create(uint16_t port) { +std::unique_ptr TCPServer::create(uint16_t port, int nranks) { int socket = tcputils::tcp_listen("", std::to_string(port), AF_INET); auto server = std::make_unique(); - server->_master_daemon = MasterDaemon::start(socket); + server->_master_daemon = MasterDaemon::start(socket, nranks); return server; } @@ -200,7 +214,7 @@ 
TCPStore::TCPStore(std::string host, uint16_t port, bool is_master, size_t num_workers, std::chrono::seconds timeout) : Store(timeout), _is_master(is_master), _num_workers(num_workers) { if (_is_master) { - _server = detail::TCPServer::create(port); + _server = detail::TCPServer::create(port, num_workers); } _client = detail::TCPClient::connect(host, port); @@ -213,36 +227,41 @@ void TCPStore::waitWorkers() { } add(_init_key, 1); - if (_server) { - auto begin = std::chrono::steady_clock::now(); - do { - auto value = get(_init_key); - int completed = std::stoi(std::string(value.begin(), value.end())); - VLOG(3) << completed << " worker ready, total " << _num_workers; - if (completed >= _num_workers) { - break; - } - const auto elapsed = std::chrono::duration_cast( - std::chrono::steady_clock::now() - begin); - - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - if (_timeout != tcputils::kNoTimeout && elapsed > _timeout) { - PADDLE_ENFORCE_EQ( - completed, _num_workers, - platform::errors::InvalidArgument( - "TCPStore timeouted and not all workers got ready.")); - } - } while (true); - } + auto begin = std::chrono::steady_clock::now(); + do { + auto value = get(_init_key); + int completed = std::stoi(std::string(value.begin(), value.end())); + VLOG(3) << completed << " worker ready, total " << _num_workers; + if (completed >= _num_workers) { + break; + } + const auto elapsed = std::chrono::duration_cast( + std::chrono::steady_clock::now() - begin); + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + if (_timeout != tcputils::kNoTimeout && elapsed > _timeout) { + PADDLE_ENFORCE_EQ( + completed, _num_workers, + platform::errors::InvalidArgument( + "TCPStore timeouted and not all workers got ready.")); + } + } while (true); VLOG(3) << "TCPStore initialized."; } int64_t TCPStore::add(const std::string& key, int64_t value) { + VLOG(3) << "TCPStore add."; _client->send_command_for_key(Command::ADD, _key_prefix + key); _client->send_value(value); return _client->receive_value(); } +void TCPStore::set(const std::string& key, const std::vector& value) { + VLOG(3) << "TCPStore set."; + _client->send_command_for_key(Command::SET, _key_prefix + key); + _client->send_vector(value); +} + std::vector TCPStore::get(const std::string& key) { wait(key); _client->send_command_for_key(Command::GET, _key_prefix + key); @@ -252,6 +271,7 @@ std::vector TCPStore::get(const std::string& key) { void TCPStore::wait(const std::string& key) { ReplyType reply; + VLOG(3) << "TCPStore wait."; do { _client->send_command_for_key(Command::WAIT, _key_prefix + key); @@ -262,6 +282,7 @@ void TCPStore::wait(const std::string& key) { TCPStore::~TCPStore() { _client->send_command_for_key(Command::STOP, ""); + VLOG(3) << "~TCPStore"; ReplyType ret = _client->receive_value(); PADDLE_ENFORCE_EQ(ret, ReplyType::STOP_WAIT, platform::errors::InvalidArgument( diff --git a/paddle/fluid/distributed/store/tcp_store.h b/paddle/fluid/distributed/store/tcp_store.h index cd706dd6640..17c1d8ea30a 100644 --- a/paddle/fluid/distributed/store/tcp_store.h +++ b/paddle/fluid/distributed/store/tcp_store.h @@ -27,15 +27,16 @@ namespace paddle { namespace distributed { enum class ReplyType { WAITING, STOP_WAIT }; -enum class Command { ADD, GET, WAIT, STOP }; +enum class Command { ADD, GET, SET, WAIT, STOP }; namespace detail { class MasterDaemon { public: - static std::unique_ptr start(SocketType listen_socket); + static std::unique_ptr start(SocketType listen_socket, + int nranks); MasterDaemon() = delete; - explicit 
MasterDaemon(SocketType listen_socket); + explicit MasterDaemon(SocketType listen_socket, int nranks); ~MasterDaemon(); private: @@ -43,18 +44,20 @@ class MasterDaemon { void _do_add(SocketType socket); void _do_wait(SocketType socket); void _do_get(SocketType socket); + void _do_set(SocketType socket); void _do_stop(SocketType socket); SocketType _listen_socket; std::vector _sockets; std::unordered_map> _store; std::thread _background_thread{}; + int _nranks; bool _stop = false; }; class TCPServer { public: TCPServer() = default; - static std::unique_ptr create(std::uint16_t port); + static std::unique_ptr create(std::uint16_t port, int nranks); private: std::unique_ptr _master_daemon; @@ -97,6 +100,7 @@ class TCPStore : public Store { int64_t add(const std::string& key, int64_t value) override; std::vector get(const std::string& key) override; void wait(const std::string& key) override; + void set(const std::string& key, const std::vector& value) override; private: void waitWorkers(); diff --git a/paddle/fluid/distributed/store/tcp_utils.cc b/paddle/fluid/distributed/store/tcp_utils.cc index d0561d0b9a9..a28cba28833 100644 --- a/paddle/fluid/distributed/store/tcp_utils.cc +++ b/paddle/fluid/distributed/store/tcp_utils.cc @@ -46,9 +46,10 @@ void close_socket(SocketType socket) { hints.ai_socktype = SOCK_STREAM; const char* node = host.empty() ? nullptr : host.c_str(); + const char* port_cstr = port.empty() ? nullptr : port.c_str(); int n; - n = ::getaddrinfo(node, port.c_str(), &hints, &res); + n = ::getaddrinfo(node, port_cstr, &hints, &res); const char* gai_err = ::gai_strerror(n); const char* proto = (family == AF_INET ? "IPv4" : family == AF_INET6 ? "IPv6" : ""); diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 48d42f803a8..5e61133510d 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -85,6 +85,9 @@ if(NOT ON_INFER) if (WITH_NCCL) set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl) endif() + if (WITH_GLOO) + set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_gloo) + endif() set(PYBIND_SRCS ${PYBIND_SRCS} distributed_py.cc) endif() diff --git a/paddle/fluid/pybind/communication.cc b/paddle/fluid/pybind/communication.cc index a0d2777f825..c01accaf598 100644 --- a/paddle/fluid/pybind/communication.cc +++ b/paddle/fluid/pybind/communication.cc @@ -31,9 +31,15 @@ namespace pybind { using TCPStore = paddle::distributed::TCPStore; void BindTCPStore(py::module* m) { - py::class_(*m, "TCPStore") - .def( - py::init()) + py::class_>(*m, "TCPStore") + .def(py::init([](std::string hostname, uint16_t port, bool is_master, + size_t world_size, std::chrono::seconds timeout) { + return std::make_shared(hostname, port, is_master, + world_size, timeout); + }), + py::arg("hostname"), py::arg("port"), py::arg("is_master"), + py::arg("world_size"), py::arg("timeout"), + py::call_guard()) .def("add", &TCPStore::add) .def("get", &TCPStore::get); } diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index a4a1d07db2c..3b5644764a5 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -35,6 +35,11 @@ limitations under the License. 
*/ #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" #endif +#if defined(PADDLE_WITH_GLOO) +#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h" +#include "paddle/fluid/distributed/store/tcp_store.h" +#endif + namespace py = pybind11; namespace paddle { @@ -42,6 +47,14 @@ namespace pybind { using Tensor = paddle::experimental::Tensor; +#if defined(PADDLE_WITH_GLOO) +using ProcessGroupGloo = paddle::distributed::ProcessGroupGloo; +using GlooStore = paddle::distributed::ProcessGroupGloo::GlooStore; +using GlooOptions = paddle::distributed::ProcessGroupGloo::GlooOptions; +#endif + +static std::string GLOO_SOCKET_IFNAME_ENV = "GLOO_SOCKET_IFNAME"; // NOLINT + void BindDistributed(py::module *m) { py::enum_(*m, "ReduceOp") .value("SUM", distributed::ReduceOp::SUM) @@ -129,6 +142,7 @@ void BindDistributed(py::module *m) { *m, "ProcessGroupNCCL", ProcessGroup) .def(py::init(), py::call_guard()); +#endif py::class_>(*m, "task") @@ -138,7 +152,6 @@ void BindDistributed(py::module *m) { py::call_guard()) .def("synchronize", &distributed::ProcessGroup::Task::Synchronize, py::call_guard()); -#endif // define parallel strategy, it will be removed py::class_ pg_strategy( @@ -178,6 +191,45 @@ void BindDistributed(py::module *m) { self.nrings_ = nrings; }); +#if defined(PADDLE_WITH_GLOO) + py::class_(*m, "GlooOptions") + .def(py::init<>()) + .def_readwrite("_device", &GlooOptions::device) + .def_static("create", &GlooOptions::create); + + py::class_>(*m, "GlooStore") + .def(py::init( + [](const std::shared_ptr &store) { + return std::make_shared(store); + }), + py::call_guard()); + + py::class_>( + *m, "ProcessGroupGloo", ProcessGroup) + .def(py::init &, int, int, + std::shared_ptr &>(), + py::call_guard()) + .def(py::init([](const std::shared_ptr &store, int rank, + int world_size) { + auto opts = GlooOptions::create(); + char *ifname = getenv(GLOO_SOCKET_IFNAME_ENV.c_str()); + if (ifname && strlen(ifname) > 1) { + opts->device = ProcessGroupGloo::createDeviceForInterface( + std::string(ifname)); + } else { + opts->device = ProcessGroupGloo::createDefaultDevice(); + } + return std::make_shared(store, rank, world_size, + opts); + }), + py::arg("store"), py::arg("rank"), + py::arg("world_size"), // py::arg("timeout") = + // kProcessGroupDefaultTimeout, + py::call_guard()) + .def_static("create_default_device", + &ProcessGroupGloo::createDefaultDevice); +#endif + m->def("eager_assign_group_by_size", [](py::handle py_tensors, std::vector is_sparse_gradient, std::vector group_size_limits, diff --git a/python/paddle/fluid/tests/unittests/process_group_gloo.py b/python/paddle/fluid/tests/unittests/process_group_gloo.py new file mode 100644 index 00000000000..5420e1d36b3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/process_group_gloo.py @@ -0,0 +1,119 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import random +import numpy as np +import os +import shutil + +import paddle +from paddle.fluid import core +import datetime +from datetime import timedelta +import paddle.fluid.core as core +from paddle.fluid.framework import _test_eager_guard +from paddle.fluid.dygraph.parallel import ParallelEnv + + +class TestProcessGroupFp32(unittest.TestCase): + def setUp(self): + paddle.seed(2022) + random.seed(2022) + np.random.seed(2022) + self.config() + + def config(self): + self.dtype = "float32" + self.shape = (2, 10, 5) + + def test_create_process_group_gloo(self): + with _test_eager_guard(): + nranks = ParallelEnv().nranks + rank = ParallelEnv().local_rank + is_master = True if rank == 0 else False + store = paddle.fluid.core.TCPStore("127.0.0.1", 6172, is_master, + nranks, datetime.timedelta(0)) + gloo_store = paddle.fluid.core.GlooStore(store) + opt = paddle.fluid.core.GlooOptions() + pg = paddle.fluid.core.ProcessGroupGloo(gloo_store, rank, nranks) + + # test allreduce sum + # rank 0 + paddle.device.set_device('cpu') + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + sum_result = x + y + if rank == 0: + task = pg.allreduce(tensor_x) + task.wait() + assert np.array_equal(tensor_x, sum_result) + else: + task = pg.allreduce(tensor_y) + task.wait() + assert np.array_equal(tensor_y, sum_result) + + print("test allreduce sum api ok") + + # test allreduce max + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + max_result = paddle.maximum(tensor_x, tensor_y) + + if rank == 0: + task = pg.allreduce(tensor_x, core.ReduceOp.MAX) + task.wait() + assert np.array_equal(tensor_x, max_result) + else: + task = pg.allreduce(tensor_y, core.ReduceOp.MAX) + task.wait() + assert np.array_equal(tensor_y, max_result) + + print("test allreduce max api ok") + + # test broadcast + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + broadcast_result = paddle.assign(tensor_x) + if rank == 0: + task = pg.broadcast(tensor_x, 0) + task.synchronize() + assert task.is_completed() + assert np.array_equal(broadcast_result, tensor_x) + else: + task = pg.broadcast(tensor_y, 0) + task.synchronize() + assert task.is_completed() + assert np.array_equal(broadcast_result, tensor_y) + print("test broadcast api ok") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_collective_process_group.py b/python/paddle/fluid/tests/unittests/test_collective_process_group.py index 6ae5424a882..58baa0a2fa9 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_process_group.py +++ b/python/paddle/fluid/tests/unittests/test_collective_process_group.py @@ -22,6 +22,9 @@ class TestProcessGroup(TestMultipleGpus): def test_process_group_nccl(self): self.run_mnist_2gpu('process_group_nccl.py') + def test_process_group_gloo(self): + self.run_mnist_2gpu('process_group_gloo.py') + if __name__ == "__main__": unittest.main() -- GitLab From ebd0f51287ad3ea0c8d91ee899b9edfcbc351c8e Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Thu, 3 Mar 2022 09:32:42 +0800 Subject: [PATCH 
003/261] Move bn to pten (#39347) * add bn cpu version; test=develop * move batch norm to pten * move batch norm to pten; test=develop * fix bug; test=develop * fix func::tranpose depend bug; test=develop * fix compile bugs; test=develop * fix use_op batch_norm bug; test=develop * fix cudnn bn add relu test; test=develop * fix pten context build and double grad bug; test= develop * remve useless code; test=develop * add batch norm gpu fp16 support; test=develop * fix test bn op bug; test=develop * remove output dtype set; test=develop * fix bug; test=develop * fix bug; test=develop * fix applay pass to program bug; test=develop * revert to develop; test=develop * fix rocm bug; test=develop * revert operator to develop; test=develop * fix pre_commit; test=develop * fix statci check error; test=develop * resolve conflict; test=develop * ana batch norm bug; * revert batch norm op * resolve conlict * fix nan inf and speed bug; test=develop * fix bug; test=develop * fix error; test=develop * test expand op; test=develop * fix bug; test=develop * resolve confilct * resolve confilct; test=develop * polish code; test=develop * polish code; test=develop * change mutable data to ctx alloc; test=develop * make format same with ci; test=develop * fix format error with ci; test=develop --- .../mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc | 9 +- paddle/fluid/framework/operator.cc | 2 - paddle/fluid/operators/batch_norm_op.cc | 12 - paddle/fluid/operators/batch_norm_op.cu | 1322 ----------------- .../operators/fused/cudnn_bn_add_relu_test.cc | 2 +- paddle/fluid/operators/inplace_abn_op.cc | 83 +- paddle/fluid/operators/inplace_abn_op.cu | 81 +- paddle/fluid/operators/norm_utils.cu.h | 47 +- paddle/phi/kernels/batch_norm_grad_kernel.h | 90 ++ paddle/phi/kernels/batch_norm_kernel.h | 43 + .../phi/kernels/cpu/batch_norm_grad_kernel.cc | 674 +++++++++ paddle/phi/kernels/cpu/batch_norm_kernel.cc | 204 +++ .../phi/kernels/gpu/batch_norm_grad_kernel.cu | 1038 +++++++++++++ paddle/phi/kernels/gpu/batch_norm_kernel.cu | 680 +++++++++ paddle/phi/kernels/gpu/batch_norm_utils.h | 142 ++ paddle/phi/ops/compat/batch_norm_sig.cc | 89 ++ .../dygraph_to_static/test_mobile_net.py | 1 + .../unittests/test_apply_pass_to_program.py | 1 + .../tests/unittests/test_batch_norm_op.py | 16 +- .../tests/unittests/test_batch_norm_op_v2.py | 14 +- .../fluid/tests/unittests/test_conv2d_op.py | 2 + .../tests/unittests/test_expand_v2_op.py | 1 + .../tests/unittests/test_inplace_abn_op.py | 9 +- .../tests/unittests/test_norm_nn_grad.py | 2 + .../unittests/test_program_prune_backward.py | 2 + .../fluid/tests/unittests/test_reshape_op.py | 1 + 26 files changed, 3175 insertions(+), 1392 deletions(-) create mode 100644 paddle/phi/kernels/batch_norm_grad_kernel.h create mode 100644 paddle/phi/kernels/batch_norm_kernel.h create mode 100644 paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/batch_norm_kernel.cc create mode 100644 paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/batch_norm_kernel.cu create mode 100644 paddle/phi/kernels/gpu/batch_norm_utils.h create mode 100644 paddle/phi/ops/compat/batch_norm_sig.cc diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc index 96aa95bde33..11190309814 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc @@ -12,12 +12,13 @@ 
// See the License for the specific language governing permissions and // limitations under the License. +#include #include +#include -#include #include -#include -#include + +#include "gtest/gtest.h" #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" @@ -25,7 +26,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/place.h" -USE_OP(batch_norm); +USE_OP_ITSELF(batch_norm); USE_OP_DEVICE_KERNEL(batch_norm, MKLDNN); USE_OP(conv2d_transpose); USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 6414dd455db..8ebc64e5f2c 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2215,8 +2215,6 @@ void OperatorWithKernel::BuildPhiKernelContext( vector_int_attr.end()); pt_kernel_context->EmplaceBackAttr(vector_int64_attr); } - // TODO(YuanRisheng) Need support vector attr - } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 949cf021cf0..174207deb08 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -1289,15 +1289,3 @@ REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp, ops::BatchNormDoubleGradMaker); REGISTER_OPERATOR(batch_norm_grad_grad, ops::BatchNormDoubleGradOp, ops::BatchNormDoubleGradOpInplaceInferer); - -REGISTER_OP_CPU_KERNEL( - batch_norm, ops::BatchNormKernel, - ops::BatchNormKernel); -REGISTER_OP_CPU_KERNEL( - batch_norm_grad, - ops::BatchNormGradKernel, - ops::BatchNormGradKernel); -REGISTER_OP_CPU_KERNEL( - batch_norm_grad_grad, - ops::BatchNormDoubleGradKernel, - ops::BatchNormDoubleGradKernel); diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index d59396db151..a19b087245a 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -41,1327 +41,5 @@ using CudnnDataType = platform::CudnnDataType; template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; -template -static __global__ void BNForwardInference( - const T *x, const BatchNormParamType *mean, - const BatchNormParamType *variance, const BatchNormParamType *scale, - const BatchNormParamType *bias, const int C, const int N, const int HxW, - const double epsilon, T *y) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - int num = N * C * HxW; - for (int i = gid; i < num; i += stride) { - const int c = layout == framework::DataLayout::kNCHW ? 
i / HxW % C : i % C; - BatchNormParamType x_sub_mean = - static_cast>(x[i]) - mean[c]; - BatchNormParamType inv_var = 1 / sqrt(variance[c] + epsilon); - y[i] = static_cast(scale[c] * x_sub_mean * inv_var + bias[c]); - } -} - -template -static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( - const T *x, const BatchNormParamType *scale, - const BatchNormParamType *bias, const int C, const int N, const int HxW, - const double epsilon, double exponentialAverageFactor, T *y, - BatchNormParamType *mean, BatchNormParamType *variance, - BatchNormParamType *save_mean, - BatchNormParamType *save_inv_variance) { - int outer_size = C; - int inner_size = N * HxW; - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage mean_storage; - __shared__ typename BlockReduce::TempStorage variance_storeage; - __shared__ BatchNormParamType mean_val; - __shared__ BatchNormParamType variance_val; - __shared__ BatchNormParamType inv_var_val; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType x_sum = static_cast>(0); - BatchNormParamType x_square_sum = static_cast>(0); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType x_i = static_cast>(x[index]); - x_sum += x_i; - x_square_sum += x_i * x_i; - } - x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); - x_square_sum = - BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); - if (threadIdx.x == 0) { - mean_val = x_sum / inner_size; - variance_val = x_square_sum / inner_size - mean_val * mean_val; - inv_var_val = 1 / sqrt(variance_val + epsilon); - - if (save_mean && save_inv_variance) { - save_mean[i] = mean_val; - save_inv_variance[i] = inv_var_val; - } - mean[i] = (1 - exponentialAverageFactor) * mean_val + - exponentialAverageFactor * mean[i]; - variance[i] = (1 - exponentialAverageFactor) * variance_val + - exponentialAverageFactor * variance[i]; - } - __syncthreads(); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType x_sub_mean = - static_cast>(x[index]) - mean_val; - y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; - } - } -} - -template -class BatchNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("It must use CUDAPlace.")); - double epsilon = static_cast(ctx.Attr("epsilon")); - float momentum = ctx.Attr("momentum"); - const bool is_test = ctx.Attr("is_test"); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool trainable_stats = ctx.Attr("trainable_statistics"); - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - - bool test_mode = is_test && (!trainable_stats); - - // Get the size for each dimension. 
- // NCHW [batch_size, in_channels, in_height, in_width] - const auto *x = ctx.Input("X"); - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ( - x_dims.size() >= 2 && x_dims.size() <= 5, true, - platform::errors::InvalidArgument( - "The size of input's dimensions should be between 2 and 5" - "But received: the size of input's dimensions is [%d]", - x_dims.size())); - - auto *y = ctx.Output("Y"); - y->mutable_data(ctx.GetPlace()); - - int N, C, H, W, D; - ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); - - auto dtype = platform::CudnnDataType::type; - -#ifdef PADDLE_WITH_HIP - auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC - : DataLayout::kNCHW; - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// HIP do not support compute format of NHWC -// auto compute_format = DataLayout::kNCHW; -#else - const bool fast_nhwc_batch_norm = - test_mode || - (dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent); - - auto compute_format = - fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC - ? DataLayout::kNHWC - : DataLayout::kNCHW; -#endif - - Tensor transformed_x(x->type()); - Tensor transformed_y(y->type()); - if (data_layout == DataLayout::kNHWC && - compute_format == DataLayout::kNCHW && x_dims.size() > 2) { - VLOG(3) << "Transform input tensor from NHWC to NCHW."; - ResizeToChannelFirst(ctx, x, - &transformed_x); - TransToChannelFirst(ctx, x, - &transformed_x); - ResizeToChannelFirst(ctx, y, - &transformed_y); - } else { - transformed_x.ShareDataWith(*x); - transformed_y.ShareDataWith(*y); - } - -// ------------------- cudnn descriptors --------------------- -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// miopenTensorDescriptor_t data_desc_; -// miopenTensorDescriptor_t bn_param_desc_; -// miopenBatchNormMode_t mode_; - -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); -#else - cudnnTensorDescriptor_t data_desc_; - cudnnTensorDescriptor_t bn_param_desc_; - cudnnBatchNormMode_t mode_; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); -#endif - - if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { - LOG(ERROR) << "Provided epsilon is smaller than " - << "CUDNN_BN_MIN_EPSILON. 
Setting it to " - << "CUDNN_BN_MIN_EPSILON instead."; - } - epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); - -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// mode_ = miopenBNSpatial; -#elif CUDNN_VERSION_MIN(7, 0, 1) - if (FLAGS_cudnn_batchnorm_spatial_persistent) { - mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - } else if (H == 1 && W == 1) { - mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; - } else { - mode_ = CUDNN_BATCHNORM_SPATIAL; - } -#else - if (H == 1 && W == 1) { - mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; - } else { - mode_ = CUDNN_BATCHNORM_SPATIAL; - } -#endif // CUDNN_VERSION_MIN(7, 0, 1) - - VLOG(3) << "Setting descriptors."; - std::vector dims; - std::vector strides; - if (compute_format == DataLayout::kNCHW) { - dims = {N, C, H, W, D}; - strides = {C * H * W * D, H * W * D, W * D, D, 1}; - } else { - dims = {N, C, H, W, D}; - strides = {H * W * D * C, 1, W * D * C, D * C, C}; - } - -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( -// data_desc_, CudnnDataType::type, -// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), -// const_cast(strides.data()))); -// Note: PERSISTENT not implemented for inference -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDeriveBNTensorDescriptor( -// bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); - // Note: PERSISTENT not implemented for inference - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( - bn_param_desc_, data_desc_, - test_mode ? CUDNN_BATCHNORM_SPATIAL : mode_)); -#endif - - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - auto &dev_ctx = ctx.template device_context(); - - auto handle = dev_ctx.cudnn_handle(); - - // Now, depending on whether we are running test or not, we have two paths. - // It is training mode when it's not reference AND not using pre-trained - // model. - bool training = !test_mode && !use_global_stats; - if (!training) { - // only when test we use input to do computation. - const auto *est_mean = ctx.Input("Mean"); - const auto *est_var = ctx.Input("Variance"); - // Run inference mode. - PADDLE_ENFORCE_EQ( - est_mean->dims().size(), 1UL, - platform::errors::InvalidArgument( - "The size of mean's dimensions must equal to 1." - "But received: the size of mean's dimensions mean is [%d]," - "the dimensions of mean is [%s].", - est_mean->dims().size(), est_mean->dims())); - PADDLE_ENFORCE_EQ( - est_var->dims().size(), 1UL, - platform::errors::InvalidArgument( - "The size of variance's dimensions must equal to 1." - "But received: the size of variance's dimensions is [%d]," - "the dimensions of variance is [%s].", - est_var->dims().size(), est_var->dims())); - PADDLE_ENFORCE_EQ( - est_mean->dims()[0], C, - platform::errors::InvalidArgument( - "The first dimension of mean must equal to the number of " - "Channels, which is [%d]. But received: the first dimension" - "of mean is [%d], the dimensions of mean is [%s].", - C, est_mean->dims()[0], est_mean->dims())); - PADDLE_ENFORCE_EQ( - est_var->dims()[0], C, - platform::errors::InvalidArgument( - "The first dimension of variance must equal to the number" - "of Channels, which is [%d]. 
But received: the first dimension of" - "variance is [%d], the dimensions of variance is [%s].", - C, est_var->dims()[0], est_var->dims())); - -#ifdef PADDLE_WITH_HIP - const int block_size = 256; - const int grid_size = (N * C * H * W * D + block_size - 1) / block_size; - if (compute_format == DataLayout::kNCHW) { - BNForwardInference< - T, - DataLayout::kNCHW><<>>( - transformed_x.template data(), - est_mean->template data>(), - est_var->template data>(), - scale->template data>(), - bias->template data>(), C, N, H * W * D, - epsilon, transformed_y.template data()); - } else { - BNForwardInference< - T, - DataLayout::kNHWC><<>>( - transformed_x.template data(), - est_mean->template data>(), - est_var->template data>(), - scale->template data>(), - bias->template data>(), C, N, H * W * D, - epsilon, transformed_y.template data()); - } - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenBatchNormalizationForwardInference( -// handle, miopenBNSpatial, -// const_cast( -// static_cast(CudnnDataType::kOne())), -// const_cast( -// static_cast(CudnnDataType::kZero())), -// data_desc_, -// static_cast(transformed_x.template data()), -// data_desc_, -// static_cast( -// transformed_y.template mutable_data(ctx.GetPlace())), -// bn_param_desc_, -// const_cast(static_cast( -// scale->template data>())), -// const_cast(static_cast( -// bias->template data>())), -// const_cast(static_cast( -// est_mean->template data>())), -// const_cast(static_cast( -// est_var->template data>())), -// epsilon)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationForwardInference( - handle, - // Note: PERSISTENT not implemented for inference - CUDNN_BATCHNORM_SPATIAL, CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), data_desc_, - transformed_y.template mutable_data(ctx.GetPlace()), - bn_param_desc_, scale->template data>(), - bias->template data>(), - est_mean->template data>(), - est_var->template data>(), epsilon)); -#endif - } else { - // if MomentumTensor is set, use MomentumTensor value, momentum - // is only used in this training branch - if (ctx.HasInput("MomentumTensor")) { - const auto *mom_tensor = ctx.Input("MomentumTensor"); - Tensor mom_cpu; - paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(), - &mom_cpu); - momentum = mom_cpu.data()[0]; - } - - // Run training mode. - // obtain running mean and running inv var, and there is no need - // to initialize them. - - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - mean_out->mutable_data>(ctx.GetPlace()); - variance_out->mutable_data>(ctx.GetPlace()); - - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - saved_mean->mutable_data>(ctx.GetPlace()); - saved_variance->mutable_data>(ctx.GetPlace()); - - if ((N * H * W * D) == 1) { - // Only 1 element in normalization dimension, - // skip the batch norm calculation, let y = x. - framework::TensorCopy(*x, ctx.GetPlace(), y); - } else { - double this_factor = 1. - momentum; - - bool called = false; -#if CUDNN_VERSION_MIN(7, 4, 1) - called = true; - size_t workspace_size = 0; - size_t reserve_space_size = 0; - void *reserve_space_ptr = nullptr; - void *workspace_ptr = nullptr; - Tensor workspace_tensor; - // Create reserve space and workspace for batch norm. - // Create tensor for each batchnorm op, it will be used in the - // backward. 
Thus this tensor shouldn't be temp. - auto *reserve_space = ctx.Output("ReserveSpace"); - PADDLE_ENFORCE_NOT_NULL( - reserve_space, - platform::errors::NotFound( - "The argument ReserveSpace of batch_norm op is not found.")); - - // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload:: - cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( - /*handle=*/handle, - /*mode=*/mode_, - /*bnIps=*/CUDNN_BATCHNORM_OPS_BN, - /*xDesc=*/data_desc_, - /*zDesc=*/nullptr, - /*yDesc=*/data_desc_, - /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, - /*activationDesc=*/nullptr, - /*sizeInBytes=*/&workspace_size)); - - // -------------- cudnn batchnorm reserve space -------------- - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload:: - cudnnGetBatchNormalizationTrainingExReserveSpaceSize( - /*handle=*/handle, - /*mode=*/mode_, - /*bnOps=*/CUDNN_BATCHNORM_OPS_BN, - /*activationDesc=*/nullptr, - /*xDesc=*/data_desc_, - /*sizeInBytes=*/&reserve_space_size)); - - reserve_space_ptr = reserve_space->mutable_data( - ctx.GetPlace(), transformed_x.type(), reserve_space_size); - workspace_ptr = workspace_tensor.mutable_data( - ctx.GetPlace(), transformed_x.type(), workspace_size); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationForwardTrainingEx( - handle, mode_, CUDNN_BATCHNORM_OPS_BN, CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), nullptr, nullptr, data_desc_, - transformed_y.template data(), bn_param_desc_, - scale->template data>(), - bias->template data>(), this_factor, - mean_out->template mutable_data>( - ctx.GetPlace()), - variance_out->template mutable_data>( - ctx.GetPlace()), - epsilon, - saved_mean->template mutable_data>( - ctx.GetPlace()), - saved_variance->template mutable_data>( - ctx.GetPlace()), - nullptr, workspace_ptr, workspace_size, reserve_space_ptr, - reserve_space_size)); -#endif // CUDNN_VERSION_MIN(7, 4, 1) - if (!called) { -#ifdef PADDLE_WITH_HIP - const int num = transformed_x.numel(); - const int block = 256; - const int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - const int grid = std::min(C, max_blocks); - if (compute_format == DataLayout::kNCHW) { - BNForwardTraining< - T, block, - DataLayout::kNCHW><<>>( - transformed_x.template data(), - scale->template data>(), - bias->template data>(), C, N, H * W * D, - epsilon, this_factor, transformed_y.template data(), - mean_out->template data>(), - variance_out->template data>(), - saved_mean->template data>(), - saved_variance->template data>()); - } else { - BNForwardTraining< - T, block, - DataLayout::kNHWC><<>>( - transformed_x.template data(), - scale->template data>(), - bias->template data>(), C, N, H * W * D, - epsilon, this_factor, transformed_y.template data(), - mean_out->template data>(), - variance_out->template data>(), - saved_mean->template data>(), - saved_variance->template data>()); - } - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenBatchNormalizationForwardTraining( -// handle, mode_, const_cast(static_cast( -// CudnnDataType::kOne())), -// const_cast( -// static_cast(CudnnDataType::kZero())), -// data_desc_, -// static_cast(transformed_x.template data()), -// data_desc_, -// static_cast( -// transformed_y.template mutable_data(ctx.GetPlace())), -// bn_param_desc_, -// const_cast(static_cast( -// scale->template data>())), -// 
const_cast(static_cast( -// bias->template data>())), -// this_factor, -// static_cast( -// mean_out->template mutable_data>( -// ctx.GetPlace())), -// static_cast(variance_out->template mutable_data< -// BatchNormParamType>(ctx.GetPlace())), -// epsilon, -// static_cast( -// saved_mean->template mutable_data>( -// ctx.GetPlace())), -// static_cast(saved_variance->template mutable_data< -// BatchNormParamType>(ctx.GetPlace())))); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationForwardTraining( - handle, mode_, CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), data_desc_, - transformed_y.template mutable_data(ctx.GetPlace()), - bn_param_desc_, scale->template data>(), - bias->template data>(), this_factor, - mean_out->template mutable_data>( - ctx.GetPlace()), - variance_out->template mutable_data>( - ctx.GetPlace()), - epsilon, - saved_mean->template mutable_data>( - ctx.GetPlace()), - saved_variance->template mutable_data>( - ctx.GetPlace()))); -#endif - } - } - } - - if (data_layout == DataLayout::kNHWC && - compute_format == DataLayout::kNCHW && x_dims.size() > 2) { - VLOG(3) << "Transform batchnorm output from NCHW to NHWC"; - TransToChannelLast( - ctx, &transformed_y, y); - } -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// clean when exit. -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); -#else - // clean when exit. - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); -#endif - } -}; - -template -static __global__ LAUNCH_BOUNDS(BlockDim) void KeBNBackwardScaleBias( - const T *dy, const T *x, const BatchNormParamType *mean, - const BatchNormParamType *variance, const double epsilon, const int N, - const int C, const int HxW, BatchNormParamType *dscale, - BatchNormParamType *dbias) { - const int outer_size = C; - const int inner_size = N * HxW; - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage ds_storage; - __shared__ typename BlockReduce::TempStorage db_storage; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType ds_sum = static_cast>(0); - BatchNormParamType db_sum = static_cast>(0); - - BatchNormParamType inv_var_i = 1.0 / sqrt(variance[i] + epsilon); - BatchNormParamType mean_i = mean[i]; - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? 
(j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - ds_sum += static_cast>(dy[index]) * - (static_cast>(x[index]) - mean_i); - db_sum += static_cast>(dy[index]); - } - ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); - db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); - if (threadIdx.x == 0) { - dscale[i] = ds_sum * inv_var_i; - dbias[i] = db_sum; - } - __syncthreads(); - } -} - -template -static __global__ void KeBNBackwardData(const T *dy, - const BatchNormParamType *scale, - const BatchNormParamType *variance, - const double epsilon, const int C, - const int HxW, const int num, T *dx) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (int i = gid; i < num; i += stride) { - const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C; - BatchNormParamType inv_var = 1.0 / sqrt(variance[c] + epsilon); - dx[i] = static_cast(static_cast>(dy[i]) * - scale[c] * inv_var); - } -} - -template -static __global__ void KeBNRestoreData(const framework::DataLayout layout, T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const BatchNormParamType *mean, - const BatchNormParamType *variance, - double epsilon, int C, int M, - const int num, const T *y) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (int i = gid; i < num; i += stride) { - const int c = layout == framework::DataLayout::kNCHW ? (i / M) % C : i % C; - auto y_i = static_cast>(y[i]); - auto x_i = (y_i - bias[c]) / scale[c] / variance[c] + mean[c]; - x[i] = static_cast(x_i); - } -} - -template -class InplaceHelper { - public: - void operator()(const framework::DataLayout layout, T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const BatchNormParamType *mean, - const BatchNormParamType *variance, double epsilon, int C, - int M, const int num, const T *y, int grid2, const int block, - const gpuStream_t &stream) { - PADDLE_ENFORCE_EQ(x, y, platform::errors::InvalidArgument( - "X and Y should be inplaced in inplace mode")); - KeBNRestoreData<<>>( - layout, x, scale, bias, mean, variance, epsilon, C, M, num, y); - } -}; - -template -static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackward( - const T *dy, const T *x, const BatchNormParamType *scale, - const BatchNormParamType *saved_mean, - const BatchNormParamType *saved_inv_variance, const int C, const int N, - const int HxW, const double epsilon, T *dx, BatchNormParamType *dscale, - BatchNormParamType *dbias) { - const int outer_size = C; - const int inner_size = N * HxW; - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage ds_storage; - __shared__ typename BlockReduce::TempStorage db_storage; - __shared__ typename BlockReduce::TempStorage mean_storage; - __shared__ typename BlockReduce::TempStorage variance_storeage; - __shared__ BatchNormParamType inv_var_val; - __shared__ BatchNormParamType mean_val; - __shared__ BatchNormParamType dscale_val; - __shared__ BatchNormParamType dbias_val; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType ds_sum = static_cast>(0); - BatchNormParamType db_sum = static_cast>(0); - - if (saved_mean && saved_inv_variance) { - if (threadIdx.x == 0) { - inv_var_val = saved_inv_variance[i]; - mean_val = saved_mean[i]; - } - } else { - BatchNormParamType x_sum = static_cast>(0); - BatchNormParamType x_square_sum = - static_cast>(0); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - 
const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType x_i = - static_cast>(x[index]); - x_sum += x_i; - x_square_sum += x_i * x_i; - } - x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); - x_square_sum = - BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); - if (threadIdx.x == 0) { - mean_val = x_sum / inner_size; - inv_var_val = - 1 / sqrt(x_square_sum / inner_size - mean_val * mean_val + epsilon); - } - } - __syncthreads(); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType dy_i = - static_cast>(dy[index]); - ds_sum += - dy_i * (static_cast>(x[index]) - mean_val); - db_sum += dy_i; - } - ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); - db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); - if (threadIdx.x == 0) { - dscale_val = ds_sum * inv_var_val; - dbias_val = db_sum; - dscale[i] = dscale_val; - dbias[i] = dbias_val; - } - __syncthreads(); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - dx[index] = scale[i] * inv_var_val * - (static_cast>(dy[index]) - - dbias_val / static_cast>(inner_size) - - (static_cast>(x[index]) - mean_val) * - inv_var_val * dscale_val / inner_size); - } - } -} - -template -static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackwardData( - const T *dy, const BatchNormParamType *scale, - const BatchNormParamType *mean, const T *x, - const BatchNormParamType *variance, const int C, const int N, - const int HxW, T *dx) { - const int outer_size = C; - const int inner_size = N * HxW; - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage dy_storage; - __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage; - __shared__ BatchNormParamType dy_sum_val; - __shared__ BatchNormParamType dy_x_sub_mean_sum_val; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType inv_var_i = variance[i]; - BatchNormParamType mean_i = mean[i]; - BatchNormParamType dy_sum = static_cast>(0); - BatchNormParamType dy_x_sub_mean_sum = - static_cast>(0); - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType dy_i = - static_cast>(dy[index]); - dy_sum += dy_i; - dy_x_sub_mean_sum += - dy_i * (static_cast>(x[index]) - mean_i); - } - - dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); - dy_x_sub_mean_sum = BlockReduce(dy_x_sub_mean_storage) - .Reduce(dy_x_sub_mean_sum, cub::Sum()); - - if (threadIdx.x == 0) { - dy_sum_val = dy_sum; - dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; - } - __syncthreads(); - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? 
(j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - dx[index] = - (static_cast>(dy[index]) - - dy_sum_val / static_cast>(inner_size) - - (static_cast>(x[index]) - mean_i) * - dy_x_sub_mean_sum_val * inv_var_i * inv_var_i / inner_size) * - scale[i] * inv_var_i; - } - } -} - -template -class BatchNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("It must use CUDAPlace.")); - double epsilon = static_cast(ctx.Attr("epsilon")); - const std::string data_layout_str = ctx.Attr("data_layout"); - bool use_global_stats = ctx.Attr("use_global_stats"); - - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - // batch_norm with inplace as false will take X as grad input, which - // is same as cuDNN batch_norm backward calculation, batch_norm - // with inplace as true only take Y as input and X should be calculate - // by inverse operation of batch_norm on Y - const Tensor *x; - bool is_inplace; - if (ctx.HasInput("Y")) { - x = ctx.Input("Y"); - is_inplace = true; - if (d_x) { - PADDLE_ENFORCE_EQ(d_x, d_y, - platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD not inplace in inplace mode")); - } - } else { - x = ctx.Input("X"); - is_inplace = false; - if (d_x) { - PADDLE_ENFORCE_NE( - d_x, d_y, platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD inplaced in non-inplace mode")); - } - } - - const bool is_test = ctx.Attr("is_test"); - use_global_stats = is_test || use_global_stats; - - const auto &x_dims = x->dims(); - - PADDLE_ENFORCE_EQ( - x_dims.size() >= 2 && x_dims.size() <= 5, true, - platform::errors::InvalidArgument( - "The size of input's dimensions should be between 2 and 5." - "But received: the size of input's dimensions is [%d]," - "the dimensions of input is [%s]", - x_dims.size(), x_dims)); - int N, C, H, W, D; - ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); - - // init output - if (d_x) { - d_x->mutable_data(ctx.GetPlace()); - } - - if (d_scale && d_bias) { - d_scale->mutable_data>(ctx.GetPlace()); - d_bias->mutable_data>(ctx.GetPlace()); - } - PADDLE_ENFORCE_EQ( - scale->dims().size(), 1UL, - platform::errors::InvalidArgument( - "The size of scale's dimensions must equal to 1. But received: " - "the size of scale's dimensions is [%d], the dimensions of scale " - "is [%s].", - scale->dims().size(), scale->dims())); - PADDLE_ENFORCE_EQ( - scale->dims()[0], C, - platform::errors::InvalidArgument( - "The first dimension of scale must equal to Channels[%d]. But " - "received: the first dimension of scale is [%d]", - C, scale->dims()[0])); - - auto dtype = platform::CudnnDataType::type; - const auto *reserve_space = ctx.Input("ReserveSpace"); -#ifdef PADDLE_WITH_HIP - auto compute_format = data_layout == DataLayout::kNHWC ? 
DataLayout::kNHWC - : DataLayout::kNCHW; - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// HIP do not support compute format of NHWC -// auto compute_format = DataLayout::kNCHW; -#else - const bool fast_nhwc_batch_norm = - dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent && - reserve_space != nullptr; - auto compute_format = - fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC - ? DataLayout::kNHWC - : DataLayout::kNCHW; -#endif - - Tensor transformed_x(x->type()); - Tensor transformed_d_y(d_y->type()); - Tensor transformed_d_x; - if (data_layout == DataLayout::kNHWC && - compute_format == DataLayout::kNCHW && x_dims.size() > 2) { - VLOG(3) << "Transform input tensor from NHWC to NCHW."; - ResizeToChannelFirst(ctx, x, - &transformed_x); - TransToChannelFirst(ctx, x, - &transformed_x); - ResizeToChannelFirst(ctx, d_y, - &transformed_d_y); - TransToChannelFirst(ctx, d_y, - &transformed_d_y); - if (d_x) { - ResizeToChannelFirst(ctx, d_x, - &transformed_d_x); - } - } else { - transformed_x.ShareDataWith(*x); - transformed_d_y.ShareDataWith(*d_y); - if (d_x) { - transformed_d_x.ShareDataWith(*d_x); - } - } - - std::vector dims; - std::vector strides; - if (compute_format == DataLayout::kNCHW) { - dims = {N, C, H, W, D}; - strides = {C * H * W * D, H * W * D, W * D, D, 1}; - } else { - dims = {N, C, H, W, D}; - strides = {H * W * C * D, 1, W * D * C, D * C, C}; - } - - auto &dev_ctx = ctx.template device_context(); - const int num = transformed_x.numel(); -#ifdef HIPCC - const int block = 256; -#else - const int block = 512; -#endif - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - int grid1 = (num + block - 1) / block; - int grid2 = std::min(C, max_blocks); - auto stream = dev_ctx.stream(); - InplaceHelper inplace_functor; - - if (!use_global_stats) { - if ((N * H * W * D) == 1) { - if (d_x) { - framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); - } - phi::funcs::SetConstant> - functor; - functor(dev_ctx, d_scale, static_cast>(0)); - functor(dev_ctx, d_bias, static_cast>(0)); - return; - } - -// ------------------- cudnn descriptors --------------------- -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// miopenTensorDescriptor_t data_desc_; -// miopenTensorDescriptor_t bn_param_desc_; -// miopenBatchNormMode_t mode_; - -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); -#else - cudnnTensorDescriptor_t data_desc_; - cudnnTensorDescriptor_t bn_param_desc_; - cudnnBatchNormMode_t mode_; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); -#endif - if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { - LOG(ERROR) << "Provided epsilon is smaller than " - << "CUDNN_BN_MIN_EPSILON. 
Setting it to " - << "CUDNN_BN_MIN_EPSILON instead."; - } - epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// mode_ = miopenBNSpatial; -#elif CUDNN_VERSION_MIN(7, 0, 1) - if (FLAGS_cudnn_batchnorm_spatial_persistent) { - mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - } else if (H == 1 && W == 1) { - mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; - } else { - mode_ = CUDNN_BATCHNORM_SPATIAL; - } -#else - if (H == 1 && W == 1) { - mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; - } else { - mode_ = CUDNN_BATCHNORM_SPATIAL; - } -#endif // CUDNN_VERSION_MIN(7, 0, 1) - -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( -// data_desc_, CudnnDataType::type, -// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), -// const_cast(strides.data()))); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, -// data_desc_, mode_)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); -#endif - - const auto *saved_mean = ctx.Input("SavedMean"); - const auto *saved_var = ctx.Input("SavedVariance"); - const auto *saved_mean_data = - saved_mean->template data>(); - const auto *saved_var_data = - saved_var->template data>(); - - if (is_inplace) { - inplace_functor(compute_format, transformed_x.data(), - scale->template data>(), - bias->template data>(), - saved_mean_data, saved_var_data, epsilon, C, H * W * D, - num, transformed_x.data(), grid2, block, stream); - } - - // This branch calls CUDNN APIs - if (d_x && d_scale && d_bias) { - bool called = false; -#if CUDNN_VERSION_MIN(7, 4, 1) - called = true; - size_t workspace_size = 0; - void *workspace_ptr = nullptr; - Tensor workspace_tensor; - auto reserve_space_size = reserve_space->memory_size(); - // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload:: - cudnnGetBatchNormalizationBackwardExWorkspaceSize( - /*handle=*/dev_ctx.cudnn_handle(), - /*mode=*/mode_, - /*bnIps=*/CUDNN_BATCHNORM_OPS_BN, - /*xDesc=*/data_desc_, - /*yDesc=*/data_desc_, - /*dyDesc=*/data_desc_, - /*dzDesc=*/nullptr, - /*dxDesc=*/data_desc_, - /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, - /*activationDesc=*/nullptr, - /*sizeInBytes=*/&workspace_size)); - - workspace_ptr = workspace_tensor.mutable_data( - ctx.GetPlace(), transformed_x.type(), workspace_size); - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationBackwardEx( - /*handle=*/dev_ctx.cudnn_handle(), - /*mode=*/mode_, - /*bnOps=*/CUDNN_BATCHNORM_OPS_BN, - /*alphaDataDiff=*/CudnnDataType::kOne(), - /*betaDataDiff=*/CudnnDataType::kZero(), - /*alphaParamDiff=*/CudnnDataType::kOne(), - /*betaParamDiff=*/CudnnDataType::kZero(), - /*xDesc=*/data_desc_, - /*xData=*/transformed_x.template data(), - /*yDesc=*/nullptr, - /*yData=*/nullptr, - /*dyDesc=*/data_desc_, - /*dyData=*/transformed_d_y.template data(), - /*dzDesc=*/nullptr, - /*dzData=*/nullptr, - /*dxDesc=*/data_desc_, - /*dxData=*/transformed_d_x.template mutable_data( - ctx.GetPlace()), - /*dBnScaleBiasDesc=*/bn_param_desc_, - /*bnScaleData=*/scale->template data>(), - 
/*bnBiasData=*/nullptr, - /*dBnScaleData=*/d_scale - ->template mutable_data>( - ctx.GetPlace()), - /*dBnBiasData=*/d_bias - ->template mutable_data>( - ctx.GetPlace()), - /*epsilon=*/epsilon, - /*savedMean=*/saved_mean_data, - /*savedInvVariance=*/saved_var_data, - /*activationDesc=*/nullptr, - /*workspace=*/workspace_ptr, - /*workSpaceSizeInBytes=*/workspace_size, - /*reserveSpace=*/const_cast( - reserve_space->template data()), - /*reserveSpaceSizeInBytes=*/reserve_space_size)); -#endif // CUDNN_VERSION_MIN(7, 4, 1) - if (!called) { -#ifdef PADDLE_WITH_HIP - if (compute_format == DataLayout::kNCHW) { - BNBackward< - T, block, - DataLayout::kNCHW><<>>( - transformed_d_y.template data(), - transformed_x.template data(), - scale->template data>(), saved_mean_data, - saved_var_data, C, N, H * W * D, epsilon, - transformed_d_x.template data(), - d_scale->template mutable_data>( - ctx.GetPlace()), - d_bias->template mutable_data>( - ctx.GetPlace())); - } else { - BNBackward< - T, block, - DataLayout::kNHWC><<>>( - transformed_d_y.template data(), - transformed_x.template data(), - scale->template data>(), saved_mean_data, - saved_var_data, C, N, H * W * D, epsilon, - transformed_d_x.template data(), - d_scale->template mutable_data>( - ctx.GetPlace()), - d_bias->template mutable_data>( - ctx.GetPlace())); - } - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenBatchNormalizationBackward( -// dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), -// CudnnDataType::kZero(), CudnnDataType::kOne(), -// CudnnDataType::kZero(), data_desc_, -// transformed_x.template data(), data_desc_, -// transformed_d_y.template data(), data_desc_, -// transformed_d_x.template mutable_data(ctx.GetPlace()), -// bn_param_desc_, scale->template data>(), -// d_scale->template mutable_data>( -// ctx.GetPlace()), -// d_bias->template mutable_data>( -// ctx.GetPlace()), -// epsilon, saved_mean_data, saved_var_data)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationBackward( - dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), - CudnnDataType::kZero(), CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), data_desc_, - transformed_d_y.template data(), data_desc_, - transformed_d_x.template mutable_data(ctx.GetPlace()), - bn_param_desc_, scale->template data>(), - d_scale->template mutable_data>( - ctx.GetPlace()), - d_bias->template mutable_data>( - ctx.GetPlace()), - epsilon, saved_mean_data, saved_var_data)); -#endif - } - - if (data_layout == DataLayout::kNHWC && - compute_format == DataLayout::kNCHW) { - VLOG(3) << "Transform batchnorm output from NCHW to NHWC"; - TransToChannelLast( - ctx, &transformed_d_x, d_x); - } - } else { - // This branch call CUDA kernels - if (compute_format == DataLayout::kNCHW) { - if (d_x) { - BNBackwardData<<< - grid2, block, 0, dev_ctx.stream()>>>( - d_y->data(), scale->data>(), - saved_mean_data, x->data(), saved_var_data, C, N, H * W * D, - d_x->data()); - } - if (d_scale && d_bias) { - KeBNBackwardScaleBias< - T, block, - framework::DataLayout::kNCHW><<>>( - d_y->data(), x->data(), saved_mean_data, saved_var_data, - epsilon, N, C, H * W * D, - d_scale->data>(), - d_bias->data>()); - } - } else { - if (d_x) { - BNBackwardData<<< - grid2, block, 0, dev_ctx.stream()>>>( - d_y->data(), scale->data>(), - saved_mean_data, x->data(), saved_var_data, C, N, H * W * D, - d_x->data()); - } - if (d_scale && d_bias) { - KeBNBackwardScaleBias< 
- T, block, - framework::DataLayout::kNHWC><<>>( - d_y->data(), x->data(), saved_mean_data, saved_var_data, - epsilon, N, C, H * W * D, - d_scale->data>(), - d_bias->data>()); - } - } - } - -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// clean when exit. -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); -#else - // clean when exit. - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); -#endif - } else { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_var = ctx.Input("Variance"); - - const auto *running_mean_data = - running_mean->template data>(); - const auto *running_var_data = - running_var->template data>(); - - if (is_inplace) { - auto px = *x; - inplace_functor(data_layout, px.mutable_data(ctx.GetPlace()), - scale->template data>(), - bias->template data>(), - running_mean_data, running_var_data, epsilon, C, - H * W * D, num, x->data(), grid2, block, stream); - } - - if (compute_format == DataLayout::kNCHW) { - if (d_x) { - KeBNBackwardData< - T, framework::DataLayout::kNCHW><<>>( - d_y->data(), scale->data>(), - running_var_data, epsilon, C, H * W, num, d_x->data()); - } - if (d_scale && d_bias) { - KeBNBackwardScaleBias< - T, block, - framework::DataLayout::kNCHW><<>>( - d_y->data(), x->data(), running_mean_data, running_var_data, - epsilon, N, C, H * W * D, d_scale->data>(), - d_bias->data>()); - } - } else { - if (d_x) { - KeBNBackwardData< - T, framework::DataLayout::kNHWC><<>>( - d_y->data(), scale->data>(), - running_var_data, epsilon, C, H * W, num, d_x->data()); - } - if (d_scale && d_bias) { - KeBNBackwardScaleBias< - T, block, - framework::DataLayout::kNHWC><<>>( - d_y->data(), x->data(), running_mean_data, running_var_data, - epsilon, N, C, H * W * D, d_scale->data>(), - d_bias->data>()); - } - } - } - } -}; - -template -class BatchNormDoubleGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *X = ctx.Input("X"); - const auto *Scale = ctx.Input("Scale"); - const auto *dY = ctx.Input("DY"); - const auto *Saved_mean = ctx.Input("SavedMean"); - const auto *Saved_variance = ctx.Input("SavedVariance"); - const double epsilon = static_cast(ctx.Attr("epsilon")); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool is_test = ctx.Attr("is_test"); - - PADDLE_ENFORCE_EQ( - is_test, false, - platform::errors::InvalidArgument( - "`is_test = True` CANNOT be used in train program. 
If " - "you want to use global status in pre_train model, " - "please set `use_global_stats = True`")); - - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - - const auto *ddX = ctx.Input("DDX"); - const auto *ddScale = ctx.Input("DDScale"); - const auto *ddBias = ctx.Input("DDBias"); - - auto *dX = ctx.Output("DX"); - auto *dScale = ctx.Output("DScale"); - auto *ddY = ctx.Output("DDY"); - - NormDoubleGradFunctor( - ctx, data_layout, X, Scale, dY, Saved_mean, Saved_variance, epsilon, - use_global_stats, ddX, ddScale, ddBias, dX, dScale, ddY); - } -}; - } // namespace operators } // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_CUDA_KERNEL( - batch_norm, ops::BatchNormKernel, - ops::BatchNormKernel); -REGISTER_OP_CUDA_KERNEL( - batch_norm_grad, ops::BatchNormGradKernel, - ops::BatchNormGradKernel); -REGISTER_OP_CUDA_KERNEL( - batch_norm_grad_grad, - ops::BatchNormDoubleGradKernel); -#else -REGISTER_OP_CUDA_KERNEL( - batch_norm, ops::BatchNormKernel, - ops::BatchNormKernel, - ops::BatchNormKernel); -REGISTER_OP_CUDA_KERNEL( - batch_norm_grad, ops::BatchNormGradKernel, - ops::BatchNormGradKernel, - ops::BatchNormGradKernel); -REGISTER_OP_CUDA_KERNEL( - batch_norm_grad_grad, - ops::BatchNormDoubleGradKernel, - ops::BatchNormDoubleGradKernel); -#endif diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index 6119af18ce1..b3ac3606eaf 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -32,7 +32,7 @@ namespace platform = paddle::platform; namespace op = paddle::operators; using Tensor = paddle::framework::Tensor; -USE_OP(batch_norm); +USE_OP_ITSELF(batch_norm); USE_CUDA_ONLY_OP(fused_bn_add_activation); USE_CUDA_ONLY_OP(fused_bn_add_activation_grad); diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index e0779249c41..7f513696998 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -17,6 +17,8 @@ #include #include #include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/phi/kernels/batch_norm_grad_kernel.h" +#include "paddle/phi/kernels/batch_norm_kernel.h" namespace paddle { namespace operators { @@ -202,8 +204,7 @@ class InplaceABNOpGradMaker : public framework::SingleGradOpMaker { }; template -class InplaceABNKernel - : public paddle::operators::BatchNormKernel { +class InplaceABNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); @@ -213,7 +214,33 @@ class InplaceABNKernel auto activation = GetInplaceABNActivationType(ctx.Attr("activation")); auto& place = *ctx.template device_context().eigen_device(); - BatchNormKernel::Compute(ctx); + + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* mean = ctx.Input("Mean"); + auto* variance = ctx.Input("Variance"); + + auto momentum = ctx.Attr("momentum"); + auto epsilon = ctx.Attr("epsilon"); + auto data_layout = ctx.Attr("data_layout"); + auto is_test = ctx.Attr("is_test"); + auto use_global_stats = ctx.Attr("use_global_stats"); + auto trainable_statistics = ctx.Attr("trainable_statistics"); + auto fuse_with_relu = ctx.Attr("fuse_with_relu"); + + auto* mean_out = 
ctx.Output("MeanOut"); + auto* variance_out = ctx.Output("VarianceOut"); + auto* saved_mean = ctx.Output("SavedMean"); + auto* saved_variance = ctx.Output("SavedVariance"); + auto* reserve_space = ctx.Output("ReserveSpace"); + + auto& dev_ctx = ctx.device_context(); + phi::BatchNormKernel( + static_cast::TYPE&>(dev_ctx), + *x, *scale, *bias, *mean, *variance, momentum, epsilon, data_layout, + is_test, use_global_stats, trainable_statistics, fuse_with_relu, y, + mean_out, variance_out, saved_mean, saved_variance, reserve_space); auto cur_y = EigenVector::Flatten(*y); InplaceABNActivation functor; @@ -222,8 +249,7 @@ class InplaceABNKernel }; template -class InplaceABNGradKernel - : public paddle::operators::BatchNormGradKernel { +class InplaceABNGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* y = ctx.Input("Y"); @@ -244,7 +270,52 @@ class InplaceABNGradKernel InplaceABNActivation functor; functor.GradCompute(ctx, activation, place, cur_y, cur_y, cur_dy, cur_dy); - BatchNormGradKernel::Compute(ctx); + // BatchNormGradKernel::Compute(ctx); + + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* saved_mean = ctx.Input("SavedMean"); + auto* saved_variance = ctx.Input("SavedVariance"); + + auto momentum = ctx.Attr("momentum"); + auto epsilon = ctx.Attr("epsilon"); + auto data_layout = ctx.Attr("data_layout"); + auto is_test = ctx.Attr("is_test"); + auto use_global_stats = ctx.Attr("use_global_stats"); + auto trainable_statistics = ctx.Attr("trainable_statistics"); + auto fuse_with_relu = ctx.Attr("fuse_with_relu"); + + auto* scale_grad = ctx.Output(framework::GradVarName("Scale")); + auto* bias_grad = ctx.Output(framework::GradVarName("Bias")); + + auto* reserve_space = ctx.Input("ReserveSpace"); + auto* mean = ctx.Input("ReserveSpace"); + auto* variance = ctx.Input("ReserveSpace"); + + paddle::optional space_opt = paddle::none; + paddle::optional mean_opt = paddle::none; + paddle::optional variance_opt = paddle::none; + + if (reserve_space != nullptr) { + space_opt = *reserve_space; + } + + if (mean != nullptr) { + mean_opt = *mean; + } + + if (variance != nullptr) { + variance_opt = *variance; + } + + auto& dev_ctx = ctx.device_context(); + phi::BatchNormGradRawKernel( + static_cast::TYPE&>(dev_ctx), + *d_y, *y, *scale, *bias, *saved_mean, *saved_variance, space_opt, + mean_opt, variance_opt, momentum, epsilon, data_layout, is_test, + use_global_stats, trainable_statistics, fuse_with_relu, true, d_x, + scale_grad, bias_grad); } }; diff --git a/paddle/fluid/operators/inplace_abn_op.cu b/paddle/fluid/operators/inplace_abn_op.cu index be7a7bd7171..db8f8c72d13 100644 --- a/paddle/fluid/operators/inplace_abn_op.cu +++ b/paddle/fluid/operators/inplace_abn_op.cu @@ -15,14 +15,15 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/inplace_abn_op.h" #include "paddle/fluid/operators/sync_batch_norm_op.cu.h" +#include "paddle/phi/kernels/batch_norm_grad_kernel.h" +#include "paddle/phi/kernels/batch_norm_kernel.h" namespace paddle { namespace operators { template class InplaceABNKernel - : public paddle::operators::SyncBatchNormKernel, - public paddle::operators::BatchNormKernel { + : public paddle::operators::SyncBatchNormKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* y = ctx.Output("Y"); @@ -36,7 +37,33 @@ class InplaceABNKernel if (ctx.Attr("use_sync_bn")) { SyncBatchNormKernel::Compute(ctx); } else { - BatchNormKernel::Compute(ctx); + // BatchNormKernel::Compute(ctx); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* mean = ctx.Input("Mean"); + auto* variance = ctx.Input("Variance"); + + auto momentum = ctx.Attr("momentum"); + auto epsilon = ctx.Attr("epsilon"); + auto data_layout = ctx.Attr("data_layout"); + auto is_test = ctx.Attr("is_test"); + auto use_global_stats = ctx.Attr("use_global_stats"); + auto trainable_statistics = ctx.Attr("trainable_statistics"); + auto fuse_with_relu = ctx.Attr("fuse_with_relu"); + + auto* mean_out = ctx.Output("MeanOut"); + auto* variance_out = ctx.Output("VarianceOut"); + auto* saved_mean = ctx.Output("SavedMean"); + auto* saved_variance = ctx.Output("SavedVariance"); + auto* reserve_space = ctx.Output("ReserveSpace"); + + auto& dev_ctx = ctx.device_context(); + phi::BatchNormKernel( + static_cast::TYPE&>(dev_ctx), + *x, *scale, *bias, *mean, *variance, momentum, epsilon, data_layout, + is_test, use_global_stats, trainable_statistics, fuse_with_relu, y, + mean_out, variance_out, saved_mean, saved_variance, reserve_space); } auto cur_y = EigenVector::Flatten(*y); @@ -49,8 +76,7 @@ class InplaceABNKernel // https://kevinzakka.github.io/2016/09/14/batch_normalization/ template class InplaceABNGradKernel - : public paddle::operators::SyncBatchNormGradKernel, - public paddle::operators::BatchNormGradKernel { + : public paddle::operators::SyncBatchNormGradKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const auto* y = ctx.Input("Y"); @@ -74,7 +100,50 @@ class InplaceABNGradKernel if (ctx.Attr("use_sync_bn")) { SyncBatchNormGradKernel::Compute(ctx); } else { - BatchNormGradKernel::Compute(ctx); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* saved_mean = ctx.Input("SavedMean"); + auto* saved_variance = ctx.Input("SavedVariance"); + + auto momentum = ctx.Attr("momentum"); + auto epsilon = ctx.Attr("epsilon"); + auto data_layout = ctx.Attr("data_layout"); + auto is_test = ctx.Attr("is_test"); + auto use_global_stats = ctx.Attr("use_global_stats"); + auto trainable_statistics = ctx.Attr("trainable_statistics"); + auto fuse_with_relu = ctx.Attr("fuse_with_relu"); + + auto* scale_grad = ctx.Output(framework::GradVarName("Scale")); + auto* bias_grad = ctx.Output(framework::GradVarName("Bias")); + + auto* reserve_space = ctx.Input("ReserveSpace"); + auto* mean = ctx.Input("ReserveSpace"); + auto* variance = ctx.Input("ReserveSpace"); + + paddle::optional space_opt = paddle::none; + paddle::optional mean_opt = paddle::none; + paddle::optional variance_opt = paddle::none; + + if (reserve_space != nullptr) { + space_opt = *reserve_space; + } + + if (mean != nullptr) { + mean_opt = *mean; + } + + if (variance != nullptr) { + variance_opt = *variance; + } + + auto& dev_ctx = 
ctx.device_context(); + phi::BatchNormGradRawKernel( + static_cast::TYPE&>(dev_ctx), + *d_y, *y, *scale, *bias, *saved_mean, *saved_variance, space_opt, + mean_opt, variance_opt, momentum, epsilon, data_layout, is_test, + use_global_stats, trainable_statistics, fuse_with_relu, true, d_x, + scale_grad, bias_grad); } } }; diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/fluid/operators/norm_utils.cu.h index c400a8f4239..0ed1f2719de 100644 --- a/paddle/fluid/operators/norm_utils.cu.h +++ b/paddle/fluid/operators/norm_utils.cu.h @@ -389,11 +389,12 @@ __global__ void DoubleGradComputeDDYWithGlobal( } template -void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, +void NormDoubleGradFunctor(const DeviceContext &ctx, const DataLayout data_layout, const Tensor *X, const Tensor *Scale, const Tensor *dY, const Tensor *Saved_mean, - const Tensor *Saved_variance, const double epsilon, + const Tensor *Saved_variance, const Tensor *Mean, + const Tensor *Variance, const double epsilon, const bool use_global_stats, const Tensor *ddX, const Tensor *ddScale, const Tensor *ddBias, Tensor *dX, Tensor *dScale, Tensor *ddY) { @@ -404,8 +405,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data()); const T *ddbias_data = (ddBias == nullptr ? nullptr : ddBias->data()); - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant set_constant; + phi::funcs::SetConstant set_constant; auto &x_dims = X->dims(); const int C = (data_layout == DataLayout::kNCHW ? x_dims[1] @@ -416,7 +416,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, Tensor scale_tmp; if (!Scale) { scale_tmp.mutable_data({C}, ctx.GetPlace()); - set_constant(dev_ctx, &scale_tmp, static_cast(1)); + set_constant(ctx, &scale_tmp, static_cast(1)); } const T *scale_data = Scale ? 
Scale->data() : scale_tmp.data(); #ifdef __HIPCC__ @@ -424,15 +424,15 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, #else const int block = 512; #endif - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + int max_threads = ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); int grid = std::min(C, max_blocks); int grid1 = (num + block - 1) / block; const T *mean_data, *variance_data; if (use_global_stats) { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_var = ctx.Input("Variance"); + const auto *running_mean = Mean; + const auto *running_var = Variance; const auto *running_mean_data = running_mean->template data(); const auto *running_var_data = running_var->template data(); mean_data = running_mean_data; @@ -440,34 +440,35 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, } else { const T *smean_data = Saved_mean->data(); const T *svariance_data = Saved_variance->data(); + mean_data = smean_data; variance_data = svariance_data; } if (dX) { T *dx_data = dX->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, dX, static_cast(0)); + set_constant(ctx, dX, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDXWithGlobal< - T, DataLayout::kNHWC><<>>( + T, DataLayout::kNHWC><<>>( dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, dx_data); } else { DoubleGradComputeDXWithGlobal< - T, DataLayout::kNCHW><<>>( + T, DataLayout::kNCHW><<>>( dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, dx_data); } } else { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDX< - T, block, DataLayout::kNHWC><<>>( + T, block, DataLayout::kNHWC><<>>( x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, ddscale_data, N, C, sample_size, epsilon, dx_data); } else { DoubleGradComputeDX< - T, block, DataLayout::kNCHW><<>>( + T, block, DataLayout::kNCHW><<>>( x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, ddscale_data, N, C, sample_size, epsilon, dx_data); } @@ -475,28 +476,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, } if (dScale) { T *dscale_data = dScale->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, dScale, static_cast(0)); + set_constant(ctx, dScale, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDScaleWithGlobal< - T, block, DataLayout::kNHWC><<>>( + T, block, DataLayout::kNHWC><<>>( ddx_data, variance_data, dy_data, epsilon, N, C, sample_size, dscale_data); } else { DoubleGradComputeDScaleWithGlobal< - T, block, DataLayout::kNCHW><<>>( + T, block, DataLayout::kNCHW><<>>( ddx_data, variance_data, dy_data, epsilon, N, C, sample_size, dscale_data); } } else { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDScale< - T, block, DataLayout::kNHWC><<>>( + T, block, DataLayout::kNHWC><<>>( x_data, mean_data, variance_data, ddx_data, dy_data, N, C, sample_size, epsilon, dscale_data); } else { DoubleGradComputeDScale< - T, block, DataLayout::kNCHW><<>>( + T, block, DataLayout::kNCHW><<>>( x_data, mean_data, variance_data, ddx_data, dy_data, N, C, sample_size, epsilon, dscale_data); } @@ -504,28 +505,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, } if (ddY) { T *ddy_data = ddY->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, ddY, static_cast(0)); + set_constant(ctx, ddY, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { 
DoubleGradComputeDDYWithGlobal< - T, DataLayout::kNHWC><<>>( + T, DataLayout::kNHWC><<>>( ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data, ddscale_data, epsilon, C, sample_size, num, ddy_data); } else { DoubleGradComputeDDYWithGlobal< - T, DataLayout::kNCHW><<>>( + T, DataLayout::kNCHW><<>>( ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data, ddscale_data, epsilon, C, sample_size, num, ddy_data); } } else { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDDY< - T, block, DataLayout::kNHWC><<>>( + T, block, DataLayout::kNHWC><<>>( x_data, mean_data, variance_data, ddscale_data, ddbias_data, ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data); } else { DoubleGradComputeDDY< - T, block, DataLayout::kNCHW><<>>( + T, block, DataLayout::kNCHW><<>>( x_data, mean_data, variance_data, ddscale_data, ddbias_data, ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data); } diff --git a/paddle/phi/kernels/batch_norm_grad_kernel.h b/paddle/phi/kernels/batch_norm_grad_kernel.h new file mode 100644 index 00000000000..c15dbd2f63f --- /dev/null +++ b/paddle/phi/kernels/batch_norm_grad_kernel.h @@ -0,0 +1,90 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
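Note: the layout-templated kernels above all share one index mapping: channel i and inner position j (j in [0, N*HxW)) map to (j / HxW * C + i) * HxW + j % HxW in NCHW storage and to j * C + i in NHWC storage, and they are launched with grid = min(C, max_threads / block) or grid1 = (num + block - 1) / block. The host-only sketch below checks that mapping and the launch-size arithmetic on toy sizes; all sizes and the device thread limit are made-up values, and none of this is Paddle code.

// Standalone check of the (channel, inner-position) -> flat-index mapping and
// the launch-size arithmetic used by the layout-templated kernels above.
#include <algorithm>
#include <cassert>
#include <cstdio>

enum class Layout { kNCHW, kNHWC };

// Same formula as the kernels: i = channel, j = position within N * HxW.
int FlatIndex(Layout layout, int i, int j, int C, int HxW) {
  return layout == Layout::kNCHW ? (j / HxW * C + i) * HxW + j % HxW
                                 : j * C + i;
}

int main() {
  const int N = 2, C = 3, HxW = 4;
  // Every (i, j) pair must hit a distinct element of the N*C*HxW buffer.
  bool seen[2 * 3 * 4] = {false};
  for (int i = 0; i < C; ++i) {
    for (int j = 0; j < N * HxW; ++j) {
      int idx = FlatIndex(Layout::kNCHW, i, j, C, HxW);
      assert(idx >= 0 && idx < N * C * HxW && !seen[idx]);
      seen[idx] = true;
    }
  }
  // Launch-size arithmetic mirrored from the code above (values illustrative).
  const int block = 512;
  const int max_threads = 2048 * 16;            // assumed device limit
  const int max_blocks = std::max(max_threads / block, 1);
  const int grid = std::min(C, max_blocks);     // per-channel kernels
  const int num = N * C * HxW;
  const int grid1 = (num + block - 1) / block;  // element-wise kernels
  std::printf("grid=%d grid1=%d\n", grid, grid1);
}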
+ +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BatchNormGradRawKernel(const Context& dev_ctx, + const DenseTensor& y_grad, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + paddle::optional reserve_space, + paddle::optional mean, + paddle::optional variance, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + bool is_inplace, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad); + +template +void BatchNormGradKernel(const Context& dev_ctx, + const DenseTensor& y_grad, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + paddle::optional reserve_space, + paddle::optional mean, + paddle::optional variance, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad); + +template +void BatchNormDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x_grad_grad, + const DenseTensor& scale_grad_grad, + const DenseTensor& bias_grad_grad, + const DenseTensor& y_grad, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + paddle::optional mean, + paddle::optional variance, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* y_grad_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/batch_norm_kernel.h b/paddle/phi/kernels/batch_norm_kernel.h new file mode 100644 index 00000000000..7ddf32e27c7 --- /dev/null +++ b/paddle/phi/kernels/batch_norm_kernel.h @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
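Note: BatchNormGradKernel declared above is a thin wrapper over BatchNormGradRawKernel; later in this patch it forwards every argument with is_inplace fixed to false, while inplace_abn calls the raw entry point with true. Below is a minimal, dependency-free sketch of that raw-plus-wrapper layering; GradRaw and Grad are illustrative names, not the phi symbols, and the math is a placeholder.

// Sketch of the "raw kernel + thin wrapper" layering used by the new phi
// batch_norm grad kernels: the wrapper pins the extra is_inplace flag.
#include <iostream>
#include <vector>

using Tensor = std::vector<float>;  // stand-in for phi::DenseTensor

// The "raw" entry point carries the extra knob callers like inplace_abn need.
void GradRaw(const Tensor& dy, const Tensor& x, bool is_inplace, Tensor* dx) {
  dx->assign(dy.begin(), dy.end());  // placeholder math
  std::cout << (is_inplace ? "inplace" : "out-of-place") << " backward\n";
}

// The public kernel simply forwards with is_inplace = false, the same shape
// as BatchNormGradKernel -> BatchNormGradRawKernel in this patch.
void Grad(const Tensor& dy, const Tensor& x, Tensor* dx) {
  GradRaw(dy, x, /*is_inplace=*/false, dx);
}

int main() {
  Tensor dy{1.f, 2.f}, x{3.f, 4.f}, dx;
  Grad(dy, x, &dx);                          // normal batch_norm_grad path
  GradRaw(dy, x, /*is_inplace=*/true, &dx);  // inplace_abn path
}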
+ +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BatchNormKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& mean, + const DenseTensor& variance, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor* y, + DenseTensor* mean_out, + DenseTensor* variance_out, + DenseTensor* saved_mean, + DenseTensor* saved_variance, + DenseTensor* reserve_space); + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc new file mode 100644 index 00000000000..de2343a384a --- /dev/null +++ b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc @@ -0,0 +1,674 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/batch_norm_kernel.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/gpu/batch_norm_utils.h" + +namespace phi { + +template +using EigenArrayMap = + Eigen::Map>; +template +using ConstEigenArrayMap = + Eigen::Map>; +template +using EigenVectorArrayMap = Eigen::Map>; +template +using ConstEigenVectorArrayMap = + Eigen::Map>; + +template +void BatchNormGradRawKernel(const Context& ctx, + const DenseTensor& y_grad, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + paddle::optional reserve_space, + paddle::optional mean, + paddle::optional variance, + float momentum, + float epsilon, + const std::string& data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + bool is_inplace, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad) { + const auto* d_y = &y_grad; + + DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + + auto* d_x = x_grad; + auto* d_scale = scale_grad; + auto* d_bias = bias_grad; + + use_global_stats = is_test || use_global_stats; + + // batch_norm with inplace as false will take X as grad input, which + // is same as cuDNN batch_norm backward calculation, batch_norm + // with inplace as true only take Y as input and X should be calculate + // by inverse operation of batch_norm on Y + + if (is_inplace) { + if (d_x) { + PADDLE_ENFORCE_EQ(d_x, + d_y, + phi::errors::InvalidArgument( + "X@GRAD and Y@GRAD inplaced in non-inplace mode")); + } + } else { + if (d_x) { + PADDLE_ENFORCE_NE(d_x, + d_y, + phi::errors::InvalidArgument( + "X@GRAD and Y@GRAD inplaced in non-inplace mode")); + } + } + + // Get the size for each dimension. 
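Note: the CPU grad kernel that begins above leans on Eigen::Map views: a C x (N*HxW) array map over an NHWC buffer puts one channel per row, so per-channel sums become row-wise reductions with no explicit index math. The standalone sketch below (Eigen only, made-up sizes and data) shows the d_bias = np.sum(d_y, axis=(n,h,w)) step in that style; it is an illustration, not the kernel itself.

// Standalone illustration of the Eigen::Map pattern used by the CPU
// BatchNormGradRawKernel: view an NHWC buffer as a C x (N*HxW) array and
// reduce row-wise to get per-channel gradients.
#include <Eigen/Core>
#include <iostream>
#include <vector>

int main() {
  const int N = 2, C = 3, HxW = 4;            // toy sizes
  std::vector<float> dy(N * C * HxW, 1.0f);   // pretend upstream gradient

  // NHWC storage: channel varies fastest, so a column-major C x (N*HxW) map
  // places one channel per row, the same trick the kernel uses.
  using ConstArrayMap =
      Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, Eigen::Dynamic>>;
  ConstArrayMap dy_arr(dy.data(), C, N * HxW);

  // d_bias = np.sum(d_y, axis=(n, h, w)), as in the kernel comment.
  Eigen::Array<float, Eigen::Dynamic, 1> d_bias = dy_arr.rowwise().sum();

  std::cout << "d_bias = " << d_bias.transpose() << "\n";  // expect 8 8 8
}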
+ // NCHW [batch_size, in_channels, in_height, in_width] + const auto& x_dims = x.dims(); + PADDLE_ENFORCE_GE( + x_dims.size(), + 2, + phi::errors::InvalidArgument( + "The size of input X's dimensions should be larger than 1." + "But received: the size of input X's dimensions is [%d]", + x_dims.size())); + PADDLE_ENFORCE_LE( + x_dims.size(), + 5, + phi::errors::InvalidArgument( + "The size of input X's dimensions should be less than 6." + "But received: the size of input X's dimensions is [%d]", + x_dims.size())); + const int N = x_dims[0]; + const int C = (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); + const int sample_size = x.numel() / N / C; + + // input dimension is 2 and the format is NCHW. The input can be regarded as + // NHWC format + if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { + data_layout = DataLayout::kNHWC; + } + + // init output + if (d_x) { + ctx.template Alloc(d_x); + } + + const T* mean_data = saved_mean.data(); + const T* inv_var_data = saved_variance.data(); + DenseTensor inv_var_tensor; + if (use_global_stats) { + const auto* running_mean = mean.get_ptr(); + const auto* running_variance = variance.get_ptr(); + mean_data = running_mean->data(); + inv_var_tensor.Resize({C}); + T* running_inv_var_data = ctx.template Alloc(&inv_var_tensor); + EigenVectorArrayMap inv_var_tmp(running_inv_var_data, C); + ConstEigenVectorArrayMap var_arr(running_variance->data(), C); + + inv_var_tmp = (var_arr + epsilon).sqrt().inverse(); + inv_var_data = running_inv_var_data; + } + + ConstEigenVectorArrayMap scale_arr(scale.data(), C); + ConstEigenVectorArrayMap bias_arr(bias.data(), C); + ConstEigenVectorArrayMap mean_arr(mean_data, C); + ConstEigenVectorArrayMap inv_var_arr(inv_var_data, C); + + T* d_bias_data = nullptr; + T* d_scale_data = nullptr; + if (d_scale && d_bias) { + d_bias_data = ctx.template Alloc(d_bias); + d_scale_data = ctx.template Alloc(d_scale); + } + + // d_bias = np.sum(d_y, axis=0) + // d_scale = np.sum((X - mean) / inv_std * dy, axis=0) + // d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0) + // - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0)) + EigenVectorArrayMap d_bias_arr(d_bias_data, C); + EigenVectorArrayMap d_scale_arr(d_scale_data, C); + + if (d_scale && d_bias) { + d_bias_arr.setZero(); + d_scale_arr.setZero(); + } + + if (d_x && (N * sample_size) == 1 && !use_global_stats) { + paddle::framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); + return; + } + + int scale_coefff = use_global_stats ? 
1 : N * sample_size; + const auto scale_inv_var_nhw = scale_arr * inv_var_arr / scale_coefff; + + DenseTensor dy_sum; + dy_sum.Resize({C}); + auto dy_sum_data = ctx.template Alloc(&dy_sum); + EigenVectorArrayMap dy_sum_arr(dy_sum_data, C); + + DenseTensor dy_mul_x_sub_mean_mul_invstd_sum; + dy_mul_x_sub_mean_mul_invstd_sum.Resize({C}); + auto dy_mul_x_sub_mean_mul_invstd_sum_data = + ctx.template Alloc(&dy_mul_x_sub_mean_mul_invstd_sum); + EigenVectorArrayMap dy_mul_x_sub_mean_mul_invstd_sum_arr( + dy_mul_x_sub_mean_mul_invstd_sum_data, C); + + dy_sum_arr.setZero(); + dy_mul_x_sub_mean_mul_invstd_sum_arr.setZero(); + + // inplace calculation + // Y: ((x - est_mean) * (inv_var) * scale + bias + // formula transform ====> + // (x * inv_var * scale) + (bias - est_mean * inv_var * scale) + // X: (y - bias) / scale / (inv_var) + est_mean + // formula transform ====> + // (y - bias) / (scale * inv_var) + est_mean + switch (data_layout) { + case DataLayout::kNCHW: { + if (is_inplace) { + auto px = x; + EigenArrayMap x_data(ctx.template Alloc(&px), sample_size, N * C); + ConstEigenArrayMap y_data(x.data(), sample_size, N * C); + for (int nc = 0; nc < N * C; ++nc) { + x_data.col(nc) = (y_data.col(nc) - bias_arr(nc % C)) / + scale_inv_var_nhw(nc % C) / scale_coefff + + mean_arr(nc % C); + } + } + ConstEigenArrayMap x_arr(x.data(), sample_size, N * C); + ConstEigenArrayMap d_y_arr(d_y->data(), sample_size, N * C); + + for (int nc = 0; nc < N * C; ++nc) { + int c = nc % C; + dy_sum_arr(c) += d_y_arr.col(nc).sum(); + dy_mul_x_sub_mean_mul_invstd_sum_arr(c) += + ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc)) + .sum(); + } + + if (d_scale && d_bias) { + d_bias_arr = dy_sum_arr; + d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr; + } + + if (d_x) { + EigenArrayMap d_x_arr( + ctx.template Alloc(d_x), sample_size, N * C); + if (!use_global_stats) { + for (int nc = 0; nc < N * C; ++nc) { + int c = nc % C; + d_x_arr.col(nc) = + scale_inv_var_nhw(c) * + (d_y_arr.col(nc) * N * sample_size - dy_sum_arr(c) - + (x_arr.col(nc) - mean_arr[c]) * + dy_mul_x_sub_mean_mul_invstd_sum_arr(c) * inv_var_arr(c)); + } + } else { + for (int nc = 0; nc < N * C; ++nc) { + int c = nc % C; + d_x_arr.col(nc) = scale_inv_var_nhw(c) * d_y_arr.col(nc); + } + } + } + break; + } + case DataLayout::kNHWC: { + if (is_inplace) { + auto px = x; + EigenArrayMap x_data(ctx.template Alloc(&px), C, N * sample_size); + ConstEigenArrayMap y_data(x.data(), C, N * sample_size); + for (int nhw = 0; nhw < N * sample_size; nhw++) { + x_data.col(nhw) = + (y_data.col(nhw) - bias_arr) / scale_inv_var_nhw / scale_coefff + + mean_arr; + } + } + ConstEigenArrayMap x_arr(x.data(), C, N * sample_size); + ConstEigenArrayMap d_y_arr(d_y->data(), C, N * sample_size); + + for (int nhw = 0; nhw < N * sample_size; ++nhw) { + dy_sum_arr += d_y_arr.col(nhw); + dy_mul_x_sub_mean_mul_invstd_sum_arr += + (x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw); + } + + if (d_scale && d_bias) { + d_bias_arr = dy_sum_arr; + d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr; + } + + if (d_x) { + EigenArrayMap d_x_arr( + ctx.template Alloc(d_x), C, N * sample_size); + if (!use_global_stats) { + for (int nhw = 0; nhw < N * sample_size; ++nhw) { + d_x_arr.col(nhw) = + scale_inv_var_nhw * + (d_y_arr.col(nhw) * N * sample_size - dy_sum_arr - + (x_arr.col(nhw) - mean_arr) * + dy_mul_x_sub_mean_mul_invstd_sum_arr * inv_var_arr); + } + } else { + for (int nhw = 0; nhw < N * sample_size; ++nhw) { + d_x_arr.col(nhw) = scale_inv_var_nhw * d_y_arr.col(nhw); 
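Note: the NCHW and NHWC branches above implement the same per-channel math. Spelled out with plain loops for an NHWC tensor it reads as below; this is a readability-only reference sketch with made-up sizes and float throughout, not a replacement for the Eigen version.

// Plain-loop reference of the training-mode backward math used above:
//   d_bias  = sum(dy)
//   d_scale = sum((x - mean) * inv_std * dy)
//   d_x     = scale * inv_std / m * (m * dy - d_bias - (x - mean) * inv_std * d_scale)
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const int N = 2, C = 2, HxW = 3, m = N * HxW;
  std::vector<float> x(N * HxW * C), dy(N * HxW * C, 0.5f);
  for (int i = 0; i < N * HxW * C; ++i) x[i] = 0.1f * i;
  std::vector<float> mean(C, 0.f), inv_std(C, 0.f), scale(C, 1.f);
  std::vector<float> d_bias(C, 0.f), d_scale(C, 0.f), d_x(x.size(), 0.f);
  const float eps = 1e-5f;

  for (int c = 0; c < C; ++c) {
    float sum = 0.f, sq_sum = 0.f;
    for (int p = 0; p < m; ++p) {
      sum += x[p * C + c];
      sq_sum += x[p * C + c] * x[p * C + c];
    }
    mean[c] = sum / m;
    inv_std[c] = 1.f / std::sqrt(sq_sum / m - mean[c] * mean[c] + eps);

    for (int p = 0; p < m; ++p) {
      d_bias[c] += dy[p * C + c];
      d_scale[c] += (x[p * C + c] - mean[c]) * inv_std[c] * dy[p * C + c];
    }
    for (int p = 0; p < m; ++p) {
      const int i = p * C + c;  // NHWC flat index
      d_x[i] = scale[c] * inv_std[c] / m *
               (m * dy[i] - d_bias[c] - (x[i] - mean[c]) * inv_std[c] * d_scale[c]);
    }
    std::printf("c=%d d_bias=%.3f d_scale=%.3f\n", c, d_bias[c], d_scale[c]);
  }
}

In the use_global_stats branch above the same formula degenerates to d_x = scale * inv_var * dy, which is why that path skips the two reduction sums.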
+ } + } + } + break; + } + default: + PADDLE_THROW(phi::errors::InvalidArgument("Unknown storage order: %s", + data_layout_str)); + } +} + +template +void BatchNormGradKernel(const Context& dev_ctx, + const DenseTensor& y_grad, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + paddle::optional reserve_space, + paddle::optional mean, + paddle::optional variance, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad) { + BatchNormGradRawKernel(dev_ctx, + y_grad, + x, + scale, + bias, + saved_mean, + saved_variance, + reserve_space, + mean, + variance, + momentum, + epsilon, + data_layout, + is_test, + use_global_stats, + trainable_statistics, + fuse_with_relu, + false, + x_grad, + scale_grad, + bias_grad); +} + +template +void BatchNormDoubleGradKernel(const Context& ctx, + const DenseTensor& x_grad_grad, + const DenseTensor& scale_grad_grad, + const DenseTensor& bias_grad_grad, + const DenseTensor& y_grad, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + paddle::optional mean, + paddle::optional variance, + float momentum, + float epsilon, + const std::string& data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* y_grad_grad) { + const auto* X = &x; + const auto* Scale = &scale; + const auto* dY = &y_grad; + const auto* Saved_mean = &saved_mean; + const auto* Saved_variance = &saved_variance; + + PADDLE_ENFORCE_EQ(is_test, + false, + phi::errors::InvalidArgument( + "`is_test = True` CANNOT be used in train program. If " + "you want to use global status in pre_train model, " + "please set `use_global_stats = True`")); + + const auto data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + + const auto* ddX = &x_grad_grad; + const auto* ddScale = &scale_grad_grad; + const auto* ddBias = &bias_grad_grad; + + auto* dX = x_grad; + auto* dScale = scale_grad; + auto* ddY = y_grad_grad; + ctx.template Alloc(dX); + ctx.template Alloc(ddY); + + const auto& x_dims = X->dims(); + const int C = (data_layout == DataLayout::kNCHW ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); + const int sample_size = X->numel() / C; + phi::funcs::SetConstant set_constant; + + const T* mean_data = Saved_mean->data(); + const T* inv_var_data = Saved_variance->data(); + + DenseTensor inv_var_tensor; + if (use_global_stats) { + const auto* running_mean = mean.get_ptr(); + const auto* running_variance = variance.get_ptr(); + mean_data = running_mean->data(); + inv_var_tensor.Resize({C}); + + T* running_inv_var_data = ctx.template Alloc(&inv_var_tensor); + EigenVectorArrayMap inv_var_tmp(running_inv_var_data, C); + ConstEigenVectorArrayMap var_arr(running_variance->data(), C); + + inv_var_tmp = (var_arr + epsilon).sqrt().inverse(); + inv_var_data = running_inv_var_data; + } + + // transpose NCHW -> NHWC for easy calculate + DenseTensor transformed_x(X->type()); + DenseTensor transformed_dy(dY->type()); + DenseTensor transformed_ddx(ddX->type()); + + DenseTensor transformed_dx(dX->type()); + DenseTensor transformed_ddy(ddY->type()); + if (data_layout == DataLayout::kNCHW && x_dims.size() > 2) { + VLOG(3) << "Transform batchnorm output from NCHW to NHWC"; + // Input Tensor + ResizeToChannelLast(ctx, X, &transformed_x); + TransToChannelLast(ctx, X, &transformed_x); + ResizeToChannelLast(ctx, dY, &transformed_dy); + TransToChannelLast(ctx, dY, &transformed_dy); + ResizeToChannelLast(ctx, ddX, &transformed_ddx); + TransToChannelLast(ctx, ddX, &transformed_ddx); + // Output Tensor + ResizeToChannelLast(ctx, dX, &transformed_dx); + ResizeToChannelLast(ctx, ddY, &transformed_ddy); + } else { + transformed_x.ShareDataWith(*X); + transformed_dy.ShareDataWith(*dY); + transformed_ddx.ShareDataWith(*ddX); + + transformed_dx.ShareDataWith(*dX); + transformed_ddy.ShareDataWith(*ddY); + } + + ConstEigenArrayMap x_arr(transformed_x.data(), C, sample_size); + ConstEigenVectorArrayMap mean_arr(mean_data, C); + ConstEigenVectorArrayMap inv_var_arr(inv_var_data, C); + + Tensor mean_tile; + mean_tile.Resize({C, sample_size}); + EigenArrayMap mean_tile_data( + ctx.template Alloc(&mean_tile), C, sample_size); + + DenseTensor inv_var_tile; + inv_var_tile.Resize({C, sample_size}); + EigenArrayMap inv_var_tile_data( + ctx.template Alloc(&inv_var_tile), C, sample_size); + + mean_tile_data = mean_arr.replicate(1, sample_size); + inv_var_tile_data = inv_var_arr.replicate(1, sample_size); + + DenseTensor Scale_data; + if (!Scale) { + Scale_data.Resize({C}); + ctx.template Alloc(&Scale_data); + set_constant(ctx, &Scale_data, static_cast(1)); + } + ConstEigenVectorArrayMap scale_arr( + Scale ? 
Scale->data() : Scale_data.data(), C); + + Tensor scale_tile; + scale_tile.Resize({C, sample_size}); + EigenArrayMap scale_tile_data( + ctx.template Alloc(&scale_tile), C, sample_size); + scale_tile_data = scale_arr.replicate(1, sample_size); + + ConstEigenArrayMap dy_arr(transformed_dy.data(), C, sample_size); + ConstEigenArrayMap ddx_arr(transformed_ddx.data(), C, sample_size); + + DenseTensor x_sub_mean_mul_invstd; + x_sub_mean_mul_invstd.Resize({C, sample_size}); + + EigenArrayMap x_sub_mean_mul_invstd_arr( + ctx.template Alloc(&x_sub_mean_mul_invstd), C, sample_size); + x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data; + + if (dX) { + ctx.template Alloc(dX); + EigenArrayMap dx_arr( + ctx.template Alloc(&transformed_dx), C, sample_size); + dx_arr.setZero(); + if (use_global_stats) { + // math: dx = (ddscale * dy) * inv_var + if (ddScale) { + ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); + Tensor ddscale_tile; + ddscale_tile.Resize({C, sample_size}); + EigenArrayMap ddscale_tile_data( + ctx.template Alloc(&ddscale_tile), C, sample_size); + ddscale_tile_data = ddscale_arr.replicate(1, sample_size); + + dx_arr = dy_arr * ddscale_tile_data * inv_var_tile_data; + } + } else { + // math: dx = scale * ((x - mean) * inv_var / NxHxW * (np.mean(ddx, + // axis=(n,h,w)) * + // np.sum(dy, axis=(n,h,w)) - + // np.sum(dy * ddx, axis=(n,h,w)) + 3 * np.mean(dy * (x - + // mean), + // axis=(n,h,w)) * inv_var.pow(2) * + // np.sum(ddx * (x - mean), axis=(n,h,w))) + inv_var.pow(3) / + // NxHxW * + // np.sum(ddx * (x - mean)) * + // (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW * + // np.sum(dy, + // axis=(n,h,w)) * (x - mean) * + // (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var - + // inv_var + // * + // np.mean(dy, axis=(n,h,w)) - + // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), + // axis=(n,h,w))) + + if (ddX) { + dx_arr += + (x_sub_mean_mul_invstd_arr * inv_var_tile_data * inv_var_tile_data / + sample_size) + .colwise() * + (ddx_arr.rowwise().sum() * dy_arr.rowwise().sum() / sample_size - + (dy_arr * ddx_arr).rowwise().sum() + + 3. 
* (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() * + (ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / + sample_size); + + dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() * + (ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / + sample_size * (dy_arr.rowwise().sum() / sample_size - dy_arr); + + dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() * + (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / + sample_size * + (ddx_arr.rowwise().sum() / sample_size - ddx_arr); + + dx_arr = scale_tile_data * dx_arr; + } + if (ddScale) { + ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); + Tensor ddscale_tile; + ddscale_tile.Resize({C, sample_size}); + EigenArrayMap ddscale_tile_data( + ctx.template Alloc(&ddscale_tile), C, sample_size); + ddscale_tile_data = ddscale_arr.replicate(1, sample_size); + + dx_arr += + (dy_arr * inv_var_tile_data - + (dy_arr.rowwise().sum().replicate(1, sample_size) / sample_size) * + inv_var_tile_data - + x_sub_mean_mul_invstd_arr * inv_var_tile_data * + (dy_arr * x_sub_mean_mul_invstd_arr) + .rowwise() + .sum() + .replicate(1, sample_size) / + sample_size) * + ddscale_tile_data; + } + } + if (data_layout == DataLayout::kNCHW) { + VLOG(3) << "Transform batchnorm output from NHWC to NCHW"; + TransToChannelFirst(ctx, &transformed_dx, dX); + } + } + if (dScale) { + EigenVectorArrayMap dscale_arr(ctx.template Alloc(dScale), C); + dscale_arr.setZero(); + if (use_global_stats) { + // math: dscale = np.sum(ddx * dy, axis=(n,h,w)) * inv_var + if (ddX) { + dscale_arr = (ddx_arr * dy_arr * inv_var_tile_data).rowwise().sum(); + } + } else { + // math: dscale = inv_var * (dy - np.mean(dy, axis=(n,h,w) - (x-mean) * + // inv_var.pow(2) * np.mean(dy * (x-mean), axis=(n,h,w)))) * + // ddx + if (ddX) { + Tensor first_grad; + first_grad.Resize({C, sample_size}); + EigenArrayMap first_grad_arr( + ctx.template Alloc(&first_grad), C, sample_size); + first_grad_arr.setZero(); + + first_grad_arr += + inv_var_tile_data * + (dy_arr - + dy_arr.rowwise().sum().replicate(1, sample_size) / sample_size - + x_sub_mean_mul_invstd_arr * + (dy_arr * x_sub_mean_mul_invstd_arr) + .rowwise() + .sum() + .replicate(1, sample_size) / + sample_size); + dscale_arr = (first_grad_arr * ddx_arr).rowwise().sum(); + } + } + } + + if (ddY) { + ctx.template Alloc(ddY); + EigenArrayMap ddy_arr( + ctx.template Alloc(&transformed_ddy), C, sample_size); + ddy_arr.setZero(); + if (use_global_stats) { + // math: ddy = r * ddx * inv_var + ddbias + + // ddscale * (x - mean) * inv_var + if (ddX) { + ddy_arr = scale_tile_data * ddx_arr * inv_var_tile_data; + } + } else { + // math: ddy = (x - mean) * inv_var * ddscale + ddbias + + // scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) * + // np.mean(ddx * (x - mean), axis=(n,h,w))) + if (ddX) { + ddy_arr += + scale_tile_data * inv_var_tile_data * + (ddx_arr - + ddx_arr.rowwise().sum().replicate(1, sample_size) / sample_size - + x_sub_mean_mul_invstd_arr * + (ddx_arr * x_sub_mean_mul_invstd_arr) + .rowwise() + .sum() + .replicate(1, sample_size) / + sample_size); + } + } + if (ddScale) { + ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); + Tensor ddscale_tile; + ddscale_tile.Resize({C, sample_size}); + EigenArrayMap ddscale_tile_data( + ctx.template Alloc(&ddscale_tile), C, sample_size); + ddscale_tile_data = ddscale_arr.replicate(1, sample_size); + + ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data; + } + + if (ddBias) { + ConstEigenVectorArrayMap ddbias_arr(ddBias->data(), C); + Tensor ddbias_tile; + 
ddbias_tile.Resize({C, sample_size}); + EigenArrayMap ddbias_tile_data( + ctx.template Alloc(&ddbias_tile), C, sample_size); + ddbias_tile_data = ddbias_arr.replicate(1, sample_size); + + ddy_arr += ddbias_tile_data; + } + + if (data_layout == DataLayout::kNCHW) { + VLOG(3) << "Transform batchnorm output from NHWC to NCHW"; + TransToChannelFirst(ctx, &transformed_ddy, ddY); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + batch_norm_grad, CPU, ALL_LAYOUT, phi::BatchNormGradKernel, float, double) { +} + +PD_REGISTER_KERNEL(batch_norm_grad_raw, + CPU, + ALL_LAYOUT, + phi::BatchNormGradRawKernel, + float, + double) {} + +PD_REGISTER_KERNEL(batch_norm_grad_grad, + CPU, + ALL_LAYOUT, + phi::BatchNormDoubleGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/batch_norm_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_kernel.cc new file mode 100644 index 00000000000..743128e8dea --- /dev/null +++ b/paddle/phi/kernels/cpu/batch_norm_kernel.cc @@ -0,0 +1,204 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/batch_norm_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +#include "paddle/fluid/framework/tensor_util.h" + +namespace phi { + +template +using EigenArrayMap = + Eigen::Map>; +template +using ConstEigenArrayMap = + Eigen::Map>; +template +using EigenVectorArrayMap = Eigen::Map>; +template +using ConstEigenVectorArrayMap = + Eigen::Map>; + +template +void BatchNormKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& mean, + const DenseTensor& variance, + float momentum, + float epsilon, + const std::string& data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor* y, + DenseTensor* mean_out, + DenseTensor* variance_out, + DenseTensor* saved_mean, + DenseTensor* saved_variance, + DenseTensor* reserve_space) { + bool test_mode = is_test && (!trainable_statistics); + + bool global_stats = test_mode || use_global_stats; + + auto data_layout = paddle::framework::StringToDataLayout(data_layout_str); + + const auto& x_dims = x.dims(); + PADDLE_ENFORCE_GE( + x_dims.size(), + 2, + phi::errors::InvalidArgument( + "The size of input X's dimensions should be larger than 1." + "But received: the size of input X's dimensions is [%d]", + x_dims.size())); + PADDLE_ENFORCE_LE( + x_dims.size(), + 5, + phi::errors::InvalidArgument( + "The size of input X's dimensions should be less than 6." + "But received: the size of input X's dimensionss is [%d]", + x_dims.size())); + const int N = x_dims[0]; + const int C = (data_layout == DataLayout::kNCHW ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); + const int sample_size = x.numel() / N / C; + + // alloc memory + ctx.template Alloc(y); + ctx.template Alloc(mean_out); + ctx.template Alloc(variance_out); + ctx.template Alloc(saved_mean); + ctx.template Alloc(saved_variance); + + // input dimension is 2 and the format is NCHW. The input can be regarded + // as NHWC format + if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { + data_layout = DataLayout::kNHWC; + } + + if (!global_stats) { + // saved_xx is use just in this batch of data + EigenVectorArrayMap saved_mean_e(ctx.template Alloc(saved_mean), C); + EigenVectorArrayMap saved_variance_e( + ctx.template Alloc(saved_variance), C); + saved_mean_e.setZero(); + saved_variance_e.setZero(); + + EigenVectorArrayMap running_mean_arr(ctx.template Alloc(mean_out), C); + EigenVectorArrayMap running_var_arr(ctx.template Alloc(variance_out), + C); + + if ((N * sample_size) == 1) { + // Only 1 element in normalization dimension, + // we skip the batch norm calculation, let y = x. + paddle::framework::TensorCopy(x, ctx.GetPlace(), y); + return; + } + + switch (data_layout) { + case DataLayout::kNCHW: { + ConstEigenArrayMap x_arr(x.data(), sample_size, N * C); + for (int nc = 0; nc < N * C; ++nc) { + saved_mean_e(nc % C) += x_arr.col(nc).sum(); + } + saved_mean_e /= N * sample_size; + for (int nc = 0; nc < N * C; ++nc) { + saved_variance_e(nc % C) += + (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm(); + } + saved_variance_e /= N * sample_size; + break; + } + case DataLayout::kNHWC: { + ConstEigenArrayMap x_arr(x.data(), C, N * sample_size); + for (int i = 0; i < N * sample_size; ++i) { + saved_mean_e += x_arr.col(i); + } + saved_mean_e /= N * sample_size; + for (int i = 0; i < N * sample_size; ++i) { + saved_variance_e += + (x_arr.col(i) - saved_mean_e) * (x_arr.col(i) - saved_mean_e); + } + saved_variance_e /= N * sample_size; + break; + } + default: + PADDLE_THROW(phi::errors::InvalidArgument("Unknown storage order: %s", + data_layout_str)); + } + + // if MomentumTensor is set, use MomentumTensor value, momentum + // is only used in this training branch + + running_mean_arr = + running_mean_arr * momentum + saved_mean_e * (1. - momentum); + running_var_arr = + running_var_arr * momentum + saved_variance_e * (1. - momentum); + } + + // use SavedMean and SavedVariance to do normalize + Eigen::Array inv_std(C); + if (global_stats) { + ConstEigenVectorArrayMap var_arr(variance.data(), C); + inv_std = (var_arr + epsilon).sqrt().inverse(); + } else { + EigenVectorArrayMap saved_inv_std(saved_variance->data(), C); + // inverse SavedVariance first, gradient will use it too. + saved_inv_std = (saved_inv_std + epsilon).inverse().sqrt(); + inv_std = saved_inv_std; + } + ConstEigenVectorArrayMap mean_arr( + global_stats ? 
mean.data() : saved_mean->data(), C); + + // ((x - est_mean) * (inv_var) * scale + bias + // formula transform ====> + // (x * inv_var * scale) + (bias - est_mean * inv_var * scale) + ConstEigenVectorArrayMap scale_arr(scale.data(), C); + ConstEigenVectorArrayMap bias_arr(bias.data(), C); + Eigen::Array new_scale = inv_std * scale_arr; + Eigen::Array new_bias = + bias_arr - mean_arr * inv_std * scale_arr; + + switch (data_layout) { + case DataLayout::kNCHW: { + EigenArrayMap y_arr(ctx.template Alloc(y), sample_size, N * C); + ConstEigenArrayMap x_arr(x.data(), sample_size, N * C); + for (int nc = 0; nc < N * C; ++nc) { + y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C); + } + break; + } + case DataLayout::kNHWC: { + EigenArrayMap(ctx.template Alloc(y), C, N * sample_size) = + (ConstEigenArrayMap(x.data(), C, N * sample_size).colwise() * + new_scale) + .colwise() + + new_bias; + break; + } + default: + PADDLE_THROW(phi::errors::InvalidArgument("Unknown storage order: %d", + data_layout)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + batch_norm, CPU, ALL_LAYOUT, phi::BatchNormKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu new file mode 100644 index 00000000000..2c9ee5ede01 --- /dev/null +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -0,0 +1,1038 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
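For reference on the CPU forward path above: once the per-channel inverse standard deviation is known, the kernel folds normalization and the affine transform into a single multiply-add per element, y = x * new_scale + new_bias, with new_scale = scale * inv_std and new_bias = bias - mean * new_scale. A minimal standalone sketch of that folding follows; the function and container names are hypothetical, it assumes NHWC-style [i * C + c] indexing, and it is independent of the Eigen maps used by the kernel.

#include <cmath>
#include <vector>

// Reference-only sketch of the folded batch-norm transform:
//   y = x * new_scale + new_bias, computed per channel c.
void FoldedBatchNormNHWC(const std::vector<float>& x,
                         const std::vector<float>& mean,
                         const std::vector<float>& var,
                         const std::vector<float>& scale,
                         const std::vector<float>& bias,
                         float epsilon, int C, int spatial,
                         std::vector<float>* y) {
  std::vector<float> new_scale(C), new_bias(C);
  for (int c = 0; c < C; ++c) {
    const float inv_std = 1.0f / std::sqrt(var[c] + epsilon);
    new_scale[c] = scale[c] * inv_std;
    new_bias[c] = bias[c] - mean[c] * new_scale[c];
  }
  for (int i = 0; i < spatial; ++i) {
    for (int c = 0; c < C; ++c) {
      (*y)[i * C + c] = x[i * C + c] * new_scale[c] + new_bias[c];
    }
  }
}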
+ +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/batch_norm_kernel.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" + +#include "paddle/fluid/operators/norm_utils.cu.h" +#include "paddle/fluid/operators/norm_utils.h" + +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/operators/layout_utils.h" +#include "paddle/fluid/platform/enforce.h" + +#include "paddle/fluid/platform/flags.h" +#include "paddle/phi/kernels/gpu/batch_norm_utils.h" + +#ifdef __HIPCC__ +#define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim) +#else +#define LAUNCH_BOUNDS(BlockDim) +#endif + +DECLARE_bool(cudnn_batchnorm_spatial_persistent); +namespace phi { + +template +using CudnnDataType = paddle::platform::CudnnDataType; +template +using BatchNormParamType = typename CudnnDataType::BatchNormParamType; + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void KeBNBackwardScaleBias( + const T *dy, + const T *x, + const BatchNormParamType *mean, + const BatchNormParamType *variance, + const double epsilon, + const int N, + const int C, + const int HxW, + BatchNormParamType *dscale, + BatchNormParamType *dbias) { + const int outer_size = C; + const int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage ds_storage; + __shared__ typename BlockReduce::TempStorage db_storage; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType ds_sum = static_cast>(0); + BatchNormParamType db_sum = static_cast>(0); + + BatchNormParamType inv_var_i = 1.0 / sqrt(variance[i] + epsilon); + BatchNormParamType mean_i = mean[i]; + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + ds_sum += static_cast>(dy[index]) * + (static_cast>(x[index]) - mean_i); + db_sum += static_cast>(dy[index]); + } + ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); + db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); + if (threadIdx.x == 0) { + dscale[i] = ds_sum * inv_var_i; + dbias[i] = db_sum; + } + __syncthreads(); + } +} + +template +static __global__ void KeBNBackwardData(const T *dy, + const BatchNormParamType *scale, + const BatchNormParamType *variance, + const double epsilon, + const int C, + const int HxW, + const int num, + T *dx) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = gid; i < num; i += stride) { + const int c = layout == phi::DataLayout::kNCHW ? i / HxW % C : i % C; + BatchNormParamType inv_var = 1.0 / sqrt(variance[c] + epsilon); + dx[i] = static_cast(static_cast>(dy[i]) * + scale[c] * inv_var); + } +} + +template +static __global__ void KeBNRestoreData(const phi::DataLayout layout, + T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const BatchNormParamType *mean, + const BatchNormParamType *variance, + double epsilon, + int C, + int M, + const int num, + const T *y) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = gid; i < num; i += stride) { + const int c = layout == phi::DataLayout::kNCHW ? 
(i / M) % C : i % C; + auto y_i = static_cast>(y[i]); + auto x_i = (y_i - bias[c]) / scale[c] / variance[c] + mean[c]; + x[i] = static_cast(x_i); + } +} + +template +class InplaceHelper { + public: + void operator()(const phi::DataLayout layout, + T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const BatchNormParamType *mean, + const BatchNormParamType *variance, + double epsilon, + int C, + int M, + const int num, + const T *y, + int grid2, + const int block, + const gpuStream_t &stream) { + PADDLE_ENFORCE_EQ(x, + y, + phi::errors::InvalidArgument( + "X and Y should be inplaced in inplace mode")); + KeBNRestoreData<<>>( + layout, x, scale, bias, mean, variance, epsilon, C, M, num, y); + } +}; + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackward( + const T *dy, + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *saved_mean, + const BatchNormParamType *saved_inv_variance, + const int C, + const int N, + const int HxW, + const double epsilon, + T *dx, + BatchNormParamType *dscale, + BatchNormParamType *dbias) { + const int outer_size = C; + const int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage ds_storage; + __shared__ typename BlockReduce::TempStorage db_storage; + __shared__ typename BlockReduce::TempStorage mean_storage; + __shared__ typename BlockReduce::TempStorage variance_storeage; + __shared__ BatchNormParamType inv_var_val; + __shared__ BatchNormParamType mean_val; + __shared__ BatchNormParamType dscale_val; + __shared__ BatchNormParamType dbias_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType ds_sum = static_cast>(0); + BatchNormParamType db_sum = static_cast>(0); + + if (saved_mean && saved_inv_variance) { + if (threadIdx.x == 0) { + inv_var_val = saved_inv_variance[i]; + mean_val = saved_mean[i]; + } + } else { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = + static_cast>(0); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = + static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); + x_square_sum = + BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); + if (threadIdx.x == 0) { + mean_val = x_sum / inner_size; + inv_var_val = + 1 / sqrt(x_square_sum / inner_size - mean_val * mean_val + epsilon); + } + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType dy_i = + static_cast>(dy[index]); + ds_sum += + dy_i * (static_cast>(x[index]) - mean_val); + db_sum += dy_i; + } + ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); + db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); + if (threadIdx.x == 0) { + dscale_val = ds_sum * inv_var_val; + dbias_val = db_sum; + dscale[i] = dscale_val; + dbias[i] = dbias_val; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? 
(j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + dx[index] = scale[i] * inv_var_val * + (static_cast>(dy[index]) - + dbias_val / static_cast>(inner_size) - + (static_cast>(x[index]) - mean_val) * + inv_var_val * dscale_val / inner_size); + } + } +} + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackwardData( + const T *dy, + const BatchNormParamType *scale, + const BatchNormParamType *mean, + const T *x, + const BatchNormParamType *variance, + const int C, + const int N, + const int HxW, + T *dx) { + const int outer_size = C; + const int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage; + __shared__ BatchNormParamType dy_sum_val; + __shared__ BatchNormParamType dy_x_sub_mean_sum_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType inv_var_i = variance[i]; + BatchNormParamType mean_i = mean[i]; + BatchNormParamType dy_sum = static_cast>(0); + BatchNormParamType dy_x_sub_mean_sum = + static_cast>(0); + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType dy_i = + static_cast>(dy[index]); + dy_sum += dy_i; + dy_x_sub_mean_sum += + dy_i * (static_cast>(x[index]) - mean_i); + } + + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + dy_x_sub_mean_sum = BlockReduce(dy_x_sub_mean_storage) + .Reduce(dy_x_sub_mean_sum, cub::Sum()); + + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; + } + __syncthreads(); + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + dx[index] = + (static_cast>(dy[index]) - + dy_sum_val / static_cast>(inner_size) - + (static_cast>(x[index]) - mean_i) * + dy_x_sub_mean_sum_val * inv_var_i * inv_var_i / inner_size) * + scale[i] * inv_var_i; + } + } +} + +template +void BatchNormGradRawKernel(const Context &ctx, + const DenseTensor &y_grad, + const DenseTensor &x, + const DenseTensor &scale, + const DenseTensor &bias, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + paddle::optional reserve_space, + paddle::optional mean, + paddle::optional variance, + float momentum, + float epsilon_f, + const std::string &data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + bool is_inplace, + DenseTensor *x_grad, + DenseTensor *scale_grad, + DenseTensor *bias_grad) { + double epsilon = static_cast(epsilon_f); + + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + + const auto *d_y = &y_grad; + + auto *d_x = x_grad; + auto *d_scale = scale_grad; + auto *d_bias = bias_grad; + + use_global_stats = is_test || use_global_stats; + + const auto &x_dims = x.dims(); + + PADDLE_ENFORCE_EQ( + x_dims.size() >= 2 && x_dims.size() <= 5, + true, + phi::errors::InvalidArgument( + "The size of input's dimensions should be between 2 and 5." 
+ "But received: the size of input's dimensions is [%d]," + "the dimensions of input is [%s]", + x_dims.size(), + x_dims)); + int N, C, H, W, D; + paddle::operators::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + + // init output + if (d_x) { + ctx.template Alloc(d_x); + } + + if (d_scale && d_bias) { + d_scale->mutable_data>(ctx.GetPlace()); + d_bias->mutable_data>(ctx.GetPlace()); + } + + PADDLE_ENFORCE_EQ( + scale.dims().size(), + 1UL, + phi::errors::InvalidArgument( + "The size of scale's dimensions must equal to 1. But received: " + "the size of scale's dimensions is [%d], the dimensions of scale " + "is [%s].", + scale.dims().size(), + scale.dims())); + PADDLE_ENFORCE_EQ( + scale.dims()[0], + C, + phi::errors::InvalidArgument( + "The first dimension of scale must equal to Channels[%d]. But " + "received: the first dimension of scale is [%d]", + C, + scale.dims()[0])); + + auto dtype = paddle::platform::CudnnDataType::type; +#ifdef PADDLE_WITH_HIP + auto compute_format = + data_layout == DataLayout::kNHWC ? DataLayout::kNHWC : DataLayout::kNCHW; + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// HIP do not support compute format of NHWC +// auto compute_format = DataLayout::kNCHW; +#else + const bool fast_nhwc_batch_norm = dtype == CUDNN_DATA_HALF && + FLAGS_cudnn_batchnorm_spatial_persistent && + (reserve_space.get_ptr() != nullptr); + auto compute_format = fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC + ? DataLayout::kNHWC + : DataLayout::kNCHW; +#endif + + DenseTensor transformed_x(x.type()); + DenseTensor transformed_d_y(d_y->type()); + DenseTensor transformed_d_x; + if (data_layout == DataLayout::kNHWC && compute_format == DataLayout::kNCHW && + x_dims.size() > 2) { + VLOG(3) << "Transform input tensor from NHWC to NCHW."; + ResizeToChannelFirst(ctx, &x, &transformed_x); + TransToChannelFirst(ctx, &x, &transformed_x); + ResizeToChannelFirst(ctx, d_y, &transformed_d_y); + TransToChannelFirst(ctx, d_y, &transformed_d_y); + if (d_x) { + ResizeToChannelFirst(ctx, d_x, &transformed_d_x); + } + } else { + transformed_x.ShareDataWith(x); + transformed_d_y.ShareDataWith(*d_y); + if (d_x) { + transformed_d_x.ShareDataWith(*d_x); + } + } + + std::vector dims; + std::vector strides; + if (compute_format == DataLayout::kNCHW) { + dims = {N, C, H, W, D}; + strides = {C * H * W * D, H * W * D, W * D, D, 1}; + } else { + dims = {N, C, H, W, D}; + strides = {H * W * C * D, 1, W * D * C, D * C, C}; + } + + const int num = transformed_x.numel(); +#ifdef HIPCC + const int block = 256; +#else + const int block = 512; +#endif + int max_threads = ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + int grid1 = (num + block - 1) / block; + int grid2 = std::min(C, max_blocks); + auto stream = ctx.stream(); + InplaceHelper inplace_functor; + + if (!use_global_stats) { + if ((N * H * W * D) == 1) { + if (d_x) { + paddle::framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); + } + phi::funcs::SetConstant> functor; + functor(ctx, d_scale, static_cast>(0)); + functor(ctx, d_bias, static_cast>(0)); + return; + } + +// ------------------- cudnn descriptors --------------------- +#ifdef PADDLE_WITH_HIP +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// miopenTensorDescriptor_t data_desc_; +// miopenTensorDescriptor_t bn_param_desc_; +// miopenBatchNormMode_t mode_; + +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); +// PADDLE_ENFORCE_GPU_SUCCESS( +// 
platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); +#else + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t bn_param_desc_; + cudnnBatchNormMode_t mode_; + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnCreateTensorDescriptor( + &bn_param_desc_)); +#endif + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); +#ifdef PADDLE_WITH_HIP +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// mode_ = miopenBNSpatial; +#elif CUDNN_VERSION_MIN(7, 0, 1) + if (FLAGS_cudnn_batchnorm_spatial_persistent) { + mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; + } else if (H == 1 && W == 1) { + mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; + } else { + mode_ = CUDNN_BATCHNORM_SPATIAL; + } +#else + if (H == 1 && W == 1) { + mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; + } else { + mode_ = CUDNN_BATCHNORM_SPATIAL; + } +#endif // CUDNN_VERSION_MIN(7, 0, 1) + +#ifdef PADDLE_WITH_HIP +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( +// data_desc_, CudnnDataType::type, +// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), +// const_cast(strides.data()))); +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, +// data_desc_, mode_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? 
x_dims.size() : 4, + dims.data(), + strides.data())); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); +#endif + + const auto *saved_mean_data = + saved_mean.template data>(); + const auto *saved_var_data = + saved_variance.template data>(); + + if (is_inplace) { + inplace_functor(compute_format, + transformed_x.data(), + scale.template data>(), + bias.template data>(), + saved_mean_data, + saved_var_data, + epsilon, + C, + H * W * D, + num, + transformed_x.data(), + grid2, + block, + stream); + } + + // This branch calls CUDNN APIs + if (d_x && d_scale && d_bias) { + bool called = false; +#if CUDNN_VERSION_MIN(7, 4, 1) + called = true; + size_t workspace_size = 0; + void *workspace_ptr = nullptr; + DenseTensor workspace_tensor; + auto reserve_space_size = reserve_space->memory_size(); + // --------------- cudnn batchnorm workspace --------------- + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload:: + cudnnGetBatchNormalizationBackwardExWorkspaceSize( + /*handle=*/ctx.cudnn_handle(), + /*mode=*/mode_, + /*bnIps=*/CUDNN_BATCHNORM_OPS_BN, + /*xDesc=*/data_desc_, + /*yDesc=*/data_desc_, + /*dyDesc=*/data_desc_, + /*dzDesc=*/nullptr, + /*dxDesc=*/data_desc_, + /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, + /*activationDesc=*/nullptr, + /*sizeInBytes=*/&workspace_size)); + + workspace_ptr = workspace_tensor.mutable_data( + ctx.GetPlace(), transformed_x.type(), workspace_size); + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationBackwardEx( + /*handle=*/ctx.cudnn_handle(), + /*mode=*/mode_, + /*bnOps=*/CUDNN_BATCHNORM_OPS_BN, + /*alphaDataDiff=*/CudnnDataType::kOne(), + /*betaDataDiff=*/CudnnDataType::kZero(), + /*alphaParamDiff=*/CudnnDataType::kOne(), + /*betaParamDiff=*/CudnnDataType::kZero(), + /*xDesc=*/data_desc_, + /*xData=*/transformed_x.template data(), + /*yDesc=*/nullptr, + /*yData=*/nullptr, + /*dyDesc=*/data_desc_, + /*dyData=*/transformed_d_y.template data(), + /*dzDesc=*/nullptr, + /*dzData=*/nullptr, + /*dxDesc=*/data_desc_, + /*dxData=*/ctx.template Alloc(&transformed_d_x), + /*dBnScaleBiasDesc=*/bn_param_desc_, + /*bnScaleData=*/scale.template data>(), + /*bnBiasData=*/nullptr, + /*dBnScaleData=*/d_scale + ->template mutable_data>( + ctx.GetPlace()), + /*dBnBiasData=*/d_bias + ->template mutable_data>( + ctx.GetPlace()), + /*epsilon=*/epsilon, + /*savedMean=*/saved_mean_data, + /*savedInvVariance=*/saved_var_data, + /*activationDesc=*/nullptr, + /*workspace=*/workspace_ptr, + /*workSpaceSizeInBytes=*/workspace_size, + /*reserveSpace=*/const_cast( + reserve_space->template data()), + /*reserveSpaceSizeInBytes=*/reserve_space_size)); +#endif // CUDNN_VERSION_MIN(7, 4, 1) + if (!called) { +#ifdef PADDLE_WITH_HIP + if (compute_format == DataLayout::kNCHW) { + BNBackward<<>>( + transformed_d_y.template data(), + transformed_x.template data(), + scale.template data>(), + saved_mean_data, + saved_var_data, + C, + N, + H * W * D, + epsilon, + transformed_d_x.template data(), + d_scale->template mutable_data>( + ctx.GetPlace()), + d_bias->template mutable_data>( + ctx.GetPlace())); + } else { + BNBackward<<>>( + transformed_d_y.template data(), + transformed_x.template data(), + scale.template data>(), + saved_mean_data, + saved_var_data, + C, + N, + H * W * D, + epsilon, + transformed_d_x.template data(), + d_scale->template mutable_data>( + ctx.GetPlace()), + d_bias->template mutable_data>( + ctx.GetPlace())); + } + +// TODO(wangran16): wait for MIOpen to improve the 
performance of BN +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenBatchNormalizationBackward( +// dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), +// CudnnDataType::kZero(), CudnnDataType::kOne(), +// CudnnDataType::kZero(), data_desc_, +// transformed_x.template data(), data_desc_, +// transformed_d_y.template data(), data_desc_, +// transformed_d_x.template mutable_data(ctx.GetPlace()), +// bn_param_desc_, scale->template data>(), +// d_scale->template mutable_data>( +// ctx.GetPlace()), +// d_bias->template mutable_data>( +// ctx.GetPlace()), +// epsilon, saved_mean_data, saved_var_data)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationBackward( + ctx.cudnn_handle(), + mode_, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + transformed_x.template data(), + data_desc_, + transformed_d_y.template data(), + data_desc_, + ctx.template Alloc(&transformed_d_x), + bn_param_desc_, + scale.template data>(), + d_scale->template mutable_data>( + ctx.GetPlace()), + d_bias->template mutable_data>( + ctx.GetPlace()), + epsilon, + saved_mean_data, + saved_var_data)); +#endif + } + + if (data_layout == DataLayout::kNHWC && + compute_format == DataLayout::kNCHW) { + VLOG(3) << "Transform batchnorm output from NCHW to NHWC"; + TransToChannelLast(ctx, &transformed_d_x, d_x); + } + } else { + // This branch call CUDA kernels + if (compute_format == DataLayout::kNCHW) { + if (d_x) { + BNBackwardData< + T, + block, + phi::DataLayout::kNCHW><<>>( + d_y->data(), + scale.data>(), + saved_mean_data, + x.data(), + saved_var_data, + C, + N, + H * W * D, + d_x->data()); + } + if (d_scale && d_bias) { + KeBNBackwardScaleBias< + T, + block, + phi::DataLayout::kNCHW><<>>( + d_y->data(), + x.data(), + saved_mean_data, + saved_var_data, + epsilon, + N, + C, + H * W * D, + d_scale->data>(), + d_bias->data>()); + } + } else { + if (d_x) { + BNBackwardData< + T, + block, + phi::DataLayout::kNHWC><<>>( + d_y->data(), + scale.data>(), + saved_mean_data, + x.data(), + saved_var_data, + C, + N, + H * W * D, + d_x->data()); + } + if (d_scale && d_bias) { + KeBNBackwardScaleBias< + T, + block, + phi::DataLayout::kNHWC><<>>( + d_y->data(), + x.data(), + saved_mean_data, + saved_var_data, + epsilon, + N, + C, + H * W * D, + d_scale->data>(), + d_bias->data>()); + } + } + } + +#ifdef PADDLE_WITH_HIP +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// clean when exit. +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); +#else + // clean when exit. 
+ PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDestroyTensorDescriptor( + bn_param_desc_)); +#endif + } else { + const auto *running_mean = mean.get_ptr(); + const auto *running_var = variance.get_ptr(); + + const auto *running_mean_data = + running_mean->template data>(); + const auto *running_var_data = + running_var->template data>(); + + if (is_inplace) { + auto px = x; + inplace_functor(data_layout, + ctx.template Alloc(&px), + scale.template data>(), + bias.template data>(), + running_mean_data, + running_var_data, + epsilon, + C, + H * W * D, + num, + x.data(), + grid2, + block, + stream); + } + + if (compute_format == DataLayout::kNCHW) { + if (d_x) { + KeBNBackwardData<<>>( + d_y->data(), + scale.data>(), + running_var_data, + epsilon, + C, + H * W, + num, + d_x->data()); + } + if (d_scale && d_bias) { + KeBNBackwardScaleBias< + T, + block, + phi::DataLayout::kNCHW><<>>( + d_y->data(), + x.data(), + running_mean_data, + running_var_data, + epsilon, + N, + C, + H * W * D, + d_scale->data>(), + d_bias->data>()); + } + } else { + if (d_x) { + KeBNBackwardData<<>>( + d_y->data(), + scale.data>(), + running_var_data, + epsilon, + C, + H * W, + num, + d_x->data()); + } + if (d_scale && d_bias) { + KeBNBackwardScaleBias< + T, + block, + phi::DataLayout::kNHWC><<>>( + d_y->data(), + x.data(), + running_mean_data, + running_var_data, + epsilon, + N, + C, + H * W * D, + d_scale->data>(), + d_bias->data>()); + } + } + } +} + +template +void BatchNormGradKernel(const Context &dev_ctx, + const DenseTensor &y_grad, + const DenseTensor &x, + const DenseTensor &scale, + const DenseTensor &bias, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + paddle::optional reserve_space, + paddle::optional mean, + paddle::optional variance, + float momentum, + float epsilon, + const std::string &data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor *x_grad, + DenseTensor *scale_grad, + DenseTensor *bias_grad) { + BatchNormGradRawKernel(dev_ctx, + y_grad, + x, + scale, + bias, + saved_mean, + saved_variance, + reserve_space, + mean, + variance, + momentum, + epsilon, + data_layout, + is_test, + use_global_stats, + trainable_statistics, + fuse_with_relu, + false, + x_grad, + scale_grad, + bias_grad); +} + +template +void BatchNormDoubleGradKernel(const Context &ctx, + const DenseTensor &x_grad_grad, + const DenseTensor &scale_grad_grad, + const DenseTensor &bias_grad_grad, + const DenseTensor &y_grad, + const DenseTensor &x, + const DenseTensor &scale, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + paddle::optional mean, + paddle::optional variance, + float momentum, + float epsilon, + const std::string &data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor *x_grad, + DenseTensor *scale_grad, + DenseTensor *y_grad_grad) { + PADDLE_ENFORCE_EQ(is_test, + false, + phi::errors::InvalidArgument( + "`is_test = True` CANNOT be used in train program. 
If " + "you want to use global status in pre_train model, " + "please set `use_global_stats = True`")); + + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + + const DenseTensor *running_mean = nullptr; + const DenseTensor *running_variance = nullptr; + if (use_global_stats) { + running_mean = mean.get_ptr(); + running_variance = variance.get_ptr(); + } + paddle::operators::NormDoubleGradFunctor(ctx, + data_layout, + &x, + &scale, + &y_grad, + &saved_mean, + &saved_variance, + running_mean, + running_variance, + epsilon, + use_global_stats, + &x_grad_grad, + &scale_grad_grad, + &bias_grad_grad, + x_grad, + scale_grad, + y_grad_grad); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(batch_norm_grad, + GPU, + ALL_LAYOUT, + phi::BatchNormGradKernel, + float, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(batch_norm_grad_raw, + GPU, + ALL_LAYOUT, + phi::BatchNormGradRawKernel, + float, + phi::dtype::float16) {} +#else +PD_REGISTER_KERNEL(batch_norm_grad, + GPU, + ALL_LAYOUT, + phi::BatchNormGradKernel, + float, + double, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + } +} + +PD_REGISTER_KERNEL(batch_norm_grad_raw, + GPU, + ALL_LAYOUT, + phi::BatchNormGradRawKernel, + float, + double, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + } +} + +#endif + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(batch_norm_grad_grad, + GPU, + ALL_LAYOUT, + phi::BatchNormDoubleGradKernel, + float, + double) {} + +#else +PD_REGISTER_KERNEL(batch_norm_grad_grad, + GPU, + ALL_LAYOUT, + phi::BatchNormDoubleGradKernel, + float, + double) {} +#endif diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu new file mode 100644 index 00000000000..6ad12245d2a --- /dev/null +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -0,0 +1,680 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/batch_norm_kernel.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +#include "paddle/fluid/operators/norm_utils.cu.h" +#include "paddle/fluid/operators/norm_utils.h" + +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/operators/layout_utils.h" +#include "paddle/fluid/platform/enforce.h" + +#include "paddle/fluid/platform/flags.h" +#include "paddle/phi/kernels/gpu/batch_norm_utils.h" + +#ifdef __HIPCC__ +#define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim) +#else +#define LAUNCH_BOUNDS(BlockDim) +#endif + +DECLARE_bool(cudnn_batchnorm_spatial_persistent); + +namespace phi { + +template +using CudnnDataType = paddle::platform::CudnnDataType; +template +using BatchNormParamType = typename CudnnDataType::BatchNormParamType; + +template +static __global__ void BNForwardInference(const T *x, + const BatchNormParamType *mean, + const BatchNormParamType *variance, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + const double epsilon, + T *y) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + int num = N * C * HxW; + for (int i = gid; i < num; i += stride) { + const int c = layout == phi::DataLayout::kNCHW ? i / HxW % C : i % C; + BatchNormParamType x_sub_mean = + static_cast>(x[i]) - mean[c]; + BatchNormParamType inv_var = 1 / sqrt(variance[c] + epsilon); + y[i] = static_cast(scale[c] * x_sub_mean * inv_var + bias[c]); + } +} + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + const double epsilon, + double exponentialAverageFactor, + T *y, + BatchNormParamType *mean, + BatchNormParamType *variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance) { + int outer_size = C; + int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage mean_storage; + __shared__ typename BlockReduce::TempStorage variance_storeage; + __shared__ BatchNormParamType mean_val; + __shared__ BatchNormParamType variance_val; + __shared__ BatchNormParamType inv_var_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = static_cast>(0); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? 
(j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); + x_square_sum = + BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); + if (threadIdx.x == 0) { + mean_val = x_sum / inner_size; + variance_val = x_square_sum / inner_size - mean_val * mean_val; + inv_var_val = 1 / sqrt(variance_val + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = mean_val; + save_inv_variance[i] = inv_var_val; + } + mean[i] = (1 - exponentialAverageFactor) * mean_val + + exponentialAverageFactor * mean[i]; + variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * variance[i]; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_sub_mean = + static_cast>(x[index]) - mean_val; + y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; + } + } +} + +template +void BatchNormKernel(const Context &ctx, + const DenseTensor &x, + const DenseTensor &scale, + const DenseTensor &bias, + const DenseTensor &mean, + const DenseTensor &variance, + float momentum, + float epsilon_f, + const std::string &data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor *y, + DenseTensor *mean_out, + DenseTensor *variance_out, + DenseTensor *saved_mean, + DenseTensor *saved_variance, + DenseTensor *reserve_space) { + double epsilon = epsilon_f; + const bool trainable_stats = trainable_statistics; + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + bool test_mode = is_test && (!trainable_stats); + + // Get the size for each dimension. + // NCHW [batch_size, in_channels, in_height, in_width] + const auto &x_dims = x.dims(); + PADDLE_ENFORCE_EQ( + x_dims.size() >= 2 && x_dims.size() <= 5, + true, + phi::errors::InvalidArgument( + "The size of input's dimensions should be between 2 and 5" + "But received: the size of input's dimensions is [%d]", + x_dims.size())); + + ctx.template Alloc(y); + int N, C, H, W, D; + paddle::operators::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + + auto dtype = paddle::platform::CudnnDataType::type; + +#ifdef PADDLE_WITH_HIP + auto compute_format = + data_layout == DataLayout::kNHWC ? DataLayout::kNHWC : DataLayout::kNCHW; + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// HIP do not support compute format of NHWC +// auto compute_format = DataLayout::kNCHW; +#else + const bool fast_nhwc_batch_norm = + test_mode || + (dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent); + + auto compute_format = fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC + ? 
DataLayout::kNHWC + : DataLayout::kNCHW; +#endif + + DenseTensor transformed_x(x.type()); + DenseTensor transformed_y(y->type()); + + if (data_layout == DataLayout::kNHWC && compute_format == DataLayout::kNCHW && + x_dims.size() > 2) { + VLOG(3) << "Transform input tensor from NHWC to NCHW."; + ResizeToChannelFirst(ctx, &x, &transformed_x); + TransToChannelFirst(ctx, &x, &transformed_x); + ResizeToChannelFirst(ctx, y, &transformed_y); + } else { + transformed_x.ShareDataWith(x); + transformed_y.ShareDataWith(*y); + } + +// ------------------- cudnn descriptors --------------------- +#ifdef PADDLE_WITH_HIP +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// miopenTensorDescriptor_t data_desc_; +// miopenTensorDescriptor_t bn_param_desc_; +// miopenBatchNormMode_t mode_; + +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); +#else + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t bn_param_desc_; + cudnnBatchNormMode_t mode_; + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); +#endif + + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); + +#ifdef PADDLE_WITH_HIP +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// mode_ = miopenBNSpatial; +#elif CUDNN_VERSION_MIN(7, 0, 1) + if (FLAGS_cudnn_batchnorm_spatial_persistent) { + mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; + } else if (H == 1 && W == 1) { + mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; + } else { + mode_ = CUDNN_BATCHNORM_SPATIAL; + } +#else + if (H == 1 && W == 1) { + mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; + } else { + mode_ = CUDNN_BATCHNORM_SPATIAL; + } +#endif // CUDNN_VERSION_MIN(7, 0, 1) + + VLOG(3) << "Setting descriptors."; + std::vector dims; + std::vector strides; + if (compute_format == DataLayout::kNCHW) { + dims = {N, C, H, W, D}; + strides = {C * H * W * D, H * W * D, W * D, D, 1}; + } else { + dims = {N, C, H, W, D}; + strides = {H * W * D * C, 1, W * D * C, D * C, C}; + } + +#ifdef PADDLE_WITH_HIP +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( +// data_desc_, CudnnDataType::type, +// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), +// const_cast(strides.data()))); +// Note: PERSISTENT not implemented for inference +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenDeriveBNTensorDescriptor( +// bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, + dims.data(), + strides.data())); + // Note: PERSISTENT not implemented for inference + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, + data_desc_, + test_mode ? CUDNN_BATCHNORM_SPATIAL : mode_)); +#endif + + auto handle = ctx.cudnn_handle(); + + // Now, depending on whether we are running test or not, we have two paths. 
+ // It is training mode when it's not reference AND not using pre-trained + // model. + bool training = !test_mode && !use_global_stats; + if (!training) { + // only when test we use input to do computation. + const auto *est_mean = &mean; + const auto *est_var = &variance; + // Run inference mode. + PADDLE_ENFORCE_EQ( + est_mean->dims().size(), + 1UL, + phi::errors::InvalidArgument( + "The size of mean's dimensions must equal to 1." + "But received: the size of mean's dimensions mean is [%d]," + "the dimensions of mean is [%s].", + est_mean->dims().size(), + est_mean->dims())); + PADDLE_ENFORCE_EQ( + est_var->dims().size(), + 1UL, + phi::errors::InvalidArgument( + "The size of variance's dimensions must equal to 1." + "But received: the size of variance's dimensions is [%d]," + "the dimensions of variance is [%s].", + est_var->dims().size(), + est_var->dims())); + PADDLE_ENFORCE_EQ( + est_mean->dims()[0], + C, + phi::errors::InvalidArgument( + "The first dimension of mean must equal to the number of " + "Channels, which is [%d]. But received: the first dimension" + "of mean is [%d], the dimensions of mean is [%s].", + C, + est_mean->dims()[0], + est_mean->dims())); + PADDLE_ENFORCE_EQ( + est_var->dims()[0], + C, + phi::errors::InvalidArgument( + "The first dimension of variance must equal to the number" + "of Channels, which is [%d]. But received: the first dimension of" + "variance is [%d], the dimensions of variance is [%s].", + C, + est_var->dims()[0], + est_var->dims())); + +#ifdef PADDLE_WITH_HIP + const int block_size = 256; + const int grid_size = (N * C * H * W * D + block_size - 1) / block_size; + if (compute_format == DataLayout::kNCHW) { + BNForwardInference< + T, + DataLayout::kNCHW><<>>( + transformed_x.template data(), + est_mean->template data>(), + est_var->template data>(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + epsilon, + transformed_y.template data()); + } else { + BNForwardInference< + T, + DataLayout::kNHWC><<>>( + transformed_x.template data(), + est_mean->template data>(), + est_var->template data>(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + epsilon, + transformed_y.template data()); + } +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenBatchNormalizationForwardInference( +// handle, miopenBNSpatial, +// const_cast( +// static_cast(CudnnDataType::kOne())), +// const_cast( +// static_cast(CudnnDataType::kZero())), +// data_desc_, +// static_cast(transformed_x.template data()), +// data_desc_, +// static_cast( +// transformed_y.template mutable_data(ctx.GetPlace())), +// bn_param_desc_, +// const_cast(static_cast( +// scale->template data>())), +// const_cast(static_cast( +// bias->template data>())), +// const_cast(static_cast( +// est_mean->template data>())), +// const_cast(static_cast( +// est_var->template data>())), +// epsilon)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationForwardInference( + handle, + // Note: PERSISTENT not implemented for inference + CUDNN_BATCHNORM_SPATIAL, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + transformed_x.template data(), + data_desc_, + ctx.template Alloc(&transformed_y), + bn_param_desc_, + scale.template data>(), + bias.template data>(), + est_mean->template data>(), + est_var->template data>(), + epsilon)); +#endif + } else { + // if MomentumTensor is set, use MomentumTensor value, momentum + // is only used 
in this training branch + + // need to solve here + // if (ctx.HasInput("MomentumTensor")) { + // const auto *mom_tensor = MomentumTensor; + // DenseTensor mom_cpu; + // paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(), + // &mom_cpu); + // momentum = mom_cpu.data()[0]; + // } + + // Run training mode. + // obtain running mean and running inv var, and there is no need + // to initialize them. + mean_out->mutable_data>(ctx.GetPlace()); + variance_out->mutable_data>(ctx.GetPlace()); + + saved_mean->mutable_data>(ctx.GetPlace()); + saved_variance->mutable_data>(ctx.GetPlace()); + + if ((N * H * W * D) == 1) { + // Only 1 element in normalization dimension, + // skip the batch norm calculation, let y = x. + paddle::framework::TensorCopy(x, ctx.GetPlace(), y); + } else { + double this_factor = 1. - momentum; + + bool called = false; +#if CUDNN_VERSION_MIN(7, 4, 1) + called = true; + size_t workspace_size = 0; + size_t reserve_space_size = 0; + void *reserve_space_ptr = nullptr; + void *workspace_ptr = nullptr; + DenseTensor workspace_tensor; + // Create reserve space and workspace for batch norm. + // Create tensor for each batchnorm op, it will be used in the + // backward. Thus this tensor shouldn't be temp. + // auto *reserve_space = ctx.Output("ReserveSpace"); + PADDLE_ENFORCE_NOT_NULL( + reserve_space, + phi::errors::NotFound( + "The argument ReserveSpace of batch_norm op is not found.")); + // --------------- cudnn batchnorm workspace --------------- + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload:: + cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( + /*handle=*/handle, + /*mode=*/mode_, + /*bnIps=*/CUDNN_BATCHNORM_OPS_BN, + /*xDesc=*/data_desc_, + /*zDesc=*/nullptr, + /*yDesc=*/data_desc_, + /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, + /*activationDesc=*/nullptr, + /*sizeInBytes=*/&workspace_size)); + + // -------------- cudnn batchnorm reserve space -------------- + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload:: + cudnnGetBatchNormalizationTrainingExReserveSpaceSize( + /*handle=*/handle, + /*mode=*/mode_, + /*bnOps=*/CUDNN_BATCHNORM_OPS_BN, + /*activationDesc=*/nullptr, + /*xDesc=*/data_desc_, + /*sizeInBytes=*/&reserve_space_size)); + + reserve_space_ptr = reserve_space->mutable_data( + ctx.GetPlace(), transformed_x.type(), reserve_space_size); + workspace_ptr = workspace_tensor.mutable_data( + ctx.GetPlace(), transformed_x.type(), workspace_size); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationForwardTrainingEx( + handle, + mode_, + CUDNN_BATCHNORM_OPS_BN, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + transformed_x.template data(), + nullptr, + nullptr, + data_desc_, + transformed_y.template data(), + bn_param_desc_, + scale.template data>(), + bias.template data>(), + this_factor, + mean_out->template mutable_data>( + ctx.GetPlace()), + variance_out->template mutable_data>( + ctx.GetPlace()), + epsilon, + saved_mean->template mutable_data>( + ctx.GetPlace()), + saved_variance->template mutable_data>( + ctx.GetPlace()), + nullptr, + workspace_ptr, + workspace_size, + reserve_space_ptr, + reserve_space_size)); +#endif // CUDNN_VERSION_MIN(7, 4, 1) + if (!called) { +#ifdef PADDLE_WITH_HIP + const int num = transformed_x.numel(); + const int block = 256; + const int max_threads = ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = std::min(C, max_blocks); + if (compute_format == DataLayout::kNCHW) { + BNForwardTraining< + T, + 
block, + DataLayout::kNCHW><<>>( + transformed_x.template data(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + epsilon, + this_factor, + transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } else { + BNForwardTraining< + T, + block, + DataLayout::kNHWC><<>>( + transformed_x.template data(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + epsilon, + this_factor, + transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenBatchNormalizationForwardTraining( +// handle, mode_, const_cast(static_cast( +// CudnnDataType::kOne())), +// const_cast( +// static_cast(CudnnDataType::kZero())), +// data_desc_, +// static_cast(transformed_x.template data()), +// data_desc_, +// static_cast( +// transformed_y.template mutable_data(ctx.GetPlace())), +// bn_param_desc_, +// const_cast(static_cast( +// scale->template data>())), +// const_cast(static_cast( +// bias->template data>())), +// this_factor, +// static_cast( +// mean_out->template mutable_data>( +// ctx.GetPlace())), +// static_cast(variance_out->template mutable_data< +// BatchNormParamType>(ctx.GetPlace())), +// epsilon, +// static_cast( +// saved_mean->template mutable_data>( +// ctx.GetPlace())), +// static_cast(saved_variance->template mutable_data< +// BatchNormParamType>(ctx.GetPlace())))); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationForwardTraining( + handle, + mode_, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + transformed_x.template data(), + data_desc_, + ctx.template Alloc(&transformed_y), + bn_param_desc_, + scale.template data>(), + bias.template data>(), + this_factor, + mean_out->template mutable_data>( + ctx.GetPlace()), + variance_out->template mutable_data>( + ctx.GetPlace()), + epsilon, + saved_mean->template mutable_data>( + ctx.GetPlace()), + saved_variance->template mutable_data>( + ctx.GetPlace()))); +#endif + } + } + } + + if (data_layout == DataLayout::kNHWC && compute_format == DataLayout::kNCHW && + x_dims.size() > 2) { + VLOG(3) << "Transform batchnorm output from NCHW to NHWC"; + TransToChannelLast(ctx, &transformed_y, y); + } +#ifdef PADDLE_WITH_HIP +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// clean when exit. +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); +#else + // clean when exit. 
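+  // data_desc_ describes the input/output tensor layout and bn_param_desc_
+  // describes the scale/bias/mean/variance parameters; both descriptors were
+  // created earlier in this kernel, so they are destroyed here to avoid
+  // leaking a pair of cuDNN handles on every forward call.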
+ PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); +#endif +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(batch_norm, + GPU, + ALL_LAYOUT, + phi::BatchNormKernel, + float, + phi::dtype::float16) {} +#else +PD_REGISTER_KERNEL(batch_norm, + GPU, + ALL_LAYOUT, + phi::BatchNormKernel, + float, + double, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + } +} + +#endif diff --git a/paddle/phi/kernels/gpu/batch_norm_utils.h b/paddle/phi/kernels/gpu/batch_norm_utils.h new file mode 100644 index 00000000000..c9c62026edf --- /dev/null +++ b/paddle/phi/kernels/gpu/batch_norm_utils.h @@ -0,0 +1,142 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +using Tensor = DenseTensor; + +template +inline void ResizeToChannelFirst(const DeviceContext& context, + const Tensor* input, + Tensor* transformed_input) { + int dim = input->dims().size() - 2; + if (dim == 3) { + // input + transformed_input->Resize(input->dims()); + + auto in_dims_vec = phi::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[4]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + in_dims_vec[4] = input->dims()[3]; + transformed_input->Resize(phi::make_ddim(in_dims_vec)); + context.template Alloc(transformed_input); + + } else if (dim == 2) { + // input + transformed_input->Resize(input->dims()); + + auto in_dims_vec = phi::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[3]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + transformed_input->Resize(phi::make_ddim(in_dims_vec)); + context.template Alloc(transformed_input); + } else if (dim == 1) { + transformed_input->Resize(input->dims()); + + auto in_dims_vec = phi::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[1]; + transformed_input->Resize(phi::make_ddim(in_dims_vec)); + context.template Alloc(transformed_input); + } +} + +template +inline void ResizeToChannelLast(const DeviceContext& context, + const Tensor* input, + Tensor* transformed_input) { + int dim = input->dims().size() - 2; + if (dim == 3) { + // input + transformed_input->Resize(input->dims()); + + auto in_dims_vec = phi::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[3]; + in_dims_vec[3] = input->dims()[4]; + in_dims_vec[4] = input->dims()[1]; + transformed_input->Resize(phi::make_ddim(in_dims_vec)); + context.template Alloc(transformed_input); + + } 
else if (dim == 2) { + // input + transformed_input->Resize(input->dims()); + + auto in_dims_vec = phi::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[3]; + in_dims_vec[3] = input->dims()[1]; + transformed_input->Resize(phi::make_ddim(in_dims_vec)); + context.template Alloc(transformed_input); + } else if (dim == 1) { + transformed_input->Resize(input->dims()); + + auto in_dims_vec = phi::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[1]; + transformed_input->Resize(phi::make_ddim(in_dims_vec)); + context.template Alloc(transformed_input); + } +} + +template +inline void TransToChannelFirst(const DeviceContext& context, + const Tensor* input, + Tensor* transformed_input) { + VLOG(5) << "Why am I called?"; + int dim = input->dims().size() - 2; + if (dim == 3) { + std::vector axis{0, 4, 1, 2, 3}; + funcs::Transpose trans5; + trans5(context, *input, transformed_input, axis); + + } else if (dim == 2) { + std::vector axis{0, 3, 1, 2}; + funcs::Transpose trans4; + trans4(context, *input, transformed_input, axis); + } else if (dim == 1) { + std::vector axis{0, 2, 1}; + funcs::Transpose trans3; + trans3(context, *input, transformed_input, axis); + } +} + +template +inline void TransToChannelLast(const DeviceContext& context, + const Tensor* input, + Tensor* transformed_input) { + int dim = input->dims().size() - 2; + if (dim == 3) { + std::vector axis{0, 2, 3, 4, 1}; + funcs::Transpose trans5; + trans5(context, *input, transformed_input, axis); + + } else if (dim == 2) { + std::vector axis{0, 2, 3, 1}; + funcs::Transpose trans4; + trans4(context, *input, transformed_input, axis); + } else if (dim == 1) { + std::vector axis{0, 2, 1}; + funcs::Transpose trans3; + trans3(context, *input, transformed_input, axis); + } +} + +} // namespace phi diff --git a/paddle/phi/ops/compat/batch_norm_sig.cc b/paddle/phi/ops/compat/batch_norm_sig.cc new file mode 100644 index 00000000000..011d4c12ece --- /dev/null +++ b/paddle/phi/ops/compat/batch_norm_sig.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
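+//
+// This compatibility file maps the legacy fluid `batch_norm` operator onto the
+// new phi kernels: each function below pairs the operator's named inputs
+// (e.g. "X", "Scale", "Bias"), attributes (e.g. "momentum", "epsilon") and
+// outputs (e.g. "Y", "MeanOut") with the positional arguments of the
+// "batch_norm", "batch_norm_grad" and "batch_norm_grad_grad" kernel
+// signatures, so existing programs keep dispatching correctly after the
+// kernel migration.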
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature BatchNormOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("batch_norm", + {"X", "Scale", "Bias", "Mean", "Variance"}, + {"momentum", + "epsilon", + "data_layout", + "is_test", + "use_global_stats", + "trainable_statistics", + "fuse_with_relu"}, + {"Y", + "MeanOut", + "VarianceOut", + "SavedMean", + "SavedVariance", + "ReserveSpace"}); +} + +KernelSignature BatchNormGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "batch_norm_grad", + {GradVarName("Y"), + "X", + "Scale", + "Bias", + "SavedMean", + "SavedVariance", + "ReserveSpace", + "Mean", + "Variance"}, + {"momentum", + "epsilon", + "data_layout", + "is_test", + "use_global_stats", + "trainable_statistics", + "fuse_with_relu"}, + {GradVarName("X"), GradVarName("Scale"), GradVarName("Bias")}); +} + +KernelSignature BatchNormGradGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("batch_norm_grad_grad", + {"DDX", + "DDScale", + "DDBias", + "DY", + "X", + "Scale", + "SavedMean", + "SavedVariance", + "Mean", + "Variance"}, + {"momentum", + "epsilon", + "data_layout", + "is_test", + "use_global_stats", + "trainable_statistics", + "fuse_with_relu"}, + {"DX", "DScale", "DDY"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(batch_norm, phi::BatchNormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(batch_norm_grad, + phi::BatchNormGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(batch_norm_grad_grad, + phi::BatchNormGradGradOpArgumentMapping); diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py index 30c1955adcf..c6f491a5484 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py @@ -520,6 +520,7 @@ def predict_static(args, data): paddle.enable_static() exe = fluid.Executor(args.place) # load inference model + [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model( args.model_save_dir, diff --git a/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py b/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py index 4552d600baf..2b281d7d6f7 100644 --- a/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py +++ b/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py @@ -162,6 +162,7 @@ class TestIRPassBase(unittest.TestCase): for k, v in self.get_strategy().items(): setattr(build_strategy, k, v) self.check_before_applied(main2, startup2) + apply_build_strategy(main2, startup2, build_strategy, {"use_cuda": self.use_cuda}) self.check_after_applied(main2, startup2) diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py index cce13a8bf3b..b02df024518 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py @@ -320,7 +320,7 @@ class TestBatchNormOpInference(unittest.TestCase): def test_check_output(self): places = [core.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): places.append(core.CUDAPlace(0)) for place in places: @@ -342,13 +342,13 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): def test_check_output(self): places = [] 
- if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) if core.is_float16_supported(place): places.append(place) - for place in places: - for data_format in ["NCHW", "NHWC"]: + #for data_format in ["NCHW", "NHWC"]: + for data_format in ["NCHW"]: self.check_with_place(place, data_format, self.dtype, [2, 3, 4, 5]) self.check_with_place(place, data_format, self.dtype, [2, 3]) @@ -517,7 +517,7 @@ class TestBatchNormOpTraining(unittest.TestCase): places = [core.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): places.append(core.CUDAPlace(0)) for place in places: @@ -657,7 +657,7 @@ class TestDygraphBatchNormAPIError(unittest.TestCase): class TestDygraphBatchNormTrainableStats(unittest.TestCase): def test_dygraph(self): places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for p in places: shape = [4, 10, 4, 4] @@ -678,7 +678,7 @@ class TestDygraphBatchNormTrainableStats(unittest.TestCase): def test_static(self): places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for p in places: exe = fluid.Executor(p) @@ -716,4 +716,6 @@ class TestDygraphBatchNormOpenReserveSpace(unittest.TestCase): if __name__ == '__main__': + import paddle + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py index 6a6f85a4832..c9abac8fb79 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py @@ -28,7 +28,7 @@ import paddle class TestBatchNorm(unittest.TestCase): def test_name(self): places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for p in places: with fluid.dygraph.guard(p): @@ -36,7 +36,7 @@ class TestBatchNorm(unittest.TestCase): def test_error(self): places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for p in places: #paddle.disable_static() @@ -83,7 +83,7 @@ class TestBatchNorm(unittest.TestCase): def test_dygraph(self): places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for p in places: shape = [4, 10, 4, 4] @@ -135,7 +135,7 @@ class TestBatchNorm(unittest.TestCase): def test_static(self): places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for p in places: exe = fluid.Executor(p) @@ -177,7 +177,7 @@ class TestBatchNormChannelLast(unittest.TestCase): else: paddle.set_default_dtype("float64") self.places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): self.places.append(fluid.CUDAPlace(0)) def tearDown(self): @@ -247,7 +247,7 @@ class TestBatchNormChannelLast(unittest.TestCase): class TestBatchNormUseGlobalStats(unittest.TestCase): def setUp(self): self.places = [fluid.CPUPlace()] - if 
core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): self.places.append(fluid.CUDAPlace(0)) self.init_test() @@ -300,4 +300,6 @@ class TestBatchNormUseGlobalStatsCase3(TestBatchNormUseGlobalStats): if __name__ == '__main__': + import paddle + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index 8ea4e369d32..826f886dab1 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -16,6 +16,7 @@ from __future__ import print_function import unittest import numpy as np +import paddle import paddle import paddle.fluid.core as core @@ -1001,4 +1002,5 @@ create_test_cudnn_channel_last_fp16_class( TestWithDilation_AsyPadding, grad_check=False) if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py index aee6ca249f5..a204c26c1b8 100644 --- a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py @@ -231,4 +231,5 @@ class TestExpandV2API(unittest.TestCase): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py b/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py index 077496200d9..67f6b910214 100644 --- a/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py +++ b/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py @@ -23,6 +23,7 @@ import paddle.fluid as fluid from paddle.fluid.layer_helper import LayerHelper from paddle.fluid import compiler import paddle.fluid.unique_name as unique_name +import paddle class TestInplaceANBOpTraining(unittest.TestCase): @@ -138,14 +139,14 @@ class TestInplaceANBOpTraining(unittest.TestCase): outs[0].name if not only_forward else None, build_strategy=build_strategy, exec_strategy=exec_strategy) - bn_fetches = exe.run(program=comp_prog1, + bn_fetches = exe.run(program=main, feed={'input': data}, fetch_list=fetch_name) fetch_outs.append(bn_fetches) fetch_names.append(fetch_name) - for bn_val, inplace_abn_val, name1, name2 in zip(*(fetch_outs + - fetch_names)): + for bn_val, inplace_abn_val, name1, name2 in zip(*( + fetch_outs + fetch_names)): self.assertTrue( np.allclose( bn_val, inplace_abn_val, atol=1e-2), @@ -156,6 +157,7 @@ class TestInplaceANBOpTraining(unittest.TestCase): def test_op(self): use_cudas = [False, True] if core.is_compiled_with_cuda() else [False] + #use_cudas = [False] for use_cuda in use_cudas: place = core.CUDAPlace(0) if use_cuda else core.CPUPlace() layouts = ["NCHW", "NHWC"] @@ -186,4 +188,5 @@ class TestInplaceANBOpTraining(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py index fe8c181b790..49fe397644d 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py @@ -21,6 +21,7 @@ import paddle.fluid as fluid import paddle.fluid.layers as layers import paddle.fluid.core as core import gradient_checker +import paddle from decorator_helper import prog_scope @@ -167,4 +168,5 @@ class TestBatchNormDoubleGradCheckCase6(TestBatchNormDoubleGradCheckCase5): if __name__ == 
"__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_program_prune_backward.py b/python/paddle/fluid/tests/unittests/test_program_prune_backward.py index b01c7cf1799..a1a3b31a976 100755 --- a/python/paddle/fluid/tests/unittests/test_program_prune_backward.py +++ b/python/paddle/fluid/tests/unittests/test_program_prune_backward.py @@ -24,6 +24,7 @@ from simple_nets import init_data, simple_fc_net, fc_with_batchnorm import seresnext_net from test_parallel_executor_transformer import transformer, get_feed_data_reader, DeviceType from fake_reader import fake_imdb_reader +import paddle def lstm_net(use_feed): @@ -309,4 +310,5 @@ class TestProgramPruneBackward(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py index c860d6972fb..40481b09782 100755 --- a/python/paddle/fluid/tests/unittests/test_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py @@ -507,4 +507,5 @@ class TestReshapeZeroTensor(unittest.TestCase): if __name__ == "__main__": + paddle.enable_static() unittest.main() -- GitLab From 71c69507cd9530cf49a72a8fcd083d2e8eb3e96b Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Thu, 3 Mar 2022 09:56:28 +0800 Subject: [PATCH 004/261] [Eager][YAML] Supported array-type parsing for output tensors (#40058) --- .../final_state_generator/eager_gen.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 02183e2ca5c..f2088dcda76 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -213,8 +213,12 @@ def ParseYamlReturns(string): returns = [x.strip() for x in string.strip().split(",")] for i in range(len(returns)): - ret = returns[i] - returns_list.append(["", ret, i]) + ret_type = returns[i] + + assert ret_type in yaml_types_mapping.keys() + ret_type = yaml_types_mapping[ret_type] + + returns_list.append(["", ret_type, i]) return returns_list -- GitLab From 6bf85eafc8dc0ab57c87bbf51e7ac225ba05776c Mon Sep 17 00:00:00 2001 From: zhangkaihuo Date: Thu, 3 Mar 2022 10:01:38 +0800 Subject: [PATCH 005/261] Implement SparseConv3d kernel (#39784) * sparse conv3d: gpu code --- paddle/phi/core/sparse_coo_tensor.h | 1 + paddle/phi/kernels/sparse/CMakeLists.txt | 2 +- .../kernels/sparse/cpu/convolution_kernel.cc | 4 +- .../kernels/sparse/gpu/convolution_kernel.cu | 612 ++++++++++++++++++ .../kernels/test_sparse_conv3d_dev_api.cc | 102 +++ 5 files changed, 717 insertions(+), 4 deletions(-) create mode 100644 paddle/phi/kernels/sparse/gpu/convolution_kernel.cu diff --git a/paddle/phi/core/sparse_coo_tensor.h b/paddle/phi/core/sparse_coo_tensor.h index 0dd5d543414..ca3290f33e6 100644 --- a/paddle/phi/core/sparse_coo_tensor.h +++ b/paddle/phi/core/sparse_coo_tensor.h @@ -145,6 +145,7 @@ class SparseCooTensor : public TensorBase, void* AllocateFrom(Allocator* allocator, DataType dtype, size_t requested_size = 0) override; + void set_dims(const DDim& dims) { this->dims_ = dims; } private: // save the indices of non zero elements in original dense tensor diff --git a/paddle/phi/kernels/sparse/CMakeLists.txt b/paddle/phi/kernels/sparse/CMakeLists.txt index 3e4a968b7a8..a319e9a13c3 100644 --- 
a/paddle/phi/kernels/sparse/CMakeLists.txt +++ b/paddle/phi/kernels/sparse/CMakeLists.txt @@ -1,3 +1,3 @@ -set(SPARSE_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils) +set(SPARSE_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils math_function) register_kernels(DEPS ${SPARSE_KERNEL_DEPS} SUB_DIR "sparse_kernel") diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc index fdf255bd542..93397d4c931 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc @@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/sparse/convolution_kernel.h" +#include "paddle/phi/kernels/sparse/cpu/convolution.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/sparse/cpu/convolution.h" namespace phi { namespace sparse { @@ -55,7 +54,6 @@ void Conv3dKernel(const Context& dev_ctx, // 1. product rulebook DenseTensorMeta counter_meta( DataType::INT32, {kernel_size}, DataLayout::NCHW); - // DenseTensor rulebook = phi::Empty(dev_ctx); DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); ProductRuleBook(dev_ctx, diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu new file mode 100644 index 00000000000..aeb9409c417 --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -0,0 +1,612 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include + +#include "glog/logging.h" +#include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/primitive/compute_primitives.h" +#include "paddle/phi/kernels/sparse/convolution_kernel.h" + +namespace phi { +namespace sparse { + +// TODO(zhangkaihuo) replace this kernel with KP::InitWithDataIndex +__global__ void InitByIndexKernel(const int n, int* out1, int* out2) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + for (int i = tid; i < n; i += gridDim.x * blockDim.x) { + out1[i] = i; + out2[i] = i; + } +} + +/** + * @brief: update the out index and indices + * unique_keys: save the index of the output feature list + * unique_values: indiates the index of key before deduplication + * out_indexs: indicates the position of the output index in the rulebook + * rulebook_len: indicates the length of rulebook + * out_dims: indicates the output dims + * out_indices: the indices of output, out_indices = IndexToPoint(unique_keys) + * rulebook_out_indexs: the output index in rulebook +**/ +__global__ void UpdateIndexKernel(const int* unique_keys, + const int* unique_values, + const int* out_indexs, + const int non_zero_num, + const int rulebook_len, + const Dims4D out_dims, + int* out_indices, + int* rulebook_out_indexs) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { + const int index = unique_keys[i]; + int batch, x, y, z; + IndexToPoint(index, out_dims, &batch, &x, &y, &z); + // get out indices + out_indices[i] = batch; + out_indices[i + non_zero_num] = z; + out_indices[i + non_zero_num * 2] = y; + out_indices[i + non_zero_num * 3] = x; + + // update rulebook + int start = unique_values[i]; + int end = i == non_zero_num - 1 ? 
rulebook_len : unique_values[i + 1]; + // max(end-start) = kernel_size + for (int j = start; j < end; j++) { + rulebook_out_indexs[out_indexs[j]] = i; + } + } +} + +/** + * @brief product rulebook + * for input_i in x_indices: + * if input_i participate in the convolution calculation: + * infer the output_i by input_i and kernel_i + * save output_i + * + * x_indices: the indices of input features + * x_dims: the input dims + * kernel_dims: the kernel dims + * out_dims: the output dims + * non_zero_num: the number of input features + * rulebook: the rulebook to save the kernel index, input index and output index + * counter: save the number of times each location in the kernel participates in + *the caculation +**/ +__global__ void ProductRuleBookKernel(const int* x_indices, + const Dims4D x_dims, + const Dims4D kernel_dims, + const Dims4D out_dims, + const int64_t non_zero_num, + const Dims4D paddings, + const Dims4D dilations, + const Dims4D strides, + int* rulebook, + int* counter) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + extern __shared__ int counter_buf[]; // kernel_size + const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1]; + const int offset = kernel_size * non_zero_num; + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + counter_buf[i] = 0; + } + __syncthreads(); + + for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { + int kernel_index = 0; + for (int kz = 0; kz < kernel_dims[1]; kz++) { + for (int ky = 0; ky < kernel_dims[2]; ky++) { + for (int kx = 0; kx < kernel_dims[3]; kx++) { + int batch = x_indices[i]; + int in_z = x_indices[i + non_zero_num]; + int in_y = x_indices[i + 2 * non_zero_num]; + int in_x = x_indices[i + 3 * non_zero_num]; + int in_i = -1, out_index = -1; + if (Check(x_dims, + kernel_dims, + paddings, + dilations, + strides, + in_x, + in_y, + in_z, + kx, + ky, + kz)) { + int out_z = (in_z + paddings[1] - kz * dilations[1]) / strides[1]; + int out_y = (in_y + paddings[2] - ky * dilations[2]) / strides[2]; + int out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3]; + in_i = i; + out_index = + PointToIndex(batch, out_x, out_y, out_z, out_dims); + atomicAdd(&counter_buf[kernel_index], 1); + } + rulebook[kernel_index * non_zero_num + i] = in_i; + rulebook[kernel_index * non_zero_num + offset + i] = out_index; + ++kernel_index; + } + } + } + } + __syncthreads(); + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + atomicAdd(&counter[i], counter_buf[i]); + } +} + +// TODO(zhangkaihuo): After the GatherCUDAKernel is migrated to phi, replace +// this kernel with phi::GatherCUDAKernel; +// Vectorization can be used to improve read and write bandwidth +/** + * brief: gather data from params according to indices + * params: the inputs + * indices: the indices you want to gather + * output: the outputs + * index_size: the size of indices + * slice_size: slice size corresponding to each index, here is the channel size +**/ +template +__global__ void GatherKernel(const T* params, + const IndexT* indices, + T* output, + size_t index_size, + size_t slice_size) { + CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { + int64_t indices_i = i / slice_size; + int64_t slice_i = i - indices_i * slice_size; // offset inside the slice + IndexT gather_i = indices[indices_i]; + int64_t params_i = gather_i * slice_size + slice_i; + *(output + i) = *(params + params_i); + } +} + +/** + * brief: scatter add + * input: the inputs + * unique_value: refer to UpdateIndexKernel notes + * out_index: the 
output feature index + * non_zero_num: the number of output features + * rulebook_len: the length of rulebook + * channels: the output channel size + * out: the outputs +**/ +template +__global__ void ScatterKernel(const T* input, + const int* unique_value, + const int* out_index, + const int non_zero_num, + const int rulebook_len, + const int channels, + T* out) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + for (int i = tid; i < non_zero_num * channels; i += gridDim.x * blockDim.x) { + int indices_i = i / channels; + int channels_i = i - indices_i * channels; + + int start = unique_value[indices_i]; + int end = indices_i == non_zero_num - 1 ? rulebook_len + : unique_value[indices_i + 1]; + // max(end-start) = kernel_size + T sum = static_cast(0); + for (int j = start; j < end; j++) { + const int out_feature_i = out_index[j]; + sum += input[out_feature_i * channels + channels_i]; + } + out[indices_i * channels + channels_i] = sum; + } +} + +// brief: calculation the distance between start and end +__global__ void DistanceKernel(const int* start, + const int* end, + int* distance) { + if (threadIdx.x == 0) { + *distance = end - start; + } +} + +// the basic algorithm can refer to convolution_kernel.cc or +// the second paper +// example: +// 1. the rulebook: +// the kernel_index: 0, 0, 0, 1, 1, 1, 2, 2, .... +// the out_index(key): 20, 30, 33, 30, 33, 20, 25 +// 2. mark the index of out_index(value): 0, 1, 2, 3, 4, 5, 6, .... +// 3. sorted the (key, value) +// 4. unique the (key, value): +// unique_key: 20, 25, 30, 33 +// unique_values: 0, 2, 3, 5 +// the index of unique_values is: 0, 1, 2, 3 +// 5. update the out_index by unique_key, uniqe_value and the index of +// unique_value: +// the new out_index: 0, 2, 3, 2, 3, 0, 1 +template +int ProductRuleBook(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const DDim& out_dims, + DenseTensor* rulebook, + DenseTensor* counter_per_kernel, + DenseTensor* offsets_per_kernel, + DenseTensor* out_index, + DenseTensor* unique_key, + DenseTensor* unique_value, + SparseCooTensor* out, + std::vector* h_counter, + std::vector* h_offsets) { + const auto& kernel_dims = kernel.dims(); + const int64_t non_zero_num = x.nnz(); + const auto& non_zero_indices = x.non_zero_indices(); + const int* indices_ptr = non_zero_indices.data(); + dev_ctx.Alloc(counter_per_kernel, + counter_per_kernel->dtype(), + sizeof(int) * counter_per_kernel->numel()); + int* counter_ptr = counter_per_kernel->data(); + dev_ctx.Alloc(offsets_per_kernel, + offsets_per_kernel->dtype(), + sizeof(int) * offsets_per_kernel->numel()); + int* offsets_ptr = offsets_per_kernel->data(); + int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; + rulebook->ResizeAndAllocate({2, kernel_size * non_zero_num}); + dev_ctx.Alloc(rulebook, rulebook->dtype(), sizeof(int) * rulebook->numel()); + int* rulebook_ptr = rulebook->data(); + + const auto x_dims = x.dims(); + Dims4D d_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]); + Dims4D d_kernel_dims(1, kernel_dims[2], kernel_dims[1], kernel_dims[0]); + Dims4D d_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]); + Dims4D d_paddings(1, paddings[2], paddings[1], paddings[0]); + Dims4D d_strides(1, strides[2], strides[1], strides[0]); + Dims4D d_dilations(1, dilations[2], dilations[1], dilations[0]); + + // 1. 
product rule book + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, counter_per_kernel, 0); + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); + + ProductRuleBookKernel<<>>(indices_ptr, + d_x_dims, + d_kernel_dims, + d_out_dims, + non_zero_num, + d_paddings, + d_dilations, + d_strides, + rulebook_ptr, + counter_ptr); + +// 2. remove -1 +#ifdef PADDLE_WITH_HIP + int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), +#else + int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), +#endif + rulebook_ptr, + rulebook_ptr + 2 * kernel_size * non_zero_num, + -1); + +#ifdef PADDLE_WITH_HIP + thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), +#endif + counter_ptr, + counter_ptr + kernel_size, + offsets_ptr); + +#ifdef PADDLE_WITH_HIP + phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], + counter_ptr, + kernel_size * sizeof(int), + hipMemcpyDeviceToHost, + dev_ctx.stream()); + phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], + offsets_ptr, + kernel_size * sizeof(int), + hipMemcpyDeviceToHost, + dev_ctx.stream()); +#else + phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], + counter_ptr, + kernel_size * sizeof(int), + cudaMemcpyDeviceToHost, + dev_ctx.stream()); + phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], + offsets_ptr, + kernel_size * sizeof(int), + cudaMemcpyDeviceToHost, + dev_ctx.stream()); +#endif + dev_ctx.Wait(); + int rulebook_len = + (*h_counter)[kernel_size - 1] + (*h_offsets)[kernel_size - 1]; + + // 3. sorted or merge the out index + out_index->ResizeAndAllocate({rulebook_len}); + unique_value->ResizeAndAllocate({rulebook_len}); + unique_key->ResizeAndAllocate({rulebook_len}); + dev_ctx.Alloc( + out_index, out_index->dtype(), sizeof(int) * out_index->numel()); + int* out_index_ptr = out_index->data(); + dev_ctx.Alloc( + unique_value, unique_value->dtype(), sizeof(int) * unique_value->numel()); + int* unique_value_ptr = unique_value->data(); + dev_ctx.Alloc( + unique_key, unique_key->dtype(), sizeof(int) * unique_key->numel()); + int* unique_key_ptr = unique_key->data(); + + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); + InitByIndexKernel<<>>( + rulebook_len, out_index_ptr, unique_value_ptr); + +#ifdef PADDLE_WITH_HIP + phi::backends::gpu::GpuMemcpyAsync(unique_key_ptr, + rulebook_ptr + rulebook_len, + rulebook_len * sizeof(int), + hipMemcpyDeviceToDevice, + dev_ctx.stream()); +#else + phi::backends::gpu::GpuMemcpyAsync(unique_key_ptr, + rulebook_ptr + rulebook_len, + rulebook_len * sizeof(int), + cudaMemcpyDeviceToDevice, + dev_ctx.stream()); +#endif + +// compared with thrust::sort_by_key, thrust::merge_by_key may achieved higher +// performance, but thrust::merge_by_key limited by data size +#ifdef PADDLE_WITH_HIP + thrust::sort_by_key(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::sort_by_key(thrust::cuda::par.on(dev_ctx.stream()), +#endif + unique_key_ptr, + unique_key_ptr + rulebook_len, + out_index_ptr); + + // 4. 
unique + thrust::pair new_end = +#ifdef PADDLE_WITH_HIP + thrust::unique_by_key(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::unique_by_key(thrust::cuda::par.on(dev_ctx.stream()), +#endif + unique_key_ptr, + unique_key_ptr + rulebook_len, + unique_value_ptr); + // thrust::distance doesn't support stream parameters + // const int out_non_zero_num = thrust::distance(unique_key_ptr, + // new_end.first); + DistanceKernel<<<1, 1>>>(unique_key_ptr, + new_end.first, + rulebook_ptr + 2 * kernel_size * non_zero_num - 1); + int out_non_zero_num = 0; +#ifdef PADDLE_WITH_HIP + phi::backends::gpu::GpuMemcpyAsync( + &out_non_zero_num, + rulebook_ptr + 2 * kernel_size * non_zero_num - 1, + sizeof(int), + hipMemcpyDeviceToHost, + dev_ctx.stream()); +#else + phi::backends::gpu::GpuMemcpyAsync( + &out_non_zero_num, + rulebook_ptr + 2 * kernel_size * non_zero_num - 1, + sizeof(int), + cudaMemcpyDeviceToHost, + dev_ctx.stream()); +#endif + dev_ctx.Wait(); + + // 5. update out_indices and rulebook by unique_value_ptr + const int64_t sparse_dim = 4; + DenseTensorMeta indices_meta( + DataType::INT32, {sparse_dim, out_non_zero_num}, DataLayout::NCHW); + DenseTensorMeta values_meta( + x.dtype(), {out_non_zero_num, kernel_dims[4]}, x.layout()); + phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta)); + phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta)); + + dev_ctx.Alloc( + &out_indices, out_indices.dtype(), sizeof(int) * out_indices.numel()); + int* out_indices_ptr = out_indices.data(); + + config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1); + UpdateIndexKernel<<>>(unique_key_ptr, + unique_value_ptr, + out_index_ptr, + out_non_zero_num, + rulebook_len, + d_out_dims, + out_indices_ptr, + rulebook_ptr + rulebook_len); + out->SetMember(out_indices, out_values, out_dims, true); + return rulebook_len; +} + +/** + * x: (N, D, H, W, C) + * kernel: (D, H, W, C, OC) + * out: (N, D, H, W, OC) +**/ +template +void Conv3dKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + SparseCooTensor* out, + DenseTensor* rulebook) { + // update padding and dilation + // Currently, only support x.layout is NDHWC, groups = 1 + // if x.layout != NDHWC then transpose(x), transpose(weight) + + const auto& x_dims = x.dims(); + const auto& kernel_dims = kernel.dims(); + int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; + DDim out_dims = {1, 1, 1, 1, 1}; + GetOutShape(x_dims, kernel_dims, paddings, dilations, strides, &out_dims); + out->set_dims(out_dims); + const int in_channels = kernel_dims[3]; + const int out_channels = kernel_dims[4]; + std::vector offsets(kernel_size + 1), h_counter(kernel_size); + + // Second algorithm: + // https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf + // 1. 
product rulebook + DenseTensorMeta counter_meta( + DataType::INT32, {kernel_size}, DataLayout::NCHW); + DenseTensorMeta offsets_meta( + DataType::INT32, {kernel_size}, DataLayout::NCHW); + DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); + DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(offsets_meta)); + DenseTensor out_index = phi::Empty(dev_ctx); + DenseTensor unique_key = phi::Empty(dev_ctx); + DenseTensor unique_value = phi::Empty(dev_ctx); + + int n = ProductRuleBook(dev_ctx, + x, + kernel, + paddings, + dilations, + strides, + out_dims, + rulebook, + &counter_per_kernel, + &offsets_per_kernel, + &out_index, + &unique_key, + &unique_value, + out, + &h_counter, + &offsets); + + const int* counter_ptr = counter_per_kernel.data(); + const int* offsets_ptr = counter_per_kernel.data(); + + // 2. gather + DenseTensorMeta in_features_meta( + x.dtype(), {n, in_channels}, DataLayout::NCHW); + DenseTensorMeta out_features_meta( + x.dtype(), {n, out_channels}, DataLayout::NCHW); + phi::DenseTensor in_features = + phi::Empty(dev_ctx, std::move(in_features_meta)); + phi::DenseTensor out_features = + phi::Empty(dev_ctx, std::move(out_features_meta)); + dev_ctx.Alloc( + &in_features, in_features.dtype(), sizeof(T) * in_features.numel()); + T* in_features_ptr = in_features.data(); + dev_ctx.Alloc( + &out_features, out_features.dtype(), sizeof(T) * out_features.numel()); + T* out_features_ptr = out_features.data(); + + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1); + GatherKernel<<>>(x.non_zero_elements().data(), + rulebook->data(), + in_features_ptr, + n, + in_channels); + + // 3. call gemm for every werght + auto blas = phi::funcs::GetBlas(dev_ctx); + auto* out_values = out->mutable_non_zero_elements(); + dev_ctx.Alloc( + out_values, out_values->dtype(), sizeof(T) * out_values->numel()); + T* out_values_ptr = out_values->data(); + + const T* kernel_ptr = kernel.data(); + for (int i = 0; i < kernel_size; i++) { + if (h_counter[i] <= 0) { + continue; + } + + // call gemm: (n, in_channels) * (in_channels, out_channels) + const int M = h_counter[i]; + const int K = in_channels; + const int N = out_channels; + T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels; + const T* tmp_kernel_ptr = kernel_ptr + i * K * N; + T* tmp_out_ptr = out_features_ptr + offsets[i] * out_channels; + + blas.GEMM(CblasNoTrans, + CblasNoTrans, + M, + N, + K, + static_cast(1), + tmp_in_ptr, + tmp_kernel_ptr, + static_cast(0), + tmp_out_ptr); + } + + // 4. scatter + config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, out->nnz() * out_channels, 1); + ScatterKernel<<>>(out_features_ptr, + unique_value.data(), + out_index.data(), + out->nnz(), + n, + out_channels, + out_values_ptr); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_conv3d, + GPU, + ALL_LAYOUT, + phi::sparse::Conv3dKernel, + float, + double, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index 00b2a256a95..ace95b55055 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include #include +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/place.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" @@ -151,6 +152,107 @@ void TestConv3dBase(const std::vector& indices, f_verify(grads[1].data(), kernel_grad); } } + +// test gpu +#if defined(PADDLE_WITH_CUDA) + phi::GPUContext dev_ctx_gpu; + dev_ctx_gpu.PartialInitWithoutAllocator(); + dev_ctx_gpu.SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(dev_ctx_gpu.GetPlace(), dev_ctx_gpu.stream()) + .get()); + dev_ctx_gpu.SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); + dev_ctx_gpu.PartialInitWithAllocator(); + + DenseTensor d_indices_tensor = phi::Empty( + dev_ctx_gpu, + DenseTensorMeta(DataType::INT32, {4, non_zero_num}, DataLayout::NCHW)); + dev_ctx_gpu.Alloc(&d_indices_tensor, + d_indices_tensor.dtype(), + sizeof(int) * d_indices_tensor.numel()); + phi::Copy( + dev_ctx_gpu, indices_tensor, phi::GPUPlace(), true, &d_indices_tensor); + + DenseTensor d_features_tensor = phi::Empty( + dev_ctx_gpu, + DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), + {non_zero_num, in_channels}, + DataLayout::NHWC)); + dev_ctx_gpu.Alloc(&d_features_tensor, + d_features_tensor.dtype(), + sizeof(T) * d_features_tensor.numel()); + phi::Copy( + dev_ctx_gpu, features_tensor, phi::GPUPlace(), true, &d_features_tensor); + + SparseCooTensor d_x_tensor(d_indices_tensor, d_features_tensor, x_dims); + + DenseTensor d_kernel_tensor = phi::Empty( + dev_ctx_gpu, + DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), + kernel_dims, + DataLayout::NHWC)); + dev_ctx_gpu.Alloc(&d_kernel_tensor, + d_kernel_tensor.dtype(), + sizeof(T) * d_kernel_tensor.numel()); + phi::Copy( + dev_ctx_gpu, kernel_tensor, phi::GPUPlace(), true, &d_kernel_tensor); + + DenseTensor d_rulebook = phi::Empty(dev_ctx_gpu); + SparseCooTensor d_out = sparse::Conv3d(dev_ctx_gpu, + d_x_tensor, + d_kernel_tensor, + paddings, + dilations, + strides, + 1, + &d_rulebook); + + ASSERT_EQ(correct_out_dims.size(), d_out.dims().size()); + ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, d_out.nnz()); + for (int i = 0; i < correct_out_dims.size(); i++) { + ASSERT_EQ(correct_out_dims[i], d_out.dims()[i]); + } + + DenseTensor h_indices_tensor = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta(DataType::INT32, {4, d_out.nnz()}, DataLayout::NCHW)); + dev_ctx_cpu.Alloc(&h_indices_tensor, + h_indices_tensor.dtype(), + sizeof(int) * h_indices_tensor.numel()); + phi::Copy(dev_ctx_gpu, + d_out.non_zero_indices(), + phi::CPUPlace(), + true, + &h_indices_tensor); + + int cmp_indices2 = memcmp(correct_out_indices.data(), + h_indices_tensor.data(), + correct_out_indices.size() * sizeof(int)); + ASSERT_EQ(cmp_indices2, 0); + + DenseTensor h_features_tensor = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), + {d_out.nnz()}, + d_out.layout())); + + dev_ctx_cpu.Alloc(&h_features_tensor, + h_features_tensor.dtype(), + sizeof(T) * h_features_tensor.numel()); + phi::Copy(dev_ctx_gpu, + d_out.non_zero_elements(), + phi::CPUPlace(), + true, + &h_features_tensor); + for (uint64_t i = 0; i < correct_out_features.size(); i++) { + float tmp = std::fabs(static_cast(correct_out_features[i] - + h_features_tensor.data()[i])); + ASSERT_LT(tmp, diff); + } +#endif } void TestConv3d(const std::vector& indices, -- GitLab From 
909d1e617c36cf19822cb3b96ea14783cda6dfff Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Thu, 3 Mar 2022 10:05:59 +0800 Subject: [PATCH 006/261] Modified Reduce for XPU2 (#38918) 1. set xpu2 block_size = 64 2. fix a bug when reduce_num is too large --- paddle/phi/kernels/gpu/reduce.h | 130 ++++++++++++++++++++------------ 1 file changed, 81 insertions(+), 49 deletions(-) diff --git a/paddle/phi/kernels/gpu/reduce.h b/paddle/phi/kernels/gpu/reduce.h index 9223a94c12a..94c2e980e36 100644 --- a/paddle/phi/kernels/gpu/reduce.h +++ b/paddle/phi/kernels/gpu/reduce.h @@ -178,6 +178,8 @@ struct IndexCalculator { : dim(dim) { dims = details::VectorToArray(cal_dims); strides = details::VectorToArray(full_strides); + reduce_strides = details::VectorToArray(cal_strides); +#ifndef PADDLE_WITH_XPU_KP std::vector cal_divmoders; // fast divmod for (auto i : cal_strides) { @@ -185,9 +187,22 @@ struct IndexCalculator { } divmoders = details::VectorToArray( cal_divmoders); +#endif } __device__ inline int operator()(int offset) const { +#ifdef PADDLE_WITH_XPU_KP + int index = 0; +#pragma unroll + for (int i = 0; i < kMaxRank; ++i) { + if (i == dim) { + break; + } + index += (offset / reduce_strides[i]) * strides[dims[i]]; + offset = offset % reduce_strides[i]; + } + return index; +#else int index = 0; #pragma unroll for (int i = 0; i < kMaxRank; ++i) { @@ -199,12 +214,16 @@ struct IndexCalculator { offset = divmod.val[1]; } return index; +#endif } int dim; phi::Array dims; phi::Array strides; + phi::Array reduce_strides; +#ifndef PADDLE_WITH_XPU2 phi::Array divmoders; +#endif }; template @@ -247,7 +266,7 @@ struct ReduceIndexMapping { __device__ __forceinline__ int BlockDimY() { #ifdef PADDLE_WITH_XPU2 - return dim.deal_size_y; + return 1; #else return blockDim.y; #endif @@ -454,10 +473,14 @@ struct ReduceConfig { bool is_last_dim = (rank == 2) && (reduce_rank == 1) && (reduce_dim[0] == 1); if (rank == reduce_rank || is_last_dim) { +#ifdef PADDLE_WITH_XPU_KP + reduce_type = static_cast(ReduceType::kReduceAny); +#else reduce_type = static_cast(ReduceType::kReduceLastDim); +#endif } else if (reduce_rank == 1) { // ReduceFirstDim and reduceSecondDim -#ifdef PADDLE_WITH_XPU2 +#ifdef PADDLE_WITH_XPU_KP if (reduce_dim[0] == 0) { reduce_type = static_cast(ReduceType::kReduceHigherDim); } else { @@ -471,6 +494,7 @@ struct ReduceConfig { } } +#ifndef PADDLE_WITH_XPU_KP void SetBlockDimForReduceAny(dim3* block_dim, dim3* grid_dim) { constexpr int min_reduce_num_per_thread = 16; constexpr int max_reduce_num_per_thread = 256; @@ -569,6 +593,7 @@ struct ReduceConfig { grid_dim->y = details::AlignUp(reduce_num, blocking_size); } } +#endif void SetBlockDim() { // init @@ -577,14 +602,14 @@ struct ReduceConfig { dim3 block_dim(block_num, 1, 1); dim3 grid_dim(left_num, 1, 1); blocking_size = reduce_num; -#ifdef PADDLE_WITH_XPU2 +#ifdef PADDLE_WITH_XPU_KP if (reduce_last_dim) { - block_dim.x = 128; + block_dim.x = 64; block_dim.y = reduce_num; - grid_dim.x = 8; - grid_dim.y = 1; + grid_dim.x = 1; + grid_dim.y = 8; } else { - block_dim.x = 128; + block_dim.x = 64; block_dim.y = left_num; grid_dim.x = 8; grid_dim.y = 1; @@ -661,7 +686,7 @@ __global__ void ReduceAnyKernel(const Tx* x, store_offset = block.BlockIdY() * left_num + left_idx; loop_left = min(block.GetLoopSize(), left_num - left_idx); stride_left = 1; - tid = threadIdx.x; + tid = THREAD_ID_X; } else { auto block = ReduceIndexMapping(dim); input_idx = block.BlockIdY() * block.BlockDimY(); @@ -672,18 +697,20 @@ 
__global__ void ReduceAnyKernel(const Tx* x, loop_left = min(block.GetLoopSize(), left_num - left_idx); stride_left = block.BlockDimX() * block.GridDimX(); store_offset = block.BlockIdY() * left_num + left_idx; - tid = threadIdx.y; + tid = THREAD_ID_Y; } // calculate the offset, means the addr where each thread really start. // 1. reduce for each thread MPType input_compute[REDUCE_VEC_SIZE]; Tx input_reg[REDUCE_VEC_SIZE]; + int input_idx_tmp = input_idx; for (int i = 0; i < loop_left; i += stride_left) { int input_offset = left_index_calculator(left_idx + i); - const Tx* input = x + input_offset; + const _ptr_ Tx* input = x + input_offset; MPType reduce_var = init; // load REDUCE_VEC_SIZE data once, and then compute int bound = reduce_num - (REDUCE_VEC_SIZE - 1) * stride; + input_idx = input_idx_tmp; for (; input_idx + block_size < bound; input_idx += REDUCE_VEC_SIZE * stride) { kps::ReadDataReduce config) { if (config.reduce_type == kReduceLastDim) { int stride_reduce = 1; @@ -855,23 +882,24 @@ static void LaunchReduceKernel(const Tx* x_data, 0); dim.SetRem(config.reduce_num % config.block.x, 0, 0); -#ifdef PADDLE_WITH_XPU2 +#ifdef PADDLE_WITH_XPU_KP ReduceAnyKernel<<<8, 128, stream>>>(x_data, - config.output_data, - reducer, - transform, - init, - config.reduce_num, - config.left_num, - config.reduce_last_dim, - reduce_index_calculator, - left_index_calculator, - dim); + OneDimIndexCal><<<8, 64, 0, stream>>>( + x_data, + config.output_data, + reducer, + transform, + init, + config.reduce_num, + config.left_num, + config.reduce_last_dim, + reduce_index_calculator, + left_index_calculator, + dim); #else ReduceAnyKernel<<<8, 128, stream>>>( + IndexCalculator><<<8, 64, 0, stream>>>( x_data, config.output_data, reducer, @@ -965,12 +993,13 @@ static void LaunchReduceKernel(const Tx* x_data, kps::DimConfig dim = kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0); dim.SetRem(config.left_num % block.x, 0, 0); -#ifdef PADDLE_WITH_XPU2 - ReduceHigherDimKernel><<<8, 128, stream>>>( +#ifdef PADDLE_WITH_XPU_KP + ReduceHigherDimKernel< + Ty, + Ty, + MPType, + ReduceOp, + kps::IdentityFunctor><<<8, 64, 0, stream>>>( config.output_data, y_data, reducer, @@ -1011,7 +1040,7 @@ CubTensorReduceImpl(const Tx* x_data, const TransformOp& transform, int reduce_num, const paddle::platform::Place& place, - gpuStream_t stream) { + KPStream stream) { auto reducer = ReduceOp(); cub::TransformInputIterator trans_x(x_data, transform); @@ -1054,7 +1083,7 @@ CubTensorReduceImpl(const Tx* x_data, const TransformOp& transform, int reduce_num, const paddle::platform::Place& place, - gpuStream_t stream) { + KPStream stream) { PADDLE_THROW(phi::errors::InvalidArgument( "Tx should not be float16 when using cub::DeviceReduce::Reduce().")); } @@ -1068,7 +1097,7 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx, phi::DenseTensor* y, const TransformOp& transform, const std::vector& origin_reduce_dims, - gpuStream_t stream) { + KPStream stream) { y->mutable_data(x.place()); auto x_dim = phi::vectorize(x.dims()); @@ -1098,11 +1127,13 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx, config.SetOutputData(y_data, x.place(), &tmp); constexpr bool kIsTxFP16 = std::is_same::value; bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16; +#ifndef PADDLE_WITH_XPU_KP if (use_cub_reduce) { CubTensorReduceImpl( x_data, y_data, transform, config.reduce_num, x.place(), stream); return; } +#endif using MPType = typename kps::details::MPTypeTrait::Type; auto reducer = ReduceOp(); @@ -1124,20 +1155,21 @@ void 
TensorReduceImpl(const phi::GPUContext& dev_ctx, config.reduce_num % config.blocking_size, 0); -#ifdef PADDLE_WITH_XPU2 +#ifdef PADDLE_WITH_XPU_KP ReduceHigherDimKernel, - TransformOp><<<8, 128, stream>>>(x_data, - config.output_data, - reducer, - transform, - reducer.initial(), - config.reduce_num, - config.left_num, - config.blocking_size, - dim); + TransformOp><<<8, 64, 0, stream>>>( + x_data, + config.output_data, + reducer, + transform, + reducer.initial(), + config.reduce_num, + config.left_num, + config.blocking_size, + dim); #else ReduceHigherDimKernel< Tx, @@ -1163,13 +1195,13 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx, kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0); dim2.SetRem(config.left_num % config.block.x, 0, 0); -#ifdef PADDLE_WITH_XPU2 +#ifdef PADDLE_WITH_XPU_KP ReduceHigherDimKernel< Ty, Ty, MPType, ReduceOp, - kps::IdentityFunctor><<<8, 128, stream>>>( + kps::IdentityFunctor><<<8, 64, 0, stream>>>( config.output_data, y_data, reducer, @@ -1212,7 +1244,7 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx, template class ReduceOp, template class TransformOp> -void Reduce(const GPUContext& dev_ctx, +void Reduce(const KPDevice& dev_ctx, const DenseTensor& x, bool reduce_all, const std::vector& dims, @@ -1227,7 +1259,7 @@ void Reduce(const GPUContext& dev_ctx, reduce_num *= (x.dims())[i]; } - gpuStream_t stream = dev_ctx.stream(); + KPStream stream = dev_ctx.stream(); if (out_dtype != phi::DataType::UNDEFINED && out_dtype != x.dtype()) { auto tmp_tensor = phi::Cast(dev_ctx, x, out_dtype); -- GitLab From d9884e2077d024a2439b8864b21885402f228af7 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Thu, 3 Mar 2022 10:06:11 +0800 Subject: [PATCH 007/261] adjust the args checking of backward in yaml (#40091) --- python/paddle/utils/code_gen/backward_api_gen.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/utils/code_gen/backward_api_gen.py b/python/paddle/utils/code_gen/backward_api_gen.py index 2d33cd5b181..125ebed82de 100644 --- a/python/paddle/utils/code_gen/backward_api_gen.py +++ b/python/paddle/utils/code_gen/backward_api_gen.py @@ -56,8 +56,9 @@ class BackwardAPI(BaseAPI): # check the attributes of backward for attr in self.attrs['names']: - assert attr in fw_attrs['names'] and self.attrs['attr_info'][attr][0] == fw_attrs['attr_info'][attr][0], \ - f"{self.api} : Attribute error: The attribute({attr}) of backward isn't consistent with forward api. \ + assert (attr in fw_attrs['names'] and self.attrs['attr_info'][attr][0] == fw_attrs['attr_info'][attr][0]) or \ + self.attrs['attr_info'][attr][1] is not None, \ + f"{self.api} : Attribute error: The attribute({attr}) of backward isn't consistent with forward api or doesn't have default value. \ Please check the args of {self.api} in yaml." 
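+            # In other words: an attribute that also appears in the forward api
+            # must keep the forward type, while a backward-only attribute is
+            # accepted as long as it declares a default value in the yaml args.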
# check the output of backward -- GitLab From da47544cc2bbc829b1c0f54854b532582d867156 Mon Sep 17 00:00:00 2001 From: Jiabin Yang <360788950@qq.com> Date: Thu, 3 Mar 2022 10:13:22 +0800 Subject: [PATCH 008/261] Support slim eager (#39874) * eager, test=develop * fix bug, test=develop * eager, test=develop * merge legacy to fluid * eager, test=develop * eager, test=develop * Refactor TensorAdd func by template and remove gradient_accumulation in eager * Remove needless target name * eager, test=develop * eager, test=develop * Use overload instead of template * Remove legacy code * Remove legacy code * selectedrows, test=develop * Remove DataType test * eager, test=develop * eager, test=develop * support gan, test=develop * Using Tensor directly instead of using EagerTensor * support gradient_accumulation * make test_imperative_lod_tensor_to_selected_rows longer * make test_imperative_lod_tensor_to_selected_rows longer * refine code * ptb, test=develop * Rename all EagerTensor to Tensor * Rename some EagerTensor to Tensor * rename EagerTensor to EagerVariable * eager, test=develop * eager, test=develop * eager, test=develop * eager, test=develop * add more test * eager, test=develop * Support copiable selected rows and merge develop * save load, eager, test=develop * save load, eager, test=develop * refine, test=develop * remove useless _set_value method * refine, test=develop * refine, test=develop * revert static_runner, test=develop * EagerTensor to Tensor, test=develop * refine, test=develop * refine, test=develop * clear grad, test=develop * merge, develop * merge, develop * merge, test=develop * merge, test=develop * Support quant and part of slice * support legacy static save * extend slim tests time * remove imperative on inference * remove imperative on inference * merge develop * fix typo * fix typo * split slice related code into 2 part for imperative and eager * split slice from inference * split slice from inference * fix test_tensor_register_hook Co-authored-by: Wang Huan Co-authored-by: Weilong Wu Co-authored-by: wanghuancoder --- .../eager/accumulation/accumulation_node.h | 5 +- .../eager_generated/backwards/scale_node.h | 2 +- .../auto_code_generator/eager_generator.cc | 67 ++-- .../final_state_generator/eager_gen.py | 7 +- paddle/fluid/eager/backward.cc | 7 +- paddle/fluid/eager/grad_node_info.cc | 9 +- paddle/fluid/eager/grad_node_info.h | 4 +- .../data_structure_tests/grad_node_test.h | 1 + paddle/fluid/eager/utils.cc | 15 +- paddle/fluid/pybind/eager_method.cc | 142 ++++++++- paddle/fluid/pybind/eager_utils.cc | 9 + paddle/fluid/pybind/eager_utils.h | 5 +- paddle/fluid/pybind/imperative.cc | 284 ++--------------- paddle/fluid/pybind/pybind.cc | 5 +- paddle/fluid/pybind/slice_utils.h | 294 ++++++++++++++++++ .../fluid/contrib/slim/tests/CMakeLists.txt | 8 +- .../slim/tests/test_imperative_out_scale.py | 16 +- .../contrib/slim/tests/test_imperative_ptq.py | 15 +- .../contrib/slim/tests/test_imperative_qat.py | 9 +- .../slim/tests/test_imperative_qat_amp.py | 2 +- .../tests/test_imperative_qat_user_defined.py | 9 +- .../slim/tests/test_imperative_skip_op.py | 9 +- python/paddle/fluid/dygraph/base.py | 17 +- .../dygraph_to_static/partial_program.py | 99 ++++-- python/paddle/fluid/dygraph/jit.py | 53 ++-- .../fluid/dygraph/varbase_patch_methods.py | 4 +- python/paddle/fluid/io.py | 2 +- python/paddle/fluid/layers/nn.py | 3 + python/paddle/fluid/layers/tensor.py | 4 +- .../tests/unittests/test_egr_python_api.py | 8 +- .../unittests/test_tensor_register_hook.py | 8 +- 31 files 
changed, 700 insertions(+), 422 deletions(-) create mode 100644 paddle/fluid/pybind/slice_utils.h diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index 734cabdc3dc..07fa4016516 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -24,11 +24,14 @@ class GradNodeAccumulation : public GradNodeBase { public: // Constructor: configure fwd input tensors to grad node explicit GradNodeAccumulation(AutogradMeta* meta) : GradNodeBase(1, 1) { + VLOG(6) << "Construct GradNodeAccumulation"; weak_grad_ = meta->WeakGrad(); SetDefaultGradInOutMeta(); } - ~GradNodeAccumulation() override = default; + ~GradNodeAccumulation() override { + VLOG(6) << "Destruct GradNodeAccumulation"; + } // Functor: perform backward computations virtual std::vector> operator()( diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h index c0150a1730d..247fde6ed1f 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h @@ -46,7 +46,7 @@ class GradNodeScale : public GradNodeBase { const std::vector& tensors); void SetAttributes_scale(float scale); - + std::string name() override { return ""; } // Members: define fwd input tensors // For Scale there is no fwd input tensor needed private: diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 102fad56373..2fc846cccc2 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -996,6 +996,29 @@ static std::string GenerateGradNodeCreationContent( // then generate: "egr::AutogradMeta* p_autograd_out = // egr::EagerUtils::autograd_meta("op_proto->outputs()[0].name()")" std::string get_autograd_meta_str = " // Prepare Autograd Meta \n"; + // If single output slotname and not duplicable, + // then generate: "egr::AutogradMeta* p_autograd_out = + // egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")" + for (const proto::OpProto::Var& output : out_vars) { + const std::string& output_name = output.name(); + const std::string& output_autograd_name = "p_autograd_" + output_name; + + if (output.duplicable()) { + const char* GET_MULTI_AUTOGRAD_META_TEMPLATE = + " std::vector %s = " + "egr::EagerUtils::autograd_meta(&%s);\n"; + get_autograd_meta_str += paddle::string::Sprintf( + GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); + } else { + const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = + " egr::AutogradMeta* %s = " + "egr::EagerUtils::autograd_meta(&%s);\n"; + get_autograd_meta_str += paddle::string::Sprintf( + GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); + } + } + VLOG(6) << "Generated outputs autograd_meta"; + for (const proto::OpProto::Var& input : in_vars) { const std::string& input_name = input.name(); const std::string& input_autograd_name = "p_autograd_" + input_name; @@ -1024,31 +1047,6 @@ static std::string GenerateGradNodeCreationContent( } VLOG(6) << "Generated inputs autograd_meta"; - // If single output slotname and not duplicable, - // then generate: "egr::AutogradMeta* p_autograd_out = - // egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")" - for (const proto::OpProto::Var& output : out_vars) { - const 
std::string& output_name = output.name(); - const std::string& output_autograd_name = "p_autograd_" + output_name; - - // Skip Intermediate Tensor - - if (output.duplicable()) { - const char* GET_MULTI_AUTOGRAD_META_TEMPLATE = - " std::vector %s = " - "egr::EagerUtils::autograd_meta(&%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( - GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); - } else { - const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = - " egr::AutogradMeta* %s = " - "egr::EagerUtils::autograd_meta(&%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( - GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); - } - } - VLOG(6) << "Generated outputs autograd_meta"; - std::string prepare_autograd_meta_str = ""; prepare_autograd_meta_str += get_autograd_meta_str; prepare_autograd_meta_str += "\n"; @@ -1204,11 +1202,12 @@ static std::string GenerateGradNodeCreationContent( " %s" " bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(%s);\n" " if(require_any_grad) {\n" + " VLOG(6) << \" Construct Grad for %s \"; \n" " egr::EagerUtils::PassStopGradient(%s);\n" "%s\n }"; std::string grad_node_creation_body_str = paddle::string::Sprintf( GRAD_NODE_CREATION_TEMPLATE, prepare_autograd_meta_str, - compute_require_grad_args, pass_stop_gradient_args, + compute_require_grad_args, op_type, pass_stop_gradient_args, grad_node_creation_str); return grad_node_creation_body_str; @@ -2083,22 +2082,24 @@ static std::string GenerateGradNodeHeaderContents( const char* GRAD_NODE_TEMPLATE = "class GradNode%s : public egr::GradNodeBase {\n" " public:\n" - " GradNode%s() : egr::GradNodeBase() {}\n" + " GradNode%s() : egr::GradNodeBase() { VLOG(7) << \" Construct " + "GradNode%s \"; }\n" " GradNode%s(size_t bwd_in_slot_num, size_t bwd_out_slot_num) : " - "egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {}\n" - " ~GradNode%s() override = default;\n" + "egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) { VLOG(7) << \" " + "Construct GradNode%s \"; }\n" + " ~GradNode%s() override { VLOG(6) << \" Destruct GradNode%s \"; }\n" "\n" " virtual std::vector> " "operator()(const " "std::vector>& grads) " "override;\n" "\n" + " std::string name() override { return \" GradNode%s \"; } \n " + "\n" " // SetX, SetY, ...\n" "%s\n" " // SetAttrMap\n" "%s\n" - " std::string name() { return \"GradNode%s\"; }\n" - "\n" " private:\n" " // TensorWrappers\n" "%s\n" @@ -2195,8 +2196,8 @@ static std::string GenerateGradNodeHeaderContents( VLOG(6) << "Generated TensorWrapper"; std::string grad_node_str = paddle::string::Sprintf( - GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, - set_tensor_wrappers_str, set_attr_map_str, op_type, + GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, op_type, op_type, + op_type, op_type, set_tensor_wrappers_str, set_attr_map_str, tensor_wrapper_members_str, attr_members_str); return grad_node_str; diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index f2088dcda76..af9540b6fb3 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -538,7 +538,7 @@ class {} : public egr::GradNodeBase {{ virtual std::vector> operator()( const std::vector>& grads) override; - + std::string name() override {{ return \" {} \"; }} // SetTensorWrapperX, SetTensorWrapperY, ... 
{} // SetAttributes @@ -553,8 +553,9 @@ class {} : public egr::GradNodeBase {{ """ node_declaration_str = NODE_DECLARATION_TEMPLATE.format( grad_node_name, grad_node_name, grad_node_name, grad_node_name, - set_tensor_wrapper_methods_str, set_attribute_methods_str, - tensor_wrapper_members_str, attribute_members_str) + grad_node_name, set_tensor_wrapper_methods_str, + set_attribute_methods_str, tensor_wrapper_members_str, + attribute_members_str) return node_declaration_str diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 356fdcaf054..934497d7d17 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -48,12 +48,16 @@ std::unordered_map getInDegreeMap( } visited.insert(node); + PADDLE_ENFORCE_NOT_NULL( + node, + paddle::platform::errors::Fatal( + "We got null node when we traverse the backward graph, and this " + "should not happened please check your code and contact us.")); // Find and append next nodes const std::vector>& edges = node->GetEdges(); for (const auto& edge_list : edges) { for (const Edge& edge : edge_list) { GradNodeBase* next_node = edge.GetMutableGradNode().get(); - // Next node could be nullptr if it is leaf tensor with no // AccumulationNode attached // Or it could also originated from dispensable inputs @@ -67,7 +71,6 @@ std::unordered_map getInDegreeMap( } } } - return node_in_degree_map; } diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index b1189106b8f..427be83c3bb 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -30,6 +30,7 @@ namespace egr { GradNodeBase::GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num) { + VLOG(6) << "Construct GradNodeBase"; bwd_in_meta_.resize(bwd_in_slot_num); bwd_out_meta_.resize(bwd_out_slot_num); // adj_edges has the same num as backward outputs @@ -49,11 +50,15 @@ void GradNodeBase::AddEdges(std::vector* metas, size_t slot_id) { // its pre-ops if (meta && !meta->StopGradient()) { auto node = meta->GetMutableGradNode(); - if (node) { + if (node && node.get()) { + VLOG(6) << "Add Edges for slot: " << slot_id + << " which is: " << meta->GetMutableGradNode()->name(); adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); } else { meta->SetGradNode(std::make_shared(meta)); + VLOG(6) << "Add Edges for slot: " << slot_id + << " which is: " << meta->GetMutableGradNode()->name(); adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); } @@ -70,7 +75,7 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) { "inputs's slot num.")); if (meta && !meta->StopGradient()) { auto node = meta->GetMutableGradNode(); - if (node) { + if (node && node.get()) { VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " << this->name() << " to " << meta->GetMutableGradNode()->name(); adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index eeac1cca4ac..16513f05e07 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -76,10 +76,10 @@ class GradSlotMeta { class GradNodeBase { public: - GradNodeBase() = default; + GradNodeBase() { VLOG(6) << "Construct GradNodeBase"; } GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num); // TODO(jiabin): Should we have other constructor here? 
- virtual ~GradNodeBase() = default; + virtual ~GradNodeBase() { VLOG(6) << "Destruct GradNodeBase"; } /** * operator() designed to contian the real backward execution logic, it should diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h index bb84e2dda81..535c93ac53b 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h @@ -30,6 +30,7 @@ class GradTestNode : public egr::GradNodeBase { GradTestNode(float val, int in_num, int out_num) : GradNodeBase(in_num, out_num), val_(val) {} GradTestNode() : GradNodeBase() { val_ = 1.0; } + std::string name() override { return "GradTestNode"; } std::vector> operator()( const std::vector>& grads) override { diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 39861c80522..8a57d269453 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -122,9 +122,10 @@ paddle::experimental::Tensor* EagerUtils::mutable_grad( void EagerUtils::SetHistory(std::vector* autograd_metas, const std::shared_ptr& grad_node) { for (const auto& autograd_meta : *autograd_metas) { - if (dynamic_cast(autograd_meta->GradNode())) { - VLOG(6) << "Warning: Reseting GradNodeAccumulation for leaf tensor is " - "detected"; + if (autograd_meta->GradNode()) { + VLOG(7) << "Should not set grad node twice, original node is:" + << autograd_meta->GradNode()->name() + << "current is: " << grad_node->name(); } autograd_meta->SetGradNode(grad_node); } @@ -132,11 +133,11 @@ void EagerUtils::SetHistory(std::vector* autograd_metas, void EagerUtils::SetHistory(AutogradMeta* autograd_meta, const std::shared_ptr& grad_node) { - if (dynamic_cast(autograd_meta->GradNode())) { - VLOG(6) - << "Warning: Reseting GradNodeAccumulation for leaf tensor is detected"; + if (autograd_meta->GradNode()) { + VLOG(7) << "Should not set grad node twice, original node is:" + << autograd_meta->GradNode()->name() + << "current is: " << grad_node->name(); } - autograd_meta->SetGradNode(grad_node); } diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index f11a2ab2517..e5f22338dc6 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/all.h" +#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/hooks.h" @@ -30,10 +31,12 @@ limitations under the License. 
*/ #include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/pybind/slice_utils.h" #include "paddle/phi/api/include/api.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" + namespace paddle { namespace pybind { @@ -119,6 +122,29 @@ extern void InitTensorWithNumpyValue(TensorObject* self, extern PyTypeObject* p_tensor_type; +Py_ssize_t GetSliceIndexFromPyObject(PyObject* obj) { + if (PyObject_IsInstance(obj, reinterpret_cast(p_tensor_type))) { + VLOG(6) << "Call GetSliceIndexFromTensor in Eager"; + paddle::experimental::Tensor tensor = CastPyArg2Tensor(obj, 0); + PADDLE_ENFORCE_EQ( + tensor.initialized(), true, + paddle::platform::errors::InvalidArgument( + "We can only support initialized tensor in slice, however we got " + "uninitialized tensor %s, please check your code.", + tensor.name())); + return GetSliceIndexFromTensor((*static_cast( + CastPyArg2Tensor(obj, 0).impl().get()))); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "We should only get paddle::experimental::Tensor or VarBase in this " + "method, when you reach this means we got another type index.")); + } +} + +bool PyCheckTensor(PyObject* obj) { + return PyObject_IsInstance(obj, reinterpret_cast(p_tensor_type)); +} + static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY @@ -468,16 +494,111 @@ static PyObject* tensor_method_get_underline_tensor(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } -// NOTE(wuweilong): Set value and not change self's original place -static PyObject* tensor_method_set_value(TensorObject* self, PyObject* args, - PyObject* kwargs) { +static PyObject* tensor__getitem_index_not_tensor(TensorObject* self, + PyObject* args, + PyObject* kwargs) { EAGER_TRY - VLOG(4) << "Value " << self->tensor.name(); - pybind11::object numpy_value = - pybind11::object(pybind11::handle(PyTuple_GET_ITEM(args, 0)), true); - InitTensorWithNumpyValue(self, numpy_value, false); - Py_INCREF(Py_None); - return Py_None; + PyObject* _index = PyTuple_GET_ITEM(args, 0); + VLOG(4) << "Call _getitem_index_not_tensor"; + std::vector slice_axes, slice_starts, slice_ends, slice_strides, + decrease_axis, none_axes, infer_flags, list_select_idxs; + // if index is a list, list_select_flag will be true + bool list_select_flag = false; + PADDLE_ENFORCE_EQ( + self->tensor.is_initialized(), true, + platform::errors::InvalidArgument( + "tensor %s has not been initialized, we can only slice initialized " + "tensor please init it first with numpy or other tensor.", + self->tensor.name())); + auto tensor = static_cast(self->tensor.impl().get()); + ParseIndexingSlice(tensor, _index, &slice_axes, &slice_starts, &slice_ends, + &slice_strides, &decrease_axis, &none_axes, &infer_flags, + &list_select_idxs, &list_select_flag); + + auto out = slice_axes.empty() && !list_select_flag + ? 
self->tensor + : paddle::experimental::Tensor( + egr::Controller::Instance().GenerateUniqueName()); + + if (!slice_axes.empty()) { + framework::AttributeMap attrs = {{"axes", slice_axes}, + {"starts", slice_starts}, + {"ends", slice_ends}, + {"infer_flags", infer_flags}, + {"decrease_axis", decrease_axis}}; + std::string op_type = "slice"; + for (auto stride : slice_strides) { + if (stride != 1) { + op_type = "strided_slice"; + attrs.insert({"strides", slice_strides}); + attrs.erase("decrease_axis"); + break; + } + } + if (op_type == "slice") { + out = slice_dygraph_function(self->tensor, paddle::experimental::Tensor(), + paddle::experimental::Tensor(), + std::move(attrs)); + } else if (op_type == "strided_slice") { + out = strided_slice_dygraph_function(self->tensor, attrs); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Slice is only support slice and strided_slice, but we got %s which " + "is impossible, please check your code first or contact us by " + "issue. ", + op_type)); + } + } + + if (!none_axes.empty()) { + // Deal with cases when all axes are decreased. + // After slice, the shape of out is [1], which should have been + // [], but Paddle doesn't support scalar. + // In order to ensure the correctness of the final shape of out, + // one dimension of out needs to be decreased. + // For example: + // # x.shape: (2,3,4) + // out = x[0, 1, 1, None] # out.shape : (1) + if (static_cast(decrease_axis.size()) == tensor->dims().size()) { + none_axes.pop_back(); + } + if (!none_axes.empty()) { + // Deal with cases that decrease_axes is not empty + // For example: + // # x.shape: (2,3,4) + // out = x[0, 0:2, None] # out.shape : (2, 1, 4) + for (auto& axis : none_axes) { + int len = 0; + for (int da : decrease_axis) { + if (da < axis) { + len++; + } + } + axis -= len; + } + + paddle::experimental::Tensor new_out; + framework::AttributeMap attrs = {{"axes", none_axes}}; + new_out = std::get<0>(unsqueeze2_dygraph_function(out, std::move(attrs))); + return ToPyObject(new_out); + } + } + + // the index is a list + if (list_select_flag) { + auto select_index = paddle::experimental::Tensor( + egr::Controller::Instance().GenerateUniqueName()); + auto idx_tensor = std::make_shared(); + auto* dev_ctx = platform::DeviceContextPool::Instance().Get( + egr::Controller::Instance().GetExpectedPlace()); + paddle::framework::TensorFromVector(list_select_idxs, *dev_ctx, + idx_tensor.get()); + framework::AttributeMap attrs = {{"dim", 0}}; + out = index_select_dygraph_function(self->tensor, select_index, + std::move(attrs)); + } + + return ToPyObject(out); EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -602,7 +723,8 @@ PyMethodDef variable_methods[] = { {"get_tensor", (PyCFunction)(void (*)(void))tensor_method_get_underline_tensor, METH_VARARGS | METH_KEYWORDS, NULL}, - {"_set_value", (PyCFunction)(void (*)(void))tensor_method_set_value, + {"_getitem_index_not_tensor", + (PyCFunction)(void (*)(void))tensor__getitem_index_not_tensor, METH_VARARGS | METH_KEYWORDS, NULL}, {"_register_grad_hook", (PyCFunction)(void (*)(void))tensor_register_grad_hook, diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index c1e8822eec2..57f37621d3b 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -16,8 +16,11 @@ limitations under the License. 
*/ #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/scope_guard.h" #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/operators/py_func_op.h" +#include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager_utils.h" @@ -184,6 +187,11 @@ paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos) { } } +std::shared_ptr CastPyArg2VarBase(PyObject* obj, + ssize_t arg_pos) { + return py::cast>(obj); +} + std::vector CastPyArg2VectorOfTensor( PyObject* obj, ssize_t arg_pos) { std::vector result; @@ -737,5 +745,6 @@ std::vector GetTensorPtrListFromArgs( return result; } + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 0c721d61247..92afc3ae487 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/phi/core/dense_tensor.h" #include "pybind11/pybind11.h" #include "pybind11/stl.h" - namespace paddle { namespace pybind { @@ -33,6 +32,8 @@ int64_t CastPyArg2AttrLong(PyObject* obj, ssize_t arg_pos); float CastPyArg2AttrFloat(PyObject* obj, ssize_t arg_pos); std::string CastPyArg2AttrString(PyObject* obj, ssize_t arg_pos); paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos); +std::shared_ptr CastPyArg2VarBase(PyObject* obj, + ssize_t arg_pos); std::vector CastPyArg2VectorOfTensor( PyObject* obj, ssize_t arg_pos); platform::Place CastPyArg2Place(PyObject* obj, ssize_t arg_pos); @@ -112,5 +113,7 @@ std::vector GetTensorPtrListFromArgs( const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable = false); +// end of Slice related methods + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 8c5ed2d1183..3da17b95a66 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -54,6 +54,7 @@ limitations under the License. */ #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/pybind/op_function.h" #include "paddle/fluid/pybind/pybind_boost_headers.h" +#include "paddle/fluid/pybind/slice_utils.h" #include "paddle/fluid/pybind/tensor_py.h" namespace paddle { @@ -319,6 +320,23 @@ static std::string GetTypeName(const imperative::VarBase &var) { } } +Py_ssize_t GetSliceIndexFromPyObject(PyObject *obj) { + if (py::isinstance(obj)) { + VLOG(6) << "Call GetSliceIndexFromTensor in Imperative"; + return GetSliceIndexFromTensor( + py::cast>(obj) + ->Var() + .Get()); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "We should only get paddle::experimental::Tensor or VarBase in this " + "method, when you reach this means we got another type index.")); + } +} + +bool PyCheckTensor(PyObject *obj) { + return py::isinstance(obj); +} using PyNameVarBaseMap = std::unordered_map; // NOTE(zjl): py::handle is a very light wrapper of PyObject *. @@ -360,18 +378,6 @@ GetVarBaseListFromPyHandle(const py::handle &handle) { return result; } -static bool IsNumpyType(PyObject *obj) { - // It is not a good way to judge the type of obj by its type'name. Maybe using - // `PyArray_IsScalar` will be better. 
However, this interface cannot be used - // by including pybind11, and it needs to compile with numpy. - auto type_name = std::string(Py_TYPE(obj)->tp_name); - return type_name == "numpy.int64" || type_name == "numpy.longlong" || - type_name == "numpy.int32" || type_name == "numpy.int16"; -} - -static bool PyCheckTensor(PyObject *obj) { - return py::isinstance(obj); -} // cast numpy type form S to T, this may allocate new memory template @@ -429,260 +435,6 @@ static imperative::NameVarBaseMap ConvertToNameVarBaseMap( return result; } -static bool PyCheckInteger(PyObject *obj) { -#if PY_VERSION_HEX < 0x03000000 - return (PyLong_Check(obj) || PyInt_Check(obj)) && !PyBool_Check(obj); -#else - return PyLong_Check(obj) && !PyBool_Check(obj); -#endif -} - -static Py_ssize_t GetSliceIndexFromTensor( - const std::shared_ptr &tensor_index) { - const auto &tensor = tensor_index->Var().Get(); - if (tensor.numel() == 1) { - if (framework::TransToProtoVarType(tensor.dtype()) == - framework::proto::VarType::INT32) { - return static_cast(operators::GetValue(&tensor)); - } else if (framework::TransToProtoVarType(tensor.dtype()) == - framework::proto::VarType::INT64) { - return static_cast(operators::GetValue(&tensor)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Currently, the type of tensor in slice indices only allows " - "int32 and int64, please check the type of index tensor.")); - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Currently, tensor in slice indices only allows 1 element, " - "but received %d.", - tensor.numel())); - } -} - -// NOTE(zhiqiu): Revised version of PySlice_GetIndices. From: -// https://github.com/python/cpython/blob/8d21aa21f2cbc6d50aab3f420bb23be1d081dac4/Objects/sliceobject.c#L103 -// Original PySlice_GetIndices return wrong result when -// slice_item contains long int, such as arr[:180L]. -// NOT sure why this happens !!! -// Besides, PySlice_GetIndices cannot raise error when float in slice item. -// So, I make a revised version of PySlice_GetIndices, named to -// _PySlice_GetIndices. Try to use _PySlice_Unpack which is more robust than -// PySlice_GetIndices in the future. -static int _PySlice_GetIndices(PySliceObject *r, Py_ssize_t length, - Py_ssize_t *start, Py_ssize_t *stop, - Py_ssize_t *step) { - /* XXX support long ints */ - if (r->step == Py_None) { - *step = 1; - } else { - if (PyCheckInteger(r->step) || IsNumpyType(r->step)) { - *step = PyLong_AsLong(r->step); - } else if (PyCheckTensor(r->step)) { - *step = GetSliceIndexFromTensor( - py::cast>(r->step)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Currently, slice indices only allows None, integers, " - "tensor(int) and numpy(int) in slice item, but received %s.", - std::string(Py_TYPE(r->step)->tp_name))); - } - } - if (r->start == Py_None) { - *start = *step < 0 ? length - 1 : 0; - } else { - if (PyCheckInteger(r->start) || IsNumpyType(r->start)) { - *start = PyLong_AsLong(r->start); - } else if (PyCheckTensor(r->start)) { - *start = GetSliceIndexFromTensor( - py::cast>(r->start)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Currently, slice indices only allows None, integers, " - "tensor(int) and numpy(int) in slice item, but received %s.", - std::string(Py_TYPE(r->start)->tp_name))); - } - if (*start < 0) *start += length; - *start = std::max(*start, static_cast(0)); - } - if (r->stop == Py_None) { - *stop = *step < 0 ? 
-1 : length; - } else { - if (PyCheckInteger(r->stop) || IsNumpyType(r->stop)) { - *stop = PyLong_AsLong(r->stop); - } else if (PyCheckTensor(r->stop)) { - *stop = GetSliceIndexFromTensor( - py::cast>(r->stop)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Currently, slice indices only allows None, integers, " - "tensor(int) and numpy(int) in slice item, but received %s.", - std::string(Py_TYPE(r->stop)->tp_name))); - } - if (0 < *step && *stop < 0) *stop += length; - *stop = std::min(*stop, length); - } - if (*stop > length) return -1; - if (*start >= length) return -1; - if (*step == 0) return -1; - return 0; -} - -static void ParseIndexingSlice( - framework::LoDTensor *tensor, PyObject *_index, - std::vector *slice_axes, std::vector *slice_starts, - std::vector *slice_ends, std::vector *slice_strides, - std::vector *decrease_axis, std::vector *none_axes, - std::vector *infer_flags, std::vector *list_select_idxs, - bool *list_select_flag) { - // We allow indexing by Integers, Slices, Ellipsis, None, tuples of those - // types, and list of Bool and Integers. - // wrap to tuple - - // NOTE(zhiqiu): PyTuple_Pack increases refcount. - PyObject *index = !PyTuple_Check(_index) ? PyTuple_Pack(1, _index) : _index; - DEFINE_PADDLE_SCOPE_GUARD([index, _index]() { - if (!PyTuple_Check(_index)) { - Py_DECREF(index); - VLOG(4) << "Call Py_DECREF"; - } - }); - PADDLE_ENFORCE_EQ( - tensor->IsInitialized(), true, - platform::errors::InvalidArgument("tensor has not been initialized")); - const auto &shape = tensor->dims(); - const int rank = shape.size(); - const int size = PyTuple_GET_SIZE(index); - - // specified_dims is the number of dimensions which indexed by Interger, - // Slices. - int specified_dims = 0; - int ell_count = 0; - for (int dim = 0; dim < size; ++dim) { - PyObject *slice_item = PyTuple_GetItem(index, dim); - if (PyCheckInteger(slice_item) || PySlice_Check(slice_item)) { - specified_dims++; - } else if (slice_item == Py_Ellipsis) { - ell_count++; - } - } - - PADDLE_ENFORCE_LE(ell_count, 1, - platform::errors::InvalidArgument( - "An index can only have a single ellipsis ('...')")); - int none_count = 0; - for (int i = 0, dim = 0; i < size; ++i) { - PyObject *slice_item = PyTuple_GetItem(index, i); - - infer_flags->push_back(1); - int dim_len = shape[dim]; - if (PyCheckInteger(slice_item) || IsNumpyType(slice_item)) { - // integer, PyLong_AsLong supports both int and long - int start = static_cast(PyLong_AsLong(slice_item)); - auto s_t = start; - start = start < 0 ? 
start + dim_len : start; - if (start >= dim_len || start < 0) { - std::string str_error_message = - "The starting index " + std::to_string(s_t) + - " of slice is out of bounds in tensor " + std::to_string(dim) + - "-th axis, it shound be in the range of [" + - std::to_string(-dim_len) + ", " + std::to_string(dim_len) + ")"; - // py::index_error is corresponding to IndexError in Python - // Used to indicate out of bounds access in __getitem__, __setitem__ - throw py::index_error(str_error_message); - } - slice_axes->push_back(dim); - slice_starts->push_back(start); - slice_ends->push_back(start + 1); - slice_strides->push_back(1); - decrease_axis->push_back(dim); - dim++; - } else if (PySlice_Check(slice_item)) { - // slice item - Py_ssize_t start, end, step; - PySliceObject *p = reinterpret_cast(slice_item); - _PySlice_GetIndices(p, dim_len, &start, &end, &step); - - // :: or : or 0:dim_len:1 - if (start == 0 && end == dim_len && step == 1) { - dim++; - continue; - } - slice_axes->push_back(dim); - slice_starts->push_back(start); - slice_ends->push_back(end); - slice_strides->push_back(step); - dim++; - } else if (slice_item == Py_Ellipsis) { - dim += rank - specified_dims; - } else if (slice_item == Py_None) { - none_axes->push_back(dim + none_count); - none_count++; - } else if (PyList_Check(slice_item)) { - *list_select_flag = true; - PADDLE_ENFORCE_EQ( - size, 1, - platform::errors::InvalidArgument( - "When index contains a list, its length is excepted to 1, " - "but received %d", - size)); - bool all_bool = true; - int list_size = PyList_GET_SIZE(slice_item); - for (int j = 0; j < list_size; ++j) { - PyObject *list_item = PyList_GetItem(slice_item, j); - if (PyCheckInteger(list_item)) { - all_bool = false; - } else if (!PyBool_Check(list_item)) { - PADDLE_THROW(platform::errors::InvalidArgument( - "Only support int or bool in index list.")); - } - } - if (all_bool) { - PADDLE_ENFORCE_EQ( - list_size, shape[0], - platform::errors::InvalidArgument( - "The dimension of bool index doesn't match indexed array along " - "dimension 0, the target dimension is %d, but received %d.", - shape[0], list_size)); - - for (int j = 0; j < list_size; ++j) { - PyObject *list_item = PyList_GetItem(slice_item, j); - if (list_item == Py_True) { - list_select_idxs->push_back(j); - } - } - } else { - for (int j = 0; j < list_size; ++j) { - PyObject *list_item = PyList_GetItem(slice_item, j); - if (PyCheckInteger(list_item)) { - list_select_idxs->push_back( - static_cast(PyLong_AsLong(list_item))); - } else if (list_item == Py_True) { - list_select_idxs->push_back(1); - } else { - list_select_idxs->push_back(0); - } - } - } - - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Currently, Tensor.__indices__() only allows indexing " - "by Integers, Slices, Ellipsis, None, tuples of these types " - "and list of Bool and Integers, but received " - "%s in %dth slice item", - std::string(Py_TYPE(slice_item)->tp_name), i + 1)); - } - } - - // valid_index is the number of dimensions exclude None index - const int valid_indexs = size - none_axes->size() - ell_count; - PADDLE_ENFORCE_EQ(valid_indexs <= rank, true, - platform::errors::InvalidArgument( - "Too many indices (%d) for tensor of dimension %d.", - valid_indexs, rank)); -} - template static void VarBaseCopy(std::shared_ptr &src, // NOLINT imperative::VarBase &dst, // NOLINT diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 2d9272dd0ed..ffc42dc30ed 100644 --- a/paddle/fluid/pybind/pybind.cc +++ 
b/paddle/fluid/pybind/pybind.cc @@ -80,6 +80,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/cuda_streams_py.h" #include "paddle/fluid/pybind/distributed_py.h" #include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/pybind/io.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/lod_utils.h" @@ -101,7 +102,6 @@ limitations under the License. */ #include "paddle/fluid/pybind/gloo_context_py.h" #include "paddle/fluid/pybind/gloo_wrapper_py.h" #include "paddle/fluid/pybind/heter_wrapper_py.h" -#include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/pybind/inference_api.h" #include "paddle/fluid/pybind/ir.h" #include "paddle/fluid/pybind/metrics_py.h" @@ -527,6 +527,7 @@ PYBIND11_MODULE(core_avx, m) { PYBIND11_MODULE(core_noavx, m) { #endif + BindImperative(&m); BindEager(&m); BindCudaStream(&m); @@ -741,8 +742,6 @@ PYBIND11_MODULE(core_noavx, m) { m.def("_promote_types_if_complex_exists", &paddle::framework::PromoteTypesIfComplexExists); - BindImperative(&m); - py::class_ framework_tensor(m, "Tensor", py::buffer_protocol()); g_framework_tensor_pytype = diff --git a/paddle/fluid/pybind/slice_utils.h b/paddle/fluid/pybind/slice_utils.h new file mode 100644 index 00000000000..a037fa13eb5 --- /dev/null +++ b/paddle/fluid/pybind/slice_utils.h @@ -0,0 +1,294 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/scope_guard.h" +#include "paddle/fluid/operators/utils.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/phi/core/dense_tensor.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +static bool PyCheckTensor(PyObject* obj); +static Py_ssize_t GetSliceIndexFromPyObject(PyObject* obj); +// Slice related methods +static bool PyCheckInteger(PyObject* obj) { +#if PY_VERSION_HEX < 0x03000000 + return (PyLong_Check(obj) || PyInt_Check(obj)) && !PyBool_Check(obj); +#else + return PyLong_Check(obj) && !PyBool_Check(obj); +#endif +} + +static bool IsNumpyType(PyObject* obj) { + // It is not a good way to judge the type of obj by its type'name. Maybe using + // `PyArray_IsScalar` will be better. However, this interface cannot be used + // by including pybind11, and it needs to compile with numpy. 
+ auto type_name = std::string(Py_TYPE(obj)->tp_name); + return type_name == "numpy.int64" || type_name == "numpy.longlong" || + type_name == "numpy.int32" || type_name == "numpy.int16"; +} + +static Py_ssize_t GetSliceIndexFromTensor(const phi::DenseTensor& tensor) { + if (tensor.numel() == 1) { + if (framework::TransToProtoVarType(tensor.type()) == + framework::proto::VarType::INT32) { + return static_cast(operators::GetValue(&tensor)); + } else if (framework::TransToProtoVarType(tensor.type()) == + framework::proto::VarType::INT64) { + return static_cast(operators::GetValue(&tensor)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Currently, the type of tensor in slice indices only allows " + "int32 and int64, please check the type of index tensor.")); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Currently, tensor in slice indices only allows 1 element, " + "but received %d.", + tensor.numel())); + } +} + +// NOTE(zhiqiu): Revised version of PySlice_GetIndices. From: +// https://github.com/python/cpython/blob/8d21aa21f2cbc6d50aab3f420bb23be1d081dac4/Objects/sliceobject.c#L103 +// Original PySlice_GetIndices return wrong result when +// slice_item contains long int, such as arr[:180L]. +// NOT sure why this happens !!! +// Besides, PySlice_GetIndices cannot raise error when float in slice item. +// So, I make a revised version of PySlice_GetIndices, named to +// _PySlice_GetIndices. Try to use _PySlice_Unpack which is more robust than +// PySlice_GetIndices in the future. +static int _PySlice_GetIndices(PySliceObject* r, Py_ssize_t length, + Py_ssize_t* start, Py_ssize_t* stop, + Py_ssize_t* step) { + /* XXX support long ints */ + if (r->step == Py_None) { + *step = 1; + } else { + if (PyCheckInteger(r->step) || IsNumpyType(r->step)) { + *step = PyLong_AsLong(r->step); + } else if (PyCheckTensor(r->step)) { + *step = GetSliceIndexFromPyObject(r->step); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Currently, slice indices only allows None, integers, " + "tensor(int) and numpy(int) in slice item, but received %s.", + std::string(Py_TYPE(r->step)->tp_name))); + } + } + if (r->start == Py_None) { + *start = *step < 0 ? length - 1 : 0; + } else { + if (PyCheckInteger(r->start) || IsNumpyType(r->start)) { + *start = PyLong_AsLong(r->start); + } else if (PyCheckTensor(r->start)) { + *start = GetSliceIndexFromPyObject(r->start); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Currently, slice indices only allows None, integers, " + "tensor(int) and numpy(int) in slice item, but received %s.", + std::string(Py_TYPE(r->start)->tp_name))); + } + if (*start < 0) *start += length; + *start = std::max(*start, static_cast(0)); + } + if (r->stop == Py_None) { + *stop = *step < 0 ? 
-1 : length; + } else { + if (PyCheckInteger(r->stop) || IsNumpyType(r->stop)) { + *stop = PyLong_AsLong(r->stop); + } else if (PyCheckTensor(r->stop)) { + *stop = GetSliceIndexFromPyObject(r->stop); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Currently, slice indices only allows None, integers, " + "tensor(int) and numpy(int) in slice item, but received %s.", + std::string(Py_TYPE(r->stop)->tp_name))); + } + if (0 < *step && *stop < 0) *stop += length; + *stop = std::min(*stop, length); + } + if (*stop > length) return -1; + if (*start >= length) return -1; + if (*step == 0) return -1; + return 0; +} + +static void ParseIndexingSlice( + framework::LoDTensor* tensor, PyObject* _index, + std::vector* slice_axes, std::vector* slice_starts, + std::vector* slice_ends, std::vector* slice_strides, + std::vector* decrease_axis, std::vector* none_axes, + std::vector* infer_flags, std::vector* list_select_idxs, + bool* list_select_flag) { + // We allow indexing by Integers, Slices, Ellipsis, None, tuples of those + // types, and list of Bool and Integers. + // wrap to tuple + + // NOTE(zhiqiu): PyTuple_Pack increases refcount. + PyObject* index = !PyTuple_Check(_index) ? PyTuple_Pack(1, _index) : _index; + DEFINE_PADDLE_SCOPE_GUARD([index, _index]() { + if (!PyTuple_Check(_index)) { + Py_DECREF(index); + VLOG(4) << "Call Py_DECREF"; + } + }); + PADDLE_ENFORCE_EQ( + tensor->IsInitialized(), true, + platform::errors::InvalidArgument("tensor has not been initialized")); + const auto& shape = tensor->dims(); + const int rank = shape.size(); + const int size = PyTuple_GET_SIZE(index); + + // specified_dims is the number of dimensions which indexed by Interger, + // Slices. + int specified_dims = 0; + int ell_count = 0; + for (int dim = 0; dim < size; ++dim) { + PyObject* slice_item = PyTuple_GetItem(index, dim); + if (PyCheckInteger(slice_item) || PySlice_Check(slice_item)) { + specified_dims++; + } else if (slice_item == Py_Ellipsis) { + ell_count++; + } + } + + PADDLE_ENFORCE_LE(ell_count, 1, + platform::errors::InvalidArgument( + "An index can only have a single ellipsis ('...')")); + int none_count = 0; + for (int i = 0, dim = 0; i < size; ++i) { + PyObject* slice_item = PyTuple_GetItem(index, i); + + infer_flags->push_back(1); + int dim_len = shape[dim]; + if (PyCheckInteger(slice_item) || IsNumpyType(slice_item)) { + // integer, PyLong_AsLong supports both int and long + int start = static_cast(PyLong_AsLong(slice_item)); + auto s_t = start; + start = start < 0 ? 
start + dim_len : start; + if (start >= dim_len || start < 0) { + std::string str_error_message = + "The starting index " + std::to_string(s_t) + + " of slice is out of bounds in tensor " + std::to_string(dim) + + "-th axis, it shound be in the range of [" + + std::to_string(-dim_len) + ", " + std::to_string(dim_len) + ")"; + // py::index_error is corresponding to IndexError in Python + // Used to indicate out of bounds access in __getitem__, __setitem__ + throw py::index_error(str_error_message); + } + slice_axes->push_back(dim); + slice_starts->push_back(start); + slice_ends->push_back(start + 1); + slice_strides->push_back(1); + decrease_axis->push_back(dim); + dim++; + } else if (PySlice_Check(slice_item)) { + // slice item + Py_ssize_t start, end, step; + PySliceObject* p = reinterpret_cast(slice_item); + _PySlice_GetIndices(p, dim_len, &start, &end, &step); + + // :: or : or 0:dim_len:1 + if (start == 0 && end == dim_len && step == 1) { + dim++; + continue; + } + slice_axes->push_back(dim); + slice_starts->push_back(start); + slice_ends->push_back(end); + slice_strides->push_back(step); + dim++; + } else if (slice_item == Py_Ellipsis) { + dim += rank - specified_dims; + } else if (slice_item == Py_None) { + none_axes->push_back(dim + none_count); + none_count++; + } else if (PyList_Check(slice_item)) { + *list_select_flag = true; + PADDLE_ENFORCE_EQ( + size, 1, + platform::errors::InvalidArgument( + "When index contains a list, its length is excepted to 1, " + "but received %d", + size)); + bool all_bool = true; + int list_size = PyList_GET_SIZE(slice_item); + for (int j = 0; j < list_size; ++j) { + PyObject* list_item = PyList_GetItem(slice_item, j); + if (PyCheckInteger(list_item)) { + all_bool = false; + } else if (!PyBool_Check(list_item)) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Only support int or bool in index list.")); + } + } + if (all_bool) { + PADDLE_ENFORCE_EQ( + list_size, shape[0], + platform::errors::InvalidArgument( + "The dimension of bool index doesn't match indexed array along " + "dimension 0, the target dimension is %d, but received %d.", + shape[0], list_size)); + + for (int j = 0; j < list_size; ++j) { + PyObject* list_item = PyList_GetItem(slice_item, j); + if (list_item == Py_True) { + list_select_idxs->push_back(j); + } + } + } else { + for (int j = 0; j < list_size; ++j) { + PyObject* list_item = PyList_GetItem(slice_item, j); + if (PyCheckInteger(list_item)) { + list_select_idxs->push_back( + static_cast(PyLong_AsLong(list_item))); + } else if (list_item == Py_True) { + list_select_idxs->push_back(1); + } else { + list_select_idxs->push_back(0); + } + } + } + + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Currently, Tensor.__indices__() only allows indexing " + "by Integers, Slices, Ellipsis, None, tuples of these types " + "and list of Bool and Integers, but received " + "%s in %dth slice item", + std::string(Py_TYPE(slice_item)->tp_name), i + 1)); + } + } + + // valid_index is the number of dimensions exclude None index + const int valid_indexs = size - none_axes->size() - ell_count; + PADDLE_ENFORCE_EQ(valid_indexs <= rank, true, + platform::errors::InvalidArgument( + "Too many indices (%d) for tensor of dimension %d.", + valid_indexs, rank)); +} + +} // namespace pybind +} // namespace paddle diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index f75a0fa50a5..807f7c15196 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ 
b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -351,10 +351,10 @@ endif() set_tests_properties(test_graph PROPERTIES TIMEOUT 120) set_tests_properties(test_quantization_pass PROPERTIES TIMEOUT 120) -set_tests_properties(test_imperative_qat_channelwise PROPERTIES TIMEOUT 120) -set_tests_properties(test_user_defined_quantization PROPERTIES TIMEOUT 120) -set_tests_properties(test_imperative_qat PROPERTIES TIMEOUT 120) -set_tests_properties(test_imperative_out_scale PROPERTIES TIMEOUT 120) +set_tests_properties(test_imperative_qat_channelwise PROPERTIES TIMEOUT 200) +set_tests_properties(test_user_defined_quantization PROPERTIES TIMEOUT 200) +set_tests_properties(test_imperative_qat PROPERTIES TIMEOUT 200) +set_tests_properties(test_imperative_out_scale PROPERTIES TIMEOUT 200) if(LINUX AND WITH_MKLDNN) set_tests_properties(test_quant2_int8_mobilenetv1_mkldnn PROPERTIES TIMEOUT 120) set_tests_properties(convert_model2dot_ernie PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py index c4318b8bf8e..7b9cd7958b2 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py @@ -26,7 +26,7 @@ import paddle.fluid as fluid import paddle.fluid.layers as layers from paddle.fluid import core from paddle.fluid.optimizer import AdamOptimizer -from paddle.fluid.framework import IrGraph +from paddle.fluid.framework import IrGraph, _test_eager_guard from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware from paddle.fluid.dygraph.container import Sequential from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX @@ -122,7 +122,7 @@ class ImperativeLenet(fluid.dygraph.Layer): class TestImperativeOutSclae(unittest.TestCase): - def test_out_scale_acc(self): + def func_out_scale_acc(self): seed = 1000 lr = 0.001 @@ -166,9 +166,14 @@ class TestImperativeOutSclae(unittest.TestCase): loss_list[i] > loss_list[i + 1], msg='Failed to do the imperative qat.') + def test_out_scale_acc(self): + with _test_eager_guard(): + self.func_out_scale_acc() + self.func_out_scale_acc() + class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase): - def test_save_quantized_model(self): + def func_save_quantized_model(self): lr = 0.001 load_param_path = "test_save_quantized_model/lenet.pdparams" @@ -206,6 +211,11 @@ class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase): loss_list[i] > loss_list[i + 1], msg='Failed to do the imperative qat.') + def test_save_quantized_model(self): + with _test_eager_guard(): + self.func_save_quantized_model() + self.func_save_quantized_model() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_ptq.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_ptq.py index fb92b12cb0d..fad4c8f9d58 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_ptq.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_ptq.py @@ -29,6 +29,7 @@ import paddle.fluid as fluid from paddle.fluid.contrib.slim.quantization import * from paddle.fluid.log_helper import get_logger from paddle.dataset.common import download +from paddle.fluid.framework import _test_eager_guard from imperative_test_utils import fix_model_dict, ImperativeLenet, ImperativeLinearBn from imperative_test_utils import ImperativeLinearBn_hook @@ -194,7 +195,7 @@ class 
TestImperativePTQ(unittest.TestCase): break return top1_correct_num / total_num - def test_ptq(self): + def func_ptq(self): start_time = time.time() self.set_vars() @@ -244,9 +245,14 @@ class TestImperativePTQ(unittest.TestCase): end_time = time.time() print("total time: %ss \n" % (end_time - start_time)) + def test_ptq(self): + with _test_eager_guard(): + self.func_ptq() + self.func_ptq() + class TestImperativePTQfuse(TestImperativePTQ): - def test_ptq(self): + def func_ptq(self): start_time = time.time() self.set_vars() @@ -305,6 +311,11 @@ class TestImperativePTQfuse(TestImperativePTQ): end_time = time.time() print("total time: %ss \n" % (end_time - start_time)) + def test_ptq(self): + with _test_eager_guard(): + self.func_ptq() + self.func_ptq() + class TestImperativePTQHist(TestImperativePTQ): def set_vars(self): diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py index 677ccb52e24..5db720b028f 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py @@ -32,7 +32,7 @@ from paddle.nn import Linear, Conv2D, Softmax, Conv2DTranspose from paddle.fluid.log_helper import get_logger from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.nn.quant.quant_layers import QuantizedConv2D, QuantizedConv2DTranspose - +from paddle.fluid.framework import _test_eager_guard from imperative_test_utils import fix_model_dict, ImperativeLenet paddle.enable_static() @@ -55,7 +55,7 @@ class TestImperativeQat(unittest.TestCase): self.activation_quantize_type = 'moving_average_abs_max' print('weight_quantize_type', self.weight_quantize_type) - def test_qat(self): + def func_qat(self): self.set_vars() imperative_qat = ImperativeQuantAware( @@ -193,6 +193,11 @@ class TestImperativeQat(unittest.TestCase): np.allclose(after_save, before_save.numpy()), msg='Failed to save the inference quantized model.') + def test_qat(self): + with _test_eager_guard(): + self.func_qat() + self.func_qat() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py index d1bf76f4724..2dcf7a6f168 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py @@ -27,7 +27,7 @@ import paddle.fluid as fluid from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware from paddle.fluid.log_helper import get_logger from paddle.dataset.common import download - +from paddle.fluid.framework import _test_eager_guard from imperative_test_utils import fix_model_dict, ImperativeLenet os.environ["CPU_NUM"] = "1" diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py index 270e8ee566a..0bc80694a12 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py @@ -30,7 +30,7 @@ from paddle.fluid.dygraph import Pool2D from paddle.fluid.dygraph import Linear from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose from paddle.fluid.log_helper import get_logger - +from paddle.fluid.framework import _test_eager_guard os.environ["CPU_NUM"] = "1" _logger = get_logger( @@ -157,7 +157,7 @@ class 
TestUserDefinedActPreprocess(unittest.TestCase): _logger.info("test act_preprocess") self.imperative_qat = ImperativeQuantAware(act_preprocess_layer=PACT) - def test_quant_aware_training(self): + def func_quant_aware_training(self): imperative_qat = self.imperative_qat seed = 1 np.random.seed(seed) @@ -243,6 +243,11 @@ class TestUserDefinedActPreprocess(unittest.TestCase): train(lenet) test(lenet) + def test_quant_aware_training(self): + with _test_eager_guard(): + self.func_quant_aware_training() + self.func_quant_aware_training() + class TestUserDefinedWeightPreprocess(TestUserDefinedActPreprocess): def setUp(self): diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py index 8d2e0f753c0..d77134d72a9 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py @@ -32,6 +32,7 @@ from paddle.fluid.dygraph.nn import Pool2D from paddle.fluid.log_helper import get_logger from imperative_test_utils import fix_model_dict, train_lenet, ImperativeLenetWithSkipQuant +from paddle.fluid.framework import _test_eager_guard os.environ["CPU_NUM"] = "1" if core.is_compiled_with_cuda(): @@ -42,7 +43,8 @@ _logger = get_logger( class TestImperativeOutSclae(unittest.TestCase): - def test_out_scale_acc(self): + def func_out_scale_acc(self): + paddle.disable_static() seed = 1000 lr = 0.1 @@ -125,6 +127,11 @@ class TestImperativeOutSclae(unittest.TestCase): if find_matmul: self.assertTrue(matmul_skip_count == 1) + def test_out_scale_acc(self): + with _test_eager_guard(): + self.func_out_scale_acc() + self.func_out_scale_acc() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 8c2ff140ea4..8149d69d36a 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -99,18 +99,19 @@ def param_guard(parameters): yield -def _convert_into_variable(var_base): +def _convert_into_variable(tensor): """ Convert Varbase into Variable. """ - if isinstance(var_base, core.VarBase): + if isinstance(tensor, (core.eager.Tensor, core.VarBase)): # Check whether has been created before. - new_var = var_base.block._find_var_recursive(var_base.name) + new_var = tensor.block._find_var_recursive(tensor.name) if new_var is not None: assert isinstance(new_var, framework.Variable) # Convert ParamBase into Parameter with same attributes in dy2stat. - elif isinstance(var_base, framework.ParamBase): - new_var = var_base._to_static_var(to_parameter=True) + elif isinstance(tensor, + (framework.EagerParamBase, framework.ParamBase)): + new_var = tensor._to_static_var(to_parameter=True) else: # Note(Aurelius84): Convert VarBase in self._buffers into Variable with # same attributes and set persistable=True to allow saving this var. @@ -120,13 +121,13 @@ def _convert_into_variable(var_base): # But if its shape is empty while created from `create_variable()`, we consider this buffer # non-persistable. See case of `drop_state` in lstm api. 
- is_persistable = len(var_base.shape) > 0 + is_persistable = len(tensor.shape) > 0 - new_var = var_base._to_static_var( + new_var = tensor._to_static_var( to_parameter=False, persistable=is_persistable) return new_var else: - return var_base + return tensor def enabled(): diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py index 94fc5558ab1..a442a8b92b6 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py @@ -61,7 +61,8 @@ class NestSequence(object): def _get_var_ids(self): var_ids = [] for idx, var in enumerate(self.__input_list): - if isinstance(var, (framework.Variable, core.VarBase)): + if isinstance(var, (framework.Variable, core.VarBase, + core.eager.Tensor)): var_ids.append(idx) return var_ids @@ -73,7 +74,8 @@ class NestSequence(object): if need_check: warning_types = set() for var in self.__input_list: - if not isinstance(var, (framework.Variable, core.VarBase)): + if not isinstance(var, (framework.Variable, core.VarBase, + core.eager.Tensor)): warning_types.add(type(var)) if warning_types: logging_utils.warn( @@ -301,10 +303,17 @@ class PartialProgramLayer: for name in block.vars: if "@GRAD" in name: var_desc = block.vars[name].desc - var_base = core.VarBase(var_desc.dtype(), - var_desc.shape(), - var_desc.name(), - var_desc.type(), False) + var_base = None + if not core._in_eager_mode(): + var_base = core.VarBase(var_desc.dtype(), + var_desc.shape(), + var_desc.name(), + var_desc.type(), False) + else: + var_base = core.eager.Tensor(var_desc.dtype(), + var_desc.shape(), + var_desc.name(), + var_desc.type(), False) double_grads.append(var_base) return self._valid_vars(double_grads) @@ -386,13 +395,22 @@ class PartialProgramLayer: expected_place = framework._current_expected_place() for i, value in enumerate(flatten_inputs): if isinstance(value, np.ndarray): - var = core.VarBase( - value=value, - name=self._inputs[i].desc.name(), - persistable=False, - place=expected_place, - zero_copy=True) - elif isinstance(value, core.VarBase): + var = None + if not core._in_eager_mode(): + var = core.VarBase( + value=value, + name=self._inputs[i].desc.name(), + persistable=False, + place=expected_place, + zero_copy=True) + else: + var = core.eager.Tensor( + value=value, + name=self._inputs[i].desc.name(), + persistable=False, + place=expected_place, + zero_copy=True) + elif isinstance(value, (core.VarBase, core.eager.Tensor)): # NOTE(Aurelius84): If var is on CPUPlace, it will be transformed multi times # into CUDAPlace when it's as input of multi Ops. so we move it in advance # to avoid this problem. @@ -411,9 +429,16 @@ class PartialProgramLayer: var = self._outputs[var_id] assert isinstance(var, framework.Variable) var_desc = var.desc - var_base = core.VarBase(var_desc.dtype(), - var_desc.shape(), - var_desc.name(), var_desc.type(), False) + varbase = None + if not core._in_eager_mode(): + var_base = core.VarBase(var_desc.dtype(), + var_desc.shape(), + var_desc.name(), var_desc.type(), False) + else: + var_base = core.eager.Tensor(var_desc.dtype(), + var_desc.shape(), + var_desc.name(), + var_desc.type(), False) return var_base # Create VarBase to receive output data. 
@@ -423,12 +448,19 @@ class PartialProgramLayer: def _create_scope_vec(self): # Hold forward variables - tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [], - "program_out_scope", - core.VarDesc.VarType.STEP_SCOPES, True) - - inner_scope = core.Scope() - tmp_scope_vec.value().set_scope(inner_scope) + tmp_scope_vec = None + if not core._in_eager_mode(): + tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [], + "program_out_scope", + core.VarDesc.VarType.STEP_SCOPES, True) + # TODO(jiabin): Support this later. + # else: + # tmp_scope_vec = core.eager.Tensor(core.VarDesc.VarType.FP32, [], + # "program_out_scope", + # core.VarDesc.VarType.STEP_SCOPES, True) + + inner_scope = core.Scope() + tmp_scope_vec.value().set_scope(inner_scope) return tmp_scope_vec def _restore_out(self, out_vars): @@ -450,7 +482,8 @@ class PartialProgramLayer: return main_program.clone(for_test=True) def _is_no_value(self, var): - if isinstance(var, core.VarBase) and var.shape == [1]: + if isinstance(var, + (core.VarBase, core.eager.Tensor)) and var.shape == [1]: # NOTE: .numpy() will insert MemcpySync operation, it hits performance. if var.numpy()[0] == RETURN_NO_VALUE_MAGIC_NUM: return True @@ -460,7 +493,7 @@ class PartialProgramLayer: """ Removes invalid value for various-length return statement """ - if isinstance(out_vars, core.VarBase): + if isinstance(out_vars, (core.VarBase, core.eager.Tensor)): if self._is_no_value(out_vars): return None return out_vars @@ -527,7 +560,7 @@ class PartialProgramLayer: param_and_buffer_names_set = set() for i, var in enumerate(self._params): # self._params constains parameters and buffers with persistable=True. - if not isinstance(var, core.VarBase): + if not isinstance(var, (core.VarBase, core.eager.Tensor)): raise TypeError( 'Type of self._params[{}] in PartialProgramLayer should be Parameter or Variable, but received {}.'. format(i, type(var))) @@ -559,10 +592,18 @@ def _create_fake_var(): """ Create a fake_var (force on CPU) to handle empty input or output """ - return [ - core.VarBase(core.VarDesc.VarType.FP32, [], "Fake_var", - core.VarDesc.VarType.RAW, False) - ] + if not core._in_eager_mode(): + return [ + core.VarBase(core.VarDesc.VarType.FP32, [], "Fake_var", + core.VarDesc.VarType.RAW, False) + ] + else: + return [] + # TODO(jiabin): Support this later + # return [ + # core.eager.Tensor(core.VarDesc.VarType.FP32, [], "Fake_var", + # core.VarDesc.VarType.RAW, False) + # ] def partial_program_from(concrete_program): diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index 4bfdc3c27fa..b1865691b24 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -25,7 +25,7 @@ import threading import six import paddle -from paddle.fluid import core +from paddle.fluid import core, dygraph from paddle.fluid.compiler import BuildStrategy, CompiledProgram, ExecutionStrategy from paddle.fluid.data_feeder import check_type from paddle.fluid.layers.utils import flatten, pack_sequence_as @@ -898,30 +898,33 @@ def save(layer, path, input_spec=None, **configs): state_var_dict[var.name] = var # 3. 
share parameters from Layer to scope & record var info - for param_or_buffer in concrete_program.parameters: - # share to scope - if param_or_buffer.type == core.VarDesc.VarType.VOCAB: - scr_tensor = param_or_buffer.value().get_map_tensor() - tgt_var = scope.var(param_or_buffer.name) - tgt_var.set_vocab(scr_tensor) - else: - param_or_buffer_tensor = scope.var( - param_or_buffer.name).get_tensor() - #src_tensor = param_or_buffer.value().get_tensor() - src_tensor = state_var_dict[param_or_buffer.name].value( - ).get_tensor() - param_or_buffer_tensor._share_data_with(src_tensor) - # record var info - if param_or_buffer.name not in extra_var_info: - extra_info_dict = dict() - if param_or_buffer.name in state_names_dict: - extra_info_dict['structured_name'] = state_names_dict[ - param_or_buffer.name] - extra_info_dict[ - 'stop_gradient'] = param_or_buffer.stop_gradient - if isinstance(param_or_buffer, ParamBase): - extra_info_dict['trainable'] = param_or_buffer.trainable - extra_var_info[param_or_buffer.name] = extra_info_dict + with dygraph.guard(): + for param_or_buffer in concrete_program.parameters: + # share to scope + if param_or_buffer.type == core.VarDesc.VarType.VOCAB: + scr_tensor = param_or_buffer.value().get_map_tensor() + tgt_var = scope.var(param_or_buffer.name) + tgt_var.set_vocab(scr_tensor) + else: + param_or_buffer_tensor = scope.var( + param_or_buffer.name).get_tensor() + #src_tensor = param_or_buffer.value().get_tensor() + src_tensor = state_var_dict[param_or_buffer.name].value( + ).get_tensor() + param_or_buffer_tensor._share_data_with(src_tensor) + # record var info + if param_or_buffer.name not in extra_var_info: + extra_info_dict = dict() + if param_or_buffer.name in state_names_dict: + extra_info_dict[ + 'structured_name'] = state_names_dict[ + param_or_buffer.name] + extra_info_dict[ + 'stop_gradient'] = param_or_buffer.stop_gradient + if isinstance(param_or_buffer, ParamBase): + extra_info_dict[ + 'trainable'] = param_or_buffer.trainable + extra_var_info[param_or_buffer.name] = extra_info_dict # 4. build input & output of save_infernece_model # NOTE(chenweihang): [ Get input variables name ] diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 65bfba3f6c3..6843c0e4c3f 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -94,7 +94,7 @@ def monkey_patch_varbase(): # Note: getattr(self, attr, None) will call x.grad=x.gradient(), but gradient() only available in dygraph. # It will fail. So, for propery that different between dynamic and static graph, should not getattr(self, attr, None). attr_not_need_keys = ['grad', 'T'] - if isinstance(self, ParamBase): + if isinstance(self, (ParamBase, EagerParamBase)): attr_kwargs = self.__dict__.copy() else: attr_names = [] @@ -111,7 +111,7 @@ def monkey_patch_varbase(): attr_kwargs.update(kwargs) - if to_parameter or isinstance(self, ParamBase): + if to_parameter or isinstance(self, (ParamBase, EagerParamBase)): del attr_kwargs['persistable'] # NOTE(Aurelius84): All parameters should be placed into global block. 
attr_kwargs['block'] = attr_kwargs['block'].program.global_block() diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 4bbc0ba03c9..a48cfd9150c 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -1821,7 +1821,7 @@ def _pack_loaded_dict(load_obj): @static_only def _legacy_save(param_dict, model_path, protocol=2): def get_tensor(var): - if isinstance(var, core.VarBase): + if isinstance(var, (core.VarBase, core.eager.Tensor)): return var.numpy() elif isinstance(var, core.LoDTensor): return np.array(var) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index f022e1791da..fd7226c4866 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -10148,6 +10148,9 @@ def flatten(x, axis=1, name=None): check_variable_and_dtype( x, 'x', ['float32', 'float64', 'int8', 'int32', 'int64', 'uint8'], 'flatten') + if in_dygraph_mode(): + return _C_ops.flatten2(x, 'axis', axis)[0] + helper = LayerHelper('flatten', **locals()) if not (isinstance(x, Variable)): diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 76414ea9424..c63ad42288f 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -663,7 +663,9 @@ def assign(input, output=None): }) if is_inplace and in_dygraph_mode(): - output._bump_inplace_version() + # TODO(jiabin): Remove this when we support inplace + if not core._in_eager_mode(): + output._bump_inplace_version() return output diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index 252482fa6d2..156fdcb9b0a 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -771,13 +771,13 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue(np.array_equal(egr_tensor.numpy(), ori_arr)) ori_place = egr_tensor.place - new_arr = np.random.rand(4, 4, 16, 32).astype('float32') + new_arr = np.random.rand(4, 16, 16, 32).astype('float32') self.assertFalse(np.array_equal(egr_tensor.numpy(), new_arr)) - egr_tensor._set_value(new_arr) + egr_tensor.set_value(new_arr) self.assertEqual(egr_tensor.stop_gradient, True) self.assertTrue(egr_tensor.place._equals(ori_place)) - self.assertEqual(egr_tensor.shape, [4, 4, 16, 32]) + self.assertEqual(egr_tensor.shape, [4, 16, 16, 32]) self.assertTrue(np.array_equal(egr_tensor.numpy(), new_arr)) @@ -880,7 +880,7 @@ class EagerParamBaseUsageTestCase(unittest.TestCase): new_weight = np.ones([1, 3]).astype('float32') self.assertFalse(np.array_equal(linear.weight.numpy(), new_weight)) - linear.weight._set_value(new_weight) + linear.weight.set_value(new_weight) self.assertTrue(np.array_equal(linear.weight.numpy(), new_weight)) self.assertTrue(linear.weight.place._equals(ori_place)) diff --git a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py index 3238876b894..aac8b6a99b6 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py @@ -533,12 +533,8 @@ class TestTensorRegisterHook(unittest.TestCase): size=[self.batch_size, self.in_size]).astype('float32') data_t = paddle.to_tensor(data) - if _in_eager_mode(): - with self.assertRaises(TypeError): - out = jit_net(data_t) - else: - with self.assertRaises(AssertionError): - out = 
jit_net(data_t) + with self.assertRaises(AssertionError): + out = jit_net(data_t) def test_register_hook_in_dy2static_mode(self): with _test_eager_guard(): -- GitLab From 34d93bee16ece807bc8dc4f24dbbed64ab40d8fb Mon Sep 17 00:00:00 2001 From: zhangxiaoci Date: Thu, 3 Mar 2022 10:19:49 +0800 Subject: [PATCH 009/261] bugfix in is_xpu_support_op (#40070) --- paddle/fluid/platform/device/xpu/xpu_op_list.cc | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.cc b/paddle/fluid/platform/device/xpu/xpu_op_list.cc index 6127fcfa8de..b20e8ac9785 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_list.cc +++ b/paddle/fluid/platform/device/xpu/xpu_op_list.cc @@ -23,12 +23,9 @@ namespace paddle { namespace platform { bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type) { - auto& ops = get_kl1_ops(); auto v = get_xpu_version(type.place_.device); - if (v == phi::backends::xpu::XPUVersion::XPU2) { - ops = get_kl2_ops(); - } - + auto& ops = (v == phi::backends::xpu::XPUVersion::XPU1) ? get_kl1_ops() + : get_kl2_ops(); if (ops.find(op_name) != ops.end() && ops[op_name].find(type) != ops[op_name].end()) { return true; @@ -78,12 +75,9 @@ bool is_in_xpu_black_list(const std::string& op_name) { #ifdef PADDLE_WITH_XPU_KP bool is_xpu_kp_support_op(const std::string& op_name, const pOpKernelType& type) { - auto& ops = get_kl1_ops(); auto v = get_xpu_version(type.place_.device); - if (v == phi::backends::xpu::XPUVersion::XPU2) { - ops = get_kp_ops(); - } - + auto& ops = (v == phi::backends::xpu::XPUVersion::XPU1) ? get_kl1_ops() + : get_kp_ops(); if (ops.find(op_name) != ops.end() && ops[op_name].find(type) != ops[op_name].end()) { return true; -- GitLab From 815f7a670a459eea9213cfe46bfde47ad07c1efb Mon Sep 17 00:00:00 2001 From: Baibaifan <39549453+Baibaifan@users.noreply.github.com> Date: Thu, 3 Mar 2022 10:45:21 +0800 Subject: [PATCH 010/261] change_ASP_sharding_option (#40028) --- .../paddle/distributed/fleet/base/fleet_base.py | 16 ++++++++++++++++ python/paddle/fluid/contrib/sparsity/asp.py | 9 +++++---- .../asp/test_fleet_with_asp_sharding.py | 2 +- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index bc59b87e2ff..236322ccfca 100755 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -1430,6 +1430,22 @@ class Fleet(object): # cache original feed forward program self.origin_main_program = loss.block.program + # add distributed attr + if not hasattr(self.origin_main_program, "distributed_info_"): + setattr(self.origin_main_program, "distributed_info_", dict()) + self.origin_main_program.distributed_info_[ + "dp_degree"] = self._user_defined_strategy.sharding_configs[ + "dp_degree"] + self.origin_main_program.distributed_info_[ + "mp_degree"] = self._user_defined_strategy.sharding_configs[ + "mp_degree"] + self.origin_main_program.distributed_info_[ + "pp_degree"] = self._user_defined_strategy.sharding_configs[ + "pp_degree"] + self.origin_main_program.distributed_info_[ + "sharding_degree"] = self._user_defined_strategy.sharding_configs[ + "sharding_degree"] + context["origin_main_program"] = self.origin_main_program context["loss"] = loss if startup_program == None: diff --git a/python/paddle/fluid/contrib/sparsity/asp.py b/python/paddle/fluid/contrib/sparsity/asp.py index 937fcdf0463..ffa12ac7046 100644 --- 
a/python/paddle/fluid/contrib/sparsity/asp.py +++ b/python/paddle/fluid/contrib/sparsity/asp.py @@ -155,8 +155,7 @@ def prune_model(main_program=None, n=2, m=4, mask_algo='mask_1d', - with_mask=True, - sharding=False): + with_mask=True): r""" Pruning parameters of supported layers in :attr:`main_program` via specified mask generation function given by :attr:`mask_algo`. This @@ -179,7 +178,6 @@ def prune_model(main_program=None, mask_algo (string, optional): The function name to generate spase mask. Default is `mask_1d`. The vaild inputs should be one of 'mask_1d', 'mask_2d_greedy' and 'mask_2d_best'. with_mask (bool, optional): To prune mask Variables related to parameters or not. Ture is purning also, False is not. Defalut is True. - sharding (bool, optional): Whether to turn on sharding (model parallel) during training. Please consider turning it ON when encountering OOM using sharding. Default is False. Returns: dictionary: A dictionary with key: `parameter name` (string) and value: its corresponding mask Variable. Examples: @@ -221,7 +219,10 @@ def prune_model(main_program=None, # Must call `exe.run(startup_program)` first before calling `sparsity.prune_model` sparsity.prune_model(main_program, mask_algo='mask_2d_best') """ - if sharding: + if main_program is not None and hasattr( + main_program, + "distributed_info_") and main_program.distributed_info_[ + "sharding_degree"] > 1 and paddle.fluid.is_compiled_with_cuda(): gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) place = paddle.CUDAPlace(gpu_id) else: diff --git a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_sharding.py b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_sharding.py index 26170015ae8..d9ddd6c88d7 100644 --- a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_sharding.py +++ b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_sharding.py @@ -98,7 +98,7 @@ class TestFleetWithASPSharding(unittest.TestCase): feeder = fluid.DataFeeder(feed_list=[input_x, input_y], place=place) exe.run(startup_prog) - sparsity.prune_model(train_prog, sharding=True) + sparsity.prune_model(train_prog) data = (np.random.randn(64, 32), np.random.randint(2, size=(64, 1))) exe.run(train_prog, feed=feeder.feed([data])) -- GitLab From 00bbb8c59a2150e4cda68e0fae7a362e5cb663f5 Mon Sep 17 00:00:00 2001 From: furnace <34057289+windstamp@users.noreply.github.com> Date: Thu, 3 Mar 2022 10:51:17 +0800 Subject: [PATCH 011/261] [Phi] move gaussian_random (#39932) [Phi] move gaussian_random kernel --- paddle/fluid/operators/gaussian_random_op.cc | 23 ---- paddle/fluid/operators/gaussian_random_op.cu | 52 -------- .../phi/kernels/cpu/gaussian_random_kernel.cc | 53 +++++++++ paddle/phi/kernels/gaussian_random_kernel.h | 32 +++++ .../phi/kernels/gpu/gaussian_random_kernel.cu | 111 ++++++++++++++++++ paddle/phi/ops/compat/gaussian_random_sig.cc | 45 +++++++ 6 files changed, 241 insertions(+), 75 deletions(-) create mode 100644 paddle/phi/kernels/cpu/gaussian_random_kernel.cc create mode 100644 paddle/phi/kernels/gaussian_random_kernel.h create mode 100644 paddle/phi/kernels/gpu/gaussian_random_kernel.cu create mode 100644 paddle/phi/ops/compat/gaussian_random_sig.cc diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index 774ff0bd065..6b559885c56 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -26,27 +26,6 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; 
-template -class CPUGaussianRandomKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - float mean = context.Attr("mean"); - float std = context.Attr("std"); - auto* tensor = context.Output("Out"); - - std::normal_distribution dist(mean, std); - auto shape = GetShape(context); - tensor->Resize(shape); - int64_t size = tensor->numel(); - T* data = tensor->mutable_data(context.GetPlace()); - unsigned int seed = static_cast(context.Attr("seed")); - auto engine = framework::GetCPURandomEngine(seed); - - for (int64_t i = 0; i < size; ++i) { - data[i] = dist(*engine); - } - } -}; // namespace operators template class CPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { @@ -194,8 +173,6 @@ Used to initialize tensors with gaussian random generator. namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp, ops::GaussianRandomOpMaker); -REGISTER_OP_CPU_KERNEL(gaussian_random, ops::CPUGaussianRandomKernel, - ops::CPUGaussianRandomKernel); REGISTER_OP_CPU_KERNEL(gaussian_random_batch_size_like, ops::CPUGaussianRandomBatchSizeLikeKernel, ops::CPUGaussianRandomBatchSizeLikeKernel); diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index 21d827c7920..d419bd70e67 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -52,53 +52,6 @@ struct GaussianGenerator { } }; -template -class GPUGaussianRandomKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* tensor = context.Output("Out"); - unsigned int seed = static_cast(context.Attr("seed")); - bool seed_flag = false; - if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = true; - } - T mean = static_cast(context.Attr("mean")); - T std = static_cast(context.Attr("std")); - auto shape = GetShape(context); - tensor->Resize(shape); - - auto& dev_cxt = - context.template device_context(); - T* data = tensor->mutable_data(dev_cxt.GetPlace()); - - int64_t size = tensor->numel(); - - int device_id = context.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - - if (gen_cuda->GetIsInitPy() && seed_flag) { - if (FLAGS_use_curand) { - using MT = typename details::MPTypeTrait::Type; - distribution::normal_distribution dist; - distribution::normal_transform trans(mean, std); - distribution::distribution_and_transform(dev_cxt, tensor, dist, - trans); - } else { - auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - auto func = - GaussianGenerator(mean, std, seed_offset.first, gen_offset); - IndexKernel>(dev_cxt, tensor, func); - } - } else { - auto func = GaussianGenerator(mean, std, seed); - IndexKernel>(dev_cxt, tensor, func); - } - } -}; - template class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { public: @@ -136,11 +89,6 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL( - gaussian_random, - paddle::operators::GPUGaussianRandomKernel, - paddle::operators::GPUGaussianRandomKernel, - paddle::operators::GPUGaussianRandomKernel); REGISTER_OP_CUDA_KERNEL( gaussian_random_batch_size_like, paddle::operators::GPUGaussianRandomBatchSizeLikeKernel< diff --git a/paddle/phi/kernels/cpu/gaussian_random_kernel.cc 
b/paddle/phi/kernels/cpu/gaussian_random_kernel.cc new file mode 100644 index 00000000000..7e336f18bf8 --- /dev/null +++ b/paddle/phi/kernels/cpu/gaussian_random_kernel.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gaussian_random_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/fluid/framework/generator.h" + +namespace phi { + +template +void GaussianRandomKernel(const Context& dev_ctx, + const ScalarArray& shape, + float mean, + float std, + int seed, + DataType dtype, + DenseTensor* out) { + auto tensor = out; + + std::normal_distribution dist(mean, std); + + tensor->Resize(phi::make_ddim(shape.GetData())); + int64_t size = tensor->numel(); + T* data = dev_ctx.template Alloc(tensor); + auto engine = paddle::framework::GetCPURandomEngine(seed); + + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(*engine); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gaussian_random, + CPU, + ALL_LAYOUT, + phi::GaussianRandomKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gaussian_random_kernel.h b/paddle/phi/kernels/gaussian_random_kernel.h new file mode 100644 index 00000000000..2903d80d22d --- /dev/null +++ b/paddle/phi/kernels/gaussian_random_kernel.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" + +namespace phi { + +template +void GaussianRandomKernel(const Context& ctx, + const ScalarArray& shape, + float mean, + float std, + int seed, + DataType dtype, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/gaussian_random_kernel.cu new file mode 100644 index 00000000000..d5acc60a360 --- /dev/null +++ b/paddle/phi/kernels/gpu/gaussian_random_kernel.cu @@ -0,0 +1,111 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gaussian_random_kernel.h" + +#include +#include +#include +#include + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" +#include "paddle/phi/kernels/funcs/index_impl.cu.h" + +#include "paddle/fluid/framework/generator.h" + +DECLARE_bool(use_curand); + +namespace phi { + +template +struct GaussianGenerator { + T mean_, std_; + unsigned int seed_; + unsigned int offset_ = 0; + + __host__ __device__ GaussianGenerator(T mean, T std, int seed) + : mean_(mean), std_(std), seed_(seed) {} + + __host__ __device__ GaussianGenerator(T mean, T std, int seed, int offset) + : mean_(mean), std_(std), seed_(seed), offset_(offset) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + using MT = typename phi::kps::details::MPTypeTrait::Type; + thrust::normal_distribution dist(mean_, std_); + unsigned int new_n = n + offset_; + rng.discard(new_n); + MT out = dist(rng); + return static_cast(out); + } +}; + +template +void GaussianRandomKernel(const Context& dev_ctx, + const ScalarArray& shape, + float mean, + float std, + int seed, + DataType dtype, + DenseTensor* out) { + auto tensor = out; + + bool seed_flag = false; + if (seed == 0) { + std::random_device rd; + seed = rd(); + seed_flag = true; + } + + tensor->Resize(phi::make_ddim(shape.GetData())); + + T* data = dev_ctx.template Alloc(tensor); + + int64_t size = tensor->numel(); + + int device_id = dev_ctx.GetPlace().GetDeviceId(); + auto gen_cuda = paddle::framework::GetDefaultCUDAGenerator(device_id); + + using MT = typename phi::kps::details::MPTypeTrait::Type; + if (gen_cuda->GetIsInitPy() && seed_flag) { + if (FLAGS_use_curand) { + funcs::normal_distribution dist; + funcs::normal_transform trans(mean, std); + funcs::distribution_and_transform(dev_ctx, tensor, dist, trans); + } else { + auto seed_offset = gen_cuda->IncrementOffset(1); + int64_t gen_offset = size * seed_offset.second; + auto func = + GaussianGenerator(mean, std, seed_offset.first, gen_offset); + IndexKernel>(dev_ctx, tensor, func); + } + } else { + auto func = GaussianGenerator(mean, std, seed); + IndexKernel>(dev_ctx, tensor, func); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gaussian_random, + GPU, + ALL_LAYOUT, + phi::GaussianRandomKernel, + phi::dtype::float16, + float, + double) {} diff --git a/paddle/phi/ops/compat/gaussian_random_sig.cc b/paddle/phi/ops/compat/gaussian_random_sig.cc new file mode 100644 index 00000000000..cddcb80ebea --- /dev/null +++ b/paddle/phi/ops/compat/gaussian_random_sig.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature GaussianRandomOpArgumentMapping( + const ArgumentMappingContext& ctx) { + if (ctx.InputSize("ShapeTensorList") > 0) { + return KernelSignature("gaussian_random", + {}, + {"ShapeTensorList", "mean", "std", "seed", "dtype"}, + {"Out"}); + } + + const auto& shape = paddle::any_cast>(ctx.Attr("shape")); + if (ctx.HasInput("ShapeTensor") && shape.empty()) { + return KernelSignature("gaussian_random", + {}, + {"ShapeTensor", "mean", "std", "seed", "dtype"}, + {"Out"}); + } + + return KernelSignature("gaussian_random", + {}, + {"shape", "mean", "std", "seed", "dtype"}, + {"Out"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(gaussian_random, + phi::GaussianRandomOpArgumentMapping); -- GitLab From 3779e8077faad2f986f1c251265e82e6ab667582 Mon Sep 17 00:00:00 2001 From: crystal <62974595+Zjq9409@users.noreply.github.com> Date: Thu, 3 Mar 2022 10:55:00 +0800 Subject: [PATCH 012/261] move gather_tree infer shape (#40082) --- paddle/fluid/operators/gather_tree_op.cc | 23 ++++++++--------------- paddle/phi/infermeta/binary.cc | 13 +++++++++++++ paddle/phi/infermeta/binary.h | 4 ++++ 3 files changed, 25 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/operators/gather_tree_op.cc b/paddle/fluid/operators/gather_tree_op.cc index 2868c3697ed..7f6c82032fe 100644 --- a/paddle/fluid/operators/gather_tree_op.cc +++ b/paddle/fluid/operators/gather_tree_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -21,20 +24,6 @@ class GatherTreeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Ids"), "Input", "Ids", "GatherTree"); - OP_INOUT_CHECK(ctx->HasInput("Parents"), "Input", "Parents", "GatherTree"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GatherTree"); - - auto ids_dims = ctx->GetInputDim("Ids"); - auto parents_dims = ctx->GetInputDim("Parents"); - PADDLE_ENFORCE_EQ(ids_dims == parents_dims, true, - platform::errors::InvalidArgument( - "The shape of Input(Parents) must be same with the " - "shape of Input(Ids).")); - ctx->SetOutputDim("Out", ids_dims); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -72,4 +61,8 @@ selected ids. 
} // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(gather_tree, ops::GatherTreeOp, ops::GatherTreeOpMaker); +DELCARE_INFER_SHAPE_FUNCTOR(gather_tree, GatherTreeInferShapeFunctor, + PT_INFER_META(phi::GatherTreeMeta)); + +REGISTER_OPERATOR(gather_tree, ops::GatherTreeOp, ops::GatherTreeOpMaker, + GatherTreeInferShapeFunctor); diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 675e68af743..7682f6b3d49 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -348,4 +348,17 @@ void BCELossInferMeta(const MetaTensor& input, out->share_lod(input); } +void GatherTreeMeta(const MetaTensor& ids, + const MetaTensor& parents, + MetaTensor* out) { + auto ids_dims = ids.dims(); + auto parents_dims = parents.dims(); + PADDLE_ENFORCE_EQ(ids_dims == parents_dims, + true, + phi::errors::InvalidArgument( + "The shape of Input(Parents) must be same with the " + "shape of Input(Ids).")); + out->set_dims(ids_dims); +} + } // namespace phi diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index a0140c9a579..5906e06b293 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -68,4 +68,8 @@ void BCELossInferMeta(const MetaTensor& input, const MetaTensor& label, MetaTensor* out, MetaConfig config = MetaConfig()); + +void GatherTreeMeta(const MetaTensor& ids, + const MetaTensor& parents, + MetaTensor* out); } // namespace phi -- GitLab From b8a169119bbf8bffcd06fcf68e5634defbe217f8 Mon Sep 17 00:00:00 2001 From: liutiexing <74819124+liutiexing@users.noreply.github.com> Date: Thu, 3 Mar 2022 11:13:03 +0800 Subject: [PATCH 013/261] Workqueue threadnames (#40035) * add align for WorkQueue * add spinlock * merge develop * merge * Add EventsWaiter * Revert "Add EventsWaiter" This reverts commit e206173aa9be7401b83a53581627bfaf557c8fb2. * Set thread name for WorkQueue * Add thread names * fix ut Co-authored-by: liutiexing --- .../new_executor/workqueue/nonblocking_threadpool.h | 2 +- paddle/fluid/platform/init.cc | 3 +++ paddle/fluid/platform/os_info_test.cc | 3 +-- paddle/fluid/platform/profiler/host_event_recorder.h | 8 ++++++-- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h index 7b3916bafc9..bc65231abe7 100644 --- a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h +++ b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h @@ -409,7 +409,7 @@ class ThreadPoolTempl { return false; } platform::RecordEvent("SleepWaitForWork", - platform::TracerEventType::UserDefined, 2); + platform::TracerEventType::UserDefined, 10); ec_.CommitWait(waiter); blocked_--; return true; diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 71fd0d20143..372bfbce2ac 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -28,6 +28,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/init.h" +#include "paddle/fluid/platform/os_info.h" #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_XPU @@ -161,6 +162,8 @@ void LoadCustomDevice(const std::string &library_dir) { #endif void InitDevices() { + // set name at the entry point of Paddle + platform::SetCurrentThreadName("MainThread"); // CUPTI attribute should be set before any CUDA context is created (see CUPTI // documentation about CUpti_ActivityAttribute). #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/platform/os_info_test.cc b/paddle/fluid/platform/os_info_test.cc index b309bb98512..b3311f1d19e 100644 --- a/paddle/fluid/platform/os_info_test.cc +++ b/paddle/fluid/platform/os_info_test.cc @@ -30,8 +30,7 @@ TEST(ThreadInfo, TestThreadNameUtils) { using paddle::platform::GetCurrentThreadName; using paddle::platform::SetCurrentThreadName; using paddle::platform::GetAllThreadNames; - EXPECT_EQ("unset", GetCurrentThreadName()); - EXPECT_TRUE(SetCurrentThreadName("MainThread")); + SetCurrentThreadName("MainThread"); EXPECT_FALSE(SetCurrentThreadName("MainThread")); auto names = GetAllThreadNames(); EXPECT_TRUE(names.find(GetCurrentThreadStdId()) != names.end()); diff --git a/paddle/fluid/platform/profiler/host_event_recorder.h b/paddle/fluid/platform/profiler/host_event_recorder.h index 49f93625275..afd41352465 100644 --- a/paddle/fluid/platform/profiler/host_event_recorder.h +++ b/paddle/fluid/platform/profiler/host_event_recorder.h @@ -189,7 +189,10 @@ struct ThreadEventSection { class ThreadEventRecorder { public: - ThreadEventRecorder() { thread_id_ = GetCurrentThreadSysId(); } + ThreadEventRecorder() { + thread_id_ = GetCurrentThreadSysId(); + thread_name_ = GetCurrentThreadName(); + } DISABLE_COPY_AND_ASSIGN(ThreadEventRecorder); @@ -202,7 +205,7 @@ class ThreadEventRecorder { ThreadEventSection GatherEvents() { ThreadEventSection thr_sec; - thr_sec.thread_name = GetCurrentThreadName(); + thr_sec.thread_name = thread_name_; thr_sec.thread_id = thread_id_; thr_sec.events = std::move(base_evt_cntr_.Reduce()); return thr_sec; @@ -210,6 +213,7 @@ class ThreadEventRecorder { private: uint64_t thread_id_; + std::string thread_name_; EventContainer base_evt_cntr_; }; -- GitLab From 31d3d8574149787768ef49a5f49b2e0f271dc185 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Thu, 3 Mar 2022 11:19:17 +0800 Subject: [PATCH 014/261] [PHI] Code auto-generate for Sparse API (#40060) * suppport sparse api in yaml * support auto-gen code of sparse api * do some refactor * add unittest test_sparse_conv_api * add unitest file Co-authored-by: zkh2016 --- .gitignore | 2 + paddle/phi/api/lib/CMakeLists.txt | 33 +- paddle/phi/api/lib/api_custom_impl.cc | 2 +- .../api/lib/{api_utils.h => api_gen_utils.cc} | 65 ++-- paddle/phi/api/lib/api_gen_utils.h | 74 +++++ ...parse_api.cc => sparse_api_custom_impl.cc} | 29 +- .../sparse_api_custom_impl.h} | 10 +- paddle/phi/kernels/sparse/cpu/convolution.h | 4 +- paddle/phi/tests/api/CMakeLists.txt | 1 + paddle/phi/tests/api/test_sparse_conv_api.cc | 174 +++++++++++ python/paddle/utils/code_gen/api_base.py | 12 +- python/paddle/utils/code_gen/api_gen.py | 2 +- .../paddle/utils/code_gen/backward_api_gen.py | 2 +- python/paddle/utils/code_gen/sparse_api.yaml | 21 ++ .../paddle/utils/code_gen/sparse_api_gen.py | 282 ++++++++++++++++++ 15 files changed, 644 insertions(+), 69 deletions(-) rename paddle/phi/api/lib/{api_utils.h => api_gen_utils.cc} (62%) create mode 
100644 paddle/phi/api/lib/api_gen_utils.h rename paddle/phi/api/lib/{sparse_api.cc => sparse_api_custom_impl.cc} (86%) rename paddle/phi/api/{include/sparse_api.h => lib/sparse_api_custom_impl.h} (74%) create mode 100644 paddle/phi/tests/api/test_sparse_conv_api.cc create mode 100644 python/paddle/utils/code_gen/sparse_api.yaml create mode 100644 python/paddle/utils/code_gen/sparse_api_gen.py diff --git a/.gitignore b/.gitignore index debec551d9c..a2009a1ed30 100644 --- a/.gitignore +++ b/.gitignore @@ -7,9 +7,11 @@ paddle/fluid/op_use_default_grad_maker_DEV.spec paddle/fluid/op_use_default_grad_maker_PR.spec paddle/phi/api/backward/backward_api.h paddle/phi/api/include/api.h +paddle/phi/api/include/sparse_api.h paddle/phi/api/lib/api.cc paddle/phi/api/lib/dygraph_api.* paddle/phi/api/lib/backward_api.cc +paddle/phi/api/lib/sparse_api.cc paddle/phi/extension.h paddle/phi/include/* paddle/phi/infermeta/generated.* diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 5edb83f8c3f..4f449c578ba 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -32,6 +32,14 @@ set(bw_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/backward_api.cc) set(bw_api_header_file_tmp ${bw_api_header_file}.tmp) set(bw_api_source_file_tmp ${bw_api_source_file}.tmp) +# sparse api file +set(sparse_api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api_gen.py) +set(sparse_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml) +set(sparse_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/sparse_api.h) +set(sparse_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/sparse_api.cc) +set(sparse_api_header_file_tmp ${api_header_file}.tmp) +set(sparse_api_source_file_tmp ${api_source_file}.tmp) + # wrapped infermeta file set(wrapped_infermeta_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py) set(api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml) @@ -73,6 +81,19 @@ add_custom_command( DEPENDS ${bw_api_yaml_file} ${bw_api_gen_file} ${api_gen_base} VERBATIM) +# generate sparse api +add_custom_command( + OUTPUT ${sparse_api_header_file} ${sparse_api_source_file} + COMMAND ${PYTHON_EXECUTABLE} ${sparse_api_gen_file} + --api_yaml_path ${sparse_api_yaml_file} + --api_header_path ${sparse_api_header_file_tmp} + --api_source_path ${sparse_api_source_file_tmp} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_header_file_tmp} ${sparse_api_header_file} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_source_file_tmp} ${sparse_api_source_file} + COMMENT "copy_if_different ${sparse_api_header_file} ${sparse_sparse_api_source_file}" + DEPENDS ${sparse_api_yaml_file} ${sparse_api_gen_file} ${api_gen_base} + VERBATIM) + # generate wrapped infermeta add_custom_command( OUTPUT ${wrapped_infermeta_header_file} ${wrapped_infermeta_source_file} @@ -87,12 +108,14 @@ cc_library(op_meta_info SRCS op_meta_info.cc DEPS phi_tensor_raw) cc_library(wrapped_infermeta SRCS ${wrapped_infermeta_source_file} DEPS phi) cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS phi_tensor_raw phi_context kernel_factory) +cc_library(api_gen_utils SRCS api_gen_utils.cc DEPS phi_tensor_raw selected_rows sparse_csr_tensor sparse_coo_tensor) cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel data_device_transform) -cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch 
phi_data_transform) +cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform) +cc_library(sparse_api_custom_impl SRCS sparse_api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform) -cc_library(sparse_api SRCS sparse_api.cc DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform) -cc_library(phi_function_api SRCS ${api_source_file} DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform api_custom_impl) -cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform) -cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch backward_infermeta phi_data_transform phi_function_api api_custom_impl) +cc_library(sparse_api SRCS sparse_api.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api_custom_impl) +cc_library(phi_function_api SRCS ${api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform api_custom_impl) +cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform) +cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta phi_data_transform phi_function_api api_custom_impl) cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api) diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index 19b113838ea..fc1afb26bf4 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -14,8 +14,8 @@ limitations under the License. */ #include "paddle/phi/api/lib/api_custom_impl.h" +#include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/api_registry.h" -#include "paddle/phi/api/lib/api_utils.h" #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/utils/storage.h" diff --git a/paddle/phi/api/lib/api_utils.h b/paddle/phi/api/lib/api_gen_utils.cc similarity index 62% rename from paddle/phi/api/lib/api_utils.h rename to paddle/phi/api/lib/api_gen_utils.cc index 6c1fa97c0f5..f04e74b45fc 100644 --- a/paddle/phi/api/lib/api_utils.h +++ b/paddle/phi/api/lib/api_gen_utils.cc @@ -12,26 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#pragma once - -#include "paddle/phi/api/include/tensor.h" -#include "paddle/phi/api/lib/utils/storage.h" -#include "paddle/phi/core/compat/convert_utils.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/meta_tensor.h" -#include "paddle/phi/core/selected_rows.h" +#include "paddle/phi/api/lib/api_gen_utils.h" namespace paddle { namespace experimental { /* ------------------ for input ----------------------- */ -inline std::shared_ptr TensorToDenseTensor( - const Tensor& tensor) { +std::shared_ptr TensorToDenseTensor(const Tensor& tensor) { return std::dynamic_pointer_cast(tensor.impl()); } -inline std::shared_ptr TensorToDenseTensor( +std::shared_ptr TensorToDenseTensor( const paddle::optional& tensor) { if (tensor) { return std::dynamic_pointer_cast(tensor->impl()); @@ -39,7 +31,7 @@ inline std::shared_ptr TensorToDenseTensor( return nullptr; } -inline std::unique_ptr> TensorToDenseTensor( +std::unique_ptr> TensorToDenseTensor( const std::vector& tensors) { auto pt_tensors = std::make_unique>(); pt_tensors->reserve(tensors.size()); @@ -52,12 +44,11 @@ inline std::unique_ptr> TensorToDenseTensor( return std::move(pt_tensors); } -inline std::shared_ptr TensorToSelectedRows( - const Tensor& tensor) { +std::shared_ptr TensorToSelectedRows(const Tensor& tensor) { return std::dynamic_pointer_cast(tensor.impl()); } -inline std::shared_ptr TensorToSelectedRows( +std::shared_ptr TensorToSelectedRows( const paddle::optional& tensor) { if (tensor) { return std::dynamic_pointer_cast(tensor->impl()); @@ -67,11 +58,11 @@ inline std::shared_ptr TensorToSelectedRows( /* ----------------- for infer_meta --------------------- */ -inline phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor) { +phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor) { return phi::MetaTensor(tensor); } -inline paddle::optional MakeMetaTensor( +paddle::optional MakeMetaTensor( const paddle::optional& tensor) { if (tensor) { return {phi::MetaTensor(*tensor)}; @@ -79,7 +70,7 @@ inline paddle::optional MakeMetaTensor( return {paddle::none}; } -inline std::vector MakeMetaTensor( +std::vector MakeMetaTensor( const std::vector& tensors) { std::vector meta_tensors; meta_tensors.reserve(tensors.size()); @@ -89,11 +80,11 @@ inline std::vector MakeMetaTensor( return meta_tensors; } -inline phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor) { +phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor) { return phi::MetaTensor(tensor); } -inline paddle::optional MakeMetaTensor( +paddle::optional MakeMetaTensor( const paddle::optional& tensor) { if (tensor) { return {phi::MetaTensor(*tensor)}; @@ -103,7 +94,7 @@ inline paddle::optional MakeMetaTensor( /* ------------------ for output ----------------------- */ -inline phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) { +phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) { if (!out->initialized()) { auto dense_tensor = std::make_shared( phi::make_intrusive(phi::TransToPhiPlace(backend)), @@ -114,8 +105,9 @@ inline phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) { return static_cast(out->impl().get()); } -inline std::vector SetKernelOutput( - size_t out_size, Backend backend, std::vector* out) { +std::vector SetKernelOutput(size_t out_size, + Backend backend, + std::vector* out) { out->reserve(out_size); std::vector results(out_size); for (size_t i = 0; i < out_size; ++i) { @@ -129,8 +121,7 @@ inline std::vector SetKernelOutput( return results; } -inline phi::SelectedRows* 
SetSelectedRowsKernelOutput(Backend backend, - Tensor* out) { +phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend, Tensor* out) { if (!out->initialized()) { auto select_rows = std::make_shared(); out->set_impl(select_rows); @@ -139,5 +130,29 @@ inline phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend, return static_cast(out->impl().get()); } +phi::TensorBase* SetSparseKernelOutput(Tensor* out, TensorType type) { + if (!out->initialized()) { + if (type == TensorType::SPARSE_COO) { + auto sparse_tensor = std::make_shared( + phi::DenseTensor(), phi::DenseTensor(), phi::DDim{-1}); + out->set_impl(sparse_tensor); + return sparse_tensor.get(); + } else if (type == TensorType::SPARSE_CSR) { + auto sparse_tensor = + std::make_shared(phi::DenseTensor(), + phi::DenseTensor(), + phi::DenseTensor(), + phi::DDim{-1}); + out->set_impl(sparse_tensor); + return sparse_tensor.get(); + } else { + auto dense_tensor = std::make_shared(); + out->set_impl(dense_tensor); + return dense_tensor.get(); + } + } + return out->impl().get(); +} + } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/api_gen_utils.h b/paddle/phi/api/lib/api_gen_utils.h new file mode 100644 index 00000000000..109c6e7ab71 --- /dev/null +++ b/paddle/phi/api/lib/api_gen_utils.h @@ -0,0 +1,74 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/api/lib/utils/storage.h" +#include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/meta_tensor.h" +#include "paddle/phi/core/selected_rows.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" + +namespace paddle { +namespace experimental { + +enum class TensorType { DENSE_TENSOR, SPARSE_CSR, SPARSE_COO }; + +/* ------------------ for input ----------------------- */ + +std::shared_ptr TensorToDenseTensor(const Tensor& tensor); + +std::shared_ptr TensorToDenseTensor( + const paddle::optional& tensor); + +std::unique_ptr> TensorToDenseTensor( + const std::vector& tensors); + +std::shared_ptr TensorToSelectedRows(const Tensor& tensor); + +std::shared_ptr TensorToSelectedRows( + const paddle::optional& tensor); + +/* ----------------- for infer_meta --------------------- */ + +phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor); + +paddle::optional MakeMetaTensor( + const paddle::optional& tensor); + +std::vector MakeMetaTensor( + const std::vector& tensors); + +phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor); + +paddle::optional MakeMetaTensor( + const paddle::optional& tensor); + +/* ------------------ for output ----------------------- */ + +phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out); + +std::vector SetKernelOutput(size_t out_size, + Backend backend, + std::vector* out); + +phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend, Tensor* out); + +phi::TensorBase* SetSparseKernelOutput(Tensor* out, TensorType type); + +} // namespace experimental +} // namespace paddle diff --git a/paddle/phi/api/lib/sparse_api.cc b/paddle/phi/api/lib/sparse_api_custom_impl.cc similarity index 86% rename from paddle/phi/api/lib/sparse_api.cc rename to paddle/phi/api/lib/sparse_api_custom_impl.cc index 9e1f59c0aa7..832c19361e5 100644 --- a/paddle/phi/api/lib/sparse_api.cc +++ b/paddle/phi/api/lib/sparse_api_custom_impl.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/api/include/sparse_api.h" +#include "paddle/phi/api/lib/sparse_api_custom_impl.h" #include #include "glog/logging.h" @@ -20,31 +20,14 @@ limitations under the License. 
*/ #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/infermeta/unary.h" - -PD_DECLARE_KERNEL(dense_to_sparse_coo, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(sparse_csr_to_coo, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(dense_to_sparse_csr, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(sparse_coo_to_csr, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(sparse_coo_to_dense, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(sparse_csr_to_dense, CPU, ALL_LAYOUT); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_DECLARE_KERNEL(dense_to_sparse_coo, GPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(sparse_csr_to_coo, GPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(dense_to_sparse_csr, GPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(sparse_coo_to_csr, GPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(sparse_coo_to_dense, GPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(sparse_csr_to_dense, GPU, ALL_LAYOUT); -#endif namespace paddle { namespace experimental { namespace sparse { -PADDLE_API Tensor to_sparse_coo(const Tensor& x, - Backend backend, - const int64_t sparse_dim) { +Tensor to_sparse_coo_impl(const Tensor& x, + Backend backend, + const int64_t sparse_dim) { if (x.layout() == phi::DataLayout::SPARSE_COO) { return x; } @@ -105,7 +88,7 @@ PADDLE_API Tensor to_sparse_coo(const Tensor& x, return out; } -PADDLE_API Tensor to_sparse_csr(const Tensor& x, Backend backend) { +Tensor to_sparse_csr_impl(const Tensor& x, Backend backend) { if (x.layout() == phi::DataLayout::SPARSE_CSR) { return x; } @@ -171,7 +154,7 @@ PADDLE_API Tensor to_sparse_csr(const Tensor& x, Backend backend) { return out; } -PADDLE_API Tensor to_dense(const Tensor& x, Backend backend) { +Tensor to_dense_impl(const Tensor& x, Backend backend) { if (x.layout() != phi::DataLayout::SPARSE_CSR && x.layout() != phi::DataLayout::SPARSE_COO) { return x; diff --git a/paddle/phi/api/include/sparse_api.h b/paddle/phi/api/lib/sparse_api_custom_impl.h similarity index 74% rename from paddle/phi/api/include/sparse_api.h rename to paddle/phi/api/lib/sparse_api_custom_impl.h index a131804cd6f..293b2cfa3d3 100644 --- a/paddle/phi/api/include/sparse_api.h +++ b/paddle/phi/api/lib/sparse_api_custom_impl.h @@ -21,13 +21,13 @@ namespace paddle { namespace experimental { namespace sparse { -PADDLE_API Tensor to_sparse_coo(const Tensor& x, - Backend backend, - const int64_t sparse_dim); +Tensor to_dense_impl(const Tensor& x, Backend backend); -PADDLE_API Tensor to_sparse_csr(const Tensor& x, Backend backend); +Tensor to_sparse_coo_impl(const Tensor& x, + Backend backend, + const int64_t sparse_dim); -PADDLE_API Tensor to_dense(const Tensor& x, Backend backend); +Tensor to_sparse_csr_impl(const Tensor& x, Backend backend); } // namespace sparse } // namespace experimental diff --git a/paddle/phi/kernels/sparse/cpu/convolution.h b/paddle/phi/kernels/sparse/cpu/convolution.h index ab2fef5320f..1031f769179 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution.h +++ b/paddle/phi/kernels/sparse/cpu/convolution.h @@ -107,7 +107,9 @@ void ProductRuleBook(const Context& dev_ctx, f_calc_rulebook(nullptr); // alloc the rulebook - rulebook->ResizeAndAllocate({3, rulebook_len}); + DenseTensorMeta rulebook_meta( + DataType::INT32, {3, rulebook_len}, DataLayout::NCHW); + rulebook->set_meta(rulebook_meta); dev_ctx.Alloc(rulebook, rulebook->dtype(), rulebook->numel() * sizeof(int)); int* rulebook_ptr = rulebook->data(); f_calc_rulebook(rulebook_ptr); diff --git a/paddle/phi/tests/api/CMakeLists.txt b/paddle/phi/tests/api/CMakeLists.txt index 
cde085423e4..be12960d1d6 100644 --- a/paddle/phi/tests/api/CMakeLists.txt +++ b/paddle/phi/tests/api/CMakeLists.txt @@ -25,3 +25,4 @@ cc_test(test_concat_api SRCS test_concat_api.cc DEPS phi_tensor phi_api phi_api_ cc_test(test_split_api SRCS test_split_api.cc DEPS phi_tensor phi_api phi_api_utils) cc_test(test_data_transform SRCS test_data_transform.cc DEPS phi_tensor phi_api phi_api_utils) cc_test(test_sparse_utils_api SRCS test_sparse_utils_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_sparse_conv_api SRCS test_sparse_conv_api.cc DEPS phi_tensor phi_api phi_api_utils) diff --git a/paddle/phi/tests/api/test_sparse_conv_api.cc b/paddle/phi/tests/api/test_sparse_conv_api.cc new file mode 100644 index 00000000000..16d7cb66f4c --- /dev/null +++ b/paddle/phi/tests/api/test_sparse_conv_api.cc @@ -0,0 +1,174 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See +the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/phi/api/include/api.h" + +#include "paddle/phi/api/include/sparse_api.h" + +#include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/sparse_coo_tensor.h" + +template +void TestConv3dBase(const std::vector& indices, + const std::vector& features, + const phi::DDim& x_dims, + const std::vector& kernel, + const phi::DDim& kernel_dims, + const std::vector& correct_out_indices, + const std::vector& correct_out_features, + const phi::DDim& correct_out_dims, + const int non_zero_num, + const std::vector& paddings, + const std::vector& strides, + const std::vector& dilations, + const float diff = 1e-3) { + const auto alloc = std::make_unique( + paddle::platform::CPUPlace()); + + const int in_channels = kernel_dims[3]; + const int out_channels = kernel_dims[4]; + + phi::DenseTensor indices_tensor( + alloc.get(), + phi::DenseTensorMeta( + phi::DataType::INT32, {4, non_zero_num}, phi::DataLayout::NCHW)); + memcpy( + indices_tensor.data(), indices.data(), indices.size() * sizeof(int)); + + phi::DenseTensor features_tensor( + alloc.get(), + phi::DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), + {non_zero_num, in_channels}, + phi::DataLayout::NHWC)); + memcpy( + features_tensor.data(), features.data(), features.size() * sizeof(T)); + + auto x_tensor = std::make_shared( + indices_tensor, features_tensor, x_dims); + paddle::experimental::Tensor x(x_tensor); + + auto kernel_tensor = std::make_shared( + alloc.get(), + phi::DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), + kernel_dims, + phi::DataLayout::NHWC)); + paddle::experimental::Tensor weight(kernel_tensor); + + memcpy(kernel_tensor->mutable_data(paddle::platform::CPUPlace()), + kernel.data(), + kernel.size() * sizeof(T)); + + if (!std::is_same::value) { + auto outs = paddle::experimental::sparse::conv3d( + x, weight, paddings, dilations, strides, 1); + + auto out = std::dynamic_pointer_cast( + std::get<0>(outs).impl()); + ASSERT_EQ(correct_out_dims.size(), 
out->dims().size()); + for (int i = 0; i < correct_out_dims.size(); i++) { + ASSERT_EQ(correct_out_dims[i], out->dims()[i]); + } + ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, out->nnz()); + + int cmp_indices = memcmp(correct_out_indices.data(), + out->non_zero_indices().data(), + correct_out_indices.size() * sizeof(int)); + ASSERT_EQ(cmp_indices, 0); + + for (uint64_t i = 0; i < correct_out_features.size(); i++) { + float tmp = std::fabs(static_cast( + correct_out_features[i] - out->non_zero_elements().data()[i])); + ASSERT_LT(tmp, diff); + } + } +} + +void TestConv3d(const std::vector& indices, + const std::vector& features, + const phi::DDim& x_dims, + const std::vector& kernel, + const phi::DDim& kernel_dims, + const std::vector& correct_out_indices, + const std::vector& correct_out_features, + const phi::DDim& correct_out_dims, + const int non_zero_num, + const std::vector& paddings, + const std::vector& strides, + const std::vector& dilations) { + // test float + TestConv3dBase(indices, + features, + x_dims, + kernel, + kernel_dims, + correct_out_indices, + correct_out_features, + correct_out_dims, + non_zero_num, + paddings, + strides, + dilations); +} + +TEST(API, sparse_conv2d) { + const auto alloc = std::make_shared( + paddle::platform::CPUPlace()); + const int in_channels = 1; + const int out_channels = 1; + phi::DDim x_dims = {1, 1, 5, 5, in_channels}; + phi::DDim kernel_dims = {1, 3, 3, in_channels, out_channels}; + phi::DDim out_dims = {1, 1, 3, 3, out_channels}; + std::vector paddings = {0, 0, 0}; + std::vector strides = {1, 1, 1}; + std::vector dilations = {1, 1, 1}; + + const int non_zero_num = 3; + std::vector indices_flatten = {0, 0, 0, 0, 0, 0, 0, 4, 0, 3, 2, 4}; + + std::vector features = {-0.79394531, -0.3125, -0.55029297}; + // 3*3*3=27 + std::vector kernel = {0.65820312, + 0.75048828, + 0.21411133, + 0.17370605, + 0.85546875, + 0.53076172, + 0.28833008, + 0.71044922, + 0.00659943}; + + std::vector out_indices_flatten = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 2, 2, 1, 2, 0, 1, 2}; + + std::vector out_features = { + -0.17004, -0.71338, -0.00206, -0.22205, -0.09009}; + + TestConv3d(indices_flatten, + features, + x_dims, + kernel, + kernel_dims, + out_indices_flatten, + out_features, + out_dims, + non_zero_num, + paddings, + strides, + dilations); +} diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index cfd817c24c7..6c07cdec2ee 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -43,7 +43,9 @@ class BaseAPI(object): self.is_base_api = False self.invoke = api_item_yaml['invoke'] else: - self.infer_meta = self.parse_infer_meta(api_item_yaml['infer_meta']) + if 'infer_meta' in api_item_yaml: + self.infer_meta = self.parse_infer_meta(api_item_yaml[ + 'infer_meta']) self.kernel = self.parse_kernel(api_item_yaml['kernel']) self.support_selected_rows_kernel = False if len(self.kernel[ 'func']) == 1 else True @@ -182,9 +184,9 @@ class BaseAPI(object): 'Tensor': 'Tensor', 'Tensor[]': 'std::vector' } - if re.search(r'\(\w*\)', output_item): + if re.search(r'\([a-zA-Z0-9_@]*\)', output_item): result = re.search( - r"(?P[a-zA-Z0-9_[\]]+)\s*\((?P\w+)\)", + r"(?P[a-zA-Z0-9_[\]]+)\s*\((?P[a-zA-Z0-9_@]+)\)", output_item) out_type = result.group('out_type') assert out_type in output_type_map, \ @@ -499,11 +501,8 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. 
def get_kernel_args(self, code_indent): input_trans_map = { 'const Tensor&': 'const phi::DenseTensor&', - 'const Tensor &': 'const phi::DenseTensor&', 'const std::vector&': 'const std::vector&', - 'const std::vector &': - 'const std::vector&', 'const paddle::optional&': 'paddle::optional', 'const paddle::optional>&': @@ -592,7 +591,6 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. def get_selected_rows_kernel_args(self, code_indent): input_trans_map = { 'const Tensor&': 'const phi::SelectedRows&', - 'const Tensor &': 'const phi::SelectedRows&', 'const paddle::optional&': 'paddle::optional' } diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index a26630ad041..1bdfa8b6697 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -105,7 +105,7 @@ def source_include(header_file_path): #include "paddle/phi/api/lib/api_custom_impl.h" #include "paddle/phi/api/lib/api_registry.h" -#include "paddle/phi/api/lib/api_utils.h" +#include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/utils/storage.h" diff --git a/python/paddle/utils/code_gen/backward_api_gen.py b/python/paddle/utils/code_gen/backward_api_gen.py index 125ebed82de..b9f991f9b0f 100644 --- a/python/paddle/utils/code_gen/backward_api_gen.py +++ b/python/paddle/utils/code_gen/backward_api_gen.py @@ -146,7 +146,7 @@ def source_include(header_file_path): #include "paddle/phi/api/lib/api_custom_impl.h" #include "paddle/phi/api/lib/api_registry.h" -#include "paddle/phi/api/lib/api_utils.h" +#include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/utils/storage.h" diff --git a/python/paddle/utils/code_gen/sparse_api.yaml b/python/paddle/utils/code_gen/sparse_api.yaml new file mode 100644 index 00000000000..135989121cc --- /dev/null +++ b/python/paddle/utils/code_gen/sparse_api.yaml @@ -0,0 +1,21 @@ +- sparse_api : conv3d + args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups) + output : Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) + kernel : + func : sparse_conv3d + layout : x + +- sparse_api : to_dense + args : (Tensor x, Backend backend) + output : Tensor(out@DenseTensor) + invoke : to_dense_impl(x, backend) + +- sparse_api : to_sparse_coo + args : (Tensor x, Backend backend, int64_t sparse_dim) + output : Tensor(out@SparseCooTensor) + invoke : to_sparse_coo_impl(x, backend, sparse_dim) + +- sparse_api : to_sparse_csr + args : (Tensor x, Backend backend) + output : Tensor(out@SparseCsrTensor) + invoke : to_sparse_csr_impl(x, backend) diff --git a/python/paddle/utils/code_gen/sparse_api_gen.py b/python/paddle/utils/code_gen/sparse_api_gen.py new file mode 100644 index 00000000000..99c5a4f49f8 --- /dev/null +++ b/python/paddle/utils/code_gen/sparse_api_gen.py @@ -0,0 +1,282 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
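Roughly, the four entries in sparse_api.yaml above should expand into declarations like the following in the generated paddle/phi/api/include/sparse_api.h. This is a hedged sketch: the exact const/reference qualifiers come from api_base.py's type maps and may differ slightly, and the two-output conv3d becomes a std::tuple because of get_return_type in the generator below.

    namespace paddle {
    namespace experimental {
    namespace sparse {

    // Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) -> two outputs
    PADDLE_API std::tuple<Tensor, Tensor> conv3d(const Tensor& x,
                                                 const Tensor& kernel,
                                                 const std::vector<int>& paddings,
                                                 const std::vector<int>& dilations,
                                                 const std::vector<int>& strides,
                                                 int groups);

    // the three conversion APIs are generated as plain forwards to the
    // *_impl functions named in their `invoke` lines
    PADDLE_API Tensor to_dense(const Tensor& x, Backend backend);
    PADDLE_API Tensor to_sparse_coo(const Tensor& x, Backend backend, int64_t sparse_dim);
    PADDLE_API Tensor to_sparse_csr(const Tensor& x, Backend backend);

    }  // namespace sparse
    }  // namespace experimental
    }  // namespace paddle
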
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import yaml +import argparse +import re + +from api_base import BaseAPI + + +class SparseAPI(BaseAPI): + def __init__(self, api_item_yaml): + super(SparseAPI, self).__init__(api_item_yaml) + + def get_api_name(self, api_item_yaml): + return api_item_yaml['sparse_api'] + + def get_api_func_name(self): + return self.api + + def get_return_type(self, out_type_list): + return out_type_list[0] if len( + out_type_list) == 1 else "std::tuple<" + ",".join( + out_type_list) + ">" + + def gene_api_declaration(self): + return f""" +// {", ".join(self.outputs['names'])} +PADDLE_API {self.outputs['return_type']} {self.get_api_func_name()}({self.args_str['args_declare']}); +""" + + def get_kernel_tensor_out_type(self, output_name): + sparse_type = 'TensorType::DENSE_TENSOR' + if output_name.endswith('@SparseCooTensor'): + sparse_type = 'TensorType::SPARSE_COO' + elif output_name.endswith('@SparseCsrTensor'): + sparse_type = 'TensorType::SPARSE_CSR' + return sparse_type + + def gene_output(self, + output_type_list, + set_out_func, + code_indent, + inplace_flag=False): + kernel_output = "" + output_names = [] + output_create = "" + + if len(output_type_list) == 1: + kernel_output = 'kernel_out' + output_names.append('kernel_out') + inplace_assign = " = " + self.inplace_map[self.outputs['names'][ + 0]] if inplace_flag and self.inplace_map is not None and self.outputs[ + 'names'][0] in self.inplace_map else "" + output_create = f""" + {self.outputs['return_type']} out{inplace_assign}; + auto* kernel_out = {set_out_func}(&out, {self.get_kernel_tensor_out_type(self.outputs['names'][0])});""" + + elif len(output_type_list) > 1: + output_create = f""" + {self.outputs['return_type']} out;""" + + for i in range(len(output_type_list)): + kernel_output = kernel_output + f'kernel_out_{i}, ' + output_names.append(f'kernel_out_{i}') + if inplace_flag and self.inplace_map is not None and self.outputs[ + 'names'][i] in self.inplace_map: + output_create = output_create + f""" + std::get<{i}>(out) = {self.inplace_map[self.outputs['names'][i]]};""" + + output_create = output_create + f""" + auto* kernel_out_{i} = {set_out_func}(&std::get<{i}>(out), {self.get_kernel_tensor_out_type(self.outputs['names'][i])});""" + + kernel_output = kernel_output[:-2] + else: + raise ValueError( + "{} : Output error: the output should not be empty.".format( + self.api)) + + return kernel_output, output_names, output_create + + def gen_sparse_kernel_context(self, kernel_output_names): + input_trans_map = { + 'const Tensor&': 'const phi::TenseBase&', + 'const std::vector&': 'const std::vector&', + 'const paddle::optional&': + 'paddle::optional' + } + out_trans_map = { + 'Tensor': 'phi::TenseBase*', + 'std::vector': 'std::vector' + } + input_names = self.inputs['names'] + input_infos = self.inputs['input_info'] + + attr_names = self.attrs['names'] + kernel_param = self.kernel['param'] + if kernel_param is None: + kernel_param = input_names + attr_names + + kernel_context_code = "" + for param in kernel_param: + if param in input_names: + if param in self.optional_vars: + raise ValueError( + f"{self.api} : Unsupport 
optional input({param}) for sparse api." + ) + else: + kernel_context_code = kernel_context_code + f""" + kernel_context.EmplaceBackInput({param}.impl().get());""" + + continue + if param in attr_names: + # set attr for kernel_context + if 'ScalarArray' in self.attrs['attr_info'][param][0]: + param = 'phi::ScalarArray(' + param + ')' + elif 'Scalar' in self.attrs['attr_info'][param][0]: + param = 'phi::Scalar(' + param + ')' + elif isinstance(param, bool): + param = str(param).lower() + else: + param + str(param) + ", " + kernel_context_code = kernel_context_code + f""" + kernel_context.EmplaceBackAttr({param});""" + + for out_name in kernel_output_names: + kernel_context_code = kernel_context_code + f""" + kernel_context.EmplaceBackOutput({out_name});""" + + return kernel_context_code + + def gen_sparse_kernel_code(self, inplace_flag=False): + _, kernel_output_names, output_create = self.gene_output( + self.outputs['types'], 'SetSparseKernelOutput', '', inplace_flag) + + kernel_context_code = self.gen_sparse_kernel_context( + kernel_output_names) + + return f""" + auto phi_kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "{self.kernel['func'][0]}", {{kernel_backend, kernel_layout, kernel_data_type}}); + VLOG(6) << "{self.api} api sparse kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]"; + VLOG(6) << "{self.api} api sparse kernel: " << phi_kernel; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); + auto kernel_context = phi::KernelContext(dev_ctx); +{output_create} +{kernel_context_code} + phi_kernel(&kernel_context); + + return out;""" + + def gene_base_api_code(self, inplace_flag=False): + api_func_name = self.get_api_func_name() + return f""" +PADDLE_API {self.outputs['return_type']} {api_func_name}({self.args_str["args_define"]}) {{ +{self.gene_kernel_select()} +{self.gen_sparse_kernel_code(inplace_flag)} +}} +""" + + +def header_include(): + return """ +#include + +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/common/scalar_array.h" +#include "paddle/utils/optional.h" +""" + + +def source_include(header_file_path): + return f""" +#include "{header_file_path}" +#include + +#include "glog/logging.h" + +#include "paddle/phi/api/lib/api_registry.h" +#include "paddle/phi/api/lib/api_gen_utils.h" +#include "paddle/phi/api/lib/data_transform.h" +#include "paddle/phi/api/lib/kernel_dispatch.h" +#include "paddle/phi/api/lib/sparse_api_custom_impl.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/declarations.h" +""" + + +def api_register(): + return """ +PD_REGISTER_API(Test); +""" + + +def api_namespace(): + return (""" +namespace paddle { +namespace experimental { +namespace sparse { + +""", """ + +} // namespace sparse +} // namespace experimental +} // namespace paddle +""") + + +def generate_api(api_yaml_path, header_file_path, source_file_path): + + with open(api_yaml_path, 'r') as f: + apis = yaml.load(f, Loader=yaml.FullLoader) + header_file = open(header_file_path, 'w') + source_file = open(source_file_path, 'w') + + namespace = api_namespace() + + header_file.write("#pragma once\n") + header_file.write(header_include()) + header_file.write(namespace[0]) + + include_header_file = "paddle/phi/api/include/sparse_api.h" + source_file.write(source_include(include_header_file)) + source_file.write(namespace[0]) + + for api in apis: + sparse_api = SparseAPI(api) + header_file.write(sparse_api.gene_api_declaration()) + 
source_file.write(sparse_api.gene_api_code()) + + header_file.write(namespace[1]) + source_file.write(namespace[1]) + + source_file.write(api_register()) + + header_file.close() + source_file.close() + + +def main(): + parser = argparse.ArgumentParser( + description='Generate PaddlePaddle C++ Sparse API files') + parser.add_argument( + '--api_yaml_path', + help='path to sparse api yaml file', + default='python/paddle/utils/code_gen/sparse_api.yaml') + + parser.add_argument( + '--api_header_path', + help='output of generated api header code file', + default='paddle/phi/api/include/sparse_api.h') + + parser.add_argument( + '--api_source_path', + help='output of generated api source code file', + default='paddle/phi/api/lib/sparse_api.cc') + + options = parser.parse_args() + + api_yaml_path = options.api_yaml_path + header_file_path = options.api_header_path + source_file_path = options.api_source_path + + generate_api(api_yaml_path, header_file_path, source_file_path) + + +if __name__ == '__main__': + main() -- GitLab From 2ffa643644241b1cecb1a0255dddbfbf1688c16c Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Thu, 3 Mar 2022 11:23:43 +0800 Subject: [PATCH 015/261] fix output var may be nullptr and cause segment fault bug (#40079) --- paddle/fluid/framework/operator.cc | 20 ++++++++++++-------- paddle/fluid/imperative/prepared_operator.h | 19 +++++++++++-------- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 8ebc64e5f2c..b68748a687c 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2106,15 +2106,19 @@ void OperatorWithKernel::BuildPhiKernelContext( for (size_t offset = 0; offset < outs_vector.size(); ++offset) { phi::TensorBase* tensor_out = nullptr; auto* var = outs_vector[offset]; - if (var->template IsType()) { - tensor_out = var->template GetMutable(); - } else if (var->template IsType()) { - tensor_out = var->template GetMutable(); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported output `%s` type when call pt kernel.", - framework::ToTypeName(var->Type()))); + + if (var) { + if (var->template IsType()) { + tensor_out = var->template GetMutable(); + } else if (var->template IsType()) { + tensor_out = var->template GetMutable(); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported output `%s` type when call pt kernel.", + framework::ToTypeName(var->Type()))); + } } + pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 3b5762720e7..30dbe07d7af 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -314,15 +314,18 @@ void BuildDygraphPhiKernelContext( phi::TensorBase* tensor_out = nullptr; auto* var = outs_vector[offset]->MutableVar(); - if (var->template IsType()) { - tensor_out = var->template GetMutable(); - } else if (var->template IsType()) { - tensor_out = var->template GetMutable(); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported output `%s` type when call pt kernel.", - framework::ToTypeName(var->Type()))); + if (var) { + if (var->template IsType()) { + tensor_out = var->template GetMutable(); + } else if (var->template IsType()) { + tensor_out = var->template GetMutable(); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported output `%s` type when call pt kernel.", + 
framework::ToTypeName(var->Type()))); + } } + kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i); -- GitLab From b565b349752d0917fd5ca3f118ad1c618a098db9 Mon Sep 17 00:00:00 2001 From: lilong12 Date: Thu, 3 Mar 2022 11:44:59 +0800 Subject: [PATCH 016/261] add communication api for ProcessGroupNCCL (#40097) --- .../distributed/collective/ProcessGroup.h | 29 ++++ .../collective/ProcessGroupNCCL.cc | 143 ++++++++++++++++++ .../distributed/collective/ProcessGroupNCCL.h | 14 ++ paddle/fluid/distributed/collective/Types.h | 9 ++ paddle/fluid/pybind/distributed_py.cc | 57 +++++++ .../tests/unittests/process_group_nccl.py | 100 +++++++++++- 6 files changed, 345 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index e4f27205202..e43d0e8c183 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -117,6 +117,35 @@ class ProcessGroup { "ProcessGroup%s does not support receive", GetBackendName())); } + virtual std::shared_ptr AllGather( + std::vector& in_tensors /* tensors */, // NOLINT + std::vector& out_tensors /* tensors */) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support AllGather", GetBackendName())); + } + + virtual std::shared_ptr AllToAll( + std::vector& in /* tensors */, // NOLINT + std::vector& out /* tensors */) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support AllToAll", GetBackendName())); + } + + virtual std::shared_ptr Reduce( + std::vector& tensors /* tensors */, // NOLINT + const ReduceOptions& opts) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support Reduce", GetBackendName())); + } + + virtual std::shared_ptr Scatter( + std::vector& in_tensors /* tensors */, // NOLINT + std::vector& out_tensors /* tensors */, // NOLINT + const ScatterOptions&) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support Scatter", GetBackendName())); + } + protected: const int rank_; const int size_; diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 5d96e730aa4..88d8fb69eb6 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -473,5 +473,148 @@ std::shared_ptr ProcessGroupNCCL::Recv( return task; } +std::shared_ptr ProcessGroupNCCL::AllGather( + std::vector& in_tensors, std::vector& out_tensors) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), true, + platform::errors::InvalidArgument("All outputs should be in CudaPlace.")); + return Collective( + in_tensors, out_tensors, + [&](const Tensor& input, Tensor& output, ncclComm_t comm, + const gpuStream_t& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::ncclAllGather( + input_tensor->data(), output_tensor->data(), input_tensor->numel(), + platform::ToNCCLDataType(input.type()), comm, stream); + }, + CommType::ALLGATHER); +} + +void* GetPointerByOffset(void* raw_pointer, size_t offset, + experimental::DataType 
type) { + if (type == experimental::DataType::FLOAT32) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else if (type == experimental::DataType::FLOAT64) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else if (type == experimental::DataType::INT32) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else if (type == experimental::DataType::INT64) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else if (type == experimental::DataType::FLOAT16) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "This datatype in nccl is not supported.")); + } +} + +std::shared_ptr ProcessGroupNCCL::AllToAll( + std::vector& in_tensors, std::vector& out_tensors) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + in_tensors, out_tensors, + [&](const Tensor& input, Tensor& output, ncclComm_t comm, + const gpuStream_t& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + size_t offset = 0; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto i = 0; i < size_; i++) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + GetPointerByOffset(input_tensor->data(), offset, input.type()), + input_tensor->numel() / size_, + platform::ToNCCLDataType(input.type()), i, comm, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + GetPointerByOffset(output_tensor->data(), offset, input.type()), + input_tensor->numel() / size_, + platform::ToNCCLDataType(input.type()), i, comm, stream)); + offset += input_tensor->numel() / size_; + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + }, + CommType::ALLREDUCE); +} + +std::shared_ptr ProcessGroupNCCL::Reduce( + std::vector& tensors, const ReduceOptions& opts) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + tensors, tensors, + [&](const Tensor& input, Tensor& output, ncclComm_t comm, + const gpuStream_t& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( + input_tensor->data(), output_tensor->data(), input.numel(), + platform::ToNCCLDataType(input.type()), + ToNCCLRedType(opts.reduce_op), opts.root_rank, comm, stream)); + }, + CommType::REDUCE); +} + +std::shared_ptr ProcessGroupNCCL::Scatter( + std::vector& in_tensors, std::vector& out_tensors, + const ScatterOptions& opts) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + in_tensors, out_tensors, + [&](const Tensor& input, Tensor& output, ncclComm_t comm, + const gpuStream_t& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + size_t 
offset = 0; + if (rank_ == opts.root_rank) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto i = 0; i < size_; i++) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + GetPointerByOffset(input_tensor->data(), offset, input.type()), + input_tensor->numel() / size_, + platform::ToNCCLDataType(input.type()), i, comm, stream)); + offset += input_tensor->numel() / size_; + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + output_tensor->data(), input_tensor->numel() / size_, + platform::ToNCCLDataType(input.type()), opts.root_rank, comm, + stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + output_tensor->data(), input_tensor->numel() / size_, + platform::ToNCCLDataType(input.type()), opts.root_rank, comm, + stream)); + } + }, + CommType::SCATTER); +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index cfeb6467f0d..d63a5e76838 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -98,6 +98,20 @@ class ProcessGroupNCCL : public ProcessGroup { std::shared_ptr Recv(std::vector& tensors, int src_rank) override; + std::shared_ptr AllGather( + std::vector& in_tensors, + std::vector& out_tensors) override; + + std::shared_ptr AllToAll( + std::vector& in, std::vector& out) override; + + std::shared_ptr Reduce( + std::vector& tensors, const ReduceOptions& opts) override; + + std::shared_ptr Scatter(std::vector& in_tensors, + std::vector& out_tensors, + const ScatterOptions&) override; + protected: virtual std::shared_ptr CreateTask( std::vector places, int rank, CommType opType, diff --git a/paddle/fluid/distributed/collective/Types.h b/paddle/fluid/distributed/collective/Types.h index 699222ac452..973f7c64354 100644 --- a/paddle/fluid/distributed/collective/Types.h +++ b/paddle/fluid/distributed/collective/Types.h @@ -36,5 +36,14 @@ struct BarrierOptions { std::vector place_ids; }; +struct ReduceOptions { + ReduceOp reduce_op = ReduceOp::SUM; + int root_rank = 0; +}; + +struct ScatterOptions { + int root_rank = 0; +}; + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 3b5644764a5..17512863357 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -77,6 +77,11 @@ void BindDistributed(py::module *m) { .def(py::init<>()) .def_readwrite("place_ids", &distributed::BarrierOptions::place_ids); + py::class_(*m, "ReduceOptions") + .def(py::init<>()) + .def_readwrite("reduce_op", &distributed::ReduceOptions::reduce_op) + .def_readwrite("source_root", &distributed::ReduceOptions::root_rank); + auto ProcessGroup = py::class_>(*m, "ProcessGroup") @@ -134,6 +139,58 @@ void BindDistributed(py::module *m) { return self.Recv(tensors, src); }, py::arg("tensor"), py::arg("src"), + py::call_guard()) + + .def("all_gather", + [](distributed::ProcessGroup &self, py::handle py_in_tensor, + py::handle py_out_tensor) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + std::vector in_tensors = {in_tensor}; + std::vector out_tensors = {out_tensor}; + return self.AllGather(in_tensors, out_tensors); + }, + py::arg("in"), py::arg("out"), + py::call_guard()) + + 
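Putting the new surface together, a hedged C++ usage sketch (the function name and the root-rank choices are illustrative; the vector element type follows the Tensor alias ProcessGroup already uses, and every collective returns a task that is waited on, mirroring task.wait() in the Python tests below):

    #include <vector>

    #include "paddle/fluid/distributed/collective/ProcessGroup.h"
    #include "paddle/fluid/distributed/collective/Types.h"

    using paddle::experimental::Tensor;

    void RunNewCollectives(paddle::distributed::ProcessGroup& pg,
                           std::vector<Tensor>& in,
                           std::vector<Tensor>& out) {
      // every rank contributes `in`; `out` receives the concatenation over ranks
      auto all_gather_task = pg.AllGather(in, out);
      all_gather_task->Wait();

      // rank i's j-th chunk of `in` lands in rank j's i-th chunk of `out`
      auto all_to_all_task = pg.AllToAll(in, out);
      all_to_all_task->Wait();

      // reduce into rank 0 with the default ReduceOp::SUM
      paddle::distributed::ReduceOptions reduce_opts;
      reduce_opts.root_rank = 0;
      pg.Reduce(in, reduce_opts)->Wait();

      // rank 0 scatters equal chunks of `in`; every rank receives its chunk in `out`
      paddle::distributed::ScatterOptions scatter_opts;
      scatter_opts.root_rank = 0;
      pg.Scatter(in, out, scatter_opts)->Wait();
    }
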
.def("alltoall", + [](distributed::ProcessGroup &self, py::handle py_in_tensor, + py::handle py_out_tensor) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + std::vector in_tensors = {in_tensor}; + std::vector out_tensors = {out_tensor}; + return self.AllToAll(in_tensors, out_tensors); + }, + py::arg("in"), py::arg("out"), + py::call_guard()) + + .def("reduce", + [](distributed::ProcessGroup &self, py::handle py_in_tensor, + int dst, distributed::ReduceOp op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + distributed::ReduceOptions opts; + opts.reduce_op = op; + opts.root_rank = dst; + std::vector tensors = {in_tensor}; + return self.Reduce(tensors, opts); + }, + py::arg("tensor"), py::arg("dst"), + py::arg("op") = distributed::ReduceOp::SUM, + py::call_guard()) + + .def("scatter", + [](distributed::ProcessGroup &self, py::handle py_in_tensor, + py::handle py_out_tensor, int src) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + distributed::ScatterOptions opts; + opts.root_rank = src; + std::vector in_tensors = {in_tensor}; + std::vector out_tensors = {out_tensor}; + return self.Scatter(in_tensors, out_tensors, opts); + }, + py::arg("in"), py::arg("out"), py::arg("src"), py::call_guard()); #if defined(PADDLE_WITH_NCCL) diff --git a/python/paddle/fluid/tests/unittests/process_group_nccl.py b/python/paddle/fluid/tests/unittests/process_group_nccl.py index 8ec5d13c569..4833cea9a8d 100644 --- a/python/paddle/fluid/tests/unittests/process_group_nccl.py +++ b/python/paddle/fluid/tests/unittests/process_group_nccl.py @@ -144,23 +144,109 @@ class TestProcessGroupFp32(unittest.TestCase): print("test barrier api ok\n") - # test send/recv + # test allgather # rank 0 x = np.random.random(self.shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + out_shape = list(self.shape) + out_shape[0] *= 2 + out = np.random.random(out_shape).astype(self.dtype) + tensor_out = paddle.to_tensor(out) + if pg.rank() == 0: + task = pg.all_gather(tensor_x, tensor_out) + task.wait() + paddle.device.cuda.synchronize() + # rank 1 + else: + task = pg.all_gather(tensor_y, tensor_out) + task.wait() + paddle.device.cuda.synchronize() + out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2]) + out_2 = paddle.slice(tensor_out, [0], [out_shape[0] // 2], + [out_shape[0]]) + assert np.array_equal(tensor_x, out_1) + assert np.array_equal(tensor_y, out_2) + print("test allgather api ok\n") + + # test alltoall + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + out1 = np.random.random(self.shape).astype(self.dtype) + out2 = np.random.random(self.shape).astype(self.dtype) tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + tensor_out1 = paddle.to_tensor(out1) + tensor_out2 = paddle.to_tensor(out2) + raw_tensor_x_2 = paddle.slice(tensor_x, [0], [self.shape[0] // 2], + [self.shape[0]]) + raw_tensor_y_1 = paddle.slice(tensor_y, [0], [0], + [self.shape[0] // 2]) if pg.rank() == 0: - task = pg.send(tensor_x, dst=1) + task = pg.alltoall(tensor_x, tensor_out1) task.wait() paddle.device.cuda.synchronize() # rank 1 else: - y = np.random.random(self.shape).astype(self.dtype) - tensor_y = paddle.to_tensor(y) - task = pg.recv(tensor_y, src=0) + task = pg.alltoall(tensor_y, tensor_out2) task.wait() 
paddle.device.cuda.synchronize() - assert np.array_equal(tensor_x, tensor_y) - print("test send/recv api ok\n") + out1_2 = paddle.slice(tensor_out1, [0], [self.shape[0] // 2], + [self.shape[0]]) + out2_1 = paddle.slice(tensor_out2, [0], [0], [self.shape[0] // 2]) + if pg.rank() == 0: + assert np.array_equal(out1_2.numpy(), raw_tensor_y_1.numpy()) + else: + assert np.array_equal(out2_1, raw_tensor_x_2) + print("test alltoall api ok\n") + + # test Reduce + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + sum_result = tensor_x + tensor_y + if pg.rank() == 0: + task = pg.reduce(tensor_x, 0) + task.wait() + paddle.device.cuda.synchronize() + # rank 1 + else: + task = pg.reduce(tensor_y, 0) + task.wait() + paddle.device.cuda.synchronize() + if pg.rank() == 0: + assert np.array_equal(tensor_x, sum_result) + print("test reduce sum api ok\n") + + # test Scatter + # rank 0 + in_shape = list(self.shape) + in_shape[0] *= 2 + x = np.random.random(in_shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + if pg.rank() == 0: + task = pg.scatter(tensor_x, tensor_y, 0) + task.wait() + paddle.device.cuda.synchronize() + # rank 1 + else: + task = pg.scatter(tensor_x, tensor_y, 0) + task.wait() + paddle.device.cuda.synchronize() + out1 = paddle.slice(tensor_x, [0], [0], [self.shape[0]]) + out2 = paddle.slice(tensor_x, [0], [self.shape[0]], + [self.shape[0] * 2]) + if pg.rank() == 0: + assert np.array_equal(tensor_y, out1) + else: + assert np.array_equal(tensor_y, out2) + print("test scatter api ok\n") class TestProcessGroupFp16(TestProcessGroupFp32): -- GitLab From 9f74b84eea01c9286640a8be79190a628abd9eed Mon Sep 17 00:00:00 2001 From: xiongkun Date: Thu, 3 Mar 2022 12:15:14 +0800 Subject: [PATCH 017/261] [phi] transfer pad kernel into phi and pass the test_pad_op (#40012) * add pad forward * fix error * transfer pad and pass the test_pad_op --- paddle/fluid/operators/conv_cudnn_op.cu | 42 +++++------ .../operators/conv_transpose_cudnn_op.cu | 48 +++++++------ .../fluid/operators/fused/conv_fusion_op.cu | 12 ++-- paddle/fluid/operators/pad_constant_like_op.h | 12 ++-- paddle/fluid/operators/pad_op.cc | 39 +---------- paddle/fluid/operators/pad_op.h | 63 ----------------- paddle/fluid/operators/spectral_op.h | 7 +- paddle/phi/kernels/cpu/pad_grad_kernel.cc | 28 ++++++++ paddle/phi/kernels/cpu/pad_kernel.cc | 30 ++++++++ .../math => phi/kernels/funcs}/padding.h | 70 +++++++++++-------- paddle/phi/kernels/gpu/pad_grad_kernel.cu | 29 ++++++++ paddle/phi/kernels/gpu/pad_kernel.cu | 31 ++++++++ .../phi/kernels/impl/pad_grad_kernel_impl.h | 33 +++++++++ paddle/phi/kernels/impl/pad_kernel_impl.h | 32 +++++++++ paddle/phi/kernels/pad_grad_kernel.h | 28 ++++++++ paddle/phi/kernels/pad_kernel.h | 28 ++++++++ paddle/phi/ops/compat/pad_sig.cc | 28 ++++++++ 17 files changed, 372 insertions(+), 188 deletions(-) delete mode 100644 paddle/fluid/operators/pad_op.h create mode 100644 paddle/phi/kernels/cpu/pad_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/pad_kernel.cc rename paddle/{fluid/operators/math => phi/kernels/funcs}/padding.h (67%) create mode 100644 paddle/phi/kernels/gpu/pad_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/pad_kernel.cu create mode 100644 paddle/phi/kernels/impl/pad_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/pad_kernel_impl.h create mode 
100644 paddle/phi/kernels/pad_grad_kernel.h create mode 100644 paddle/phi/kernels/pad_kernel.h create mode 100644 paddle/phi/ops/compat/pad_sig.cc diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index dff60afd74c..2055bf560e6 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -25,10 +25,10 @@ limitations under the License. */ #include "paddle/fluid/operators/conv_cudnn_helper.h" #endif #include "paddle/fluid/operators/conv_op.h" -#include "paddle/fluid/operators/math/padding.h" #include "paddle/fluid/platform/cudnn_workspace_helper.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/phi/kernels/funcs/padding.h" DECLARE_bool(cudnn_deterministic); DECLARE_uint64(conv_workspace_size_limit); @@ -148,7 +148,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { in_data_dims, strides, ksize); int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); + bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); Tensor transformed_input; std::vector padding_common(data_dim, 0); @@ -196,13 +196,13 @@ class CUDNNConvOpKernel : public framework::OpKernel { T pad_value(0.0); switch (rank) { case 4: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_input_channel, pad_value, &transformed_input); } break; case 5: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_input_channel, pad_value, &transformed_input); } break; default: @@ -488,7 +488,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { // cuDNN only supports padding the same amount on every dimension. // So we create a new padded input tensor. 
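Across these call sites the change is mechanical: the padding helpers move from paddle::operators::math to phi::funcs and take the device context itself instead of the ExecutionContext. A hedged standalone sketch of the new helper on a rank-4 CPU tensor (the wrapper name is illustrative, and the template arguments, which the diff text renders without angle brackets, are restored here by assumption):

    #include <vector>

    #include "paddle/phi/backends/cpu/cpu_context.h"
    #include "paddle/phi/core/dense_tensor.h"
    #include "paddle/phi/kernels/funcs/padding.h"

    // Pads a [N, C, H, W] tensor by one element on each side of H and W.
    // `out` must already carry the padded dims and allocated memory.
    void PadHW(const phi::CPUContext& dev_ctx,
               const phi::DenseTensor& x,
               phi::DenseTensor* out) {
      // (before, after) pairs per dimension: N, C, H, W
      const std::vector<int> pads = {0, 0, 0, 0, 1, 1, 1, 1};
      phi::funcs::PadFunction<phi::CPUContext, float, 4>(
          dev_ctx, pads, x, /*pad_value=*/0.0f, out);
    }
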
int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); + bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); Tensor transformed_input(input->type()); Tensor transformed_input_grad(input->type()); std::vector padding_common(data_dim, 0); @@ -544,13 +544,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { T pad_value(0.0); switch (rank) { case 4: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_input_channel, pad_value, &transformed_input); } break; case 5: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_input_channel, pad_value, &transformed_input); } break; default: @@ -956,7 +956,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { in_data_dims, strides, ksize); int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); + bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); Tensor transformed_X(X->type()); Tensor transformed_ddX(X->type()); @@ -1004,20 +1004,22 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { T pad_value(0.0); switch (rank) { case 4: { - math::PadFunction( - ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_X_channel, pad_value, + &transformed_X); if (ddX) { - math::PadFunction( - ctx, input_pad, transformed_ddX_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_ddX_channel, pad_value, &transformed_ddX); } } break; case 5: { - math::PadFunction( - ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_X_channel, pad_value, + &transformed_X); if (ddX) { - math::PadFunction( - ctx, input_pad, transformed_ddX_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_ddX_channel, pad_value, &transformed_ddX); } } break; diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu index 4b8f9d7e6ca..141a99f60f1 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu @@ -21,8 +21,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/conv_cudnn_helper.h" #endif #include "paddle/fluid/operators/conv_transpose_op.h" -#include "paddle/fluid/operators/math/padding.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/padding.h" namespace paddle { namespace operators { @@ -108,7 +108,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { in_data_dims, strides, ksize); int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); + bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); std::vector input_pad(input_transpose.dims().size() * 2, 0); Tensor transformed_input; @@ -139,12 +139,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { T pad_value(0.0); switch (rank) { case 4: { - math::PadFunction( - ctx, input_pad, input_transpose, pad_value, &transformed_input); + phi::funcs::PadFunction( + dev_ctx, input_pad, input_transpose, pad_value, + &transformed_input); } break; case 5: { - math::PadFunction( - ctx, input_pad, input_transpose, pad_value, &transformed_input); + phi::funcs::PadFunction( + dev_ctx, input_pad, input_transpose, pad_value, + &transformed_input); } break; default: PADDLE_THROW(platform::errors::InvalidArgument( @@ -375,7 +377,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { in_data_dims, strides, ksize); int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); + bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); std::vector input_pad(input_transpose.dims().size() * 2, 0); Tensor transformed_output_grad; @@ -407,13 +409,13 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { T pad_value(0.0); switch (rank) { case 4: { - math::PadFunction( - ctx, input_pad, output_grad_transpose, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, output_grad_transpose, pad_value, &transformed_output_grad); } break; case 5: { - math::PadFunction( - ctx, input_pad, output_grad_transpose, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, output_grad_transpose, pad_value, &transformed_output_grad); } break; default: @@ -735,7 +737,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { in_data_dims, strides, ksize); int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); + bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); Tensor transformed_X(X->type()); Tensor transformed_ddX(X->type()); @@ -794,26 +796,28 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { T pad_value(0.0); switch (rank) { case 4: { - math::PadFunction( - ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_X_channel, pad_value, + &transformed_X); if (dO) { - math::PadFunction( - ctx, input_pad, transformed_dO_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_dO_channel, pad_value, &transformed_dO); } if (ddX) { - math::PadFunction( - ctx, input_pad, transformed_ddX_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_ddX_channel, pad_value, &transformed_ddX); } } break; case 5: { - math::PadFunction( - ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_X_channel, pad_value, + &transformed_X); if (ddX) { - math::PadFunction( - ctx, 
input_pad, transformed_ddX_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_ddX_channel, pad_value, &transformed_ddX); } } break; diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index bb5b363fe83..5dbf4fb88b2 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -17,8 +17,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/conv_op.h" -#include "paddle/fluid/operators/math/padding.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/kernels/funcs/padding.h" DECLARE_int64(cudnn_exhaustive_search_times); @@ -86,7 +86,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { in_data_dims, strides, ksize); int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); + bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); Tensor transformed_input; std::vector padding_common(data_dim, 0); @@ -118,13 +118,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { T pad_value(0.0); switch (rank) { case 4: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_input_channel, pad_value, &transformed_input); } break; case 5: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_input_channel, pad_value, &transformed_input); } break; default: diff --git a/paddle/fluid/operators/pad_constant_like_op.h b/paddle/fluid/operators/pad_constant_like_op.h index 5df167fdf72..0aedd800e1a 100644 --- a/paddle/fluid/operators/pad_constant_like_op.h +++ b/paddle/fluid/operators/pad_constant_like_op.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/padding.h" +#include "paddle/phi/kernels/funcs/padding.h" namespace paddle { namespace operators { @@ -50,8 +50,9 @@ class PadConstantLikeKernel : public framework::OpKernel { pads[j * 2 + 1] = static_cast(in_x->dims()[j] - in_y->dims()[j]); } - math::PaddingFunctor(rank, context, pads, pad_value, - *in_y, out); + phi::funcs::PaddingFunctor( + rank, context.template device_context(), pads, pad_value, + *in_y, out); } }; @@ -82,8 +83,9 @@ class PadConstantLikeGradKernel : public framework::OpKernel { pads[j * 2 + 1] = static_cast(in_dout->dims()[j] - in_y->dims()[j]); } - math::PaddingGradFunctor(rank, context, pads, *in_dout, - d_y); + phi::funcs::PaddingGradFunctor( + rank, context.template device_context(), pads, *in_dout, + d_y); } }; diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc index 39acba7e58a..229e61ac9fe 100644 --- a/paddle/fluid/operators/pad_op.cc +++ b/paddle/fluid/operators/pad_op.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
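For pad_constant_like the pads vector above is derived purely from the shape difference between in_x and in_y; a hedged worked example with illustrative shapes:

    // in_x dims: [4, 5], in_y dims: [2, 3], rank = 2
    std::vector<int> pads(2 * 2, 0);
    pads[0 * 2 + 1] = 4 - 2;  // pad 2 rows after dimension 0
    pads[1 * 2 + 1] = 5 - 3;  // pad 2 columns after dimension 1
    // pads == {0, 2, 0, 2}: PaddingFunctor copies in_y into the leading corner of
    // a [4, 5] output and fills the remaining cells with pad_value.
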
*/ -#include "paddle/fluid/operators/pad_op.h" #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/complex.h" namespace paddle { @@ -167,40 +167,3 @@ REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker, REGISTER_OPERATOR(pad_grad, ops::PadOpGrad, ops::PadOpDoubleGradMaker, ops::PadOpDoubleGradMaker); -REGISTER_OP_CPU_KERNEL( - pad, ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel>, - ops::PadKernel>); -REGISTER_OP_CPU_KERNEL( - pad_grad, ops::PadGradKernel, - ops::PadGradKernel, - ops::PadGradKernel>, - ops::PadGradKernel>); - -REGISTER_OP_CUDA_KERNEL( - pad, ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel>, - ops::PadKernel>); -REGISTER_OP_CUDA_KERNEL( - pad_grad, ops::PadGradKernel, - ops::PadGradKernel, - ops::PadGradKernel, - ops::PadGradKernel>, - ops::PadGradKernel>); diff --git a/paddle/fluid/operators/pad_op.h b/paddle/fluid/operators/pad_op.h deleted file mode 100644 index d494c954e1e..00000000000 --- a/paddle/fluid/operators/pad_op.h +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/padding.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class PadKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto pads = context.Attr>("paddings"); - float pad_value = context.Attr("pad_value"); - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - - int rank = x->dims().size(); - math::PaddingFunctor(rank, context, pads, - static_cast(pad_value), *x, out); - } -}; - -template -class PadGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto pads = context.Attr>("paddings"); - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* d_x = context.Output(framework::GradVarName("X")); - if (d_x == nullptr) { - return; - } - - d_x->mutable_data(context.GetPlace()); - int rank = d_out->dims().size(); - math::PaddingGradFunctor(rank, context, pads, *d_out, - d_x); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/spectral_op.h b/paddle/fluid/operators/spectral_op.h index 2bc5124843c..a60ec5a4df5 100644 --- a/paddle/fluid/operators/spectral_op.h +++ b/paddle/fluid/operators/spectral_op.h @@ -23,9 +23,9 @@ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/conj_op.h" #include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/operators/math/padding.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/for_range.h" +#include 
"paddle/phi/kernels/funcs/padding.h" #if defined(__NVCC__) || defined(__HIPCC__) #include "thrust/device_vector.h" #endif @@ -389,8 +389,9 @@ class FFTR2CGradKernel : public framework::OpKernel { std::vector pads(rank * 2, 0); pads[axes.back() * 2 + 1] = zero_length; - paddle::operators::math::PaddingFunctor( - rank, ctx, pads, static_cast(0), *dy, &full_dy); + phi::funcs::PaddingFunctor( + rank, ctx.template device_context(), pads, + static_cast(0), *dy, &full_dy); fft_c2c_func(dev_ctx, &full_dy, &complex_dx, axes, normalization, !forward); } diff --git a/paddle/phi/kernels/cpu/pad_grad_kernel.cc b/paddle/phi/kernels/cpu/pad_grad_kernel.cc new file mode 100644 index 00000000000..67e6da7d0e0 --- /dev/null +++ b/paddle/phi/kernels/cpu/pad_grad_kernel.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pad_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/pad_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(pad_grad, + CPU, + ALL_LAYOUT, + phi::PadGradKernel, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/pad_kernel.cc b/paddle/phi/kernels/cpu/pad_kernel.cc new file mode 100644 index 00000000000..f4a0acdcca2 --- /dev/null +++ b/paddle/phi/kernels/cpu/pad_kernel.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/pad_kernel_impl.h" + +PD_REGISTER_KERNEL(pad, + CPU, + ALL_LAYOUT, + phi::PadKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/fluid/operators/math/padding.h b/paddle/phi/kernels/funcs/padding.h similarity index 67% rename from paddle/fluid/operators/math/padding.h rename to paddle/phi/kernels/funcs/padding.h index 529d39c9ba5..6d10ff2dfcf 100644 --- a/paddle/fluid/operators/math/padding.h +++ b/paddle/phi/kernels/funcs/padding.h @@ -15,21 +15,26 @@ limitations under the License. 
*/ #pragma once #include #include -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { -template -using EigenTensor = framework::EigenTensor; +using EigenTensor = EigenTensor; template -void PadFunction(const framework::ExecutionContext& context, - const std::vector& pads, const framework::Tensor& src, - T pad_value, framework::Tensor* out) { +void PadFunction(const DeviceContext& context, + const std::vector& pads, + const DenseTensor& src, + T pad_value, + DenseTensor* out) { std::array, D> paddings; for (size_t i = 0; i < paddings.size(); ++i) { @@ -40,16 +45,16 @@ void PadFunction(const framework::ExecutionContext& context, auto src_tensor = EigenTensor::From(src); auto out_tensor = EigenTensor::From(*out); - auto& place = - *context.template device_context().eigen_device(); + auto& place = *(context.eigen_device()); EigenPad, T, D>::Eval( place, out_tensor, src_tensor, paddings, pad_value); } template -void PadGradFunction(const framework::ExecutionContext& context, - const std::vector& pads, const framework::Tensor& src, - framework::Tensor* d_out) { +void PadGradFunction(const DeviceContext& context, + const std::vector& pads, + const DenseTensor& src, + DenseTensor* d_out) { std::array, D> paddings; for (size_t i = 0; i < paddings.size(); ++i) { paddings[i].first = -pads[i * 2]; @@ -58,16 +63,18 @@ void PadGradFunction(const framework::ExecutionContext& context, auto d_out_tensor = EigenTensor::From(*d_out); auto src_tensor = EigenTensor::From(src); - auto& place = - *context.template device_context().eigen_device(); + auto& place = *(context.eigen_device()); EigenPad, T, D>::Eval( place, d_out_tensor, src_tensor, paddings, static_cast(0)); } template -void PaddingFunctor(int rank, const framework::ExecutionContext& context, - const std::vector& pads, T pad_value, - const framework::Tensor& src, framework::Tensor* out) { +void PaddingFunctor(int rank, + const DeviceContext& context, + const std::vector& pads, + T pad_value, + const DenseTensor& src, + DenseTensor* out) { switch (rank) { case 1: PadFunction(context, pads, src, pad_value, out); @@ -88,16 +95,18 @@ void PaddingFunctor(int rank, const framework::ExecutionContext& context, PadFunction(context, pads, src, pad_value, out); break; default: - PADDLE_THROW(platform::errors::Unimplemented( - "PadOp only support tensors with no more" - " than 6 dimensions currently.")); + PADDLE_THROW( + phi::errors::Unimplemented("PadOp only support tensors with no more" + " than 6 dimensions currently.")); } } template -void PaddingGradFunctor(int rank, const framework::ExecutionContext& context, +void PaddingGradFunctor(int rank, + const DeviceContext& context, const std::vector& pads, - const framework::Tensor& src, framework::Tensor* out) { + const DenseTensor& src, + DenseTensor* out) { switch (rank) { case 1: PadGradFunction(context, pads, src, out); @@ -118,9 +127,9 @@ void PaddingGradFunctor(int rank, const framework::ExecutionContext& context, PadGradFunction(context, pads, src, out); break; default: - PADDLE_THROW(platform::errors::Unimplemented( - "PadOp only support tensors with no more" - " than 6 dimensions currently.")); + PADDLE_THROW( + phi::errors::Unimplemented("PadOp only support tensors 
with no more" + " than 6 dimensions currently.")); } } @@ -137,6 +146,5 @@ inline bool IsSymmetricPadding(const std::vector& pads, } return is_sys_pad; } -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/pad_grad_kernel.cu b/paddle/phi/kernels/gpu/pad_grad_kernel.cu new file mode 100644 index 00000000000..a25472d122b --- /dev/null +++ b/paddle/phi/kernels/gpu/pad_grad_kernel.cu @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pad_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/pad_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(pad_grad, + GPU, + ALL_LAYOUT, + phi::PadGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/pad_kernel.cu b/paddle/phi/kernels/gpu/pad_kernel.cu new file mode 100644 index 00000000000..2b77a5f1aeb --- /dev/null +++ b/paddle/phi/kernels/gpu/pad_kernel.cu @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/complex.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/pad_kernel_impl.h" +#include "paddle/phi/kernels/pad_kernel.h" + +PD_REGISTER_KERNEL(pad, + GPU, + ALL_LAYOUT, + phi::PadKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/impl/pad_grad_kernel_impl.h b/paddle/phi/kernels/impl/pad_grad_kernel_impl.h new file mode 100644 index 00000000000..91f198f9fb6 --- /dev/null +++ b/paddle/phi/kernels/impl/pad_grad_kernel_impl.h @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/padding.h" +namespace phi { +template +void PadGradKernel(const Context& dev_ctx, + const DenseTensor& d_out, + const std::vector& paddings, + float pad_value, + DenseTensor* d_x) { + if (d_x == nullptr) { + return; + } + dev_ctx.template Alloc(d_x); + int rank = d_out.dims().size(); + phi::funcs::PaddingGradFunctor( + rank, dev_ctx, paddings, d_out, d_x); +} +} // namespace phi diff --git a/paddle/phi/kernels/impl/pad_kernel_impl.h b/paddle/phi/kernels/impl/pad_kernel_impl.h new file mode 100644 index 00000000000..8e3ebb0dfe0 --- /dev/null +++ b/paddle/phi/kernels/impl/pad_kernel_impl.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/padding.h" +namespace phi { +template +void PadKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& paddings, + float pad_value, + DenseTensor* out) { + dev_ctx.template Alloc(out); + int rank = x.dims().size(); + funcs::PaddingFunctor( + rank, dev_ctx, paddings, static_cast(pad_value), x, out); +} +} // namespace phi diff --git a/paddle/phi/kernels/pad_grad_kernel.h b/paddle/phi/kernels/pad_grad_kernel.h new file mode 100644 index 00000000000..f39d87e5c0e --- /dev/null +++ b/paddle/phi/kernels/pad_grad_kernel.h @@ -0,0 +1,28 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void PadGradKernel(const Context& dev_ctx, + const DenseTensor& d_out, + const std::vector& paddings, + float pad_value, + DenseTensor* d_x); +} // namespace phi diff --git a/paddle/phi/kernels/pad_kernel.h b/paddle/phi/kernels/pad_kernel.h new file mode 100644 index 00000000000..511e8cf73df --- /dev/null +++ b/paddle/phi/kernels/pad_kernel.h @@ -0,0 +1,28 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
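Aside: the PadKernel and PadGradKernel implementations above hand a runtime rank to phi::funcs::PaddingFunctor, which switches over ranks 1 to 6 so that each case instantiates a fixed-rank Eigen pad. A minimal standalone sketch of that switch-on-rank dispatch follows; the names and the plain std::array payload are illustrative only, not Paddle's API.

```cpp
// Standalone sketch of the rank-dispatch pattern used by PaddingFunctor:
// a runtime rank is switched into a compile-time rank so the padding body
// can be instantiated per dimension count. All names are hypothetical.
#include <array>
#include <cstdio>
#include <stdexcept>
#include <utility>
#include <vector>

template <typename T, int D>
void PadFixedRank(const std::vector<int>& pads, T pad_value) {
  // Compile-time rank D: build (before, after) pairs per dimension,
  // mirroring the std::array of pairs built in padding.h.
  std::array<std::pair<int, int>, D> paddings;
  for (int i = 0; i < D; ++i) {
    paddings[i] = {pads[2 * i], pads[2 * i + 1]};
  }
  std::printf("rank %d, pad_value %f, first pair (%d, %d)\n", D,
              static_cast<double>(pad_value), paddings[0].first,
              paddings[0].second);
}

template <typename T>
void PadAnyRank(int rank, const std::vector<int>& pads, T pad_value) {
  switch (rank) {  // runtime rank -> compile-time instantiation
    case 1: PadFixedRank<T, 1>(pads, pad_value); break;
    case 2: PadFixedRank<T, 2>(pads, pad_value); break;
    case 3: PadFixedRank<T, 3>(pads, pad_value); break;
    case 4: PadFixedRank<T, 4>(pads, pad_value); break;
    case 5: PadFixedRank<T, 5>(pads, pad_value); break;
    case 6: PadFixedRank<T, 6>(pads, pad_value); break;
    default:
      // Mirrors the Unimplemented error thrown for ranks above 6.
      throw std::runtime_error("only ranks 1..6 are supported");
  }
}

int main() {
  PadAnyRank<float>(2, {1, 1, 0, 2}, 0.0f);  // a rank-2 example
  return 0;
}
```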
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void PadKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& paddings, + float pad_value, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/ops/compat/pad_sig.cc b/paddle/phi/ops/compat/pad_sig.cc new file mode 100644 index 00000000000..4eadbfa98be --- /dev/null +++ b/paddle/phi/ops/compat/pad_sig.cc @@ -0,0 +1,28 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature PadGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("pad_grad", + {GradVarName("Out")}, + {"paddings", "pad_value"}, + {GradVarName("X")}); +} + +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(pad_grad, phi::PadGradOpArgumentMapping); -- GitLab From 3e56e8167f634e67005b864ad56970bcc6cc3048 Mon Sep 17 00:00:00 2001 From: Li Min <11663212+limin2021@users.noreply.github.com> Date: Thu, 3 Mar 2022 13:03:44 +0800 Subject: [PATCH 018/261] Add support of int16 for gather op. (#40052) * add support of int16 for gather op. * Recover formats. * Recover formats. * fix. * Fix format. * Fix format. 
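The gather diff below adds an int16 branch to the CUDA kernel's index and axis dtype dispatch, alongside the existing int32 and int64 branches. A self-contained CPU sketch of that dispatch-on-dtype-tag pattern is shown here; the enum, the Gather helper, and the raw buffers are hypothetical stand-ins rather than the framework::proto::VarType machinery.

```cpp
// Sketch of dispatching a type-erased index buffer to a typed gather,
// mirroring the int16/int32/int64 branches added in gather_op.cu.
#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <vector>

enum class IndexType { kInt16, kInt32, kInt64 };

template <typename IndexT>
std::vector<float> Gather(const std::vector<float>& x,
                          const IndexT* index, size_t n) {
  std::vector<float> out(n);
  for (size_t i = 0; i < n; ++i) {
    out[i] = x.at(static_cast<size_t>(index[i]));  // bounds-checked pick
  }
  return out;
}

std::vector<float> GatherDispatch(const std::vector<float>& x,
                                  const void* index, size_t n,
                                  IndexType type) {
  switch (type) {  // one branch per supported index dtype
    case IndexType::kInt16:
      return Gather(x, static_cast<const int16_t*>(index), n);
    case IndexType::kInt32:
      return Gather(x, static_cast<const int32_t*>(index), n);
    case IndexType::kInt64:
      return Gather(x, static_cast<const int64_t*>(index), n);
  }
  throw std::runtime_error("unsupported index type");
}

int main() {
  std::vector<float> x = {10.f, 20.f, 30.f, 40.f};
  int16_t idx[] = {3, 0, 2};
  auto out = GatherDispatch(x, idx, 3, IndexType::kInt16);
  std::printf("%g %g %g\n", out[0], out[1], out[2]);  // 40 10 30
  return 0;
}
```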
--- paddle/fluid/operators/gather_op.cu | 8 ++++++++ python/paddle/tensor/manipulation.py | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu index 8f1d9284c50..e0db2f26d3e 100644 --- a/paddle/fluid/operators/gather_op.cu +++ b/paddle/fluid/operators/gather_op.cu @@ -45,6 +45,8 @@ class GatherOpCUDAKernel : public framework::OpKernel { axis = static_cast(cpu_axis.data()[0]); } else if (axis_type == framework::proto::VarType::INT64) { axis = static_cast(cpu_axis.data()[0]); + } else if (axis_type == framework::proto::VarType::INT16) { + axis = static_cast(cpu_axis.data()[0]); } } const auto &place = ctx.GetPlace(); @@ -57,6 +59,9 @@ class GatherOpCUDAKernel : public framework::OpKernel { } else if (index_type == framework::proto::VarType::INT64) { phi::funcs::GatherV2CUDAFunction(x, index, axis, output, dev_ctx); + } else if (index_type == framework::proto::VarType::INT16) { + phi::funcs::GatherV2CUDAFunction(x, index, axis, output, + dev_ctx); } return; } @@ -67,6 +72,8 @@ class GatherOpCUDAKernel : public framework::OpKernel { phi::funcs::GPUGather(dev_ctx, *x, *index, output); } else if (index_type == framework::proto::VarType::INT64) { phi::funcs::GPUGather(dev_ctx, *x, *index, output); + } else if (index_type == framework::proto::VarType::INT16) { + phi::funcs::GPUGather(dev_ctx, *x, *index, output); } } }; @@ -134,6 +141,7 @@ REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, + ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel); REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel, diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index fbd6197c1b9..32ccecbc6d9 100755 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -1402,7 +1402,8 @@ def gather(x, index, axis=None, name=None): return _C_ops.gather(x, index, None, "axis", axis, "overwrite", False) check_variable_and_dtype( - x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'], + x, 'x', + ['float16', 'float32', 'float64', 'int16', 'int32', 'int64', 'uint8'], 'gather') check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather') -- GitLab From 5d9e11a4ce1cec37ab7dfbd6a044b1baf90bae22 Mon Sep 17 00:00:00 2001 From: huangxu96 <46740794+huangxu96@users.noreply.github.com> Date: Thu, 3 Mar 2022 13:33:46 +0800 Subject: [PATCH 019/261] Modified sigmoid by the elementwise interface. (#39898) * Modified sigmoid by elementwise interface. 
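The sigmoid_cross_entropy_with_logits rewrite in the diff below replaces hand-written CUDA loops with elementwise functors that return a pair of values per element (the loss term and a validity count), reduces the counts, and then divides by max(sum, eps). A standalone CPU sketch of that two-output-functor-plus-normalize pattern is given here, with hypothetical names and std::vector in place of device tensors.

```cpp
// Sketch of the pattern used by SigmoidFwdFunctor + DivFunctor below:
// one elementwise pass produces {loss, count}, a reduction sums the
// counts, and a second elementwise pass divides by max(sum, eps).
// The CPU loops stand in for ElementwiseKernel / TensorReduceImpl.
#include <algorithm>
#include <array>
#include <cmath>
#include <cstdio>
#include <vector>

struct SigmoidFwd {
  float ignore_index;
  std::array<float, 2> operator()(float x, float label) const {
    const float eps = 1e-5f;
    if (std::fabs(label - ignore_index) < eps) {
      return {0.0f, 0.0f};  // ignored element contributes nothing
    }
    float term1 = x > 0.0f ? x : 0.0f;
    float term3 = std::log1p(std::exp(-std::fabs(x)));
    return {term1 - x * label + term3, 1.0f};  // {loss, count}
  }
};

int main() {
  std::vector<float> x = {0.5f, -1.0f, 2.0f};
  std::vector<float> label = {1.0f, -100.0f, 0.0f};  // -100 is ignored
  SigmoidFwd fwd{-100.0f};

  std::vector<float> loss(x.size());
  float count = 0.0f;
  for (size_t i = 0; i < x.size(); ++i) {
    auto out = fwd(x[i], label[i]);
    loss[i] = out[0];
    count += out[1];  // stands in for the reduced sum of counts
  }
  float norm = std::max(count, 1e-5f);
  for (float& v : loss) v /= norm;  // stands in for DivFunctor

  std::printf("normalized loss: %g %g %g\n", loss[0], loss[1], loss[2]);
  return 0;
}
```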
* using TensorReduceImpl to repalce Sum function * using reduceimpl to calculate the norm variable * Removed useless code --- .../sigmoid_cross_entropy_with_logits_op.cu | 207 ++++++++++++------ 1 file changed, 139 insertions(+), 68 deletions(-) diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu index 40476d5e11f..18402d908c4 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu @@ -20,9 +20,11 @@ namespace cub = hipcub; #endif #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" namespace paddle { namespace operators { @@ -42,71 +44,86 @@ static inline int NumBlocks(const int N) { } template -__global__ void GPUSigmoidForward(const T *x_data, const T *label_data, - const int ignore_index, const int limit, - T *out_data, T *counts) { - CUDA_KERNEL_LOOP(i, limit) { - T x = x_data[i]; - T label = label_data[i]; - T eps = static_cast(1e-5); - T diff = label - static_cast(ignore_index); +struct NonzeroFunctor { + HOSTDEVICE explicit inline NonzeroFunctor() {} + HOSTDEVICE inline T operator()(const T x) const { + return static_cast(static_cast(x) != 0); + } +}; + +template +struct SigmoidFwdFunctor { + T ignore_index_; + T eps = static_cast(1e-5); + + HOSTDEVICE inline SigmoidFwdFunctor(const T ignore_index) + : ignore_index_(ignore_index) {} + + HOSTDEVICE inline phi::Array operator()(const T x, const T label) { + T counts; + T out_data; + + T diff = label - static_cast(ignore_index_); if ((diff > -eps) && (diff < eps)) { - out_data[i] = static_cast(0.); - counts[i] = 0; + out_data = static_cast(0.); + counts = 0; } else { T term1 = (x > 0) ? x : 0; T term2 = x * label; T term3 = real_log(static_cast(1) + real_exp(static_cast(-abs(x)))); - out_data[i] = term1 - term2 + term3; - counts[i] = 1; + + out_data = term1 - term2 + term3; + counts = 1; } - } -} + phi::Array outs; -template -__global__ void Sum(const T *counts, int num, const T eps, T *sum) { - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - T in = 0; - for (int i = threadIdx.x; i < num; i += BlockDim) { - in += counts[i]; + outs[0] = out_data; + outs[1] = counts; + return outs; } - __syncthreads(); - auto out = - BlockReduce(temp_storage).Reduce(static_cast(in), cub::Sum()); - __syncthreads(); - if (threadIdx.x == 0) { - T a = out > eps ? 
out : eps; - sum[0] = a; - } -} +}; template -__global__ void Div(T *loss, const int num, const T *norm) { - CUDA_KERNEL_LOOP(i, num) { loss[i] /= norm[0]; } -} +struct SigmoidBwdFunctor { + T ignore_index_; + T eps = static_cast(1e-5); -template -__global__ void GPUSigmoidBackward(const T *x_data, const T *label_data, - const int ignore_index, const T *dout_data, - const int limit, T *dx_data, T *counts) { - CUDA_KERNEL_LOOP(i, limit) { - T x = x_data[i]; - T label = label_data[i]; - T dout = dout_data[i]; - T eps = static_cast(1e-5); - T diff = label - static_cast(ignore_index); + HOSTDEVICE inline SigmoidBwdFunctor(const T ignore_index) + : ignore_index_(ignore_index) {} + + HOSTDEVICE inline phi::Array operator()(const T x, const T label, + const T dout) { + T counts; + T dx_data; + + T diff = label - static_cast(ignore_index_); if ((diff > -eps) && (diff < eps)) { - dx_data[i] = static_cast(0.); - counts[i] = 0; + dx_data = static_cast(0.); + counts = 0; } else { T simoid_x = static_cast(1) / (static_cast(1) + real_exp(-x)); T diff = simoid_x - label; - dx_data[i] = dout * diff; - counts[i] = 1; + dx_data = dout * diff; + counts = 1; } + phi::Array outs; + + outs[0] = dx_data; + outs[1] = counts; + return outs; } -} +}; + +template +struct DivFunctor { + const T norm_; + HOSTDEVICE inline DivFunctor(const T norm) : norm_(norm) {} + + HOSTDEVICE inline T operator()(T loss) { + loss /= norm_; + return loss; + } +}; // Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) template @@ -123,20 +140,48 @@ class GPUSigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { bool normalize = context.Attr("normalize"); // Temporary memory - auto cnt_ptr = memory::Alloc(dev_ctx, Labels->numel() * sizeof(T)); - T *counts = reinterpret_cast(cnt_ptr->ptr()); - + Tensor *counts_tensor = new Tensor(); + counts_tensor->mutable_data(context.GetPlace(), + Labels->numel() * sizeof(T)); + counts_tensor->Resize(Out->dims()); int limit = Out->numel(); int blocks = NumBlocks(limit); int threads = kNumCUDAThreads; - GPUSigmoidForward<<>>( - X->data(), Labels->data(), ignore_index, limit, out_data, counts); + std::vector ins = {X, Labels}; + std::vector outs = {Out, counts_tensor}; + auto functor = SigmoidFwdFunctor(ignore_index); + constexpr int Size = 2; + phi::funcs::ElementwiseKernel(dev_ctx, ins, + &outs, functor); if (normalize) { - auto norm_ptr = memory::Alloc(dev_ctx, sizeof(T)); - T *norm = reinterpret_cast(norm_ptr->ptr()); - Sum<<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>( - counts, limit, static_cast(1e-5), norm); - Div<<>>(out_data, limit, norm); + T *counts = counts_tensor->mutable_data(context.GetPlace()); + Tensor *norm_tensor = new Tensor(); + norm_tensor->mutable_data(context.GetPlace(), sizeof(T)); + auto dims = phi::vectorize(counts_tensor->dims()); + std::vector reduce_dim = {}; + for (int i = 0; i < dims.size(); i++) { + reduce_dim.push_back(i); + } + + TensorReduceImpl>( + context.cuda_device_context(), *counts_tensor, norm_tensor, + NonzeroFunctor(), reduce_dim, dev_ctx.stream()); + T *norm = norm_tensor->mutable_data(context.GetPlace()); + auto norm_cpu_mem = memory::Alloc(platform::CPUPlace(), sizeof(T)); + T *norm_cpu_ptr = reinterpret_cast(norm_cpu_mem->ptr()); + memory::Copy(platform::CPUPlace(), norm_cpu_ptr, dev_ctx.GetPlace(), norm, + sizeof(T), dev_ctx.stream()); + auto eps = static_cast(1e-5); + *norm_cpu_ptr = *norm_cpu_ptr > eps ? 
*norm_cpu_ptr : eps; + + std::vector div_ins = {Out}; + std::vector div_outs = {Out}; + auto div_functor = DivFunctor(*norm_cpu_ptr); + phi::funcs::ElementwiseKernel(dev_ctx, div_ins, &div_outs, + div_functor); + + delete norm_tensor; + delete counts_tensor; } } }; @@ -157,22 +202,48 @@ class GPUSigmoidCrossEntropyWithLogitsGradKernel auto &dev_ctx = context.cuda_device_context(); // Temporary memory - auto cnt_ptr = memory::Alloc(dev_ctx, X->numel() * sizeof(T)); - T *counts = reinterpret_cast(cnt_ptr->ptr()); + Tensor *counts_tensor = new Tensor(); + counts_tensor->mutable_data(context.GetPlace(), + Labels->numel() * sizeof(T)); + counts_tensor->Resize(dX->dims()); int limit = dX->numel(); int blocks = NumBlocks(limit); int threads = kNumCUDAThreads; - GPUSigmoidBackward<<>>( - X->data(), Labels->data(), ignore_index, dOut->data(), limit, - dx_data, counts); + std::vector ins = {X, Labels, dOut}; + std::vector outs = {dX, counts_tensor}; + auto functor = SigmoidBwdFunctor(ignore_index); + constexpr int Size = 2; + phi::funcs::ElementwiseKernel(dev_ctx, ins, + &outs, functor); bool normalize = context.Attr("normalize"); if (normalize) { - auto norm_ptr = memory::Alloc(dev_ctx, sizeof(T)); - T *norm = reinterpret_cast(norm_ptr->ptr()); - Sum<<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>( - counts, limit, static_cast(1e-5), norm); - Div<<>>(dx_data, limit, norm); + T *counts = counts_tensor->mutable_data(context.GetPlace()); + Tensor *norm_tensor = new Tensor(); + norm_tensor->mutable_data(context.GetPlace(), sizeof(T)); + auto dims = phi::vectorize(counts_tensor->dims()); + std::vector reduce_dim = {}; + for (int i = 0; i < dims.size(); i++) { + reduce_dim.push_back(i); + } + + TensorReduceImpl>( + context.cuda_device_context(), *counts_tensor, norm_tensor, + NonzeroFunctor(), reduce_dim, dev_ctx.stream()); + T *norm = norm_tensor->mutable_data(context.GetPlace()); + auto norm_cpu_mem = memory::Alloc(platform::CPUPlace(), sizeof(T)); + T *norm_cpu_ptr = reinterpret_cast(norm_cpu_mem->ptr()); + memory::Copy(platform::CPUPlace(), norm_cpu_ptr, dev_ctx.GetPlace(), norm, + sizeof(T), dev_ctx.stream()); + auto eps = static_cast(1e-5); + *norm_cpu_ptr = *norm_cpu_ptr > eps ? 
*norm_cpu_ptr : eps; + + std::vector div_ins = {dX}; + std::vector div_outs = {dX}; + auto div_functor = DivFunctor(*norm_cpu_ptr); + phi::funcs::ElementwiseKernel(dev_ctx, div_ins, &div_outs, + div_functor); + delete norm_tensor; } } }; -- GitLab From c3f3643b26a5bf62e4dfea0d694c15d0cb397af9 Mon Sep 17 00:00:00 2001 From: wenbin Date: Thu, 3 Mar 2022 13:56:16 +0800 Subject: [PATCH 020/261] EmbEltwiseLayernorm fix (#40015) * emb fix * fix trt6 compile * fix half * absolute error fix --- paddle/fluid/inference/tensorrt/engine.h | 2 ++ .../operators/tensorrt/tensorrt_engine_op.h | 36 ++++++++++++++++--- .../test_trt_convert_emb_eltwise_layernorm.py | 16 ++------- 3 files changed, 35 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index b2764ca61c1..d53a8923af6 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -54,6 +54,8 @@ TRT_DT FluidDataType2TRT(FluidDT type) { return TRT_DT::kFLOAT; case FluidDT::VarType_Type_INT32: return TRT_DT::kINT32; + case FluidDT::VarType_Type_FP16: + return TRT_DT::kHALF; default: return TRT_DT::kINT32; } diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index e05b4de6521..0a71875d893 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -79,6 +79,28 @@ static void RuntimeStaticShapeCheck(std::vector runtime_input_shape, model_input_shape_str, runtime_input_shape_str)); } +static paddle::experimental::DataType TRT2FluidDataType( + nvinfer1::DataType type) { + switch (type) { + case nvinfer1::DataType::kFLOAT: + return paddle::experimental::DataType::FLOAT32; + case nvinfer1::DataType::kINT32: + return paddle::experimental::DataType::INT32; + case nvinfer1::DataType::kHALF: + return paddle::experimental::DataType::FLOAT16; + case nvinfer1::DataType::kINT8: + return paddle::experimental::DataType::INT8; +#if IS_TRT_VERSION_GE(7000) + case nvinfer1::DataType::kBOOL: + return paddle::experimental::DataType::BOOL; +#endif + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "unknown fluid datatype in Fluid op converter")); + return paddle::experimental::DataType::FLOAT32; + } +} + static void RuntimeDynamicShapeCheck( const std::string &x, const std::vector &runtime_input_shape, const std::vector &min_input_shape, @@ -520,9 +542,12 @@ class TensorRTEngineOp : public framework::OperatorBase { buffers[bind_index] = static_cast(t.data()); } else if (type == framework::proto::VarType::INT32) { buffers[bind_index] = static_cast(t.data()); + } else if (type == framework::proto::VarType::FP16) { + buffers[bind_index] = static_cast(t.data()); } else { - PADDLE_THROW(platform::errors::Fatal( - "The TRT Engine OP only support float/int32_t/int64_t input.")); + PADDLE_THROW( + platform::errors::Fatal("The TRT Engine OP only support " + "float/int32_t/int64_t/float16 input.")); } } @@ -570,9 +595,10 @@ class TensorRTEngineOp : public framework::OperatorBase { "than the number of bindings, but got binding " "index = %d, number of bindings = %d.", bind_index, num_bindings)); - buffers[bind_index] = - static_cast(fluid_t->mutable_data(dev_place)); - + auto trt_type = engine->engine()->getBindingDataType(bind_index); + // get adr and set type + buffers[bind_index] = static_cast( + fluid_t->mutable_data(dev_place, TRT2FluidDataType(trt_type))); output_index += 1; } diff --git 
a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py index 356a2c942df..1eecf9c0497 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py @@ -244,28 +244,16 @@ class TrtConvertEmbEltwiseLayernormTest1(TrtLayerAutoScanTest): self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), (0, 5), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (0, 5), 1e-5 + yield self.create_inference_config(), (0, 5), 2e-2 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), (1, 4), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 4), 1e-5 - - def add_skip_trt_case(self): - def teller1(program_config, predictor_config): - if self.trt_param.precision == paddle_infer.PrecisionType.Half and len( - self.dynamic_shape.min_input_shape) != 0: - return True - return False - - self.add_skip_case( - teller1, SkipReasons.TRT_NOT_IMPLEMENTED, - "The output has diff between gpu and trt when dynamic fp16 mode.") + yield self.create_inference_config(), (1, 4), 2e-2 def test(self): - self.add_skip_trt_case() self.run_test() -- GitLab From cac00e0bba6b189f21207fde89e27f682913e32f Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 3 Mar 2022 14:01:58 +0800 Subject: [PATCH 021/261] [Phi]Delete kernel registry of elementwise_sub op in Fluid (#40039) * delete elementwise_sub kernel registry * fix compile bugs in xpu ci * fix bugs when run inference ci --- .../elementwise/elementwise_op_npu_test.cc | 2 +- .../elementwise/elementwise_pow_op_xpu.cc | 1 - .../elementwise/elementwise_sub_op.cc | 55 ++--------- .../elementwise/elementwise_sub_op.cu | 63 ------------ .../elementwise/elementwise_sub_op.h | 96 ------------------- .../elementwise/elementwise_sub_op_npu.cc | 2 +- .../elementwise/elementwise_sub_op_xpu.cc | 1 - paddle/phi/kernels/math_kernel.cc | 3 +- paddle/phi/ops/compat/elementwise_sig.cc | 9 ++ 9 files changed, 20 insertions(+), 212 deletions(-) delete mode 100644 paddle/fluid/operators/elementwise/elementwise_sub_op.cu delete mode 100644 paddle/fluid/operators/elementwise/elementwise_sub_op.h diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc index 1f8a95f0286..fc128a88f20 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc +++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc @@ -33,7 +33,7 @@ namespace p = paddle::platform; USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, NPU); -USE_OP(elementwise_sub); +USE_OP_ITSELF(elementwise_sub); USE_OP_DEVICE_KERNEL(elementwise_sub, NPU); template diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op_xpu.cc index 14b20baae1b..78855dd3957 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op_xpu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op_xpu.cc @@ -14,7 +14,6 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" #include "paddle/fluid/operators/elementwise/elementwise_xpu.h" #include "xpu/refactor/math.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index b2cef95d1a3..d15a7c27275 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" - #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" @@ -78,10 +76,16 @@ class ElementwiseSubDoubleGradMaker : public framework::SingleGradOpMaker { } // namespace paddle REGISTER_ELEMWISE_GRAD_MAKER(elementwise_sub, Sub); -REGISTER_ELEMWISE_EXPLICIT_OP_WITHOUT_GRAD(elementwise_sub, Sub); namespace ops = paddle::operators; +REGISTER_OPERATOR(elementwise_sub, ::paddle::operators::ElementwiseOp, + ::paddle::operators::ElementwiseSubOpMaker, + ::paddle::operators::ElementwiseOpInferVarType, + elementwise_subGradMaker<::paddle::framework::OpDesc>, + elementwise_subGradMaker<::paddle::imperative::OpBase>, + ::paddle::operators::ElementwiseOpInplaceInferer); + REGISTER_OPERATOR( elementwise_sub_grad, ops::ElementwiseOpGrad, ops::ElementwiseGradOpInplaceInferer, ops::ElementwiseGradNoBufVarsInferer, @@ -92,51 +96,6 @@ REGISTER_OPERATOR(elementwise_sub_grad_grad, ops::ElementwiseDoubleGradOpInplaceInferer, ops::ElementwiseDoubleGradNoBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - elementwise_sub, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel>, - ops::ElementwiseSubKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_sub_grad, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel>, - ops::ElementwiseSubGradKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_sub_grad_grad, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel>, - ops::ElementwiseSubDoubleGradKernel>); - REGISTER_OP_VERSION(elementwise_sub) .AddCheckpoint( R"ROC(Register elementwise_sub for adding the attribute of Scale_y)ROC", diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu deleted file mode 100644 index 2c962af9877..00000000000 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - elementwise_sub, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel>, - ops::ElementwiseSubKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_sub_grad, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel>, - ops::ElementwiseSubGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_sub_grad_grad, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel>, - ops::ElementwiseSubDoubleGradKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h deleted file mode 100644 index 15c547b493a..00000000000 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/platform/place.h" - -#include "paddle/phi/kernels/elementwise_grad_kernel.h" -#include "paddle/phi/kernels/math_kernel.h" -namespace paddle { -namespace operators { - -template -class ElementwiseSubKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - z->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.device_context(); - int axis = ctx.Attr("axis"); - phi::SubtractRawKernel( - static_cast::TYPE&>(dev_ctx), - *x, *y, axis, z); - } -}; - -template -class ElementwiseSubGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - auto& dev_ctx = ctx.device_context(); - - phi::SubtractGradKernel( - static_cast::TYPE&>(dev_ctx), - *x, *y, *dout, axis, dx, dy); - } -}; - -template -class ElementwiseSubDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input("DOut"); - auto* ddx = ctx.Input("DDX"); - auto* ddy = ctx.Input("DDY"); - - auto* ddout = ctx.Output("DDOut"); - int axis = ctx.Attr("axis"); - auto& dev_ctx = ctx.device_context(); - - paddle::optional ddx_optional = paddle::none; - paddle::optional ddy_optional = paddle::none; - if (ddx != nullptr) { - ddx_optional = *ddx; - } - if (ddy != nullptr) { - ddy_optional = *ddy; - } - phi::SubtractDoubleGradKernel( - static_cast::TYPE&>(dev_ctx), - *y, ddx_optional, ddy_optional, *dout, axis, ddout); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc index b68d38d6df1..4169a938f2d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc index d12c6fc30ce..87c494b0e10 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_xpu.h" #include "xpu/refactor/math.h" diff --git a/paddle/phi/kernels/math_kernel.cc b/paddle/phi/kernels/math_kernel.cc index 480eb56c8b0..8b17d8bd250 100644 --- a/paddle/phi/kernels/math_kernel.cc +++ b/paddle/phi/kernels/math_kernel.cc @@ -197,7 +197,8 @@ PD_REGISTER_KERNEL(subtract, int64_t, phi::dtype::float16, complex64, - complex128) {} + complex128, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(divide, GPU, ALL_LAYOUT, diff --git a/paddle/phi/ops/compat/elementwise_sig.cc b/paddle/phi/ops/compat/elementwise_sig.cc index cddebcbce12..89846ea0563 100644 --- a/paddle/phi/ops/compat/elementwise_sig.cc +++ b/paddle/phi/ops/compat/elementwise_sig.cc @@ -100,6 +100,12 @@ KernelSignature ElementwiseSubGradOpArgumentMapping( return KernelSignature("unregistered", {}, {}, {}); } +KernelSignature ElementwiseSubDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "subtract_double_grad", {"Y", "DDX", "DDY", "DOut"}, {"axis"}, {"DDOut"}); +} + } // namespace phi PD_REGISTER_BASE_KERNEL_NAME(elementwise_add, add); @@ -110,6 +116,7 @@ PD_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad, add_grad); PD_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad_grad, add_double_grad); PD_REGISTER_BASE_KERNEL_NAME(elementwise_add_triple_grad, add_triple_grad); PD_REGISTER_BASE_KERNEL_NAME(elementwise_sub_grad, subtract_grad); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_sub_grad_grad, subtract_double_grad); PD_REGISTER_ARG_MAPPING_FN(elementwise_add, phi::ElementwiseAddOpArgumentMapping); @@ -127,3 +134,5 @@ PD_REGISTER_ARG_MAPPING_FN(elementwise_add_triple_grad, phi::ElementwiseAddTripleGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elementwise_sub_grad, phi::ElementwiseSubGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elementwise_sub_grad_grad, + phi::ElementwiseSubDoubleGradOpArgumentMapping); -- GitLab From 831b69d95f975cd20bb227d3ad193c9ba180dbd3 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Thu, 3 Mar 2022 14:08:47 +0800 Subject: [PATCH 022/261] reduce size of max_input_shape so that the ut can pass on win6 (#40088) --- .../tests/unittests/ir/inference/test_trt_convert_gather.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py index 9bcbbf95990..852bb2ffa84 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py @@ -138,7 +138,7 @@ class TrtConvertGatherTest(TrtLayerAutoScanTest): "index_data": [1] } self.dynamic_shape.max_input_shape = { - "input_data": [128, 256, 128, 256], + "input_data": [128, 256, 64, 128], "index_data": [4] } self.dynamic_shape.opt_input_shape = { -- GitLab From 756af9fff53245d264b7cc550e88e4360b9750e9 Mon Sep 17 00:00:00 2001 From: wangxinxin08 <69842442+wangxinxin08@users.noreply.github.com> Date: Thu, 3 Mar 2022 14:11:42 +0800 Subject: [PATCH 023/261] modify infershape of multiclass nms (#40059) * modify infershape of multiclass nms --- paddle/fluid/operators/detection/multiclass_nms_op.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git 
a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc index 7927410ef37..83cf6e5fd30 100644 --- a/paddle/fluid/operators/detection/multiclass_nms_op.cc +++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc @@ -93,7 +93,7 @@ class MultiClassNMSOp : public framework::OperatorWithKernel { // Here the box_dims[0] is not the real dimension of output. // It will be rewritten in the computing kernel. if (score_size == 3) { - ctx->SetOutputDim("Out", {box_dims[1], box_dims[2] + 2}); + ctx->SetOutputDim("Out", {-1, box_dims[2] + 2}); } else { ctx->SetOutputDim("Out", {-1, box_dims[2] + 2}); } @@ -545,11 +545,10 @@ class MultiClassNMS2Op : public MultiClassNMSOp { void InferShape(framework::InferShapeContext* ctx) const override { MultiClassNMSOp::InferShape(ctx); - auto box_dims = ctx->GetInputDim("BBoxes"); auto score_dims = ctx->GetInputDim("Scores"); auto score_size = score_dims.size(); if (score_size == 3) { - ctx->SetOutputDim("Index", {box_dims[1], 1}); + ctx->SetOutputDim("Index", {-1, 1}); } else { ctx->SetOutputDim("Index", {-1, 1}); } -- GitLab From b4665d23a766627965328e2adcbe167072c3d197 Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Thu, 3 Mar 2022 14:26:49 +0800 Subject: [PATCH 024/261] [CustomRuntime] migrate CustomRuntime into phi (#39908) --- paddle/fluid/framework/CMakeLists.txt | 3 +- paddle/fluid/framework/custom_kernel.cc | 47 ----- paddle/fluid/framework/custom_kernel.h | 26 --- paddle/fluid/framework/garbage_collector.cc | 10 +- paddle/fluid/framework/garbage_collector.h | 6 +- paddle/fluid/framework/operator.cc | 2 +- paddle/fluid/imperative/tracer.cc | 2 +- paddle/fluid/inference/api/CMakeLists.txt | 2 +- .../memory/allocation/allocator_facade.cc | 15 +- .../memory/allocation/custom_allocator.cc | 7 +- .../allocation/naive_best_fit_allocator.cc | 17 +- paddle/fluid/memory/detail/buddy_allocator.cc | 4 +- .../fluid/memory/detail/system_allocator.cc | 6 +- paddle/fluid/memory/memcpy.cc | 20 +-- paddle/fluid/platform/CMakeLists.txt | 2 +- paddle/fluid/platform/device/CMakeLists.txt | 20 --- .../platform/device/custom/CMakeLists.txt | 4 - .../platform/device/custom/enforce_custom.h | 5 +- paddle/fluid/platform/device/device_wrapper.h | 10 +- paddle/fluid/platform/device_context.cc | 2 +- paddle/fluid/platform/device_context.h | 6 +- paddle/fluid/platform/init.cc | 12 +- paddle/fluid/pybind/pybind.cc | 14 +- paddle/fluid/pybind/tensor_py.h | 4 +- paddle/phi/backends/CMakeLists.txt | 7 + .../backends}/callback_manager.cc | 12 +- .../backends}/callback_manager.h | 6 +- paddle/phi/backends/custom/CMakeLists.txt | 2 + paddle/phi/backends/custom/custom_context.cc | 10 +- .../backends}/custom/custom_device.cc | 167 +++++++++++------- .../backends}/custom/custom_device_test.cc | 30 ++-- .../backends}/custom/fake_cpu_device.h | 22 ++- .../device => phi/backends}/device_base.cc | 88 +++++---- .../device => phi/backends}/device_base.h | 44 +++-- .../device => phi/backends}/device_ext.h | 89 +++++++--- .../device => phi/backends}/device_guard.cc | 8 +- .../device => phi/backends}/device_guard.h | 12 +- .../device => phi/backends}/device_manager.cc | 100 ++++++----- .../device => phi/backends}/device_manager.h | 43 +++-- .../platform/device => phi/backends}/event.cc | 14 +- .../platform/device => phi/backends}/event.h | 6 +- .../device => phi/backends}/stream.cc | 19 +- .../platform/device => phi/backends}/stream.h | 11 +- paddle/phi/core/CMakeLists.txt | 2 +- paddle/phi/core/compat/convert_utils.cc | 6 +- 
paddle/phi/core/custom_kernel.cc | 24 +++ paddle/phi/core/custom_kernel.h | 2 + python/setup.py.in | 5 +- 48 files changed, 513 insertions(+), 462 deletions(-) delete mode 100644 paddle/fluid/framework/custom_kernel.cc delete mode 100644 paddle/fluid/framework/custom_kernel.h delete mode 100644 paddle/fluid/platform/device/custom/CMakeLists.txt rename paddle/{fluid/platform/device => phi/backends}/callback_manager.cc (84%) rename paddle/{fluid/platform/device => phi/backends}/callback_manager.h (94%) rename paddle/{fluid/platform/device => phi/backends}/custom/custom_device.cc (81%) rename paddle/{fluid/platform/device => phi/backends}/custom/custom_device_test.cc (86%) rename paddle/{fluid/platform/device => phi/backends}/custom/fake_cpu_device.h (90%) rename paddle/{fluid/platform/device => phi/backends}/device_base.cc (68%) rename paddle/{fluid/platform/device => phi/backends}/device_base.h (80%) rename paddle/{fluid/platform/device => phi/backends}/device_ext.h (78%) rename paddle/{fluid/platform/device => phi/backends}/device_guard.cc (83%) rename paddle/{fluid/platform/device => phi/backends}/device_guard.h (82%) rename paddle/{fluid/platform/device => phi/backends}/device_manager.cc (83%) rename paddle/{fluid/platform/device => phi/backends}/device_manager.h (83%) rename paddle/{fluid/platform/device => phi/backends}/event.cc (84%) rename paddle/{fluid/platform/device => phi/backends}/event.h (94%) rename paddle/{fluid/platform/device => phi/backends}/stream.cc (84%) rename paddle/{fluid/platform/device => phi/backends}/stream.h (89%) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 02d90b9c6da..e486799495c 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -440,11 +440,10 @@ message(STATUS "branch: ${PADDLE_BRANCH}") configure_file(commit.h.in commit.h) cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper phi_tensor op_meta_info phi_api) -cc_library(custom_kernel SRCS custom_kernel.cc DEPS op_registry phi_custom_kernel phi_tensor_raw) #cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ) #cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) -set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator custom_kernel) +set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator phi_custom_kernel) cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES}) diff --git a/paddle/fluid/framework/custom_kernel.cc b/paddle/fluid/framework/custom_kernel.cc deleted file mode 100644 index 49a1e0774a6..00000000000 --- a/paddle/fluid/framework/custom_kernel.cc +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#if defined _WIN32 || defined __APPLE__ -#else -#define _LINUX -#endif - -#include "paddle/fluid/framework/custom_kernel.h" -#include "paddle/phi/core/custom_kernel.h" - -namespace paddle { -namespace framework { - -void LoadCustomKernelLib(const std::string& dso_lib_path, void* dso_handle) { -#ifdef _LINUX - typedef phi::CustomKernelMap& get_custom_kernel_map_t(); - auto* func = reinterpret_cast( - dlsym(dso_handle, "PD_GetCustomKernelMap")); - - if (func == nullptr) { - LOG(WARNING) << "Skipped lib [" << dso_lib_path << "]: fail to find " - << "PD_GetCustomKernelMap symbol in this lib."; - return; - } - auto& custom_kernel_map = func(); - phi::RegisterCustomKernels(custom_kernel_map); - LOG(INFO) << "Successed in loading custom kernels in lib: " << dso_lib_path; -#else - VLOG(3) << "Unsupported: Custom kernel is only implemented on Linux."; -#endif - return; -} - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/custom_kernel.h b/paddle/fluid/framework/custom_kernel.h deleted file mode 100644 index 31084a34413..00000000000 --- a/paddle/fluid/framework/custom_kernel.h +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
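The deleted framework/custom_kernel.cc above resolved a plugin's PD_GetCustomKernelMap entry point with dlsym and then called phi::RegisterCustomKernels; the diffstat adds 24 lines to paddle/phi/core/custom_kernel.cc, where this loading logic presumably now lives. A standalone POSIX sketch of that dlopen/dlsym plugin-loading pattern follows; the library path and the RegisterFn signature are assumptions for illustration, not Paddle's actual types.

```cpp
// Standalone sketch of the dlopen/dlsym plugin-loading pattern used by
// LoadCustomKernelLib: open a shared object, look up a well-known C
// symbol, and call it so the plugin can register itself.
// Build note: on glibc older than 2.34, link with -ldl.
#include <dlfcn.h>

#include <cstdio>
#include <string>

using RegisterFn = void (*)();  // assumed entry-point signature

bool LoadPluginAndRegister(const std::string& so_path,
                           const char* symbol_name) {
  void* handle = dlopen(so_path.c_str(), RTLD_NOW | RTLD_LOCAL);
  if (handle == nullptr) {
    std::fprintf(stderr, "dlopen failed: %s\n", dlerror());
    return false;
  }
  // dlsym returns void*; casting it to a function pointer is the usual
  // POSIX idiom (custom_kernel.cc did the same with reinterpret_cast).
  auto* fn = reinterpret_cast<RegisterFn>(dlsym(handle, symbol_name));
  if (fn == nullptr) {
    std::fprintf(stderr, "symbol %s not found, skipping %s\n",
                 symbol_name, so_path.c_str());
    dlclose(handle);
    return false;
  }
  fn();  // plugin registers its kernels/devices inside this call
  return true;
}

int main() {
  // Hypothetical plugin path and entry point, for illustration only.
  LoadPluginAndRegister("./libmy_custom_kernels.so", "MyRegisterEverything");
  return 0;
}
```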
*/ - -#pragma once - -#include - -namespace paddle { -namespace framework { - -// Load custom kernel lib and register -void LoadCustomKernelLib(const std::string& dso_lib_path, void* dso_handle); - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 9f2bdeffecf..c1f8041cc1e 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -231,19 +231,19 @@ void CustomDeviceUnsafeFastGarbageCollector::ClearCallback( CustomStreamGarbageCollector::CustomStreamGarbageCollector( const platform::CustomPlace &place, size_t max_memory_size) : GarbageCollector(place, max_memory_size) { - platform::DeviceGuard guard(place); - stream_.reset(new platform::stream::Stream); + phi::DeviceGuard guard(place); + stream_.reset(new phi::stream::Stream); stream_->Init(place); - callback_manager_.reset(new platform::CallbackManager(stream_.get())); + callback_manager_.reset(new phi::CallbackManager(stream_.get())); } CustomStreamGarbageCollector::~CustomStreamGarbageCollector() { - platform::DeviceGuard guard(this->dev_ctx_->GetPlace()); + phi::DeviceGuard guard(this->dev_ctx_->GetPlace()); stream_->Synchronize(); stream_->Destroy(); } -platform::stream::Stream *CustomStreamGarbageCollector::stream() const { +phi::stream::Stream *CustomStreamGarbageCollector::stream() const { return stream_.get(); } diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index a67860c6087..f0027c67605 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -230,14 +230,14 @@ class CustomStreamGarbageCollector : public GarbageCollector { void Wait() const override; - platform::stream::Stream *stream() const; + phi::stream::Stream *stream() const; protected: void ClearCallback(const std::function &callback) override; private: - std::unique_ptr stream_; - std::unique_ptr callback_manager_; + std::unique_ptr stream_; + std::unique_ptr callback_manager_; }; #endif diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index b68748a687c..eff6d9a9102 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -254,7 +254,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { "reinstall Paddle with CustomDevice support.", place)); #else - platform::DeviceManager::SetDevice(place); + phi::DeviceManager::SetDevice(place); #endif } diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 85bcbd1458f..4336a5c77c1 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -253,7 +253,7 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, #endif } else if (platform::is_custom_place(place)) { #ifdef PADDLE_WITH_CUSTOM_DEVICE - platform::DeviceManager::SetDevice(place); + phi::DeviceManager::SetDevice(place); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with CustomDevice if use " diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 87efe5ec519..6eeb5d64253 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -31,7 +31,7 @@ cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tens cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) set(paddle_inference_api_deps 
lod_tensor scope reset_tensor_array - analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator custom_kernel) + analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator phi_custom_kernel) if(WITH_CRYPTO) list(APPEND paddle_inference_api_deps paddle_crypto) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 4d0e4852851..6b7828236a8 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -193,10 +193,10 @@ class AllocatorFacadePrivate { } #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE - auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes(); for (const auto& dev_type : device_types) { for (size_t dev_id = 0; - dev_id < platform::DeviceManager::GetDeviceCount(dev_type); + dev_id < phi::DeviceManager::GetDeviceCount(dev_type); ++dev_id) { InitNaiveBestFitCustomDeviceAllocator( platform::CustomPlace(dev_type, dev_id)); @@ -240,10 +240,10 @@ class AllocatorFacadePrivate { } #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE - auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes(); for (const auto& dev_type : device_types) { for (size_t dev_id = 0; - dev_id < platform::DeviceManager::GetDeviceCount(dev_type); + dev_id < phi::DeviceManager::GetDeviceCount(dev_type); ++dev_id) { InitAutoGrowthCustomDeviceAllocator( platform::CustomPlace(dev_type, dev_id), allow_free_idle_chunk); @@ -738,7 +738,7 @@ class AllocatorFacadePrivate { auto custom_allocator = std::make_shared(p); allocators_[p] = std::make_shared( - custom_allocator, platform::DeviceManager::GetMinChunkSize(p), + custom_allocator, phi::DeviceManager::GetMinChunkSize(p), allow_free_idle_chunk); } #endif @@ -814,11 +814,10 @@ class AllocatorFacadePrivate { } #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE - auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes(); for (const auto& dev_type : device_types) { for (size_t dev_id = 0; - dev_id < platform::DeviceManager::GetDeviceCount(dev_type); - dev_id++) { + dev_id < phi::DeviceManager::GetDeviceCount(dev_type); dev_id++) { places.emplace_back(platform::CustomPlace(dev_type, dev_id)); } } diff --git a/paddle/fluid/memory/allocation/custom_allocator.cc b/paddle/fluid/memory/allocation/custom_allocator.cc index bd52c8f4ad2..e53d7b1cc76 100644 --- a/paddle/fluid/memory/allocation/custom_allocator.cc +++ b/paddle/fluid/memory/allocation/custom_allocator.cc @@ -32,17 +32,16 @@ void CustomAllocator::FreeImpl(phi::Allocation* allocation) { } phi::Allocation* CustomAllocator::AllocateImpl(size_t size) { - std::call_once(once_flag_, - [this] { platform::DeviceManager::SetDevice(place_); }); + std::call_once(once_flag_, [this] { phi::DeviceManager::SetDevice(place_); }); void* ptr = - platform::DeviceManager::GetDeviceWithPlace(place_)->MemoryAllocate(size); + phi::DeviceManager::GetDeviceWithPlace(place_)->MemoryAllocate(size); if (LIKELY(ptr)) { return new Allocation(ptr, size, place_); } size_t avail, total; - platform::DeviceManager::MemoryStats(place_, &total, &avail); + phi::DeviceManager::MemoryStats(place_, &total, &avail); auto dev_type = platform::PlaceHelper::GetDeviceType(place_); auto dev_id = platform::PlaceHelper::GetDeviceId(place_); diff --git 
a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index ea6d7019be6..0bfbe2c6962 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -739,7 +739,7 @@ class BuddyAllocatorList { private: explicit BuddyAllocatorList(const std::string &device_type) : device_type_(device_type) { - auto devices = platform::DeviceManager::GetDeviceList(device_type); + auto devices = phi::DeviceManager::GetDeviceList(device_type); for (auto dev_id : devices) { init_flags_[dev_id].reset(new std::once_flag()); } @@ -766,15 +766,15 @@ class BuddyAllocatorList { device_type_, dev_id)); std::call_once(*init_flags_[dev_id], [this, dev_id] { - platform::DeviceManager::SetDevice(device_type_, dev_id); + phi::DeviceManager::SetDevice(device_type_, dev_id); platform::CustomPlace place(device_type_, dev_id); allocators_[dev_id].reset(new BuddyAllocator( std::unique_ptr( new detail::CustomAllocator(device_type_, dev_id)), - platform::DeviceManager::GetMinChunkSize(place), - platform::DeviceManager::GetMaxChunkSize(place), - platform::DeviceManager::GetExtraPaddingSize(place), device_type_)); + phi::DeviceManager::GetMinChunkSize(place), + phi::DeviceManager::GetMaxChunkSize(place), + phi::DeviceManager::GetExtraPaddingSize(place), device_type_)); }); return allocators_[dev_id].get(); @@ -808,9 +808,9 @@ void *Alloc(const platform::CustomPlace &place, auto *ptr = buddy_allocator->Alloc(size); if (ptr == nullptr) { - platform::DeviceGuard guard(place); + phi::DeviceGuard guard(place); size_t avail, total; - platform::DeviceManager::MemoryStats(place, &total, &avail); + phi::DeviceManager::MemoryStats(place, &total, &avail); PADDLE_THROW(platform::errors::ResourceExhausted( "Cannot allocate %s in %s:%d, avaliable %s, total %s, used " "%s. 
", @@ -819,8 +819,7 @@ void *Alloc(const platform::CustomPlace &place, string::HumanReadableSize(total - avail))); } else { if (FLAGS_init_allocated_mem) { - platform::DeviceManager::GetDeviceWithPlace(place)->MemorySet(ptr, 0xEF, - size); + phi::DeviceManager::GetDeviceWithPlace(place)->MemorySet(ptr, 0xEF, size); } } VLOG(10) << " pointer=" << ptr; diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index d7bbfba932c..cdaa2b7b1df 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -43,11 +43,11 @@ BuddyAllocator::BuddyAllocator( #ifdef PADDLE_WITH_CUSTOM_DEVICE if (!dev_type.empty()) { init_allocate_size_func_ = [dev_type]() { - return platform::DeviceManager::GetInitAllocSize( + return phi::DeviceManager::GetInitAllocSize( platform::PlaceHelper::CreatePlace(dev_type)); }; re_allocate_size_func_ = [dev_type]() { - return platform::DeviceManager::GetReallocSize( + return phi::DeviceManager::GetReallocSize( platform::PlaceHelper::CreatePlace(dev_type)); }; } else { diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index a61f98c4e1a..37ac0b44832 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -438,7 +438,7 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) { void* p; auto place = platform::CustomPlace(dev_type_, dev_id_); - auto device = platform::DeviceManager::GetDeviceWithPlace(place); + auto device = phi::DeviceManager::GetDeviceWithPlace(place); p = device->MemoryAllocate(size); if (LIKELY(p)) { VLOG(4) << "CustomAllocator::Alloc " << p << " size " << size; @@ -447,7 +447,7 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) { } else { size_t avail, total; - platform::DeviceManager::MemoryStats(place, &total, &avail); + phi::DeviceManager::MemoryStats(place, &total, &avail); PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( "\n\nOut of memory error on %s %d. 
" "total memory is %s, used memory is %s, " @@ -470,7 +470,7 @@ void CustomAllocator::Free(void* p, size_t size, size_t index) { size, plug_alloc_size)); plug_alloc_size -= size; auto place = platform::CustomPlace(dev_type_, dev_id_); - auto device = platform::DeviceManager::GetDeviceWithPlace(place); + auto device = phi::DeviceManager::GetDeviceWithPlace(place); device->MemoryDeallocate(p, size); } diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 166cdd0b5d6..3198b4f8d93 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -44,9 +44,9 @@ void Copy( VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << ", stream=" << stream; - platform::DeviceManager::SetDevice(src_place); - platform::stream::Stream stream_wrapper(src_place, stream); - platform::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyD2H( + phi::DeviceManager::SetDevice(src_place); + phi::stream::Stream stream_wrapper(src_place, stream); + phi::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyD2H( dst, src, num, &stream_wrapper); } @@ -62,9 +62,9 @@ void Copy( VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << ", stream=" << stream; - platform::DeviceManager::SetDevice(dst_place); - platform::stream::Stream stream_wrapper(dst_place, stream); - platform::DeviceManager::GetDeviceWithPlace(dst_place)->MemoryCopyH2D( + phi::DeviceManager::SetDevice(dst_place); + phi::stream::Stream stream_wrapper(dst_place, stream); + phi::DeviceManager::GetDeviceWithPlace(dst_place)->MemoryCopyH2D( dst, src, num, &stream_wrapper); } @@ -82,16 +82,16 @@ void Copy( << dst_place << ", stream=" << stream; if (src_type == dst_type) { - platform::DeviceManager::SetDevice(src_place); - platform::stream::Stream stream_wrapper(src_place, stream); + phi::DeviceManager::SetDevice(src_place); + phi::stream::Stream stream_wrapper(src_place, stream); auto src_id = platform::PlaceHelper::GetDeviceId(src_place); auto dst_id = platform::PlaceHelper::GetDeviceId(dst_place); if (src_id == dst_id) { - platform::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyD2D( + phi::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyD2D( dst, src, num, &stream_wrapper); } else { - platform::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyP2P( + phi::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyP2P( dst_place, dst, src, num, &stream_wrapper); } } else { diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 04c8a329e5e..5a47443fd0b 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -117,7 +117,7 @@ endif() cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost) # seperate init from device_context to avoid cycle dependencies -cc_library(init SRCS init.cc DEPS device_context custom_kernel) +cc_library(init SRCS init.cc DEPS device_context phi_custom_kernel) # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies diff --git a/paddle/fluid/platform/device/CMakeLists.txt b/paddle/fluid/platform/device/CMakeLists.txt index ecad5340d71..cbf3fdd263b 100644 --- a/paddle/fluid/platform/device/CMakeLists.txt +++ b/paddle/fluid/platform/device/CMakeLists.txt @@ -1,15 +1,3 @@ -IF(WITH_CUSTOM_DEVICE) -cc_library(callback_manager SRCS callback_manager.cc DEPS enforce place) - -cc_library(device_guard SRCS device_guard.cc DEPS enforce place) - -cc_library(stream SRCS 
stream.cc DEPS callback_manager) - -cc_library(event SRCS event.cc DEPS enforce place) - -cc_library(device_base SRCS device_base.cc DEPS stream event callback_manager device_guard device_context flags) - -ENDIF() set(DEV_LIBS custom_device) @@ -37,11 +25,3 @@ ENDIF() IF(WITH_MLU) add_subdirectory(mlu) ENDIF() - -# CUSTOM -IF(WITH_CUSTOM_DEVICE) - add_subdirectory(custom) - - cc_library(device_manager SRCS device_manager.cc DEPS custom_device) - set(GLOB_DEV_LIB device_manager custom_device CACHE INTERNAL "Global DEV library") -ENDIF() diff --git a/paddle/fluid/platform/device/custom/CMakeLists.txt b/paddle/fluid/platform/device/custom/CMakeLists.txt deleted file mode 100644 index f39c60c0c68..00000000000 --- a/paddle/fluid/platform/device/custom/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -IF(WITH_CUSTOM_DEVICE) -cc_library(custom_device SRCS custom_device.cc DEPS device_base device_context) -cc_test(custom_device_test SRCS custom_device_test.cc DEPS device_manager device_context ) -ENDIF() diff --git a/paddle/fluid/platform/device/custom/enforce_custom.h b/paddle/fluid/platform/device/custom/enforce_custom.h index fbdb4627aba..ba92b4ac7de 100644 --- a/paddle/fluid/platform/device/custom/enforce_custom.h +++ b/paddle/fluid/platform/device/custom/enforce_custom.h @@ -14,7 +14,10 @@ limitations under the License. */ #pragma once #ifdef PADDLE_WITH_CUSTOM_DEVICE -#include "paddle/fluid/platform/device/device_ext.h" +#include + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/backends/device_ext.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/device/device_wrapper.h b/paddle/fluid/platform/device/device_wrapper.h index ba3461d8c14..6803a39a4fd 100644 --- a/paddle/fluid/platform/device/device_wrapper.h +++ b/paddle/fluid/platform/device/device_wrapper.h @@ -40,10 +40,10 @@ limitations under the License. */ #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE -#include "paddle/fluid/platform/device/callback_manager.h" #include "paddle/fluid/platform/device/custom/enforce_custom.h" -#include "paddle/fluid/platform/device/device_guard.h" -#include "paddle/fluid/platform/device/device_manager.h" -#include "paddle/fluid/platform/device/event.h" -#include "paddle/fluid/platform/device/stream.h" +#include "paddle/phi/backends/callback_manager.h" +#include "paddle/phi/backends/device_guard.h" +#include "paddle/phi/backends/device_manager.h" +#include "paddle/phi/backends/event.h" +#include "paddle/phi/backends/stream.h" #endif diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 6a7956628f8..f60cbc48694 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -903,7 +903,7 @@ MKLDNNDeviceContext::BlobPtr_t MKLDNNDeviceContext::GetBlob( CustomDeviceContext::CustomDeviceContext(CustomPlace place) : phi::CustomContext(place) { Init(); - stream_.reset(new platform::stream::Stream(place, stream())); + stream_.reset(new phi::stream::Stream(place, stream())); } CustomDeviceContext::~CustomDeviceContext() {} diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index e9124dfc1f8..29b6477b683 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -72,8 +72,8 @@ limitations under the License. 
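// ---------------------------------------------------------------------------
// [Editorial note -- illustrative sketch, not part of this patch] The hunks
// above (memcpy.cc, device_wrapper.h, device_context.cc) reroute copies and
// stream handling through the relocated phi::DeviceManager / phi::stream
// APIs. Below is a minimal caller-side view of that path, using only calls
// that appear in the hunks; the raw void* stream handle and the header
// locations are assumptions taken from the renamed files.
#include <cstddef>
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/backends/device_manager.h"
#include "paddle/phi/backends/stream.h"

void SketchCopyH2D(const paddle::platform::CustomPlace& dst_place, void* dst,
                   const void* src, size_t num, void* raw_stream) {
  phi::DeviceManager::SetDevice(dst_place);                   // bind the target device
  phi::stream::Stream stream_wrapper(dst_place, raw_stream);  // wrap the plugin stream
  phi::DeviceManager::GetDeviceWithPlace(dst_place)->MemoryCopyH2D(
      dst, src, num, &stream_wrapper);  // same call the memcpy.cc hunk now makes
}
// ---------------------------------------------------------------------------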
*/ #include "paddle/fluid/platform/device/npu/npu_stream.h" #endif -#include "paddle/fluid/platform/device/device_ext.h" -#include "paddle/fluid/platform/device/stream.h" +#include "paddle/phi/backends/device_ext.h" +#include "paddle/phi/backends/stream.h" #if !defined(PADDLE_WITH_XPU_KP) || defined(__xpu_on_host__) #include "unsupported/Eigen/CXX11/Tensor" @@ -838,7 +838,7 @@ class CustomDeviceContext : public phi::CustomContext { void WaitStreamCallback() const { return stream_->WaitCallback(); } private: - std::shared_ptr stream_; + std::shared_ptr stream_; }; template <> struct DefaultDeviceContextType { diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 372bfbce2ac..cf85dede8e8 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -55,7 +55,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device/ipu/ipu_info.h" #endif -#include "paddle/fluid/framework/custom_kernel.h" +#include "paddle/phi/core/custom_kernel.h" DECLARE_int32(paddle_num_threads); PADDLE_DEFINE_EXPORTED_int32( @@ -145,7 +145,7 @@ void InitCupti() { #ifdef PADDLE_WITH_CUSTOM_DEVICE void LoadCustomDevice(const std::string &library_dir) { LOG(INFO) << "Try loading custom device libs from: [" << library_dir << "]"; - std::vector libs = platform::ListAllLibraries(library_dir); + std::vector libs = phi::ListAllLibraries(library_dir); for (const auto &lib_path : libs) { auto dso_handle = dlopen(lib_path.c_str(), RTLD_NOW); PADDLE_ENFORCE_NOT_NULL( @@ -153,8 +153,8 @@ void LoadCustomDevice(const std::string &library_dir) { platform::errors::InvalidArgument( "Fail to open library: %s with error: %s", lib_path, dlerror())); - platform::LoadCustomRuntimeLib(lib_path, dso_handle); - framework::LoadCustomKernelLib(lib_path, dso_handle); + phi::LoadCustomRuntimeLib(lib_path, dso_handle); + phi::LoadCustomKernelLib(lib_path, dso_handle); } LOG(INFO) << "Finished in LoadCustomDevice with libs_path: [" << library_dir << "]"; @@ -259,9 +259,9 @@ void InitDevices(const std::vector devices) { LOG(INFO) << "ENV [CUSTOM_DEVICE_ROOT]=" << custom_kernel_root; LoadCustomDevice(custom_kernel_root); - auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes(); for (auto &dev_type : device_types) { - auto device_count = platform::DeviceManager::GetDeviceCount(dev_type); + auto device_count = phi::DeviceManager::GetDeviceCount(dev_type); LOG(INFO) << "CustomDevice: " << dev_type << ", visible devices count: " << device_count; for (size_t i = 0; i < device_count; i++) { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index ffc42dc30ed..c016321ef80 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1668,7 +1668,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("get_all_device_type", []() { std::vector device_types; #ifdef PADDLE_WITH_CUSTOM_DEVICE - device_types = platform::DeviceManager::GetAllDeviceTypes(); + device_types = phi::DeviceManager::GetAllDeviceTypes(); #else LOG(WARNING) << string::Sprintf( "Cannot use get_all_device_type because you have installed" @@ -1682,7 +1682,7 @@ All parameter, weight, gradient are variables in Paddle. 
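// ---------------------------------------------------------------------------
// [Editorial note -- illustrative sketch, not part of this patch] The init.cc
// hunk above now discovers plugin libraries and hands them to the relocated
// phi loaders. This is that flow condensed; error handling is trimmed, and
// the assumption is that phi::ListAllLibraries / phi::LoadCustomRuntimeLib
// are declared in device_manager.h.
#include <dlfcn.h>
#include <string>
#include <vector>
#include "paddle/phi/backends/device_manager.h"
#include "paddle/phi/core/custom_kernel.h"

void SketchLoadCustomDevice(const std::string& library_dir) {
  std::vector<std::string> libs = phi::ListAllLibraries(library_dir);
  for (const auto& lib_path : libs) {
    void* dso_handle = dlopen(lib_path.c_str(), RTLD_NOW);
    if (dso_handle == nullptr) continue;              // init.cc enforces non-null instead
    phi::LoadCustomRuntimeLib(lib_path, dso_handle);  // registers the device type
    phi::LoadCustomKernelLib(lib_path, dso_handle);   // registers its custom kernels
  }
  // What InitDevices then does with the registrations:
  for (const auto& dev_type : phi::DeviceManager::GetAllCustomDeviceTypes()) {
    auto count = phi::DeviceManager::GetDeviceCount(dev_type);
    (void)count;  // init.cc logs this and creates a CustomPlace per device id
  }
}
// ---------------------------------------------------------------------------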
m.def("get_all_custom_device_type", []() { std::vector device_types; #ifdef PADDLE_WITH_CUSTOM_DEVICE - device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + device_types = phi::DeviceManager::GetAllCustomDeviceTypes(); #else LOG(WARNING) << string::Sprintf( "Cannot use get_all_custom_device_type because you have installed" @@ -1696,7 +1696,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("get_available_device", [] { std::vector devices; #ifdef PADDLE_WITH_CUSTOM_DEVICE - devices = platform::DeviceManager::GetAllDeviceList(); + devices = phi::DeviceManager::GetAllDeviceList(); #else LOG(WARNING) << string::Sprintf( "Cannot use get_available_device because you have installed" @@ -1710,7 +1710,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("get_available_custom_device", [] { std::vector devices; #ifdef PADDLE_WITH_CUSTOM_DEVICE - devices = platform::DeviceManager::GetAllCustomDeviceList(); + devices = phi::DeviceManager::GetAllCustomDeviceList(); #else LOG(WARNING) << string::Sprintf( "Cannot use get_available_custom_device because you have " @@ -1747,10 +1747,10 @@ All parameter, weight, gradient are variables in Paddle. std::exit(-1); } - if (LIKELY(platform::DeviceManager::HasDeviceType(device_type) && - platform::DeviceManager::IsCustom(device_type))) { + if (LIKELY(phi::DeviceManager::HasDeviceType(device_type) && + phi::DeviceManager::IsCustom(device_type))) { int dev_count = static_cast( - platform::DeviceManager::GetDeviceCount(device_type)); + phi::DeviceManager::GetDeviceCount(device_type)); if (UNLIKELY(dev_id >= dev_count)) { if (dev_count == 0) { LOG(ERROR) << "Cannot use " << device_type diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index e7abd64ec44..c593c7df3e0 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -393,10 +393,10 @@ void SetTensorFromPyArrayT( } else if (paddle::platform::is_custom_place(place)) { #ifdef PADDLE_WITH_CUSTOM_DEVICE platform::Place tmp_place = place; - platform::DeviceGuard guard(tmp_place); + phi::DeviceGuard guard(tmp_place); auto dst = self->mutable_data(place); - platform::DeviceManager::GetDeviceWithPlace(tmp_place)->MemoryCopyH2D( + phi::DeviceManager::GetDeviceWithPlace(tmp_place)->MemoryCopyH2D( reinterpret_cast(dst), const_cast(reinterpret_cast(array.data())), array.nbytes()); diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index 43e477ef32e..5f616155546 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -24,4 +24,11 @@ endif() if(WITH_CUSTOM_DEVICE) add_dependencies(phi_context custom_context) + cc_library(callback_manager SRCS callback_manager.cc DEPS enforce place) + cc_library(device_guard SRCS device_guard.cc DEPS enforce place) + cc_library(stream SRCS stream.cc DEPS callback_manager) + cc_library(event SRCS event.cc DEPS enforce place) + cc_library(device_base SRCS device_base.cc DEPS stream event callback_manager device_guard device_context flags) + cc_library(device_manager SRCS device_manager.cc DEPS custom_device) + set(GLOB_DEV_LIB device_manager custom_device CACHE INTERNAL "Global DEV library") endif() diff --git a/paddle/fluid/platform/device/callback_manager.cc b/paddle/phi/backends/callback_manager.cc similarity index 84% rename from paddle/fluid/platform/device/callback_manager.cc rename to paddle/phi/backends/callback_manager.cc index c677bc0262f..e21e8502d8f 100644 --- a/paddle/fluid/platform/device/callback_manager.cc +++ 
b/paddle/phi/backends/callback_manager.cc @@ -12,12 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/platform/device/callback_manager.h" +#include "paddle/phi/backends/callback_manager.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/enforce.h" -namespace paddle { -namespace platform { +namespace phi { CallbackManager::CallbackManager(stream::Stream *stream) : stream_(stream), thread_pool_(1) {} @@ -32,12 +31,12 @@ void CallbackManager::AddCallback(std::function callback) const { }); }); - platform::DeviceManager::GetDeviceWithPlace(stream_->GetPlace()) + phi::DeviceManager::GetDeviceWithPlace(stream_->GetPlace()) ->AddCallback(stream_, func); } void CallbackManager::Wait() const { - platform::DeviceManager::GetDeviceWithPlace(stream_->GetPlace()) + phi::DeviceManager::GetDeviceWithPlace(stream_->GetPlace()) ->SynchronizeStream(stream_); { @@ -48,5 +47,4 @@ void CallbackManager::Wait() const { } } -} // namespace platform -} // namespace paddle +} // namespace phi diff --git a/paddle/fluid/platform/device/callback_manager.h b/paddle/phi/backends/callback_manager.h similarity index 94% rename from paddle/fluid/platform/device/callback_manager.h rename to paddle/phi/backends/callback_manager.h index 0edc694c94b..a15cb075668 100644 --- a/paddle/fluid/platform/device/callback_manager.h +++ b/paddle/phi/backends/callback_manager.h @@ -32,8 +32,7 @@ #include "paddle/fluid/platform/enforce.h" -namespace paddle { -namespace platform { +namespace phi { namespace stream { class Stream; @@ -58,5 +57,4 @@ class CallbackManager { mutable std::future last_future_; }; -} // namespace platform -} // namespace paddle +} // namespace phi diff --git a/paddle/phi/backends/custom/CMakeLists.txt b/paddle/phi/backends/custom/CMakeLists.txt index cb54d367568..5b46afb4ce9 100644 --- a/paddle/phi/backends/custom/CMakeLists.txt +++ b/paddle/phi/backends/custom/CMakeLists.txt @@ -1,3 +1,5 @@ if (WITH_CUSTOM_DEVICE) cc_library(custom_context SRCS custom_context.cc DEPS phi_device_context device_manager) + cc_library(custom_device SRCS custom_device.cc DEPS device_base device_context) + cc_test(custom_device_test SRCS custom_device_test.cc DEPS device_manager device_context) endif() diff --git a/paddle/phi/backends/custom/custom_context.cc b/paddle/phi/backends/custom/custom_context.cc index bde3b6a0853..e34e0f94b70 100644 --- a/paddle/phi/backends/custom/custom_context.cc +++ b/paddle/phi/backends/custom/custom_context.cc @@ -14,8 +14,8 @@ limitations under the License. 
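// ---------------------------------------------------------------------------
// [Editorial note -- illustrative sketch, not part of this patch] Usage of the
// relocated phi::CallbackManager shown above: attach a host callback to a
// stream and block until the device has drained it. The std::function<void()>
// parameter type is inferred -- the template arguments were lost in this
// listing -- so treat the exact signature as an assumption.
#include <functional>
#include "paddle/phi/backends/callback_manager.h"
#include "paddle/phi/backends/stream.h"

void SketchHostCallback(phi::stream::Stream* stream,
                        std::function<void()> on_done) {
  phi::CallbackManager manager(stream);  // owns the single-thread pool built in the ctor
  manager.AddCallback(on_done);          // enqueued, then registered on the device stream
  manager.Wait();                        // SynchronizeStream + wait on the last future
}
// ---------------------------------------------------------------------------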
*/ #include "paddle/phi/backends/custom/custom_context.h" -#include "paddle/fluid/platform/device/device_guard.h" -#include "paddle/fluid/platform/device/stream.h" +#include "paddle/phi/backends/device_guard.h" +#include "paddle/phi/backends/stream.h" namespace phi { @@ -25,8 +25,8 @@ struct CustomContext::Impl { ~Impl() {} void Init() { - paddle::platform::DeviceGuard guard(place_); - stream_.reset(new paddle::platform::stream::Stream()); + phi::DeviceGuard guard(place_); + stream_.reset(new phi::stream::Stream()); stream_->Init(place_); } @@ -40,7 +40,7 @@ struct CustomContext::Impl { Place place_; - std::shared_ptr stream_; + std::shared_ptr stream_; }; void CustomContext::Init() { impl_->Init(); } diff --git a/paddle/fluid/platform/device/custom/custom_device.cc b/paddle/phi/backends/custom/custom_device.cc similarity index 81% rename from paddle/fluid/platform/device/custom/custom_device.cc rename to paddle/phi/backends/custom/custom_device.cc index 09f0421a878..df757b286a6 100644 --- a/paddle/fluid/platform/device/custom/custom_device.cc +++ b/paddle/phi/backends/custom/custom_device.cc @@ -12,23 +12,28 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/platform/device/device_base.h" -#include "paddle/fluid/platform/device/device_wrapper.h" -#include "paddle/fluid/platform/device/event.h" -#include "paddle/fluid/platform/device/stream.h" +#include "paddle/fluid/platform/device/custom/enforce_custom.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/backends/callback_manager.h" +#include "paddle/phi/backends/device_base.h" +#include "paddle/phi/backends/device_guard.h" +#include "paddle/phi/backends/device_manager.h" +#include "paddle/phi/backends/event.h" +#include "paddle/phi/backends/stream.h" static bool operator==(const C_Device_st& d1, const C_Device_st& d2) { return d1.id == d2.id; } -namespace paddle { -namespace platform { +namespace phi { class CustomDevice : public DeviceInterface { public: - CustomDevice(const std::string& type, int priority, bool is_custom, - std::unique_ptr pimpl, void* dso_handle) + CustomDevice(const std::string& type, + int priority, + bool is_custom, + std::unique_ptr pimpl, + void* dso_handle) : DeviceInterface(type, priority, is_custom), pimpl_(std::move(pimpl)), dso_handle_(dso_handle) { @@ -122,14 +127,15 @@ class CustomDevice : public DeviceInterface { return device.id; } - void CreateStream(size_t dev_id, stream::Stream* stream, + void CreateStream(size_t dev_id, + stream::Stream* stream, const stream::Stream::Priority& priority = stream::Stream::Priority::kNormal, const stream::Stream::Flag& flag = stream::Stream::Flag::kDefaultFlag) override { if (priority != stream::Stream::Priority::kNormal || flag != stream::Stream::Flag::kDefaultFlag) { - PADDLE_THROW(platform::errors::Unavailable( + PADDLE_THROW(phi::errors::Unavailable( "priority != stream::Stream::Priority::kNormal || flag != " "stream::Stream::Flag::kDefaultFlag is not allowed on " "CustomDevice.")); @@ -162,23 +168,28 @@ class CustomDevice : public DeviceInterface { SynchronizeStream(dev_id, stream); return true; } - if (pimpl_->query_stream(device, reinterpret_cast( - stream->raw_stream())) == C_SUCCESS) { + if (pimpl_->query_stream( + device, reinterpret_cast(stream->raw_stream())) == + C_SUCCESS) { return true; } return false; } - void AddCallback(size_t dev_id, stream::Stream* stream, + void AddCallback(size_t dev_id, + stream::Stream* stream, stream::Stream::Callback* 
callback) override { if (!pimpl_->stream_add_callback) { - PADDLE_THROW(platform::errors::Unavailable( + PADDLE_THROW(phi::errors::Unavailable( "AddCallback is not supported on %s.", Type())); } else { const auto device = &devices_pool[dev_id]; PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->stream_add_callback( - device, reinterpret_cast(stream->raw_stream()), - [](C_Device device, C_Stream stream, void* user_data, + device, + reinterpret_cast(stream->raw_stream()), + [](C_Device device, + C_Stream stream, + void* user_data, C_Status* status) { std::unique_ptr> func( reinterpret_cast*>(user_data)); @@ -188,7 +199,8 @@ class CustomDevice : public DeviceInterface { } } - void CreateEvent(size_t dev_id, event::Event* event, + void CreateEvent(size_t dev_id, + event::Event* event, event::Event::Flag flags) override { const auto device = &devices_pool[dev_id]; C_Event c_event; @@ -205,13 +217,15 @@ class CustomDevice : public DeviceInterface { device, reinterpret_cast(event->raw_event()))); } - void RecordEvent(size_t dev_id, const event::Event* event, + void RecordEvent(size_t dev_id, + const event::Event* event, const stream::Stream* stream) override { const auto device = &devices_pool[dev_id]; - PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->record_event( - device, reinterpret_cast(stream->raw_stream()), - reinterpret_cast(event->raw_event()))); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->record_event(device, + reinterpret_cast(stream->raw_stream()), + reinterpret_cast(event->raw_event()))); } void SynchronizeEvent(size_t dev_id, const event::Event* event) override { @@ -228,78 +242,93 @@ class CustomDevice : public DeviceInterface { SynchronizeEvent(dev_id, event); return true; } - if (pimpl_->query_event(device, reinterpret_cast( - event->raw_event())) == C_SUCCESS) { + if (pimpl_->query_event(device, + reinterpret_cast(event->raw_event())) == + C_SUCCESS) { return true; } return false; } - void StreamWaitEvent(size_t dev_id, const stream::Stream* stream, + void StreamWaitEvent(size_t dev_id, + const stream::Stream* stream, const event::Event* event) override { const auto device = &devices_pool[dev_id]; PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->stream_wait_event( - device, reinterpret_cast(stream->raw_stream()), + device, + reinterpret_cast(stream->raw_stream()), reinterpret_cast(event->raw_event()))); } - void MemoryCopyH2D(size_t dev_id, void* dst, const void* src, size_t size, + void MemoryCopyH2D(size_t dev_id, + void* dst, + const void* src, + size_t size, const stream::Stream* stream = nullptr) override { const auto device = &devices_pool[dev_id]; - auto place = platform::CustomPlace(Type(), dev_id); + auto place = CustomPlace(Type(), dev_id); if (stream && stream->raw_stream() && pimpl_->async_memory_copy_h2d) { C_Stream c_stream = reinterpret_cast(stream->raw_stream()); PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( pimpl_->async_memory_copy_h2d(device, c_stream, dst, src, size)); } else { - platform::DeviceContextPool& pool = - platform::DeviceContextPool::Instance(); + paddle::platform::DeviceContextPool& pool = + paddle::platform::DeviceContextPool::Instance(); pool.Get(place)->Wait(); PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( pimpl_->memory_copy_h2d(device, dst, src, size)); } } - void MemoryCopyD2H(size_t dev_id, void* dst, const void* src, size_t size, + void MemoryCopyD2H(size_t dev_id, + void* dst, + const void* src, + size_t size, const stream::Stream* stream = nullptr) override { const auto device = &devices_pool[dev_id]; - auto place = platform::CustomPlace(Type(), dev_id); + 
auto place = CustomPlace(Type(), dev_id); if (stream && stream->raw_stream() && pimpl_->async_memory_copy_d2h) { C_Stream c_stream = reinterpret_cast(stream->raw_stream()); PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( pimpl_->async_memory_copy_d2h(device, c_stream, dst, src, size)); } else { - platform::DeviceContextPool& pool = - platform::DeviceContextPool::Instance(); + paddle::platform::DeviceContextPool& pool = + paddle::platform::DeviceContextPool::Instance(); pool.Get(place)->Wait(); PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( pimpl_->memory_copy_d2h(device, dst, src, size)); } } - void MemoryCopyD2D(size_t dev_id, void* dst, const void* src, size_t size, + void MemoryCopyD2D(size_t dev_id, + void* dst, + const void* src, + size_t size, const stream::Stream* stream = nullptr) override { const auto device = &devices_pool[dev_id]; - auto place = platform::CustomPlace(Type(), dev_id); + auto place = CustomPlace(Type(), dev_id); if (stream && stream->raw_stream() && pimpl_->async_memory_copy_d2d) { C_Stream c_stream = reinterpret_cast(stream->raw_stream()); PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( pimpl_->async_memory_copy_d2d(device, c_stream, dst, src, size)); } else { - platform::DeviceContextPool& pool = - platform::DeviceContextPool::Instance(); + paddle::platform::DeviceContextPool& pool = + paddle::platform::DeviceContextPool::Instance(); pool.Get(place)->Wait(); PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( pimpl_->memory_copy_d2d(device, dst, src, size)); } } - void MemoryCopyP2P(const Place& dst_place, void* dst, size_t src_dev_id, - const void* src, size_t size, + void MemoryCopyP2P(const Place& dst_place, + void* dst, + size_t src_dev_id, + const void* src, + size_t size, const stream::Stream* stream = nullptr) override { int dst_dev_id = PlaceToId(dst_place); auto dst_device = &devices_pool[dst_dev_id]; @@ -310,8 +339,12 @@ class CustomDevice : public DeviceInterface { MemoryCopyP2P(dst_place, dst, src_dev_id, src, size); } else { PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->async_memory_copy_p2p( - dst_device, src_device, - reinterpret_cast(stream->raw_stream()), dst, src, size)); + dst_device, + src_device, + reinterpret_cast(stream->raw_stream()), + dst, + src, + size)); } } else { if (!pimpl_->memory_copy_p2p) { @@ -319,9 +352,9 @@ class CustomDevice : public DeviceInterface { MemoryCopyD2H(src_dev_id, tmp.get(), src, size); MemoryCopyH2D(dst_dev_id, dst, tmp.get(), size); } else { - auto src_place = platform::CustomPlace(Type(), src_dev_id); - platform::DeviceContextPool& pool = - platform::DeviceContextPool::Instance(); + auto src_place = CustomPlace(Type(), src_dev_id); + paddle::platform::DeviceContextPool& pool = + paddle::platform::DeviceContextPool::Instance(); pool.Get(src_place)->Wait(); PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( pimpl_->memory_copy_p2p(dst_device, src_device, dst, src, size)); @@ -350,8 +383,8 @@ class CustomDevice : public DeviceInterface { const auto device = &devices_pool[dev_id]; if (!pimpl_->unified_memory_allocate) { - PADDLE_THROW(platform::errors::Unavailable( - "MemoryAllocKind::Host is not supported on %s.", Type())); + PADDLE_THROW(phi::errors::Unavailable( + "MemoryAllocateHost is not supported on %s.", Type())); } else { PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( pimpl_->host_memory_allocate(device, &ptr, size)); @@ -363,8 +396,8 @@ class CustomDevice : public DeviceInterface { const auto device = &devices_pool[dev_id]; if (!pimpl_->host_memory_deallocate) { - PADDLE_THROW(platform::errors::Unavailable( - "MemoryAllocKind::Host is not supported on %s.", Type())); 
+ PADDLE_THROW(phi::errors::Unavailable( + "MemoryDeallocateHost is not supported on %s.", Type())); } else { PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( pimpl_->host_memory_deallocate(device, ptr, size)); @@ -376,8 +409,8 @@ class CustomDevice : public DeviceInterface { const auto device = &devices_pool[dev_id]; if (!pimpl_->unified_memory_allocate) { - PADDLE_THROW(platform::errors::Unavailable( - "MemoryAllocKind::Unified is not supported on %s.", Type())); + PADDLE_THROW(phi::errors::Unavailable( + "MemoryAllocateUnified is not supported on %s.", Type())); } else { PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( pimpl_->unified_memory_allocate(device, &ptr, size)); @@ -389,15 +422,17 @@ class CustomDevice : public DeviceInterface { const auto device = &devices_pool[dev_id]; if (!pimpl_->unified_memory_deallocate) { - PADDLE_THROW(platform::errors::Unavailable( - "MemoryAllocKind::Host is not supported on %s.", Type())); + PADDLE_THROW(phi::errors::Unavailable( + "MemoryDeallocateUnified is not supported on %s.", Type())); } else { PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( pimpl_->unified_memory_deallocate(device, ptr, size)); } } - void MemorySet(size_t dev_id, void* ptr, uint8_t value, + void MemorySet(size_t dev_id, + void* ptr, + uint8_t value, size_t size) override { const auto device = &devices_pool[dev_id]; @@ -532,10 +567,12 @@ class CustomDevice : public DeviceInterface { inline int PlaceToId(const Place& place) { int dev_id = PlaceToIdNoCheck(place); - PADDLE_ENFORCE_NE(devices_pool.find(dev_id), devices_pool.end(), - platform::errors::NotFound( + PADDLE_ENFORCE_NE(devices_pool.find(dev_id), + devices_pool.end(), + phi::errors::NotFound( "Cannot found %s %d, please check visible devices", - Type(), dev_id)); + Type(), + dev_id)); return dev_id; } @@ -623,11 +660,14 @@ typedef bool (*RegisterDevicePluginFn)(CustomRuntimeParams* runtime_params); void LoadCustomRuntimeLib(const CustomRuntimeParams& runtime_params, std::unique_ptr device_interface, - const std::string& dso_lib_path, void* dso_handle) { + const std::string& dso_lib_path, + void* dso_handle) { if (ValidCustomCustomRuntimeParams(&runtime_params)) { - auto device = - std::make_unique(runtime_params.device_type, 255, true, - std::move(device_interface), dso_handle); + auto device = std::make_unique(runtime_params.device_type, + 255, + true, + std::move(device_interface), + dso_handle); if (false == DeviceManager::Register(std::move(device))) { LOG(WARNING) << "Skipped lib [" << dso_lib_path << "]. Register failed!!! 
there may be a " @@ -665,10 +705,9 @@ void LoadCustomRuntimeLib(const std::string& dso_lib_path, void* dso_handle) { "compatibility between PaddlePaddle and Custom Runtime."; return; } - LoadCustomRuntimeLib(runtime_params, std::move(device_interface), - dso_lib_path, dso_handle); + LoadCustomRuntimeLib( + runtime_params, std::move(device_interface), dso_lib_path, dso_handle); LOG(INFO) << "Successed in loading custom runtime in lib: " << dso_lib_path; } -} // namespace platform -} // namespace paddle +} // namespace phi diff --git a/paddle/fluid/platform/device/custom/custom_device_test.cc b/paddle/phi/backends/custom/custom_device_test.cc similarity index 86% rename from paddle/fluid/platform/device/custom/custom_device_test.cc rename to paddle/phi/backends/custom/custom_device_test.cc index e42fbbb9448..53b88f9b4ac 100644 --- a/paddle/fluid/platform/device/custom/custom_device_test.cc +++ b/paddle/phi/backends/custom/custom_device_test.cc @@ -17,9 +17,9 @@ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/platform/device/custom/fake_cpu_device.h" -#include "paddle/fluid/platform/device/device_manager.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/backends/custom/fake_cpu_device.h" +#include "paddle/phi/backends/device_manager.h" void RegisterDevice() { CustomRuntimeParams runtime_params; @@ -30,23 +30,22 @@ void RegisterDevice() { runtime_params.interface->size = sizeof(C_DeviceInterface); InitFakeCPUDevice(&runtime_params); - paddle::platform::LoadCustomRuntimeLib( + phi::LoadCustomRuntimeLib( runtime_params, std::move(device_interface), "", nullptr); } void InitDevice() { RegisterDevice(); - EXPECT_GT(static_cast( - paddle::platform::DeviceManager::GetAllDeviceTypes().size()), + EXPECT_GT(static_cast(phi::DeviceManager::GetAllDeviceTypes().size()), 0); auto place = paddle::platform::CustomPlace(DEVICE_TYPE, 0); - auto device = paddle::platform::DeviceManager::GetDeviceWithPlace(place); + auto device = phi::DeviceManager::GetDeviceWithPlace(place); EXPECT_NE(device, nullptr); std::vector places; - auto device_types = paddle::platform::DeviceManager::GetAllDeviceTypes(); + auto device_types = phi::DeviceManager::GetAllDeviceTypes(); for (auto dev_type : device_types) { - auto devices = paddle::platform::DeviceManager::GetDeviceList(dev_type); + auto devices = phi::DeviceManager::GetDeviceList(dev_type); for (auto dev_id : devices) { places.push_back( paddle::platform::PlaceHelper::CreatePlace(dev_type, dev_id)); @@ -60,14 +59,14 @@ void InitDevice() { void TestDeviceInterface(const paddle::platform::Place& place) { std::cout << "TestDeviceInterface on " << place << std::endl; if (paddle::platform::is_custom_place(place)) { - auto device = paddle::platform::DeviceManager::GetDeviceWithPlace(place); + auto device = phi::DeviceManager::GetDeviceWithPlace(place); auto dev_type = paddle::platform::PlaceHelper::GetDeviceType(place); - auto p1 = device->MemoryAllocate( - paddle::platform::DeviceManager::GetMinChunkSize(place)); + auto p1 = + device->MemoryAllocate(phi::DeviceManager::GetMinChunkSize(place)); EXPECT_NE(p1, nullptr); - paddle::platform::DeviceManager::SetDevice(place); - auto dev_id = paddle::platform::DeviceManager::GetDevice(dev_type); + phi::DeviceManager::SetDevice(place); + auto dev_id = phi::DeviceManager::GetDevice(dev_type); EXPECT_EQ(dev_id, place.GetDeviceId()); } } @@ -168,11 +167,10 @@ void TestTensorUtils(const paddle::platform::Place& place) { TEST(CustomDevice, Tensor) { 
InitDevice(); - auto dev_types = paddle::platform::DeviceManager::GetAllDeviceTypes(); + auto dev_types = phi::DeviceManager::GetAllDeviceTypes(); for (const auto& dev_type : dev_types) { std::cout << "Test on " << dev_type << std::endl; - EXPECT_GT(static_cast( - paddle::platform::DeviceManager::GetDeviceCount(dev_type)), + EXPECT_GT(static_cast(phi::DeviceManager::GetDeviceCount(dev_type)), 0); auto place = paddle::platform::PlaceHelper::CreatePlace(dev_type); diff --git a/paddle/fluid/platform/device/custom/fake_cpu_device.h b/paddle/phi/backends/custom/fake_cpu_device.h similarity index 90% rename from paddle/fluid/platform/device/custom/fake_cpu_device.h rename to paddle/phi/backends/custom/fake_cpu_device.h index c6d8ade4b08..22c344a0e04 100644 --- a/paddle/fluid/platform/device/custom/fake_cpu_device.h +++ b/paddle/phi/backends/custom/fake_cpu_device.h @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#include "paddle/fluid/platform/device/device_ext.h" +#include "paddle/phi/backends/device_ext.h" constexpr size_t global_total_memory = 1024 * 1024UL; static size_t global_free_memory = global_total_memory; @@ -43,14 +43,19 @@ C_Status GetDevicesList(size_t *device) { return C_SUCCESS; } -C_Status MemCpy(const C_Device device, void *dst, const void *src, +C_Status MemCpy(const C_Device device, + void *dst, + const void *src, size_t size) { memcpy(dst, src, size); return C_SUCCESS; } -C_Status AsyncMemCpy(const C_Device device, C_Stream stream, void *dst, - const void *src, size_t size) { +C_Status AsyncMemCpy(const C_Device device, + C_Stream stream, + void *dst, + const void *src, + size_t size) { memcpy(dst, src, size); return C_SUCCESS; } @@ -100,14 +105,16 @@ C_Status SyncStream(const C_Device device, C_Stream stream) { C_Status SyncEvent(const C_Device device, C_Event event) { return C_SUCCESS; } -C_Status StreamWaitEvent(const C_Device device, C_Stream stream, +C_Status StreamWaitEvent(const C_Device device, + C_Stream stream, C_Event event) { return C_SUCCESS; } C_Status VisibleDevices(size_t *devices) { return C_SUCCESS; } -C_Status DeviceMemStats(const C_Device device, size_t *total_memory, +C_Status DeviceMemStats(const C_Device device, + size_t *total_memory, size_t *free_memory) { *total_memory = global_total_memory; *free_memory = global_free_memory; @@ -139,7 +146,8 @@ void InitFakeCPUDevice(CustomRuntimeParams *params) { params->version.minor = PADDLE_CUSTOM_RUNTIME_MINOR_VERSION; params->version.patch = PADDLE_CUSTOM_RUNTIME_PATCH_VERSION; - memset(reinterpret_cast(params->interface), 0, + memset(reinterpret_cast(params->interface), + 0, sizeof(C_DeviceInterface)); params->interface->initialize = Init; diff --git a/paddle/fluid/platform/device/device_base.cc b/paddle/phi/backends/device_base.cc similarity index 68% rename from paddle/fluid/platform/device/device_base.cc rename to paddle/phi/backends/device_base.cc index 6234c961268..6f634c58af0 100644 --- a/paddle/fluid/platform/device/device_base.cc +++ b/paddle/phi/backends/device_base.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
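// ---------------------------------------------------------------------------
// [Editorial note -- illustrative sketch, not part of this patch] A plugin
// entry point in the style of fake_cpu_device.h above: report the runtime
// version, zero the function table, and wire only hooks whose signatures
// appear in the device_ext.h hunks of this patch. The memory numbers are
// placeholders; unset hooks stay null and fall back to the DeviceInterface
// defaults (or an Unavailable error).
#include <cstring>
#include "paddle/phi/backends/device_ext.h"

static C_Status SketchMemCpyH2D(const C_Device device, void* dst,
                                const void* src, size_t size) {
  std::memcpy(dst, src, size);  // host-backed "device", like the fake CPU plugin
  return C_SUCCESS;
}

static C_Status SketchMemStats(const C_Device device, size_t* total_memory,
                               size_t* free_memory) {
  *total_memory = 1024 * 1024UL;  // placeholder capacity
  *free_memory = 1024 * 1024UL;
  return C_SUCCESS;
}

void InitSketchDevice(CustomRuntimeParams* params) {
  params->version.major = PADDLE_CUSTOM_RUNTIME_MAJOR_VERSION;
  params->version.minor = PADDLE_CUSTOM_RUNTIME_MINOR_VERSION;
  params->version.patch = PADDLE_CUSTOM_RUNTIME_PATCH_VERSION;
  // A real plugin also fills params->device_type; that field's declaration is
  // not shown in this listing, so it is left out of the sketch.
  std::memset(reinterpret_cast<void*>(params->interface), 0,
              sizeof(C_DeviceInterface));
  params->interface->memory_copy_h2d = SketchMemCpyH2D;     // sync H2D copy hook
  params->interface->device_memory_stats = SketchMemStats;  // total/free query hook
}
// ---------------------------------------------------------------------------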
-#include "paddle/fluid/platform/device/device_base.h" +#include "paddle/phi/backends/device_base.h" #include "gflags/gflags.h" DECLARE_double(fraction_of_gpu_memory_to_use); @@ -21,26 +21,25 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb); constexpr static float fraction_reserve_gpu_memory = 0.05f; -namespace paddle { -namespace platform { +namespace phi { -#define INTERFACE_UNIMPLEMENT \ - PADDLE_THROW(platform::errors::Unimplemented( \ +#define INTERFACE_UNIMPLEMENT \ + PADDLE_THROW(phi::errors::Unimplemented( \ "%s is not implemented on %s device.", __func__, Type())); // info size_t DeviceInterface::GetComputeCapability() { - VLOG(10) << Type() + " get compute capability " << 0; + VLOG(10) << Type() << " get compute capability " << 0; return 0; } size_t DeviceInterface::GetRuntimeVersion() { - VLOG(10) << Type() + " get runtime version " << 0; + VLOG(10) << Type() << " get runtime version " << 0; return 0; } size_t DeviceInterface::GetDriverVersion() { - VLOG(10) << Type() + " get driver version " << 0; + VLOG(10) << Type() << " get driver version " << 0; return 0; } @@ -62,7 +61,8 @@ void DeviceInterface::SetDevice(size_t dev_id) { INTERFACE_UNIMPLEMENT; } int DeviceInterface::GetDevice() { INTERFACE_UNIMPLEMENT; } // stream manage -void DeviceInterface::CreateStream(size_t dev_id, stream::Stream* stream, +void DeviceInterface::CreateStream(size_t dev_id, + stream::Stream* stream, const stream::Stream::Priority& priority, const stream::Stream::Flag& flag) { INTERFACE_UNIMPLEMENT; @@ -82,7 +82,8 @@ bool DeviceInterface::QueryStream(size_t dev_id, const stream::Stream* stream) { return true; } -void DeviceInterface::AddCallback(size_t dev_id, stream::Stream* stream, +void DeviceInterface::AddCallback(size_t dev_id, + stream::Stream* stream, stream::Stream::Callback* callback) { INTERFACE_UNIMPLEMENT; } @@ -94,7 +95,8 @@ void DeviceInterface::StreamWaitEvent(size_t dev_id, } // event manage -void DeviceInterface::CreateEvent(size_t dev_id, event::Event* event, +void DeviceInterface::CreateEvent(size_t dev_id, + event::Event* event, event::Event::Flag flags) { INTERFACE_UNIMPLEMENT; } @@ -103,7 +105,8 @@ void DeviceInterface::DestroyEvent(size_t dev_id, event::Event* event) { INTERFACE_UNIMPLEMENT; } -void DeviceInterface::RecordEvent(size_t dev_id, const event::Event* event, +void DeviceInterface::RecordEvent(size_t dev_id, + const event::Event* event, const stream::Stream* stream) { INTERFACE_UNIMPLEMENT; } @@ -119,23 +122,35 @@ bool DeviceInterface::QueryEvent(size_t dev_id, const event::Event* event) { } // memery manage -void DeviceInterface::MemoryCopyH2D(size_t dev_id, void* dst, const void* src, - size_t size, const stream::Stream* stream) { +void DeviceInterface::MemoryCopyH2D(size_t dev_id, + void* dst, + const void* src, + size_t size, + const stream::Stream* stream) { INTERFACE_UNIMPLEMENT; } -void DeviceInterface::MemoryCopyD2H(size_t dev_id, void* dst, const void* src, - size_t size, const stream::Stream* stream) { +void DeviceInterface::MemoryCopyD2H(size_t dev_id, + void* dst, + const void* src, + size_t size, + const stream::Stream* stream) { INTERFACE_UNIMPLEMENT; } -void DeviceInterface::MemoryCopyD2D(size_t dev_id, void* dst, const void* src, - size_t size, const stream::Stream* stream) { +void DeviceInterface::MemoryCopyD2D(size_t dev_id, + void* dst, + const void* src, + size_t size, + const stream::Stream* stream) { INTERFACE_UNIMPLEMENT; } -void DeviceInterface::MemoryCopyP2P(const Place& dst_place, void* dst, - size_t src_id, const void* src, size_t size, +void 
DeviceInterface::MemoryCopyP2P(const Place& dst_place, + void* dst, + size_t src_id, + const void* src, + size_t size, const stream::Stream* stream) { INTERFACE_UNIMPLEMENT; } @@ -154,7 +169,8 @@ void* DeviceInterface::MemoryAllocateHost(size_t dev_id, size_t size) { return nullptr; } -void DeviceInterface::MemoryDeallocateHost(size_t dev_id, void* ptr, +void DeviceInterface::MemoryDeallocateHost(size_t dev_id, + void* ptr, size_t size) { INTERFACE_UNIMPLEMENT; } @@ -164,12 +180,15 @@ void* DeviceInterface::MemoryAllocateUnified(size_t dev_id, size_t size) { return nullptr; } -void DeviceInterface::MemoryDeallocateUnified(size_t dev_id, void* ptr, +void DeviceInterface::MemoryDeallocateUnified(size_t dev_id, + void* ptr, size_t size) { INTERFACE_UNIMPLEMENT; } -void DeviceInterface::MemorySet(size_t dev_id, void* ptr, uint8_t value, +void DeviceInterface::MemorySet(size_t dev_id, + void* ptr, + uint8_t value, size_t size) { INTERFACE_UNIMPLEMENT; } @@ -184,8 +203,9 @@ size_t DeviceInterface::GetMinChunkSize(size_t dev_id) { size_t DeviceInterface::AllocSize(size_t dev_id, bool realloc) { size_t available_to_alloc = AvailableAllocSize(dev_id); - PADDLE_ENFORCE_GT(available_to_alloc, 0, - platform::errors::ResourceExhausted( + PADDLE_ENFORCE_GT(available_to_alloc, + 0, + phi::errors::ResourceExhausted( "Not enough available %s memory.", Type())); // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be // allocated by fraction @@ -194,8 +214,9 @@ size_t DeviceInterface::AllocSize(size_t dev_id, bool realloc) { size_t alloc_bytes = (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use); - PADDLE_ENFORCE_GE(available_to_alloc, alloc_bytes, - platform::errors::ResourceExhausted( + PADDLE_ENFORCE_GE(available_to_alloc, + alloc_bytes, + phi::errors::ResourceExhausted( "Not enough available %s memory.", Type())); return alloc_bytes; } @@ -217,33 +238,32 @@ size_t DeviceInterface::AvailableAllocSize(size_t dev_id) { size_t DeviceInterface::GetInitAllocSize(size_t dev_id) { size_t init_alloc_size = AllocSize(dev_id, false); - VLOG(10) << Type() + " init alloc size " << (init_alloc_size >> 20) << "M"; + VLOG(10) << Type() << " init alloc size " << (init_alloc_size >> 20) << "M"; return init_alloc_size; } size_t DeviceInterface::GetReallocSize(size_t dev_id) { size_t realloc_size = AllocSize(dev_id, true); - VLOG(10) << Type() + " realloc size " << (realloc_size >> 20) << "M"; + VLOG(10) << Type() << " realloc size " << (realloc_size >> 20) << "M"; return realloc_size; } size_t DeviceInterface::GetMaxAllocSize(size_t dev_id) { size_t max_alloc_size = std::max(GetInitAllocSize(dev_id), GetReallocSize(dev_id)); - VLOG(10) << Type() + " max alloc size " << (max_alloc_size >> 20) << "M"; + VLOG(10) << Type() << " max alloc size " << (max_alloc_size >> 20) << "M"; return max_alloc_size; } size_t DeviceInterface::GetMaxChunkSize(size_t dev_id) { size_t max_chunk_size = GetMaxAllocSize(dev_id); - VLOG(10) << Type() + " max chunk size " << (max_chunk_size >> 20) << "M"; + VLOG(10) << Type() << " max chunk size " << (max_chunk_size >> 20) << "M"; return max_chunk_size; } size_t DeviceInterface::GetExtraPaddingSize(size_t dev_id) { - VLOG(10) << Type() + " extra padding size " << 0; + VLOG(10) << Type() << " extra padding size " << 0; return 0; } -} // namespace platform -} // namespace paddle +} // namespace phi diff --git a/paddle/fluid/platform/device/device_base.h b/paddle/phi/backends/device_base.h similarity index 80% rename from 
paddle/fluid/platform/device/device_base.h rename to paddle/phi/backends/device_base.h index d70b02be80e..b4964708dfb 100644 --- a/paddle/fluid/platform/device/device_base.h +++ b/paddle/phi/backends/device_base.h @@ -14,11 +14,10 @@ #pragma once #ifdef PADDLE_WITH_CUSTOM_DEVICE -#include "paddle/fluid/platform/device/event.h" -#include "paddle/fluid/platform/device/stream.h" +#include "paddle/phi/backends/event.h" +#include "paddle/phi/backends/stream.h" -namespace paddle { -namespace platform { +namespace phi { class DeviceInterface { // Driver / Runtime public: @@ -66,7 +65,8 @@ class DeviceInterface { // Driver / Runtime // Stream // ! Create an asynchronous stream virtual void CreateStream( - size_t dev_id, stream::Stream* stream, + size_t dev_id, + stream::Stream* stream, const stream::Stream::Priority& priority = stream::Stream::Priority::kNormal, const stream::Stream::Flag& flag = stream::Stream::Flag::kDefaultFlag); @@ -81,19 +81,22 @@ class DeviceInterface { // Driver / Runtime virtual bool QueryStream(size_t dev_id, const stream::Stream* stream); // ! Add a callback to a compute stream. - virtual void AddCallback(size_t dev_id, stream::Stream* stream, + virtual void AddCallback(size_t dev_id, + stream::Stream* stream, stream::Stream::Callback* callback); // Event // ! Create an event. - virtual void CreateEvent(size_t dev_id, event::Event* event, + virtual void CreateEvent(size_t dev_id, + event::Event* event, event::Event::Flag flags); // ! Destroy an event. virtual void DestroyEvent(size_t dev_id, event::Event* event); // ! Records an event. - virtual void RecordEvent(size_t dev_id, const event::Event* event, + virtual void RecordEvent(size_t dev_id, + const event::Event* event, const stream::Stream* stream); // ! Waits for event to complete. @@ -102,24 +105,34 @@ class DeviceInterface { // Driver / Runtime virtual bool QueryEvent(size_t dev_id, const event::Event* event); // ! 
Make a compute stream wait on an event - virtual void StreamWaitEvent(size_t dev_id, const stream::Stream* stream, + virtual void StreamWaitEvent(size_t dev_id, + const stream::Stream* stream, const event::Event* event); // Memory - virtual void MemoryCopyH2D(size_t dev_id, void* dst, const void* src, + virtual void MemoryCopyH2D(size_t dev_id, + void* dst, + const void* src, size_t size, const stream::Stream* stream = nullptr); - virtual void MemoryCopyD2H(size_t dev_id, void* dst, const void* src, + virtual void MemoryCopyD2H(size_t dev_id, + void* dst, + const void* src, size_t size, const stream::Stream* stream = nullptr); - virtual void MemoryCopyD2D(size_t dev_id, void* dst, const void* src, + virtual void MemoryCopyD2D(size_t dev_id, + void* dst, + const void* src, size_t size, const stream::Stream* stream = nullptr); - virtual void MemoryCopyP2P(const Place& dst_place, void* dst, size_t src_id, - const void* src, size_t size, + virtual void MemoryCopyP2P(const Place& dst_place, + void* dst, + size_t src_id, + const void* src, + size_t size, const stream::Stream* stream = nullptr); virtual void* MemoryAllocate(size_t dev_id, size_t size); @@ -160,7 +173,6 @@ class DeviceInterface { // Driver / Runtime size_t AvailableAllocSize(size_t dev_id); }; -} // namespace platform -} // namespace paddle +} // namespace phi #endif diff --git a/paddle/fluid/platform/device/device_ext.h b/paddle/phi/backends/device_ext.h similarity index 78% rename from paddle/fluid/platform/device/device_ext.h rename to paddle/phi/backends/device_ext.h index d1e1340f74b..bbd4966b727 100644 --- a/paddle/fluid/platform/device/device_ext.h +++ b/paddle/phi/backends/device_ext.h @@ -40,7 +40,9 @@ typedef struct C_Stream_st* C_Stream; typedef struct C_Event_st* C_Event; -typedef void (*C_Callback)(C_Device device, C_Stream stream, void* user_data, +typedef void (*C_Callback)(C_Device device, + C_Stream stream, + void* user_data, C_Status* status); struct C_DeviceInterface { @@ -124,8 +126,10 @@ struct C_DeviceInterface { * @param[C_Callback] callback * @param[void*] user_data */ - C_Status (*stream_add_callback)(const C_Device device, C_Stream stream, - C_Callback callback, void* user_data); + C_Status (*stream_add_callback)(const C_Device device, + C_Stream stream, + C_Callback callback, + void* user_data); /** * @brief Create an event @@ -142,7 +146,8 @@ struct C_DeviceInterface { * @param[C_Stream] stream * @param[C_Event] event */ - C_Status (*record_event)(const C_Device device, C_Stream stream, + C_Status (*record_event)(const C_Device device, + C_Stream stream, C_Event event); /** @@ -191,7 +196,8 @@ struct C_DeviceInterface { * @param[C_Stream] stream * @param[C_Event] event */ - C_Status (*stream_wait_event)(const C_Device device, C_Stream stream, + C_Status (*stream_wait_event)(const C_Device device, + C_Stream stream, C_Event event); void* reserved_dev_api[8]; @@ -207,7 +213,8 @@ struct C_DeviceInterface { * @param[void**] ptr Plugin allocate an address and fill it * @param[size_t] size */ - C_Status (*device_memory_allocate)(const C_Device device, void** ptr, + C_Status (*device_memory_allocate)(const C_Device device, + void** ptr, size_t size); /** @@ -217,7 +224,8 @@ struct C_DeviceInterface { * @param[void*] ptr * @param[size_t] size */ - C_Status (*device_memory_deallocate)(const C_Device device, void* ptr, + C_Status (*device_memory_deallocate)(const C_Device device, + void* ptr, size_t size); /** @@ -228,8 +236,10 @@ struct C_DeviceInterface { * @param[unsigned char] value * @param[size_t] size */ - 
C_Status (*device_memory_set)(const C_Device device, void* ptr, - unsigned char value, size_t size); + C_Status (*device_memory_set)(const C_Device device, + void* ptr, + unsigned char value, + size_t size); /** * @brief Host memory allocate @@ -238,7 +248,8 @@ struct C_DeviceInterface { * @param[void**] ptr Plugin allocate an address and fill it * @param[size_t] size */ - C_Status (*host_memory_allocate)(const C_Device device, void** ptr, + C_Status (*host_memory_allocate)(const C_Device device, + void** ptr, size_t size); /** @@ -248,7 +259,8 @@ struct C_DeviceInterface { * @param[void*] ptr * @param[size_t] size */ - C_Status (*host_memory_deallocate)(const C_Device device, void* ptr, + C_Status (*host_memory_deallocate)(const C_Device device, + void* ptr, size_t size); /** @@ -258,7 +270,8 @@ struct C_DeviceInterface { * @param[void**] ptr Plugin allocate an address and fill it * @param[size_t] size */ - C_Status (*unified_memory_allocate)(const C_Device device, void** ptr, + C_Status (*unified_memory_allocate)(const C_Device device, + void** ptr, size_t size); /** @@ -268,7 +281,8 @@ struct C_DeviceInterface { * @param[void*] ptr * @param[size_t] size */ - C_Status (*unified_memory_deallocate)(const C_Device device, void* ptr, + C_Status (*unified_memory_deallocate)(const C_Device device, + void* ptr, size_t size); /** @@ -279,7 +293,9 @@ struct C_DeviceInterface { * @param[void*] src * @param[size_t] size */ - C_Status (*memory_copy_h2d)(const C_Device device, void* dst, const void* src, + C_Status (*memory_copy_h2d)(const C_Device device, + void* dst, + const void* src, size_t size); /** @@ -290,7 +306,9 @@ struct C_DeviceInterface { * @param[void*] src * @param[size_t] size */ - C_Status (*memory_copy_d2h)(const C_Device device, void* dst, const void* src, + C_Status (*memory_copy_d2h)(const C_Device device, + void* dst, + const void* src, size_t size); /** @@ -301,7 +319,9 @@ struct C_DeviceInterface { * @param[void*] src * @param[size_t] size */ - C_Status (*memory_copy_d2d)(const C_Device device, void* dst, const void* src, + C_Status (*memory_copy_d2d)(const C_Device device, + void* dst, + const void* src, size_t size); /** @@ -314,8 +334,10 @@ struct C_DeviceInterface { * @param[size_t] size */ C_Status (*memory_copy_p2p)(const C_Device dst_device, - const C_Device src_device, void* dst, - const void* src, size_t size); + const C_Device src_device, + void* dst, + const void* src, + size_t size); /** * @brief Asynchonrize memory copy from host to device @@ -326,8 +348,11 @@ struct C_DeviceInterface { * @param[void*] src * @param[size_t] size */ - C_Status (*async_memory_copy_h2d)(const C_Device device, C_Stream stream, - void* dst, const void* src, size_t size); + C_Status (*async_memory_copy_h2d)(const C_Device device, + C_Stream stream, + void* dst, + const void* src, + size_t size); /** * @brief Asynchonrize memory copy from device to host @@ -338,8 +363,11 @@ struct C_DeviceInterface { * @param[void*] src * @param[size_t] size */ - C_Status (*async_memory_copy_d2h)(const C_Device device, C_Stream stream, - void* dst, const void* src, size_t size); + C_Status (*async_memory_copy_d2h)(const C_Device device, + C_Stream stream, + void* dst, + const void* src, + size_t size); /** * @brief Asynchonrize memory copy from device to device @@ -350,8 +378,11 @@ struct C_DeviceInterface { * @param[void*] src * @param[size_t] size */ - C_Status (*async_memory_copy_d2d)(const C_Device device, C_Stream stream, - void* dst, const void* src, size_t size); + C_Status 
(*async_memory_copy_d2d)(const C_Device device, + C_Stream stream, + void* dst, + const void* src, + size_t size); /** * @brief Peer asynchonrize memory copy from host to device @@ -363,8 +394,11 @@ struct C_DeviceInterface { * @param[size_t] size */ C_Status (*async_memory_copy_p2p)(const C_Device dst_device, - const C_Device src_device, C_Stream stream, - void* dst, const void* src, size_t size); + const C_Device src_device, + C_Stream stream, + void* dst, + const void* src, + size_t size); void* reserved_mem_api[8]; @@ -394,7 +428,8 @@ struct C_DeviceInterface { * @param[size_t*] free_memory * @param[size_t*] used_memory */ - C_Status (*device_memory_stats)(const C_Device device, size_t* total_memory, + C_Status (*device_memory_stats)(const C_Device device, + size_t* total_memory, size_t* free_memory); /** diff --git a/paddle/fluid/platform/device/device_guard.cc b/paddle/phi/backends/device_guard.cc similarity index 83% rename from paddle/fluid/platform/device/device_guard.cc rename to paddle/phi/backends/device_guard.cc index 55d8b9dc6a9..03eaac1fb1a 100644 --- a/paddle/fluid/platform/device/device_guard.cc +++ b/paddle/phi/backends/device_guard.cc @@ -12,11 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/platform/device/device_guard.h" +#include "paddle/phi/backends/device_guard.h" -namespace paddle { -namespace platform { +namespace phi { // Even this source file does not contains any code, it is better to keep this // source file for cmake dependency. -} // namespace platform -} // namespace paddle +} // namespace phi diff --git a/paddle/fluid/platform/device/device_guard.h b/paddle/phi/backends/device_guard.h similarity index 82% rename from paddle/fluid/platform/device/device_guard.h rename to paddle/phi/backends/device_guard.h index 638e9c984b4..eb14236d251 100644 --- a/paddle/fluid/platform/device/device_guard.h +++ b/paddle/phi/backends/device_guard.h @@ -13,17 +13,16 @@ // limitations under the License. #pragma once -#include "paddle/fluid/platform/device/device_manager.h" +#include "paddle/phi/backends/device_manager.h" -namespace paddle { -namespace platform { +namespace phi { class DeviceGuard { public: explicit inline DeviceGuard(const Place& place) - : dev_type_(PlaceHelper::GetDeviceType(place)) { + : dev_type_(place.GetDeviceType()) { prev_id = DeviceManager::GetDevice(dev_type_); - cur_id = PlaceHelper::GetDeviceId(place); + cur_id = place.GetDeviceId(); if (cur_id != prev_id) { DeviceManager::SetDevice(dev_type_, cur_id); @@ -44,5 +43,4 @@ class DeviceGuard { std::string dev_type_; }; -} // namespace platform -} // namespace paddle +} // namespace phi diff --git a/paddle/fluid/platform/device/device_manager.cc b/paddle/phi/backends/device_manager.cc similarity index 83% rename from paddle/fluid/platform/device/device_manager.cc rename to paddle/phi/backends/device_manager.cc index e0db97adde8..1ffe38d8e1f 100644 --- a/paddle/fluid/platform/device/device_manager.cc +++ b/paddle/phi/backends/device_manager.cc @@ -13,7 +13,7 @@ // limitations under the License. 
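// ---------------------------------------------------------------------------
// [Editorial note -- illustrative sketch, not part of this patch] The RAII
// pattern the DeviceGuard hunk above provides: pin the calling thread to a
// place, query and allocate through phi::DeviceManager, and restore the
// previous device id on scope exit. This mirrors the allocator call sites
// earlier in the patch; the place and size arguments are placeholders.
#include <cstddef>
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/backends/device_guard.h"
#include "paddle/phi/backends/device_manager.h"

void* SketchAllocate(const paddle::platform::CustomPlace& place, size_t size) {
  phi::DeviceGuard guard(place);  // SetDevice(place) now, restore prev_id in ~DeviceGuard
  size_t total = 0, avail = 0;
  phi::DeviceManager::MemoryStats(place, &total, &avail);
  if (avail < size) {
    return nullptr;  // the real allocators raise ResourceExhausted here instead
  }
  return phi::DeviceManager::GetDeviceWithPlace(place)->MemoryAllocate(size);
}
// ---------------------------------------------------------------------------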
#ifdef PADDLE_WITH_CUSTOM_DEVICE -#include "paddle/fluid/platform/device/device_manager.h" +#include "paddle/phi/backends/device_manager.h" #if !defined(_WIN32) #include @@ -24,8 +24,7 @@ #include #include -namespace paddle { -namespace platform { +namespace phi { void Device::CreateStream(stream::Stream* stream, const stream::Stream::Priority& priority, @@ -76,23 +75,32 @@ void Device::StreamWaitEvent(const stream::Stream* stream, impl_->StreamWaitEvent(dev_id_, stream, event); } -void Device::MemoryCopyH2D(void* dst, const void* src, size_t size, +void Device::MemoryCopyH2D(void* dst, + const void* src, + size_t size, const stream::Stream* stream) { impl_->MemoryCopyH2D(dev_id_, dst, src, size, stream); } -void Device::MemoryCopyD2H(void* dst, const void* src, size_t size, +void Device::MemoryCopyD2H(void* dst, + const void* src, + size_t size, const stream::Stream* stream) { impl_->MemoryCopyD2H(dev_id_, dst, src, size, stream); } -void Device::MemoryCopyD2D(void* dst, const void* src, size_t size, +void Device::MemoryCopyD2D(void* dst, + const void* src, + size_t size, const stream::Stream* stream) { impl_->MemoryCopyD2D(dev_id_, dst, src, size, stream); } -void Device::MemoryCopyP2P(const Place& dst_place, void* dst, const void* src, - size_t size, const stream::Stream* stream) { +void Device::MemoryCopyP2P(const Place& dst_place, + void* dst, + const void* src, + size_t size, + const stream::Stream* stream) { impl_->MemoryCopyP2P(dst_place, dst, dev_id_, src, size, stream); } @@ -173,7 +181,7 @@ DeviceInterface* DeviceManager::GetDeviceInterfaceWithType( } else { LOG(ERROR) << "GetDeviceInterfaceWithType - " << device_type << " Failed\n"; PADDLE_THROW( - platform::errors::Fatal("Unregistered device type %s.", device_type)); + phi::errors::Fatal("Unregistered device type %s.", device_type)); return nullptr; } } @@ -182,17 +190,21 @@ Device* DeviceManager::GetDeviceWithPlace(const Place& place) { phi::AutoRDLock lock(&_global_device_manager_rw_lock); auto& dev_map = Instance().device_map_; - auto dev_type = PlaceHelper::GetDeviceType(place); - auto dev_id = PlaceHelper::GetDeviceId(place); - PADDLE_ENFORCE_NE(dev_map.find(dev_type), dev_map.end(), - platform::errors::NotFound( - "Unable to find Device with type %s.", dev_type)); + auto dev_type = place.GetDeviceType(); + auto dev_id = place.GetDeviceId(); + PADDLE_ENFORCE_NE( + dev_map.find(dev_type), + dev_map.end(), + phi::errors::NotFound("Unable to find Device with type %s.", dev_type)); auto& dev_vec = dev_map[dev_type]; PADDLE_ENFORCE_LT( - dev_id, dev_vec.size(), - platform::errors::OutOfRange( + dev_id, + dev_vec.size(), + phi::errors::OutOfRange( "The visible devices count of type %s is %d, but dev_id is %d.", - dev_type, dev_vec.size(), dev_id)); + dev_type, + dev_vec.size(), + dev_id)); return dev_vec[dev_id].get(); } @@ -277,22 +289,22 @@ void DeviceManager::Finalize(const std::string& device_type) { } void DeviceManager::SynchronizeDevice(const Place& place) { - auto device_type = PlaceHelper::GetDeviceType(place); - auto device_id = PlaceHelper::GetDeviceId(place); + auto device_type = place.GetDeviceType(); + auto device_id = place.GetDeviceId(); auto dev_impl = GetDeviceInterfaceWithType(device_type); dev_impl->SynchronizeDevice(device_id); } void DeviceManager::InitDevice(const Place& place) { - auto device_type = PlaceHelper::GetDeviceType(place); - auto device_id = PlaceHelper::GetDeviceId(place); + auto device_type = place.GetDeviceType(); + auto device_id = place.GetDeviceId(); auto dev_impl = 
GetDeviceInterfaceWithType(device_type); dev_impl->InitDevice(device_id); } void DeviceManager::DeInitDevice(const Place& place) { - auto device_type = PlaceHelper::GetDeviceType(place); - auto device_id = PlaceHelper::GetDeviceId(place); + auto device_type = place.GetDeviceType(); + auto device_id = place.GetDeviceId(); auto dev_impl = GetDeviceInterfaceWithType(device_type); dev_impl->DeInitDevice(device_id); } @@ -304,8 +316,8 @@ void DeviceManager::SetDevice(const std::string& device_type, } void DeviceManager::SetDevice(const Place& place) { - auto device_type = PlaceHelper::GetDeviceType(place); - auto device_id = PlaceHelper::GetDeviceId(place); + auto device_type = place.GetDeviceType(); + auto device_id = place.GetDeviceId(); DeviceManager::SetDevice(device_type, device_id); } @@ -315,51 +327,52 @@ int DeviceManager::GetDevice(const std::string& device_type) { } size_t DeviceManager::GetMinChunkSize(const Place& place) { - auto device_type = PlaceHelper::GetDeviceType(place); - auto device_id = PlaceHelper::GetDeviceId(place); + auto device_type = place.GetDeviceType(); + auto device_id = place.GetDeviceId(); auto dev_impl = GetDeviceInterfaceWithType(device_type); return dev_impl->GetMinChunkSize(device_id); } size_t DeviceManager::GetMaxChunkSize(const Place& place) { - auto device_type = PlaceHelper::GetDeviceType(place); - auto device_id = PlaceHelper::GetDeviceId(place); + auto device_type = place.GetDeviceType(); + auto device_id = place.GetDeviceId(); auto dev_impl = GetDeviceInterfaceWithType(device_type); return dev_impl->GetMaxChunkSize(device_id); } size_t DeviceManager::GetMaxAllocSize(const Place& place) { - auto device_type = PlaceHelper::GetDeviceType(place); - auto device_id = PlaceHelper::GetDeviceId(place); + auto device_type = place.GetDeviceType(); + auto device_id = place.GetDeviceId(); auto dev_impl = GetDeviceInterfaceWithType(device_type); return dev_impl->GetMaxAllocSize(device_id); } size_t DeviceManager::GetInitAllocSize(const Place& place) { - auto device_type = PlaceHelper::GetDeviceType(place); - auto device_id = PlaceHelper::GetDeviceId(place); + auto device_type = place.GetDeviceType(); + auto device_id = place.GetDeviceId(); auto dev_impl = GetDeviceInterfaceWithType(device_type); return dev_impl->GetInitAllocSize(device_id); } size_t DeviceManager::GetReallocSize(const Place& place) { - auto device_type = PlaceHelper::GetDeviceType(place); - auto device_id = PlaceHelper::GetDeviceId(place); + auto device_type = place.GetDeviceType(); + auto device_id = place.GetDeviceId(); auto dev_impl = GetDeviceInterfaceWithType(device_type); return dev_impl->GetReallocSize(device_id); } size_t DeviceManager::GetExtraPaddingSize(const Place& place) { - auto device_type = PlaceHelper::GetDeviceType(place); - auto device_id = PlaceHelper::GetDeviceId(place); + auto device_type = place.GetDeviceType(); + auto device_id = place.GetDeviceId(); auto dev_impl = GetDeviceInterfaceWithType(device_type); return dev_impl->GetExtraPaddingSize(device_id); } -void DeviceManager::MemoryStats(const Place& place, size_t* total, +void DeviceManager::MemoryStats(const Place& place, + size_t* total, size_t* free) { - auto device_type = PlaceHelper::GetDeviceType(place); - auto device_id = PlaceHelper::GetDeviceId(place); + auto device_type = place.GetDeviceType(); + auto device_id = place.GetDeviceId(); auto dev_impl = GetDeviceInterfaceWithType(device_type); dev_impl->MemoryStats(device_id, total, free); } @@ -393,8 +406,8 @@ std::vector ListAllLibraries(const std::string& 
library_dir) { } else { while ((ptr = readdir(dir)) != nullptr) { std::string filename(ptr->d_name); - if (std::regex_match(filename.begin(), filename.end(), results, - express)) { + if (std::regex_match( + filename.begin(), filename.end(), results, express)) { libraries.push_back(library_dir + '/' + filename); VLOG(4) << "Found lib: " << libraries.back(); } @@ -405,6 +418,5 @@ std::vector ListAllLibraries(const std::string& library_dir) { return libraries; } -} // namespace platform -} // namespace paddle +} // namespace phi #endif diff --git a/paddle/fluid/platform/device/device_manager.h b/paddle/phi/backends/device_manager.h similarity index 83% rename from paddle/fluid/platform/device/device_manager.h rename to paddle/phi/backends/device_manager.h index d3aaafcddf7..c0911a0f8d5 100644 --- a/paddle/fluid/platform/device/device_manager.h +++ b/paddle/phi/backends/device_manager.h @@ -15,17 +15,16 @@ #pragma once #ifdef PADDLE_WITH_CUSTOM_DEVICE -#include "paddle/fluid/platform/device/device_base.h" -#include "paddle/fluid/platform/device/device_ext.h" -#include "paddle/fluid/platform/device/event.h" -#include "paddle/fluid/platform/device/stream.h" -#include "paddle/fluid/platform/place.h" +#include "paddle/phi/backends/device_base.h" +#include "paddle/phi/backends/device_ext.h" +#include "paddle/phi/backends/event.h" +#include "paddle/phi/backends/stream.h" +#include "paddle/phi/common/place.h" #include "paddle/phi/backends/dynload/port.h" #include "paddle/phi/core/utils/rw_lock.h" -namespace paddle { -namespace platform { +namespace phi { class Device final { public: Device(size_t dev_id, DeviceInterface* impl) : dev_id_(dev_id), impl_(impl) {} @@ -33,8 +32,9 @@ class Device final { // Stream // ! Create an asynchronous stream void CreateStream( - stream::Stream* stream, const stream::Stream::Priority& priority = - stream::Stream::Priority::kNormal, + stream::Stream* stream, + const stream::Stream::Priority& priority = + stream::Stream::Priority::kNormal, const stream::Stream::Flag& flag = stream::Stream::Flag::kDefaultFlag); // ! Destroys an asynchronous stream. 
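// A usage sketch for the stream half of the Device API declared above
// (illustration only, not part of the patch; assumes `place` is a
// phi::CustomPlace for a registered plugin device and that stream::Stream is
// default-constructible, as in the surrounding sources).
phi::Device* dev = phi::DeviceManager::GetDeviceWithPlace(place);
phi::stream::Stream stream;
dev->CreateStream(&stream);       // kNormal priority, kDefaultFlag by default
// ... enqueue asynchronous work on `stream` through the plugin runtime ...
dev->SynchronizeStream(&stream);  // block until the stream drains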
@@ -69,17 +69,26 @@ class Device final { void StreamWaitEvent(const stream::Stream* stream, const event::Event* event); // Memory - void MemoryCopyH2D(void* dst, const void* src, size_t size, + void MemoryCopyH2D(void* dst, + const void* src, + size_t size, const stream::Stream* stream = nullptr); - void MemoryCopyD2H(void* dst, const void* src, size_t size, + void MemoryCopyD2H(void* dst, + const void* src, + size_t size, const stream::Stream* stream = nullptr); - void MemoryCopyD2D(void* dst, const void* src, size_t size, + void MemoryCopyD2D(void* dst, + const void* src, + size_t size, const stream::Stream* stream = nullptr); - void MemoryCopyP2P(const Place& dst_place, void* dst, const void* src, - size_t size, const stream::Stream* stream = nullptr); + void MemoryCopyP2P(const Place& dst_place, + void* dst, + const void* src, + size_t size, + const stream::Stream* stream = nullptr); void* MemoryAllocate(size_t size); @@ -168,7 +177,8 @@ void LoadCustomRuntimeLib(const std::string& dso_lib_path, void* dso_handle); void LoadCustomRuntimeLib(const CustomRuntimeParams& runtime_params, std::unique_ptr device_interface, - const std::string& dso_lib_path, void* dso_handle); + const std::string& dso_lib_path, + void* dso_handle); class Registrar { public: @@ -180,7 +190,6 @@ class Registrar { void Touch() {} }; -} // namespace platform -} // namespace paddle +} // namespace phi #endif diff --git a/paddle/fluid/platform/device/event.cc b/paddle/phi/backends/event.cc similarity index 84% rename from paddle/fluid/platform/device/event.cc rename to paddle/phi/backends/event.cc index 6e6316ea16d..a474536f865 100644 --- a/paddle/fluid/platform/device/event.cc +++ b/paddle/phi/backends/event.cc @@ -12,13 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
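// A sketch of the host/device copy path declared above (illustration only,
// not part of the patch; "FakeDevice" is a hypothetical plugin device type
// and deallocation is omitted for brevity).
float host[1024] = {};
phi::CustomPlace place("FakeDevice", 0);
phi::Device* dev = phi::DeviceManager::GetDeviceWithPlace(place);
void* dev_ptr = dev->MemoryAllocate(sizeof(host));
dev->MemoryCopyH2D(dev_ptr, host, sizeof(host));  // stream defaults to nullptr
dev->MemoryCopyD2H(host, dev_ptr, sizeof(host));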
-#include "paddle/fluid/platform/device/event.h" -#include "paddle/fluid/platform/device/device_guard.h" +#include "paddle/phi/backends/event.h" #include "paddle/fluid/platform/device/device_wrapper.h" -#include "paddle/fluid/platform/device/stream.h" +#include "paddle/phi/backends/device_guard.h" +#include "paddle/phi/backends/stream.h" -namespace paddle { -namespace platform { +namespace phi { namespace event { event_t Event::raw_event() const { return event_; } @@ -27,7 +26,7 @@ void Event::set_event(event_t event) { event_ = event; } Event::Event(const Place& place, event_t event) : place_(place), - device_(platform::DeviceManager::GetDeviceWithPlace(place)), + device_(phi::DeviceManager::GetDeviceWithPlace(place)), event_(event), own_data_(false) {} @@ -60,5 +59,4 @@ void Event::Synchonrize() const { device_->SynchronizeEvent(this); } const Place& Event::GetPlace() const { return place_; } } // namespace event -} // namespace platform -} // namespace paddle +} // namespace phi diff --git a/paddle/fluid/platform/device/event.h b/paddle/phi/backends/event.h similarity index 94% rename from paddle/fluid/platform/device/event.h rename to paddle/phi/backends/event.h index 376d73eb666..f2e86343f8f 100644 --- a/paddle/fluid/platform/device/event.h +++ b/paddle/phi/backends/event.h @@ -15,8 +15,7 @@ #pragma once #include "paddle/fluid/platform/place.h" -namespace paddle { -namespace platform { +namespace phi { class Device; @@ -57,5 +56,4 @@ class Event { }; } // namespace event -} // namespace platform -} // namespace paddle +} // namespace phi diff --git a/paddle/fluid/platform/device/stream.cc b/paddle/phi/backends/stream.cc similarity index 84% rename from paddle/fluid/platform/device/stream.cc rename to paddle/phi/backends/stream.cc index 7f867e5ee77..30939f31fcc 100644 --- a/paddle/fluid/platform/device/stream.cc +++ b/paddle/phi/backends/stream.cc @@ -12,13 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/platform/device/stream.h" -#include "paddle/fluid/platform/device/device_guard.h" +#include "paddle/phi/backends/stream.h" #include "paddle/fluid/platform/device/device_wrapper.h" -#include "paddle/fluid/platform/device/event.h" +#include "paddle/phi/backends/device_guard.h" +#include "paddle/phi/backends/event.h" -namespace paddle { -namespace platform { +namespace phi { namespace stream { Stream::~Stream() { Destroy(); } @@ -30,15 +29,16 @@ void Stream::set_stream(stream_t stream) { stream_ = stream; } // For compatiable Stream::Stream(const Place& place, stream_t stream) : place_(place), - device_(platform::DeviceManager::GetDeviceWithPlace(place)), + device_(phi::DeviceManager::GetDeviceWithPlace(place)), stream_(stream), callback_manager_(new CallbackManager(this)), own_data_(false) {} -bool Stream::Init(const Place& place, const Priority& priority, +bool Stream::Init(const Place& place, + const Priority& priority, const Flag& flag) { place_ = place; - device_ = platform::DeviceManager::GetDeviceWithPlace(place); + device_ = phi::DeviceManager::GetDeviceWithPlace(place); DeviceGuard guard(place_); device_->CreateStream(this, priority, flag); @@ -92,5 +92,4 @@ void Stream::Synchronize() const { device_->SynchronizeStream(this); } const Place& Stream::GetPlace() const { return place_; } } // namespace stream -} // namespace platform -} // namespace paddle +} // namespace phi diff --git a/paddle/fluid/platform/device/stream.h b/paddle/phi/backends/stream.h similarity index 89% rename from paddle/fluid/platform/device/stream.h rename to paddle/phi/backends/stream.h index 25cf705ee09..6c26ab3c2d5 100644 --- a/paddle/fluid/platform/device/stream.h +++ b/paddle/phi/backends/stream.h @@ -14,11 +14,10 @@ #pragma once -#include "paddle/fluid/platform/device/callback_manager.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/backends/callback_manager.h" -namespace paddle { -namespace platform { +namespace phi { class Device; @@ -49,7 +48,8 @@ class Stream { ~Stream(); const stream_t& raw_stream() const; void set_stream(stream_t stream); - bool Init(const Place& place, const Priority& priority = Priority::kNormal, + bool Init(const Place& place, + const Priority& priority = Priority::kNormal, const Flag& flag = Flag::kDefaultFlag); template void AddCallback(Callback&& callback) const { @@ -75,5 +75,4 @@ class Stream { }; } // namespace stream -} // namespace platform -} // namespace paddle +} // namespace phi diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index 8ffacbb39bb..424c4ce2ebc 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -25,7 +25,7 @@ cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor) cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor phi_enforce ddim memcpy) cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows) -cc_library(phi_custom_kernel SRCS custom_kernel.cc DEPS kernel_factory convert_utils) +cc_library(phi_custom_kernel SRCS custom_kernel.cc DEPS kernel_factory convert_utils op_registry phi_tensor_raw) # Will remove once we implemented MKLDNN_Tensor if(WITH_MKLDNN) diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index b85db07bd9d..67245f1da5a 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/compat/op_utils.h" #ifdef PADDLE_WITH_CUSTOM_DEVICE -#include "paddle/fluid/platform/device/device_manager.h" +#include "paddle/phi/backends/device_manager.h" #endif namespace phi { @@ -83,9 +83,7 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { if (!device_type.empty()) { return phi::CustomPlace( device_type, - set_device_id - ? paddle::platform::DeviceManager::GetDevice(device_type) - : 0); + set_device_id ? phi::DeviceManager::GetDevice(device_type) : 0); } #endif PADDLE_THROW(phi::errors::Unimplemented( diff --git a/paddle/phi/core/custom_kernel.cc b/paddle/phi/core/custom_kernel.cc index 58f9e1c623e..a333874d03e 100644 --- a/paddle/phi/core/custom_kernel.cc +++ b/paddle/phi/core/custom_kernel.cc @@ -12,6 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#if defined _WIN32 || defined __APPLE__ +#else +#define _LINUX +#endif + #include "paddle/phi/core/custom_kernel.h" namespace phi { @@ -50,6 +55,25 @@ void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map) { } } +void LoadCustomKernelLib(const std::string& dso_lib_path, void* dso_handle) { +#ifdef _LINUX + typedef phi::CustomKernelMap& get_custom_kernel_map_t(); + auto* func = reinterpret_cast( + dlsym(dso_handle, "PD_GetCustomKernelMap")); + + if (func == nullptr) { + LOG(WARNING) << "Skipped lib [" << dso_lib_path << "]: fail to find " + << "PD_GetCustomKernelMap symbol in this lib."; + return; + } + auto& custom_kernel_map = func(); + phi::RegisterCustomKernels(custom_kernel_map); + LOG(INFO) << "Successed in loading custom kernels in lib: " << dso_lib_path; +#else + VLOG(3) << "Unsupported: Custom kernel is only implemented on Linux."; +#endif + return; +} } // namespace phi #ifdef __cplusplus diff --git a/paddle/phi/core/custom_kernel.h b/paddle/phi/core/custom_kernel.h index 20ae2b7bb73..ffd12b9dd03 100644 --- a/paddle/phi/core/custom_kernel.h +++ b/paddle/phi/core/custom_kernel.h @@ -46,4 +46,6 @@ class CustomKernelMap { */ void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map); +// Load custom kernel lib and register +void LoadCustomKernelLib(const std::string& dso_lib_path, void* dso_handle); } // namespace phi diff --git a/python/setup.py.in b/python/setup.py.in index ec1b1cbcb95..91580614fa9 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -579,8 +579,7 @@ headers = ( list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/core', recursive=True)) + # phi core headers list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/backends', recursive=True)) + # phi backends headers # utila api headers - list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/utils', recursive=True)) + # paddle utils headers - ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/device/device_ext.h']) + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/utils', recursive=True))) # paddle utils headers if '${WITH_MKLDNN}' == 'ON': headers += list(find_files('*', '${MKLDNN_INSTALL_DIR}/include')) # mkldnn @@ -625,8 +624,6 @@ class InstallHeaders(Command): elif 'third_party' not in header: # paddle headers install_dir = re.sub('@PADDLE_SOURCE_DIR@/', '', header) - if 'device_ext.h' in header: - install_dir = "paddle/" else: # third_party install_dir = re.sub('${THIRD_PARTY_PATH}', 'third_party', header) -- GitLab From 97ccaa796e3b401d87d6da8f27c6d22934640891 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Thu, 3 Mar 2022 14:28:27 +0800 Subject: [PATCH 025/261] [Eager][Yaml]Supported Scalar and ScalarArray for 
AutoCodeGen (#40080) --- .../final_state_generator/eager_gen.py | 4 +- .../final_state_generator/python_c_gen.py | 24 ++--- paddle/fluid/pybind/eager_utils.cc | 100 ++++++++++++++++-- paddle/fluid/pybind/eager_utils.h | 10 ++ 4 files changed, 111 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index af9540b6fb3..65dbb0368c6 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -31,7 +31,9 @@ yaml_types_mapping = { 'int64_t[]' : 'std::vector', 'int[]' : 'std::vector', 'Tensor' : 'Tensor', 'Tensor[]' : 'std::vector', - 'Tensor[Tensor[]]' : 'std::vector>' + 'Tensor[Tensor[]]' : 'std::vector>', + 'Scalar' : 'Scalar', + 'ScalarArray' : 'ScalarArray' } diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index 9329dc5ffc9..9c4e102ca45 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -14,7 +14,7 @@ import os import argparse -from eager_gen import ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap +from eager_gen import yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap atype_to_parsing_function = { "bool": "CastPyArg2Boolean", @@ -27,21 +27,9 @@ atype_to_parsing_function = { "long[]": "CastPyArg2Longs", "float[]": "CastPyArg2Floats", "double[]": "CastPyArg2Float64s", - "string[]": "CastPyArg2Strings" -} - -atype_to_cxx_type = { - "bool": "bool", - "int": "int", - "long": "long", - "float": "float", - "string": "std::string", - "bool[]": "std::vector", - "int[]": "std::vector", - "long[]": "std::vector", - "float[]": "std::vector", - "double[]": "std::vector", - "string[]": "std::vector" + "string[]": "CastPyArg2Strings", + "Scalar": "CastPyArg2Scalar", + "ScalarArray": "CastPyArg2ScalarArray" } @@ -56,10 +44,10 @@ def ParseArguments(): def GetCxxType(atype): - if atype not in atype_to_cxx_type.keys(): + if atype not in yaml_types_mapping.keys(): assert False - return atype_to_cxx_type[atype] + return yaml_types_mapping[atype] def FindParsingFunctionFromAttributeType(atype): diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 57f37621d3b..7647930ef07 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -587,14 +587,9 @@ paddle::optional GetOptionalTensorFromArgs( reinterpret_cast(obj)->tensor); } -// For Intermediate State Dygraph, -// we use an uninitialized Tensor to represent dispensable Tensor -paddle::experimental::Tensor& GetTensorFromArgs(const std::string& op_type, - const std::string& arg_name, - PyObject* args, ssize_t arg_idx, - bool dispensable) { - PyObject* obj = PyTuple_GET_ITEM(args, arg_idx); - +static paddle::experimental::Tensor& GetTensorFromPyObject( + const std::string& op_type, const std::string& arg_name, PyObject* obj, + ssize_t arg_idx, bool dispensable) { if (PyTuple_Check(obj)) { obj = PyTuple_GET_ITEM(obj, 0); } @@ -612,6 +607,16 @@ paddle::experimental::Tensor& GetTensorFromArgs(const std::string& op_type, return 
reinterpret_cast(obj)->tensor; } +// For Intermediate State Dygraph, +// we use an uninitialized Tensor to represent dispensable Tensor +paddle::experimental::Tensor& GetTensorFromArgs(const std::string& op_type, + const std::string& arg_name, + PyObject* args, ssize_t arg_idx, + bool dispensable) { + PyObject* obj = PyTuple_GET_ITEM(args, arg_idx); + return GetTensorFromPyObject(op_type, arg_name, obj, arg_idx, dispensable); +} + std::vector GetTensorListFromArgs( const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable) { @@ -746,5 +751,84 @@ std::vector GetTensorPtrListFromArgs( return result; } +paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj, + const std::string& op_type, + ssize_t arg_pos) { + if (obj == Py_None) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "bool, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } + + // obj could be: int, float, bool, paddle.Tensor + PyTypeObject* type = obj->ob_type; + auto type_name = std::string(type->tp_name); + if (type_name == "int") { + int value = CastPyArg2Int(obj, op_type, arg_pos); + return paddle::experimental::Scalar(value); + } else if (type_name == "float") { + float value = CastPyArg2Float(obj, op_type, arg_pos); + return paddle::experimental::Scalar(value); + + } else if (type_name == "bool") { + bool value = CastPyArg2Boolean(obj, op_type, arg_pos); + return paddle::experimental::Scalar(value); + + } else if (type_name == "paddle.Tensor") { + paddle::experimental::Tensor& value = GetTensorFromPyObject( + op_type, "" /*arg_name*/, obj, arg_pos, false /*dispensable*/); + return paddle::experimental::Scalar(value); + + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "bool, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } + + // Fake a Scalar + return paddle::experimental::Scalar(1.0); +} + +paddle::experimental::ScalarArray CastPyArg2ScalarArray( + PyObject* obj, const std::string& op_type, ssize_t arg_pos) { + // In case of ScalarArray, only two possible PyObjects: + // 1. list of int + // 2. 
Tensor + if (obj == Py_None) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "bool, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } + + // obj could be: int, float, bool, paddle.Tensor + PyTypeObject* type = obj->ob_type; + auto type_name = std::string(type->tp_name); + if (type_name == "list") { + std::vector value = CastPyArg2Ints(obj, op_type, arg_pos); + return paddle::experimental::ScalarArray(value); + + } else if (type_name == "paddle.Tensor") { + paddle::experimental::Tensor& value = GetTensorFromPyObject( + op_type, "" /*arg_name*/, obj, arg_pos, false /*dispensable*/); + return paddle::experimental::ScalarArray(value); + + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "bool, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } + + // Fake a ScalarArray + return paddle::experimental::ScalarArray({1}); +} + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 92afc3ae487..6e990691776 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -11,7 +11,10 @@ limitations under the License. */ #pragma once #include +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/dense_tensor.h" + #include "pybind11/pybind11.h" #include "pybind11/stl.h" namespace paddle { @@ -90,6 +93,13 @@ PyObject* ToPyObject(const std::tuple& out) { return result; } +paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj, + const std::string& op_type, + ssize_t arg_pos); + +paddle::experimental::ScalarArray CastPyArg2ScalarArray( + PyObject* obj, const std::string& op_type, ssize_t arg_pos); + paddle::optional GetOptionalTensorFromArgs( const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable = false); -- GitLab From 4c0511faa406cde9db59f1233f6791e0e7c4098d Mon Sep 17 00:00:00 2001 From: From00 Date: Thu, 3 Mar 2022 15:17:01 +0800 Subject: [PATCH 026/261] Support cuda graph in StreamSafeCudaAllocator (#39594) * Support cuda graph in StreamSafeCudaAllocator * Fix CI error * Arrange AllocatorFacade * Fix CI error * Fix CI error * Fix ROCM Compile error * Fix ROCM Compile error --- paddle/fluid/memory/allocation/CMakeLists.txt | 2 +- .../memory/allocation/allocator_facade.cc | 276 +++++------- .../memory/allocation/allocator_facade.h | 11 +- .../allocation/stream_safe_cuda_allocator.cc | 120 +++-- .../allocation/stream_safe_cuda_allocator.h | 28 +- paddle/fluid/memory/malloc.cc | 10 +- paddle/fluid/memory/malloc.h | 6 +- .../memory/stream_safe_cuda_alloc_test.cu | 409 +++++++++--------- 8 files changed, 436 insertions(+), 426 deletions(-) diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 6cd7d873323..a7a417c29a7 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -17,7 +17,7 @@ if (WITH_GPU) nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard) nv_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info) nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator) - nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator) + nv_library(stream_safe_cuda_allocator SRCS 
stream_safe_cuda_allocator.cc DEPS allocator cuda_graph) nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator) cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 6b7828236a8..61e292a922f 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -210,12 +210,7 @@ class AllocatorFacadePrivate { InitNaiveBestFitCPUAllocator(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) allow_free_idle_chunk_ = allow_free_idle_chunk; - if (FLAGS_use_stream_safe_cuda_allocator) { - for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); - ++dev_id) { - InitStreamSafeCUDAAllocator(platform::CUDAPlace(dev_id), nullptr); - } - } else { + if (!FLAGS_use_stream_safe_cuda_allocator) { for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), @@ -298,6 +293,12 @@ class AllocatorFacadePrivate { } CheckAllocThreadSafe(); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { + WrapCUDAGraphAllocator(); + } +#endif } inline const std::shared_ptr& GetAllocator( @@ -388,39 +389,6 @@ class AllocatorFacadePrivate { allocation.get())); return stream_safe_cuda_allocation->GetOwningStream(); } - -#ifdef PADDLE_WITH_CUDA - void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { - PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth, - platform::errors::InvalidArgument( - "CUDA Graph is only supported when the " - "FLAGS_allocator_strategy=\"auto_growth\", but got " - "FLAGS_allocator_strategy=\"%s\"", - FLAGS_allocator_strategy)); - auto& allocator = cuda_graph_allocator_map_[id]; - PADDLE_ENFORCE_EQ( - allocator.get(), nullptr, - platform::errors::InvalidArgument( - "The memory pool of the CUDA Graph with ID %d have been prepared.", - id)); - allocator.reset( - new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false)); - for (auto& item : allocator->allocators_) { - auto& old_allocator = item.second; - old_allocator = CUDAGraphAllocator::Create(old_allocator); - } - VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id; - } - - void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { - auto iter = cuda_graph_allocator_map_.find(id); - PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(), - platform::errors::InvalidArgument( - "Cannot find CUDA Graph with ID = %d", id)); - cuda_graph_allocator_map_.erase(iter); - VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id; - } -#endif #endif private: @@ -439,24 +407,7 @@ class AllocatorFacadePrivate { platform::Place place_; }; - const AllocatorMap& GetAllocatorMap() { -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { - auto id = platform::CUDAGraph::CapturingID(); - auto iter = cuda_graph_allocator_map_.find(id); - PADDLE_ENFORCE_NE( - iter, cuda_graph_allocator_map_.end(), - platform::errors::PermissionDenied( - "No memory pool is prepared for CUDA Graph capturing.")); - VLOG(10) << "Choose CUDA Graph memory pool to allocate memory"; - return iter->second->allocators_; - } else { - return allocators_; - } -#else - return allocators_; -#endif - } + const AllocatorMap& GetAllocatorMap() { return allocators_; } void InitNaiveBestFitCPUAllocator() { allocators_[platform::CPUPlace()] = @@ -672,10 +623,10 @@ class AllocatorFacadePrivate { } 
void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { - const std::shared_ptr& underlying_allocator = - cuda_allocators_[p][stream]; - cuda_allocators_[p][stream] = std::make_shared( - underlying_allocator, p, stream); + std::shared_ptr& allocator = cuda_allocators_[p][stream]; + allocator = std::make_shared( + allocator, p, stream, + /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_); } void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream, @@ -684,10 +635,19 @@ class AllocatorFacadePrivate { retry_time, 0, platform::errors::InvalidArgument( "Retry time should be larger than 0, but got %d", retry_time)); - std::shared_ptr allocator = cuda_allocators_[p][stream]; + std::shared_ptr& allocator = cuda_allocators_[p][stream]; allocator = std::make_shared(allocator, retry_time); } +#ifdef PADDLE_WITH_CUDA + void WrapCUDAGraphAllocator() { + for (auto& item : allocators_) { + auto& allocator = item.second; + allocator = CUDAGraphAllocator::Create(allocator); + } + } +#endif + static void CheckCUDAAllocThreadSafe(const CUDAAllocatorMap& allocators) { for (auto& place_pair : allocators) { for (auto& stream_pair : place_pair.second) { @@ -864,10 +824,6 @@ class AllocatorFacadePrivate { // a standalone CUDA allocator to support multi-stream GC in new executor CUDAAllocatorMap cuda_allocators_; std::shared_timed_mutex cuda_allocator_mutex_; -#ifdef PADDLE_WITH_CUDA - std::unordered_map> - cuda_graph_allocator_map_; -#endif #endif AllocatorStrategy strategy_; AllocatorMap allocators_; @@ -886,8 +842,24 @@ AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {} AllocatorFacade::~AllocatorFacade() {} AllocatorFacade& AllocatorFacade::Instance() { - static AllocatorFacade instance; - return instance; + static AllocatorFacade* instance = new AllocatorFacade; + return *instance; +} + +AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { + auto id = platform::CUDAGraph::CapturingID(); + auto iter = cuda_graph_map_.find(id); + PADDLE_ENFORCE_NE( + iter, cuda_graph_map_.end(), + platform::errors::PermissionDenied( + "No memory pool is prepared for CUDA Graph capturing.")); + VLOG(10) << "Choose CUDA Graph memory pool"; + return iter->second.get(); + } +#endif + return m_; } const std::shared_ptr& AllocatorFacade::GetAllocator( @@ -895,19 +867,14 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) { -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - return m_->GetAllocator(place, - /* A non-zero num to choose allocator_ */ 1); - } -#endif - + AllocatorFacadePrivate* m = GetPrivate(); platform::CUDAPlace cuda_place(place.GetDeviceId()); - return m_->GetAllocator(cuda_place, m_->GetDefaultStream(cuda_place)); + return m->GetAllocator(cuda_place, m->GetDefaultStream(cuda_place)); } #endif - return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); + return GetPrivate()->GetAllocator( + place, /* A non-zero num to choose allocator_ */ 1); } void* AllocatorFacade::GetBasePtr( @@ -922,7 +889,7 @@ void* AllocatorFacade::GetBasePtr( "GetBasePtr() is only implemented for CUDAPlace(), not " "suppot place: %s", allocation->place())); - return m_->GetBasePtr(allocation); + return GetPrivate()->GetBasePtr(allocation); } #if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -930,21 +897,17 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( const platform::Place& place, const gpuStream_t& stream) { if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) { -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - return m_->GetAllocator(place, - /* A non-zero num to choose allocator_ */ 1); - } -#endif - return m_->GetAllocator(place, stream, /*create_if_not_found=*/true); + return GetPrivate()->GetAllocator(place, stream, + /*create_if_not_found=*/true); } - return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); + return GetPrivate()->GetAllocator( + place, /* A non-zero num to choose allocator_ */ 1); } #endif const std::shared_ptr& AllocatorFacade::GetZeroAllocator( const platform::Place& place) { - return m_->GetAllocator(place, /* zero size */ 0); + return GetPrivate()->GetAllocator(place, /* zero size */ 0); } std::shared_ptr AllocatorFacade::AllocShared( @@ -957,43 +920,30 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && size > 0 && FLAGS_use_system_allocator == false) { -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - return m_->GetAllocator(place, size)->Allocate(size); - } -#endif - platform::CUDAPlace cuda_place(place.GetDeviceId()); - return Alloc(cuda_place, size, m_->GetDefaultStream(cuda_place)); + phi::Stream default_stream = phi::Stream(reinterpret_cast( + GetPrivate()->GetDefaultStream(cuda_place))); + return Alloc(cuda_place, size, default_stream); } #endif - - return m_->GetAllocator(place, size)->Allocate(size); + return GetPrivate()->GetAllocator(place, size)->Allocate(size); } uint64_t AllocatorFacade::Release(const platform::Place& place) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) { -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - return m_ - ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) - ->Release(place); - } -#endif - platform::CUDAPlace cuda_place(place.GetDeviceId()); - return Release(cuda_place, m_->GetDefaultStream(cuda_place)); + return Release(cuda_place, GetPrivate()->GetDefaultStream(cuda_place)); } #endif - return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) + return GetPrivate() + ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) ->Release(place); } std::shared_ptr AllocatorFacade::AllocShared( const platform::Place& place, size_t size, const phi::Stream& stream) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_EQ( FLAGS_use_stream_safe_cuda_allocator, true, platform::errors::Unimplemented( @@ -1001,71 +951,53 @@ std::shared_ptr AllocatorFacade::AllocShared( "multi-stream 'AllocaShared' function. 
To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); - -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); - } -#endif - gpuStream_t s = reinterpret_cast(stream.id()); - return std::shared_ptr(Alloc(place, size, s)); -#else - PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU.")); -#endif + return std::shared_ptr(Alloc(place, size, stream)); } -bool AllocatorFacade::InSameStream( - const std::shared_ptr& allocation, - const phi::Stream& stream) { +AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, + const phi::Stream& stream) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_EQ( FLAGS_use_stream_safe_cuda_allocator, true, platform::errors::Unimplemented( "StreamSafeCUDAAllocator is disabled, you should not call this " - "multi-stream 'InSameStream' function. To enable it, you can enter" + "multi-stream 'Alloc' function. To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + platform::CUDAPlace p(place.GetDeviceId()); + if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { + gpuStream_t s = reinterpret_cast(stream.id()); + return GetPrivate() + ->GetAllocator(p, s, /* create_if_not_found = */ true) + ->Allocate(size); + } else { + return GetPrivate()->GetAllocator(p, size)->Allocate(size); } -#endif - gpuStream_t s = reinterpret_cast(stream.id()); - return s == GetStream(allocation); #else PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU.")); #endif } +bool AllocatorFacade::InSameStream( + const std::shared_ptr& allocation, + const phi::Stream& stream) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, - const gpuStream_t& stream) { PADDLE_ENFORCE_EQ( FLAGS_use_stream_safe_cuda_allocator, true, platform::errors::Unimplemented( "StreamSafeCUDAAllocator is disabled, you should not call this " - "multi-stream 'Alloc' function. To enable it, you can enter" + "multi-stream 'InSameStream' function. To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); - -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); - } + gpuStream_t s = reinterpret_cast(stream.id()); + return s == GetStream(allocation); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU.")); #endif - platform::CUDAPlace p(place.GetDeviceId()); - if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { - return m_->GetAllocator(p, stream, /* create_if_not_found = */ true) - ->Allocate(size); - } else { - return m_->GetAllocator(p, size)->Allocate(size); - } } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, const gpuStream_t& stream) { PADDLE_ENFORCE_EQ( @@ -1075,15 +1007,7 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, "multi-stream 'Release' function. 
To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); - -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); - } -#endif - - return m_->GetAllocator(place, stream)->Release(place); + return GetPrivate()->GetAllocator(place, stream)->Release(place); } void AllocatorFacade::RecordStream(std::shared_ptr allocation, @@ -1095,15 +1019,7 @@ void AllocatorFacade::RecordStream(std::shared_ptr allocation, "'RecordStream' function. To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); - -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); - } -#endif - - m_->RecordStream(allocation, stream); + GetPrivate()->RecordStream(allocation, stream); } const gpuStream_t& AllocatorFacade::GetStream( @@ -1115,24 +1031,34 @@ const gpuStream_t& AllocatorFacade::GetStream( "'GetStream' function. To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); - -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); - } -#endif - - return m_->GetStream(allocation); + return GetPrivate()->GetStream(allocation); } #ifdef PADDLE_WITH_CUDA void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { - return m_->PrepareMemoryPoolForCUDAGraph(id); + PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth, + platform::errors::InvalidArgument( + "CUDA Graph is only supported when the " + "FLAGS_allocator_strategy=\"auto_growth\", but got " + "FLAGS_allocator_strategy=\"%s\"", + FLAGS_allocator_strategy)); + auto& allocator = cuda_graph_map_[id]; + PADDLE_ENFORCE_EQ( + allocator.get(), nullptr, + platform::errors::InvalidArgument( + "The memory pool of the CUDA Graph with ID %d have been prepared.", + id)); + allocator.reset(new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false)); + VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id; } void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { - return m_->RemoveMemoryPoolOfCUDAGraph(id); + auto iter = cuda_graph_map_.find(id); + PADDLE_ENFORCE_NE(iter, cuda_graph_map_.end(), + platform::errors::InvalidArgument( + "Cannot find CUDA Graph with ID = %d", id)); + cuda_graph_map_.erase(iter); + VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id; } #endif #endif diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 1722a06b01f..9066bb284e2 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -49,6 +49,8 @@ class AllocatorFacade { static AllocatorFacade& Instance(); + AllocatorFacadePrivate* GetPrivate() const; + const std::shared_ptr& GetAllocator(const platform::Place& place); void* GetBasePtr(const std::shared_ptr& allocation); @@ -73,13 +75,14 @@ class AllocatorFacade { size_t size, const phi::Stream& stream); + AllocationPtr Alloc(const platform::Place& place, size_t size, + const phi::Stream& stream); + bool InSameStream(const std::shared_ptr& allocation, const phi::Stream& stream); #if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) // TODO(zhiqiu): change gpuStream_t to phi::Stream if needed. - AllocationPtr Alloc(const platform::Place& place, size_t size, - const gpuStream_t& stream); uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream); void RecordStream(std::shared_ptr allocation, const gpuStream_t& stream); @@ -96,6 +99,10 @@ class AllocatorFacade { private: AllocatorFacade(); AllocatorFacadePrivate* m_; +#ifdef PADDLE_WITH_CUDA + std::unordered_map> + cuda_graph_map_; +#endif }; } // namespace allocation diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 8627e3e6f88..072c4dee3bc 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -15,56 +15,52 @@ #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h" +#endif + namespace paddle { namespace memory { namespace allocation { StreamSafeCUDAAllocation::StreamSafeCUDAAllocation( - DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream) + DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream, + StreamSafeCUDAAllocator* allocator) : Allocation(underlying_allocation->ptr(), underlying_allocation->base_ptr(), underlying_allocation->size(), underlying_allocation->place()), underlying_allocation_(std::move(underlying_allocation)), - owning_stream_(std::move(owning_stream)) {} + owning_stream_(std::move(owning_stream)), + allocator_(allocator->shared_from_this()) {} void StreamSafeCUDAAllocation::RecordStream(const gpuStream_t& stream) { VLOG(8) << "Try record stream " << stream << " for address " << ptr(); if (stream == owning_stream_) { - VLOG(9) << "Record the same stream of " << stream; return; } std::lock_guard lock_guard(outstanding_event_map_lock_); - gpuEvent_t record_event; - auto it = outstanding_event_map_.find(stream); - if (it == outstanding_event_map_.end()) { - gpuEvent_t new_event; #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS( - cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - hipEventCreateWithFlags(&new_event, hipEventDisableTiming)); -#endif - outstanding_event_map_[stream] = new_event; - record_event = new_event; - VLOG(9) << "Create a new event " << new_event; - } else { - record_event = it->second; - VLOG(9) << "Reuse event " << record_event; + if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { + graph_capturing_stream_set_.insert(stream); + return; } - -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream)); #endif - VLOG(8) << "Record event " << record_event << " to stream " << stream; + + RecordStreamWithNoGraphCapturing(stream); + RecordGraphCapturingStreams(); } bool StreamSafeCUDAAllocation::CanBeFreed() { - // NOTE(Ruibiao): This function will not execute concurrently, - // so outstanding_event_lock_ is not required here +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { + return graph_capturing_stream_set_.empty() && + outstanding_event_map_.empty(); + } +#endif + + RecordGraphCapturingStreams(); + for (auto it = outstanding_event_map_.begin(); it != outstanding_event_map_.end(); ++it) { gpuEvent_t& event = 
it->second; @@ -98,21 +94,62 @@ const gpuStream_t& StreamSafeCUDAAllocation::GetOwningStream() const { return owning_stream_; } +void StreamSafeCUDAAllocation::RecordGraphCapturingStreams() { + for (gpuStream_t stream : graph_capturing_stream_set_) { + RecordStreamWithNoGraphCapturing(stream); + } + graph_capturing_stream_set_.clear(); +} + +void StreamSafeCUDAAllocation::RecordStreamWithNoGraphCapturing( + const gpuStream_t& stream) { + gpuEvent_t record_event; + auto it = outstanding_event_map_.find(stream); + if (it == outstanding_event_map_.end()) { + gpuEvent_t new_event; +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS( + cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + hipEventCreateWithFlags(&new_event, hipEventDisableTiming)); +#endif + outstanding_event_map_[stream] = new_event; + record_event = new_event; + VLOG(9) << "Create a new event " << new_event; + } else { + record_event = it->second; + VLOG(9) << "Reuse event " << record_event; + } + +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream)); +#endif + VLOG(8) << "Record event " << record_event << " to stream " << stream; +} + StreamSafeCUDAAllocator::StreamSafeCUDAAllocator( std::shared_ptr underlying_allocator, platform::CUDAPlace place, - gpuStream_t default_stream) + gpuStream_t default_stream, bool in_cuda_graph_capturing) : underlying_allocator_(std::move(underlying_allocator)), place_(std::move(place)), - default_stream_(std::move(default_stream)) { - std::lock_guard lock_guard(allocator_map_lock_); - allocator_map_[place].emplace_back(this); + default_stream_(std::move(default_stream)), + in_cuda_graph_capturing_(in_cuda_graph_capturing) { + if (LIKELY(!in_cuda_graph_capturing)) { + std::lock_guard lock_guard(allocator_map_lock_); + allocator_map_[place].emplace_back(this); + } } StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() { - std::lock_guard lock_guard(allocator_map_lock_); - std::vector& allocators = allocator_map_[place_]; - allocators.erase(std::remove(allocators.begin(), allocators.end(), this), - allocators.end()); + if (LIKELY(!in_cuda_graph_capturing_)) { + std::lock_guard lock_guard(allocator_map_lock_); + std::vector& allocators = allocator_map_[place_]; + allocators.erase(std::remove(allocators.begin(), allocators.end(), this), + allocators.end()); + } } bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; } @@ -140,7 +177,7 @@ phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { } StreamSafeCUDAAllocation* allocation = new StreamSafeCUDAAllocation( static_unique_ptr_cast(std::move(underlying_allocation)), - default_stream_); + default_stream_, this); VLOG(8) << "Allocate " << allocation->size() << " bytes at address " << allocation->ptr(); return allocation; @@ -157,22 +194,27 @@ void StreamSafeCUDAAllocator::FreeImpl(phi::Allocation* allocation) { "StreamSafeCUDAAllocation*", allocation)); VLOG(8) << "Try free allocation " << stream_safe_cuda_allocation->ptr(); - std::lock_guard lock_guard(unfreed_allocation_lock_); if (stream_safe_cuda_allocation->CanBeFreed()) { VLOG(9) << "Directly delete allocation"; delete stream_safe_cuda_allocation; } else { VLOG(9) << "Put into unfreed_allocation list"; + std::lock_guard lock_guard(unfreed_allocation_lock_); unfreed_allocations_.emplace_back(stream_safe_cuda_allocation); } } uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) { 
+ if (UNLIKELY(in_cuda_graph_capturing_)) { + VLOG(7) << "Memory release forbidden in CUDA Graph Captruing"; + return 0; + } + std::lock_guard lock_guard(allocator_map_lock_); std::vector& allocators = allocator_map_[place]; uint64_t released_size = 0; for (StreamSafeCUDAAllocator* allocator : allocators) { - released_size += allocator->ProcessUnfreedAllocationsWithRelease(); + released_size += allocator->ProcessUnfreedAllocationsAndRelease(); } VLOG(8) << "Release " << released_size << " bytes memory from all streams"; return released_size; @@ -191,7 +233,7 @@ void StreamSafeCUDAAllocator::ProcessUnfreedAllocations() { } } -uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsWithRelease() { +uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsAndRelease() { ProcessUnfreedAllocations(); return underlying_allocator_->Release(place_); } diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index 7354836308c..ecddff97c20 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -14,10 +14,9 @@ #pragma once -#include #include #include -#include +#include #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/spin_lock.h" #include "paddle/fluid/platform/place.h" @@ -32,27 +31,38 @@ namespace paddle { namespace memory { namespace allocation { +class StreamSafeCUDAAllocator; + class StreamSafeCUDAAllocation : public Allocation { public: StreamSafeCUDAAllocation(DecoratedAllocationPtr underlying_allocation, - gpuStream_t owning_stream); + gpuStream_t owning_stream, + StreamSafeCUDAAllocator *allocator); + void RecordStream(const gpuStream_t &stream); bool CanBeFreed(); - const gpuStream_t &GetOwningStream() const; private: + void RecordGraphCapturingStreams(); + void RecordStreamWithNoGraphCapturing(const gpuStream_t &stream); DecoratedAllocationPtr underlying_allocation_; + std::set graph_capturing_stream_set_; std::map outstanding_event_map_; gpuStream_t owning_stream_; SpinLock outstanding_event_map_lock_; + // To compatiable with CUDA Graph, hold the allocator shared_ptr so that + // Allocator will not deconstruct before Allocation + std::shared_ptr allocator_; }; -class StreamSafeCUDAAllocator : public Allocator { +class StreamSafeCUDAAllocator + : public Allocator, + public std::enable_shared_from_this { public: StreamSafeCUDAAllocator(std::shared_ptr underlying_allocator, - platform::CUDAPlace place, - gpuStream_t default_stream); + platform::CUDAPlace place, gpuStream_t default_stream, + bool in_cuda_graph_capturing = false); ~StreamSafeCUDAAllocator(); bool IsAllocThreadSafe() const override; @@ -63,7 +73,7 @@ class StreamSafeCUDAAllocator : public Allocator { private: void ProcessUnfreedAllocations(); - uint64_t ProcessUnfreedAllocationsWithRelease(); + uint64_t ProcessUnfreedAllocationsAndRelease(); static std::map> allocator_map_; @@ -74,6 +84,8 @@ class StreamSafeCUDAAllocator : public Allocator { gpuStream_t default_stream_; std::list unfreed_allocations_; SpinLock unfreed_allocation_lock_; + + bool in_cuda_graph_capturing_; }; } // namespace allocation diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index b60bb4fc1d1..2bca2c388a0 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -41,6 +41,11 @@ std::shared_ptr AllocShared(const platform::Place& place, stream); } +AllocationPtr Alloc(const platform::CUDAPlace& 
place, size_t size, + const phi::Stream& stream) { + return allocation::AllocatorFacade::Instance().Alloc(place, size, stream); +} + bool InSameStream(const std::shared_ptr& allocation, const phi::Stream& stream) { return allocation::AllocatorFacade::Instance().InSameStream(allocation, @@ -52,11 +57,6 @@ void* GetBasePtr(const std::shared_ptr& allocation) { } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, - const gpuStream_t& stream) { - return allocation::AllocatorFacade::Instance().Alloc(place, size, stream); -} - uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream) { return allocation::AllocatorFacade::Instance().Release(place, stream); } diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 89b4caa5bed..601fe3f2a42 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -41,15 +41,15 @@ extern std::shared_ptr AllocShared(const platform::Place& place, size_t size, const phi::Stream& stream); +extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, + const phi::Stream& stream); + extern bool InSameStream(const std::shared_ptr& allocation, const phi::Stream& stream); extern void* GetBasePtr(const std::shared_ptr& allocation); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, - const gpuStream_t& stream); - extern uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream); diff --git a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu index 933717f3090..5e4a4234bb4 100644 --- a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu +++ b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu @@ -12,34 +12,35 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifdef PADDLE_WITH_CUDA -#include -#include -#endif - -#ifdef PADDLE_WITH_HIP -#include -#endif - #include // NOLINT #include #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/core/stream.h" +#ifdef PADDLE_WITH_CUDA +#include +#include +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#endif + +#ifdef PADDLE_WITH_HIP +#include +#endif + namespace paddle { namespace memory { -__global__ void add_kernel(int *x, int n) { +// y += (x + 1) +__global__ void add_kernel(int *x, int *y, int n) { int thread_num = gridDim.x * blockDim.x; int thread_id = blockIdx.x * blockDim.x + threadIdx.x; for (int i = thread_id; i < n; i += thread_num) { - atomicAdd(x + i, thread_id); + y[i] += x[i] + 1; } } @@ -51,153 +52,6 @@ void CheckMemLeak(const platform::CUDAPlace &place) { << " there may be a memory leak problem"; } -class StreamSafeCUDAAllocTest : public ::testing::Test { - protected: - void SetUp() override { - place_ = platform::CUDAPlace(); - stream_num_ = 64; - grid_num_ = 1; - block_num_ = 32; - data_num_ = 131072; - workspace_size_ = data_num_ * sizeof(int); - - // alloc workspace for each stream - for (size_t i = 0; i < stream_num_; ++i) { - gpuStream_t stream; -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream)); -#endif - - std::shared_ptr allocation = - AllocShared(place_, workspace_size_, - phi::Stream(reinterpret_cast(stream))); -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemset(allocation->ptr(), 0, allocation->size())); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - hipMemset(allocation->ptr(), 0, allocation->size())); -#endif - - streams_.emplace_back(stream); - workspaces_.emplace_back(allocation); - } - - result_ = Alloc(place_, stream_num_ * workspace_size_); - } - - void SingleStreamRun(size_t idx) { - // for all stream i, - // stream idx lauch a kernel to add (j % thread_num) to workspaces_[i][j] - for (size_t i = 0; i < stream_num_; ++i) { - int *x = reinterpret_cast(workspaces_[i]->ptr()); - add_kernel<<>>(x, data_num_); - RecordStream(workspaces_[i], streams_[idx]); - } - } - - void CopyResultAsync() { - for (size_t i = 0; i < stream_num_; ++i) { -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync( - reinterpret_cast(result_->ptr()) + i * data_num_, - workspaces_[i]->ptr(), workspace_size_, cudaMemcpyDeviceToDevice)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync( - reinterpret_cast(result_->ptr()) + i * data_num_, - workspaces_[i]->ptr(), workspace_size_, hipMemcpyDeviceToDevice)); -#endif - } - } - - void MultiStreamRun() { - for (size_t i = 0; i < stream_num_; ++i) { - SingleStreamRun(i); - } - CopyResultAsync(); - workspaces_.clear(); // fast_gc - cudaDeviceSynchronize(); - } - - void MultiThreadMUltiStreamRun() { - std::vector threads; - for (size_t i = 0; i < stream_num_; ++i) { - threads.push_back( - std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i)); - } - for (size_t i = 0; i < stream_num_; ++i) { - threads[i].join(); - } - CopyResultAsync(); - workspaces_.clear(); // fast_gc - cudaDeviceSynchronize(); - } - - void CheckResult() { - auto result_host = std::unique_ptr(new int[result_->size()]); -#ifdef PADDLE_WITH_CUDA - 
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(result_host.get(), result_->ptr(), - result_->size(), - cudaMemcpyDeviceToHost)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(result_host.get(), result_->ptr(), - result_->size(), - hipMemcpyDeviceToHost)); -#endif - size_t thread_num = grid_num_ * block_num_; - for (size_t i = 0; i < stream_num_; ++i) { - for (size_t j = 0; j < data_num_; ++j) { - EXPECT_TRUE(result_host[i * stream_num_ + j] == - (j % thread_num) * stream_num_); - } - } - result_.reset(); - } - - void TearDown() override { -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); -#endif - for (gpuStream_t stream : streams_) { - Release(place_, stream); - } - - for (size_t i = 1; i < stream_num_; ++i) { -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams_[i])); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(streams_[i])); -#endif - } - - CheckMemLeak(place_); - } - - size_t stream_num_; - size_t grid_num_; - size_t block_num_; - size_t data_num_; - size_t workspace_size_; - platform::CUDAPlace place_; - std::vector streams_; - std::vector> workspaces_; - allocation::AllocationPtr result_; -}; - -TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) { - MultiStreamRun(); - CheckResult(); -} - -TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) { - MultiThreadMUltiStreamRun(); - CheckResult(); -} - TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { platform::CUDAPlace place = platform::CUDAPlace(); size_t alloc_size = 256; @@ -214,7 +68,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); allocation::AllocationPtr allocation_unique = - Alloc(place, alloc_size, default_stream); + Alloc(place, alloc_size, + phi::Stream(reinterpret_cast(default_stream))); EXPECT_GE(allocation_unique->size(), alloc_size); EXPECT_EQ(allocation_unique->ptr(), address); allocation_unique.reset(); @@ -303,36 +158,6 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) { CheckMemLeak(place); } -#ifdef PADDLE_WITH_CUDA -TEST(StreamSafeCUDAAllocInterfaceTest, CUDAGraphExceptionTest) { - platform::CUDAPlace place = platform::CUDAPlace(); - size_t alloc_size = 1; - std::shared_ptr allocation = AllocShared(place, alloc_size); - - platform::BeginCUDAGraphCapture(place, cudaStreamCaptureModeGlobal); - EXPECT_THROW(AllocShared(place, alloc_size), paddle::platform::EnforceNotMet); - EXPECT_THROW(Alloc(place, alloc_size), paddle::platform::EnforceNotMet); - EXPECT_THROW(Release(place), paddle::platform::EnforceNotMet); - EXPECT_THROW(allocation::AllocatorFacade::Instance().GetAllocator(place), - paddle::platform::EnforceNotMet); - EXPECT_THROW( - AllocShared(place, alloc_size, - phi::Stream(reinterpret_cast(nullptr))), - paddle::platform::EnforceNotMet); - EXPECT_THROW(Alloc(place, alloc_size, nullptr), - paddle::platform::EnforceNotMet); - EXPECT_THROW(Release(place, nullptr), paddle::platform::EnforceNotMet); - EXPECT_THROW(RecordStream(allocation, nullptr), - paddle::platform::EnforceNotMet); - EXPECT_THROW(GetStream(allocation), paddle::platform::EnforceNotMet); - platform::EndCUDAGraphCapture(); - - allocation.reset(); - Release(place); - CheckMemLeak(place); -} -#endif - TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { platform::CUDAPlace place = platform::CUDAPlace(); gpuStream_t stream1, stream2; @@ -348,12 +173,14 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { // so the second 
alloc will fail and retry size_t alloc_size = available_size / 4 * 3; - allocation::AllocationPtr allocation1 = Alloc(place, alloc_size, stream1); + allocation::AllocationPtr allocation1 = Alloc( + place, alloc_size, phi::Stream(reinterpret_cast(stream1))); allocation::AllocationPtr allocation2; std::thread th([&allocation2, &place, &stream2, alloc_size]() { std::this_thread::sleep_for(std::chrono::seconds(1)); - allocation2 = Alloc(place, alloc_size, stream2); + allocation2 = Alloc(place, alloc_size, + phi::Stream(reinterpret_cast(stream2))); }); allocation1.reset(); // free but not release th.join(); @@ -371,5 +198,201 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { CheckMemLeak(place); } +class StreamSafeCUDAAllocTest : public ::testing::Test { + protected: + void SetUp() override { + place_ = platform::CUDAPlace(); + stream_num_ = 64; + grid_num_ = 1; + block_num_ = 32; + data_num_ = 131072; + workspace_size_ = data_num_ * sizeof(int); + + for (size_t i = 0; i < stream_num_; ++i) { + gpuStream_t stream; +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream)); +#endif + + std::shared_ptr workspace_allocation = + AllocShared(place_, workspace_size_, + phi::Stream(reinterpret_cast(stream))); + std::shared_ptr result_allocation = + AllocShared(place_, workspace_size_, + phi::Stream(reinterpret_cast(stream))); + std::shared_ptr host_result_allocation = + AllocShared(platform::CPUPlace(), workspace_size_); + +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemset(workspace_allocation->ptr(), 0, + workspace_allocation->size())); + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemset(result_allocation->ptr(), 0, result_allocation->size())); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipMemset(workspace_allocation->ptr(), 0, + workspace_allocation->size())); + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemset(result_allocation->ptr(), 0, result_allocation->size())); +#endif + + streams_.emplace_back(stream); + workspaces_.emplace_back(workspace_allocation); + results_.emplace_back(result_allocation); + host_results_.emplace_back(host_result_allocation); + } + } + + void SingleStreamRun(size_t idx) { + int *y = reinterpret_cast(results_[idx]->ptr()); + int neighbouring_idx = idx > 0 ? 
idx - 1 : idx; + + add_kernel<<>>( + reinterpret_cast(workspaces_[idx]->ptr()), y, data_num_); + add_kernel<<>>( + reinterpret_cast(workspaces_[neighbouring_idx]->ptr()), y, + data_num_); + RecordStream(workspaces_[neighbouring_idx], streams_[idx]); + } + + void MultiStreamRun() { + // Must run in reverse order, or the workspace_[i - 1] will be released + // before streams_[i]'s kernel launch + for (int i = stream_num_ - 1; i >= 0; --i) { + SingleStreamRun(i); + workspaces_[i].reset(); // fast GC + } + } + + void MultiThreadMultiStreamRun() { + std::vector threads; + for (size_t i = 0; i < stream_num_; ++i) { + threads.push_back( + std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i)); + } + for (size_t i = 0; i < stream_num_; ++i) { + threads[i].join(); + } + workspaces_.clear(); + } + + void CUDAGraphRun() { + testing_cuda_graph_ = true; + platform::BeginCUDAGraphCapture(platform::CUDAPlace(), + cudaStreamCaptureModeGlobal); + + std::shared_ptr data_allocation = + AllocShared(platform::CUDAPlace(), workspace_size_); + std::shared_ptr result_allocation = + AllocShared(platform::CUDAPlace(), workspace_size_); + + int *data = static_cast(data_allocation->ptr()); + int *result = static_cast(result_allocation->ptr()); + + gpuStream_t main_stream = GetStream(data_allocation); + gpuStream_t other_stream; + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&other_stream)); + + add_kernel<<>>(data, result, + data_num_); + RecordStream(data_allocation, other_stream); + + std::unique_ptr cuda_graph = + platform::EndCUDAGraphCapture(); + + int replay_times = 10; + for (int i = 0; i < replay_times; ++i) { + cuda_graph->Replay(); + } + + std::shared_ptr host_result_allocation = + AllocShared(platform::CPUPlace(), workspace_size_); + Copy(host_result_allocation->place(), host_result_allocation->ptr(), + result_allocation->place(), result_allocation->ptr(), workspace_size_, + main_stream); + cudaStreamSynchronize(main_stream); + + int *host_result = static_cast(host_result_allocation->ptr()); + for (int i = 0; i < data_num_; ++i) { + EXPECT_EQ(host_result[i], replay_times); + } + + data_allocation.reset(); + result_allocation.reset(); + cuda_graph.release(); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(other_stream)); + } + + void CheckResult() { + for (size_t i = 0; i < stream_num_; ++i) { + Copy(host_results_[i]->place(), host_results_[i]->ptr(), + results_[i]->place(), results_[i]->ptr(), workspace_size_, + streams_[i]); + } + cudaDeviceSynchronize(); + + size_t thread_num = grid_num_ * block_num_; + for (size_t i = 0; i < stream_num_; ++i) { + int *result = static_cast(host_results_[i]->ptr()); + for (size_t j = 0; j < data_num_; ++j) { + EXPECT_EQ(result[j], 2); + } + } + } + + void TearDown() override { + workspaces_.clear(); + results_.clear(); + host_results_.clear(); + for (gpuStream_t stream : streams_) { + Release(place_, stream); + } + + for (size_t i = 0; i < stream_num_; ++i) { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams_[i])); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(streams_[i])); +#endif + } + + // Memory release for CUDA Graph memory pool is forbidden + if (!testing_cuda_graph_) { + CheckMemLeak(place_); + } + } + + bool testing_cuda_graph_{0}; + size_t stream_num_; + size_t grid_num_; + size_t block_num_; + size_t data_num_; + size_t workspace_size_; + platform::CUDAPlace place_; + std::vector streams_; + std::vector> workspaces_; + std::vector> results_; + std::vector> host_results_; +}; + +TEST_F(StreamSafeCUDAAllocTest, 
CUDAMutilStreamTest) { + MultiStreamRun(); + CheckResult(); +} + +TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) { + MultiThreadMultiStreamRun(); + CheckResult(); +} + +#ifdef PADDLE_WITH_CUDA +TEST_F(StreamSafeCUDAAllocTest, CUDAGraphTest) { + MultiStreamRun(); + CUDAGraphRun(); + CheckResult(); +} +#endif + } // namespace memory } // namespace paddle -- GitLab From 0969a4eb192e61388eee315dd54469138e1ce1ea Mon Sep 17 00:00:00 2001 From: From00 Date: Thu, 3 Mar 2022 16:22:11 +0800 Subject: [PATCH 027/261] Move compare OPs to phi (#39970) * Move compare OPs to phi * Fix bug * Use BroadcastKernel and ElementwiseKernel in phi --- .../operators/controlflow/CMakeLists.txt | 2 +- .../operators/controlflow/compare_all_op.cc | 81 +-------- .../operators/controlflow/compare_all_op.cu | 92 ---------- .../operators/controlflow/compare_all_op.h | 43 ----- .../fluid/operators/controlflow/compare_op.cc | 79 +++------ .../fluid/operators/controlflow/compare_op.cu | 63 ------- .../fluid/operators/controlflow/compare_op.h | 109 ------------ .../operators/controlflow/compare_op_npu.cc | 2 +- .../operators/controlflow/compare_op_xpu.cc | 2 +- paddle/fluid/operators/matrix_rank_op.cc | 9 +- paddle/fluid/operators/matrix_rank_op.cu | 5 +- paddle/fluid/operators/matrix_rank_op.h | 1 - .../operators/metrics/accuracy_op_npu.cc | 2 +- paddle/fluid/operators/viterbi_decode_op.h | 31 ++-- paddle/phi/infermeta/binary.cc | 49 ++++++ paddle/phi/infermeta/binary.h | 9 + paddle/phi/kernels/compare_kernel.h | 47 ++++++ paddle/phi/kernels/cpu/compare_kernel.cc | 143 ++++++++++++++++ paddle/phi/kernels/funcs/compare_functors.h | 53 ++++++ paddle/phi/kernels/gpu/compare_kernel.cu | 158 ++++++++++++++++++ paddle/phi/kernels/impl/compare_kernel_impl.h | 81 +++++++++ paddle/phi/ops/compat/compare_sig.cc | 56 +++++++ 22 files changed, 654 insertions(+), 463 deletions(-) delete mode 100644 paddle/fluid/operators/controlflow/compare_all_op.cu delete mode 100644 paddle/fluid/operators/controlflow/compare_all_op.h delete mode 100644 paddle/fluid/operators/controlflow/compare_op.cu delete mode 100644 paddle/fluid/operators/controlflow/compare_op.h create mode 100644 paddle/phi/kernels/compare_kernel.h create mode 100644 paddle/phi/kernels/cpu/compare_kernel.cc create mode 100644 paddle/phi/kernels/funcs/compare_functors.h create mode 100644 paddle/phi/kernels/gpu/compare_kernel.cu create mode 100644 paddle/phi/kernels/impl/compare_kernel_impl.h create mode 100644 paddle/phi/ops/compat/compare_sig.cc diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index a974f2ec335..70937069d97 100644 --- a/paddle/fluid/operators/controlflow/CMakeLists.txt +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -19,6 +19,6 @@ else() target_link_libraries(conditional_block_infer_op conditional_block_op) endif() -file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal_all);\nUSE_NO_KERNEL_OP(read_from_array);\n") +file(APPEND ${pybind_file} "USE_OP_ITSELF(less_than);\nUSE_OP_ITSELF(equal_all);\nUSE_NO_KERNEL_OP(read_from_array);\n") file(APPEND ${pybind_file} "USE_OP_ITSELF(logical_and);\nUSE_OP_ITSELF(logical_or);\nUSE_OP_ITSELF(logical_xor);\nUSE_OP_ITSELF(logical_not);\n") file(APPEND ${pybind_file} "USE_OP(bitwise_and);\nUSE_OP(bitwise_or);\nUSE_OP(bitwise_xor);\nUSE_OP(bitwise_not);\n") diff --git a/paddle/fluid/operators/controlflow/compare_all_op.cc b/paddle/fluid/operators/controlflow/compare_all_op.cc index ede349f737d..9f229e6f15c 100644 --- 
a/paddle/fluid/operators/controlflow/compare_all_op.cc +++ b/paddle/fluid/operators/controlflow/compare_all_op.cc @@ -12,49 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/controlflow/compare_all_op.h" -#include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { -template -class CompareReduceOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; - using Tensor = framework::Tensor; - - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* z = context.Output("Out"); - Tensor tmp; - bool* z_data = z->mutable_data(context.GetPlace()); - - if (x->dims() != y->dims()) { - z_data[0] = false; - } else { - tmp.mutable_data(x->dims(), context.GetPlace()); - if (x->numel() == 1 && y->numel() == 1) { - bool* z_data = tmp.mutable_data(context.GetPlace()); - z_data[0] = Functor()(x->data()[0], y->data()[0]); - } else { - ElementwiseComputeEx( - context, x, y, 0, Functor(), &tmp); - } - auto ipt = framework::EigenVector::Flatten(tmp); - auto out = framework::EigenScalar::From(*z); - auto& place = - *context.template device_context() - .eigen_device(); - auto reduce_dim = Eigen::array({{0}}); - out.device(place) = ipt.all(reduce_dim); - } - } -}; - template class CompareReduceOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: @@ -81,26 +46,6 @@ template class CompareReduceOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* context) const override { - OpComment comment; - PADDLE_ENFORCE_EQ(context->HasInput("X"), true, - platform::errors::InvalidArgument( - "%s operator must have input X", comment.type)); - PADDLE_ENFORCE_EQ(context->HasInput("Y"), true, - platform::errors::InvalidArgument( - "%s operator must have input Y", comment.type)); - auto dim_x = context->GetInputDim("X"); - auto dim_y = context->GetInputDim("Y"); - PADDLE_ENFORCE_GE( - dim_x.size(), dim_y.size(), - platform::errors::InvalidArgument( - "The size of dim_y should not be greater than dim_x's.")); - - context->SetOutputDim("Out", {1}); - context->ShareLoD("X", "Out"); - } }; } // namespace operators @@ -113,25 +58,13 @@ class CompareReduceOp : public framework::OperatorWithKernel { }; \ char _##op_type##Comment::type[]{#op_type}; \ char _##op_type##Comment::equation[]{_equation}; \ + DELCARE_INFER_SHAPE_FUNCTOR(op_type, op_type##_InferShapeFunctor, \ + PT_INFER_META(phi::CompareAllInferMeta)); \ REGISTER_OPERATOR( \ op_type, ::paddle::operators::CompareReduceOp<_##op_type##Comment>, \ ::paddle::operators::CompareReduceOpProtoMaker<_##op_type##Comment>, \ ::paddle::framework::EmptyGradOpMaker, \ - ::paddle::framework::EmptyGradOpMaker); + ::paddle::framework::EmptyGradOpMaker, \ + op_type##_InferShapeFunctor); -#define REGISTER_COMPARE_REDUCE_CPU_KERNEL(op_type, functor) \ - REGISTER_OP_CPU_KERNEL( \ - op_type, ::paddle::operators::CompareReduceOpKernel< \ - ::paddle::platform::CPUDeviceContext, functor>, \ - ::paddle::operators::CompareReduceOpKernel< \ - ::paddle::platform::CPUDeviceContext, functor>, \ - 
::paddle::operators::CompareReduceOpKernel< \ - ::paddle::platform::CPUDeviceContext, functor>, \ - ::paddle::operators::CompareReduceOpKernel< \ - ::paddle::platform::CPUDeviceContext, functor>, \ - ::paddle::operators::CompareReduceOpKernel< \ - ::paddle::platform::CPUDeviceContext, functor>); REGISTER_COMPARE_REDUCE_OP(equal_all, "X == Y"); - -REGISTER_COMPARE_REDUCE_CPU_KERNEL(equal_all, - paddle::operators::EqualReduceFunctor); diff --git a/paddle/fluid/operators/controlflow/compare_all_op.cu b/paddle/fluid/operators/controlflow/compare_all_op.cu deleted file mode 100644 index d96dcebe51f..00000000000 --- a/paddle/fluid/operators/controlflow/compare_all_op.cu +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/fluid/operators/controlflow/compare_all_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" - -namespace paddle { -namespace operators { - -template -struct BitwiseAdd { - // Bitwise add operator, returns a + b - inline T initial() { return static_cast(true); } - - __host__ __device__ __forceinline__ T operator()(const T& a, - const T& b) const { - return a & b; - } -}; - -template -class CompareReduceOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; - using Tensor = framework::Tensor; - - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* z = context.Output("Out"); - bool* z_data = z->mutable_data(context.GetPlace()); - Tensor tmp; - - if (x->dims() != y->dims()) { - thrust::device_ptr z_dev_ptr(z_data); - thrust::fill(z_dev_ptr, z_dev_ptr + 1, false); - return; - } else { - tmp.mutable_data(x->dims(), context.GetPlace()); - const auto& cuda_ctx = - context.template device_context(); - std::vector ins = {x, y}; - std::vector outs = {&tmp}; - paddle::operators::LaunchSameDimsElementwiseCudaKernel( - cuda_ctx, ins, &outs, Functor()); - - // Reduce by 'bitwise and' operator - std::vector reduce_dims; - reduce_dims.resize(tmp.dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; - auto stream = context.cuda_device_context().stream(); - TensorReduceImpl>( - context.cuda_device_context(), tmp, z, kps::IdentityFunctor(), - reduce_dims, stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -#define REGISTER_COMPARE_REDUCE_CUDA_KERNEL(op_type, functor) \ - REGISTER_OP_CUDA_KERNEL( \ - op_type, \ - ops::CompareReduceOpKernel>, \ - ops::CompareReduceOpKernel>, \ - ops::CompareReduceOpKernel>, \ - ops::CompareReduceOpKernel>, \ - ops::CompareReduceOpKernel>); - -REGISTER_COMPARE_REDUCE_CUDA_KERNEL(equal_all, EqualReduceFunctor) -#undef REGISTER_COMPARE_REDUCE_CUDA_KERNEL diff --git a/paddle/fluid/operators/controlflow/compare_all_op.h 
b/paddle/fluid/operators/controlflow/compare_all_op.h deleted file mode 100644 index 78a7b76e3fd..00000000000 --- a/paddle/fluid/operators/controlflow/compare_all_op.h +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/platform/transform.h" - -namespace paddle { -namespace operators { - -template -struct EqualReduceFunctor { - using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T a, const T b) const { - if (std::is_floating_point::value) { - // This branch will be optimized while compiling if T is integer. It is - // safe to cast a and b to double. - return fabs(static_cast(a - b)) < 1e-8; - } else { - return (a == b); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc index 657e74398bb..5d9cdc61769 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cc +++ b/paddle/fluid/operators/controlflow/compare_op.cc @@ -12,14 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/controlflow/compare_op.h" -#include -#include -#include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/phi/common/place.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -60,31 +58,6 @@ class CompareOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(framework::InferShapeContext* context) const override { - OpComment comment; - OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", comment.type); - OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", comment.type); - auto dim_x = context->GetInputDim("X"); - auto dim_y = context->GetInputDim("Y"); - - if (context->GetInputDim("X") == context->GetInputDim("Y")) { - context->ShareDim("X", /*->*/ "Out"); - context->ShareLoD("X", /*->*/ "Out"); - } else { - int max_dim = std::max(dim_x.size(), dim_y.size()); - int axis = std::abs(dim_x.size() - dim_y.size()); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - GetBroadcastDimsArrays(dim_x, dim_y, x_dims_array.data(), - y_dims_array.data(), out_dims_array.data(), - max_dim, axis); - context->SetOutputDim("Out", phi::make_ddim(out_dims_array)); - // to do - context->ShareLoD("X", /*->*/ "Out"); - } - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); @@ -116,37 +89,31 @@ class CompareOp : public framework::OperatorWithKernel { "In order to force fill output variable to gpu memory.", \ false)); -#define REGISTER_COMPARE_OP(op_type, _equation) \ - struct _##op_type##Comment { \ - static char type[]; \ - static char equation[]; \ - }; \ - char _##op_type##Comment::type[]{#op_type}; \ - char _##op_type##Comment::equation[]{_equation}; \ - REGISTER_OPERATOR( \ - op_type, ::paddle::operators::CompareOp<_##op_type##Comment>, \ - ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \ - ::paddle::framework::EmptyGradOpMaker, \ - ::paddle::framework::EmptyGradOpMaker); \ +#define REGISTER_COMPARE_OP(op_type, _equation) \ + struct _##op_type##Comment { \ + static char type[]; \ + static char equation[]; \ + }; \ + char _##op_type##Comment::type[]{#op_type}; \ + char _##op_type##Comment::equation[]{_equation}; \ + DELCARE_INFER_SHAPE_FUNCTOR(op_type, op_type##_InferShapeFunctor, \ + PT_INFER_META(phi::CompareInferMeta)); \ + REGISTER_OPERATOR( \ + op_type, ::paddle::operators::CompareOp<_##op_type##Comment>, \ + ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \ + ::paddle::framework::EmptyGradOpMaker, \ + ::paddle::framework::EmptyGradOpMaker, \ + op_type##_InferShapeFunctor); \ REGISTER_COMPARE_OP_VERSION(op_type); REGISTER_COMPARE_OP(less_than, "Out = X < Y"); -REGISTER_COMPARE_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor, - paddle::operators::GreaterThanFunctor); + REGISTER_COMPARE_OP(less_equal, "Out = X <= Y"); -REGISTER_COMPARE_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor, - paddle::operators::GreaterEqualFunctor); + REGISTER_COMPARE_OP(greater_than, "Out = X > Y"); -REGISTER_COMPARE_KERNEL(greater_than, CPU, - paddle::operators::GreaterThanFunctor, - 
paddle::operators::LessThanFunctor); + REGISTER_COMPARE_OP(greater_equal, "Out = X >= Y"); -REGISTER_COMPARE_KERNEL(greater_equal, CPU, - paddle::operators::GreaterEqualFunctor, - paddle::operators::LessEqualFunctor); + REGISTER_COMPARE_OP(equal, "Out = X == Y"); -REGISTER_COMPARE_KERNEL(equal, CPU, paddle::operators::EqualFunctor, - paddle::operators::EqualFunctor); + REGISTER_COMPARE_OP(not_equal, "Out = X != Y"); -REGISTER_COMPARE_KERNEL(not_equal, CPU, paddle::operators::NotEqualFunctor, - paddle::operators::NotEqualFunctor); diff --git a/paddle/fluid/operators/controlflow/compare_op.cu b/paddle/fluid/operators/controlflow/compare_op.cu deleted file mode 100644 index 4b9452d0f60..00000000000 --- a/paddle/fluid/operators/controlflow/compare_op.cu +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/controlflow/compare_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -namespace paddle { -namespace operators { - -template -class CompareOpKernel - : public framework::OpKernel { - public: - using InT = typename Functor::ELEM_TYPE; - using OutT = bool; - void Compute(const framework::ExecutionContext& ctx) const override { - auto functor = Functor(); - std::vector ins; - std::vector outs; - const auto& cuda_ctx = - ctx.template device_context(); - - int axis = PackTensorsIntoVector(ctx, &ins, &outs); - paddle::operators::LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, functor); - } -}; - -} // namespace operators -} // namespace paddle - -#define REGISTER_CUDA_COMPARE_KERNEL(op_type, func) \ - REGISTER_OP_CUDA_KERNEL( \ - op_type, \ - ops::CompareOpKernel, void>, \ - ops::CompareOpKernel, void>, \ - ops::CompareOpKernel, void>, \ - ops::CompareOpKernel, void>, \ - ops::CompareOpKernel, void>, \ - ops::CompareOpKernel, void>); - -REGISTER_CUDA_COMPARE_KERNEL(equal, EqualFunctor) -REGISTER_CUDA_COMPARE_KERNEL(not_equal, NotEqualFunctor) -REGISTER_CUDA_COMPARE_KERNEL(less_than, LessThanFunctor) -REGISTER_CUDA_COMPARE_KERNEL(less_equal, LessEqualFunctor) -REGISTER_CUDA_COMPARE_KERNEL(greater_than, GreaterThanFunctor) -REGISTER_CUDA_COMPARE_KERNEL(greater_equal, GreaterEqualFunctor) -#undef REGISTER_CUDA_COMPARE_KERNEL diff --git a/paddle/fluid/operators/controlflow/compare_op.h b/paddle/fluid/operators/controlflow/compare_op.h deleted file mode 100644 index be017a01ef3..00000000000 --- a/paddle/fluid/operators/controlflow/compare_op.h +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/platform/transform.h" - -namespace paddle { -namespace operators { - -#define COMPARE_FUNCTOR(func_name, op) \ - template \ - struct func_name { \ - using ELEM_TYPE = InT; \ - HOSTDEVICE OutT operator()(const InT a, const InT b) const { \ - return static_cast(a op b); \ - } \ - }; - -COMPARE_FUNCTOR(LessThanFunctor, <) -COMPARE_FUNCTOR(LessEqualFunctor, <=) -COMPARE_FUNCTOR(GreaterThanFunctor, >) -COMPARE_FUNCTOR(GreaterEqualFunctor, >=) -#undef COMPARE_FUNCTOR - -template -struct EqualFunctor { - using ELEM_TYPE = InT; - HOSTDEVICE OutT operator()(const InT a, const InT b) const { - if (std::is_floating_point::value) { - // This branch will be optimized while compiling if T is integer. It is - // safe to cast a and b to double. - return static_cast(fabs(static_cast(a - b)) < 1e-8); - } else { - return static_cast(a == b); - } - } -}; - -template -struct NotEqualFunctor { - using ELEM_TYPE = InT; - HOSTDEVICE bool operator()(const InT a, const InT b) const { - return !EqualFunctor()(a, b); - } -}; - -template -class CompareOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; - using Tensor = framework::Tensor; - - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* z = context.Output("Out"); - int axis = context.Attr("axis"); - - auto x_dims = x->dims(); - auto y_dims = y->dims(); - if (x_dims.size() >= y_dims.size()) { - ElementwiseComputeEx(context, x, y, axis, - Functor(), z); - } else { - ElementwiseComputeEx( - context, x, y, axis, InverseFunctor(), z); - } - } -}; - -} // namespace operators -} // namespace paddle - -#define REGISTER_COMPARE_KERNEL(op_type, dev, functor, inverse_functor) \ - REGISTER_OP_##dev##_KERNEL(op_type, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, \ - functor, inverse_functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, \ - functor, inverse_functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, \ - functor, inverse_functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, \ - functor, inverse_functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, \ - functor, inverse_functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, \ - functor, inverse_functor>); diff --git a/paddle/fluid/operators/controlflow/compare_op_npu.cc b/paddle/fluid/operators/controlflow/compare_op_npu.cc index 7bc4ca09771..7377d7cf8d3 100644 --- a/paddle/fluid/operators/controlflow/compare_op_npu.cc +++ b/paddle/fluid/operators/controlflow/compare_op_npu.cc @@ -11,7 +11,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/controlflow/compare_op_xpu.cc b/paddle/fluid/operators/controlflow/compare_op_xpu.cc index 698bd051613..2de8b4c9ba8 100644 --- a/paddle/fluid/operators/controlflow/compare_op_xpu.cc +++ b/paddle/fluid/operators/controlflow/compare_op_xpu.cc @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/matrix_rank_op.cc b/paddle/fluid/operators/matrix_rank_op.cc index 65599259e22..1f04875c220 100644 --- a/paddle/fluid/operators/matrix_rank_op.cc +++ b/paddle/fluid/operators/matrix_rank_op.cc @@ -17,6 +17,7 @@ #include #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/svd_helper.h" +#include "paddle/phi/kernels/funcs/compare_functors.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -224,15 +225,15 @@ class MatrixRankCPUKernel : public framework::OpKernel { int axis = -1; if (eigenvalue_tensor.dims().size() >= tol_tensor.dims().size()) { - ElementwiseComputeEx, + ElementwiseComputeEx, platform::CPUDeviceContext, T, int>( context, &eigenvalue_tensor, &tol_tensor, axis, - GreaterThanFunctor(), &compare_result); + phi::funcs::GreaterThanFunctor(), &compare_result); } else { - ElementwiseComputeEx, + ElementwiseComputeEx, platform::CPUDeviceContext, T, int>( context, &eigenvalue_tensor, &tol_tensor, axis, - LessThanFunctor(), &compare_result); + phi::funcs::LessThanFunctor(), &compare_result); } auto dito_int = math::DeviceIndependenceTensorOperations { compare_result.mutable_data(detail::NewAxisDim(dim_out, k), context.GetPlace()); int axis = -1; - ElementwiseComputeEx, + ElementwiseComputeEx, platform::CUDADeviceContext, T, int64_t>( context, &eigenvalue_tensor, &tol_tensor, axis, - GreaterThanFunctor(), &compare_result); + phi::funcs::GreaterThanFunctor(), &compare_result); auto dito_int = math::DeviceIndependenceTensorOperations(context); diff --git a/paddle/fluid/operators/matrix_rank_op.h b/paddle/fluid/operators/matrix_rank_op.h index 80774aa9169..93545fd3103 100644 --- a/paddle/fluid/operators/matrix_rank_op.h +++ b/paddle/fluid/operators/matrix_rank_op.h @@ -15,7 +15,6 @@ #pragma once #include #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/controlflow/compare_op.h" #include "paddle/phi/core/ddim.h" namespace paddle { diff --git a/paddle/fluid/operators/metrics/accuracy_op_npu.cc b/paddle/fluid/operators/metrics/accuracy_op_npu.cc index 63bccc2e6e0..e83278f88b8 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_npu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_npu.cc @@ -12,7 +12,7 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/metrics/accuracy_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/viterbi_decode_op.h b/paddle/fluid/operators/viterbi_decode_op.h index 0974177e6c7..e7fe743b964 100644 --- a/paddle/fluid/operators/viterbi_decode_op.h +++ b/paddle/fluid/operators/viterbi_decode_op.h @@ -14,12 +14,13 @@ limitations under the License. */ #include #include #include -#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_functor.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/operators/unique_op.h" +#include "paddle/phi/kernels/funcs/compare_functors.h" #include "paddle/phi/kernels/funcs/gather.h" #ifdef PADDLE_WITH_MKLML #include @@ -353,8 +354,8 @@ class ViterbiDecodeKernel : public framework::OpKernel { BinaryOperation SubInt; if (include_bos_eos_tag) { AddFloat(dev_ctx, logit0, start_trans, &alpha); - GetMask()(ctx, left_length, one, - &float_mask); + GetMask()(ctx, left_length, + one, &float_mask); MulFloat(dev_ctx, stop_trans, float_mask, &alpha_nxt); AddFloat(dev_ctx, alpha, alpha_nxt, &alpha); } else { @@ -375,8 +376,8 @@ class ViterbiDecodeKernel : public framework::OpKernel { alpha.Resize({batch_size, n_labels}); // mask = paddle.cast((left_length > 0), dtype='float32') // alpha = mask * alpha_nxt + (1 - mask) * alpha - GetMask()(ctx, left_length, zero, - &float_mask); + GetMask()( + ctx, left_length, zero, &float_mask); // alpha_nxt = mask * alpha_nxt MulFloat(dev_ctx, alpha_nxt, float_mask, &alpha_nxt); // inv_mask = 1 - mask @@ -386,8 +387,8 @@ class ViterbiDecodeKernel : public framework::OpKernel { // alpha += alpha_nxt AddFloat(dev_ctx, alpha, alpha_nxt, &alpha); if (include_bos_eos_tag) { - GetMask()(ctx, left_length, one, - &float_mask); + GetMask()(ctx, left_length, + one, &float_mask); // alpha += mask * trans_exp[:, self.stop_idx] MulFloat(dev_ctx, stop_trans, float_mask, &alpha_nxt); AddFloat(dev_ctx, alpha, alpha_nxt, &alpha); @@ -396,8 +397,8 @@ class ViterbiDecodeKernel : public framework::OpKernel { } argmax(ctx, alpha, &last_ids, scores, 1); left_length.Resize({batch_size}); - GetMask()(ctx, left_length, - zero, &int_mask); + GetMask()( + ctx, left_length, zero, &int_mask); // last_ids_update = last_ids * tag_mask int last_ids_index = 1; int actual_len = (std::min)(seq_len, static_cast(max_seq_len)); @@ -416,17 +417,17 @@ class ViterbiDecodeKernel : public framework::OpKernel { batch_path[actual_len - last_ids_index]; hist->Resize({batch_size * n_labels}); gather(dev_ctx, *hist, gather_idx, &last_ids_update); - GetMask()(ctx, left_length, - zero, &int_mask); + GetMask()( + ctx, left_length, zero, &int_mask); MulInt(dev_ctx, last_ids_update, int_mask, &last_ids_update); - GetMask()(ctx, left_length, zero, - &zero_len_mask); + GetMask()( + ctx, left_length, zero, &zero_len_mask); MulInt(dev_ctx, last_ids, zero_len_mask, &last_ids_tmp); SubInt(dev_ctx, one, zero_len_mask, &zero_len_mask); MulInt(dev_ctx, last_ids_update, zero_len_mask, &last_ids_update); AddInt(dev_ctx, last_ids_update, last_ids_tmp, &last_ids_update); - GetMask()(ctx, left_length, zero, - &int_mask); + GetMask()( + ctx, left_length, zero, 
&int_mask); MulInt(dev_ctx, last_ids, int_mask, &last_ids); AddInt(dev_ctx, last_ids_update, last_ids, &last_ids); } diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 7682f6b3d49..1f6f0b211b6 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -13,11 +13,60 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/infermeta/binary.h" + +#include +#include +#include "paddle/phi/common/data_type.h" #include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/funcs/common_shape.h" namespace phi { +void CompareInferMeta(const MetaTensor& x, + const MetaTensor& y, + int axis, + MetaTensor* out) { + auto dim_x = x.dims(); + auto dim_y = y.dims(); + + if (dim_x == dim_y) { + out->share_meta(x); + } else { + int max_dim = std::max(dim_x.size(), dim_y.size()); + int axis = std::abs(dim_x.size() - dim_y.size()); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + funcs::GetBroadcastDimsArrays(dim_x, + dim_y, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + + out->set_dims(make_ddim(out_dims_array)); + out->share_lod(x); + } + + out->set_dtype(DataType::BOOL); +} + +void CompareAllInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out) { + auto dim_x = x.dims(); + auto dim_y = y.dims(); + PADDLE_ENFORCE_GE( + dim_x.size(), + dim_y.size(), + errors::InvalidArgument( + "The size of dim_y should not be greater than dim_x's.")); + out->share_lod(x); + out->set_dims(make_ddim({1})); + out->set_dtype(DataType::BOOL); +} + void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { auto x_dims = x.dims(); auto x_rank = static_cast(x_dims.size()); diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 5906e06b293..47745f8ce13 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -29,6 +29,15 @@ namespace phi { // Because functions in this file not only can infer shape, but also need // infer lod or other useful data. +void CompareInferMeta(const MetaTensor& x, + const MetaTensor& y, + int axis, + MetaTensor* out); + +void CompareAllInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out); + void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); void MatmulInferMeta(const MetaTensor& x, diff --git a/paddle/phi/kernels/compare_kernel.h b/paddle/phi/kernels/compare_kernel.h new file mode 100644 index 00000000000..5b6b8cd868f --- /dev/null +++ b/paddle/phi/kernels/compare_kernel.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +#define DECLARE_COMPARE_KERNEL(compare_kernel) \ + template \ + void compare_kernel(const Context& ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + int axis, \ + DenseTensor* out); + +DECLARE_COMPARE_KERNEL(LessThanKernel) +DECLARE_COMPARE_KERNEL(LessEqualKernel) +DECLARE_COMPARE_KERNEL(GreaterThanKernel) +DECLARE_COMPARE_KERNEL(GreaterEqualKernel) +DECLARE_COMPARE_KERNEL(EqualKernel) +DECLARE_COMPARE_KERNEL(NotEqualKernel) +#undef DECLARE_COMPARE_KERNEL + +#define DECLARE_COMPARE_ALL_KERNEL(compare_all_kernel) \ + template \ + void compare_all_kernel(const Context& ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + DenseTensor* out); + +DECLARE_COMPARE_ALL_KERNEL(EqualAllKernel) +#undef DECLARE_COMPARE_ALL_KERNEL + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/compare_kernel.cc b/paddle/phi/kernels/cpu/compare_kernel.cc new file mode 100644 index 00000000000..9006325a521 --- /dev/null +++ b/paddle/phi/kernels/cpu/compare_kernel.cc @@ -0,0 +1,143 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/compare_kernel.h" +#include "paddle/phi/kernels/impl/compare_kernel_impl.h" + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" + +namespace phi { + +template +inline void CompareKernelImpl(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + ctx.template Alloc(out); + if (x.dims().size() >= y.dims().size()) { + funcs::ElementwiseCompute( + ctx, x, y, axis, Functor(), out); + } else { + funcs::ElementwiseCompute( + ctx, x, y, axis, InverseFunctor(), out); + } +} + +template +inline void CompareAllKernelImpl(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + bool* out_data = ctx.template Alloc(out); + + if (x.dims() != y.dims()) { + out_data[0] = false; + } else { + DenseTensor tmp; + tmp.Resize(x.dims()); + ctx.template Alloc(&tmp); + + if (x.numel() == 1 && y.numel() == 1) { + bool* tmp_data = tmp.data(); + tmp_data[0] = Functor()(x.data()[0], y.data()[0]); + } else { + funcs::ElementwiseCompute( + ctx, x, y, 0, Functor(), &tmp); + } + auto tmp_flat = EigenVector::Flatten(tmp); + auto out_es = EigenScalar::From(*out); + auto& place = *ctx.eigen_device(); + auto reduce_dim = Eigen::array({{0}}); + out_es.device(place) = tmp_flat.all(reduce_dim); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(less_than, + CPU, + ALL_LAYOUT, + phi::LessThanKernel, + bool, + int16_t, + int, + int64_t, + float, + double) {} +PD_REGISTER_KERNEL(less_equal, + CPU, + ALL_LAYOUT, + phi::LessEqualKernel, + bool, + int16_t, + int, + int64_t, + float, + double) {} +PD_REGISTER_KERNEL(greater_than, + CPU, + ALL_LAYOUT, + phi::GreaterThanKernel, + bool, + int16_t, + int, + int64_t, + float, + double) {}
+PD_REGISTER_KERNEL(greater_equal, + CPU, + ALL_LAYOUT, + phi::GreaterEqualKernel, + bool, + int16_t, + int, + int64_t, + float, + double) {} +PD_REGISTER_KERNEL(equal, + CPU, + ALL_LAYOUT, + phi::EqualKernel, + bool, + int16_t, + int, + int64_t, + float, + double) {} +PD_REGISTER_KERNEL(not_equal, + CPU, + ALL_LAYOUT, + phi::NotEqualKernel, + bool, + int16_t, + int, + int64_t, + float, + double) {} + +PD_REGISTER_KERNEL(equal_all, + CPU, + ALL_LAYOUT, + phi::EqualAllKernel, + bool, + int, + int64_t, + float, + double) {} diff --git a/paddle/phi/kernels/funcs/compare_functors.h b/paddle/phi/kernels/funcs/compare_functors.h new file mode 100644 index 00000000000..569fed7b7fb --- /dev/null +++ b/paddle/phi/kernels/funcs/compare_functors.h @@ -0,0 +1,53 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace phi { +namespace funcs { + +#define COMPARE_FUNCTOR(func_name, op) \ + template \ + struct func_name { \ + HOSTDEVICE OutT operator()(const InT a, const InT b) const { \ + return static_cast(a op b); \ + } \ + }; + +COMPARE_FUNCTOR(LessThanFunctor, <) +COMPARE_FUNCTOR(LessEqualFunctor, <=) +COMPARE_FUNCTOR(GreaterThanFunctor, >) +COMPARE_FUNCTOR(GreaterEqualFunctor, >=) +#undef COMPARE_FUNCTOR + +template +struct EqualFunctor { + HOSTDEVICE OutT operator()(const InT a, const InT b) const { + if (std::is_floating_point::value) { + return static_cast(fabs(static_cast(a - b)) < 1e-8); + } else { + return static_cast(a == b); + } + } +}; + +template +struct NotEqualFunctor { + HOSTDEVICE bool operator()(const InT a, const InT b) const { + return !EqualFunctor()(a, b); + } +}; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/compare_kernel.cu b/paddle/phi/kernels/gpu/compare_kernel.cu new file mode 100644 index 00000000000..272448504ac --- /dev/null +++ b/paddle/phi/kernels/gpu/compare_kernel.cu @@ -0,0 +1,158 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/compare_kernel.h" +#include "paddle/phi/kernels/impl/compare_kernel_impl.h" + +#include +#include +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/gpu/reduce.h" +#include "paddle/phi/kernels/primitive/functor_primitives.h" + +namespace phi { + +template +struct BitwiseAdd { + // Bitwise add operator, returns a + b + inline T initial() { return static_cast(true); } + + __host__ __device__ __forceinline__ T operator()(const T& a, + const T& b) const { + return a & b; + } +}; + +template +inline void CompareKernelImpl(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + ctx.template Alloc(out); + std::vector ins{&x, &y}; + std::vector outs{out}; + funcs::BroadcastKernel( + ctx, ins, &outs, axis, Functor()); +} + +template +inline void CompareAllKernelImpl(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + bool* out_data = ctx.template Alloc(out); + + if (x.dims() != y.dims()) { + thrust::device_ptr out_dev_ptr(out_data); + thrust::fill(out_dev_ptr, out_dev_ptr + 1, false); + return; + } + + DenseTensor tmp; + tmp.Resize(x.dims()); + ctx.template Alloc(&tmp); + + std::vector ins{&x, &y}; + std::vector outs{&tmp}; + funcs::ElementwiseKernel(ctx, ins, &outs, Functor()); + + // Reduce by 'bitwise and' operator + std::vector reduce_dims; + reduce_dims.resize(tmp.dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = i; + } + kernels::TensorReduceImpl>( + ctx, tmp, out, kps::IdentityFunctor(), reduce_dims, ctx.stream()); +} + +} // namespace phi + +PD_REGISTER_KERNEL(less_than, + GPU, + ALL_LAYOUT, + phi::LessThanKernel, + bool, + int16_t, + int, + int64_t, + float, + double) {} +PD_REGISTER_KERNEL(less_equal, + GPU, + ALL_LAYOUT, + phi::LessEqualKernel, + bool, + int16_t, + int, + int64_t, + float, + double) {} +PD_REGISTER_KERNEL(greater_than, + GPU, + ALL_LAYOUT, + phi::GreaterThanKernel, + bool, + int16_t, + int, + int64_t, + float, + double) {} +PD_REGISTER_KERNEL(greater_equal, + GPU, + ALL_LAYOUT, + phi::GreaterEqualKernel, + bool, + int16_t, + int, + int64_t, + float, + double) {} +PD_REGISTER_KERNEL(equal, + GPU, + ALL_LAYOUT, + phi::EqualKernel, + bool, + int16_t, + int, + int64_t, + float, + double) {} +PD_REGISTER_KERNEL(not_equal, + GPU, + ALL_LAYOUT, + phi::NotEqualKernel, + bool, + int16_t, + int, + int64_t, + float, + double) {} + +PD_REGISTER_KERNEL(equal_all, + GPU, + ALL_LAYOUT, + phi::EqualAllKernel, + bool, + int, + int64_t, + float, + double) {} diff --git a/paddle/phi/kernels/impl/compare_kernel_impl.h b/paddle/phi/kernels/impl/compare_kernel_impl.h new file mode 100644 index 00000000000..4390c1f8e66 --- /dev/null +++ b/paddle/phi/kernels/impl/compare_kernel_impl.h @@ -0,0 +1,81 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/compare_kernel.h" + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/compare_functors.h" + +namespace phi { + +template +inline void CompareKernelImpl(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +inline void CompareAllKernelImpl(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +#define DEFINE_COMPARE_KERNEL(compare_kernel, functor, inverse_functor) \ + template \ + void compare_kernel(const Context& ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + int axis, \ + DenseTensor* out) { \ + CompareKernelImpl, inverse_functor>( \ + ctx, x, y, axis, out); \ + } + +DEFINE_COMPARE_KERNEL(LessThanKernel, + funcs::LessThanFunctor, + funcs::GreaterThanFunctor) +DEFINE_COMPARE_KERNEL(LessEqualKernel, + funcs::LessEqualFunctor, + funcs::GreaterEqualFunctor) +DEFINE_COMPARE_KERNEL(GreaterThanKernel, + funcs::GreaterThanFunctor, + funcs::LessThanFunctor) +DEFINE_COMPARE_KERNEL(GreaterEqualKernel, + funcs::GreaterEqualFunctor, + funcs::LessEqualFunctor) +DEFINE_COMPARE_KERNEL(EqualKernel, funcs::EqualFunctor, funcs::EqualFunctor) +DEFINE_COMPARE_KERNEL(NotEqualKernel, + funcs::NotEqualFunctor, + funcs::NotEqualFunctor) +#undef DEFINE_COMPARE_KERNEL + +#define DEFINE_COMPARE_ALL_KERNEL(compare_all_kernel, functor) \ + template \ + void compare_all_kernel(const Context& ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + DenseTensor* out) { \ + CompareAllKernelImpl>(ctx, x, y, out); \ + } + +DEFINE_COMPARE_ALL_KERNEL(EqualAllKernel, funcs::EqualFunctor) +#undef DEFINE_COMPARE_ALL_KERNEL + +} // namespace phi diff --git a/paddle/phi/ops/compat/compare_sig.cc b/paddle/phi/ops/compat/compare_sig.cc new file mode 100644 index 00000000000..964c7be3db3 --- /dev/null +++ b/paddle/phi/ops/compat/compare_sig.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
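[editor's note] The GPU equal_all path earlier in this patch (CompareAllKernelImpl with BitwiseAdd) is a two-step algorithm: run the elementwise equality functor into a temporary boolean tensor, then AND-reduce that tensor over all axes. The standalone host sketch below models the same control flow with std::vector instead of DenseTensor; EqualAll is a hypothetical helper, not the Paddle kernel.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Step 1: elementwise comparison into a boolean mask.
// Step 2: reduce the mask with logical AND to a single flag.
// A real kernel also short-circuits when the two shapes differ,
// as CompareAllKernelImpl does above.
static bool EqualAll(const std::vector<float>& x, const std::vector<float>& y) {
  if (x.size() != y.size()) return false;  // mirrors the dims() mismatch branch
  std::vector<bool> mask(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    mask[i] = std::fabs(static_cast<double>(x[i] - y[i])) < 1e-8;
  }
  // AND-reduction; on the GPU this is TensorReduceImpl with the BitwiseAdd functor.
  return std::all_of(mask.begin(), mask.end(), [](bool b) { return b; });
}

int main() {
  std::vector<float> a = {1.f, 2.f, 3.f};
  std::vector<float> b = {1.f, 2.f, 3.f};
  std::vector<float> c = {1.f, 2.f, 4.f};
  std::printf("%d %d\n", EqualAll(a, b), EqualAll(a, c));  // 1 0
  return 0;
}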
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature LessThanArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("less_than", {"X", "Y"}, {"axis"}, {"Out"}); +} + +KernelSignature LessEqualArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("less_equal", {"X", "Y"}, {"axis"}, {"Out"}); +} + +KernelSignature GreaterThanArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("greater_than", {"X", "Y"}, {"axis"}, {"Out"}); +} + +KernelSignature GreaterEqualArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("greater_equal", {"X", "Y"}, {"axis"}, {"Out"}); +} + +KernelSignature EqualArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("equal", {"X", "Y"}, {"axis"}, {"Out"}); +} + +KernelSignature NotEqualArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("not_equal", {"X", "Y"}, {"axis"}, {"Out"}); +} + +KernelSignature EqualAllArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("equal_all", {"X", "Y"}, {}, {"Out"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(less_than, phi::LessThanArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(less_equal, phi::LessEqualArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(greater_than, phi::GreaterThanArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(greater_equal, phi::GreaterEqualArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(equal, phi::EqualArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(not_equal, phi::NotEqualArgumentMapping); + +PD_REGISTER_ARG_MAPPING_FN(equal_all, phi::EqualAllArgumentMapping); -- GitLab From b1d38deafc3228acc3a06053e8e0359da617e659 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Thu, 3 Mar 2022 16:46:54 +0800 Subject: [PATCH 028/261] mlir attr types for infrt place, test=develop (#40087) * mlir attr types for infrt place, test=develop * fix a bug, test=develop --- paddle/infrt/dialect/infrt/common_type.cc | 33 +++++++------ paddle/infrt/dialect/infrt/common_type.h | 11 +++-- paddle/infrt/dialect/infrt/infrt_dialect.cc | 1 + paddle/infrt/dialect/infrt/infrt_ops_base.td | 46 ++++++++++++++++++- paddle/infrt/dialect/phi/ir/infrt_phi_base.td | 38 +++++++-------- .../infrt/dialect/phi/ir/infrt_phi_tensor.td | 10 ++-- paddle/infrt/dialect/phi/ir/phi_base.cc | 40 +--------------- paddle/infrt/dialect/phi/ir/phi_base.h | 7 +-- paddle/infrt/host_context/kernel_frame.cc | 37 ++++++--------- .../phi/infershaped/phi_kernel_launcher.h | 20 -------- paddle/infrt/kernel/phi/registry.cc | 11 ----- .../tests/dialect/pten/dense_tensor.mlir | 8 ++-- tools/infrt/generate_phi_kernel_dialect.py | 2 +- 13 files changed, 121 insertions(+), 143 deletions(-) diff --git a/paddle/infrt/dialect/infrt/common_type.cc b/paddle/infrt/dialect/infrt/common_type.cc index 5cbd7b2cd61..00684c50526 100644 --- a/paddle/infrt/dialect/infrt/common_type.cc +++ b/paddle/infrt/dialect/infrt/common_type.cc @@ -43,46 +43,49 @@ llvm::Optional GetPrecisionType(llvm::StringRef key) { return llvm::None; } -llvm::raw_ostream &operator<<(llvm::raw_ostream &os, TargetType type) { +llvm::StringRef GetString(TargetType type) { + llvm::StringRef str; switch (type) { case (TargetType::CPU): - os << "CPU"; + str = "CPU"; break; case (TargetType::GPU): - os << "GPU"; + str = "GPU"; break; default: - os << "Unsupported"; + str = "Unsupported"; } - return os; + return str; } -llvm::raw_ostream 
&operator<<(llvm::raw_ostream &os, LayoutType type) { +llvm::StringRef GetString(LayoutType type) { + llvm::StringRef str; switch (type) { case (LayoutType::NCHW): - os << "NCHW"; + str = "NCHW"; break; case (LayoutType::NHWC): - os << "NHWC"; + str = "NHWC"; break; default: - os << "Unsupported"; + str = "Unsupported"; } - return os; + return str; } -llvm::raw_ostream &operator<<(llvm::raw_ostream &os, PrecisionType type) { +llvm::StringRef GetString(PrecisionType type) { + llvm::StringRef str; switch (type) { case (PrecisionType::FLOAT32): - os << "FP32"; + str = "FP32"; break; case (PrecisionType::FLOAT16): - os << "FP16"; + str = "FP16"; break; default: - os << "Unsupported"; + str = "Unsupported"; } - return os; + return str; } } // namespace infrt diff --git a/paddle/infrt/dialect/infrt/common_type.h b/paddle/infrt/dialect/infrt/common_type.h index 436e7920ca5..2ebe2b8ccdb 100644 --- a/paddle/infrt/dialect/infrt/common_type.h +++ b/paddle/infrt/dialect/infrt/common_type.h @@ -54,8 +54,13 @@ llvm::Optional GetTargetType(llvm::StringRef key); llvm::Optional GetLayoutType(llvm::StringRef key); llvm::Optional GetPrecisionType(llvm::StringRef key); -llvm::raw_ostream &operator<<(llvm::raw_ostream &os, TargetType type); -llvm::raw_ostream &operator<<(llvm::raw_ostream &os, LayoutType type); -llvm::raw_ostream &operator<<(llvm::raw_ostream &os, PrecisionType type); +llvm::StringRef GetString(TargetType type); +llvm::StringRef GetString(LayoutType type); +llvm::StringRef GetString(PrecisionType type); +template +llvm::raw_ostream &operator<<(llvm::raw_ostream &os, T type) { + os << GetString(type); + return os; +} } // end namespace infrt diff --git a/paddle/infrt/dialect/infrt/infrt_dialect.cc b/paddle/infrt/dialect/infrt/infrt_dialect.cc index abb60016f90..400e4921c94 100644 --- a/paddle/infrt/dialect/infrt/infrt_dialect.cc +++ b/paddle/infrt/dialect/infrt/infrt_dialect.cc @@ -14,6 +14,7 @@ #include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include #include #include #include diff --git a/paddle/infrt/dialect/infrt/infrt_ops_base.td b/paddle/infrt/dialect/infrt/infrt_ops_base.td index f19912dc0cd..8a6eb766567 100644 --- a/paddle/infrt/dialect/infrt/infrt_ops_base.td +++ b/paddle/infrt/dialect/infrt/infrt_ops_base.td @@ -10,16 +10,59 @@ def Infrt_Dialect : Dialect { let name = "infrt"; let cppNamespace = "::infrt"; + let useDefaultAttributePrinterParser = 1; } // Type definitions - // Base class for Infrt dialect types. 
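[editor's note] The common_type refactor above replaces per-enum operator<< overloads with GetString() helpers, so the same enum-to-string table can be handed to the TableGen-generated attribute printer (the "GetString" symbol used by the enum parameters defined further down) while a templated operator<< keeps logging working. A condensed standalone model of that layering, using std::ostream instead of llvm::raw_ostream and a single enum overload rather than the generic template, might look like this:

#include <iostream>
#include <string>

// One string table per enum; the stream operator and any generated
// printer/parser can share it.
enum class TargetType { CPU, GPU, UNK };

inline const char* GetString(TargetType t) {
  switch (t) {
    case TargetType::CPU: return "CPU";
    case TargetType::GPU: return "GPU";
    default:              return "Unsupported";
  }
}

// Stream operator layered on top of GetString, mirroring the pattern above.
inline std::ostream& operator<<(std::ostream& os, TargetType t) {
  return os << GetString(t);
}

int main() {
  std::cout << TargetType::GPU << "\n";          // prints: GPU
  std::string key = GetString(TargetType::CPU);  // reusable outside logging
  std::cout << key << "\n";
  return 0;
}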
class Infrt_Type traits = [], string baseCppClass = "::mlir::Type"> : TypeDef { } +class Infrt_EnumParam : TypeParameter { + let parser = [{[&]() -> ::mlir::FailureOr<}] # cppEnumType # [{> { + ::llvm::StringRef enumKeyword; + if (::mlir::failed($_parser.parseKeyword(&enumKeyword))) + return ::mlir::failure(); + auto maybeEnum = }] # stringToSymbolFnName # [{(enumKeyword); + if (maybeEnum) + return *maybeEnum; + llvm_unreachable("}] # cppEnumType # [{ can not be found."); + return {}; + }()}]; + let printer = "$_printer << " # symbolToStringFnName # "($_self)"; +} + +def TargetParam : Infrt_EnumParam<"::infrt::TargetType", "GetTargetType", "GetString">; +def PrecisionParam : Infrt_EnumParam<"::infrt::PrecisionType", "GetPrecisionType", "GetString">; +def LayoutParam : Infrt_EnumParam<"::infrt::LayoutType", "GetLayoutType", "GetString">; + +def TargetAttr : AttrDef { + let mnemonic = "target"; + let parameters = (ins + TargetParam:$target + ); + let assemblyFormat = "`<` $target `>`"; +} + +def PrecisionAttr : AttrDef { + let mnemonic = "precision"; + let parameters = (ins + PrecisionParam:$precision + ); + let assemblyFormat = "`<` $precision `>`"; +} + +def LayoutAttr : AttrDef { + let mnemonic = "layout"; + let parameters = (ins + LayoutParam:$layout + ); + let assemblyFormat = "`<` $layout `>`"; +} + def LoDTensor : Infrt_Type<"LoDTensor"> { let summary = "infrt lod tensor"; let description = [{lod_tensor<3x64x3x3xf32, 3>}]; @@ -37,7 +80,6 @@ def DenseTensor : Infrt_Type<"DenseTensor"> { "::infrt::TargetType":$target, "::infrt::PrecisionType":$precision, "::infrt::LayoutType":$layout - ); } diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_base.td b/paddle/infrt/dialect/phi/ir/infrt_phi_base.td index 907f912d9e6..e9591e7f6d7 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_base.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_base.td @@ -2,6 +2,7 @@ #define PHI_BASE include "mlir/IR/OpBase.td" +include "paddle/infrt/dialect/infrt_base.td" def PHI_Dialect : Dialect { let name = "phi"; @@ -11,27 +12,28 @@ def PHI_Dialect : Dialect { }]; let cppNamespace = "::infrt::phi"; -} - -class AllocatorTypeOf traits=[]>: - TypeDef { - let summary = !strconcat("!phi.allocator_", place, " type"); -} - -class ContextTypeOf traits=[]>: - TypeDef { - let summary = !strconcat("!phi.context_", place, " type"); + let useDefaultTypePrinterParser = 1; } def PhiOpTrait : NativeOpTrait<"PhiOpTrait">; -def CPU_Allocator : AllocatorTypeOf<"CPU">; -def GPU_Allocator : AllocatorTypeOf<"GPU">; - -def CPU_Context : ContextTypeOf<"CPU">; -def GPU_Context : ContextTypeOf<"GPU">; - -def Allocator : AnyTypeOf<[CPU_Allocator, GPU_Allocator], "Allocator type">; -def Context : AnyTypeOf<[CPU_Context, GPU_Context], "Context type">; +class PHI_Type traits = []> + : TypeDef {} + +def Allocator : PHI_Type<"Allocator"> { + let mnemonic = "allocator"; + let parameters = (ins + TargetParam:$target + ); + let assemblyFormat = "`<` $target `>`"; + } + + def Context : PHI_Type<"Context"> { + let mnemonic = "context"; + let parameters = (ins + TargetParam:$target + ); + let assemblyFormat = "`<` $target `>`"; + } #endif diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td index 39677871ff8..3399c408d9b 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td @@ -23,7 +23,7 @@ class PDT_Op traits = []> : Op : PDT_Op<"create_dense_tensor." # place # "." # dtype # "." 
# layout, [NoSideEffect]> { - let arguments = (ins CPU_Allocator:$allocator, I64ArrayAttr:$dims, I64ArrayAttr:$lod); + let arguments = (ins Allocator:$allocator, I64ArrayAttr:$dims, I64ArrayAttr:$lod); let results = (outs DenseTensor:$output); } @@ -47,13 +47,13 @@ class PrintDenseTensorOp: class CreateCPUAllocatorOp : PDT_Op<"create_allocator." # "cpu", [NoSideEffect]> { let arguments = (ins); - let results = (outs CPU_Allocator:$output); + let results = (outs Allocator:$output); } class CreateCPUContextOp : PDT_Op<"create_context." # "cpu", [NoSideEffect]> { - let arguments = (ins CPU_Allocator:$input); - let results = (outs CPU_Context:$output); + let arguments = (ins Allocator:$input); + let results = (outs Context:$output); } def PDT_CreateDenseTensorOp_cpu_f32_nchw : CreateDenseTensorOp<"cpu", "f32", "nchw">; @@ -63,7 +63,7 @@ def PDT_CreateContextOp_cpu : CreateCPUContextOp; def PDT_PrintDenseTensor_cpu : PrintDenseTensorOp; def FakeKernelOp : PDT_Op<"fake_phi_kernel"> { - let arguments = (ins CPU_Context:$dev_ctx, DenseTensor:$x, DenseTensor:$y, BoolAttr:$transpose_x, BoolAttr:$transpose_y); + let arguments = (ins Context:$dev_ctx, DenseTensor:$x, DenseTensor:$y, BoolAttr:$transpose_x, BoolAttr:$transpose_y); let results = (outs DenseTensor:$output); } diff --git a/paddle/infrt/dialect/phi/ir/phi_base.cc b/paddle/infrt/dialect/phi/ir/phi_base.cc index 7a6b3f3f0a4..d8095d7f3f1 100644 --- a/paddle/infrt/dialect/phi/ir/phi_base.cc +++ b/paddle/infrt/dialect/phi/ir/phi_base.cc @@ -14,6 +14,7 @@ #include "paddle/infrt/dialect/phi/ir/phi_base.h" +#include #include #include #include @@ -27,27 +28,6 @@ namespace infrt { namespace phi { -void PHIDialect::printType(::mlir::Type type, - mlir::DialectAsmPrinter& os) const { - if (type.isa()) { - os << "CPU_Allocator"; - return; - } - if (type.isa()) { - os << "GPU_Allocator"; - return; - } - if (type.isa()) { - os << "CPU_Context"; - return; - } - if (type.isa()) { - os << "GPU_Context"; - return; - } - llvm_unreachable("unexpected 'allocator/context' type kind"); -} - void PHIDialect::initialize() { addOperations< #define GET_OP_LIST @@ -59,24 +39,6 @@ void PHIDialect::initialize() { >(); } -mlir::Type PHIDialect::parseType(mlir::DialectAsmParser& parser) const { - llvm::StringRef keyword; - if (parser.parseKeyword(&keyword)) return mlir::Type(); - if (keyword == "CPU_allocator") { - return CPUAllocatorType::get(parser.getContext()); - } else if (keyword == "GPU_allocator") { - return GPUAllocatorType::get(parser.getContext()); - } else if (keyword == "CPU_context") { - return CPUContextType::get(parser.getContext()); - } else if (keyword == "GPU_context") { - return GPUContextType::get(parser.getContext()); - } else { - llvm_unreachable("unexpected 'allocator/context' type kind"); - } - - return mlir::Type(); -} - } // namespace phi } // namespace infrt diff --git a/paddle/infrt/dialect/phi/ir/phi_base.h b/paddle/infrt/dialect/phi/ir/phi_base.h index a08d8229fcc..0ea1973a733 100644 --- a/paddle/infrt/dialect/phi/ir/phi_base.h +++ b/paddle/infrt/dialect/phi/ir/phi_base.h @@ -18,12 +18,10 @@ #include #include +#include "paddle/infrt/dialect/infrt/common_type.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_baseDialect.h.inc" -#define GET_TYPEDEF_CLASSES -#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseTypes.h.inc" - #define GET_OP_CLASSES #include "paddle/infrt/dialect/phi/ir/infrt_phi_base.h.inc" @@ -41,6 +39,9 @@ class PhiOpTrait : public OpTrait::TraitBase { } // namespace OpTrait } // namespace mlir +#define GET_TYPEDEF_CLASSES 
+#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseTypes.h.inc" + namespace infrt { namespace phi {} // namespace phi } // namespace infrt diff --git a/paddle/infrt/host_context/kernel_frame.cc b/paddle/infrt/host_context/kernel_frame.cc index 14e88be4b96..266c145f478 100644 --- a/paddle/infrt/host_context/kernel_frame.cc +++ b/paddle/infrt/host_context/kernel_frame.cc @@ -30,28 +30,21 @@ std::ostream& operator<<(std::ostream& os, const KernelFrame& frame) { std::string KernelFrame::DumpArgTypes() const { std::stringstream ss; for (auto* value : GetValues(0, GetNumElements())) { - if (value->is_type()) { - ss << "bool (" << &value->get() << "), "; - } else if (value->is_type()) { - ss << "DenseHostTensor(" << &value->get() - << "), "; - } else if (value->is_type()) { - ss << "float(" << &value->get() << "), "; - } else if (value->is_type()) { - ss << "int(" << &value->get() << "), "; - } else if (value->is_type()) { - ss << "phi::DenseTensor(" << &value->get() << "), "; - } else if (value->is_type()) { - ss << "phi::MetaTensor(" << &value->get() << "), "; - } else if (value->is_type<::phi::CPUContext>()) { - ss << "phi::CPUContext(" << &value->get<::phi::CPUContext>() << "), "; - } else if (value->is_type()) { - ss << "none(" << &value->get() << "), "; - } else if (value->is_type()) { - ss << "CpuPhiContext(" << &value->get() << "), "; - } else { - ss << "typeid: " << value->index() << ", "; - } +#define DUMP(type_name) \ + if (value->is_type()) { \ + ss << #type_name << &value->get() << "), "; \ + } + DUMP(bool); + DUMP(tensor::DenseHostTensor); + DUMP(float); + DUMP(int); + DUMP(::phi::DenseTensor); + DUMP(::phi::MetaTensor); + DUMP(::phi::CPUContext); + DUMP(host_context::None); + DUMP(backends::CpuPhiContext); +#undef DUMP + ss << "typeid: " << value->index() << ", "; } return ss.str(); } diff --git a/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h b/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h index a0a5b391ea6..75c9e554778 100644 --- a/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h +++ b/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h @@ -24,26 +24,6 @@ namespace infrt { namespace kernel { -static void FakePhiInferShape(const ::phi::MetaTensor& a, - const ::phi::MetaTensor& b, - bool arg_0, - bool arg_1, - ::phi::MetaTensor* c) { - LOG(INFO) << "the ptr of c: " << c; - LOG(INFO) << "c->numel(): " << c->numel(); -} - -static void FakePhiKernel(const ::phi::CPUContext& /*Context*/, - const ::phi::DenseTensor& a, - const ::phi::DenseTensor& b, - bool arg_0, - bool arg_1, - ::phi::DenseTensor* c) { - std::cout << "@FakePhiKernel@" << std::endl; - LOG(INFO) << "the ptr of c: " << c; - LOG(INFO) << "c->numel(): " << c->numel(); -} - template AddKernel("phi_dt.print_tensor", INFRT_KERNEL(infrt::kernel::phi::PrintDenseTensor)); - registry->AddKernel( - "phi_dt.fake_phi_kernel", - std::bind(&KernelLauncherFunc, - KernelLauncher(), - std::placeholders::_1)); } } // namespace kernel diff --git a/paddle/infrt/tests/dialect/pten/dense_tensor.mlir b/paddle/infrt/tests/dialect/pten/dense_tensor.mlir index 695143c93b3..586af7a9c50 100644 --- a/paddle/infrt/tests/dialect/pten/dense_tensor.mlir +++ b/paddle/infrt/tests/dialect/pten/dense_tensor.mlir @@ -2,11 +2,11 @@ // CHECK-LABEL: @sign_any_float32_execute func @sign_any_float32_execute() { - %allocator = "phi_dt.create_allocator.cpu" (): () -> !phi.CPU_allocator - %ctx = "phi_dt.create_context.cpu" (%allocator): (!phi.CPU_allocator) -> !phi.CPU_context - %t = "phi_dt.create_dense_tensor.cpu.f32.nchw" 
(%allocator) {dims=[1:i64], lod=[1:i64]}: (!phi.CPU_allocator) -> (!infrt.dense_tensor) + %allocator = "phi_dt.create_allocator.cpu" (): () -> !phi.allocator + %ctx = "phi_dt.create_context.cpu" (%allocator): (!phi.allocator) -> !phi.context + %t = "phi_dt.create_dense_tensor.cpu.f32.nchw" (%allocator) {dims=[1:i64], lod=[1:i64]}: (!phi.allocator) -> (!infrt.dense_tensor) "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor) -> () - %e = "phi_cpu.sign.any.float32"(%ctx, %t) : (!phi.CPU_context, !infrt.dense_tensor) -> (!infrt.dense_tensor) + %e = "phi_cpu.sign.any.float32"(%ctx, %t) : (!phi.context, !infrt.dense_tensor) -> (!infrt.dense_tensor) // CHECK: dense_tensor: shape=shape[1], values=[1] "phi_dt.print_tensor" (%e) : (!infrt.dense_tensor) -> () diff --git a/tools/infrt/generate_phi_kernel_dialect.py b/tools/infrt/generate_phi_kernel_dialect.py index 80cf3958b15..8efa03306fb 100644 --- a/tools/infrt/generate_phi_kernel_dialect.py +++ b/tools/infrt/generate_phi_kernel_dialect.py @@ -95,7 +95,7 @@ def generate_inputs_info(input_info): def generate_arguments_info(op_name, input_info, attr_info): input_args = generate_inputs_info(input_info) attr_args = generate_attrs_info(op_name, attr_info) - context_args = "CPU_Context:$dev_ctx" + context_args = "Context:$dev_ctx" argument_ = "{},{},{}".format(context_args, input_args, attr_args) return (("let arguments = (ins {});".format(argument_.strip(",")))) -- GitLab From d8b4022389aaaa9d46acb43f2846ad2b823a6ad7 Mon Sep 17 00:00:00 2001 From: JingZhuangzhuang <75348594+JZZ-NOTE@users.noreply.github.com> Date: Thu, 3 Mar 2022 17:29:56 +0800 Subject: [PATCH 029/261] fix_trt_engine_op_bug (#40067) --- paddle/fluid/inference/api/analysis_predictor.cc | 4 +++- paddle/fluid/inference/api/analysis_predictor.h | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 5492c3b0d26..df61b510319 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -80,6 +80,8 @@ using inference::tensorrt::TRTCalibratorEngine; using inference::tensorrt::TRTCalibratorEngineManager; #endif +int AnalysisPredictor::clone_num_ = 1; + namespace { bool IsPersistable(const framework::VarDesc *var) { if (var->Persistable() && @@ -1633,7 +1635,7 @@ std::unique_ptr AnalysisPredictor::Clone() { std::lock_guard lk(clone_mutex_); auto *x = new AnalysisPredictor(config_); x->Init(scope_, inference_program_); - x->executor_->ResetTrtOps(++x->clone_num_); + x->executor_->ResetTrtOps(++AnalysisPredictor::clone_num_); return std::unique_ptr(x); } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 8ed183dae0b..21a7e9658bb 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -486,7 +486,7 @@ class AnalysisPredictor : public PaddlePredictor { bool status_is_cloned_{false}; std::map>> shape_info_; - int clone_num_{1}; + static int clone_num_; #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ !defined(PADDLE_WITH_ASCEND_CL) -- GitLab From 167d511f074633992764f51c3be416a0d9169ff7 Mon Sep 17 00:00:00 2001 From: TeFeng Chen Date: Thu, 3 Mar 2022 19:01:38 +0800 Subject: [PATCH 030/261] cinn_launch_op: switch to execution by PE (#39911) * swith to PE execution in cinn launch * fix outer variables erased * skip the map bug temporarily for test * 
temporary solution for batch_norm bug * update comment * fix compile error * cinn_instruction_run_op_test: update code to skip external alloc/free instructions generated --- .../framework/paddle2cinn/cinn_compiler.cc | 1 - paddle/fluid/operators/cinn/CMakeLists.txt | 6 +-- .../cinn/cinn_instruction_run_op_test.cc | 2 +- .../operators/cinn/cinn_launch_context.cc | 46 +++++++++++++++++-- .../operators/cinn/cinn_launch_context.h | 10 ++++ paddle/fluid/operators/cinn/cinn_launch_op.h | 21 ++++----- .../operators/cinn/cinn_launch_op_test.cc | 4 ++ 7 files changed, 69 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 706815185a1..c015e90f71e 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -241,7 +241,6 @@ std::unique_ptr CinnCompiler::CompileGraph( std::make_unique(target, scope, cinn_graph); GraphCompiler::CompileOptions options; options.with_instantiate_variables = false; - options.with_buffer_handle_instruction_inserted = true; auto compiled_res = graph_compiler->Build(options, std::move(fetch_ids), stream); auto compiled_obj = std::make_unique(); diff --git a/paddle/fluid/operators/cinn/CMakeLists.txt b/paddle/fluid/operators/cinn/CMakeLists.txt index f1247ebdf23..2092f65212a 100644 --- a/paddle/fluid/operators/cinn/CMakeLists.txt +++ b/paddle/fluid/operators/cinn/CMakeLists.txt @@ -1,9 +1,9 @@ include(operators) cc_library(cinn_op_helper SRCS cinn_op_helper.cc DEPS operator device_context) -cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph build_strategy parallel_executor cinn) +cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph build_strategy device_context parallel_executor cinn) -SET(CINN_OP_DEPS string_helper cinn cinn_compiler cinn_op_helper cinn_launch_context) +SET(CINN_OP_DEPS parallel_executor string_helper cinn cinn_compiler cinn_op_helper cinn_launch_context) register_operators(DEPS ${CINN_OP_DEPS}) if (WITH_TESTING) @@ -11,7 +11,7 @@ if (WITH_TESTING) set_tests_properties(cinn_launch_context_test PROPERTIES LABELS "RUN_TYPE=CINN") SET(CINN_RUN_ENVIRONMENT "OMP_NUM_THREADS=1;runtime_include_dir=${PADDLE_BINARY_DIR}/third_party/CINN/src/external_cinn/cinn/runtime/cuda") - cc_test(cinn_launch_op_test SRCS cinn_launch_op_test.cc DEPS cinn_compiler cinn_launch_op elementwise_add_op) + cc_test(cinn_launch_op_test SRCS cinn_launch_op_test.cc DEPS cinn_compiler cinn_launch_op cinn_instruction_run_op elementwise_add_op gflags) set_tests_properties(cinn_launch_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT "${CINN_RUN_ENVIRONMENT}") cc_test(cinn_instruction_run_op_test SRCS cinn_instruction_run_op_test.cc DEPS cinn_compiler cinn_launch_op cinn_instruction_run_op elementwise_add_op) diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc index 7c4bdc09a56..2afee35112e 100644 --- a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc @@ -50,7 +50,7 @@ TEST(CinnInstructionOpTest, TestWithElementwiseAdd) { auto cinn_instruction_run_op = paddle::framework::OpRegistry::CreateOp( "cinn_instruction_run", {{"X", {"x", "y"}}}, {{"Out", {test_op_out_name}}}, - {{"cached_index", 0}, {"instruction_index", 1}}); + {{"cached_index", 0}, {"instruction_index", 0}}); 
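[editor's note] On the analysis_predictor change a few hunks above (#40067): making clone_num_ a static member means the counter is shared across all predictors, so each Clone() hands ResetTrtOps a distinct index; with the old per-object member every fresh clone started at 1 and ended up with the same index 2. The toy sketch below illustrates only that counter behaviour; PredictorSketch and id() are hypothetical names, not Paddle API.

#include <cstdio>
#include <memory>

class PredictorSketch {
 public:
  std::unique_ptr<PredictorSketch> Clone() {
    auto clone = std::unique_ptr<PredictorSketch>(new PredictorSketch());
    // Shared (static) counter: every clone receives a distinct id.
    clone->id_ = ++clone_num_;
    return clone;
  }
  int id() const { return id_; }

 private:
  static int clone_num_;  // was a per-object member before the fix
  int id_ = 1;
};

int PredictorSketch::clone_num_ = 1;

int main() {
  PredictorSketch root;
  auto a = root.Clone();
  auto b = root.Clone();
  // With a per-object counter both clones would report 2; with the
  // static counter they report 2 and 3.
  std::printf("%d %d\n", a->id(), b->id());
  return 0;
}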
auto elementwise_add_op = paddle::framework::OpRegistry::CreateOp( "elementwise_add", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {add_op_out_name}}}, {{}}); diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc index 0a21d937aa1..b76dd604092 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc @@ -31,6 +31,7 @@ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/operators/cinn/cinn_op_helper.h" +#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/core/ddim.h" @@ -90,9 +91,30 @@ CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph, // Convert the CINN runtime program to a Paddle graph runtime_graph_ = std::make_unique( BuildCompiledProgram(graph, compiled_obj)); - runtime_graph_->SetNotOwned( - kMemOptVarInfoFromMainGraph, - &graph.Get(kMemOptVarInfoFromMainGraph)); + auto& outer_varinfo = graph.Get(kMemOptVarInfoFromMainGraph); + runtime_graph_->SetNotOwned(kMemOptVarInfoFromMainGraph, + &outer_varinfo); + // collect skip_eager_vars + skip_eager_vars_.reserve(input_var_names.size() + output_var_names.size()); + auto add_skip_var_fn = [&outer_varinfo, this](const std::string& var_name) { + // if a var exists at outer_varinfo map, + // that means it can be erased after graph execution + if (!outer_varinfo.count(var_name)) { + skip_eager_vars_.emplace_back(var_name); + } + }; + std::for_each(input_var_names.begin(), input_var_names.end(), + add_skip_var_fn); + std::for_each(output_var_names.begin(), output_var_names.end(), + add_skip_var_fn); + VLOG(4) << string::Sprintf( + "Distribution of variables in the graph compiled:" + "input[%lu],internal[%lu],output[%lu]," + "outer_eager_deletion[%lu],skip_eager_deletion[%lu]," + "initialized_beforehand[%lu]", + input_var_names.size(), internal_var_names_.size(), + output_var_names.size(), outer_varinfo.size(), skip_eager_vars_.size(), + initialized_beforehand_vars_.size()); } void CinnLaunchContext::BuildVarNameMap( @@ -288,6 +310,7 @@ framework::ProgramDesc CinnLaunchContext::BuildCompiledProgram( // are set by values of the corresponding compiled tensors, // including the in/out variables where the equiality between their tensors // and the CINN compiled ones is verified in corresponding cinn_launch_op. 
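[editor's note] The CinnLaunchContext constructor change above classifies each in/out variable: names already tracked by the outer graph's MemOptVarInfo map can be erased by the outer executor, and only the remaining names go into skip_eager_vars_. A host-only sketch of that classification with standard containers (CollectSkipEagerVars is a hypothetical helper standing in for the add_skip_var_fn lambda):

#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

// Variables present in the outer map can be erased by the outer executor;
// everything else must have eager deletion skipped inside the runtime graph.
static std::vector<std::string> CollectSkipEagerVars(
    const std::vector<std::string>& io_vars,
    const std::unordered_map<std::string, int>& outer_varinfo) {
  std::vector<std::string> skip_vars;
  skip_vars.reserve(io_vars.size());
  for (const auto& name : io_vars) {
    if (outer_varinfo.count(name) == 0) {
      skip_vars.push_back(name);
    }
  }
  return skip_vars;
}

int main() {
  std::unordered_map<std::string, int> outer = {{"x", 0}, {"y", 0}};
  std::vector<std::string> io = {"x", "y", "out"};
  for (const auto& v : CollectSkipEagerVars(io, outer)) {
    std::printf("skip eager deletion for: %s\n", v.c_str());  // out
  }
  return 0;
}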
+ std::unordered_set has_refer_vars; for (auto&& arg : cinn_argument_names_) { const std::string& var_name = cinn2paddle_varmap_.at(arg); framework::VarDesc* var_desc = block->Var(var_name); @@ -298,6 +321,7 @@ framework::ProgramDesc CinnLaunchContext::BuildCompiledProgram( auto* ori_desc = res->second; var_desc->SetPersistable(ori_desc->Persistable()); var_desc->SetIsParameter(ori_desc->IsParameter()); + has_refer_vars.insert(var_name); } auto cinn_tensor = GetCinnTensorOfVar(var_name); @@ -331,6 +355,12 @@ framework::ProgramDesc CinnLaunchContext::BuildCompiledProgram( auto* ins = instructions.at(ins_idx).get(); auto in_args = trans_and_pack_args_fn(ins->GetInArgs()); auto out_args = trans_and_pack_args_fn(ins->GetOutArgs()); + for (auto&& var_name : in_args) { + if (!has_refer_vars.count(var_name)) { + initialized_beforehand_vars_.emplace_back(var_name); + } + } + has_refer_vars.insert(out_args.begin(), out_args.end()); auto* op_desc = block->AppendOp(); op_desc->SetType("cinn_instruction_run"); @@ -348,16 +378,26 @@ ParallelExecutor* CinnLaunchContext::InitializePE(const platform::Place& place, framework::Scope* scope) { if (!parallel_executor_) { framework::details::ExecutionStrategy exec_strategy; + exec_strategy.num_threads_ = 1; + exec_strategy.use_device_ = platform::Place2DeviceType(place); framework::details::BuildStrategy build_strategy; parallel_executor_ = std::make_unique( place, scope, exec_strategy, build_strategy, runtime_graph_.get()); } // update the scope bound to an OpHandle and rebuild temporary variables + VLOG(4) << "Reset scope and initialize temporary variables"; std::unordered_map scope_map = { {parallel_executor_->GetLocalScopes().front(), scope}}; parallel_executor_->ResetOpHandleScopeMapOfGraphs(scope_map); parallel_executor_->PrepareVariables(scope); + for (auto&& var_name : initialized_beforehand_vars_) { + auto* var = scope->GetVar(var_name); + auto* buffer = GetCinnBufferOfVar(var_name); + auto dim = framework::DDim(buffer->dims, buffer->dimensions); + var->GetMutable()->Resize(dim); + var->GetMutable()->mutable_data(place); + } return parallel_executor_.get(); } diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.h b/paddle/fluid/operators/cinn/cinn_launch_context.h index a4d613ea618..ed5e4383d83 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.h +++ b/paddle/fluid/operators/cinn/cinn_launch_context.h @@ -86,6 +86,11 @@ class CinnLaunchContext { void CheckTensorEquivalent(const std::string& var_name, const framework::LoDTensor& paddle_tensor); + // Return the name list of variables skipped eager deletion + const std::vector& GetSkipEagerVars() const { + return skip_eager_vars_; + } + // Return internal variable names list const std::unordered_set& GetInternalVarNames() const { return internal_var_names_; @@ -143,6 +148,9 @@ class CinnLaunchContext { std::unordered_set internal_var_names_; // the names of the cinn arguments used in compiled executable program std::unordered_set cinn_argument_names_; + // TODO(CtfGo): remove this list after fixing batch_norm bug + // due to duplicate association in the same variable. 
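[editor's note] The has_refer_vars bookkeeping above is essentially a def-before-use scan: walking the generated cinn_instruction_run ops in order, any input that is neither a referenced graph variable nor the output of an earlier instruction must be allocated ahead of time, which is what initialized_beforehand_vars_ later drives in InitializePE. A standalone sketch of that scan (InstructionSketch and VarsNeedingInit are hypothetical names):

#include <cstdio>
#include <string>
#include <unordered_set>
#include <vector>

struct InstructionSketch {  // models an instruction's in/out argument lists
  std::vector<std::string> ins;
  std::vector<std::string> outs;
};

// Inputs that neither a graph-level variable nor an earlier instruction
// provides have to be created and allocated before the first run.
static std::vector<std::string> VarsNeedingInit(
    const std::vector<InstructionSketch>& instructions,
    std::unordered_set<std::string> produced /* seeded with graph vars */) {
  std::vector<std::string> beforehand;
  for (const auto& ins : instructions) {
    for (const auto& in : ins.ins) {
      if (produced.count(in) == 0) beforehand.push_back(in);
    }
    produced.insert(ins.outs.begin(), ins.outs.end());
  }
  return beforehand;
}

int main() {
  std::vector<InstructionSketch> prog = {{{"x", "w"}, {"tmp"}},
                                         {{"tmp", "scale"}, {"out"}}};
  for (const auto& v : VarsNeedingInit(prog, {"x", "out"})) {
    std::printf("initialize beforehand: %s\n", v.c_str());  // w, scale
  }
  return 0;
}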
+ std::vector initialized_beforehand_vars_; // the variable scope compiled from cinn const std::shared_ptr cinn_scope_; @@ -150,6 +158,8 @@ class CinnLaunchContext { std::unique_ptr runtime_graph_; // a ParallelExecutor to execute the runtime graph std::unique_ptr parallel_executor_; + // the name list of skip_eager_vars in runtime + std::vector skip_eager_vars_; // because a cinn_pod_value_t does not own a cinn_buffer_t object, // an extra stroage is necessary to keep those objects and they can diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h index cf3b98c6679..5263aae03ed 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn/cinn_launch_op.h @@ -103,8 +103,8 @@ class CinnLaunchOpKernel : public framework::OpKernel { details::DebugCinnCompiledResult(cinn_compiled_object); auto* launch_context = cinn_compiled_object.launch_context.get(); - // Step 3. Prepare arguments needed for the compiled executable program. - launch_context->UpdateCapturedEnv(scope, place); + // Step 3. check the computational consistency of the subgraph + // before and after the compilation // 3.1 Input variables: tensors of input variables have // been initialized before graph compiled, just check the // equiality between tensors of paddle and cinn. @@ -120,20 +120,15 @@ class CinnLaunchOpKernel : public framework::OpKernel { *inputs_name2tensor.at(var_name)); } - // 3.2 Output variables: the output variables will be initialized - // and allocated buffer in callbacks which are defined in the - // external_malloc/free interface of cinn_buffer_t - // in their corresponding arguments. - // 3.3 Internal variables: A temporary scope is created in - // UpdateCapturedEnv to keep the internal variables and - // they are also initialized through callbacks - // Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic. details::SetCinnRuntimeFlags(); - // Step 5. Launch CINN to execute the compiled executable program - VLOG(4) << "Run Cinn compiled executable program with stream: " << stream; - details::LaunchCinnExecution(cinn_compiled_object, *launch_context, stream); + // Step 5. use PE to execute the compiled CINN instructions + // in nodes of the runtime graph + VLOG(4) << "Execute the runtime graph by PE"; + framework::Scope& exec_scope = scope.NewScope(); + auto* pe = launch_context->InitializePE(place, &exec_scope); + pe->RunWithoutFetch(launch_context->GetSkipEagerVars()); VLOG(4) << "CinnLaunchOp launch execution done."; } }; diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc index f5b6161ff34..460d417e61f 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" @@ -27,7 +28,9 @@ limitations under the License. 
*/ #include "paddle/phi/core/ddim.h" USE_OP(cinn_launch); +USE_OP(cinn_instruction_run); USE_OP_ITSELF(elementwise_add); +DECLARE_double(eager_delete_tensor_gb); namespace paddle::operators { @@ -61,6 +64,7 @@ TEST(CinnLaunchOpTest, TestWithElementwiseAdd) { CompareOpResult(scope.GetVar(test_op_out_name), scope.GetVar(add_op_out_name)); }; + FLAGS_eager_delete_tensor_gb = -1; // CPU run_and_check_fn(platform::CPUPlace()); -- GitLab From 1c2058834367464b4a293dbb58b6fa2137c24cc5 Mon Sep 17 00:00:00 2001 From: 0x45f <23097963+0x45f@users.noreply.github.com> Date: Thu, 3 Mar 2022 20:23:51 +0800 Subject: [PATCH 031/261] move eye, lerp infershape to phi (#40105) --- paddle/fluid/operators/eye_op.cc | 26 ++++-------- paddle/fluid/operators/lerp_op.cc | 50 +++-------------------- paddle/phi/infermeta/nullary.cc | 8 ++++ paddle/phi/infermeta/nullary.h | 5 +++ paddle/phi/infermeta/ternary.cc | 17 ++++++++ paddle/phi/infermeta/ternary.h | 5 +++ paddle/phi/kernels/eye_kernel.h | 2 +- paddle/phi/kernels/funcs/common_shape.h | 25 ++++++++++++ paddle/phi/kernels/impl/eye_kernel_impl.h | 2 +- 9 files changed, 75 insertions(+), 65 deletions(-) diff --git a/paddle/fluid/operators/eye_op.cc b/paddle/fluid/operators/eye_op.cc index 8f8a0f174a7..f8c6b4eb8c5 100644 --- a/paddle/fluid/operators/eye_op.cc +++ b/paddle/fluid/operators/eye_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/nullary.h" namespace paddle { namespace operators { @@ -21,24 +24,6 @@ class EyeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of EyeOP should not be null.")); - auto num_rows = ctx->Attrs().Get("num_rows"); - PADDLE_ENFORCE_EQ( - num_rows >= 0, true, - platform::errors::InvalidArgument( - "The value of Input(num_rows) should be non-negative int.")); - auto num_columns = ctx->Attrs().Get("num_columns"); - if (num_columns == -1) num_columns = num_rows; - PADDLE_ENFORCE_EQ( - num_columns >= 0, true, - platform::errors::InvalidArgument( - "The value of Input(num_columns) should be non-negative int.")); - ctx->SetOutputDim("Out", {num_rows, num_columns}); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -82,8 +67,11 @@ Return an identity tensor whose shape is [num_rows, num_columns]. } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(eye, EyeInferShapeFunctor, + PT_INFER_META(phi::EyeInferMeta)); REGISTER_OPERATOR( eye, ops::EyeOp, ops::EyeOpMaker, ops::EyeOpVarTypeInference, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + EyeInferShapeFunctor); diff --git a/paddle/fluid/operators/lerp_op.cc b/paddle/fluid/operators/lerp_op.cc index 0aaefc7ca75..fef6fc5319e 100644 --- a/paddle/fluid/operators/lerp_op.cc +++ b/paddle/fluid/operators/lerp_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
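[editor's note] The eye_op.cc change above (and the lerp change that follows) uses the same migration pattern: the operator's hand-written InferShape is removed and the op registers an infer-shape functor bound to a phi InferMeta function, phi::EyeInferMeta in this case, whose implementation appears in nullary.cc below. The reduced standalone model below shows what such a shape-only InferMeta does for eye; MetaTensorSketch and EyeInferMetaSketch are hypothetical stand-ins, and the non-negativity assert reflects the check the deleted fluid InferShape performed rather than the new phi code.

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

// Minimal stand-in for phi::MetaTensor: shape metadata only, no data.
struct MetaTensorSketch {
  std::vector<int64_t> dims;
};

// num_columns defaults to num_rows when passed as -1, matching the rule
// in phi::EyeInferMeta below.
static void EyeInferMetaSketch(int64_t num_rows, int64_t num_columns,
                               MetaTensorSketch* out) {
  assert(num_rows >= 0 && "num_rows must be non-negative");
  if (num_columns == -1) num_columns = num_rows;
  out->dims = {num_rows, num_columns};
}

int main() {
  MetaTensorSketch out;
  EyeInferMetaSketch(3, -1, &out);
  std::printf("[%lld, %lld]\n", static_cast<long long>(out.dims[0]),
              static_cast<long long>(out.dims[1]));  // [3, 3]
  return 0;
}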
+#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -20,49 +23,6 @@ namespace operators { class LerpOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "lerp"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "lerp"); - OP_INOUT_CHECK(ctx->HasInput("Weight"), "Input", "Weight", "lerp"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "lerp"); - - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - auto w_dims = ctx->GetInputDim("Weight"); - framework::DDim out_dims; - out_dims = GetOutputDims(x_dims, y_dims); - if (w_dims.size() > 1 || w_dims[0] != 1) { - out_dims = GetOutputDims(out_dims, w_dims); - } - - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } - - private: - framework::DDim GetOutputDims(const framework::DDim& s_dims, - const framework::DDim& l_dims) const { - if (s_dims.size() > l_dims.size()) { - return GetOutputDims(l_dims, s_dims); - } - std::vector shapes = phi::vectorize(l_dims); - for (int i = s_dims.size() - 1, j = l_dims.size() - 1; i >= 0; --i, --j) { - int64_t s = s_dims[i]; - int64_t l = l_dims[j]; - if (s != l) { - if (l == 1) { - shapes[j] = s; - } else if (s != 1) { - PADDLE_THROW(platform::errors::InvalidArgument( - "The shape of tensor a %s:%d must match shape of tensor b " - "%s:%d.", - s_dims.to_str(), i, l_dims.to_str(), j)); - } - } - } - return phi::make_ddim(shapes); - } }; class LerpOpMaker : public framework::OpProtoAndCheckerMaker { @@ -125,10 +85,12 @@ DECLARE_INPLACE_OP_INFERER(LerpInplaceInferer, {"X", "Out"}); } // namespace operators } // namespace paddle +DELCARE_INFER_SHAPE_FUNCTOR(lerp, LerpInferShapeFunctor, + PT_INFER_META(phi::LerpInferMeta)); REGISTER_OPERATOR( lerp, paddle::operators::LerpOp, paddle::operators::LerpOpMaker, paddle::operators::LerpOpGradMaker, paddle::operators::LerpOpGradMaker, - paddle::operators::LerpInplaceInferer); + paddle::operators::LerpInplaceInferer, LerpInferShapeFunctor); REGISTER_OPERATOR(lerp_grad, paddle::operators::LerpGradOp); diff --git a/paddle/phi/infermeta/nullary.cc b/paddle/phi/infermeta/nullary.cc index 1fdf8a6940a..0c48c9d0c7e 100644 --- a/paddle/phi/infermeta/nullary.cc +++ b/paddle/phi/infermeta/nullary.cc @@ -32,4 +32,12 @@ void CreateInferMeta(const ScalarArray& shape, CreateInferMetaBase(shape.GetData(), dtype, DataLayout::NCHW, out); } +void EyeInferMeta(int64_t num_rows, + int64_t num_columns, + DataType dtype, + MetaTensor* out) { + if (num_columns == -1) num_columns = num_rows; + out->set_dims({num_rows, num_columns}); + out->set_dtype(dtype); +} } // namespace phi diff --git a/paddle/phi/infermeta/nullary.h b/paddle/phi/infermeta/nullary.h index ea5bb71551b..40d6ea595c0 100644 --- a/paddle/phi/infermeta/nullary.h +++ b/paddle/phi/infermeta/nullary.h @@ -35,4 +35,9 @@ void CreateInferMetaBase(const std::vector& shape, void CreateInferMeta(const ScalarArray& shape, DataType dtype, MetaTensor* out); +void EyeInferMeta(int64_t num_rows, + int64_t num_columns, + DataType dtype, + MetaTensor* out); + } // namespace phi diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index 52aeaef8438..1c1497fb0e4 100644 --- a/paddle/phi/infermeta/ternary.cc +++ 
b/paddle/phi/infermeta/ternary.cc @@ -89,4 +89,21 @@ void AddmmInferMeta(const MetaTensor& input, out->set_dtype(input.dtype()); } +void LerpInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& weight, + MetaTensor* out) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + auto w_dims = weight.dims(); + DDim out_dims; + out_dims = funcs::GetOutputDims(x_dims, y_dims); + if (w_dims.size() > 1 || w_dims[0] != 1) { + out_dims = funcs::GetOutputDims(out_dims, w_dims); + } + out->set_dims(out_dims); + out->set_dtype(x.dtype()); + out->share_lod(x); +} + } // namespace phi diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index d6223dd87aa..5679c5b533f 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -37,4 +37,9 @@ void AddmmInferMeta(const MetaTensor& input, float beta, MetaTensor* out); +void LerpInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& weight, + MetaTensor* out); + } // namespace phi diff --git a/paddle/phi/kernels/eye_kernel.h b/paddle/phi/kernels/eye_kernel.h index 8b21b8ae405..e9e1abffd14 100644 --- a/paddle/phi/kernels/eye_kernel.h +++ b/paddle/phi/kernels/eye_kernel.h @@ -22,7 +22,7 @@ template void EyeKernel(const Context& ctx, int64_t num_rows, int64_t num_columns, - int dtype, + DataType dtype, DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/funcs/common_shape.h b/paddle/phi/kernels/funcs/common_shape.h index d5289dcc22c..dce80caab72 100644 --- a/paddle/phi/kernels/funcs/common_shape.h +++ b/paddle/phi/kernels/funcs/common_shape.h @@ -140,5 +140,30 @@ inline bool CheckDims(const DDim &dims_x, const DDim &dims_y) { return true; } +inline DDim GetOutputDims(const DDim &s_dims, const DDim &l_dims) { + if (s_dims.size() > l_dims.size()) { + return GetOutputDims(l_dims, s_dims); + } + std::vector shapes = phi::vectorize(l_dims); + for (int i = s_dims.size() - 1, j = l_dims.size() - 1; i >= 0; --i, --j) { + int64_t s = s_dims[i]; + int64_t l = l_dims[j]; + if (s != l) { + if (l == 1) { + shapes[j] = s; + } else if (s != 1) { + PADDLE_THROW(errors::InvalidArgument( + "The shape of tensor a %s:%d must match shape of tensor b " + "%s:%d.", + s_dims.to_str(), + i, + l_dims.to_str(), + j)); + } + } + } + return phi::make_ddim(shapes); +} + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/impl/eye_kernel_impl.h b/paddle/phi/kernels/impl/eye_kernel_impl.h index 453652273a2..f4041f921fd 100644 --- a/paddle/phi/kernels/impl/eye_kernel_impl.h +++ b/paddle/phi/kernels/impl/eye_kernel_impl.h @@ -36,7 +36,7 @@ template void EyeKernel(const Context& ctx, int64_t num_rows, int64_t num_columns, - int dtype, + DataType dtype, DenseTensor* out) { auto num = num_columns; if (num == -1) { -- GitLab From e7aea6507bd33eb8285fa822d1ea2fb4afb8d8af Mon Sep 17 00:00:00 2001 From: Ligoml Date: Thu, 3 Mar 2022 14:55:50 +0800 Subject: [PATCH 032/261] update README --- README.md | 2 +- README_cn.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7dc83aa695c..cdbf2d9f3bf 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ English | [简体中文](./README_cn.md) Welcome to the PaddlePaddle GitHub. PaddlePaddle, as the only independent R&D deep learning platform in China, has been officially open-sourced to professional communities since 2016. 
It is an industrial platform with advanced technologies and rich features that cover core deep learning frameworks, basic model libraries, end-to-end development kits, tools & components as well as service platforms. -PaddlePaddle is originated from industrial practices with dedication and commitments to industrialization. It has been widely adopted by a wide range of sectors including manufacturing, agriculture, enterprise service, and so on while serving more than 2.3 million developers. With such advantages, PaddlePaddle has helped an increasing number of partners commercialize AI. +PaddlePaddle is originated from industrial practices with dedication and commitments to industrialization. It has been widely adopted by a wide range of sectors including manufacturing, agriculture, enterprise service, and so on while serving more than 4 million developers. With such advantages, PaddlePaddle has helped an increasing number of partners commercialize AI. diff --git a/README_cn.md b/README_cn.md index 6b37cfd97b3..3834ee148f9 100644 --- a/README_cn.md +++ b/README_cn.md @@ -15,7 +15,7 @@ 欢迎来到 PaddlePaddle GitHub -飞桨(PaddlePaddle)以百度多年的深度学习技术研究和业务应用为基础,是中国首个自主研发、功能完备、 开源开放的产业级深度学习平台,集深度学习核心训练和推理框架、基础模型库、端到端开发套件和丰富的工具组件于一体。目前,飞桨累计开发者265万,服务企业10万家,基于飞桨开源深度学习平台产生了34万个模型。飞桨助力开发者快速实现AI想法,快速上线AI业务。帮助越来越多的行业完成AI赋能,实现产业智能化升级。 +飞桨(PaddlePaddle)以百度多年的深度学习技术研究和业务应用为基础,是中国首个自主研发、功能完备、 开源开放的产业级深度学习平台,集深度学习核心训练和推理框架、基础模型库、端到端开发套件和丰富的工具组件于一体。目前,飞桨累计开发者406万,服务企业15.7万家,基于飞桨开源深度学习平台产生了47.6万个模型。飞桨助力开发者快速实现AI想法,快速上线AI业务。帮助越来越多的行业完成AI赋能,实现产业智能化升级。 ## 安装 -- GitLab From eaacf8bfee5c9583f7ebf0deff20b90db9d73478 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 3 Mar 2022 21:31:40 +0800 Subject: [PATCH 033/261] fix save_vars bugs (#40062) --- paddle/fluid/operators/save_combine_op.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc index 6da73c99068..7fe6623dcca 100644 --- a/paddle/fluid/operators/save_combine_op.cc +++ b/paddle/fluid/operators/save_combine_op.cc @@ -38,7 +38,8 @@ class SaveCombineOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, const Tensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { - return expected_kernel_type; + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place()); } }; -- GitLab From d50fb43e66c5c0c1a5f6b95229f39858ad07e7b5 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Fri, 4 Mar 2022 09:05:37 +0800 Subject: [PATCH 034/261] Move conv to pten (#39354) * move conv to pten * move conv to pten; test=develop * fix bug; * add conv cudnn impl; test=develop * update * update operator; test=develop * fix bug; test=develop * move operator and prepared_operator to develop; test=develop * resolve conflict; test=develop * remove useless code;test=develop * add depency ; test=develop * fix bug; * add sig.cc ; test=develop * fix use_op error; test=develop * fix bug; test=develop * fix bug; test=develop * add conv3d register; test=develop * fix star gan and conv_nn_grad test failed; test=develop * add header; test=develop * manul to recover to develop; * resolve confilct; test=develop * remove useless code * fix bug; * remove conv2d_cudnn; test=develop * fix bugs; test=develop * fix cpu rocm compile bugs; test=develop * fix blas error; test=develop * fix compile bug; test=develop * fix windows compile error; test=develop * fix windows 
error; test=develop * resolve confilct; test=develop --- .../tensorrt/convert/test_conv2d_op.cc | 2 +- paddle/fluid/operators/conv_cudnn_helper.h | 29 +- paddle/fluid/operators/conv_cudnn_op.cu | 1478 ----------------- paddle/fluid/operators/conv_miopen_helper.h | 21 +- paddle/fluid/operators/conv_op.cc | 52 +- paddle/fluid/operators/conv_op.cu.cc | 43 - paddle/fluid/operators/conv_op.h | 813 --------- .../operators/conv_transpose_cudnn_op.cu | 62 +- paddle/fluid/operators/conv_transpose_op.cu | 140 ++ paddle/fluid/operators/conv_transpose_op.h | 126 -- .../operators/fused/cudnn_norm_conv_test.cc | 9 +- paddle/fluid/operators/math/depthwise_conv.h | 72 - paddle/fluid/operators/math/vol2col.cc | 227 +++ .../operators/mkldnn/test_mkldnn_caching.cc | 4 +- paddle/phi/core/compat/arg_map_context.h | 1 + paddle/phi/kernels/CMakeLists.txt | 2 +- paddle/phi/kernels/conv_grad_grad_kernel.h | 61 + paddle/phi/kernels/conv_grad_kernel.h | 73 + paddle/phi/kernels/conv_kernel.h | 67 + .../phi/kernels/cpu/conv_grad_grad_kernel.cc | 72 + paddle/phi/kernels/cpu/conv_grad_kernel.cc | 103 ++ paddle/phi/kernels/cpu/conv_kernel.cc | 92 + paddle/phi/kernels/cpu/conv_util.h | 91 + .../phi/kernels/depthwise_conv_grad_kernel.h | 19 + paddle/phi/kernels/depthwise_conv_kernel.h | 19 + paddle/phi/kernels/funcs/batch_norm_utils.h | 143 ++ paddle/phi/kernels/funcs/padding.h | 4 +- .../phi/kernels/gpu/conv_grad_grad_kernel.cu | 23 + paddle/phi/kernels/gpu/conv_grad_kernel.cu | 62 + paddle/phi/kernels/gpu/conv_kernel.cu | 56 + paddle/phi/kernels/gpu/conv_test_kernel.cu | 13 + .../kernels/gpu/depthwise_conv.h} | 798 ++++++--- .../kernels/gpu/depthwise_conv_grad_kernel.cu | 142 ++ .../phi/kernels/gpu/depthwise_conv_kernel.cu | 130 ++ .../gpudnn/conv_grad_grad_kernel_gpudnn.cu | 834 ++++++++++ .../kernels/gpudnn/conv_grad_kernel_gpudnn.cu | 683 ++++++++ .../phi/kernels/gpudnn/conv_kernel_gpudnn.cu | 476 ++++++ paddle/phi/kernels/impl/conv_cudnn_impl.h | 90 + .../kernels/impl/conv_grad_grad_kernel_impl.h | 330 ++++ .../phi/kernels/impl/conv_grad_kernel_impl.h | 257 +++ paddle/phi/kernels/impl/conv_kernel_impl.h | 183 ++ paddle/phi/ops/compat/conv2d_sig.cc | 70 + paddle/phi/ops/compat/conv3d_sig.cc | 70 + paddle/phi/ops/compat/depthwise_conv2d_sig.cc | 77 + .../tests/unittests/test_conv1d_layer.py | 1 + .../tests/unittests/test_conv2d_layer.py | 2 + .../fluid/tests/unittests/test_conv2d_op.py | 2 +- .../fluid/tests/unittests/test_conv3d_op.py | 2 + .../tests/unittests/test_conv_nn_grad.py | 6 +- .../tests/unittests/test_functional_conv2d.py | 1 + .../tests/unittests/test_functional_conv3d.py | 1 + .../test_fuse_relu_depthwise_conv_pass.py | 1 + ...perative_star_gan_with_gradient_penalty.py | 1 + 53 files changed, 5300 insertions(+), 2836 deletions(-) delete mode 100644 paddle/fluid/operators/conv_cudnn_op.cu delete mode 100644 paddle/fluid/operators/conv_op.cu.cc delete mode 100644 paddle/fluid/operators/math/depthwise_conv.h create mode 100644 paddle/phi/kernels/conv_grad_grad_kernel.h create mode 100644 paddle/phi/kernels/conv_grad_kernel.h create mode 100644 paddle/phi/kernels/conv_kernel.h create mode 100644 paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/conv_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/conv_kernel.cc create mode 100644 paddle/phi/kernels/cpu/conv_util.h create mode 100644 paddle/phi/kernels/depthwise_conv_grad_kernel.h create mode 100644 paddle/phi/kernels/depthwise_conv_kernel.h create mode 100644 paddle/phi/kernels/funcs/batch_norm_utils.h create 
mode 100644 paddle/phi/kernels/gpu/conv_grad_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/conv_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/conv_kernel.cu create mode 100644 paddle/phi/kernels/gpu/conv_test_kernel.cu rename paddle/{fluid/operators/math/depthwise_conv.cu => phi/kernels/gpu/depthwise_conv.h} (62%) create mode 100644 paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/depthwise_conv_kernel.cu create mode 100644 paddle/phi/kernels/gpudnn/conv_grad_grad_kernel_gpudnn.cu create mode 100644 paddle/phi/kernels/gpudnn/conv_grad_kernel_gpudnn.cu create mode 100644 paddle/phi/kernels/gpudnn/conv_kernel_gpudnn.cu create mode 100644 paddle/phi/kernels/impl/conv_cudnn_impl.h create mode 100644 paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/conv_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/conv_kernel_impl.h create mode 100644 paddle/phi/ops/compat/conv2d_sig.cc create mode 100644 paddle/phi/ops/compat/conv3d_sig.cc create mode 100644 paddle/phi/ops/compat/depthwise_conv2d_sig.cc diff --git a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc index 95916746d6f..b96992ef851 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" -USE_OP(conv2d); +USE_OP_ITSELF(conv2d); USE_OP(conv2d_transpose); namespace paddle { diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 3bbb284ca82..4e6fda3d09a 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -26,6 +26,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_context.h" namespace paddle { namespace operators { @@ -53,12 +54,11 @@ static inline void GetNCDHW(const framework::DDim& dims, } template -static void RemovePaddingSlice(const framework::ExecutionContext& context, +static void RemovePaddingSlice(const phi::GPUContext& context, const Tensor* input, Tensor* out, const std::vector& starts, const std::vector& axes) { - auto& place = - *context.template device_context().eigen_device(); + auto& place = *context.eigen_device(); auto in_dims = input->dims(); auto new_out_dims = out->dims(); auto offsets = Eigen::DSizes(); @@ -171,11 +171,10 @@ void ChooseAlgo(const std::vector& perf_results, using framework::ConvSearchCache; -static void SetConvMathType(const framework::ExecutionContext& ctx, - cudnnDataType_t dtype, +static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype, const platform::ConvolutionDescriptor& cdesc) { #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx; if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_TENSOR_OP_MATH)); @@ -231,8 +230,7 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, - const framework::ExecutionContext& ctx) { + bool deterministic, const phi::GPUContext& ctx) { auto dtype = platform::CudnnDataType::type; bool has_got_workspace_size = true; size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; @@ -284,8 +282,7 @@ struct SearchAlgorithm { } else if (deterministic) { algo = static_cast(1); } else { - auto& dev_ctx = - ctx.template device_context(); + auto& dev_ctx = ctx; auto workspace_handle = dev_ctx.cudnn_workspace_handle(); AlgorithmsCache& algo_cache = @@ -346,8 +343,7 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, - const framework::ExecutionContext& ctx) { + bool deterministic, const phi::GPUContext& ctx) { auto dtype = platform::CudnnDataType::type; size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; size_t workspace_size = 0; @@ -413,8 +409,7 @@ struct SearchAlgorithm { } else if (deterministic) { return CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; } else { - auto& dev_ctx = - ctx.template device_context(); + auto& dev_ctx = ctx; auto workspace_handle = dev_ctx.cudnn_workspace_handle(); AlgorithmsCache& algo_cache = @@ -478,8 +473,7 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, - const framework::ExecutionContext& ctx) { + bool deterministic, const phi::GPUContext& ctx) { platform::CUDAGraphCaptureModeGuard guard; auto dtype = platform::CudnnDataType::type; size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; @@ -534,8 +528,7 @@ struct SearchAlgorithm { } else if (deterministic) { return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; } else { - auto& dev_ctx = - ctx.template device_context(); + auto& dev_ctx = ctx; auto workspace_handle = dev_ctx.cudnn_workspace_handle(); AlgorithmsCache& algo_cache = *(framework::ConvSearchCache::Instance().GetBackwardFilter()); diff --git 
a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu deleted file mode 100644 index 2055bf560e6..00000000000 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ /dev/null @@ -1,1478 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the spopecific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/memory/memory.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/operators/conv_miopen_helper.h" -#else -#include "paddle/fluid/operators/conv_cudnn_helper.h" -#endif -#include "paddle/fluid/operators/conv_op.h" -#include "paddle/fluid/platform/cudnn_workspace_helper.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" -#include "paddle/phi/kernels/funcs/padding.h" - -DECLARE_bool(cudnn_deterministic); -DECLARE_uint64(conv_workspace_size_limit); -DECLARE_bool(cudnn_exhaustive_search); - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; -using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; -using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; -using DataLayout = platform::DataLayout; - -static inline bool IsVoltaOrLater(const platform::CUDADeviceContext& dev_ctx) { - return dev_ctx.GetComputeCapability() >= 70; -} - -template -class CUDNNConvOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - const Tensor* input = ctx.Input("Input"); - auto* filter = ctx.Input("Filter"); - auto* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - const std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - - bool exhaustive_search = - FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") && - ctx.Attr("exhaustive_search")); - bool deterministic = FLAGS_cudnn_deterministic; - auto exhaustive_deterministic = exhaustive_search && deterministic; - PADDLE_ENFORCE_EQ(exhaustive_deterministic, false, - platform::errors::InvalidArgument( - "Cann't set exhaustive_search True and " - "FLAGS_cudnn_deterministic True at same time.")); - - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - auto dtype = platform::CudnnDataType::type; - -#ifdef PADDLE_WITH_HIP - // HIP MIOPEN ONLY SUPPORT NCHW format - auto compute_format = DataLayout::kNCHW; -#else - // Tensor Core introduced 
from Volta GPUs supports more faster conv op - // with FP16 in NHWC data format. - const bool compute_in_nhwc = - dtype == CUDNN_DATA_HALF && IsVoltaOrLater(dev_ctx); - // We will only do data format conversion from NHWC to NCHW. - // cudnn will convert NCHW to NHWC automatically on Tensor Core. - auto compute_format = - compute_in_nhwc && channel_last ? DataLayout::kNHWC : DataLayout::kNCHW; -#endif - VLOG(3) << "Compute ConvOp with cuDNN:" - << " data_format=" << data_format << " compute_format=" - << (compute_format == DataLayout::kNHWC ? "NHWC" : "NCHW"); - - // ------------ transformed tensor ----------- - Tensor transformed_input_channel(input->type()); - Tensor transformed_output(output->type()); - Tensor transformed_filter_channel(filter->type()); - T* output_data = nullptr; - if (channel_last && compute_format == DataLayout::kNCHW) { - VLOG(3) << "Transform input tensor from NHWC to NCHW."; - ResizeToChannelFirst( - ctx, input, &transformed_input_channel); - TransToChannelFirst( - ctx, input, &transformed_input_channel); - - ResizeToChannelFirst(ctx, output, - &transformed_output); - - } else { - transformed_input_channel.ShareDataWith(*input); - transformed_output.ShareDataWith(*output); - } - if (compute_format == DataLayout::kNHWC) { - VLOG(3) << "Transform filter tensor from NCHW to NHWC."; - ResizeToChannelLast( - ctx, filter, &transformed_filter_channel); - TransToChannelLast( - ctx, filter, &transformed_filter_channel); - } else { - transformed_filter_channel.ShareDataWith(*filter); - } - output_data = transformed_output.data(); - - // update padding and dilation - auto in_dims = transformed_input_channel.dims(); - auto filter_dims = transformed_filter_channel.dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - if (compute_format == DataLayout::kNCHW) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - filter_data_dims = - phi::slice_ddim(filter_dims, 1, filter_dims.size() - 1); - } - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); - - Tensor transformed_input; - std::vector padding_common(data_dim, 0); - if (!is_sys_pad) { - std::vector padding_diff(data_dim); - std::vector new_input_shape_vec(data_dim + 2); - new_input_shape_vec[0] = transformed_input_channel.dims()[0]; - - if (compute_format == DataLayout::kNCHW) { - new_input_shape_vec[1] = transformed_input_channel.dims()[1]; - } else { - new_input_shape_vec[data_dim + 1] = - transformed_input_channel.dims()[data_dim + 1]; - } - - std::vector input_pad(transformed_input_channel.dims().size() * 2, - 0); - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - if (compute_format == DataLayout::kNCHW) { - new_input_shape_vec[i + 2] = - transformed_input_channel.dims()[i + 2] + padding_diff[i]; - } else { - new_input_shape_vec[i + 1] = - transformed_input_channel.dims()[i + 1] + padding_diff[i]; - } - if (compute_format == DataLayout::kNCHW) { - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } else { - 
input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - } - framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec)); - transformed_input.Resize(new_input_shape); - auto& dev_ctx = - ctx.template device_context(); - - transformed_input = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - const int rank = transformed_input_channel.dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_input_channel, pad_value, - &transformed_input); - } break; - case 5: { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_input_channel, pad_value, - &transformed_input); - } break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "ConvOp only support tensors with 4 or 5 dimensions.")); - } - - } else { - transformed_input.ShareDataWith(transformed_input_channel); - if (paddings.size() == data_dim) { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - const T* input_data = transformed_input.data(); - const T* filter_data = transformed_filter_channel.data(); - - // ------------------- cudnn descriptors --------------------- - ConvArgs args{&transformed_input, - &transformed_filter_channel, - &transformed_output, - strides, - padding_common, - dilations, - dtype}; - - auto handle = dev_ctx.cudnn_handle(); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - DataLayout layout = compute_format == DataLayout::kNHWC ? DataLayout::kNHWC - : DataLayout::kNCHW; - if (transformed_input.dims().size() == 5) { - layout = compute_format == DataLayout::kNHWC ? DataLayout::kNDHWC - : DataLayout::kNCDHW; - } - auto layout_format = GetCudnnTensorFormat(layout); - - args.handle = handle; - -#ifdef PADDLE_WITH_HIP - // MIOPEN need to set groups in cdesc in miopen_desc.h - args.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), groups); -#else - args.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn()); -#endif - -#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) - // cudnn 7 can support groups, no need to do it manually - // FIXME(typhoonzero): find a better way to disable groups - // rather than setting it to 1. 
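// The group bookkeeping around this point benefits from a concrete example: once
// cudnnSetConvolutionGroupCount has been applied (cuDNN >= 7), the host-side loop collapses to
// a single iteration, and the per-group pointer offsets computed just below step each raw
// pointer over one group's worth of elements. A hedged, standalone sketch of that arithmetic
// (illustrative only, not part of this patch; the helper name is hypothetical):

#include <cstdint>

static void ComputeGroupOffsets(int i_c, int i_d, int i_h, int i_w,  // input  C, D, H, W
                                int o_c, int o_d, int o_h, int o_w,  // output C, D, H, W
                                int64_t filter_numel, int groups,
                                int* in_offset, int* out_offset, int* filter_offset) {
  *in_offset = i_c / groups * i_d * i_h * i_w;    // elements in one group's input slice
  *out_offset = o_c / groups * o_d * o_h * o_w;   // elements in one group's output slice
  *filter_offset = static_cast<int>(filter_numel / groups);
}

// Worked example: i_c = 64, i_d = 1, i_h = i_w = 32, groups = 4 gives an input offset of
// 16 * 1024 = 16384 elements per group, matching the input_data + i * group_offset_in
// indexing used by the per-group cuDNN calls further down.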
- PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionGroupCount( - args.cdesc.desc(), groups)); - groups = 1; -#endif -#ifdef PADDLE_WITH_HIP - // MIOPEN do not set groups in wdesc after set groups in cdesc - groups = 1; -#endif - args.idesc.set(transformed_input, layout_format); - args.wdesc.set(transformed_filter_channel, layout_format, groups); - args.odesc.set(transformed_output, layout_format); - int i_n, i_c, i_d, i_h, i_w; - int o_n, o_c, o_d, o_h, o_w; - - if (compute_format == DataLayout::kNHWC) { - GetNCDHW(transformed_input.dims(), DataLayout::kNHWC, &i_n, &i_c, &i_d, - &i_h, &i_w); - GetNCDHW(transformed_output.dims(), DataLayout::kNHWC, &o_n, &o_c, &o_d, - &o_h, &o_w); - } else { - GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, - &i_h, &i_w); - GetNCDHW(transformed_output.dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d, - &o_h, &o_w); - } - - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; - int group_offset_filter = transformed_filter_channel.numel() / groups; - // ------------------- cudnn conv workspace --------------------- - size_t workspace_size = 0; // final workspace to allocate. -// ------------------- cudnn conv algorithm --------------------- -#ifdef PADDLE_WITH_HIP - miopenConvFwdAlgorithm_t algo{}; - using search = SearchAlgorithm; - workspace_size = search::GetWorkspaceSize(args); - algo = search::Find(args, exhaustive_search, deterministic, - workspace_size, ctx); -#else - cudnnConvolutionFwdAlgo_t algo{}; - using search = SearchAlgorithm; - algo = search::Find(args, exhaustive_search, deterministic, ctx); - workspace_size = search::GetWorkspaceSize(args, algo); -#endif - -#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) - // when groups > 1, SearchAlgorithm find algo is CUDNN_CONVOLUTION_\ - // FWD_ALGO_WINOGRAD_NONFUSED, but this kind of algorithm is unstable - // in forward computation, so change the algorithm to CUDNN_CONVOLUTION_\ - // FWD_ALGO_IMPLICIT_GEMM manually. - if (ctx.Attr("groups") > 1) { - algo = static_cast(0); - } -#endif - - // ------------------- cudnn conv forward --------------------- - ScalingParamType alpha = 1.0f; - ScalingParamType beta = 0.0f; - -// NOTE(zhiqiu): inplace addto is not supportted in double grad yet. -// ScalingParamType beta = ctx.Attr("use_addto") ? 
1.0f : 0.0f; -// VLOG(4) << "Conv: use_addto = " << ctx.Attr("use_addto"); - -#ifdef PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args.idesc.desc(), input_data, - args.wdesc.desc(), filter_data, args.cdesc.desc(), algo, - &beta, args.odesc.desc(), output_data, workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionForward( - handle, &alpha, args.idesc.desc(), - input_data + i * group_offset_in, args.wdesc.desc(), - filter_data + i * group_offset_filter, args.cdesc.desc(), - algo, workspace_ptr, workspace_size, &beta, - args.odesc.desc(), output_data + i * group_offset_out)); - }, - workspace_size); - } -#endif - - if (channel_last && compute_format == DataLayout::kNCHW) { - TransToChannelLast( - ctx, &transformed_output, output); - } - } -}; - -template -class CUDNNConvGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - auto input = ctx.Input("Input"); - auto filter = ctx.Input("Filter"); - auto output_grad = ctx.Input(framework::GradVarName("Output")); - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto filter_grad = ctx.Output(framework::GradVarName("Filter")); - - if (input_grad) { - input_grad->mutable_data(ctx.GetPlace()); - } - if (filter_grad) { - filter_grad->mutable_data(ctx.GetPlace()); - } - - std::vector dilations = ctx.Attr>("dilations"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - int groups = ctx.Attr("groups"); - - bool exhaustive_search = - FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") && - ctx.Attr("exhaustive_search")); - bool deterministic = FLAGS_cudnn_deterministic; - auto exhaustive_deterministic = exhaustive_search && deterministic; - PADDLE_ENFORCE_EQ(exhaustive_deterministic, false, - platform::errors::InvalidArgument( - "Cann't set exhaustive_search True and " - "FLAGS_cudnn_deterministic True at same time.")); - - const std::string data_format = ctx.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - auto dtype = platform::CudnnDataType::type; - -#ifdef PADDLE_WITH_HIP - // HIP MIOPEN ONLY SUPPORT NCHW format - auto compute_format = DataLayout::kNCHW; -#else - const bool compute_in_nhwc = - dtype == CUDNN_DATA_HALF && IsVoltaOrLater(dev_ctx); - auto compute_format = - compute_in_nhwc && channel_last ? DataLayout::kNHWC : DataLayout::kNCHW; -#endif - VLOG(3) << "Compute ConvGradOp with cuDNN:" - << " data_format=" << data_format << " compute_format=" - << (compute_format == DataLayout::kNHWC ? 
"NHWC" : "NCHW"); - - // transform Tensor - Tensor transformed_input_channel(input->type()); - Tensor transformed_output_grad_channel(output_grad->type()); - Tensor transformed_input_grad_channel(input->type()); - Tensor transformed_filter_channel(filter->type()); - Tensor transformed_filter_grad_channel(filter->type()); - - if (channel_last && compute_format == DataLayout::kNCHW) { - VLOG(3) << "Transform input, output_grad, input_grad and tensor from " - "NHWC to NCHW."; - ResizeToChannelFirst( - ctx, input, &transformed_input_channel); - TransToChannelFirst( - ctx, input, &transformed_input_channel); - - ResizeToChannelFirst( - ctx, output_grad, &transformed_output_grad_channel); - TransToChannelFirst( - ctx, output_grad, &transformed_output_grad_channel); - - if (input_grad) { - ResizeToChannelFirst( - ctx, input_grad, &transformed_input_grad_channel); - // NOTE(zhiqiu): If inplace_addto strategy is enabled, we need to copy - // the data of input_grad to transformed_input_grad_channel. - if (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")) { - TransToChannelFirst( - ctx, input_grad, &transformed_input_grad_channel); - } - } - } else { - transformed_input_channel.ShareDataWith(*input); - transformed_output_grad_channel.ShareDataWith(*output_grad); - if (input_grad) { - transformed_input_grad_channel.ShareDataWith(*input_grad); - } - } - - if (compute_format == DataLayout::kNHWC) { - VLOG(3) << "Transform filter and filter_grad tensor from NCHW to NHWC."; - ResizeToChannelLast( - ctx, filter, &transformed_filter_channel); - TransToChannelLast( - ctx, filter, &transformed_filter_channel); - - if (filter_grad) { - ResizeToChannelLast( - ctx, filter_grad, &transformed_filter_grad_channel); - } - } else { - transformed_filter_channel.ShareDataWith(*filter); - if (filter_grad) { - transformed_filter_grad_channel.ShareDataWith(*filter_grad); - } - } - - // update paddings - auto in_dims = transformed_input_channel.dims(); - auto filter_dims = transformed_filter_channel.dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - if (compute_format == DataLayout::kNCHW) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - filter_data_dims = - phi::slice_ddim(filter_dims, 1, filter_dims.size() - 1); - } - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - // cuDNN only supports padding the same amount on every dimension. - // So we create a new padded input tensor. 
- int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); - Tensor transformed_input(input->type()); - Tensor transformed_input_grad(input->type()); - std::vector padding_common(data_dim, 0); - std::vector input_pad(transformed_input_channel.dims().size() * 2, 0); - - if (!is_sys_pad) { - // get pad - std::vector padding_diff(data_dim); - std::vector new_input_shape_vec(data_dim + 2); - new_input_shape_vec[0] = transformed_input_channel.dims()[0]; - if (compute_format == DataLayout::kNCHW) { - new_input_shape_vec[1] = transformed_input_channel.dims()[1]; - } else { - new_input_shape_vec[data_dim + 1] = - transformed_input_channel.dims()[data_dim + 1]; - } - - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - if (compute_format == DataLayout::kNCHW) { - new_input_shape_vec[i + 2] = - transformed_input_channel.dims()[i + 2] + padding_diff[i]; - } else { - new_input_shape_vec[i + 1] = - transformed_input_channel.dims()[i + 1] + padding_diff[i]; - } - if (compute_format == DataLayout::kNCHW) { - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } else { - input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - } - framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec)); - transformed_input.Resize(new_input_shape); - - transformed_input_grad.Resize(new_input_shape); - auto& dev_ctx = - ctx.template device_context(); - - transformed_input = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - if (input_grad) { - transformed_input_grad = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - } - // pad for input - const int rank = transformed_input_channel.dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_input_channel, pad_value, - &transformed_input); - } break; - case 5: { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_input_channel, pad_value, - &transformed_input); - } break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "ConvOp only support tensors with 4 or 5 dimensions.")); - } - } else { - transformed_input.ShareDataWith(transformed_input_channel); - if (input_grad) { - transformed_input_grad.ShareDataWith(transformed_input_grad_channel); - } - if (paddings.size() == data_dim) { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - const T* input_data = transformed_input.data(); - const T* output_grad_data = transformed_output_grad_channel.data(); - const T* filter_data = transformed_filter_channel.data(); - T* filter_grad_data = nullptr; - T* input_grad_data = nullptr; - T* transformed_input_grad_data = nullptr; - - ConvArgs args1{&transformed_input_grad, - &transformed_filter_channel, - &transformed_output_grad_channel, - strides, - padding_common, - dilations, - dtype}; - ConvArgs args2{&transformed_input, - &transformed_filter_grad_channel, - &transformed_output_grad_channel, - strides, - padding_common, - dilations, - dtype}; - - auto handle = dev_ctx.cudnn_handle(); - DataLayout layout = compute_format == DataLayout::kNHWC ? 
DataLayout::kNHWC - : DataLayout::kNCHW; - if (transformed_input.dims().size() == 5) { - layout = compute_format == DataLayout::kNHWC ? DataLayout::kNDHWC - : DataLayout::kNCDHW; - } - auto layout_tensor = GetCudnnTensorFormat(layout); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - - int i_n, i_c, i_d, i_h, i_w; - int o_n, o_c, o_d, o_h, o_w; - if (compute_format == DataLayout::kNHWC) { - GetNCDHW(transformed_input.dims(), DataLayout::kNHWC, &i_n, &i_c, &i_d, - &i_h, &i_w); - GetNCDHW(transformed_output_grad_channel.dims(), DataLayout::kNHWC, &o_n, - &o_c, &o_d, &o_h, &o_w); - } else { - GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, - &i_h, &i_w); - GetNCDHW(transformed_output_grad_channel.dims(), DataLayout::kNCHW, &o_n, - &o_c, &o_d, &o_h, &o_w); - } - - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; - int group_offset_filter = transformed_filter_channel.numel() / groups; -// ------------------- cudnn backward algorithm --------------------- -#ifdef PADDLE_WITH_HIP - miopenConvBwdDataAlgorithm_t data_algo = - static_cast(0); - miopenConvBwdWeightsAlgorithm_t filter_algo = - static_cast(0); -#else - cudnnConvolutionBwdDataAlgo_t data_algo = - static_cast(0); - cudnnConvolutionBwdFilterAlgo_t filter_algo = - static_cast(0); -#endif - // input data workspace_size - size_t workspace_size_d = 0; - // weight workspace_size - size_t workspace_size_w = 0; - int iwo_groups = groups; - int c_groups = 1; - -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) - iwo_groups = 1; - c_groups = groups; - groups = 1; -#endif - - if (input_grad) { - // ------------------- cudnn descriptors --------------------- - input_grad_data = input_grad->data(); - transformed_input_grad_data = transformed_input_grad.data(); - args1.handle = handle; - args1.idesc.set(transformed_input_grad, layout_tensor); - args1.wdesc.set(transformed_filter_channel, layout_tensor, iwo_groups); - args1.odesc.set(transformed_output_grad_channel, layout_tensor); - args1.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_groups); - -#ifdef PADDLE_WITH_HIP - using search1 = SearchAlgorithm; - workspace_size_d = - std::max(workspace_size_d, search1::GetWorkspaceSize(args1)); - data_algo = search1::Find(args1, exhaustive_search, deterministic, - workspace_size_d, ctx); -#else - using search1 = SearchAlgorithm; - data_algo = - search1::Find(args1, exhaustive_search, deterministic, ctx); - workspace_size_d = std::max(workspace_size_d, - search1::GetWorkspaceSize(args1, data_algo)); -#endif - } - - if (filter_grad) { - // ------------------- cudnn descriptors --------------------- - filter_grad_data = transformed_filter_grad_channel.data(); - args2.handle = handle; - args2.idesc.set(transformed_input, layout_tensor); - args2.wdesc.set(transformed_filter_grad_channel, layout_tensor, - iwo_groups); - args2.odesc.set(transformed_output_grad_channel, layout_tensor); - args2.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_groups); -#ifdef PADDLE_WITH_HIP - using search2 = SearchAlgorithm; - workspace_size_w = - std::max(workspace_size_w, search2::GetWorkspaceSize(args2)); - filter_algo = search2::Find(args2, exhaustive_search, deterministic, - workspace_size_w, ctx); -#else - using search2 = SearchAlgorithm; - filter_algo = - search2::Find(args2, exhaustive_search, deterministic, ctx); - workspace_size_w = std::max( - workspace_size_w, search2::GetWorkspaceSize(args2, 
filter_algo)); -#endif - } - - // ------------------- cudnn conv backward data --------------------- - ScalingParamType alpha = 1.0f; -#ifdef PADDLE_WITH_HIP - // MIOPEN ONLY support beta to be 0.0f - ScalingParamType beta = 0.0f; -#else - ScalingParamType beta = - (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")) ? 1.0f : 0.0f; -#endif - VLOG(4) << "Conv_grad: use_addto = " - << (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")); - - if (input_grad) { -// When beta is 0, it is unnecessary to reset input_grad. -// When beta is 1, the output cannot be reset since addt strategy used. -#ifdef PADDLE_WITH_HIP - if (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")) { - Tensor temp_tensor(transformed_input_grad.type()); - temp_tensor.Resize(transformed_input_grad.dims()); - T* temp_tensor_data = temp_tensor.mutable_data(ctx.GetPlace()); - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args1.odesc.desc(), output_grad_data, - args1.wdesc.desc(), filter_data, args1.cdesc.desc(), - data_algo, &beta, args1.idesc.desc(), temp_tensor_data, - cudnn_workspace_ptr, workspace_size_d)); - }, - workspace_size_d); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenOpTensor( - handle, miopenTensorOpAdd, &alpha, args1.idesc.desc(), - transformed_input_grad_data, &alpha, args1.idesc.desc(), - temp_tensor_data, &beta, args1.idesc.desc(), - transformed_input_grad_data)); - } else { - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args1.odesc.desc(), output_grad_data, - args1.wdesc.desc(), filter_data, args1.cdesc.desc(), - data_algo, &beta, args1.idesc.desc(), - transformed_input_grad_data, cudnn_workspace_ptr, - workspace_size_d)); - }, - workspace_size_d); - } - -#else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, args1.wdesc.desc(), - filter_data + i * group_offset_filter, args1.odesc.desc(), - output_grad_data + i * group_offset_out, - args1.cdesc.desc(), data_algo, cudnn_workspace_ptr, - workspace_size_d, &beta, args1.idesc.desc(), - transformed_input_grad_data + i * group_offset_in)); - }, - workspace_size_d); - } -#endif - if (!is_sys_pad) { - std::vector starts(transformed_input_channel.dims().size(), 0); - std::vector axes(transformed_input_channel.dims().size(), 0); - - for (size_t i = 0; i < transformed_input_channel.dims().size(); ++i) { - starts[i] = input_pad[2 * i]; - axes[i] = i; - } - - transformed_input_grad_channel.mutable_data(ctx.GetPlace()); - if (transformed_input_channel.dims().size() == 4) { - RemovePaddingSlice( - ctx, &transformed_input_grad, &transformed_input_grad_channel, - starts, axes); - } else { - RemovePaddingSlice( - ctx, &transformed_input_grad, &transformed_input_grad_channel, - starts, axes); - } - } - - if (channel_last && compute_format == DataLayout::kNCHW) { - TransToChannelLast( - ctx, &transformed_input_grad_channel, input_grad); - } - } - - // filter_grad do not use inplace addto. - ScalingParamType beta_filter = 0.0f; - // ------------------- cudnn conv backward filter --------------------- - if (filter_grad) { -// Because beta is zero, it is unnecessary to reset filter_grad. 
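// A brief note on the alpha/beta handling above, since it differs between branches: every
// cuDNN convolution call blends as
//   dst = alpha * conv_result + beta * dst,
// with the documented guarantee that dst is not read when beta == 0. Hence beta = 0 means
// "overwrite, no need to zero the gradient first", while beta = 1 expresses the "use_addto"
// accumulation into the existing input gradient; the filter gradient always uses beta = 0.
// On the MIOPEN path, which only supports beta = 0, the same accumulation is emulated with the
// explicit miopenOpTensor add seen earlier. A hedged host-side analogue of that contract
// (illustrative only, not a cuDNN call):

#include <cstddef>

template <typename T>
static void BlendInto(const T* conv_result, T alpha, T beta, T* dst, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) {
    // Mirror cuDNN: when beta == 0 the destination is overwritten without being read;
    // otherwise the new result is scaled by alpha and accumulated onto beta * dst.
    dst[i] = (beta == static_cast<T>(0))
                 ? alpha * conv_result[i]
                 : alpha * conv_result[i] + beta * dst[i];
  }
}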
-#ifdef PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardWeights( - handle, &alpha, args2.odesc.desc(), output_grad_data, - args2.idesc.desc(), input_data, args2.cdesc.desc(), - filter_algo, &beta, args2.wdesc.desc(), filter_grad_data, - cudnn_workspace_ptr, workspace_size_w)); - }, - workspace_size_w); -#else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, args2.idesc.desc(), - input_data + i * group_offset_in, args2.odesc.desc(), - output_grad_data + i * group_offset_out, - args2.cdesc.desc(), filter_algo, cudnn_workspace_ptr, - workspace_size_w, &beta_filter, args2.wdesc.desc(), - filter_grad_data + i * group_offset_filter)); - }, - workspace_size_w); - } -#endif - - if (compute_format == DataLayout::kNHWC) { - TransToChannelFirst( - ctx, &transformed_filter_grad_channel, filter_grad); - } - } - } -}; - -/* - * Inputs: I, W, dO, ddI, ddW - * Outputs: ddO, dW, dI - * ddo = conv(ddI, W) + conv(I, ddW) - * dW = conv_bp_filter(ddI, dO) - * dI = conv_bp_data(ddW, dO) - */ -template -class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - auto X = ctx.Input("Input"); - auto W = ctx.Input("Filter"); - auto dO = ctx.Input("DOutput"); - auto ddX = ctx.Input("DDInput"); - auto ddW = ctx.Input("DDFilter"); - - auto ddO = ctx.Output("DDOutput"); - auto dW = ctx.Output("DFilter"); - auto dX = ctx.Output("DInput"); - if (ddO) { - ddO->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, ddO, static_cast(0)); - } - if (dW) { - dW->mutable_data(ctx.GetPlace()); - } - if (dX) { - dX->mutable_data(ctx.GetPlace()); - } - - // const T* x = X->data(); - const T* dy = dO->data(); - const T* w = W->data(); - - const T* ddx = nullptr; - const T* ddw = nullptr; - T *dw, *dx, *ddy; - dw = dx = ddy = nullptr; - T* transformed_dx = nullptr; - const std::vector& strides = ctx.Attr>("strides"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - - bool exhaustive_search = - FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") && - ctx.Attr("exhaustive_search")); - bool deterministic = FLAGS_cudnn_deterministic; - auto exhaustive_deterministic = exhaustive_search && deterministic; - PADDLE_ENFORCE_EQ(exhaustive_deterministic, false, - platform::errors::InvalidArgument( - "Cann't set exhaustive_search True and " - "FLAGS_cudnn_deterministic True at same time.")); - - std::vector paddings = ctx.Attr>("paddings"); - - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // transform Tensors to channel first----------- - Tensor transformed_X_channel(X->type()); - Tensor transformed_dO_channel(dO->type()); - Tensor transformed_ddX_channel(X->type()); - - Tensor transformed_ddO_channel(dO->type()); - Tensor transformed_dX_channel(X->type()); - - if (channel_last) { - ResizeToChannelFirst( - ctx, X, &transformed_X_channel); - TransToChannelFirst( - ctx, X, 
&transformed_X_channel); - - ResizeToChannelFirst( - ctx, dO, &transformed_dO_channel); - TransToChannelFirst( - ctx, dO, &transformed_dO_channel); - - if (ddX) { - ResizeToChannelFirst( - ctx, ddX, &transformed_ddX_channel); - TransToChannelFirst( - ctx, ddX, &transformed_ddX_channel); - } - - if (ddO) { - ResizeToChannelFirst( - ctx, ddO, &transformed_ddO_channel); - } - if (dX) { - ResizeToChannelFirst( - ctx, dX, &transformed_dX_channel); - transformed_dX_channel.mutable_data(ctx.GetPlace()); - } - - } else { - transformed_X_channel = *X; - transformed_dO_channel = *dO; - if (ddX) { - transformed_ddX_channel = *ddX; - } - if (ddO) { - transformed_ddO_channel.ShareDataWith(*ddO); - } - if (dX) { - transformed_dX_channel.ShareDataWith(*dX); - } - } - - auto in_dims = transformed_X_channel.dims(); - auto filter_dims = W->dims(); - framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); - Tensor transformed_X(X->type()); - Tensor transformed_ddX(X->type()); - - Tensor transformed_dX(X->type()); - - std::vector padding_common(data_dim, 0); - std::vector input_pad(X->dims().size() * 2, 0); - - if (!is_sys_pad) { - // get pad - std::vector padding_diff(data_dim); - std::vector new_input_shape_vec(data_dim + 2); - new_input_shape_vec[0] = transformed_X_channel.dims()[0]; - new_input_shape_vec[1] = transformed_X_channel.dims()[1]; - - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - new_input_shape_vec[i + 2] = - transformed_X_channel.dims()[i + 2] + padding_diff[i]; - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec)); - transformed_X.Resize(new_input_shape); - transformed_ddX.Resize(new_input_shape); - transformed_dX.Resize(new_input_shape); - - transformed_X = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - if (ddX) { - transformed_ddX = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - } - if (dX) { - transformed_dX = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - } - - // pad for input - const int rank = X->dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_X_channel, pad_value, - &transformed_X); - if (ddX) { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_ddX_channel, pad_value, - &transformed_ddX); - } - } break; - case 5: { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_X_channel, pad_value, - &transformed_X); - if (ddX) { - phi::funcs::PadFunction( - dev_ctx, input_pad, transformed_ddX_channel, pad_value, - &transformed_ddX); - } - } break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "ConvOp only support tensors with 4 or 5 dimensions.")); - } - - } else { - transformed_X.ShareDataWith(transformed_X_channel); - if (ddX) { - transformed_ddX.ShareDataWith(transformed_ddX_channel); - } - if (dX) { - transformed_dX.ShareDataWith(transformed_dX_channel); - } - - if (paddings.size() == 
data_dim) { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - const T* x = transformed_X.data(); - - int iwo_group = groups; - int c_group = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) - iwo_group = 1; - c_group = groups; - groups = 1; -#endif - auto dtype = platform::CudnnDataType::type; - - auto handle = dev_ctx.cudnn_handle(); - - ConvArgs args1{&transformed_ddX, - W, - &transformed_ddO_channel, - strides, - padding_common, - dilations, - dtype}; - ConvArgs args2{ - &transformed_X, ddW, &transformed_ddO_channel, strides, padding_common, - dilations, dtype}; - ConvArgs args3{&transformed_ddX, - dW, - &transformed_dO_channel, - strides, - padding_common, - dilations, - dtype}; - ConvArgs args4{ - &transformed_dX, ddW, &transformed_dO_channel, strides, padding_common, - dilations, dtype}; - -#ifdef PADDLE_WITH_HIP - miopenConvFwdAlgorithm_t fwd_algo1 = - static_cast(0); - miopenConvFwdAlgorithm_t fwd_algo2 = - static_cast(0); - miopenConvBwdDataAlgorithm_t data_algo = - static_cast(0); - miopenConvBwdWeightsAlgorithm_t filter_algo = - static_cast(0); -#else - cudnnConvolutionFwdAlgo_t fwd_algo1 = - static_cast(0); - cudnnConvolutionFwdAlgo_t fwd_algo2 = - static_cast(0); - cudnnConvolutionBwdDataAlgo_t data_algo = - static_cast(0); - cudnnConvolutionBwdFilterAlgo_t filter_algo = - static_cast(0); -#endif - - auto layout = GetCudnnTensorFormat(DataLayout::kNCHW); - - // ddo = conv(ddI, W) + conv(I, ddW) - size_t workspace_size = 0; - - T* transformed_ddy_channel = nullptr; - if (ddO) { - ddy = ddO->data(); - transformed_ddy_channel = transformed_ddO_channel.data(); - if (ddX) { - args1.handle = handle; - args1.idesc.set(transformed_ddX, iwo_group); - args1.wdesc.set(*W, layout, iwo_group); - args1.odesc.set(transformed_ddO_channel, iwo_group); - args1.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); - -#ifdef PADDLE_WITH_HIP - using search1 = SearchAlgorithm; - workspace_size = search1::GetWorkspaceSize(args1); - fwd_algo1 = search1::Find(args1, exhaustive_search, false, - workspace_size, ctx); -#else - using search1 = SearchAlgorithm; - fwd_algo1 = search1::Find(args1, exhaustive_search, false, ctx); - workspace_size = search1::GetWorkspaceSize(args1, fwd_algo1); -#endif - } - - if (ddW) { - ddw = ddW->data(); - args2.handle = handle; - args2.idesc.set(transformed_X, iwo_group); - args2.wdesc.set(*ddW, layout, iwo_group); - args2.odesc.set(transformed_ddO_channel, iwo_group); - args2.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); - -#ifdef PADDLE_WITH_HIP - using search2 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search2::GetWorkspaceSize(args2)); - fwd_algo2 = search2::Find(args2, exhaustive_search, false, - workspace_size, ctx); -#else - using search2 = SearchAlgorithm; - fwd_algo2 = search2::Find(args2, exhaustive_search, false, ctx); - workspace_size = std::max(workspace_size, - search2::GetWorkspaceSize(args2, fwd_algo2)); -#endif - } - } - - if (dW && ddX) { - dw = dW->data(); - args3.handle = handle; - args3.idesc.set(transformed_ddX, iwo_group); - args3.wdesc.set(*dW, layout, iwo_group); - args3.odesc.set(transformed_dO_channel, iwo_group); - args3.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); - -#ifdef PADDLE_WITH_HIP - using search3 = SearchAlgorithm; - workspace_size 
= - std::max(workspace_size, search3::GetWorkspaceSize(args3)); - filter_algo = search3::Find(args3, exhaustive_search, deterministic, - workspace_size, ctx); -#else - using search3 = SearchAlgorithm; - filter_algo = - search3::Find(args3, exhaustive_search, deterministic, ctx); - workspace_size = std::max(workspace_size, - search3::GetWorkspaceSize(args3, filter_algo)); -#endif - } - - if (ddW && dX) { - transformed_dx = transformed_dX.data(); - - args4.handle = handle; - args4.idesc.set(transformed_dX, iwo_group); - args4.wdesc.set(*ddW, layout, iwo_group); - args4.odesc.set(transformed_dO_channel, iwo_group); - args4.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); - -#ifdef PADDLE_WITH_HIP - using search4 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search4::GetWorkspaceSize(args4)); - data_algo = search4::Find(args4, exhaustive_search, deterministic, - workspace_size, ctx); -#else - using search4 = SearchAlgorithm; - data_algo = - search4::Find(args4, exhaustive_search, deterministic, ctx); - workspace_size = - std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); -#endif - } - - int i_n, i_c, i_d, i_h, i_w; - GetNCDHW(transformed_X.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h, - &i_w); - - int o_n, o_c, o_d, o_h, o_w; - GetNCDHW(transformed_dO_channel.dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d, - &o_h, &o_w); - - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; - int group_offset_filter = W->numel() / groups; - - ScalingParamType alpha = 1.0f; - ScalingParamType beta = 0.0f; - - // NOTE(zhiqiu): inplace addto is not supportted in double grad yet. - // ScalingParamType beta = ctx.Attr("use_addto") ? 
1.0f : - // 0.0f; - // VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr("use_addto"); - auto wkspace_handle = dev_ctx.cudnn_workspace_handle(); - - if (ddO) { - if (ddX) { - ddx = transformed_ddX.data(); -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args1.idesc.desc(), ddx, - args1.wdesc.desc(), w, args1.cdesc.desc(), fwd_algo1, - &beta, args1.odesc.desc(), transformed_ddy_channel, - workspace_ptr, workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionForward( - handle, &alpha, args1.idesc.desc(), - ddx + i * group_offset_in, args1.wdesc.desc(), - w + i * group_offset_filter, args1.cdesc.desc(), - fwd_algo1, workspace_ptr, workspace_size, &beta, - args1.odesc.desc(), - transformed_ddy_channel + i * group_offset_out)); - }, - workspace_size); - } -#endif - } - if (ddW) { -#ifdef PADDLE_WITH_HIP - // MIOPEN ONLY support beta to be 0.0f - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args2.idesc.desc(), x, args2.wdesc.desc(), - ddw, args2.cdesc.desc(), fwd_algo2, &beta, - args2.odesc.desc(), transformed_ddy_channel, - workspace_ptr, workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionForward( - handle, &alpha, args2.idesc.desc(), - x + i * group_offset_in, args2.wdesc.desc(), - ddw + i * group_offset_filter, args2.cdesc.desc(), - fwd_algo2, workspace_ptr, workspace_size, &alpha, - args2.odesc.desc(), - transformed_ddy_channel + i * group_offset_out)); - }, - workspace_size); - } -#endif - } - if (channel_last) { - TransToChannelLast( - ctx, &transformed_ddO_channel, ddO); - } - } - T* transformed_dy_channel = transformed_dO_channel.data(); - if (dW && ddX) { - ddx = transformed_ddX.data(); -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardWeights( - handle, &alpha, args3.odesc.desc(), transformed_dy_channel, - args3.idesc.desc(), ddx, args3.cdesc.desc(), filter_algo, - &beta, args3.wdesc.desc(), dw, workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, args3.idesc.desc(), - ddx + i * group_offset_in, args3.odesc.desc(), - transformed_dy_channel + i * group_offset_out, - args3.cdesc.desc(), filter_algo, workspace_ptr, - workspace_size, &beta, args3.wdesc.desc(), - dw + i * group_offset_filter)); - }, - workspace_size); - } -#endif - } - - if (dX && ddW) { - ddw = ddW->data(); -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args4.odesc.desc(), transformed_dy_channel, - args4.wdesc.desc(), ddw, args4.cdesc.desc(), data_algo, - &beta, args4.idesc.desc(), transformed_dx, workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - 
PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, args4.wdesc.desc(), - ddw + i * group_offset_filter, args4.odesc.desc(), - transformed_dy_channel + i * group_offset_out, - args4.cdesc.desc(), data_algo, workspace_ptr, - workspace_size, &beta, args4.idesc.desc(), - transformed_dx + i * group_offset_in)); - }, - workspace_size); - } -#endif - - if (!is_sys_pad) { - // reverse padded input - std::vector starts(X->dims().size(), 0); - std::vector axes(X->dims().size(), 0); - - for (size_t i = 0; i < X->dims().size(); ++i) { - starts[i] = input_pad[2 * i]; - axes[i] = i; - } - if (X->dims().size() == 4) { - RemovePaddingSlice( - ctx, &transformed_dX, &transformed_dX_channel, starts, axes); - } else { - RemovePaddingSlice( - ctx, &transformed_dX, &transformed_dX_channel, starts, axes); - } - } - if (channel_last) { - TransToChannelLast( - ctx, &transformed_dX_channel, dX); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv2d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -// ROCM has limit thread in depthwise_conv.cu and willl result in accuracy issue -// Use depthwise_conv2d in MIOPEN to resolve this issue -REGISTER_OP_KERNEL(depthwise_conv2d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(depthwise_conv2d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d_grad_grad, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); - -REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv3d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -#else -#if CUDNN_VERSION_MIN(8, 1, 0) -REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv2d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); - -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d_grad_grad, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -#else 
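// The cuDNN/MIOPEN registrations being deleted above and below this point are not lost: the
// patch re-exposes conv through the phi kernel files created at the top of the diff
// (paddle/phi/kernels/gpudnn/*.cu) together with compat signatures such as
// paddle/phi/ops/compat/conv2d_sig.cc that map the fluid operator onto the new kernels. As a
// hedged, generic sketch only (this is NOT the actual contents of conv2d_sig.cc; the attribute
// list is abbreviated for illustration):

#include "paddle/phi/core/compat/op_utils.h"

namespace phi {

KernelSignature Conv2dOpArgumentMapping(const ArgumentMappingContext& ctx) {
  // Inputs and outputs keep their fluid names; attributes are forwarded to the phi kernel.
  return KernelSignature("conv2d", {"Input", "Filter"},
                         {"strides", "paddings", "padding_algorithm", "groups", "dilations",
                          "data_format"},
                         {"Output"});
}

}  // namespace phi

PD_REGISTER_ARG_MAPPING_FN(conv2d, phi::Conv2dOpArgumentMapping);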
-REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv2d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); - -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d_grad_grad, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -#endif - -REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv3d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -#endif diff --git a/paddle/fluid/operators/conv_miopen_helper.h b/paddle/fluid/operators/conv_miopen_helper.h index 9c9795143eb..66f71869384 100644 --- a/paddle/fluid/operators/conv_miopen_helper.h +++ b/paddle/fluid/operators/conv_miopen_helper.h @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/operator_kernel_configs.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_context.h" namespace paddle { namespace operators { @@ -51,12 +52,11 @@ static inline void GetNCDHW(const framework::DDim& dims, } template -static void RemovePaddingSlice(const framework::ExecutionContext& context, +static void RemovePaddingSlice(const phi::GPUContext& context, const Tensor* input, Tensor* out, const std::vector& starts, const std::vector& axes) { - auto& place = - *context.template device_context().eigen_device(); + auto& place = *context.eigen_device(); auto in_dims = input->dims(); auto new_out_dims = out->dims(); auto offsets = Eigen::array(); @@ -128,11 +128,10 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, bool deterministic, size_t workspace_size, - const framework::ExecutionContext& ctx) { + const phi::GPUContext& ctx) { algo_t algo; - auto& dev_ctx = ctx.template device_context(); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); int find_count; miopenConvAlgoPerf_t find_result; @@ -170,11 +169,10 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, bool deterministic, size_t workspace_size, - const framework::ExecutionContext& ctx) { + const phi::GPUContext& ctx) { algo_t algo; - auto& dev_ctx = ctx.template device_context(); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); int find_count; miopenConvAlgoPerf_t find_result; @@ -212,11 +210,10 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, bool deterministic, size_t workspace_size, - const framework::ExecutionContext& 
ctx) { + const phi::GPUContext& ctx) { algo_t algo; - auto& dev_ctx = ctx.template device_context(); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); int find_count; miopenConvAlgoPerf_t find_result; diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index e345a4d2603..8213e877f72 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -205,14 +205,14 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( paddle::framework::DataTypeToString(input_data_type), paddle::framework::DataTypeToString(filter_data_type))); } -#ifndef PADDLE_WITH_ASCEND_CL - if (input_data_type == framework::proto::VarType::FP16) { - PADDLE_ENFORCE_EQ( - library, framework::LibraryType::kCUDNN, - platform::errors::InvalidArgument( - "float16 can only be used when CUDNN or NPU is used")); - } -#endif +// #ifndef PADDLE_WITH_ASCEND_CL +// if (input_data_type == framework::proto::VarType::FP16) { +// PADDLE_ENFORCE_EQ( +// library, framework::LibraryType::kCUDNN, +// platform::errors::InvalidArgument( +// "float16 can only be used when CUDNN or NPU is used")); +// } +// #endif #if PADDLE_WITH_CUDA if (input_data_type == framework::proto::VarType::BF16 && library == framework::LibraryType::kCUDNN) { @@ -869,42 +869,6 @@ REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad, ops::Conv3DDoubleGradMaker); REGISTER_OPERATOR(conv3d_grad_grad, ops::ConvOpDoubleGrad); -// depthwise conv kernel -// TODO(xingzhaolong): neon kernel for mobile -REGISTER_OP_CPU_KERNEL( - depthwise_conv2d, - ops::GemmConvKernel, - ops::GemmConvKernel); - -REGISTER_OP_CPU_KERNEL( - depthwise_conv2d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); - -REGISTER_OP_CPU_KERNEL( - conv2d, ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_CPU_KERNEL( - conv2d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); -REGISTER_OP_CPU_KERNEL( - conv2d_grad_grad, - ops::GemmConvDoubleGradKernel, - ops::GemmConvDoubleGradKernel); - -REGISTER_OP_CPU_KERNEL( - conv3d, ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_CPU_KERNEL( - conv3d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); -REGISTER_OP_CPU_KERNEL( - conv3d_grad_grad, - ops::GemmConvDoubleGradKernel, - ops::GemmConvDoubleGradKernel); - REGISTER_OP_VERSION(conv2d) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/conv_op.cu.cc b/paddle/fluid/operators/conv_op.cu.cc deleted file mode 100644 index d07593f5c02..00000000000 --- a/paddle/fluid/operators/conv_op.cu.cc +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/conv_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d, - ops::DepthwiseConvKernel, - ops::DepthwiseConvKernel); - -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d_grad, - ops::DepthwiseConvGradKernel, - ops::DepthwiseConvGradKernel); - -REGISTER_OP_CUDA_KERNEL( - conv2d, ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_CUDA_KERNEL( - conv2d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); - -REGISTER_OP_CUDA_KERNEL( - conv3d, ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_CUDA_KERNEL( - conv3d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 26166362da8..a5d888765bf 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -21,7 +21,6 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/layout_utils.h" -#include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/vol2col.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -214,817 +213,5 @@ class ConvOpDoubleGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override; }; -template -class GemmConvKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - // The filter will be reshaped in the calculations, - // so here use an assignment operation, - // that avoids modifying the variable in the Scope. - Tensor filter = *context.Input("Filter"); - Tensor* output = context.Output("Output"); - output->mutable_data(context.GetPlace()); - - const int groups = context.Attr("groups"); - const std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - const std::string padding_algorithm = - context.Attr("padding_algorithm"); - const std::string data_format = context.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - Tensor transformed_input(input->dtype()); - Tensor transformed_output(output->dtype()); - - if (channel_last) { - ResizeToChannelFirst(context, input, - &transformed_input); - TransToChannelFirst(context, input, &transformed_input); - - ResizeToChannelFirst(context, output, - &transformed_output); - - } else { - transformed_input = *input; - transformed_output = *output; - } - - // update padding and dilation - auto trans_in_dims = transformed_input.dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims = - phi::slice_ddim(trans_in_dims, 2, trans_in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - auto& dev_ctx = context.template device_context(); - - const int batch_size = static_cast(transformed_input.dims()[0]); - - // filter_shape_vec: - // {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} - std::vector filter_shape_vec(phi::vectorize(filter.dims())); - - // output_shape_vec: - // {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} - std::vector output_shape_vec( - 
phi::vectorize(transformed_output.dims())); - - // use col_shape in the im2col calculation - // col_shape_vec: - // {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, - // o_d,o_h, o_w} - size_t data_dim = filter_shape_vec.size() - 2; - - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = trans_in_dims[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - - framework::DDim col_shape(phi::make_ddim(col_shape_vec)); - - // use col_matrix_shape in the gemm calculation - // size: - // (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d * o_h * - // o_w) - - framework::DDim col_matrix_shape = phi::flatten_to_2d(col_shape, data_dim); - - bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); - - Tensor col; - // col_matrix shares the same piece of data with col, - // but will be reshaped into a two-dimensional matrix shape - // to call the matrix multiplication interface. - Tensor col_matrix; - if (is_expand) { - col = context.AllocateTmpTensor(col_shape, dev_ctx); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim in_matrix_shape = phi::slice_ddim( - transformed_input.dims(), 1, transformed_input.dims().size()); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - - framework::DDim output_matrix_shape = { - transformed_output.dims()[1], - transformed_output.numel() / - (transformed_output.dims()[0] * transformed_output.dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int in_step = static_cast(transformed_input.dims()[1]) / groups; - int out_step = static_cast(transformed_output.dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - auto blas = phi::funcs::GetBlas(dev_ctx); - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = - transformed_input.Slice(i, i + 1).Resize(in_matrix_shape); - Tensor out_batch = - transformed_output.Slice(i, i + 1).Resize(output_matrix_shape); - - for (int g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, in_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - - } else if (data_dim == 3U) { - vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(filter_slice, false, col_matrix, false, T(1.0), &out_slice, - T(0.0)); - } - } - if (channel_last) { - TransToChannelLast(context, &transformed_output, - output); - } - } -}; - -template -class GemmConvGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - const Tensor* output_grad = - context.Input(framework::GradVarName("Output")); - Tensor* input_grad = - context.Output(framework::GradVarName("Input")); - Tensor* filter_grad = - context.Output(framework::GradVarName("Filter")); - // The filter and filter_grad will be reshaped in the calculations, - // so here use an assignment operation, - // that 
avoids modifying the variable in the Scope. - Tensor filter = *context.Input("Filter"); - - if (!input_grad && !filter_grad) return; - - int groups = context.Attr("groups"); - const std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - const std::string padding_algorithm = - context.Attr("padding_algorithm"); - const std::string data_format = context.Attr("data_format"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - Tensor transformed_input(input->dtype()); - Tensor transformed_output_grad(output_grad->dtype()); - - if (channel_last) { - ResizeToChannelFirst(context, input, - &transformed_input); - TransToChannelFirst(context, input, &transformed_input); - - ResizeToChannelFirst(context, output_grad, - &transformed_output_grad); - TransToChannelFirst(context, output_grad, - &transformed_output_grad); - } else { - transformed_input = *input; - transformed_output_grad = *output_grad; - } - - // update padding and dilation - auto in_dims = transformed_input.dims(); - auto filter_dims = filter.dims(); - framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - const int batch_size = static_cast(transformed_input.dims()[0]); - - auto& dev_ctx = context.template device_context(); - - // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} - std::vector filter_shape_vec(phi::vectorize(filter.dims())); - // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} - std::vector output_shape_vec( - phi::vectorize(transformed_output_grad.dims())); - - // use col_shape in the im2col calculation - // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d, - // o_h, o_w} - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = transformed_input.dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(phi::make_ddim(col_shape_vec)); - - // use col_matrix_shape in the gemm calculation - // size: (i_c/g * k_h * k_w, o_h * o_w) - // or - // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w) - framework::DDim col_matrix_shape = - phi::flatten_to_2d(col_shape, data_dim + 1); - - framework::DDim input_shape = phi::slice_ddim( - transformed_input.dims(), 1, transformed_input.dims().size()); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - - framework::DDim output_matrix_shape = { - transformed_output_grad.dims()[1], - transformed_output_grad.numel() / (transformed_output_grad.dims()[0] * - transformed_output_grad.dims()[1])}; - - // convolution backward input operator: gemm + col2im(or col2vol) - // convolution backward weight operator: im2col(or vol2col) + gemm - int in_step = static_cast(transformed_input.dims()[1]) / groups; - int out_step = static_cast(transformed_output_grad.dims()[1]) / groups; - - bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); - - Tensor col; - // col_matrix shares the same piece of data with col, - // but will be reshaped into 
a two-dimensional matrix shape - // to call the matrix multiplication interface. - Tensor col_matrix; - if (is_expand) { - col = context.AllocateTmpTensor(col_shape, dev_ctx); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - phi::funcs::SetConstant set_zero; - auto blas = phi::funcs::GetBlas(dev_ctx); - - if (input_grad) { - input_grad->mutable_data(context.GetPlace()); - Tensor transformed_input_grad(input_grad->dtype()); - if (channel_last) { - ResizeToChannelFirst(context, input_grad, - &transformed_input_grad); - - } else { - transformed_input_grad = *input_grad; - } - // if is_expand is false, the operation of set_zero is unnecessary, - // because math::matmul will reset input_grad. - if (is_expand) { - set_zero(dev_ctx, &transformed_input_grad, static_cast(0)); - } - math::Col2VolFunctor col2vol; - math::Col2ImFunctor col2im; - - for (int i = 0; i < batch_size; i++) { - Tensor out_grad_batch = - transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape); - Tensor in_grad_batch = - transformed_input_grad.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; g++) { - // gemm - Tensor out_grad_slice = - out_grad_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - - Tensor in_grad_slice = - in_grad_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col_matrix.ShareDataWith(in_grad_slice); - col_matrix.Resize(col_matrix_shape); - } - blas.MatMul(filter_slice, true, out_grad_slice, false, T(1.0), - &col_matrix, T(0.0)); - - if (is_expand && data_dim == 2U) { - col2im(dev_ctx, col, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &in_grad_slice); - } else if (is_expand && data_dim == 3U) { - col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice); - } - } - } - if (channel_last) { - TransToChannelLast(context, &transformed_input_grad, - input_grad); - } - } - - if (filter_grad) { - filter_grad->mutable_data(context.GetPlace()); - Tensor filter_grad_ = *filter_grad; - filter_grad_.Resize(filter_matrix_shape); - set_zero(dev_ctx, filter_grad, static_cast(0)); - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; - for (int i = 0; i < batch_size; i++) { - Tensor out_grad_batch = - transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape); - Tensor in_batch = transformed_input.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; g++) { - // im2col - Tensor out_grad_slice = - out_grad_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, in_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - - } else if (data_dim == 3U) { - vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor filter_grad_slice = - filter_grad_.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(out_grad_slice, false, col_matrix, true, T(1.0), - &filter_grad_slice, T(1.0)); - } - } - } - } -}; - -template -class GemmConvDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - 
paddle::platform::errors::PreconditionNotMet("It must use CPUPlace.")); - const Tensor* X = ctx.Input("Input"); - const Tensor* dY = ctx.Input("DOutput"); - const Tensor* ddX = ctx.Input("DDInput"); - const Tensor* ddW_in = ctx.Input("DDFilter"); - - Tensor* ddY = ctx.Output("DDOutput"); - Tensor* dW = ctx.Output("DFilter"); - Tensor* dX = ctx.Output("DInput"); - Tensor W = GET_DATA_SAFELY(ctx.Input("Filter"), "Input", "Filter", - "GemmConvDoubleGrad"); - if (!ddY && !dW && !dX) return; - - const int groups = ctx.Attr("groups"); - const std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // transform Tensor - Tensor transformed_X(X->dtype()); - Tensor transformed_dY(dY->dtype()); - Tensor transformed_ddX(X->dtype()); - - if (channel_last) { - ResizeToChannelFirst(ctx, X, &transformed_X); - TransToChannelFirst(ctx, X, &transformed_X); - - ResizeToChannelFirst(ctx, dY, &transformed_dY); - TransToChannelFirst(ctx, dY, &transformed_dY); - - if (ddX) { - ResizeToChannelFirst(ctx, ddX, &transformed_ddX); - TransToChannelFirst(ctx, ddX, &transformed_ddX); - } - } else { - transformed_X = *X; - transformed_dY = *dY; - if (ddX) { - transformed_ddX = *ddX; - } - } - - // update padding and dilation - auto in_dims = transformed_X.dims(); - auto filter_dims = W.dims(); - - framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - const int batch_size = static_cast(transformed_X.dims()[0]); - std::vector filter_shape_vec(phi::vectorize(W.dims())); - std::vector output_shape_vec( - phi::vectorize(transformed_dY.dims())); - - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - // col_shape [in_channel/group, kh, kw, oh, ow] - col_shape_vec[0] = transformed_X.dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + data_dim + 1] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(phi::make_ddim(col_shape_vec)); - // col_matrix_shape [in_channel/group * kh * kw, oh * ow] - framework::DDim col_matrix_shape = - phi::flatten_to_2d(col_shape, data_dim + 1); - // input_shape [Cin, H, W] - framework::DDim input_shape = - phi::slice_ddim(transformed_X.dims(), 1, transformed_X.dims().size()); - // filter_matrix_shape [Cout, Cin * kh * kw] - framework::DDim filter_matrix_shape = {W.dims()[0], - W.numel() / W.dims()[0]}; - - W.Resize(filter_matrix_shape); - framework::DDim output_matrix_shape = { - transformed_dY.dims()[1], - transformed_dY.numel() / - (transformed_dY.dims()[0] * transformed_dY.dims()[1])}; - int in_step = static_cast(transformed_X.dims()[1]) / groups; - int out_step = static_cast(transformed_dY.dims()[1]) / groups; - - bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col = ctx.AllocateTmpTensor(col_shape, dev_ctx); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - phi::funcs::SetConstant set_zero; 
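// Note: the Gemm*Kernel classes removed in this patch all follow the im2col
// (or vol2col) + GEMM scheme described by their inline comments: image patches
// are unfolded into a column matrix and the convolution becomes a matrix
// multiply against the reshaped filter. A minimal, self-contained sketch of
// that idea for one channel, one group, stride 1, no padding and no dilation
// follows; it is illustrative only and none of these helper names are Paddle
// APIs.
#include <cstdio>
#include <vector>

// Unfold kh x kw patches of an H x W input into a (kh * kw) x (oh * ow) matrix.
static std::vector<float> Im2Col(const std::vector<float>& in, int H, int W,
                                 int kh, int kw, int oh, int ow) {
  std::vector<float> col(kh * kw * oh * ow);
  for (int r = 0; r < kh; ++r)
    for (int c = 0; c < kw; ++c)
      for (int y = 0; y < oh; ++y)
        for (int x = 0; x < ow; ++x)
          col[((r * kw + c) * oh + y) * ow + x] = in[(y + r) * W + (x + c)];
  return col;
}

int main() {
  const int H = 4, W = 4, kh = 3, kw = 3, oh = H - kh + 1, ow = W - kw + 1;
  std::vector<float> input(H * W, 1.0f);     // all-ones 4x4 image
  std::vector<float> filter(kh * kw, 1.0f);  // all-ones 3x3 filter
  std::vector<float> col = Im2Col(input, H, W, kh, kw, oh, ow);
  // GEMM: (1 x kh*kw) * (kh*kw x oh*ow) -> (1 x oh*ow); every output here is 9.
  for (int p = 0; p < oh * ow; ++p) {
    float acc = 0.0f;
    for (int k = 0; k < kh * kw; ++k) acc += filter[k] * col[k * oh * ow + p];
    std::printf("out[%d] = %g\n", p, acc);
  }
  return 0;
}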
- auto blas = phi::funcs::GetBlas(dev_ctx); - - // dx convolution double grad: gemm + col2im(col2vol) - // dx = ddw * dy ==> dx(N, Cin, H, W), ddw(Cout, Cin, kh, kw), dy(N, Cout, - // oH, oW) - if (dX && ddW_in) { - Tensor ddW; - ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); - dX->mutable_data(ctx.GetPlace()); - - Tensor transformed_dX(dX->dtype()); - - if (channel_last) { - ResizeToChannelFirst(ctx, dX, &transformed_dX); - - } else { - transformed_dX = *dX; - } - // if is_expand is false, the operation of set_zero is unnecessary - // because math::matmul will reset dx - if (is_expand) { - set_zero(dev_ctx, &transformed_dX, static_cast(0)); - } - math::Col2VolFunctor col2vol; - math::Col2ImFunctor col2im; - - for (int i = 0; i < batch_size; i++) { - Tensor dy_batch = - transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); - Tensor dx_batch = transformed_dX.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; g++) { - // gemm - Tensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); - Tensor dx_slice = dx_batch.Slice(g * in_step, (g + 1) * in_step); - if (!is_expand) { - col_matrix.ShareDataWith(dx_slice); - col_matrix.Resize(col_matrix_shape); - } - blas.MatMul(ddw_slice, true, dy_slice, false, T(1.0), &col_matrix, - T(0.0)); - - if (is_expand && data_dim == 2U) { - col2im(dev_ctx, col, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &dx_slice); - } else if (is_expand && data_dim == 3U) { - col2vol(dev_ctx, col, dilations, strides, paddings, &dx_slice); - } - } - } - if (channel_last) { - TransToChannelLast(ctx, &transformed_dX, dX); - } - } - - // dw = ddx * dy ==> dw(Cout, Cin, kh, kw), ddx(N, Cin, H, W), dy(N, Cout, - // oH, oW) - // dw convolution double grad: im2col(vol2col) + gemm - if (dW && ddX) { - dW->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, dW, static_cast(0)); - Tensor dW_arr = *dW; - dW_arr.Resize(filter_matrix_shape); - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; - for (int i = 0; i < batch_size; ++i) { - Tensor dy_batch = - transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); - Tensor ddx_batch = transformed_ddX.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; ++g) { - // im2col - Tensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step); - if (!is_expand) { - col.ShareDataWith(ddx_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, ddx_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - } else if (data_dim == 3U) { - vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); - } - - Tensor dw_slice = dW_arr.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(dy_slice, false, col_matrix, true, T(1.0), &dw_slice, - T(1.0)); - } - } - } - - // ddy = w * ddx + x * ddw ==> ddy(N, Cout, oH, oW), x/ddx(N, Cin, H, W), - // w/ddw(Cout, Cin, kh, kw) - // ddy convolution double grad: im2col(vol2col) + gemm - if (ddY) { - ddY->mutable_data(ctx.GetPlace()); - - Tensor transformed_ddY(ddY->dtype()); - if (channel_last) { - ResizeToChannelFirst(ctx, ddY, &transformed_ddY); - } else { - transformed_ddY = *ddY; - } - - set_zero(dev_ctx, &transformed_ddY, static_cast(0)); - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; - for (int i = 0; i < batch_size; ++i) { - 
Tensor ddy_batch = - transformed_ddY.Slice(i, i + 1).Resize(output_matrix_shape); - for (int g = 0; g < groups; ++g) { - // gemm - Tensor ddy_slice = ddy_batch.Slice(g * out_step, (g + 1) * out_step); - - if (ddX) { - Tensor ddx_batch = - transformed_ddX.Slice(i, i + 1).Resize(input_shape); - Tensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step); - if (!is_expand) { - col.ShareDataWith(ddx_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, ddx_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - } else if (data_dim == 3U) { - vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); - } - Tensor w_slice = W.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(w_slice, false, col_matrix, false, T(1.0), &ddy_slice, - T(0.0)); - } - - if (ddW_in) { - Tensor x_batch = transformed_X.Slice(i, i + 1).Resize(input_shape); - Tensor x_slice = x_batch.Slice(g * in_step, (g + 1) * in_step); - - Tensor ddW; - ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); - if (!is_expand) { - col.ShareDataWith(x_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, x_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - } else if (data_dim == 3U) { - vol2col(dev_ctx, x_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(ddw_slice, false, col_matrix, false, T(1.0), &ddy_slice, - T(1.0)); - } - } - } - if (channel_last) { - TransToChannelLast(ctx, &transformed_ddY, ddY); - } - } - } -}; - -template -class DepthwiseConvKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - Tensor filter = *context.Input("Filter"); - Tensor* output = context.Output("Output"); - output->mutable_data(context.GetPlace()); - - const std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - bool fuse_relu = context.Attr("fuse_relu_before_depthwise_conv"); - - const std::string padding_algorithm = - context.Attr("padding_algorithm"); - const std::string data_format = context.Attr("data_format"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - if (channel_last) { - PADDLE_ENFORCE_EQ( - output->dims()[output->dims().size() - 1] % - input->dims()[input->dims().size() - 1], - 0, platform::errors::InvalidArgument( - "ShapeError: The output channels must be a multiple of the " - "input channels. But receivced output channel number is %d " - "and input channel number is %d", - output->dims()[output->dims().size() - 1], - input->dims()[input->dims().size() - 1])); - } else { - PADDLE_ENFORCE_EQ( - output->dims()[1] % input->dims()[1], 0, - platform::errors::InvalidArgument( - "ShapeError: The output channels must be a multiple of the " - "input channels. 
But receivced output channel number is %d " - "and input channel number is %d", - output->dims()[1], input->dims()[1])); - } - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_format); - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - bool is_sys_pad = strides.size() * 2 == paddings.size() ? false : true; - if (!is_sys_pad) { - for (size_t i = 0; i < strides.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - auto& dev_ctx = context.template device_context(); - - if (fuse_relu) { - math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, - output, data_layout); - } else { - math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, - output, data_layout); - } - } -}; - -template -class DepthwiseConvGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - const Tensor* output_grad = - context.Input(framework::GradVarName("Output")); - Tensor* input_grad = - context.Output(framework::GradVarName("Input")); - Tensor* filter_grad = - context.Output(framework::GradVarName("Filter")); - Tensor filter = *context.Input("Filter"); - - if (!input_grad && !filter_grad) return; - - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - bool fuse_relu = context.Attr("fuse_relu_before_depthwise_conv"); - const std::string padding_algorithm = - context.Attr("padding_algorithm"); - const std::string data_format = context.Attr("data_format"); - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_format); - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - bool is_sys_pad = strides.size() * 2 == paddings.size() ? 
false : true; - if (!is_sys_pad) { - for (size_t i = 0; i < strides.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - - if (input_grad) { - input_grad->mutable_data(context.GetPlace()); - set_zero(dev_ctx, input_grad, static_cast(0)); - - if (fuse_relu) { - math::DepthwiseConvInputGradFunctor - depthwiseConvInputGrad; - depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, - paddings, dilations, input_grad, data_layout); - } else { - math::DepthwiseConvInputGradFunctor - depthwiseConvInputGrad; - depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, - paddings, dilations, input_grad, data_layout); - } - } - - if (filter_grad) { - filter_grad->mutable_data(context.GetPlace()); - set_zero(dev_ctx, filter_grad, static_cast(0)); - if (fuse_relu) { - math::DepthwiseConvFilterGradFunctor - depthwiseConvFilterGrad; - depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, - paddings, dilations, filter_grad, data_layout); - } else { - math::DepthwiseConvFilterGradFunctor - depthwiseConvFilterGrad; - depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, - paddings, dilations, filter_grad, data_layout); - } - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu index 141a99f60f1..1841b78af32 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu @@ -244,10 +244,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP using search = SearchAlgorithm; workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); - algo = search::Find(args, false, deterministic, workspace_size, ctx); + algo = search::Find( + args, false, deterministic, workspace_size, + ctx.template device_context()); #else using search = SearchAlgorithm; - algo = search::Find(args, false, deterministic, ctx); + algo = search::Find( + args, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args, algo)); #endif @@ -501,11 +505,14 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { using search1 = SearchAlgorithm; workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1)); - data_algo = - search1::Find(args1, false, deterministic, workspace_size, ctx); + data_algo = search1::Find( + args1, false, deterministic, workspace_size, + ctx.template device_context()); #else using search1 = SearchAlgorithm; - data_algo = search1::Find(args1, false, deterministic, ctx); + data_algo = search1::Find( + args1, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo)); #endif @@ -523,11 +530,14 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { using search2 = SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); - filter_algo = - search2::Find(args2, false, deterministic, workspace_size, ctx); + filter_algo = search2::Find( + args2, false, deterministic, workspace_size, + ctx.template device_context()); #else using search2 = SearchAlgorithm; - filter_algo = search2::Find(args2, false, deterministic, ctx); + filter_algo = search2::Find( + args2, false, deterministic, + ctx.template device_context()); workspace_size = 
std::max(workspace_size, search2::GetWorkspaceSize(args2, filter_algo)); #endif @@ -944,11 +954,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; workspace_size = search1::GetWorkspaceSize(args1); - bwd_algo1 = - search1::Find(args1, false, deterministic, workspace_size, ctx); + bwd_algo1 = search1::Find( + args1, false, deterministic, workspace_size, + ctx.template device_context()); #else using search1 = SearchAlgorithm; - bwd_algo1 = search1::Find(args1, false, deterministic, ctx); + bwd_algo1 = search1::Find( + args1, false, deterministic, + ctx.template device_context()); workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1); #endif } @@ -965,11 +978,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { using search2 = SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); - bwd_algo2 = - search2::Find(args2, false, deterministic, workspace_size, ctx); + bwd_algo2 = search2::Find( + args2, false, deterministic, workspace_size, + ctx.template device_context()); #else using search2 = SearchAlgorithm; - bwd_algo2 = search2::Find(args2, false, deterministic, ctx); + bwd_algo2 = search2::Find( + args2, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2, bwd_algo2)); #endif @@ -990,11 +1006,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { using search3 = SearchAlgorithm; workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); - filter_algo = - search3::Find(args3, false, deterministic, workspace_size, ctx); + filter_algo = search3::Find( + args3, false, deterministic, workspace_size, + ctx.template device_context()); #else using search3 = SearchAlgorithm; - filter_algo = search3::Find(args3, false, deterministic, ctx); + filter_algo = search3::Find( + args3, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo)); #endif @@ -1013,11 +1032,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { using search4 = SearchAlgorithm; workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); - data_algo = - search4::Find(args4, false, deterministic, workspace_size, ctx); + data_algo = search4::Find( + args4, false, deterministic, workspace_size, + ctx.template device_context()); #else using search4 = SearchAlgorithm; - data_algo = search4::Find(args4, false, deterministic, ctx); + data_algo = search4::Find( + args4, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); #endif diff --git a/paddle/fluid/operators/conv_transpose_op.cu b/paddle/fluid/operators/conv_transpose_op.cu index b2a4910222f..054cb4b3389 100644 --- a/paddle/fluid/operators/conv_transpose_op.cu +++ b/paddle/fluid/operators/conv_transpose_op.cu @@ -13,10 +13,150 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/conv_transpose_op.h" +#include "paddle/phi/kernels/gpu/depthwise_conv.h" namespace ops = paddle::operators; using CUDA = paddle::platform::CUDADeviceContext; +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +template +class DepthwiseConvTransposeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const std::string data_layout_str = + context.Attr("data_format"); + const framework::DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + const Tensor* input = context.Input("Input"); + Tensor filter = *context.Input("Filter"); + Tensor* output = context.Output("Output"); + output->mutable_data(context.GetPlace()); + + int groups = context.Attr("groups"); + PADDLE_ENFORCE_EQ( + groups, filter.dims()[0], + platform::errors::InvalidArgument( + "groups should be error to the 1st dimension of filter. But " + "received groups is %d and filter dimension[0] is %d", + groups, filter.dims()[0])); + + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + std::string padding_algorithm = + context.Attr("padding_algorithm"); + for (auto v : dilations) { + PADDLE_ENFORCE_EQ(v, 1, platform::errors::InvalidArgument( + "dilations should be 1 in depthwise conv. " + "But received dilations is %d", + v)); + } + + auto in_dims = input->dims(); + auto filter_dims = filter.dims(); + + framework::DDim in_data_dims; + if (data_layout != framework::DataLayout::kNHWC) { + in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); + } else { + in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); + } + framework::DDim filter_data_dims = + phi::slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = phi::vectorize(filter_data_dims); + UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); + + output->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, output, static_cast(0)); + + math::DepthwiseConvInputGradFunctor + depthwiseConvInputGrad; + depthwiseConvInputGrad( + static_cast::TYPE&>(dev_ctx), + *output, filter, *input, strides, + std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, + dilations, output, data_layout); + } +}; + +template +class DepthwiseConvTransposeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const std::string data_layout_str = + context.Attr("data_format"); + const framework::DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + const Tensor* input = context.Input("Input"); + const Tensor* output_grad = + context.Input(framework::GradVarName("Output")); + Tensor* input_grad = + context.Output(framework::GradVarName("Input")); + Tensor* filter_grad = + context.Output(framework::GradVarName("Filter")); + Tensor filter = *context.Input("Filter"); + + if (!input_grad && !filter_grad) return; + + auto& dev_ctx = context.template device_context(); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + std::string padding_algorithm = + context.Attr("padding_algorithm"); + + auto in_dims = input->dims(); + auto filter_dims = filter.dims(); + 
+ framework::DDim in_data_dims; + if (data_layout != framework::DataLayout::kNHWC) { + in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); + } else { + in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); + } + framework::DDim filter_data_dims = + phi::slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = phi::vectorize(filter_data_dims); + UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); + + if (input_grad) { + math::DepthwiseConvFunctor depthwiseConv; + depthwiseConv( + static_cast::TYPE&>(dev_ctx), + *output_grad, filter, strides, + std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, + dilations, input_grad, data_layout); + } + + if (filter_grad) { + phi::funcs::SetConstant set_zero; + filter_grad->mutable_data(context.GetPlace()); + set_zero(dev_ctx, filter_grad, static_cast(0)); + + math::DepthwiseConvFilterGradFunctor + depthwiseConvFilterGrad; + depthwiseConvFilterGrad( + static_cast::TYPE&>(dev_ctx), + *output_grad, *input, strides, + std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, + dilations, filter_grad, data_layout); + } + } +}; + +} // namespace operators +} // namespace paddle // conv2d REGISTER_OP_CUDA_KERNEL(conv2d_transpose, ops::GemmConvTransposeKernel, diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h index 76d6ad6bf2f..ee0fb7ab368 100644 --- a/paddle/fluid/operators/conv_transpose_op.h +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -21,7 +21,6 @@ limitations under the License. */ #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/vol2col.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -578,130 +577,5 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { } }; -template -class DepthwiseConvTransposeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const std::string data_layout_str = - context.Attr("data_format"); - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const Tensor* input = context.Input("Input"); - Tensor filter = *context.Input("Filter"); - Tensor* output = context.Output("Output"); - output->mutable_data(context.GetPlace()); - - int groups = context.Attr("groups"); - PADDLE_ENFORCE_EQ( - groups, filter.dims()[0], - platform::errors::InvalidArgument( - "groups should be error to the 1st dimension of filter. But " - "received groups is %d and filter dimension[0] is %d", - groups, filter.dims()[0])); - - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - for (auto v : dilations) { - PADDLE_ENFORCE_EQ(v, 1, platform::errors::InvalidArgument( - "dilations should be 1 in depthwise conv. 
" - "But received dilations is %d", - v)); - } - - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - output->mutable_data(context.GetPlace()); - auto& dev_ctx = context.template device_context(); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, output, static_cast(0)); - - math::DepthwiseConvInputGradFunctor - depthwiseConvInputGrad; - depthwiseConvInputGrad( - dev_ctx, *output, filter, *input, strides, - std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - dilations, output, data_layout); - } -}; - -template -class DepthwiseConvTransposeGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const std::string data_layout_str = - context.Attr("data_format"); - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const Tensor* input = context.Input("Input"); - const Tensor* output_grad = - context.Input(framework::GradVarName("Output")); - Tensor* input_grad = - context.Output(framework::GradVarName("Input")); - Tensor* filter_grad = - context.Output(framework::GradVarName("Filter")); - Tensor filter = *context.Input("Filter"); - - if (!input_grad && !filter_grad) return; - - auto& dev_ctx = context.template device_context(); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - if (input_grad) { - math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv( - dev_ctx, *output_grad, filter, strides, - std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - dilations, input_grad, data_layout); - } - - if (filter_grad) { - phi::funcs::SetConstant set_zero; - filter_grad->mutable_data(context.GetPlace()); - set_zero(dev_ctx, filter_grad, static_cast(0)); - - math::DepthwiseConvFilterGradFunctor - depthwiseConvFilterGrad; - depthwiseConvFilterGrad( - dev_ctx, *output_grad, *input, strides, - std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - dilations, filter_grad, data_layout); - } - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index 1864bdbb866..b3792a176fa 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -22,6 +22,7 @@ 
limitations under the License. */ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/fused/cudnn_norm_conv.cu.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace framework = paddle::framework; @@ -29,10 +30,10 @@ namespace platform = paddle::platform; namespace op = paddle::operators; using Tensor = paddle::framework::Tensor; -USE_OP(conv2d); -USE_OP(conv2d_grad); -USE_OP_DEVICE_KERNEL(conv2d, CUDNN); -USE_OP_DEVICE_KERNEL(conv2d_grad, CUDNN); +USE_OP_ITSELF(conv2d); +USE_OP_ITSELF(conv2d_grad); +PD_DECLARE_KERNEL(conv2d, GPUDNN, ALL_LAYOUT); +PD_DECLARE_KERNEL(conv2d_grad, GPUDNN, ALL_LAYOUT); template void InitRandomTensor(const std::vector &dims, diff --git a/paddle/fluid/operators/math/depthwise_conv.h b/paddle/fluid/operators/math/depthwise_conv.h deleted file mode 100644 index e41f0aedf39..00000000000 --- a/paddle/fluid/operators/math/depthwise_conv.h +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { -namespace math { - -using DataLayout = framework::DataLayout; - -/* - * \brief Compute the depthwise convolution which include - * forward process and backpropagation process - */ -template -class DepthwiseConvFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& filter, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, framework::Tensor* output, - const DataLayout data_layout = DataLayout::kNCHW); -}; - -template -class DepthwiseConvInputGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& filter, - const framework::Tensor& output_grad, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, - framework::Tensor* input_grad, - const DataLayout data_layout = DataLayout::kNCHW); -}; - -template -class DepthwiseConvFilterGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output_grad, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, - framework::Tensor* filter_grad, - const DataLayout data_layout = DataLayout::kNCHW); -}; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/vol2col.cc b/paddle/fluid/operators/math/vol2col.cc index 42bf1f471de..bc5a589ed6f 100644 --- a/paddle/fluid/operators/math/vol2col.cc +++ b/paddle/fluid/operators/math/vol2col.cc @@ -14,6 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/vol2col.h" +#include "paddle/phi/backends/cpu/cpu_context.h" + namespace paddle { namespace platform { class CPUDeviceContext; @@ -141,6 +143,116 @@ class Vol2ColFunctor { } }; +template +class Vol2ColFunctor { + public: + void operator()(const phi::CPUContext& context, const framework::Tensor& vol, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* col, + const DataLayout data_layout) const { + PADDLE_ENFORCE_EQ(vol.dims().size(), 4, + platform::errors::InvalidArgument( + "The dimension of vol should be 4, but received %d.", + vol.dims().size())); + + PADDLE_ENFORCE_EQ(col->dims().size(), 7, + platform::errors::InvalidArgument( + "The dimension of col should be 7, but received %d.", + col->dims().size())); + + int input_channels = + (data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]); + int input_depth = + (data_layout != DataLayout::kNHWC ? vol.dims()[1] : vol.dims()[0]); + int input_height = + (data_layout != DataLayout::kNHWC ? vol.dims()[2] : vol.dims()[1]); + int input_width = + (data_layout != DataLayout::kNHWC ? vol.dims()[3] : vol.dims()[2]); + int filter_depth = col->dims()[1]; + int filter_height = col->dims()[2]; + int filter_width = col->dims()[3]; + int output_depth = col->dims()[4]; + int output_height = col->dims()[5]; + int output_width = col->dims()[6]; + int channels_col = + input_channels * filter_depth * filter_height * filter_width; + + // changed + bool paddings_size_is_6 = (paddings.size() == 6); + int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0]; + int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0]; + int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1]; + int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1]; + int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; + int pad_w_right = paddings_size_is_6 ? 
paddings[5] : paddings[2]; + + auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1; + PADDLE_ENFORCE_EQ( + input_depth_tmp, output_depth, + platform::errors::InvalidArgument( + "input_depth(%d) and output_depth(%d) are mismatching.", + input_depth_tmp, output_depth)); + auto input_height_tmp = (input_height + pad_h_up + pad_h_down - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1; + PADDLE_ENFORCE_EQ( + input_height_tmp, output_height, + platform::errors::InvalidArgument( + "input_height(%d) and output_height(%d) are mismatching.", + input_height_tmp, output_height)); + auto input_width_tmp = (input_width + pad_w_left + pad_w_right - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1; + PADDLE_ENFORCE_EQ( + input_width_tmp, output_width, + platform::errors::InvalidArgument( + "input_width(%d) and output_width(%d) are mismatching.", + input_width_tmp, output_width)); + const T* vol_data = vol.data(); + T* col_data = col->data(); + + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int d_offset = (c / filter_width / filter_height) % filter_depth; + int c_in = c / filter_width / filter_height / filter_depth; + for (int d = 0; d < output_depth; ++d) { + int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0]; + for (int h = 0; h < output_height; ++h) { + int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1]; + for (int w = 0; w < output_width; ++w) { + int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2]; + + int col_idx = + ((c * output_depth + d) * output_height + h) * output_width + w; + int vol_idx; + if (data_layout != DataLayout::kNHWC) { + vol_idx = ((c_in * input_depth + d_pad) * input_height + h_pad) * + input_width + + w_pad; + } else { + vol_idx = ((d_pad * input_height + h_pad) * input_width + w_pad) * + input_channels + + c_in; + } + col_data[col_idx] = + (h_pad < 0 || h_pad >= input_height || w_pad < 0 || + w_pad >= input_width || d_pad < 0 || d_pad >= input_depth) + ? static_cast(0) + : vol_data[vol_idx]; + } + } + } + } + } +}; + /* * vol = [input_channels,input_depth, input_height, input_width] * col = @@ -258,10 +370,125 @@ class Col2VolFunctor { } }; +template +class Col2VolFunctor { + public: + void operator()(const phi::CPUContext& context, const framework::Tensor& col, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* vol, + const DataLayout data_layout) const { + PADDLE_ENFORCE_EQ(vol->dims().size(), 4, + platform::errors::InvalidArgument( + "The dimension of vol should be 4, but received %d.", + vol->dims().size())); + + PADDLE_ENFORCE_EQ(col.dims().size(), 7, + platform::errors::InvalidArgument( + "The dimension of col should be 7, but received %d.", + col.dims().size())); + + int input_channels = + (data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]); + int input_depth = + (data_layout != DataLayout::kNHWC ? vol->dims()[1] : vol->dims()[0]); + int input_height = + (data_layout != DataLayout::kNHWC ? vol->dims()[2] : vol->dims()[1]); + int input_width = + (data_layout != DataLayout::kNHWC ? 
vol->dims()[3] : vol->dims()[2]); + int filter_depth = col.dims()[1]; + int filter_height = col.dims()[2]; + int filter_width = col.dims()[3]; + int output_depth = col.dims()[4]; + int output_height = col.dims()[5]; + int output_width = col.dims()[6]; + int channels_col = + input_channels * filter_depth * filter_height * filter_width; + + bool paddings_size_is_6 = (paddings.size() == 6); + int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0]; + int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0]; + int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1]; + int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1]; + int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; + int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2]; + + auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1; + PADDLE_ENFORCE_EQ( + input_depth_tmp, output_depth, + platform::errors::InvalidArgument( + "input_depth(%d) and output_depth(%d) are mismatching.", + input_depth_tmp, output_depth)); + auto input_height_tmp = (input_height + pad_h_up + pad_h_down - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1; + PADDLE_ENFORCE_EQ( + input_height_tmp, output_height, + platform::errors::InvalidArgument( + "input_height(%d) and output_height(%d) are mismatching.", + input_height_tmp, output_height)); + auto input_width_tmp = (input_width + pad_w_left + pad_w_right - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1; + PADDLE_ENFORCE_EQ( + input_width_tmp, output_width, + platform::errors::InvalidArgument( + "input_width(%d) and output_width(%d) are mismatching.", + input_width_tmp, output_width)); + T* vol_data = vol->data(); + const T* col_data = col.data(); + + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int d_offset = (c / filter_width / filter_height) % filter_depth; + int cIm = c / filter_width / filter_height / filter_depth; + for (int d = 0; d < output_depth; ++d) { + int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0]; + for (int h = 0; h < output_height; ++h) { + int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1]; + for (int w = 0; w < output_width; ++w) { + int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2]; + + if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 && + w_pad < input_width && d_pad >= 0 && d_pad < input_depth) { + int vol_idx; + if (data_layout != DataLayout::kNHWC) { + vol_idx = ((cIm * input_depth + d_pad) * input_height + h_pad) * + input_width + + w_pad; + } else { + vol_idx = + ((d_pad * input_height + h_pad) * input_width + w_pad) * + input_channels + + cIm; + } + int col_idx = + ((c * output_depth + d) * output_height + h) * output_width + + w; + vol_data[vol_idx] += col_data[col_idx]; + } + } + } + } + } + } +}; + template class Vol2ColFunctor; template class Vol2ColFunctor; +template class Vol2ColFunctor; +template class Vol2ColFunctor; + template class Col2VolFunctor; template class Col2VolFunctor; +template class Col2VolFunctor; +template class Col2VolFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index 2fdeecf8934..05cd264cf3e 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -33,7 +33,7 @@ 
USE_OP(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); -USE_OP(conv2d); +USE_OP_ITSELF(conv2d); USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32); namespace paddle { @@ -55,7 +55,7 @@ class CacheTester { onednn_dev_ctx_->ResetBlobMap(nullptr); } - bool Analyze(unsigned short int num_entries) { + bool Analyze(uint16_t num_entries) { // Number of created objects in cache should be as expected (num_entries) return onednn_dev_ctx_->GetCachedObjectsNumber() == num_entries; } diff --git a/paddle/phi/core/compat/arg_map_context.h b/paddle/phi/core/compat/arg_map_context.h index f625d57df2e..688a0e54a0c 100644 --- a/paddle/phi/core/compat/arg_map_context.h +++ b/paddle/phi/core/compat/arg_map_context.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include +#include "paddle/phi/common/place.h" #include "paddle/utils/any.h" #include "paddle/utils/flat_hash_map.h" #include "paddle/utils/small_vector.h" diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index ef51d6daf6a..4ffa1826a29 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -10,7 +10,7 @@ add_subdirectory(funcs) set_property(GLOBAL PROPERTY PHI_KERNELS "") set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils) -set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col concat_and_split_functor softmax) +set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor softmax) # remove this dep after removing fluid deps on tensor creation set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) diff --git a/paddle/phi/kernels/conv_grad_grad_kernel.h b/paddle/phi/kernels/conv_grad_grad_kernel.h new file mode 100644 index 00000000000..339f1c00eaa --- /dev/null +++ b/paddle/phi/kernels/conv_grad_grad_kernel.h @@ -0,0 +1,61 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
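// Note: the headers added from here on (conv_grad_grad_kernel.h,
// conv_grad_kernel.h, conv_kernel.h) declare the phi replacements for the
// fluid GemmConv*/DepthwiseConv* kernels deleted earlier in this patch. Each
// kernel is a free function that takes the device context, the tensors and
// every attribute (strides, paddings, groups, dilations, ...) explicitly
// instead of reading them from an ExecutionContext, which is consistent with
// the tests above switching from USE_OP(conv2d) to USE_OP_ITSELF(conv2d) plus
// PD_DECLARE_KERNEL(conv2d, ...). A registration for such a kernel is expected
// to look roughly like the line below (backend and dtypes illustrative only):
//   PD_REGISTER_KERNEL(conv2d, CPU, ALL_LAYOUT, phi::ConvKernel, float, double) {}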
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ConvGradGradKernel(const Context& dev_ctx, + paddle::optional input_grad_grad, + paddle::optional filter_grad_grad, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* out_grad_grad, + DenseTensor* input_grad, + DenseTensor* filter_grad); + +template +void Conv3DGradGradKernel(const Context& dev_ctx, + paddle::optional input_grad_grad, + paddle::optional filter_grad_grad, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* out_grad_grad, + DenseTensor* input_grad, + DenseTensor* filter_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/conv_grad_kernel.h b/paddle/phi/kernels/conv_grad_kernel.h new file mode 100644 index 00000000000..bad30989ac9 --- /dev/null +++ b/paddle/phi/kernels/conv_grad_kernel.h @@ -0,0 +1,73 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ConvGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* input_grad, + DenseTensor* filter_grad); + +template +void Conv3DGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* input_grad, + DenseTensor* filter_grad); + +template +void DepthwiseConvGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + bool fuse_relu, + DenseTensor* input_grad, + DenseTensor* filter_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/conv_kernel.h b/paddle/phi/kernels/conv_kernel.h new file mode 100644 index 00000000000..eb0bfdd0275 --- /dev/null +++ b/paddle/phi/kernels/conv_kernel.h @@ -0,0 +1,67 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ConvKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* out); + +template +void Conv3DKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* out); + +template +void DepthwiseConvKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + bool fuse_relu, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc new file mode 100644 index 00000000000..f157bb017f8 --- /dev/null +++ b/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
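
For reference, a fully spelled-out form of the new functional conv kernel declaration, with the template parameter list and container element types written explicitly (these are inferred from context, not quoted verbatim from the header above), would read roughly as follows; the grad, grad_grad and depthwise variants follow the same shape with extra in/out tensors.

#include <string>
#include <vector>

#include "paddle/phi/core/dense_tensor.h"

namespace phi {

// Sketch of the functional-style kernel entry point: every former op
// attribute (strides, paddings, dilations, flags) is an explicit argument.
template <typename T, typename Context>
void ConvKernel(const Context& dev_ctx,
                const DenseTensor& input,
                const DenseTensor& filter,
                const std::vector<int>& strides,
                const std::vector<int>& paddings,
                const std::string& padding_algorithm,
                int groups,
                const std::vector<int>& dilations,
                const std::string& data_format,
                bool use_addto,
                int workspace_size_MB,
                bool exhaustive_search,
                DenseTensor* out);

}  // namespace phi
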
+ +#include "paddle/phi/kernels/conv_grad_grad_kernel.h" +#include "paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +template +void Conv3DGradGradKernel(const Context& ctx, + paddle::optional input_grad_grad, + paddle::optional filter_grad_grad, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search_t, + DenseTensor* out_grad_grad, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + ConvGradGradKernel(ctx, + input_grad_grad, + filter_grad_grad, + out_grad, + input, + filter, + strides, + paddings_t, + padding_algorithm, + groups, + dilations_t, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search_t, + out_grad_grad, + input_grad, + filter_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + conv2d_grad_grad, CPU, ALL_LAYOUT, phi::ConvGradGradKernel, float, double) { +} + +PD_REGISTER_KERNEL(conv3d_grad_grad, + CPU, + ALL_LAYOUT, + phi::Conv3DGradGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/conv_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_grad_kernel.cc new file mode 100644 index 00000000000..994ad861bd1 --- /dev/null +++ b/paddle/phi/kernels/cpu/conv_grad_kernel.cc @@ -0,0 +1,103 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
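
For reference, the second-order quantities produced by a conv double-backward kernel such as conv2d_grad_grad follow the usual relations (this summary is an added note, not text from the files above): with forward y = x * w, upstream gradient g = dL/dy, and incoming tangents ddx (input_grad_grad) and ddw (filter_grad_grad),

\ddot{y} = \ddot{x} \ast w + x \ast \ddot{w}
\Delta(\nabla_w L) = \mathrm{conv\_bwd\_filter}(\ddot{x},\, g)
\Delta(\nabla_x L) = \mathrm{conv\_bwd\_input}(\ddot{w},\, g)

so out_grad_grad is assembled from the two tangents, while the extra contributions to filter_grad and input_grad reuse the ordinary backward-filter and backward-input convolutions with one operand replaced by a tangent.
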
+ +#include "paddle/phi/kernels/conv_grad_kernel.h" +#include "paddle/phi/kernels/impl/conv_grad_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void DepthwiseConvGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + bool fuse_relu, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + ConvGradKernel(dev_ctx, + out_grad, + input, + filter, + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + input_grad, + filter_grad); +} + +template +void Conv3DGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + ConvGradKernel(dev_ctx, + out_grad, + input, + filter, + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + input_grad, + filter_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + conv2d_grad, CPU, ALL_LAYOUT, phi::ConvGradKernel, float, double) {} + +PD_REGISTER_KERNEL(depthwise_conv2d_grad, + CPU, + ALL_LAYOUT, + phi::DepthwiseConvGradKernel, + float, + double) {} + +PD_REGISTER_KERNEL( + conv3d_grad, CPU, ALL_LAYOUT, phi::Conv3DGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/conv_kernel.cc b/paddle/phi/kernels/cpu/conv_kernel.cc new file mode 100644 index 00000000000..e0b4ee7d577 --- /dev/null +++ b/paddle/phi/kernels/cpu/conv_kernel.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/conv_kernel.h" +#include "paddle/phi/kernels/impl/conv_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +template +void DepthwiseConvKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + bool fuse_relu, + DenseTensor* out) { + ConvKernel(dev_ctx, + input, + filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + out); +} + +template +void Conv3DKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* out) { + ConvKernel(dev_ctx, + input, + filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(conv2d, CPU, ALL_LAYOUT, phi::ConvKernel, float, double) {} + +PD_REGISTER_KERNEL(depthwise_conv2d, + CPU, + ALL_LAYOUT, + phi::DepthwiseConvKernel, + float, + double) {} + +PD_REGISTER_KERNEL(conv3d, CPU, ALL_LAYOUT, phi::Conv3DKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/conv_util.h b/paddle/phi/kernels/cpu/conv_util.h new file mode 100644 index 00000000000..d26d89086b2 --- /dev/null +++ b/paddle/phi/kernels/cpu/conv_util.h @@ -0,0 +1,91 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/phi/core/ddim.h" + +namespace phi { + +template +inline void UpdatePaddingAndDilation(std::vector* paddings, + std::vector* dilation, + const std::string padding_algorithm, + const DDim data_dims, + const std::vector& strides, + const std::vector& ksize) { + // set padding size == data_dims.size() * 2 + auto data_shape = vectorize(data_dims); + if (static_cast(paddings->size()) == data_dims.size()) { + for (int i = 0; i < data_dims.size(); ++i) { + T copy_pad = *(paddings->begin() + 2 * i); + paddings->insert(paddings->begin() + 2 * i + 1, copy_pad); + } + } else { + PADDLE_ENFORCE_EQ( + data_dims.size() * 2, + paddings->size(), + phi::errors::InvalidArgument( + "Attribute padding's size should be the same or twice as the " + "input's dimension. 
" + "But recieved: padding's size is %d, padding is [%s]; input's " + "dimension is %d, input's shape is [%s].", + paddings->size(), + make_ddim(*paddings), + data_dims.size(), + data_dims)); + } + + // when padding_algorithm is "VALID" or "SAME" + if (padding_algorithm == "SAME") { + for (int i = 0; i < data_dims.size(); ++i) { + T out_size = (data_dims[i] + strides[i] - 1) / strides[i]; + T pad_sum = + std::max((out_size - 1) * strides[i] + ksize[i] - data_shape[i], + static_cast(0)); + T pad_0 = pad_sum / 2; + T pad_1 = pad_sum - pad_0; + *(paddings->begin() + i * 2) = pad_0; + *(paddings->begin() + i * 2 + 1) = pad_1; + + // dilation + *(dilation->begin() + i) = 1; + } + + } else if (padding_algorithm == "VALID") { + for (auto it = paddings->begin(); it != paddings->end(); it++) { + *it = 0; + } + } +} + +inline bool IsExpand(const std::vector& filter_dim, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations) { + bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true; + for (size_t j = 0; j < strides.size(); ++j) { + filter_1 = filter_1 && (static_cast(filter_dim[j + 2]) == 1); + strides_1 = strides_1 && (strides[j] == 1); + padding_0 = padding_0 && (paddings[j] == 0); + dilation_1 = dilation_1 && (dilations[j] == 1); + } + if (paddings.size() != strides.size()) { + for (size_t j = 0; j < paddings.size(); ++j) { + padding_0 = padding_0 && (paddings[j] == 0); + } + } + return !(filter_1 && strides_1 && padding_0 && dilation_1); +} + +} // namespace phi diff --git a/paddle/phi/kernels/depthwise_conv_grad_kernel.h b/paddle/phi/kernels/depthwise_conv_grad_kernel.h new file mode 100644 index 00000000000..b5eff76e90c --- /dev/null +++ b/paddle/phi/kernels/depthwise_conv_grad_kernel.h @@ -0,0 +1,19 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi {} // namespace phi diff --git a/paddle/phi/kernels/depthwise_conv_kernel.h b/paddle/phi/kernels/depthwise_conv_kernel.h new file mode 100644 index 00000000000..b5eff76e90c --- /dev/null +++ b/paddle/phi/kernels/depthwise_conv_kernel.h @@ -0,0 +1,19 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi {} // namespace phi diff --git a/paddle/phi/kernels/funcs/batch_norm_utils.h b/paddle/phi/kernels/funcs/batch_norm_utils.h new file mode 100644 index 00000000000..21ebae8487f --- /dev/null +++ b/paddle/phi/kernels/funcs/batch_norm_utils.h @@ -0,0 +1,143 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +using Tensor = DenseTensor; + +template +inline void ResizeToChannelFirst(const DeviceContext& context, + const Tensor* input, + Tensor* transformed_input) { + int dim = input->dims().size() - 2; + if (dim == 3) { + // input + transformed_input->Resize(input->dims()); + + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[4]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + in_dims_vec[4] = input->dims()[3]; + transformed_input->Resize(make_ddim(in_dims_vec)); + transformed_input->mutable_data(context.GetPlace()); + + } else if (dim == 2) { + // input + transformed_input->Resize(input->dims()); + + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[3]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + transformed_input->Resize(make_ddim(in_dims_vec)); + transformed_input->mutable_data(context.GetPlace()); + } else if (dim == 1) { + transformed_input->Resize(input->dims()); + + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[1]; + transformed_input->Resize(make_ddim(in_dims_vec)); + transformed_input->mutable_data(context.GetPlace()); + } +} + +template +inline void ResizeToChannelLast(const DeviceContext& context, + const Tensor* input, + Tensor* transformed_input) { + int dim = input->dims().size() - 2; + if (dim == 3) { + // input + transformed_input->Resize(input->dims()); + + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[3]; + in_dims_vec[3] = input->dims()[4]; + in_dims_vec[4] = input->dims()[1]; + transformed_input->Resize(make_ddim(in_dims_vec)); + transformed_input->mutable_data(context.GetPlace()); + + } else if (dim == 2) { + // input + transformed_input->Resize(input->dims()); + + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[3]; + in_dims_vec[3] = input->dims()[1]; + transformed_input->Resize(make_ddim(in_dims_vec)); + transformed_input->mutable_data(context.GetPlace()); + } else if (dim == 1) { + transformed_input->Resize(input->dims()); + + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[1]; + transformed_input->Resize(make_ddim(in_dims_vec)); + transformed_input->mutable_data(context.GetPlace()); + } +} + +template 
+inline void TransToChannelFirst(const DeviceContext& context, + const Tensor* input, + Tensor* transformed_input) { + VLOG(5) << "Why am I called?"; + int dim = input->dims().size() - 2; + if (dim == 3) { + std::vector axis{0, 4, 1, 2, 3}; + phi::funcs::Transpose trans5; + trans5(context, *input, transformed_input, axis); + + } else if (dim == 2) { + std::vector axis{0, 3, 1, 2}; + phi::funcs::Transpose trans4; + trans4(context, *input, transformed_input, axis); + } else if (dim == 1) { + std::vector axis{0, 2, 1}; + phi::funcs::Transpose trans3; + trans3(context, *input, transformed_input, axis); + } +} + +template +inline void TransToChannelLast(const DeviceContext& context, + const Tensor* input, + Tensor* transformed_input) { + int dim = input->dims().size() - 2; + if (dim == 3) { + std::vector axis{0, 2, 3, 4, 1}; + phi::funcs::Transpose trans5; + trans5(context, *input, transformed_input, axis); + + } else if (dim == 2) { + std::vector axis{0, 2, 3, 1}; + phi::funcs::Transpose trans4; + trans4(context, *input, transformed_input, axis); + } else if (dim == 1) { + std::vector axis{0, 2, 1}; + phi::funcs::Transpose trans3; + trans3(context, *input, transformed_input, axis); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/funcs/padding.h b/paddle/phi/kernels/funcs/padding.h index 6d10ff2dfcf..e2c4e766b60 100644 --- a/paddle/phi/kernels/funcs/padding.h +++ b/paddle/phi/kernels/funcs/padding.h @@ -15,10 +15,10 @@ limitations under the License. */ #pragma once #include #include -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/gpu/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpu/conv_grad_grad_kernel.cu new file mode 100644 index 00000000000..6449a193a08 --- /dev/null +++ b/paddle/phi/kernels/gpu/conv_grad_grad_kernel.cu @@ -0,0 +1,23 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/conv_grad_grad_kernel.h" +#include "paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + conv2d_grad_grad, GPU, ALL_LAYOUT, phi::ConvGradGradKernel, float, double) { +} diff --git a/paddle/phi/kernels/gpu/conv_grad_kernel.cu b/paddle/phi/kernels/gpu/conv_grad_kernel.cu new file mode 100644 index 00000000000..4df7bb26adf --- /dev/null +++ b/paddle/phi/kernels/gpu/conv_grad_kernel.cu @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
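
The two groups of helpers above differ only in that ResizeToChannel* adjusts dims and allocates, while TransToChannel* actually permutes the data, using axis order {0, 4, 1, 2, 3} for 5-D, {0, 3, 1, 2} for 4-D and {0, 2, 1} for 3-D tensors. A plain host-side sketch of the 4-D NHWC -> NCHW case (names illustrative; tensor is a flat row-major vector):

#include <cstdio>
#include <vector>

// Apply the {0, 3, 1, 2} permutation: NHWC -> NCHW.
std::vector<float> ToChannelFirst(const std::vector<float>& nhwc,
                                  int n, int h, int w, int c) {
  std::vector<float> nchw(nhwc.size());
  for (int in = 0; in < n; ++in)
    for (int ih = 0; ih < h; ++ih)
      for (int iw = 0; iw < w; ++iw)
        for (int ic = 0; ic < c; ++ic)
          nchw[((in * c + ic) * h + ih) * w + iw] =
              nhwc[((in * h + ih) * w + iw) * c + ic];
  return nchw;
}

int main() {
  // 1x2x2x3 NHWC tensor -> 1x3x2x2 NCHW tensor
  std::vector<float> x(12);
  for (int i = 0; i < 12; ++i) x[i] = static_cast<float>(i);
  auto y = ToChannelFirst(x, 1, 2, 2, 3);
  std::printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]);  // prints: 0 3 6 9
  return 0;
}
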
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/conv_grad_kernel.h" +#include "paddle/phi/kernels/impl/conv_grad_kernel_impl.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void Conv3DGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + ConvGradKernel(dev_ctx, + out_grad, + input, + filter, + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + input_grad, + filter_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + conv2d_grad, GPU, ALL_LAYOUT, phi::ConvGradKernel, float, double) {} + +PD_REGISTER_KERNEL( + conv3d_grad, GPU, ALL_LAYOUT, phi::Conv3DGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/conv_kernel.cu b/paddle/phi/kernels/gpu/conv_kernel.cu new file mode 100644 index 00000000000..680ee4426af --- /dev/null +++ b/paddle/phi/kernels/gpu/conv_kernel.cu @@ -0,0 +1,56 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/conv_kernel.h" +#include "paddle/phi/kernels/impl/conv_kernel_impl.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void Conv3DKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* out) { + ConvKernel(dev_ctx, + input, + filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(conv2d, GPU, ALL_LAYOUT, phi::ConvKernel, float, double) {} + +PD_REGISTER_KERNEL(conv3d, GPU, ALL_LAYOUT, phi::Conv3DKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/conv_test_kernel.cu b/paddle/phi/kernels/gpu/conv_test_kernel.cu new file mode 100644 index 00000000000..0544a1e298b --- /dev/null +++ b/paddle/phi/kernels/gpu/conv_test_kernel.cu @@ -0,0 +1,13 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/phi/kernels/gpu/depthwise_conv.h similarity index 62% rename from paddle/fluid/operators/math/depthwise_conv.cu rename to paddle/phi/kernels/gpu/depthwise_conv.h index a4665a8f9a6..5270a4b2fdb 100644 --- a/paddle/fluid/operators/math/depthwise_conv.cu +++ b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserved. +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,8 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#pragma once #include +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/core/hostdevice.h" + #ifdef __NVCC__ #include #endif @@ -21,7 +25,7 @@ limitations under the License. 
*/ #include namespace cub = hipcub; #endif -#include "paddle/fluid/operators/math/depthwise_conv.h" + #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -30,6 +34,58 @@ namespace paddle { namespace operators { namespace math { +using DataLayout = framework::DataLayout; + +/* + * \brief Compute the depthwise convolution which include + * forward process and backpropagation process + */ +template +class DepthwiseConvFunctor { + public: + void operator()(const DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + framework::Tensor* output, + const DataLayout data_layout = DataLayout::kNCHW); +}; + +template +class DepthwiseConvInputGradFunctor { + public: + void operator()(const DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& filter, + const framework::Tensor& output_grad, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + framework::Tensor* input_grad, + const DataLayout data_layout = DataLayout::kNCHW); +}; + +template +class DepthwiseConvFilterGradFunctor { + public: + void operator()(const DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output_grad, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + framework::Tensor* filter_grad, + const DataLayout data_layout = DataLayout::kNCHW); +}; + template static __forceinline__ __device__ T WarpReduceSum(T val, int warp_size) { typedef cub::WarpReduce WarpReduce; @@ -293,8 +349,12 @@ __device__ __inline__ void KernelDepthwiseConvCFilterNHWC( } } -template +template __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) { int final_filter_multiplier = filter_multiplier; int h_stride = stride_height; @@ -306,34 +366,88 @@ __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) { } if (c_filter == -1) { if (data_layout != DataLayout::kNHWC) { - KernelDepthwiseConvNCHW( - input_data, filter_data, batch_size, output_channels, output_height, - output_width, input_channels, input_height, input_width, - final_filter_multiplier, filter_height, filter_width, h_stride, - w_stride, padding_height, padding_width, dilate_height, dilate_width, - output_data); + KernelDepthwiseConvNCHW(input_data, + filter_data, + batch_size, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, + output_data); } else { - KernelDepthwiseConvNHWC( - input_data, filter_data, batch_size, output_channels, output_height, - output_width, input_channels, input_height, input_width, - final_filter_multiplier, filter_height, filter_width, h_stride, - w_stride, padding_height, padding_width, dilate_height, dilate_width, - output_data); + KernelDepthwiseConvNHWC(input_data, + filter_data, + batch_size, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, + output_data); } } else { if (data_layout != DataLayout::kNHWC) { KernelDepthwiseConvCFilterNCHW( 
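
The KernelDepthwiseConvSp dispatch above only selects between NCHW/NHWC and compile-time-specialized variants of one computation. As a reference for what that computation is, a plain host-side NCHW depthwise convolution can be written as below; the (C_out, 1, K_h, K_w) filter layout and symmetric per-dimension padding are assumptions of this sketch, and all names are illustrative.

#include <cstddef>
#include <vector>

// Host reference for depthwise convolution, NCHW layout.
// input : N x C_in x H x W       (flat, row-major)
// filter: C_out x 1 x K_h x K_w  (C_out = C_in * multiplier)
// output: N x C_out x OH x OW
void DepthwiseConvRef(const std::vector<float>& in, const std::vector<float>& flt,
                      std::vector<float>* out, int N, int C_in, int H, int W,
                      int multiplier, int K_h, int K_w, int stride_h, int stride_w,
                      int pad_h, int pad_w, int dil_h, int dil_w) {
  const int C_out = C_in * multiplier;
  const int OH = (H + 2 * pad_h - (dil_h * (K_h - 1) + 1)) / stride_h + 1;
  const int OW = (W + 2 * pad_w - (dil_w * (K_w - 1) + 1)) / stride_w + 1;
  out->assign(static_cast<std::size_t>(N) * C_out * OH * OW, 0.f);
  for (int n = 0; n < N; ++n)
    for (int oc = 0; oc < C_out; ++oc) {
      const int ic = oc / multiplier;  // each output channel reads one input channel
      for (int oh = 0; oh < OH; ++oh)
        for (int ow = 0; ow < OW; ++ow) {
          float s = 0.f;
          for (int kh = 0; kh < K_h; ++kh)
            for (int kw = 0; kw < K_w; ++kw) {
              const int ih = oh * stride_h - pad_h + kh * dil_h;
              const int iw = ow * stride_w - pad_w + kw * dil_w;
              if (ih < 0 || ih >= H || iw < 0 || iw >= W) continue;
              s += in[((n * C_in + ic) * H + ih) * W + iw] *
                   flt[(oc * K_h + kh) * K_w + kw];
            }
          (*out)[((n * C_out + oc) * OH + oh) * OW + ow] = s;
        }
    }
}

As the specialization name suggests, the fuse_relu_before_conv variants additionally clamp each input value at zero before the multiply.
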
- input_data, filter_data, batch_size, output_channels, output_height, - output_width, input_channels, input_height, input_width, - final_filter_multiplier, filter_height, filter_width, h_stride, - w_stride, padding_height, padding_width, dilate_height, dilate_width, + input_data, + filter_data, + batch_size, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, output_data); } else { KernelDepthwiseConvCFilterNHWC( - input_data, filter_data, batch_size, output_channels, output_height, - output_width, input_channels, input_height, input_width, - final_filter_multiplier, filter_height, filter_width, h_stride, - w_stride, padding_height, padding_width, dilate_height, dilate_width, + input_data, + filter_data, + batch_size, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, output_data); } } @@ -464,7 +578,9 @@ __device__ __inline__ void KernelDepthwiseConvInputGradNHWC( } } -template __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNCHW( ARG_DEFINE_KernelDepthwiseConvInputGrad) { @@ -525,7 +641,9 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNCHW( } } -template __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNHWC( ARG_DEFINE_KernelDepthwiseConvInputGrad) { @@ -595,8 +713,12 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNHWC( } } -template +template __global__ void KernelDepthwiseConvInputGradSp( ARG_DEFINE_KernelDepthwiseConvInputGrad) { int final_filter_multiplier = filter_multiplier; @@ -611,36 +733,100 @@ __global__ void KernelDepthwiseConvInputGradSp( if (c_filter_multiplier == 0 || c_filter == -1) { if (data_layout != DataLayout::kNHWC) { KernelDepthwiseConvInputGradNCHW( - input_data, output_grad_data, filter_data, batch_size, - output_channels, output_height, output_width, input_channels, - input_height, input_width, final_filter_multiplier, filter_height, - filter_width, h_stride, w_stride, padding_height, padding_width, - dilate_height, dilate_width, input_grad_data); + input_data, + output_grad_data, + filter_data, + batch_size, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, + input_grad_data); } else { KernelDepthwiseConvInputGradNHWC( - input_data, output_grad_data, filter_data, batch_size, - output_channels, output_height, output_width, input_channels, - input_height, input_width, final_filter_multiplier, filter_height, - filter_width, h_stride, w_stride, padding_height, padding_width, - dilate_height, dilate_width, input_grad_data); + input_data, + output_grad_data, + filter_data, + batch_size, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, + input_grad_data); } } else { if (data_layout != DataLayout::kNHWC) { - KernelDepthwiseConvInputGradCFilterNCHW( - input_data, output_grad_data, filter_data, batch_size, - 
output_channels, output_height, output_width, input_channels, - input_height, input_width, c_filter_multiplier, filter_height, - filter_width, c_stride, c_stride, padding_height, padding_width, - dilate_height, dilate_width, input_grad_data); + input_data, + output_grad_data, + filter_data, + batch_size, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + c_filter_multiplier, + filter_height, + filter_width, + c_stride, + c_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, + input_grad_data); } else { - KernelDepthwiseConvInputGradCFilterNHWC( - input_data, output_grad_data, filter_data, batch_size, - output_channels, output_height, output_width, input_channels, - input_height, input_width, c_filter_multiplier, filter_height, - filter_width, c_stride, c_stride, padding_height, padding_width, - dilate_height, dilate_width, input_grad_data); + input_data, + output_grad_data, + filter_data, + batch_size, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + c_filter_multiplier, + filter_height, + filter_width, + c_stride, + c_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, + input_grad_data); } } } @@ -648,13 +834,25 @@ __global__ void KernelDepthwiseConvInputGradSp( // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. template __device__ __inline__ void KernelDepthwiseConvFilterGradNCHW( - const T* output_grad_data, const T* input_data, const int num, - const int output_channels, const int output_height, const int output_width, - const int input_channels, const int input_height, const int input_width, - const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* filter_grad_data) { + const T* output_grad_data, + const T* input_data, + const int num, + const int output_channels, + const int output_height, + const int output_width, + const int input_channels, + const int input_height, + const int input_width, + const int filter_multiplier, + const int filter_height, + const int filter_width, + const int stride_height, + const int stride_width, + const int padding_height, + const int padding_width, + const int dilate_height, + const int dilate_width, + T* filter_grad_data) { T s = 0; int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x; @@ -697,13 +895,25 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNCHW( template __device__ __inline__ void KernelDepthwiseConvFilterGradNHWC( - const T* output_grad_data, const T* input_data, const int num, - const int output_channels, const int output_height, const int output_width, - const int input_channels, const int input_height, const int input_width, - const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* filter_grad_data) { + const T* output_grad_data, + const T* input_data, + const int num, + const int output_channels, + const int output_height, + const int output_width, + const int input_channels, + const int input_height, + const int input_width, + const int filter_multiplier, + const int filter_height, + const int filter_width, + const int stride_height, + const int stride_width, + const 
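
For reference, the input gradient computed by the KernelDepthwiseConvInputGrad* family can be written in plain host code as follows, under the same assumed (C_out, 1, K_h, K_w) filter layout as the earlier sketch; names are illustrative. Each input position gathers contributions from the `multiplier` output channels that read it.

#include <cstddef>
#include <vector>

// Host reference for the depthwise input gradient, NCHW layout.
void DepthwiseConvInputGradRef(const std::vector<float>& out_grad,
                               const std::vector<float>& flt,
                               std::vector<float>* in_grad, int N, int C_in,
                               int H, int W, int multiplier, int K_h, int K_w,
                               int OH, int OW, int stride_h, int stride_w,
                               int pad_h, int pad_w, int dil_h, int dil_w) {
  const int C_out = C_in * multiplier;
  in_grad->assign(static_cast<std::size_t>(N) * C_in * H * W, 0.f);
  for (int n = 0; n < N; ++n)
    for (int ic = 0; ic < C_in; ++ic)
      for (int ih = 0; ih < H; ++ih)
        for (int iw = 0; iw < W; ++iw) {
          float s = 0.f;
          for (int m = 0; m < multiplier; ++m) {
            const int oc = ic * multiplier + m;
            for (int kh = 0; kh < K_h; ++kh)
              for (int kw = 0; kw < K_w; ++kw) {
                // Invert ih = oh*stride - pad + kh*dilation; skip positions
                // that do not map back onto the output grid.
                const int h_off = ih + pad_h - kh * dil_h;
                const int w_off = iw + pad_w - kw * dil_w;
                if (h_off < 0 || w_off < 0 || h_off % stride_h || w_off % stride_w)
                  continue;
                const int oh = h_off / stride_h;
                const int ow = w_off / stride_w;
                if (oh >= OH || ow >= OW) continue;
                s += out_grad[((n * C_out + oc) * OH + oh) * OW + ow] *
                     flt[(oc * K_h + kh) * K_w + kw];
              }
          }
          (*in_grad)[((n * C_in + ic) * H + ih) * W + iw] = s;
        }
}
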
int padding_height, + const int padding_width, + const int dilate_height, + const int dilate_width, + T* filter_grad_data) { int bid = blockIdx.z; int image_h = blockIdx.y; int kernel_iw = blockIdx.x % filter_width; @@ -743,13 +953,25 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNHWC( template __device__ __inline__ void KernelDepthwiseConvFilterGradCFilterNHWC( - const T* output_grad_data, const T* input_data, const int num, - const int output_channels, const int output_height, const int output_width, - const int input_channels, const int input_height, const int input_width, - const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* filter_grad_data) { + const T* output_grad_data, + const T* input_data, + const int num, + const int output_channels, + const int output_height, + const int output_width, + const int input_channels, + const int input_height, + const int input_width, + const int filter_multiplier, + const int filter_height, + const int filter_width, + const int stride_height, + const int stride_width, + const int padding_height, + const int padding_width, + const int dilate_height, + const int dilate_width, + T* filter_grad_data) { const int bid = blockIdx.z; int image_h = blockIdx.x * dilate_height + blockIdx.y; if (image_h >= output_height) { @@ -804,16 +1026,31 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradCFilterNHWC( } } -template -__global__ void KernelDepthwiseConvFilterGradSp( - const T* output_grad_data, const T* input_data, const int num, - const int output_channels, const int output_height, const int output_width, - const int input_channels, const int input_height, const int input_width, - const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* filter_grad_data) { +template +__global__ void KernelDepthwiseConvFilterGradSp(const T* output_grad_data, + const T* input_data, + const int num, + const int output_channels, + const int output_height, + const int output_width, + const int input_channels, + const int input_height, + const int input_width, + const int filter_multiplier, + const int filter_height, + const int filter_width, + const int stride_height, + const int stride_width, + const int padding_height, + const int padding_width, + const int dilate_height, + const int dilate_width, + T* filter_grad_data) { int final_filter_multiplier = filter_multiplier; int h_stride = stride_height; int w_stride = stride_width; @@ -825,34 +1062,91 @@ __global__ void KernelDepthwiseConvFilterGradSp( if (c_filter_multiplier == 0 || c_filter == -1) { if (data_layout != DataLayout::kNHWC) { KernelDepthwiseConvFilterGradNCHW( - output_grad_data, input_data, num, output_channels, output_height, - output_width, input_channels, input_height, input_width, - final_filter_multiplier, filter_height, filter_width, h_stride, - w_stride, padding_height, padding_width, dilate_height, dilate_width, + output_grad_data, + input_data, + num, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, filter_grad_data); } else { 
KernelDepthwiseConvFilterGradNHWC( - output_grad_data, input_data, num, output_channels, output_height, - output_width, input_channels, input_height, input_width, - final_filter_multiplier, filter_height, filter_width, h_stride, - w_stride, padding_height, padding_width, dilate_height, dilate_width, + output_grad_data, + input_data, + num, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, filter_grad_data); } } else { if (data_layout != DataLayout::kNHWC) { KernelDepthwiseConvFilterGradNCHW( - output_grad_data, input_data, num, output_channels, output_height, - output_width, input_channels, input_height, input_width, - final_filter_multiplier, filter_height, filter_width, h_stride, - w_stride, padding_height, padding_width, dilate_height, dilate_width, + output_grad_data, + input_data, + num, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, filter_grad_data); } else { - KernelDepthwiseConvFilterGradCFilterNHWC( - output_grad_data, input_data, num, output_channels, output_height, - output_width, input_channels, input_height, input_width, - final_filter_multiplier, filter_height, filter_width, h_stride, - w_stride, padding_height, padding_width, dilate_height, dilate_width, + output_grad_data, + input_data, + num, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, filter_grad_data); } } @@ -864,15 +1158,15 @@ __global__ void KernelDepthwiseConvFilterGradSp( * height and width, respectively. 
*/ template -class DepthwiseConvFunctor { +class DepthwiseConvFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& input, const framework::Tensor& filter, const std::vector& strides, const std::vector& paddings, - const std::vector& dilations, framework::Tensor* output, + const std::vector& dilations, + framework::Tensor* output, const DataLayout data_layout = DataLayout::kNCHW) { const int batch_size = input.dims()[0]; const int input_channels = @@ -905,12 +1199,14 @@ class DepthwiseConvFunctor(context.GetPlace()); std::vector perm_axis({2, 3, 0, 1}); - phi::funcs::TransposeNormal trans; + phi::funcs::TransposeNormal trans; trans(context, filter, &filter_hwc, perm_axis); filter_data = filter_hwc.data(); } @@ -940,7 +1236,8 @@ class DepthwiseConvFunctor<<>>( \ - input_data, filter_data, batch_size, output_channels, output_height, \ - output_width, input_channels, input_height, input_width, \ - filter_multiplier, ksize_height, ksize_width, stride_height, \ - stride_width, padding_height, padding_width, dilate_height, \ - dilate_width, output_data); \ - } else { \ - KernelDepthwiseConvSp< \ - T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNHWC, \ - fuse_relu_before_conv><<>>( \ - input_data, filter_data, batch_size, output_channels, output_height, \ - output_width, input_channels, input_height, input_width, \ - filter_multiplier, ksize_height, ksize_width, stride_height, \ - stride_width, padding_height, padding_width, dilate_height, \ - dilate_width, output_data); \ - } \ - return; \ +#define check_case(c_filter_multiplier, c_stride, c_filter) \ + if (c_filter_multiplier == 0 || \ + filter_multiplier == c_filter_multiplier && \ + stride_height == stride_width && stride_height == c_stride && \ + (ksize_height == ksize_width && ksize_height == c_filter || \ + c_filter == -1)) { \ + if (c_filter == -1) { \ + threads.x = block_size; \ + grid.x = grid_size; \ + threads.y = threads.z = grid.y = grid.z = 1; \ + } \ + if (data_layout != DataLayout::kNHWC) { \ + KernelDepthwiseConvSp< \ + T, \ + c_filter_multiplier, \ + c_stride, \ + c_filter, \ + DataLayout::kNCHW, \ + fuse_relu_before_conv><<>>( \ + input_data, \ + filter_data, \ + batch_size, \ + output_channels, \ + output_height, \ + output_width, \ + input_channels, \ + input_height, \ + input_width, \ + filter_multiplier, \ + ksize_height, \ + ksize_width, \ + stride_height, \ + stride_width, \ + padding_height, \ + padding_width, \ + dilate_height, \ + dilate_width, \ + output_data); \ + } else { \ + KernelDepthwiseConvSp< \ + T, \ + c_filter_multiplier, \ + c_stride, \ + c_filter, \ + DataLayout::kNHWC, \ + fuse_relu_before_conv><<>>( \ + input_data, \ + filter_data, \ + batch_size, \ + output_channels, \ + output_height, \ + output_width, \ + input_channels, \ + input_height, \ + input_width, \ + filter_multiplier, \ + ksize_height, \ + ksize_width, \ + stride_height, \ + stride_width, \ + padding_height, \ + padding_width, \ + dilate_height, \ + dilate_width, \ + output_data); \ + } \ + return; \ } check_case(1, 1, 3); check_case(1, 1, 5); @@ -1004,10 +1337,9 @@ class DepthwiseConvFunctor -class DepthwiseConvInputGradFunctor { +class DepthwiseConvInputGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& input, const framework::Tensor& filter, const framework::Tensor& output_grad, @@ -1048,12 +1380,14 @@ class 
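
A small standalone illustration of the selection condition used by the check_case launch macros above: a compile-time specialization (c_filter_multiplier, c_stride, c_filter) is taken only when the runtime multiplier, the (equal) strides and the (square) kernel size match it; otherwise evaluation falls through to the fully generic (0, 0, -1) instantiation. Function and variable names below are illustrative.

#include <cstdio>

bool Matches(int c_mult, int c_stride, int c_filter,
             int mult, int stride_h, int stride_w, int k_h, int k_w) {
  return c_mult == 0 ||
         (mult == c_mult && stride_h == stride_w && stride_h == c_stride &&
          ((k_h == k_w && k_h == c_filter) || c_filter == -1));
}

int main() {
  // 3x3 kernel, stride 1, multiplier 1 -> matches the (1, 1, 3) specialization
  std::printf("%d\n", Matches(1, 1, 3, /*mult=*/1, 1, 1, /*k=*/3, 3));  // prints: 1
  // 7x7 kernel -> rejected by (1, 1, 3), accepted by the (1, 1, -1) fallback
  std::printf("%d\n", Matches(1, 1, 3, 1, 1, 1, 7, 7));                 // prints: 0
  std::printf("%d\n", Matches(1, 1, -1, 1, 1, 1, 7, 7));                // prints: 1
  return 0;
}
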
DepthwiseConvInputGradFunctor(context.GetPlace()); std::vector perm_axis({2, 3, 0, 1}); - phi::funcs::TransposeNormal trans; + phi::funcs::TransposeNormal trans; trans(context, filter, &filter_hwc, perm_axis); filter_data = filter_hwc.data(); } @@ -1078,7 +1412,8 @@ class DepthwiseConvInputGradFunctor<<>>( \ - input_data, output_grad_data, filter_data, batch_size, \ - output_channels, output_height, output_width, input_channels, \ - input_height, input_width, filter_multiplier, ksize_height, \ - ksize_width, stride_height, stride_width, padding_height, \ - padding_width, dilate_height, dilate_width, input_grad_data); \ + input_data, \ + output_grad_data, \ + filter_data, \ + batch_size, \ + output_channels, \ + output_height, \ + output_width, \ + input_channels, \ + input_height, \ + input_width, \ + filter_multiplier, \ + ksize_height, \ + ksize_width, \ + stride_height, \ + stride_width, \ + padding_height, \ + padding_width, \ + dilate_height, \ + dilate_width, \ + input_grad_data); \ } else { \ KernelDepthwiseConvInputGradSp< \ - T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNHWC, \ + T, \ + c_filter_multiplier, \ + c_stride, \ + c_filter, \ + DataLayout::kNHWC, \ fuse_relu_before_conv><<>>( \ - input_data, output_grad_data, filter_data, batch_size, \ - output_channels, output_height, output_width, input_channels, \ - input_height, input_width, filter_multiplier, ksize_height, \ - ksize_width, stride_height, stride_width, padding_height, \ - padding_width, dilate_height, dilate_width, input_grad_data); \ + input_data, \ + output_grad_data, \ + filter_data, \ + batch_size, \ + output_channels, \ + output_height, \ + output_width, \ + input_channels, \ + input_height, \ + input_width, \ + filter_multiplier, \ + ksize_height, \ + ksize_width, \ + stride_height, \ + stride_width, \ + padding_height, \ + padding_width, \ + dilate_height, \ + dilate_width, \ + input_grad_data); \ } \ return; \ } @@ -1129,10 +1502,11 @@ class DepthwiseConvInputGradFunctor -class DepthwiseConvFilterGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& input, const framework::Tensor& output_grad, const std::vector& strides, @@ -1187,7 +1561,8 @@ class DepthwiseConvFilterGradFunctor<<>>( \ - output_grad_data, input_data, batch_size, output_channels, \ - output_height, output_width, input_channels, input_height, \ - input_width, filter_multiplier, ksize_height, ksize_width, \ - stride_height, stride_width, padding_height, padding_width, \ - dilate_height, dilate_width, filter_grad_data); \ + output_grad_data, \ + input_data, \ + batch_size, \ + output_channels, \ + output_height, \ + output_width, \ + input_channels, \ + input_height, \ + input_width, \ + filter_multiplier, \ + ksize_height, \ + ksize_width, \ + stride_height, \ + stride_width, \ + padding_height, \ + padding_width, \ + dilate_height, \ + dilate_width, \ + filter_grad_data); \ } else { \ framework::Tensor filter_grad_hwc; \ if (c_filter != -1) { \ - framework::DDim filter_grad_hwc_dims( \ - {filter_grad->dims()[2], filter_grad->dims()[3], \ - filter_grad->dims()[0], filter_grad->dims()[1]}); \ + framework::DDim filter_grad_hwc_dims({filter_grad->dims()[2], \ + filter_grad->dims()[3], \ + filter_grad->dims()[0], \ + filter_grad->dims()[1]}); \ filter_grad_hwc.Resize(filter_grad_hwc_dims); \ filter_grad_hwc.mutable_data(context.GetPlace()); \ - phi::funcs::SetConstant set_zero; \ + phi::funcs::SetConstant set_zero; \ 
set_zero(context, &filter_grad_hwc, static_cast(0)); \ filter_grad_data = filter_grad_hwc.data(); \ } else { \ @@ -1231,16 +1625,34 @@ class DepthwiseConvFilterGradFunctor<<>>( \ - output_grad_data, input_data, batch_size, output_channels, \ - output_height, output_width, input_channels, input_height, \ - input_width, filter_multiplier, ksize_height, ksize_width, \ - stride_height, stride_width, padding_height, padding_width, \ - dilate_height, dilate_width, filter_grad_data); \ + output_grad_data, \ + input_data, \ + batch_size, \ + output_channels, \ + output_height, \ + output_width, \ + input_channels, \ + input_height, \ + input_width, \ + filter_multiplier, \ + ksize_height, \ + ksize_width, \ + stride_height, \ + stride_width, \ + padding_height, \ + padding_width, \ + dilate_height, \ + dilate_width, \ + filter_grad_data); \ if (c_filter != -1) { \ std::vector perm_axis({2, 3, 0, 1}); \ - phi::funcs::TransposeNormal trans; \ + phi::funcs::TransposeNormal trans; \ trans(context, filter_grad_hwc, filter_grad, perm_axis); \ } \ } \ @@ -1263,31 +1675,23 @@ class DepthwiseConvFilterGradFunctor; -template class DepthwiseConvFunctor; +template class DepthwiseConvFunctor; +template class DepthwiseConvFunctor; -template class DepthwiseConvInputGradFunctor; -template class DepthwiseConvInputGradFunctor; +template class DepthwiseConvInputGradFunctor; +template class DepthwiseConvInputGradFunctor; -template class DepthwiseConvFilterGradFunctor; -template class DepthwiseConvFilterGradFunctor; +template class DepthwiseConvFilterGradFunctor; +template class DepthwiseConvFilterGradFunctor; -template class DepthwiseConvFunctor; -template class DepthwiseConvFunctor; +template class DepthwiseConvFunctor; +template class DepthwiseConvFunctor; -template class DepthwiseConvInputGradFunctor; -template class DepthwiseConvInputGradFunctor; +template class DepthwiseConvInputGradFunctor; +template class DepthwiseConvInputGradFunctor; -template class DepthwiseConvFilterGradFunctor; -template class DepthwiseConvFilterGradFunctor; +template class DepthwiseConvFilterGradFunctor; +template class DepthwiseConvFilterGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu new file mode 100644 index 00000000000..4f27b6fde99 --- /dev/null +++ b/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu @@ -0,0 +1,142 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/gpu/depthwise_conv.h" + +namespace phi { + +template +void DepthwiseConvGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides_t, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + bool fuse_relu, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + const DenseTensor* output_grad = &out_grad; + + if (!input_grad && !filter_grad) return; + + std::vector strides = strides_t; + std::vector paddings = paddings_t; + std::vector dilations = dilations_t; + + // update padding and dilation + auto in_dims = input.dims(); + auto filter_dims = filter.dims(); + + DDim in_data_dims; + const paddle::framework::DataLayout data_layout = + paddle::framework::StringToDataLayout(data_format); + if (data_layout != paddle::framework::DataLayout::kNHWC) { + in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); + } else { + in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1); + } + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + bool is_sys_pad = strides.size() * 2 == paddings.size() ? false : true; + if (!is_sys_pad) { + for (size_t i = 0; i < strides.size(); ++i) { + paddings.erase(paddings.begin() + i + 1); + } + } + phi::funcs::SetConstant set_zero; + + if (input_grad) { + input_grad->mutable_data(dev_ctx.GetPlace()); + set_zero(dev_ctx, input_grad, static_cast(0)); + + if (fuse_relu) { + paddle::operators::math::DepthwiseConvInputGradFunctor + depthwiseConvInputGrad; + depthwiseConvInputGrad(dev_ctx, + input, + filter, + *output_grad, + strides, + paddings, + dilations, + input_grad, + data_layout); + } else { + paddle::operators::math::DepthwiseConvInputGradFunctor + depthwiseConvInputGrad; + depthwiseConvInputGrad(dev_ctx, + input, + filter, + *output_grad, + strides, + paddings, + dilations, + input_grad, + data_layout); + } + } + + if (filter_grad) { + filter_grad->mutable_data(dev_ctx.GetPlace()); + set_zero(dev_ctx, filter_grad, static_cast(0)); + if (fuse_relu) { + paddle::operators::math::DepthwiseConvFilterGradFunctor + depthwiseConvFilterGrad; + depthwiseConvFilterGrad(dev_ctx, + input, + *output_grad, + strides, + paddings, + dilations, + filter_grad, + data_layout); + } else { + paddle::operators::math::DepthwiseConvFilterGradFunctor + depthwiseConvFilterGrad; + depthwiseConvFilterGrad(dev_ctx, + input, + *output_grad, + strides, + paddings, + dilations, + filter_grad, + data_layout); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(depthwise_conv2d_grad, + GPU, + ALL_LAYOUT, + phi::DepthwiseConvGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu new file mode 100644 index 00000000000..c50ceae33fc --- /dev/null +++ b/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu @@ -0,0 +1,130 @@ +// Copyright (c) 2022 PaddlePaddle 
Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/fluid/operators/conv_op.h" + +#include "paddle/phi/kernels/gpu/depthwise_conv.h" + +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" + +namespace phi { + +template +void DepthwiseConvKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides_t, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + bool fuse_relu, + DenseTensor* out) { + DenseTensor* output = out; + output->mutable_data(dev_ctx.GetPlace()); + + const std::vector strides = strides_t; + std::vector dilations = dilations_t; + std::vector paddings = paddings_t; + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + if (channel_last) { + PADDLE_ENFORCE_EQ( + output->dims()[output->dims().size() - 1] % + input.dims()[input.dims().size() - 1], + 0, + phi::errors::InvalidArgument( + "ShapeError: The output channels must be a multiple of the " + "input channels. But receivced output channel number is %d " + "and input channel number is %d", + output->dims()[output->dims().size() - 1], + input.dims()[input.dims().size() - 1])); + } else { + PADDLE_ENFORCE_EQ( + output->dims()[1] % input.dims()[1], + 0, + phi::errors::InvalidArgument( + "ShapeError: The output channels must be a multiple of the " + "input channels. But receivced output channel number is %d " + "and input channel number is %d", + output->dims()[1], + input.dims()[1])); + } + + // update padding and dilation + auto in_dims = input.dims(); + auto filter_dims = filter.dims(); + + DDim in_data_dims; + const paddle::framework::DataLayout data_layout = + paddle::framework::StringToDataLayout(data_format); + if (data_layout != paddle::framework::DataLayout::kNHWC) { + in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); + } else { + in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1); + } + + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + bool is_sys_pad = strides.size() * 2 == paddings.size() ? 
false : true; + if (!is_sys_pad) { + for (size_t i = 0; i < strides.size(); ++i) { + paddings.erase(paddings.begin() + i + 1); + } + } + + if (fuse_relu) { + paddle::operators::math::DepthwiseConvFunctor + depthwiseConv; + depthwiseConv(dev_ctx, + input, + filter, + strides, + paddings, + dilations, + output, + data_layout); + } else { + paddle::operators::math::DepthwiseConvFunctor + depthwiseConv; + depthwiseConv(dev_ctx, + input, + filter, + strides, + paddings, + dilations, + output, + data_layout); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(depthwise_conv2d, + GPU, + ALL_LAYOUT, + phi::DepthwiseConvKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel_gpudnn.cu new file mode 100644 index 00000000000..b4a6fe337c8 --- /dev/null +++ b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel_gpudnn.cu @@ -0,0 +1,834 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/conv_grad_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/fluid/framework/eigen.h" +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/operators/conv_miopen_helper.h" +#else +#include "paddle/fluid/operators/conv_cudnn_helper.h" +#endif + +#include "paddle/fluid/platform/cudnn_workspace_helper.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/phi/kernels/funcs/padding.h" + +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" + +#include "paddle/phi/kernels/impl/conv_cudnn_impl.h" + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void ConvCudnnGradGradKernel( + const Context& ctx, + paddle::optional input_grad_grad, + paddle::optional filter_grad_grad, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search_t, + DenseTensor* out_grad_grad, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + auto X = &input; + auto W = &filter; + auto dO = &out_grad; + auto ddX = input_grad_grad.get_ptr(); + auto ddW = filter_grad_grad.get_ptr(); + + auto ddO = out_grad_grad; + auto dW = filter_grad; + auto dX = input_grad; + if (ddO) { + ddO->mutable_data(ctx.GetPlace()); + phi::funcs::SetConstant set_zero; + set_zero(ctx, ddO, static_cast(0)); + } + if (dW) { + dW->mutable_data(ctx.GetPlace()); + } + if (dX) { + dX->mutable_data(ctx.GetPlace()); + } + + // const T* x = X->data(); + const T* dy = dO->data(); + 
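A short orientation note for the double-backward kernel below: the four ConvArgs objects configured further down implement the usual second-order convolution identities. This only restates what the existing code computes, with conv_fwd / conv_bwd_filter / conv_bwd_data as informal shorthand for the cuDNN forward, backward-filter and backward-data primitives:

ddO = conv_fwd(ddX, W) + conv_fwd(X, ddW)   // args1 and args2; the second forward call accumulates into ddO
dW  = conv_bwd_filter(ddX, dO)              // args3: ddX plays the role of the input
dX  = conv_bwd_data(dO, ddW)                // args4: ddW plays the role of the filter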
const T* w = W->data(); + + const T* ddx = nullptr; + const T* ddw = nullptr; + T *dw, *dx, *ddy; + dw = dx = ddy = nullptr; + T* transformed_dx = nullptr; + std::vector dilations = dilations_t; + + bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t; + bool deterministic = FLAGS_cudnn_deterministic; + auto exhaustive_deterministic = exhaustive_search && deterministic; + PADDLE_ENFORCE_EQ(exhaustive_deterministic, + false, + phi::errors::InvalidArgument( + "Cann't set exhaustive_search True and " + "FLAGS_cudnn_deterministic True at same time.")); + + std::vector paddings = paddings_t; + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + // transform Tensors to channel first----------- + DenseTensor transformed_X_channel(X->type()); + DenseTensor transformed_dO_channel(dO->type()); + DenseTensor transformed_ddX_channel(X->type()); + + DenseTensor transformed_ddO_channel(dO->type()); + DenseTensor transformed_dX_channel(X->type()); + + if (channel_last) { + ResizeToChannelFirst(ctx, X, &transformed_X_channel); + TransToChannelFirst(ctx, X, &transformed_X_channel); + + ResizeToChannelFirst(ctx, dO, &transformed_dO_channel); + TransToChannelFirst(ctx, dO, &transformed_dO_channel); + + if (ddX) { + ResizeToChannelFirst(ctx, ddX, &transformed_ddX_channel); + TransToChannelFirst(ctx, ddX, &transformed_ddX_channel); + } + + if (ddO) { + ResizeToChannelFirst(ctx, ddO, &transformed_ddO_channel); + } + if (dX) { + ResizeToChannelFirst(ctx, dX, &transformed_dX_channel); + transformed_dX_channel.mutable_data(ctx.GetPlace()); + } + + } else { + transformed_X_channel = *X; + transformed_dO_channel = *dO; + if (ddX) { + transformed_ddX_channel = *ddX; + } + if (ddO) { + transformed_ddO_channel.ShareDataWith(*ddO); + } + if (dX) { + transformed_dX_channel.ShareDataWith(*dX); + } + } + + auto in_dims = transformed_X_channel.dims(); + auto filter_dims = W->dims(); + DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + int data_dim = strides.size(); // 2d or 3d + bool is_sys_pad = funcs::IsSymmetricPadding(paddings, data_dim); + DenseTensor transformed_X(X->type()); + DenseTensor transformed_ddX(X->type()); + + DenseTensor transformed_dX(X->type()); + + std::vector padding_common(data_dim, 0); + std::vector input_pad(X->dims().size() * 2, 0); + + if (!is_sys_pad) { + // get pad + std::vector padding_diff(data_dim); + std::vector new_input_shape_vec(data_dim + 2); + new_input_shape_vec[0] = transformed_X_channel.dims()[0]; + new_input_shape_vec[1] = transformed_X_channel.dims()[1]; + + for (size_t i = 0; i < data_dim; ++i) { + padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); + padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); + new_input_shape_vec[i + 2] = + transformed_X_channel.dims()[i + 2] + padding_diff[i]; + input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; + input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; + } + DDim new_input_shape(make_ddim(new_input_shape_vec)); + transformed_X.Resize(new_input_shape); + transformed_ddX.Resize(new_input_shape); + transformed_dX.Resize(new_input_shape); + + transformed_X.mutable_data(ctx.GetPlace()); + + if (ddX) { + transformed_ddX.mutable_data(ctx.GetPlace()); + } + if (dX) { + 
transformed_dX.mutable_data(ctx.GetPlace()); + } + + // pad for input + const int rank = X->dims().size(); + T pad_value(0.0); + switch (rank) { + case 4: { + funcs::PadFunction( + ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); + if (ddX) { + funcs::PadFunction(ctx, + input_pad, + transformed_ddX_channel, + pad_value, + &transformed_ddX); + } + } break; + case 5: { + funcs::PadFunction( + ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); + if (ddX) { + funcs::PadFunction(ctx, + input_pad, + transformed_ddX_channel, + pad_value, + &transformed_ddX); + } + } break; + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "ConvOp only support tensors with 4 or 5 dimensions.")); + } + + } else { + transformed_X.ShareDataWith(transformed_X_channel); + if (ddX) { + transformed_ddX.ShareDataWith(transformed_ddX_channel); + } + if (dX) { + transformed_dX.ShareDataWith(transformed_dX_channel); + } + + if (paddings.size() == data_dim) { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings[i]; + } + } else { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings[2 * i]; + } + } + } + + const T* x = transformed_X.data(); + + int iwo_group = groups; + int c_group = 1; +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) + iwo_group = 1; + c_group = groups; + groups = 1; +#endif + auto dtype = paddle::platform::CudnnDataType::type; + + auto handle = ctx.cudnn_handle(); + + paddle::operators::ConvArgs args1{&transformed_ddX, + W, + &transformed_ddO_channel, + strides, + padding_common, + dilations, + dtype}; + paddle::operators::ConvArgs args2{&transformed_X, + ddW, + &transformed_ddO_channel, + strides, + padding_common, + dilations, + dtype}; + paddle::operators::ConvArgs args3{&transformed_ddX, + dW, + &transformed_dO_channel, + strides, + padding_common, + dilations, + dtype}; + paddle::operators::ConvArgs args4{&transformed_dX, + ddW, + &transformed_dO_channel, + strides, + padding_common, + dilations, + dtype}; + +#ifdef PADDLE_WITH_HIP + miopenConvFwdAlgorithm_t fwd_algo1 = static_cast(0); + miopenConvFwdAlgorithm_t fwd_algo2 = static_cast(0); + miopenConvBwdDataAlgorithm_t data_algo = + static_cast(0); + miopenConvBwdWeightsAlgorithm_t filter_algo = + static_cast(0); +#else + cudnnConvolutionFwdAlgo_t fwd_algo1 = + static_cast(0); + cudnnConvolutionFwdAlgo_t fwd_algo2 = + static_cast(0); + cudnnConvolutionBwdDataAlgo_t data_algo = + static_cast(0); + cudnnConvolutionBwdFilterAlgo_t filter_algo = + static_cast(0); +#endif + + auto layout = paddle::platform::GetCudnnTensorFormat( + paddle::platform::DataLayout::kNCHW); + + // ddo = conv(ddI, W) + conv(I, ddW) + size_t workspace_size = 0; + + T* transformed_ddy_channel = nullptr; + if (ddO) { + ddy = ddO->data(); + transformed_ddy_channel = transformed_ddO_channel.data(); + if (ddX) { + args1.handle = handle; + args1.idesc.set(transformed_ddX, iwo_group); + args1.wdesc.set(*W, layout, iwo_group); + args1.odesc.set(transformed_ddO_channel, iwo_group); + args1.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + c_group); + +#ifdef PADDLE_WITH_HIP + using search1 = + paddle::operators::SearchAlgorithm; + workspace_size = search1::GetWorkspaceSize(args1); + fwd_algo1 = search1::Find( + args1, exhaustive_search, false, workspace_size, ctx); +#else + using search1 = + paddle::operators::SearchAlgorithm; + fwd_algo1 = search1::Find(args1, exhaustive_search, false, ctx); + workspace_size = search1::GetWorkspaceSize(args1, 
fwd_algo1); +#endif + } + + if (ddW) { + ddw = ddW->data(); + args2.handle = handle; + args2.idesc.set(transformed_X, iwo_group); + args2.wdesc.set(*ddW, layout, iwo_group); + args2.odesc.set(transformed_ddO_channel, iwo_group); + args2.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + c_group); + +#ifdef PADDLE_WITH_HIP + using search2 = + paddle::operators::SearchAlgorithm; + workspace_size = + std::max(workspace_size, search2::GetWorkspaceSize(args2)); + fwd_algo2 = search2::Find( + args2, exhaustive_search, false, workspace_size, ctx); +#else + using search2 = + paddle::operators::SearchAlgorithm; + fwd_algo2 = search2::Find(args2, exhaustive_search, false, ctx); + workspace_size = + std::max(workspace_size, search2::GetWorkspaceSize(args2, fwd_algo2)); +#endif + } + } + + if (dW && ddX) { + dw = dW->data(); + args3.handle = handle; + args3.idesc.set(transformed_ddX, iwo_group); + args3.wdesc.set(*dW, layout, iwo_group); + args3.odesc.set(transformed_dO_channel, iwo_group); + args3.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + c_group); + +#ifdef PADDLE_WITH_HIP + using search3 = + paddle::operators::SearchAlgorithm; + workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); + filter_algo = search3::Find( + args3, exhaustive_search, deterministic, workspace_size, ctx); +#else + using search3 = + paddle::operators::SearchAlgorithm; + filter_algo = + search3::Find(args3, exhaustive_search, deterministic, ctx); + workspace_size = + std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo)); +#endif + } + + if (ddW && dX) { + transformed_dx = transformed_dX.data(); + + args4.handle = handle; + args4.idesc.set(transformed_dX, iwo_group); + args4.wdesc.set(*ddW, layout, iwo_group); + args4.odesc.set(transformed_dO_channel, iwo_group); + args4.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + c_group); + +#ifdef PADDLE_WITH_HIP + using search4 = + paddle::operators::SearchAlgorithm; + workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); + data_algo = search4::Find( + args4, exhaustive_search, deterministic, workspace_size, ctx); +#else + using search4 = + paddle::operators::SearchAlgorithm; + data_algo = search4::Find(args4, exhaustive_search, deterministic, ctx); + workspace_size = + std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); +#endif + } + + int i_n, i_c, i_d, i_h, i_w; + GetNCDHW( + transformed_X.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h, &i_w); + + int o_n, o_c, o_d, o_h, o_w; + GetNCDHW(transformed_dO_channel.dims(), + DataLayout::kNCHW, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); + + int group_offset_in = i_c / groups * i_h * i_w * i_d; + int group_offset_out = o_c / groups * o_h * o_w * o_d; + int group_offset_filter = W->numel() / groups; + + paddle::operators::ScalingParamType alpha = 1.0f; + paddle::operators::ScalingParamType beta = 0.0f; + + // NOTE(zhiqiu): inplace addto is not supportted in double grad yet. + // ScalingParamType beta = ctx.Attr("use_addto") ? 
1.0f : + // 0.0f; + // VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr("use_addto"); + auto wkspace_handle = ctx.cudnn_workspace_handle(); + + if (ddO) { + if (ddX) { + ddx = transformed_ddX.data(); +#ifdef PADDLE_WITH_HIP + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionForward( + handle, + &alpha, + args1.idesc.desc(), + ddx, + args1.wdesc.desc(), + w, + args1.cdesc.desc(), + fwd_algo1, + &beta, + args1.odesc.desc(), + transformed_ddy_channel, + workspace_ptr, + workspace_size)); + }, + workspace_size); +#else + for (int i = 0; i < groups; i++) { + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnConvolutionForward( + handle, + &alpha, + args1.idesc.desc(), + ddx + i * group_offset_in, + args1.wdesc.desc(), + w + i * group_offset_filter, + args1.cdesc.desc(), + fwd_algo1, + workspace_ptr, + workspace_size, + &beta, + args1.odesc.desc(), + transformed_ddy_channel + i * group_offset_out)); + }, + workspace_size); + } +#endif + } + if (ddW) { +#ifdef PADDLE_WITH_HIP + // MIOPEN ONLY support beta to be 0.0f + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionForward( + handle, + &alpha, + args2.idesc.desc(), + x, + args2.wdesc.desc(), + ddw, + args2.cdesc.desc(), + fwd_algo2, + &beta, + args2.odesc.desc(), + transformed_ddy_channel, + workspace_ptr, + workspace_size)); + }, + workspace_size); +#else + for (int i = 0; i < groups; i++) { + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnConvolutionForward( + handle, + &alpha, + args2.idesc.desc(), + x + i * group_offset_in, + args2.wdesc.desc(), + ddw + i * group_offset_filter, + args2.cdesc.desc(), + fwd_algo2, + workspace_ptr, + workspace_size, + &alpha, + args2.odesc.desc(), + transformed_ddy_channel + i * group_offset_out)); + }, + workspace_size); + } +#endif + } + if (channel_last) { + TransToChannelLast(ctx, &transformed_ddO_channel, ddO); + } + } + T* transformed_dy_channel = transformed_dO_channel.data(); + if (dW && ddX) { + ddx = transformed_ddX.data(); +#ifdef PADDLE_WITH_HIP + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionBackwardWeights( + handle, + &alpha, + args3.odesc.desc(), + transformed_dy_channel, + args3.idesc.desc(), + ddx, + args3.cdesc.desc(), + filter_algo, + &beta, + args3.wdesc.desc(), + dw, + workspace_ptr, + workspace_size)); + }, + workspace_size); +#else + for (int i = 0; i < groups; i++) { + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnConvolutionBackwardFilter( + handle, + &alpha, + args3.idesc.desc(), + ddx + i * group_offset_in, + args3.odesc.desc(), + transformed_dy_channel + i * group_offset_out, + args3.cdesc.desc(), + filter_algo, + workspace_ptr, + workspace_size, + &beta, + args3.wdesc.desc(), + dw + i * group_offset_filter)); + }, + workspace_size); + } +#endif + } + + if (dX && ddW) { + ddw = ddW->data(); +#ifdef PADDLE_WITH_HIP + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionBackwardData( + handle, + &alpha, + args4.odesc.desc(), + transformed_dy_channel, + args4.wdesc.desc(), + ddw, + args4.cdesc.desc(), + data_algo, + &beta, + args4.idesc.desc(), + transformed_dx, + 
workspace_ptr, + workspace_size)); + }, + workspace_size); +#else + for (int i = 0; i < groups; i++) { + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnConvolutionBackwardData( + handle, + &alpha, + args4.wdesc.desc(), + ddw + i * group_offset_filter, + args4.odesc.desc(), + transformed_dy_channel + i * group_offset_out, + args4.cdesc.desc(), + data_algo, + workspace_ptr, + workspace_size, + &beta, + args4.idesc.desc(), + transformed_dx + i * group_offset_in)); + }, + workspace_size); + } +#endif + + if (!is_sys_pad) { + // reverse padded input + std::vector starts(X->dims().size(), 0); + std::vector axes(X->dims().size(), 0); + + for (size_t i = 0; i < X->dims().size(); ++i) { + starts[i] = input_pad[2 * i]; + axes[i] = i; + } + if (X->dims().size() == 4) { + paddle::operators::RemovePaddingSlice( + ctx, &transformed_dX, &transformed_dX_channel, starts, axes); + } else { + paddle::operators::RemovePaddingSlice( + ctx, &transformed_dX, &transformed_dX_channel, starts, axes); + } + } + if (channel_last) { + TransToChannelLast(ctx, &transformed_dX_channel, dX); + } + } +} + +template +void DepthwiseConvCudnnGradGradKernel( + const Context& ctx, + paddle::optional input_grad_grad, + paddle::optional filter_grad_grad, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search_t, + bool fuse_relu, + DenseTensor* out_grad_grad, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + ConvCudnnGradGradKernel(ctx, + input_grad_grad, + filter_grad_grad, + out_grad, + input, + filter, + strides, + paddings_t, + padding_algorithm, + groups, + dilations_t, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search_t, + out_grad_grad, + input_grad, + filter_grad); +} + +template +void Conv3DCudnnGradGradKernel( + const Context& ctx, + paddle::optional input_grad_grad, + paddle::optional filter_grad_grad, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search_t, + DenseTensor* out_grad_grad, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + ConvCudnnGradGradKernel(ctx, + input_grad_grad, + filter_grad_grad, + out_grad, + input, + filter, + strides, + paddings_t, + padding_algorithm, + groups, + dilations_t, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search_t, + out_grad_grad, + input_grad, + filter_grad); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(conv2d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnGradGradKernel, + float, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(conv3d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnGradGradKernel, + float, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, + GPU, + ALL_LAYOUT, + phi::DepthwiseConvCudnnGradGradKernel, + float, + phi::dtype::float16) {} +#else +#if CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(conv2d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnGradGradKernel, + float, + double, + phi::dtype::float16, + 
phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(conv3d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnGradGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, + GPU, + ALL_LAYOUT, + phi::DepthwiseConvCudnnGradGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} + +#else + +PD_REGISTER_KERNEL(conv2d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnGradGradKernel, + float, + double, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(conv3d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnGradGradKernel, + float, + double, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, + GPU, + ALL_LAYOUT, + phi::DepthwiseConvCudnnGradGradKernel, + float, + double, + phi::dtype::float16) {} + +#endif + +#endif diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel_gpudnn.cu new file mode 100644 index 00000000000..64148e902fd --- /dev/null +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel_gpudnn.cu @@ -0,0 +1,683 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/conv_grad_kernel.h" + +#include "paddle/phi/core/dense_tensor.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/fluid/framework/eigen.h" +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/operators/conv_miopen_helper.h" +#else +#include "paddle/fluid/operators/conv_cudnn_helper.h" +#endif + +#include "paddle/fluid/platform/cudnn_workspace_helper.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/phi/kernels/funcs/padding.h" + +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" + +#include "paddle/phi/kernels/impl/conv_cudnn_impl.h" + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" + +namespace phi { + +template +void ConvCudnnGradKernel(const Context& ctx, + const DenseTensor& output_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides_t, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search_t, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + if (input_grad) { + input_grad->mutable_data(ctx.GetPlace()); + } + if (filter_grad) { + filter_grad->mutable_data(ctx.GetPlace()); + } + + std::vector dilations = dilations_t; + std::vector strides = strides_t; + std::vector paddings = paddings_t; + + bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t; + bool deterministic = FLAGS_cudnn_deterministic; + auto exhaustive_deterministic = exhaustive_search && deterministic; + PADDLE_ENFORCE_EQ(exhaustive_deterministic, + 
false, + phi::errors::InvalidArgument( + "Cann't set exhaustive_search True and " + "FLAGS_cudnn_deterministic True at same time.")); + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + auto dtype = paddle::platform::CudnnDataType::type; + +#ifdef PADDLE_WITH_HIP + // HIP MIOPEN ONLY SUPPORT NCHW format + auto compute_format = paddle::platform::DataLayout::kNCHW; +#else + const bool compute_in_nhwc = dtype == CUDNN_DATA_HALF && IsVoltaOrLater(ctx); + auto compute_format = compute_in_nhwc && channel_last + ? paddle::platform::DataLayout::kNHWC + : paddle::platform::DataLayout::kNCHW; +#endif + VLOG(3) << "Compute ConvGradOp with cuDNN:" + << " data_format=" << data_format << " compute_format=" + << (compute_format == paddle::platform::DataLayout::kNHWC ? "NHWC" + : "NCHW"); + + // transform Tensor + DenseTensor transformed_input_channel(input.type()); + DenseTensor transformed_output_grad_channel(output_grad.type()); + DenseTensor transformed_input_grad_channel(input.type()); + DenseTensor transformed_filter_channel(filter.type()); + DenseTensor transformed_filter_grad_channel(filter.type()); + + if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) { + VLOG(3) << "Transform input, output_grad, input_grad and tensor from " + "NHWC to NCHW."; + ResizeToChannelFirst(ctx, &input, &transformed_input_channel); + TransToChannelFirst(ctx, &input, &transformed_input_channel); + + ResizeToChannelFirst( + ctx, &output_grad, &transformed_output_grad_channel); + TransToChannelFirst( + ctx, &output_grad, &transformed_output_grad_channel); + + if (input_grad) { + ResizeToChannelFirst( + ctx, input_grad, &transformed_input_grad_channel); + // NOTE(zhiqiu): If inplace_addto strategy is enabled, we need to copy + // the data of input_grad to transformed_input_grad_channel. + if (use_addto) { + TransToChannelFirst( + ctx, input_grad, &transformed_input_grad_channel); + } + } + } else { + transformed_input_channel.ShareDataWith(input); + transformed_output_grad_channel.ShareDataWith(output_grad); + if (input_grad) { + transformed_input_grad_channel.ShareDataWith(*input_grad); + } + } + + if (compute_format == paddle::platform::DataLayout::kNHWC) { + VLOG(3) << "Transform filter and filter_grad tensor from NCHW to NHWC."; + ResizeToChannelLast(ctx, &filter, &transformed_filter_channel); + TransToChannelLast(ctx, &filter, &transformed_filter_channel); + + if (filter_grad) { + ResizeToChannelLast( + ctx, filter_grad, &transformed_filter_grad_channel); + } + } else { + transformed_filter_channel.ShareDataWith(filter); + if (filter_grad) { + transformed_filter_grad_channel.ShareDataWith(*filter_grad); + } + } + + // update paddings + auto in_dims = transformed_input_channel.dims(); + auto filter_dims = transformed_filter_channel.dims(); + DDim in_data_dims; + DDim filter_data_dims; + if (compute_format == paddle::platform::DataLayout::kNCHW) { + in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); + filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + } else { + in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1); + filter_data_dims = slice_ddim(filter_dims, 1, filter_dims.size() - 1); + } + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + // cuDNN only supports padding the same amount on every dimension. + // So we create a new padded input tensor. 
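The comment above is the constraint behind the transformed_input / input_pad machinery that follows: an asymmetric padding pair is split into a symmetric part handed to cuDNN plus an explicit pre-pad of the difference. A minimal sketch of that split for one spatial dimension (the struct and function names are illustrative, not part of the patch):

#include <algorithm>

struct PadSplit {
  int common;      // symmetric padding passed to the conv descriptor
  int pad_before;  // extra elements padded in front of the data
  int pad_after;   // extra elements padded behind the data
};

PadSplit SplitPadding(int before, int after) {
  PadSplit s;
  s.common = std::min(before, after);  // padding_common[i] in the kernel
  s.pad_before = before - s.common;    // the corresponding input_pad entries
  s.pad_after = after - s.common;
  return s;
}
// Example: paddings {1, 3} become common = 1 plus an explicit pad of {0, 2};
// the enlarged input with symmetric padding 1 reproduces the original geometry.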
+ int data_dim = strides.size(); // 2d or 3d + bool is_sys_pad = funcs::IsSymmetricPadding(paddings, data_dim); + Tensor transformed_input(input.type()); + Tensor transformed_input_grad(input.type()); + std::vector padding_common(data_dim, 0); + std::vector input_pad(transformed_input_channel.dims().size() * 2, 0); + + if (!is_sys_pad) { + // get pad + std::vector padding_diff(data_dim); + std::vector new_input_shape_vec(data_dim + 2); + new_input_shape_vec[0] = transformed_input_channel.dims()[0]; + if (compute_format == paddle::platform::DataLayout::kNCHW) { + new_input_shape_vec[1] = transformed_input_channel.dims()[1]; + } else { + new_input_shape_vec[data_dim + 1] = + transformed_input_channel.dims()[data_dim + 1]; + } + + for (size_t i = 0; i < data_dim; ++i) { + padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); + padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); + if (compute_format == paddle::platform::DataLayout::kNCHW) { + new_input_shape_vec[i + 2] = + transformed_input_channel.dims()[i + 2] + padding_diff[i]; + } else { + new_input_shape_vec[i + 1] = + transformed_input_channel.dims()[i + 1] + padding_diff[i]; + } + if (compute_format == paddle::platform::DataLayout::kNCHW) { + input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; + input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; + } else { + input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i]; + input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i]; + } + } + DDim new_input_shape(make_ddim(new_input_shape_vec)); + transformed_input.Resize(new_input_shape); + transformed_input.mutable_data(ctx.GetPlace()); + + transformed_input_grad.Resize(new_input_shape); + + if (input_grad) { + transformed_input_grad.mutable_data(ctx.GetPlace()); + } + // pad for input + const int rank = transformed_input_channel.dims().size(); + T pad_value(0.0); + switch (rank) { + case 4: { + funcs::PadFunction(ctx, + input_pad, + transformed_input_channel, + pad_value, + &transformed_input); + } break; + case 5: { + funcs::PadFunction(ctx, + input_pad, + transformed_input_channel, + pad_value, + &transformed_input); + } break; + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "ConvOp only support tensors with 4 or 5 dimensions.")); + } + } else { + transformed_input.ShareDataWith(transformed_input_channel); + if (input_grad) { + transformed_input_grad.ShareDataWith(transformed_input_grad_channel); + } + if (paddings.size() == data_dim) { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings[i]; + } + } else { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings[2 * i]; + } + } + } + + const T* input_data = transformed_input.data(); + const T* output_grad_data = transformed_output_grad_channel.data(); + const T* filter_data = transformed_filter_channel.data(); + T* filter_grad_data = nullptr; + T* input_grad_data = nullptr; + T* transformed_input_grad_data = nullptr; + + paddle::operators::ConvArgs args1{&transformed_input_grad, + &transformed_filter_channel, + &transformed_output_grad_channel, + strides, + padding_common, + dilations, + dtype}; + paddle::operators::ConvArgs args2{&transformed_input, + &transformed_filter_grad_channel, + &transformed_output_grad_channel, + strides, + padding_common, + dilations, + dtype}; + + auto handle = ctx.cudnn_handle(); + // TODO(phlrain): replace paddle::platform::DataLaytout to phi::DataLayout + paddle::platform::DataLayout layout = + compute_format == paddle::platform::DataLayout::kNHWC 
+ ? paddle::platform::DataLayout::kNHWC + : paddle::platform::DataLayout::kNCHW; + if (transformed_input.dims().size() == 5) { + layout = compute_format == paddle::platform::DataLayout::kNHWC + ? paddle::platform::DataLayout::kNDHWC + : paddle::platform::DataLayout::kNCDHW; + } + auto layout_tensor = paddle::platform::GetCudnnTensorFormat(layout); + auto workspace_handle = ctx.cudnn_workspace_handle(); + + int i_n, i_c, i_d, i_h, i_w; + int o_n, o_c, o_d, o_h, o_w; + if (compute_format == paddle::platform::DataLayout::kNHWC) { + paddle::operators::GetNCDHW(transformed_input.dims(), + paddle::platform::DataLayout::kNHWC, + &i_n, + &i_c, + &i_d, + &i_h, + &i_w); + paddle::operators::GetNCDHW(transformed_output_grad_channel.dims(), + paddle::platform::DataLayout::kNHWC, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); + } else { + paddle::operators::GetNCDHW(transformed_input.dims(), + paddle::platform::DataLayout::kNCHW, + &i_n, + &i_c, + &i_d, + &i_h, + &i_w); + paddle::operators::GetNCDHW(transformed_output_grad_channel.dims(), + paddle::platform::DataLayout::kNCHW, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); + } + + int group_offset_in = i_c / groups * i_h * i_w * i_d; + int group_offset_out = o_c / groups * o_h * o_w * o_d; + int group_offset_filter = transformed_filter_channel.numel() / groups; +// ------------------- cudnn backward algorithm --------------------- +#ifdef PADDLE_WITH_HIP + miopenConvBwdDataAlgorithm_t data_algo = + static_cast(0); + miopenConvBwdWeightsAlgorithm_t filter_algo = + static_cast(0); +#else + cudnnConvolutionBwdDataAlgo_t data_algo = + static_cast(0); + cudnnConvolutionBwdFilterAlgo_t filter_algo = + static_cast(0); +#endif + // input data workspace_size + size_t workspace_size_d = 0; + // weight workspace_size + size_t workspace_size_w = 0; + int iwo_groups = groups; + int c_groups = 1; + +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) + iwo_groups = 1; + c_groups = groups; + groups = 1; +#endif + + if (input_grad) { + // ------------------- cudnn descriptors --------------------- + input_grad_data = input_grad->data(); + transformed_input_grad_data = transformed_input_grad.data(); + + args1.handle = handle; + args1.idesc.set(transformed_input_grad, layout_tensor); + args1.wdesc.set(transformed_filter_channel, layout_tensor, iwo_groups); + args1.odesc.set(transformed_output_grad_channel, layout_tensor); + args1.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + c_groups); + +#ifdef PADDLE_WITH_HIP + using search1 = + paddle::operators::SearchAlgorithm; + workspace_size_d = + std::max(workspace_size_d, search1::GetWorkspaceSize(args1)); + data_algo = search1::Find( + args1, exhaustive_search, deterministic, workspace_size_d, ctx); +#else + using search1 = + paddle::operators::SearchAlgorithm; + data_algo = search1::Find(args1, exhaustive_search, deterministic, ctx); + workspace_size_d = + std::max(workspace_size_d, search1::GetWorkspaceSize(args1, data_algo)); +#endif + } + + if (filter_grad) { + // ------------------- cudnn descriptors --------------------- + filter_grad_data = transformed_filter_grad_channel.data(); + args2.handle = handle; + args2.idesc.set(transformed_input, layout_tensor); + args2.wdesc.set(transformed_filter_grad_channel, layout_tensor, iwo_groups); + args2.odesc.set(transformed_output_grad_channel, layout_tensor); + args2.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + c_groups); +#ifdef PADDLE_WITH_HIP + using search2 = + 
paddle::operators::SearchAlgorithm; + workspace_size_w = + std::max(workspace_size_w, search2::GetWorkspaceSize(args2)); + filter_algo = search2::Find( + args2, exhaustive_search, deterministic, workspace_size_w, ctx); +#else + using search2 = + paddle::operators::SearchAlgorithm; + filter_algo = + search2::Find(args2, exhaustive_search, deterministic, ctx); + workspace_size_w = std::max(workspace_size_w, + search2::GetWorkspaceSize(args2, filter_algo)); +#endif + } + + // ------------------- cudnn conv backward data --------------------- + paddle::operators::ScalingParamType alpha = 1.0f; +#ifdef PADDLE_WITH_HIP + // MIOPEN ONLY support beta to be 0.0f + paddle::operators::ScalingParamType beta = 0.0f; +#else + paddle::operators::ScalingParamType beta = use_addto ? 1.0f : 0.0f; + +#endif + VLOG(4) << "Conv_grad: use_addto = " << use_addto; + + if (input_grad) { +// When beta is 0, it is unnecessary to reset input_grad. +// When beta is 1, the output cannot be reset since addt strategy used. +#ifdef PADDLE_WITH_HIP + if (use_addto) { + DenseTensor temp_tensor(transformed_input_grad.type()); + temp_tensor.Resize(transformed_input_grad.dims()); + T* temp_tensor_data = temp_tensor.mutable_data(ctx.GetPlace()); + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionBackwardData( + handle, + &alpha, + args1.odesc.desc(), + output_grad_data, + args1.wdesc.desc(), + filter_data, + args1.cdesc.desc(), + data_algo, + &beta, + args1.idesc.desc(), + temp_tensor_data, + cudnn_workspace_ptr, + workspace_size_d)); + }, + workspace_size_d); + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::miopenOpTensor( + handle, + miopenTensorOpAdd, + &alpha, + args1.idesc.desc(), + transformed_input_grad_data, + &alpha, + args1.idesc.desc(), + temp_tensor_data, + &beta, + args1.idesc.desc(), + transformed_input_grad_data)); + } else { + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionBackwardData( + handle, + &alpha, + args1.odesc.desc(), + output_grad_data, + args1.wdesc.desc(), + filter_data, + args1.cdesc.desc(), + data_algo, + &beta, + args1.idesc.desc(), + transformed_input_grad_data, + cudnn_workspace_ptr, + workspace_size_d)); + }, + workspace_size_d); + } + +#else + for (int i = 0; i < groups; i++) { + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnConvolutionBackwardData( + handle, + &alpha, + args1.wdesc.desc(), + filter_data + i * group_offset_filter, + args1.odesc.desc(), + output_grad_data + i * group_offset_out, + args1.cdesc.desc(), + data_algo, + cudnn_workspace_ptr, + workspace_size_d, + &beta, + args1.idesc.desc(), + transformed_input_grad_data + i * group_offset_in)); + }, + workspace_size_d); + } +#endif + if (!is_sys_pad) { + std::vector starts(transformed_input_channel.dims().size(), 0); + std::vector axes(transformed_input_channel.dims().size(), 0); + + for (size_t i = 0; i < transformed_input_channel.dims().size(); ++i) { + starts[i] = input_pad[2 * i]; + axes[i] = i; + } + + transformed_input_grad_channel.mutable_data(ctx.GetPlace()); + if (transformed_input_channel.dims().size() == 4) { + paddle::operators::RemovePaddingSlice( + ctx, + &transformed_input_grad, + &transformed_input_grad_channel, + starts, + axes); + } else { + paddle::operators::RemovePaddingSlice( + ctx, + &transformed_input_grad, + &transformed_input_grad_channel, + 
starts, + axes); + } + } + + if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) { + TransToChannelLast( + ctx, &transformed_input_grad_channel, input_grad); + } + } + + // filter_grad do not use inplace addto. + paddle::operators::ScalingParamType beta_filter = 0.0f; + // ------------------- cudnn conv backward filter --------------------- + if (filter_grad) { +// Because beta is zero, it is unnecessary to reset filter_grad. +#ifdef PADDLE_WITH_HIP + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionBackwardWeights( + handle, + &alpha, + args2.odesc.desc(), + output_grad_data, + args2.idesc.desc(), + input_data, + args2.cdesc.desc(), + filter_algo, + &beta, + args2.wdesc.desc(), + filter_grad_data, + cudnn_workspace_ptr, + workspace_size_w)); + }, + workspace_size_w); +#else + for (int i = 0; i < groups; i++) { + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnConvolutionBackwardFilter( + handle, + &alpha, + args2.idesc.desc(), + input_data + i * group_offset_in, + args2.odesc.desc(), + output_grad_data + i * group_offset_out, + args2.cdesc.desc(), + filter_algo, + cudnn_workspace_ptr, + workspace_size_w, + &beta_filter, + args2.wdesc.desc(), + filter_grad_data + i * group_offset_filter)); + }, + workspace_size_w); + } +#endif + + if (compute_format == paddle::platform::DataLayout::kNHWC) { + TransToChannelFirst( + ctx, &transformed_filter_grad_channel, filter_grad); + } + } +} + +template +void Conv3DCudnnGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + ConvCudnnGradKernel(dev_ctx, + out_grad, + input, + filter, + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + input_grad, + filter_grad); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(conv2d_grad, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnGradKernel, + float, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(conv3d_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnGradKernel, + float, + phi::dtype::float16) {} +#else +#if CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(conv2d_grad, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(conv3d_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +PD_REGISTER_KERNEL(conv2d_grad, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnGradKernel, + float, + double, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(conv3d_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnGradKernel, + float, + double, + phi::dtype::float16) {} + +#endif + +#endif diff --git a/paddle/phi/kernels/gpudnn/conv_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/conv_kernel_gpudnn.cu new file mode 100644 index 00000000000..931b6d68845 --- /dev/null +++ b/paddle/phi/kernels/gpudnn/conv_kernel_gpudnn.cu @@ -0,0 +1,476 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
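Before moving on to the forward kernel below, a recap of the alpha/beta convention these gpudnn kernels rely on: the cuDNN/MIOpen calls blend their result into the destination as dst = alpha * result + beta * dst. A scalar toy of that blend (illustrative only) shows why use_addto maps to beta = 1 on the backward-data path above, and why MIOpen, which per the comment above only supports beta = 0, takes the temporary-tensor-plus-miopenOpTensor detour:

#include <cstddef>
#include <vector>

// dst = alpha * result + beta * dst, elementwise.
// beta == 0 overwrites the gradient buffer; beta == 1 accumulates into it,
// which is what the inplace-addto strategy needs.
void BlendInto(const std::vector<float>& result, float alpha, float beta,
               std::vector<float>* dst) {
  for (std::size_t i = 0; i < result.size(); ++i) {
    (*dst)[i] = alpha * result[i] + beta * (*dst)[i];
  }
}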
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/conv_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/fluid/framework/eigen.h" +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/operators/conv_miopen_helper.h" +#else +#include "paddle/fluid/operators/conv_cudnn_helper.h" +#endif + +#include "paddle/fluid/platform/cudnn_workspace_helper.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/phi/kernels/funcs/padding.h" + +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" + +#include "paddle/phi/kernels/impl/conv_cudnn_impl.h" + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" + +namespace phi { + +template +void ConvCudnnKernel(const Context& ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search_t, + DenseTensor* output) { + output->mutable_data(ctx.GetPlace()); + std::vector paddings = paddings_t; + std::vector dilations = dilations_t; + + bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t; + bool deterministic = FLAGS_cudnn_deterministic; + auto exhaustive_deterministic = exhaustive_search && deterministic; + PADDLE_ENFORCE_EQ(exhaustive_deterministic, + false, + phi::errors::InvalidArgument( + "Cann't set exhaustive_search True and " + "FLAGS_cudnn_deterministic True at same time.")); + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + auto dtype = paddle::platform::CudnnDataType::type; + +#ifdef PADDLE_WITH_HIP + // HIP MIOPEN ONLY SUPPORT NCHW format + auto compute_format = paddle::platform::DataLayout::kNCHW; +#else + // Tensor Core introduced from Volta GPUs supports more faster conv op + // with FP16 in NHWC data format. + const bool compute_in_nhwc = dtype == CUDNN_DATA_HALF && IsVoltaOrLater(ctx); + // We will only do data format conversion from NHWC to NCHW. + // cudnn will convert NCHW to NHWC automatically on Tensor Core. + auto compute_format = compute_in_nhwc && channel_last + ? paddle::platform::DataLayout::kNHWC + : paddle::platform::DataLayout::kNCHW; +#endif + VLOG(3) << "Compute ConvOp with cuDNN:" + << " data_format=" << data_format << " compute_format=" + << (compute_format == paddle::platform::DataLayout::kNHWC ? 
"NHWC" + : "NCHW"); + + // ------------ transformed tensor ----------- + DenseTensor transformed_input_channel(input.type()); + DenseTensor transformed_output(output->type()); + DenseTensor transformed_filter_channel(filter.type()); + T* output_data = nullptr; + if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) { + VLOG(3) << "Transform input tensor from NHWC to NCHW."; + ResizeToChannelFirst(ctx, &input, &transformed_input_channel); + TransToChannelFirst(ctx, &input, &transformed_input_channel); + + ResizeToChannelFirst(ctx, output, &transformed_output); + + } else { + transformed_input_channel.ShareDataWith(input); + transformed_output.ShareDataWith(*output); + } + if (compute_format == paddle::platform::DataLayout::kNHWC) { + VLOG(3) << "Transform filter tensor from NCHW to NHWC."; + ResizeToChannelLast(ctx, &filter, &transformed_filter_channel); + TransToChannelLast(ctx, &filter, &transformed_filter_channel); + } else { + transformed_filter_channel.ShareDataWith(filter); + } + output_data = transformed_output.data(); + + // update padding and dilation + auto in_dims = transformed_input_channel.dims(); + auto filter_dims = transformed_filter_channel.dims(); + DDim in_data_dims; + DDim filter_data_dims; + + if (compute_format == paddle::platform::DataLayout::kNCHW) { + in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); + filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + } else { + in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1); + filter_data_dims = slice_ddim(filter_dims, 1, filter_dims.size() - 1); + } + + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + int data_dim = strides.size(); // 2d or 3d + bool is_sys_pad = funcs::IsSymmetricPadding(paddings, data_dim); + + DenseTensor transformed_input; + std::vector padding_common(data_dim, 0); + if (!is_sys_pad) { + std::vector padding_diff(data_dim); + std::vector new_input_shape_vec(data_dim + 2); + new_input_shape_vec[0] = transformed_input_channel.dims()[0]; + + if (compute_format == paddle::platform::DataLayout::kNCHW) { + new_input_shape_vec[1] = transformed_input_channel.dims()[1]; + } else { + new_input_shape_vec[data_dim + 1] = + transformed_input_channel.dims()[data_dim + 1]; + } + + std::vector input_pad(transformed_input_channel.dims().size() * 2, 0); + for (size_t i = 0; i < data_dim; ++i) { + padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); + padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); + if (compute_format == paddle::platform::DataLayout::kNCHW) { + new_input_shape_vec[i + 2] = + transformed_input_channel.dims()[i + 2] + padding_diff[i]; + } else { + new_input_shape_vec[i + 1] = + transformed_input_channel.dims()[i + 1] + padding_diff[i]; + } + if (compute_format == paddle::platform::DataLayout::kNCHW) { + input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; + input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; + } else { + input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i]; + input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i]; + } + } + DDim new_input_shape(make_ddim(new_input_shape_vec)); + transformed_input.Resize(new_input_shape); + transformed_input.mutable_data(ctx.GetPlace()); + + const int rank = transformed_input_channel.dims().size(); + T pad_value(0.0); + switch (rank) { + case 4: { + funcs::PadFunction(ctx, + input_pad, + transformed_input_channel, + pad_value, + 
&transformed_input); + } break; + case 5: { + funcs::PadFunction(ctx, + input_pad, + transformed_input_channel, + pad_value, + &transformed_input); + } break; + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "ConvOp only support tensors with 4 or 5 dimensions.")); + } + + } else { + transformed_input.ShareDataWith(transformed_input_channel); + if (paddings.size() == data_dim) { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings[i]; + } + } else { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings[2 * i]; + } + } + } + + const T* input_data = transformed_input.data(); + + const T* filter_data = transformed_filter_channel.data(); + + // ------------------- cudnn descriptors --------------------- + paddle::operators::ConvArgs args{&transformed_input, + &transformed_filter_channel, + &transformed_output, + strides, + padding_common, + dilations, + dtype}; + + auto handle = ctx.cudnn_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); + paddle::platform::DataLayout layout = + compute_format == paddle::platform::DataLayout::kNHWC + ? paddle::platform::DataLayout::kNHWC + : paddle::platform::DataLayout::kNCHW; + if (transformed_input.dims().size() == 5) { + layout = compute_format == paddle::platform::DataLayout::kNHWC + ? paddle::platform::DataLayout::kNDHWC + : paddle::platform::DataLayout::kNCDHW; + } + auto layout_format = paddle::platform::GetCudnnTensorFormat(layout); + + args.handle = handle; + +#ifdef PADDLE_WITH_HIP + // MIOPEN need to set groups in cdesc in miopen_desc.h + args.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + groups); +#else + args.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn()); +#endif + +#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) + // cudnn 7 can support groups, no need to do it manually + // FIXME(typhoonzero): find a better way to disable groups + // rather than setting it to 1. + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnSetConvolutionGroupCount( + args.cdesc.desc(), groups)); + groups = 1; +#endif +#ifdef PADDLE_WITH_HIP + // MIOPEN do not set groups in wdesc after set groups in cdesc + groups = 1; +#endif + args.idesc.set(transformed_input, layout_format); + args.wdesc.set(transformed_filter_channel, layout_format, groups); + args.odesc.set(transformed_output, layout_format); + int i_n, i_c, i_d, i_h, i_w; + int o_n, o_c, o_d, o_h, o_w; + + if (compute_format == paddle::platform::DataLayout::kNHWC) { + paddle::operators::GetNCDHW(transformed_input.dims(), + paddle::platform::DataLayout::kNHWC, + &i_n, + &i_c, + &i_d, + &i_h, + &i_w); + paddle::operators::GetNCDHW(transformed_output.dims(), + paddle::platform::DataLayout::kNHWC, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); + } else { + paddle::operators::GetNCDHW(transformed_input.dims(), + paddle::platform::DataLayout::kNCHW, + &i_n, + &i_c, + &i_d, + &i_h, + &i_w); + paddle::operators::GetNCDHW(transformed_output.dims(), + paddle::platform::DataLayout::kNCHW, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); + } + + int group_offset_in = i_c / groups * i_h * i_w * i_d; + int group_offset_out = o_c / groups * o_h * o_w * o_d; + int group_offset_filter = transformed_filter_channel.numel() / groups; + // ------------------- cudnn conv workspace --------------------- + size_t workspace_size = 0; // final workspace to allocate. 
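The group_offset_* values computed just above feed the per-group loops in the forward call that follows. A standalone sketch of that arithmetic, with field names shortening the kernel's group_offset_in / group_offset_out / group_offset_filter (the struct itself is illustrative):

#include <cstdint>

struct GroupOffsets {
  int64_t in;      // (C_in  / groups) * D * H * W elements per group of input
  int64_t out;     // (C_out / groups) * D * H * W elements per group of output
  int64_t filter;  // filter_numel / groups elements per group of filter
};

GroupOffsets ComputeGroupOffsets(int i_c, int i_d, int i_h, int i_w,
                                 int o_c, int o_d, int o_h, int o_w,
                                 int64_t filter_numel, int groups) {
  GroupOffsets g;
  g.in = static_cast<int64_t>(i_c / groups) * i_d * i_h * i_w;
  g.out = static_cast<int64_t>(o_c / groups) * o_d * o_h * o_w;
  g.filter = filter_numel / groups;
  return g;
}
// Group i then reads input_data + i * g.in and filter_data + i * g.filter and
// writes output_data + i * g.out.

Note that on HIP and on cuDNN 7.0.1+ the code resets groups to 1 after configuring grouping on the convolution descriptor, so the loop runs a single iteration there and the manual per-group offsets only matter on older cuDNN.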
+// ------------------- cudnn conv algorithm --------------------- +#ifdef PADDLE_WITH_HIP + miopenConvFwdAlgorithm_t algo{}; + using search = paddle::operators::SearchAlgorithm; + workspace_size = search::GetWorkspaceSize(args); + algo = search::Find( + args, exhaustive_search, deterministic, workspace_size, ctx); +#else + cudnnConvolutionFwdAlgo_t algo{}; + using search = + paddle::operators::SearchAlgorithm; + algo = search::Find(args, exhaustive_search, deterministic, ctx); + workspace_size = search::GetWorkspaceSize(args, algo); +#endif + +#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) + // when groups > 1, SearchAlgorithm find algo is CUDNN_CONVOLUTION_\ + // FWD_ALGO_WINOGRAD_NONFUSED, but this kind of algorithm is unstable + // in forward computation, so change the algorithm to CUDNN_CONVOLUTION_\ + // FWD_ALGO_IMPLICIT_GEMM manually. + if (groups > 1) { + algo = static_cast(0); + } +#endif + + // ------------------- cudnn conv forward --------------------- + paddle::operators::ScalingParamType alpha = 1.0f; + paddle::operators::ScalingParamType beta = 0.0f; + +// NOTE(zhiqiu): inplace addto is not supportted in double grad yet. +// ScalingParamType beta = ctx.Attr("use_addto") ? 1.0f : 0.0f; +// VLOG(4) << "Conv: use_addto = " << ctx.Attr("use_addto"); + +#ifdef PADDLE_WITH_HIP + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionForward( + handle, + &alpha, + args.idesc.desc(), + input_data, + args.wdesc.desc(), + filter_data, + args.cdesc.desc(), + algo, + &beta, + args.odesc.desc(), + output_data, + workspace_ptr, + workspace_size)); + }, + workspace_size); +#else + for (int i = 0; i < groups; i++) { + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnConvolutionForward( + handle, + &alpha, + args.idesc.desc(), + input_data + i * group_offset_in, + args.wdesc.desc(), + filter_data + i * group_offset_filter, + args.cdesc.desc(), + algo, + workspace_ptr, + workspace_size, + &beta, + args.odesc.desc(), + output_data + i * group_offset_out)); + }, + workspace_size); + } +#endif + + if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) { + TransToChannelLast(ctx, &transformed_output, output); + } +} + +template +void Conv3DCudnnKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* out) { + ConvCudnnKernel(dev_ctx, + input, + filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + out); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(conv2d, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnKernel, + float, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(conv3d, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnKernel, + float, + phi::dtype::float16) {} +#else +#if CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(conv2d, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(conv3d, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +PD_REGISTER_KERNEL(conv2d, + 
GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnKernel, + float, + double, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(conv3d, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnKernel, + float, + double, + phi::dtype::float16) {} +#endif + +#endif + +// todo register bfloat16 diff --git a/paddle/phi/kernels/impl/conv_cudnn_impl.h b/paddle/phi/kernels/impl/conv_cudnn_impl.h new file mode 100644 index 00000000000..93bc5b64adc --- /dev/null +++ b/paddle/phi/kernels/impl/conv_cudnn_impl.h @@ -0,0 +1,90 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/fluid/framework/eigen.h" +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/operators/conv_miopen_helper.h" +#else +#include "paddle/fluid/operators/conv_cudnn_helper.h" +#endif + +#include "paddle/fluid/platform/cudnn_workspace_helper.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/phi/kernels/funcs/padding.h" + +#include "paddle/fluid/platform/dynload/cudnn.h" +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" + +DECLARE_bool(cudnn_deterministic); +DECLARE_uint64(conv_workspace_size_limit); +DECLARE_bool(cudnn_exhaustive_search); + +namespace phi { + +static inline bool IsVoltaOrLater(const phi::GPUContext& dev_ctx) { + return dev_ctx.GetComputeCapability() >= 70; +} + +// inline cudnnTensorFormat_t GetCudnnTensorFormat( +// const phi::DataLayout& order) { // Not use +// switch (order) { +// case phi::DataLayout::kNHWC: +// return CUDNN_TENSOR_NHWC; +// case phi::DataLayout::kNCHW: +// return CUDNN_TENSOR_NCHW; +// case phi::DataLayout::NCDHW: +// return CUDNN_TENSOR_NCHW; // NOTE: cudnn treat NdTensor as the same +// case phi::DataLayout::NDHWC: +// return CUDNN_TENSOR_NHWC; // add, liyamei +// default: +// PADDLE_THROW(phi::errors::Unimplemented( +// "CUDNN has no equivalent dataLayout for input order.")); +// } +// return CUDNN_TENSOR_NCHW; +// } + +static inline void GetNCDHW(const DDim& dims, + const phi::DataLayout& layout, + int* N, + int* C, + int* D, + int* H, + int* W) { + *N = dims[0]; + *C = layout == phi::DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; + int i = layout == phi::DataLayout::kNCHW ? 0 : 1; + if (dims.size() == 5) { + *D = dims[2 - i]; + *H = dims[3 - i]; + *W = dims[4 - i]; + } else { + *D = 1; + *H = dims[2 - i]; + *W = dims[3 - i]; + } +} + +} // namespace phi + +// PD_REGISTER_KERNEL(convdnn, GPU, ALL_LAYOUT, phi::ConvKernel, float, double +// ) {} diff --git a/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h new file mode 100644 index 00000000000..fbcebf371a6 --- /dev/null +++ b/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h @@ -0,0 +1,330 @@ +// Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/operators/math/vol2col.h" +#include "paddle/phi/kernels/conv_kernel.h" +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void ConvGradGradKernel(const Context& dev_ctx, + paddle::optional input_grad_grad, + paddle::optional filter_grad_grad, + const DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides_t, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* out_grad_grad, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + const DenseTensor* X = &input; + const DenseTensor* dY = &out_grad; + const DenseTensor* ddX = input_grad_grad.get_ptr(); + const DenseTensor* ddW_in = filter_grad_grad.get_ptr(); + + DenseTensor* ddY = out_grad_grad; + DenseTensor* dW = filter_grad; + DenseTensor* dX = input_grad; + DenseTensor W = filter; + + if (!ddY && !dW && !dX) return; + + const std::vector strides = strides_t; + std::vector paddings = paddings_t; + std::vector dilations = dilations_t; + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + // transform Tensor + DenseTensor transformed_X(X->type()); + DenseTensor transformed_dY(dY->type()); + DenseTensor transformed_ddX(X->type()); + + if (channel_last) { + ResizeToChannelFirst(dev_ctx, X, &transformed_X); + TransToChannelFirst(dev_ctx, X, &transformed_X); + + ResizeToChannelFirst(dev_ctx, dY, &transformed_dY); + TransToChannelFirst(dev_ctx, dY, &transformed_dY); + + if (ddX) { + ResizeToChannelFirst(dev_ctx, ddX, &transformed_ddX); + TransToChannelFirst(dev_ctx, ddX, &transformed_ddX); + } + } else { + transformed_X = *X; + transformed_dY = *dY; + if (ddX) { + transformed_ddX = *ddX; + } + } + + // update padding and dilation + auto in_dims = transformed_X.dims(); + auto filter_dims = W.dims(); + + DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + const int batch_size = static_cast(transformed_X.dims()[0]); + std::vector filter_shape_vec(vectorize(W.dims())); + std::vector output_shape_vec(vectorize(transformed_dY.dims())); + + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + // col_shape [in_channel/group, kh, kw, oh, ow] + col_shape_vec[0] = transformed_X.dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = 
filter_shape_vec[j + 2]; + col_shape_vec[j + data_dim + 1] = output_shape_vec[j + 2]; + } + DDim col_shape(make_ddim(col_shape_vec)); + // col_matrix_shape [in_channel/group * kh * kw, oh * ow] + DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1); + // input_shape [Cin, H, W] + DDim input_shape = + slice_ddim(transformed_X.dims(), 1, transformed_X.dims().size()); + // filter_matrix_shape [Cout, Cin * kh * kw] + DDim filter_matrix_shape = {W.dims()[0], W.numel() / W.dims()[0]}; + + W.Resize(filter_matrix_shape); + DDim output_matrix_shape = { + transformed_dY.dims()[1], + transformed_dY.numel() / + (transformed_dY.dims()[0] * transformed_dY.dims()[1])}; + int in_step = static_cast(transformed_X.dims()[1]) / groups; + int out_step = static_cast(transformed_dY.dims()[1]) / groups; + + bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); + DenseTensor col; + DenseTensor col_matrix; + if (is_expand) { + col.Resize(col_shape); + col.mutable_data(dev_ctx.GetPlace()); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + phi::funcs::SetConstant set_zero; + auto blas = phi::funcs::GetBlas(dev_ctx); + + // dx convolution double grad: gemm + col2im(col2vol) + // dx = ddw * dy ==> dx(N, Cin, H, W), ddw(Cout, Cin, kh, kw), dy(N, Cout, + // oH, oW) + if (dX && ddW_in) { + Tensor ddW; + ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); + dX->mutable_data(dev_ctx.GetPlace()); + + DenseTensor transformed_dX(dX->type()); + + if (channel_last) { + ResizeToChannelFirst(dev_ctx, dX, &transformed_dX); + + } else { + transformed_dX = *dX; + } + // if is_expand is false, the operation of set_zero is unnecessary + // because math::matmul will reset dx + if (is_expand) { + set_zero(dev_ctx, &transformed_dX, static_cast(0)); + } + paddle::operators::math::Col2VolFunctor col2vol; + paddle::operators::math:: + Col2ImFunctor + col2im; + + for (int i = 0; i < batch_size; i++) { + DenseTensor dy_batch = + transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); + DenseTensor dx_batch = transformed_dX.Slice(i, i + 1).Resize(input_shape); + for (int g = 0; g < groups; g++) { + // gemm + DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); + DenseTensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); + DenseTensor dx_slice = dx_batch.Slice(g * in_step, (g + 1) * in_step); + if (!is_expand) { + col_matrix.ShareDataWith(dx_slice); + col_matrix.Resize(col_matrix_shape); + } + blas.MatMul( + ddw_slice, true, dy_slice, false, T(1.0), &col_matrix, T(0.0)); + + if (is_expand && data_dim == 2U) { + col2im(dev_ctx, + col, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], paddings[1], paddings[3]}, + &dx_slice); + } else if (is_expand && data_dim == 3U) { + col2vol(dev_ctx, col, dilations, strides, paddings, &dx_slice); + } + } + } + if (channel_last) { + TransToChannelLast(dev_ctx, &transformed_dX, dX); + } + } + + // dw = ddx * dy ==> dw(Cout, Cin, kh, kw), ddx(N, Cin, H, W), dy(N, Cout, + // oH, oW) + // dw convolution double grad: im2col(vol2col) + gemm + if (dW && ddX) { + dW->mutable_data(dev_ctx.GetPlace()); + set_zero(dev_ctx, dW, static_cast(0)); + DenseTensor dW_arr = *dW; + dW_arr.Resize(filter_matrix_shape); + paddle::operators::math:: + Im2ColFunctor + im2col; + paddle::operators::math::Vol2ColFunctor vol2col; + for (int i = 0; i < batch_size; ++i) { + DenseTensor dy_batch = + transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); + Tensor ddx_batch = transformed_ddX.Slice(i, i + 1).Resize(input_shape); 
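The dX branch above and the dW/ddY branches that follow (like the forward and backward kernels later in this patch) all reduce convolution to an im2col/vol2col unfold followed by a plain GEMM via blas.MatMul. As a point of reference, a minimal standalone sketch of that mapping for a 2-D, stride-1, unpadded, single-sample case; all names and shapes here are illustrative and not part of the patch.

#include <cstddef>
#include <vector>

// input  : [C, H, W], row-major
// filter : [OC, C * KH * KW]
// output : [OC, OH * OW] with OH = H - KH + 1, OW = W - KW + 1
void NaiveConvViaIm2ColGemm(const std::vector<float>& input, int C, int H, int W,
                            const std::vector<float>& filter, int OC, int KH, int KW,
                            std::vector<float>* output) {
  const int OH = H - KH + 1, OW = W - KW + 1;
  // im2col: every (kh, kw) tap of every channel becomes one row of `col`,
  // every output pixel becomes one column.
  std::vector<float> col(static_cast<size_t>(C) * KH * KW * OH * OW, 0.f);
  for (int c = 0; c < C; ++c)
    for (int kh = 0; kh < KH; ++kh)
      for (int kw = 0; kw < KW; ++kw)
        for (int oh = 0; oh < OH; ++oh)
          for (int ow = 0; ow < OW; ++ow) {
            const int row = (c * KH + kh) * KW + kw;
            col[static_cast<size_t>(row) * OH * OW + oh * OW + ow] =
                input[static_cast<size_t>(c * H + oh + kh) * W + ow + kw];
          }
  // GEMM: output = filter x col, the role played by blas.MatMul per group
  // in the kernels of this patch.
  output->assign(static_cast<size_t>(OC) * OH * OW, 0.f);
  for (int oc = 0; oc < OC; ++oc)
    for (int k = 0; k < C * KH * KW; ++k)
      for (int p = 0; p < OH * OW; ++p)
        (*output)[static_cast<size_t>(oc) * OH * OW + p] +=
            filter[static_cast<size_t>(oc) * C * KH * KW + k] *
            col[static_cast<size_t>(k) * OH * OW + p];
}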
+ for (int g = 0; g < groups; ++g) { + // im2col + DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); + DenseTensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step); + if (!is_expand) { + col.ShareDataWith(ddx_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + im2col(dev_ctx, + ddx_slice, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], paddings[1], paddings[3]}, + &col); + } else if (data_dim == 3U) { + vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); + } + + DenseTensor dw_slice = dW_arr.Slice(g * out_step, (g + 1) * out_step); + blas.MatMul( + dy_slice, false, col_matrix, true, T(1.0), &dw_slice, T(1.0)); + } + } + } + + // ddy = w * ddx + x * ddw ==> ddy(N, Cout, oH, oW), x/ddx(N, Cin, H, W), + // w/ddw(Cout, Cin, kh, kw) + // ddy convolution double grad: im2col(vol2col) + gemm + if (ddY) { + ddY->mutable_data(dev_ctx.GetPlace()); + + DenseTensor transformed_ddY(ddY->type()); + if (channel_last) { + ResizeToChannelFirst(dev_ctx, ddY, &transformed_ddY); + } else { + transformed_ddY = *ddY; + } + + set_zero(dev_ctx, &transformed_ddY, static_cast(0)); + paddle::operators::math:: + Im2ColFunctor + im2col; + paddle::operators::math::Vol2ColFunctor vol2col; + for (int i = 0; i < batch_size; ++i) { + DenseTensor ddy_batch = + transformed_ddY.Slice(i, i + 1).Resize(output_matrix_shape); + for (int g = 0; g < groups; ++g) { + // gemm + DenseTensor ddy_slice = + ddy_batch.Slice(g * out_step, (g + 1) * out_step); + + if (ddX) { + DenseTensor ddx_batch = + transformed_ddX.Slice(i, i + 1).Resize(input_shape); + DenseTensor ddx_slice = + ddx_batch.Slice(g * in_step, (g + 1) * in_step); + if (!is_expand) { + col.ShareDataWith(ddx_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + im2col(dev_ctx, + ddx_slice, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], paddings[1], paddings[3]}, + &col); + } else if (data_dim == 3U) { + vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); + } + DenseTensor w_slice = W.Slice(g * out_step, (g + 1) * out_step); + blas.MatMul( + w_slice, false, col_matrix, false, T(1.0), &ddy_slice, T(0.0)); + } + + if (ddW_in) { + DenseTensor x_batch = + transformed_X.Slice(i, i + 1).Resize(input_shape); + DenseTensor x_slice = x_batch.Slice(g * in_step, (g + 1) * in_step); + + DenseTensor ddW; + ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); + if (!is_expand) { + col.ShareDataWith(x_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + im2col(dev_ctx, + x_slice, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], paddings[1], paddings[3]}, + &col); + } else if (data_dim == 3U) { + vol2col(dev_ctx, x_slice, dilations, strides, paddings, &col); + } + + // gemm + DenseTensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); + blas.MatMul( + ddw_slice, false, col_matrix, false, T(1.0), &ddy_slice, T(1.0)); + } + } + } + if (channel_last) { + TransToChannelLast(dev_ctx, &transformed_ddY, ddY); + } + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_grad_kernel_impl.h new file mode 100644 index 00000000000..f1971aca800 --- /dev/null +++ b/paddle/phi/kernels/impl/conv_grad_kernel_impl.h @@ -0,0 +1,257 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
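For readability, the three branches of ConvGradGradKernel above correspond to the usual double-grad identities for Y = conv(X, W); stated informally as a summary (not text from the patch):

//   dX  = grad_input(dY, ddW)          realized as  col = ddW^T * dY_mat, then col2im / col2vol
//   dW  = grad_filter(ddX, dY)         realized as  dW_mat += dY_mat * im2col(ddX)^T
//   ddY = conv(ddX, W) + conv(X, ddW)  realized as  ddY_mat = W * im2col(ddX) + ddW * im2col(X)
// each applied per batch sample and per channel group, matching the Slice()
// arithmetic used throughout the kernel.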
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/operators/math/vol2col.h" +#include "paddle/phi/kernels/conv_grad_kernel.h" +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void ConvGradKernel(const Context& dev_ctx, + const DenseTensor& output_grad, + const DenseTensor& input, + const DenseTensor& filter_t, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + // The filter and filter_grad will be reshaped in the calculations, + // so here use an assignment operation, + // that avoids modifying the variable in the Scope. + + if (!input_grad && !filter_grad) return; + std::vector paddings = paddings_t; + std::vector dilations = dilations_t; + + DenseTensor filter = filter_t; + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + DenseTensor transformed_input(input.type()); + DenseTensor transformed_output_grad(output_grad.type()); + + if (channel_last) { + ResizeToChannelFirst(dev_ctx, &input, &transformed_input); + TransToChannelFirst(dev_ctx, &input, &transformed_input); + + ResizeToChannelFirst( + dev_ctx, &output_grad, &transformed_output_grad); + TransToChannelFirst( + dev_ctx, &output_grad, &transformed_output_grad); + } else { + transformed_input = input; + transformed_output_grad = output_grad; + } + + // update padding and dilation + auto in_dims = transformed_input.dims(); + auto filter_dims = filter.dims(); + DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + const int batch_size = static_cast(transformed_input.dims()[0]); + + // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} + std::vector filter_shape_vec(vectorize(filter.dims())); + // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} + std::vector output_shape_vec( + vectorize(transformed_output_grad.dims())); + + // use col_shape in the im2col calculation + // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d, + // o_h, o_w} + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = transformed_input.dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + DDim 
col_shape(make_ddim(col_shape_vec)); + + // use col_matrix_shape in the gemm calculation + // size: (i_c/g * k_h * k_w, o_h * o_w) + // or + // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w) + DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1); + + DDim input_shape = + slice_ddim(transformed_input.dims(), 1, transformed_input.dims().size()); + + DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + + DDim output_matrix_shape = { + transformed_output_grad.dims()[1], + transformed_output_grad.numel() / (transformed_output_grad.dims()[0] * + transformed_output_grad.dims()[1])}; + + // convolution backward input operator: gemm + col2im(or col2vol) + // convolution backward weight operator: im2col(or vol2col) + gemm + int in_step = static_cast(transformed_input.dims()[1]) / groups; + int out_step = static_cast(transformed_output_grad.dims()[1]) / groups; + + bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); + + DenseTensor col; + // col_matrix shares the same piece of data with col, + // but will be reshaped into a two-dimensional matrix shape + // to call the matrix multiplication interface. + DenseTensor col_matrix; + if (is_expand) { + col.Resize(col_shape); + col.mutable_data(dev_ctx.GetPlace()); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + phi::funcs::SetConstant set_zero; + auto blas = phi::funcs::GetBlas(dev_ctx); + + if (input_grad) { + input_grad->mutable_data(dev_ctx.GetPlace()); + DenseTensor transformed_input_grad(input_grad->type()); + if (channel_last) { + ResizeToChannelFirst( + dev_ctx, input_grad, &transformed_input_grad); + + } else { + transformed_input_grad = *input_grad; + } + // if is_expand is false, the operation of set_zero is unnecessary, + // because math::matmul will reset input_grad. 
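The input-gradient loop below computes col = filter^T * dY_mat per group and then scatters those columns back into the image with Col2ImFunctor / Col2VolFunctor. A minimal sketch of that scatter, the adjoint of the im2col sketch given earlier (2-D, stride 1, no padding; names illustrative, not part of the patch):

#include <cstddef>
#include <vector>

// col : [C * KH * KW, OH * OW], as produced by the per-group GEMM
// dx  : [C, H, W], must be pre-sized to C * H * W and zero-initialized;
//       overlapping windows accumulate, which is why the kernel zeroes the
//       gradient buffer before the loop when is_expand is true.
void NaiveCol2Im(const std::vector<float>& col, int C, int H, int W,
                 int KH, int KW, std::vector<float>* dx) {
  const int OH = H - KH + 1, OW = W - KW + 1;
  for (int c = 0; c < C; ++c)
    for (int kh = 0; kh < KH; ++kh)
      for (int kw = 0; kw < KW; ++kw)
        for (int oh = 0; oh < OH; ++oh)
          for (int ow = 0; ow < OW; ++ow) {
            const int row = (c * KH + kh) * KW + kw;
            (*dx)[(c * H + oh + kh) * W + ow + kw] +=
                col[static_cast<size_t>(row) * OH * OW + oh * OW + ow];
          }
}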
+ if (is_expand) { + set_zero(dev_ctx, &transformed_input_grad, static_cast(0)); + } + paddle::operators::math::Col2VolFunctor col2vol; + paddle::operators::math:: + Col2ImFunctor + col2im; + + for (int i = 0; i < batch_size; i++) { + DenseTensor out_grad_batch = + transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape); + DenseTensor in_grad_batch = + transformed_input_grad.Slice(i, i + 1).Resize(input_shape); + for (int g = 0; g < groups; g++) { + // gemm + DenseTensor out_grad_slice = + out_grad_batch.Slice(g * out_step, (g + 1) * out_step); + DenseTensor filter_slice = + filter.Slice(g * out_step, (g + 1) * out_step); + + DenseTensor in_grad_slice = + in_grad_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col_matrix.ShareDataWith(in_grad_slice); + col_matrix.Resize(col_matrix_shape); + } + blas.MatMul(filter_slice, + true, + out_grad_slice, + false, + T(1.0), + &col_matrix, + T(0.0)); + + if (is_expand && data_dim == 2U) { + col2im(dev_ctx, + col, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], paddings[1], paddings[3]}, + &in_grad_slice); + } else if (is_expand && data_dim == 3U) { + col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice); + } + } + } + if (channel_last) { + TransToChannelLast( + dev_ctx, &transformed_input_grad, input_grad); + } + } + + if (filter_grad) { + filter_grad->mutable_data(dev_ctx.GetPlace()); + Tensor filter_grad_ = *filter_grad; + filter_grad_.Resize(filter_matrix_shape); + set_zero(dev_ctx, filter_grad, static_cast(0)); + paddle::operators::math:: + Im2ColFunctor + im2col; + paddle::operators::math::Vol2ColFunctor vol2col; + for (int i = 0; i < batch_size; i++) { + DenseTensor out_grad_batch = + transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape); + DenseTensor in_batch = + transformed_input.Slice(i, i + 1).Resize(input_shape); + for (int g = 0; g < groups; g++) { + // im2col + DenseTensor out_grad_slice = + out_grad_batch.Slice(g * out_step, (g + 1) * out_step); + DenseTensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + im2col(dev_ctx, + in_slice, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], paddings[1], paddings[3]}, + &col); + + } else if (data_dim == 3U) { + vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); + } + + // gemm + DenseTensor filter_grad_slice = + filter_grad_.Slice(g * out_step, (g + 1) * out_step); + blas.MatMul(out_grad_slice, + false, + col_matrix, + true, + T(1.0), + &filter_grad_slice, + T(1.0)); + } + } + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/conv_kernel_impl.h b/paddle/phi/kernels/impl/conv_kernel_impl.h new file mode 100644 index 00000000000..1945468f025 --- /dev/null +++ b/paddle/phi/kernels/impl/conv_kernel_impl.h @@ -0,0 +1,183 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/operators/math/vol2col.h" +#include "paddle/phi/kernels/conv_kernel.h" +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void ConvKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter_t, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* output) { + std::vector paddings = paddings_t; + std::vector dilations = dilations_t; + DenseTensor filter = filter_t; + // The filter will be reshaped in the calculations, + // so here use an assignment operation, + // that avoids modifying the variable in the Scope. + output->mutable_data(dev_ctx.GetPlace()); + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + DenseTensor transformed_input(input.type()); + DenseTensor transformed_output(output->type()); + + if (channel_last) { + ResizeToChannelFirst(dev_ctx, &input, &transformed_input); + TransToChannelFirst(dev_ctx, &input, &transformed_input); + + ResizeToChannelFirst(dev_ctx, output, &transformed_output); + + } else { + transformed_input = input; + transformed_output = *output; + } + + // update padding and dilation + auto trans_in_dims = transformed_input.dims(); + auto filter_dims = filter.dims(); + + DDim in_data_dims = slice_ddim(trans_in_dims, 2, trans_in_dims.size()); + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + const int batch_size = static_cast(transformed_input.dims()[0]); + + // filter_shape_vec: + // {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} + std::vector filter_shape_vec(vectorize(filter.dims())); + + // output_shape_vec: + // {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} + std::vector output_shape_vec(vectorize(transformed_output.dims())); + + // use col_shape in the im2col calculation + // col_shape_vec: + // {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, + // o_d,o_h, o_w} + size_t data_dim = filter_shape_vec.size() - 2; + + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = trans_in_dims[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + + DDim col_shape(make_ddim(col_shape_vec)); + + // use col_matrix_shape in the gemm calculation + // size: + // (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d * o_h * + // o_w) + + DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim); + + bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); + + DenseTensor col; + // col_matrix shares the same piece of data with col, + // but will be reshaped into a two-dimensional matrix shape + // to call the matrix multiplication interface. 
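A worked shape example for the buffers being set up here may help; the numbers are made up purely for illustration (N=2, Cin=8, H=W=16, filter Cout=16 with 3x3 kernels, groups g=2, stride 1, padding 1, so oh=ow=16):

//   col_shape        = {Cin/g, kh, kw, oh, ow}     = {4, 3, 3, 16, 16}
//   col_matrix_shape = {Cin/g * kh * kw, oh * ow}  = {36, 256}
//   filter matrix    = {Cout, Cin/g * kh * kw}     = {16, 36}
//   per-group GEMM   : filter_slice [Cout/g, 36] x col_matrix [36, 256]
//                      -> out_slice [Cout/g, 256] = [8, 256]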
+ DenseTensor col_matrix; + if (is_expand) { + // col = context.AllocateTmpTensor(col_shape, dev_ctx); + col.Resize(col_shape); + col.mutable_data(dev_ctx.GetPlace()); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + DDim in_matrix_shape = + slice_ddim(transformed_input.dims(), 1, transformed_input.dims().size()); + + DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + + DDim output_matrix_shape = { + transformed_output.dims()[1], + transformed_output.numel() / + (transformed_output.dims()[0] * transformed_output.dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(transformed_input.dims()[1]) / groups; + int out_step = static_cast(transformed_output.dims()[1]) / groups; + + paddle::operators::math::Vol2ColFunctor vol2col; + paddle::operators::math:: + Im2ColFunctor + im2col; + + auto blas = phi::funcs::GetBlas(dev_ctx); + for (int i = 0; i < batch_size; i++) { + DenseTensor in_batch = + transformed_input.Slice(i, i + 1).Resize(in_matrix_shape); + DenseTensor out_batch = + transformed_output.Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + DenseTensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + im2col(dev_ctx, + in_slice, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], paddings[1], paddings[3]}, + &col); + + } else if (data_dim == 3U) { + vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); + } + + // gemm + DenseTensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + DenseTensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + blas.MatMul( + filter_slice, false, col_matrix, false, T(1.0), &out_slice, T(0.0)); + } + } + if (channel_last) { + TransToChannelLast(dev_ctx, &transformed_output, output); + } +} + +} // namespace phi diff --git a/paddle/phi/ops/compat/conv2d_sig.cc b/paddle/phi/ops/compat/conv2d_sig.cc new file mode 100644 index 00000000000..a755fdb19ec --- /dev/null +++ b/paddle/phi/ops/compat/conv2d_sig.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
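Throughout the kernels above, the !is_expand path lets `col` alias the input slice instead of materializing a buffer, because a 1x1 filter with unit stride and dilation and zero padding makes im2col the identity. A hypothetical helper capturing that condition, as an illustration of the idea rather than the actual IsExpand implementation:

#include <vector>

// Returns true when an explicit im2col/vol2col buffer is actually needed.
bool NeedsIm2Col(const std::vector<int>& filter_spatial_dims,
                 const std::vector<int>& strides,
                 const std::vector<int>& paddings,
                 const std::vector<int>& dilations) {
  for (int k : filter_spatial_dims) if (k != 1) return true;
  for (int s : strides)             if (s != 1) return true;
  for (int p : paddings)            if (p != 0) return true;
  for (int d : dilations)           if (d != 1) return true;
  return false;  // identity mapping: col can share data with the input slice
}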
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature Conv2dOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("conv2d", + {"Input", "Filter"}, + {"strides", + "paddings", + "padding_algorithm", + "groups", + "dilations", + "data_format", + "use_addto", + "workspace_size_MB", + "exhaustive_search"}, + {"Output"}); +} + +KernelSignature Conv2dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("conv2d_grad", + {GradVarName("Output"), "Input", "Filter"}, + {"strides", + "paddings", + "padding_algorithm", + "groups", + "dilations", + "data_format", + "use_addto", + "workspace_size_MB", + "exhaustive_search"}, + {GradVarName("Input"), GradVarName("Filter")}); +} + +KernelSignature Conv2dDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("conv2d_grad_grad", + {"DDInput", "DDFilter", "DOutput", "Input", "Filter"}, + {"strides", + "paddings", + "padding_algorithm", + "groups", + "dilations", + "data_format", + "use_addto", + "workspace_size_MB", + "exhaustive_search"}, + {"DDOutput", "DInput", "DFilter"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(conv2d, phi::Conv2dOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(conv2d_grad, phi::Conv2dGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(conv2d_grad_grad, + phi::Conv2dDoubleGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/conv3d_sig.cc b/paddle/phi/ops/compat/conv3d_sig.cc new file mode 100644 index 00000000000..a036afac82a --- /dev/null +++ b/paddle/phi/ops/compat/conv3d_sig.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
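The attribute names listed in the conv2d mappings above line up positionally with the parameters of the phi conv kernels earlier in this patch; the types below are reconstructed for illustration only:

//   "strides"            ->  const std::vector<int>& strides
//   "paddings"           ->  const std::vector<int>& paddings
//   "padding_algorithm"  ->  const std::string& padding_algorithm
//   "groups"             ->  int groups
//   "dilations"          ->  const std::vector<int>& dilations
//   "data_format"        ->  const std::string& data_format
//   "use_addto"          ->  bool use_addto
//   "workspace_size_MB"  ->  int workspace_size_MB
//   "exhaustive_search"  ->  bool exhaustive_search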
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature Conv3dOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("conv3d", + {"Input", "Filter"}, + {"strides", + "paddings", + "padding_algorithm", + "groups", + "dilations", + "data_format", + "use_addto", + "workspace_size_MB", + "exhaustive_search"}, + {"Output"}); +} + +KernelSignature Conv3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("conv2d_grad", + {GradVarName("Output"), "Input", "Filter"}, + {"strides", + "paddings", + "padding_algorithm", + "groups", + "dilations", + "data_format", + "use_addto", + "workspace_size_MB", + "exhaustive_search"}, + {GradVarName("Input"), GradVarName("Filter")}); +} + +KernelSignature Conv3dDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("conv3d_grad_grad", + {"DDInput", "DDFilter", "DOutput", "Input", "Filter"}, + {"strides", + "paddings", + "padding_algorithm", + "groups", + "dilations", + "data_format", + "use_addto", + "workspace_size_MB", + "exhaustive_search"}, + {"DDOutput", "DInput", "DFilter"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(conv3d, phi::Conv3dOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(conv3d_grad, phi::Conv3dGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(conv3d_grad_grad, + phi::Conv3dDoubleGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/depthwise_conv2d_sig.cc b/paddle/phi/ops/compat/depthwise_conv2d_sig.cc new file mode 100644 index 00000000000..e2b6801f73b --- /dev/null +++ b/paddle/phi/ops/compat/depthwise_conv2d_sig.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
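The grad mappings above build their input and output names with GradVarName, which conceptually appends the framework's gradient suffix to the forward variable name. A tiny illustrative stand-in (the real suffix constant lives in the framework; "@GRAD" is assumed here):

#include <string>

// Hypothetical stand-in for GradVarName, assuming the usual "@GRAD" suffix.
std::string GradName(const std::string& var) { return var + "@GRAD"; }
// e.g. GradName("Output") -> "Output@GRAD", GradName("Input") -> "Input@GRAD"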
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature DepthwiseConv2dOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("depthwise_conv2d", + {"Input", "Filter"}, + {"strides", + "paddings", + "padding_algorithm", + "groups", + "dilations", + "data_format", + "use_addto", + "workspace_size_MB", + "exhaustive_search", + "fuse_relu_before_depthwise_conv"}, + {"Output"}); +} + +KernelSignature DepthwiseConv2dGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("depthwise_conv2d_grad", + {GradVarName("Output"), "Input", "Filter"}, + {"strides", + "paddings", + "padding_algorithm", + "groups", + "dilations", + "data_format", + "use_addto", + "workspace_size_MB", + "exhaustive_search", + "fuse_relu_before_depthwise_conv"}, + {GradVarName("Input"), GradVarName("Filter")}); +} + +KernelSignature DepthwiseConv2dDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("depthwise_conv2d_grad_grad", + {"DDInput", "DDFilter", "DOutput", "Input", "Filter"}, + {"strides", + "paddings", + "padding_algorithm", + "groups", + "dilations", + "data_format", + "use_addto", + "workspace_size_MB", + "exhaustive_search", + "fuse_relu_before_depthwise_conv"}, + {"DDOutput", "DInput", "DFilter"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d, + phi::DepthwiseConv2dOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d_grad, + phi::DepthwiseConv2dGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d_grad_grad, + phi::DepthwiseConv2dDoubleGradOpArgumentMapping); diff --git a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py index dc460cb16f6..ca77177125f 100644 --- a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py @@ -230,4 +230,5 @@ def load_tests(loader, standard_tests, pattern): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_layer.py b/python/paddle/fluid/tests/unittests/test_conv2d_layer.py index f933d5bf7a4..892fa649a6c 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_layer.py @@ -18,6 +18,7 @@ import paddle.fluid.dygraph as dg import paddle.nn.functional as F import paddle.fluid.initializer as I import unittest +import paddle def _reverse_repeat_list(t, n): @@ -284,4 +285,5 @@ def load_tests(loader, standard_tests, pattern): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index 826f886dab1..6a9f7a47f66 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -604,7 +604,7 @@ class TestWithInput1x1Filter1x1(TestConv2DOp): self.groups = 3 -#----------------Conv2DCUDNN---------------- +# #----------------Conv2DCUDNN---------------- create_test_cudnn_class(TestConv2DOp) create_test_cudnn_class(TestWithPad) diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py index 5f23d04dde5..8cf779ccfdd 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py @@ -20,6 +20,7 @@ import numpy as np import 
paddle.fluid.core as core from op_test import OpTest import paddle.fluid as fluid +import paddle def conv3d_forward_naive(input, @@ -1001,4 +1002,5 @@ class TestConv3DAPI_Error(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py b/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py index 81c6aa1fd17..784d89b93f9 100644 --- a/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py @@ -16,6 +16,7 @@ from __future__ import print_function import unittest import numpy as np +import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers @@ -44,7 +45,6 @@ class TestConvDoubleGradCheck(unittest.TestCase): def test_grad(self): places = [fluid.CPUPlace()] - places = [] if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) @@ -120,7 +120,8 @@ class TestConv3DDoubleGradCheck(unittest.TestCase): [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps) def test_grad(self): - places = [fluid.CPUPlace()] + #places = [fluid.CPUPlace()] + places = [] if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for p in places: @@ -503,4 +504,5 @@ class TestDepthWiseConvDoubleGradCheck(unittest.TestCase): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv2d.py b/python/paddle/fluid/tests/unittests/test_functional_conv2d.py index cec48724da2..8e0a744ecdb 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv2d.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv2d.py @@ -534,4 +534,5 @@ class TestFunctionalConv2DErrorCase13(TestFunctionalConv2DErrorCase12): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv3d.py b/python/paddle/fluid/tests/unittests/test_functional_conv3d.py index 8ccaf30cbdb..6c208160658 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv3d.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv3d.py @@ -509,4 +509,5 @@ class TestFunctionalConv3DErrorCase12(TestFunctionalConv3DErrorCase11): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py index 9b739ebdfb2..d391b04aa47 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py @@ -117,4 +117,5 @@ class TestMNIST(TestParallelExecutorBase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py index 33f304ef33d..0a08aa4ba12 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py @@ -617,4 +617,5 @@ class TestStarGANWithGradientPenalty(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() -- GitLab From c47ae621c31aa94001c4d1d8e55ca4230aa4a25f Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Fri, 4 Mar 2022 10:11:36 +0800 
Subject: [PATCH 035/261] add eager test in rnn and fc; test=develop (#40149) --- .../tests/unittests/test_imperative_deepcf.py | 34 +++++++++++++++++++ .../test_imperative_recurrent_usage.py | 24 +++++++++++++ 2 files changed, 58 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py index 04a0e5e4cd1..3e222e3c658 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py @@ -24,6 +24,7 @@ import paddle.fluid.core as core from test_imperative_base import new_program_scope from paddle.fluid.dygraph.base import to_variable from paddle.fluid.dygraph import Linear +from paddle.fluid.framework import _test_eager_guard # Can use Amusic dataset as the DeepCF describes. DATA_PATH = os.environ.get('DATA_PATH', '') @@ -294,9 +295,42 @@ class TestDygraphDeepCF(unittest.TestCase): sys.stderr.write('dynamic loss: %s %s\n' % (slice, dy_loss2)) + with fluid.dygraph.guard(): + with _test_eager_guard(): + paddle.seed(seed) + paddle.framework.random._manual_program_seed(seed) + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + deepcf = DeepCF(num_users, num_items, matrix) + adam = fluid.optimizer.AdamOptimizer( + 0.01, parameter_list=deepcf.parameters()) + + for e in range(NUM_EPOCHES): + sys.stderr.write('epoch %d\n' % e) + for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE): + if slice + BATCH_SIZE >= users_np.shape[0]: + break + prediction = deepcf( + to_variable(users_np[slice:slice + BATCH_SIZE]), + to_variable(items_np[slice:slice + BATCH_SIZE])) + loss = fluid.layers.reduce_sum( + fluid.layers.log_loss(prediction, + to_variable( + labels_np[slice:slice + + BATCH_SIZE]))) + loss.backward() + adam.minimize(loss) + deepcf.clear_gradients() + eager_loss = loss.numpy() + sys.stderr.write('eager loss: %s %s\n' % + (slice, eager_loss)) + self.assertEqual(static_loss, dy_loss) self.assertEqual(static_loss, dy_loss2) + self.assertEqual(static_loss, eager_loss) if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_recurrent_usage.py b/python/paddle/fluid/tests/unittests/test_imperative_recurrent_usage.py index d0b3adc4909..f12ca0a93ff 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_recurrent_usage.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_recurrent_usage.py @@ -16,9 +16,11 @@ from __future__ import print_function import unittest import paddle.fluid as fluid +import paddle import paddle.fluid.core as core from paddle.fluid.dygraph.nn import Embedding import paddle.fluid.framework as framework +from paddle.fluid.framework import _test_eager_guard from paddle.fluid.optimizer import SGDOptimizer from paddle.fluid.dygraph.base import to_variable from test_imperative_base import new_program_scope @@ -60,6 +62,25 @@ class TestRecurrentFeed(unittest.TestCase): original_in1.stop_gradient = True rt.clear_gradients() + with fluid.dygraph.guard(): + with _test_eager_guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + original_in1 = to_variable(original_np1) + original_in2 = to_variable(original_np2) + original_in1.stop_gradient = False + original_in2.stop_gradient = False + rt = RecurrentTest("RecurrentTest") + + for i in range(3): + sum_out, out = rt(original_in1, original_in2) + original_in1 = out + 
eager_sum_out_value = sum_out.numpy() + sum_out.backward() + eager_dyout = out.gradient() + original_in1.stop_gradient = True + rt.clear_gradients() + with new_program_scope(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed @@ -88,8 +109,11 @@ class TestRecurrentFeed(unittest.TestCase): original_np1 = static_out_value self.assertTrue(np.array_equal(static_sum_out, sum_out_value)) + self.assertTrue(np.array_equal(static_sum_out, eager_sum_out_value)) self.assertTrue(np.array_equal(static_dout, dyout)) + self.assertTrue(np.array_equal(static_dout, eager_dyout)) if __name__ == '__main__': + paddle.enable_static() unittest.main() -- GitLab From 73a4fe6cbe5222c42a3d750441e12f1316a3a95d Mon Sep 17 00:00:00 2001 From: Jiabin Yang <360788950@qq.com> Date: Fri, 4 Mar 2022 10:11:53 +0800 Subject: [PATCH 036/261] extend test_imperative_qat_user_defined test time (#40114) --- python/paddle/fluid/contrib/slim/tests/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index 807f7c15196..49ae8f5fd56 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -355,6 +355,8 @@ set_tests_properties(test_imperative_qat_channelwise PROPERTIES TIMEOUT 200) set_tests_properties(test_user_defined_quantization PROPERTIES TIMEOUT 200) set_tests_properties(test_imperative_qat PROPERTIES TIMEOUT 200) set_tests_properties(test_imperative_out_scale PROPERTIES TIMEOUT 200) +set_tests_properties(test_imperative_qat_user_defined PROPERTIES TIMEOUT 200) + if(LINUX AND WITH_MKLDNN) set_tests_properties(test_quant2_int8_mobilenetv1_mkldnn PROPERTIES TIMEOUT 120) set_tests_properties(convert_model2dot_ernie PROPERTIES TIMEOUT 120) -- GitLab From d2a911b46be80a01ef685f0bbc2bdffb683316b1 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Fri, 4 Mar 2022 10:13:08 +0800 Subject: [PATCH 037/261] [Yaml]Support parsing fwd & bwd returns with name (#40107) --- .../final_state_generator/eager_gen.py | 41 +++++++------------ 1 file changed, 14 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 65dbb0368c6..4945a6fb654 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -208,39 +208,26 @@ def ParseYamlArgs(string): def ParseYamlReturns(string): - # Example: Tensor, Tensor - - # list = [ ["", ret_type, orig_position], ...] - returns_list = [] - - returns = [x.strip() for x in string.strip().split(",")] - for i in range(len(returns)): - ret_type = returns[i] - - assert ret_type in yaml_types_mapping.keys() - ret_type = yaml_types_mapping[ret_type] - - returns_list.append(["", ret_type, i]) - - return returns_list - - -def ParseYamlReturnsWithName(string): - # Example: Tensor(out), Tensor(out1) + # Example0: Tensor(out), Tensor(out1) + # Example1: Tensor, Tensor + # Example2: Tensor[](out), Tensor # list = [ [ret_name, ret_type, orig_position], ...] 
returns_list = [] returns = [x.strip() for x in string.strip().split(",")] - atype = r'(.*?)' - aname = r'(.*?)' - pattern = f'{atype}\({aname}\)' for i in range(len(returns)): ret = returns[i] - m = re.search(pattern, ret) - ret_type = m.group(1) - ret_name = m.group(2) + + ret_name = "" + if "(" in ret and ")" in ret: + # Remove trailing ')' + ret = ret[:-1] + ret_type = ret.split("(")[0].strip() + ret_name = ret.split("(")[1].strip() + else: + ret_type = ret.strip() assert ret_type in yaml_types_mapping.keys() ret_type = yaml_types_mapping[ret_type] @@ -266,7 +253,7 @@ def ParseYamlForwardFromBackward(string): function_returns = m.group(3) forward_inputs_list, forward_attrs_list = ParseYamlArgs(function_args) - forward_returns_list = ParseYamlReturnsWithName(function_returns) + forward_returns_list = ParseYamlReturns(function_returns) return forward_inputs_list, forward_attrs_list, forward_returns_list @@ -296,7 +283,7 @@ def ParseYamlBackward(args_str, returns_str): args_str = re.search(args_pattern, args_str).group(1) inputs_list, attrs_list = ParseYamlArgs(args_str) - returns_list = ParseYamlReturnsWithName(returns_str) + returns_list = ParseYamlReturns(returns_str) return inputs_list, attrs_list, returns_list -- GitLab From 50d5bf7959e660fff3d49d70fa73e1f3b132c0c2 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 4 Mar 2022 10:35:30 +0800 Subject: [PATCH 038/261] [Phi] Change input vec tensor to pointer type (#40078) * change input vec tensor to pointer * update input between * fix format error * resolve conflict * resolve conflict --- paddle/infrt/host_context/value.h | 2 +- paddle/phi/api/lib/api_gen_utils.cc | 6 +-- paddle/phi/api/lib/api_gen_utils.h | 2 +- paddle/phi/core/kernel_context.h | 9 ++--- paddle/phi/core/kernel_registry.h | 4 +- paddle/phi/core/kernel_utils.h | 40 +++++++++---------- .../kernels/broadcast_tensors_grad_kernel.h | 2 +- paddle/phi/kernels/broadcast_tensors_kernel.h | 2 +- paddle/phi/kernels/concat_kernel.h | 8 ++-- .../cpu/broadcast_tensors_grad_kernel.cc | 4 +- paddle/phi/kernels/cpu/concat_kernel.cc | 31 +++++++------- .../gpu/broadcast_tensors_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/concat_kernel.cu | 30 +++++++------- .../impl/broadcast_tensors_kernel_impl.h | 10 ++--- paddle/phi/tests/core/test_custom_kernel.cc | 2 +- .../phi/tests/kernels/test_concat_dev_api.cc | 2 +- python/paddle/utils/code_gen/api_base.py | 28 +++++++++++-- 17 files changed, 103 insertions(+), 83 deletions(-) diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h index 7e7d77d3af7..0ae482349cd 100644 --- a/paddle/infrt/host_context/value.h +++ b/paddle/infrt/host_context/value.h @@ -70,7 +70,7 @@ using ValueVariantType = backends::CpuPhiAllocator, backends::CpuPhiContext, ::phi::CPUContext, - std::vector, + std::vector, paddle::experimental::ScalarBase, paddle::experimental::ScalarArrayBase, std::vector, diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc index f04e74b45fc..e1ebe8c6465 100644 --- a/paddle/phi/api/lib/api_gen_utils.cc +++ b/paddle/phi/api/lib/api_gen_utils.cc @@ -71,11 +71,11 @@ paddle::optional MakeMetaTensor( } std::vector MakeMetaTensor( - const std::vector& tensors) { + const std::vector& tensors) { std::vector meta_tensors; meta_tensors.reserve(tensors.size()); - for (const auto& t : tensors) { - meta_tensors.emplace_back(t); + for (const auto* t : tensors) { + meta_tensors.emplace_back(*t); } return meta_tensors; } diff --git a/paddle/phi/api/lib/api_gen_utils.h 
b/paddle/phi/api/lib/api_gen_utils.h index 109c6e7ab71..01625f651c3 100644 --- a/paddle/phi/api/lib/api_gen_utils.h +++ b/paddle/phi/api/lib/api_gen_utils.h @@ -51,7 +51,7 @@ paddle::optional MakeMetaTensor( const paddle::optional& tensor); std::vector MakeMetaTensor( - const std::vector& tensors); + const std::vector& tensors); phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor); diff --git a/paddle/phi/core/kernel_context.h b/paddle/phi/core/kernel_context.h index 57e2db60c24..213ac47d30b 100644 --- a/paddle/phi/core/kernel_context.h +++ b/paddle/phi/core/kernel_context.h @@ -82,12 +82,11 @@ class KernelContext { } template - std::vector MoveInputsBetween(size_t start, size_t end) { - std::vector v; + std::vector InputsBetween(size_t start, size_t end) { + std::vector v; for (size_t i = start; i < end; ++i) { - auto t = static_cast(inputs_.at(i)); - v.emplace_back(*t); - inputs_[i] = nullptr; + auto* t = static_cast(inputs_.at(i)); + v.emplace_back(t); } return v; } diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index 2b04d173af0..35e170a3fce 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -87,8 +87,8 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); - } else if (arg_type == - std::type_index(typeid(const std::vector&))) { + } else if (arg_type == std::type_index(typeid( + const std::vector&))) { args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index b582375155a..f7fa27b0744 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -102,26 +102,26 @@ namespace phi { } \ } -#define PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(tensor_type) \ - template \ - struct KernelCallHelper&, Tail...> { \ - template \ - static void Compute(KernelContext* ctx, PreviousArgs&... pargs) { \ - static_assert(attr_idx == 0, \ - "Kernel's Input should appear before Attributes."); \ - static_assert(out_idx == 0, \ - "Kernel's Input should appear before Outputs."); \ - const std::pair range = ctx->InputRangeAt(in_idx); \ - std::vector arg = std::move( \ - ctx->MoveInputsBetween(range.first, range.second)); \ - KernelCallHelper:: \ - template Compute( \ - ctx, pargs..., arg); \ - } \ +#define PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(tensor_type) \ + template \ + struct KernelCallHelper&, Tail...> { \ + template \ + static void Compute(KernelContext* ctx, PreviousArgs&... 
pargs) { \ + static_assert(attr_idx == 0, \ + "Kernel's Input should appear before Attributes."); \ + static_assert(out_idx == 0, \ + "Kernel's Input should appear before Outputs."); \ + const std::pair range = ctx->InputRangeAt(in_idx); \ + std::vector arg = std::move( \ + ctx->InputsBetween(range.first, range.second)); \ + KernelCallHelper:: \ + template Compute( \ + ctx, pargs..., arg); \ + } \ } #define PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(attr_type) \ diff --git a/paddle/phi/kernels/broadcast_tensors_grad_kernel.h b/paddle/phi/kernels/broadcast_tensors_grad_kernel.h index 5ec2e35cc9b..5d24f6684a4 100644 --- a/paddle/phi/kernels/broadcast_tensors_grad_kernel.h +++ b/paddle/phi/kernels/broadcast_tensors_grad_kernel.h @@ -21,7 +21,7 @@ namespace phi { template void BroadcastTensorsGradKernel(const Context& ctx, - const std::vector& dout, + const std::vector& dout, std::vector dx); } // namespace phi diff --git a/paddle/phi/kernels/broadcast_tensors_kernel.h b/paddle/phi/kernels/broadcast_tensors_kernel.h index fb2a6f1136c..22b5201b690 100644 --- a/paddle/phi/kernels/broadcast_tensors_kernel.h +++ b/paddle/phi/kernels/broadcast_tensors_kernel.h @@ -21,7 +21,7 @@ namespace phi { template void BroadcastTensorsKernel(const Context& ctx, - const std::vector& x, + const std::vector& x, std::vector out); } // namespace phi diff --git a/paddle/phi/kernels/concat_kernel.h b/paddle/phi/kernels/concat_kernel.h index f1366788146..ed969e963ec 100644 --- a/paddle/phi/kernels/concat_kernel.h +++ b/paddle/phi/kernels/concat_kernel.h @@ -22,19 +22,19 @@ namespace phi { template void ConcatKernel(const Context& dev_ctx, - const std::vector& x, + const std::vector& x, const Scalar& axis, DenseTensor* out); template DenseTensor Concat(const Context& dev_ctx, - const std::vector& x, + const std::vector& x, const Scalar& axis) { std::vector meta_x; meta_x.reserve(x.size()); std::vector meta_x_ptr; - for (const auto& t : x) { - meta_x.emplace_back(t); + for (const auto* t : x) { + meta_x.emplace_back(*t); meta_x_ptr.push_back(&meta_x.back()); } diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc index 7a97f8c2189..0869cd62024 100644 --- a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc @@ -59,7 +59,7 @@ namespace phi { template void BroadcastTensorsGradKernel(const Context& ctx, - const std::vector& dout, + const std::vector& dout, std::vector dx) { // Find reduce dimensions const auto& in_tensors = dout; @@ -85,7 +85,7 @@ void BroadcastTensorsGradKernel(const Context& ctx, // For each In-Out tensor pair, // Prepare and apply broadcast dims array for (size_t i = 0; i < num_ins; i++) { - const auto* input_tensor = &in_tensors[i]; + const auto* input_tensor = in_tensors[i]; auto* output_tensor = out_tensors[i]; const auto& input_dims = input_tensor->dims(); diff --git a/paddle/phi/kernels/cpu/concat_kernel.cc b/paddle/phi/kernels/cpu/concat_kernel.cc index 5c4202837c4..6be825d4ef1 100644 --- a/paddle/phi/kernels/cpu/concat_kernel.cc +++ b/paddle/phi/kernels/cpu/concat_kernel.cc @@ -29,17 +29,17 @@ namespace phi { template void ConcatKernel(const Context& dev_ctx, - const std::vector& x, + const std::vector& x, const Scalar& axis_scalar, DenseTensor* out) { int64_t axis = axis_scalar.to(); - axis = phi::funcs::ComputeAxis(axis, x[0].dims().size()); + axis = phi::funcs::ComputeAxis(axis, x[0]->dims().size()); std::vector x_dims; x_dims.reserve(x.size()); for 
(size_t i = 0; i < x.size(); ++i) { - x_dims.push_back(x[i].dims()); + x_dims.push_back(x[i]->dims()); } phi::DDim out_dims = phi::funcs::ComputeAndCheckShape(true, x_dims, axis); @@ -47,13 +47,13 @@ void ConcatKernel(const Context& dev_ctx, out->mutable_data(dev_ctx.GetPlace()); // If axis is 0, the lod of the output is not the same as inputs. - if (axis == 0 && x[0].lod().size() > 0) { - size_t lod_size_0 = x[0].lod().size(); + if (axis == 0 && x[0]->lod().size() > 0) { + size_t lod_size_0 = x[0]->lod().size(); size_t lod_size = lod_size_0; for (size_t i = 1; i < x.size(); ++i) { - if (x[i].lod().size() > 0) { + if (x[i]->lod().size() > 0) { PADDLE_ENFORCE_EQ( - x[i].lod().size(), + x[i]->lod().size(), lod_size_0, phi::errors::Unimplemented( "The lod level of all input LoDTensors should be same. " @@ -61,7 +61,7 @@ void ConcatKernel(const Context& dev_ctx, "it is not supported currently. The lod level of %dth input " "is %d and first input is %d.", i, - x[i].lod().size(), + x[i]->lod().size(), lod_size_0)); } else { lod_size = 0; @@ -71,7 +71,7 @@ void ConcatKernel(const Context& dev_ctx, if (lod_size) { auto* out_lod = out->mutable_lod(); for (size_t i = 1; i < x.size(); ++i) { - auto in_lod = phi::ConvertToLengthBasedLoD(x[i].lod()); + auto in_lod = phi::ConvertToLengthBasedLoD(x[i]->lod()); phi::AppendLoD(out_lod, in_lod); } } @@ -80,28 +80,29 @@ void ConcatKernel(const Context& dev_ctx, // Sometimes direct copies will be faster, this maybe need deeply analysis. if (axis == 0 && x.size() < 10) { size_t output_offset = 0; - for (auto& in : x) { - if (in.numel() == 0UL) { + for (const auto* in : x) { + if (in->numel() == 0UL) { continue; } - auto in_stride = phi::stride_numel(in.dims()); + auto in_stride = phi::stride_numel(in->dims()); auto out_stride = phi::stride_numel(out->dims()); paddle::operators::StridedNumelCopyWithAxis( dev_ctx, axis, out->data() + output_offset, out_stride, - in.data(), + in->data(), in_stride, in_stride[axis]); output_offset += in_stride[axis]; } } else { + // TODO(chenweihang): concat functor support vector input std::vector inputs; inputs.reserve(x.size()); for (size_t j = 0; j < x.size(); ++j) { - if (x[j].numel() > 0) { - inputs.emplace_back(x[j]); + if (x[j]->numel() > 0) { + inputs.emplace_back(*x[j]); } else { continue; } diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu index 6fb24d72145..275b8411ccc 100644 --- a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu @@ -27,7 +27,7 @@ namespace phi { template void BroadcastTensorsGradKernel(const Context& ctx, - const std::vector& dout, + const std::vector& dout, std::vector dx) { // Find reduce dimensions const auto& in_tensors = dout; @@ -54,7 +54,7 @@ void BroadcastTensorsGradKernel(const Context& ctx, // For each In-Out tensor pair, // Prepare and apply broadcast dims array for (size_t i = 0; i < num_ins; i++) { - auto* input_tensor = &in_tensors[i]; + auto* input_tensor = in_tensors[i]; auto* output_tensor = out_tensors[i]; const DDim& input_dims = input_tensor->dims(); diff --git a/paddle/phi/kernels/gpu/concat_kernel.cu b/paddle/phi/kernels/gpu/concat_kernel.cu index 2b04b979c20..accb1cc3d77 100644 --- a/paddle/phi/kernels/gpu/concat_kernel.cu +++ b/paddle/phi/kernels/gpu/concat_kernel.cu @@ -29,16 +29,16 @@ namespace phi { template void ConcatKernel(const Context& dev_ctx, - const std::vector& x, + const std::vector& x, const Scalar& axis_scalar, 
DenseTensor* out) { int64_t axis = axis_scalar.to(); - axis = phi::funcs::ComputeAxis(axis, x[0].dims().size()); + axis = phi::funcs::ComputeAxis(axis, x[0]->dims().size()); std::vector x_dims; for (size_t i = 0; i < x.size(); ++i) { - x_dims.push_back(x[i].dims()); + x_dims.push_back(x[i]->dims()); } phi::DDim out_dims = phi::funcs::ComputeAndCheckShape(true, x_dims, axis); @@ -46,13 +46,13 @@ void ConcatKernel(const Context& dev_ctx, out->mutable_data(dev_ctx.GetPlace()); // If axis is 0, the lod of the output is not the same as inputs. - if (axis == 0 && x[0].lod().size() > 0) { - size_t lod_size_0 = x[0].lod().size(); + if (axis == 0 && x[0]->lod().size() > 0) { + size_t lod_size_0 = x[0]->lod().size(); size_t lod_size = lod_size_0; for (size_t i = 1; i < x.size(); ++i) { - if (x[i].lod().size() > 0) { + if (x[i]->lod().size() > 0) { PADDLE_ENFORCE_EQ( - x[i].lod().size(), + x[i]->lod().size(), lod_size_0, phi::errors::Unimplemented( "The lod level of all input LoDTensors should be same. " @@ -60,7 +60,7 @@ void ConcatKernel(const Context& dev_ctx, "it is not supported currently. The lod level of %dth input " "is %d and first input is %d.", i, - x[i].lod().size(), + x[i]->lod().size(), lod_size_0)); } else { lod_size = 0; @@ -70,7 +70,7 @@ void ConcatKernel(const Context& dev_ctx, if (lod_size) { auto* out_lod = out->mutable_lod(); for (size_t i = 1; i < x.size(); ++i) { - auto in_lod = phi::ConvertToLengthBasedLoD(x[i].lod()); + auto in_lod = phi::ConvertToLengthBasedLoD(x[i]->lod()); phi::AppendLoD(out_lod, in_lod); } } @@ -79,18 +79,18 @@ void ConcatKernel(const Context& dev_ctx, // Sometimes direct copies will be faster, this maybe need deeply analysis. if (axis == 0 && x.size() < 10) { size_t output_offset = 0; - for (auto& in : x) { - if (in.numel() == 0UL) { + for (auto* in : x) { + if (in->numel() == 0UL) { continue; } - auto in_stride = phi::stride_numel(in.dims()); + auto in_stride = phi::stride_numel(in->dims()); auto out_stride = phi::stride_numel(out->dims()); paddle::operators::StridedNumelCopyWithAxis( dev_ctx, axis, out->data() + output_offset, out_stride, - in.data(), + in->data(), in_stride, in_stride[axis]); output_offset += in_stride[axis]; @@ -98,8 +98,8 @@ void ConcatKernel(const Context& dev_ctx, } else { std::vector inputs; for (size_t j = 0; j < x.size(); ++j) { - if (x[j].numel() > 0) { - inputs.push_back(x[j]); + if (x[j]->numel() > 0) { + inputs.push_back(*x[j]); } else { continue; } diff --git a/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h b/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h index eb01b83377c..d7167704a48 100644 --- a/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h +++ b/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h @@ -23,10 +23,10 @@ #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/funcs/math_function.h" -#define SWITCH_OUT_RANK_CASE(n) \ - case n: { \ - ApplyBroadcast(ctx, &in_tensors[i], out_tensors[i]); \ - break; \ +#define SWITCH_OUT_RANK_CASE(n) \ + case n: { \ + ApplyBroadcast(ctx, in_tensors[i], out_tensors[i]); \ + break; \ } namespace phi { @@ -75,7 +75,7 @@ void ApplyBroadcast(const Context& ctx, template void BroadcastTensorsKernel(const Context& ctx, - const std::vector& x, + const std::vector& x, std::vector out) { const auto& in_tensors = x; auto out_tensors = out; diff --git a/paddle/phi/tests/core/test_custom_kernel.cc b/paddle/phi/tests/core/test_custom_kernel.cc index 69922c055cb..a4e89231e14 100644 --- a/paddle/phi/tests/core/test_custom_kernel.cc 
+++ b/paddle/phi/tests/core/test_custom_kernel.cc @@ -43,7 +43,7 @@ template void FakeDot(const Context& dev_ctx, const phi::DenseTensor& x, const phi::DenseTensor& y, - const std::vector& fake_input_vec, + const std::vector& fake_input_vec, bool fake_attr_bool, int fake_attr_int, float fake_attr_float, diff --git a/paddle/phi/tests/kernels/test_concat_dev_api.cc b/paddle/phi/tests/kernels/test_concat_dev_api.cc index 55dd6dce1aa..7f954085f60 100644 --- a/paddle/phi/tests/kernels/test_concat_dev_api.cc +++ b/paddle/phi/tests/kernels/test_concat_dev_api.cc @@ -53,7 +53,7 @@ TEST(DEV_API, concat) { } } - std::vector inputs = {dense_x, dense_y}; + std::vector inputs = {&dense_x, &dense_y}; // 2. test API phi::CPUContext dev_ctx; diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index 6c07cdec2ee..601248a4176 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -458,7 +458,7 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. elif self.inputs['input_info'][ param] == "const std::vector&": meta_tensor_code = meta_tensor_code + f""" -{code_indent} auto {param}_meta_vec = MakeMetaTensor(*{PREFIX_TENSOR_NAME}{param}); +{code_indent} auto {param}_meta_vec = MakeMetaTensor({PREFIX_TENSOR_NAME}{param}); {code_indent} std::vector {param}_metas({param}_meta_vec.size()); {code_indent} for (size_t i = 0; i < {param}_meta_vec.size(); ++i) {{ {code_indent} {param}_metas[i] = &{param}_meta_vec[i]; @@ -502,7 +502,7 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. input_trans_map = { 'const Tensor&': 'const phi::DenseTensor&', 'const std::vector&': - 'const std::vector&', + 'const std::vector&', 'const paddle::optional&': 'paddle::optional', 'const paddle::optional>&': @@ -539,9 +539,22 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. {code_indent} }}""" else: - input_tensor_code = input_tensor_code + f""" + if self.inputs['input_info'][input_name] == "const Tensor&": + input_tensor_code = input_tensor_code + f""" {code_indent} auto {PREFIX_TENSOR_NAME}{input_name} = PrepareData({input_name}, kernel.InputAt({i}), {trans_flag});""" + elif self.inputs['input_info'][ + input_name] == "const std::vector&": + input_tensor_code = input_tensor_code + f""" +{code_indent} auto {PREFIX_TENSOR_NAME}{input_name}_vec = PrepareData({input_name}, kernel.InputAt({i}), {trans_flag}); +{code_indent} std::vector {PREFIX_TENSOR_NAME}{input_name}({PREFIX_TENSOR_NAME}{input_name}_vec->size()); +{code_indent} for (size_t i = 0; i < {PREFIX_TENSOR_NAME}{input_name}.size(); ++i) {{ +{code_indent} {PREFIX_TENSOR_NAME}{input_name}[i] = &{PREFIX_TENSOR_NAME}{input_name}_vec->at(i); +{code_indent} }}""" + + else: + # do nothing + pass else: if input_name in self.optional_vars: input_tensor_code = input_tensor_code + f""" @@ -561,7 +574,14 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. 
if param in self.optional_vars: kernel_args = kernel_args + PREFIX_TENSOR_NAME + param + ", " else: - kernel_args = kernel_args + "*" + PREFIX_TENSOR_NAME + param + ", " + if self.inputs['input_info'][param] == "const Tensor&": + kernel_args = kernel_args + "*" + PREFIX_TENSOR_NAME + param + ", " + elif self.inputs['input_info'][ + input_name] == "const std::vector&": + kernel_args = kernel_args + PREFIX_TENSOR_NAME + param + ", " + else: + # do nothing + pass kernel_in_type = input_trans_map[input_infos[param]] kernel_args_type_list.append(kernel_in_type) elif param in attr_names: -- GitLab From caa61990dd3c954c591aa24f7b5791c7fb8af545 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Fri, 4 Mar 2022 10:52:08 +0800 Subject: [PATCH 039/261] Fix develop whl package not found (#40016) --- paddle/scripts/paddle_build.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 41e5e0469dc..175b4be295e 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -330,7 +330,7 @@ function check_style() { # pre-commit use python3.8.0 OLD_PATH=$PATH - export PATH=export PATH=/usr/local/python3.8.0/bin:/usr/local/python3.8.0/include:/usr/local/bin:${PATH} + export PATH=/usr/local/python3.8.0/bin:/usr/local/python3.8.0/include:/usr/local/bin:${PATH} pre-commit install clang-format --version @@ -2754,17 +2754,20 @@ function build_pr_and_develop() { fi git fetch upstream develop + git checkout develop dev_commit=`git log -1|head -1|awk '{print $2}'` dev_url="https://xly-devops.bj.bcebos.com/PR/build_whl/0/${dev_commit}/paddlepaddle_gpu-0.0.0-cp37-cp37m-linux_x86_64.whl" url_return=`curl -s -m 5 -IL ${dev_url} |awk 'NR==1{print $2}'` if [ "$url_return" == '200' ];then - mkdir ${PADDLE_ROOT}/build/dev_whl && wget -P ${PADDLE_ROOT}/build/dev_whl ${dev_url} + mkdir ${PADDLE_ROOT}/build/dev_whl && wget -q -P ${PADDLE_ROOT}/build/dev_whl ${dev_url} + cp ${PADDLE_ROOT}/build/dev_whl/paddlepaddle_gpu-0.0.0-cp37-cp37m-linux_x86_64.whl ${PADDLE_ROOT}/build/python/dist else git checkout -b develop_base_pr upstream/$BRANCH cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} generate_api_spec "$1" "DEV" mkdir ${PADDLE_ROOT}/build/dev_whl && cp ${PADDLE_ROOT}/build/python/dist/*.whl ${PADDLE_ROOT}/build/dev_whl fi + } function build_develop() { -- GitLab From a6947991d82f3b79840bb39f22f4bd3c65d036e8 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Fri, 4 Mar 2022 10:58:05 +0800 Subject: [PATCH 040/261] Generate forward-only operators (#39962) * [Eager][Yaml]Supported Scalar and ScalarArray for AutoCodeGen * Generate forward-only operators * [Yaml]Support parsing fwd & bwd returns with name * Fixed issues * Fixed minor issues --- .../final_state_generator/eager_gen.py | 10 ++-- .../final_state_generator/python_c_gen.py | 57 ++++++++++++------- paddle/fluid/pybind/eager_utils.cc | 47 +++++++++++++-- paddle/fluid/pybind/eager_utils.h | 10 ++++ 4 files changed, 93 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 4945a6fb654..7de7747ebf0 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -26,14 +26,14 @@ core_ops_args_type_info = {} yaml_types_mapping = { 'int' : 'int', 'int32_t' : 'int32_t', 'int64_t' : 'int64_t', 
'size_t' : 'size_t', \ - 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ - 'Backend' : 'Backend', 'DataLayout' : 'DataLayout', 'DataType' : 'DataType', \ - 'int64_t[]' : 'std::vector', 'int[]' : 'std::vector', + 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ + 'Backend' : 'paddle::experimental::Backend', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \ + 'int64_t[]' : 'std::vector', 'int[]' : 'std::vector', 'Tensor' : 'Tensor', 'Tensor[]' : 'std::vector', 'Tensor[Tensor[]]' : 'std::vector>', - 'Scalar' : 'Scalar', - 'ScalarArray' : 'ScalarArray' + 'Scalar' : 'paddle::experimental::Scalar', + 'ScalarArray' : 'paddle::experimental::ScalarArray' } diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index 9c4e102ca45..d0506e45eb4 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -16,20 +16,26 @@ import os import argparse from eager_gen import yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap +skipped_fwd_api_names = set(["scale"]) + atype_to_parsing_function = { "bool": "CastPyArg2Boolean", "int": "CastPyArg2Int", "long": "CastPyArg2Long", + "int64_t": "CastPyArg2Long", "float": "CastPyArg2Float", "string": "CastPyArg2String", - "bool[]": "CastPyArg2Booleans", - "int[]": "CastPyArg2Ints", - "long[]": "CastPyArg2Longs", - "float[]": "CastPyArg2Floats", - "double[]": "CastPyArg2Float64s", - "string[]": "CastPyArg2Strings", - "Scalar": "CastPyArg2Scalar", - "ScalarArray": "CastPyArg2ScalarArray" + "std::vector": "CastPyArg2Booleans", + "std::vector": "CastPyArg2Ints", + "std::vector": "CastPyArg2Longs", + "std::vector": "CastPyArg2Longs", + "std::vector": "CastPyArg2Floats", + "std::vector": "CastPyArg2Float64s", + "std::vector": "CastPyArg2Strings", + "paddle::experimental::Scalar": "CastPyArg2Scalar", + "paddle::experimental::ScalarArray": "CastPyArg2ScalarArray", + "paddle::experimental::Backend": "CastPyArg2Backend", + "paddle::experimental::DataType": "CastPyArg2DataType", } @@ -43,15 +49,9 @@ def ParseArguments(): return args -def GetCxxType(atype): - if atype not in yaml_types_mapping.keys(): - assert False - - return yaml_types_mapping[atype] - - def FindParsingFunctionFromAttributeType(atype): if atype not in atype_to_parsing_function.keys(): + print(f"Unable to find {atype} in atype_to_parsing_function.") assert False return atype_to_parsing_function[atype] @@ -59,7 +59,7 @@ def FindParsingFunctionFromAttributeType(atype): def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map, forward_attrs_list, forward_outputs_position_map, - optional_inputs): + optional_inputs, is_forward_only): # forward_inputs_position_map = { "name" : [type, fwd_position] } # forward_outputs_position_map = { "name" : [type, fwd_position] } # forward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...] 
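For reference, a minimal sketch of the attribute-parsing code that GeneratePythonCFunction is expected to emit for a single attribute, assuming a hypothetical op named "concat" with a paddle::experimental::ScalarArray attribute "axis" at argument position 2 (the op name, attribute name, and position are illustrative, not taken from this patch):

    PyObject* axis_obj = PyTuple_GET_ITEM(args, 2);
    paddle::experimental::ScalarArray axis =
        CastPyArg2ScalarArray(axis_obj, "concat", 2);

Each attribute contributes one such PyTuple_GET_ITEM line plus one CastPyArg2* call selected by its C++ type through atype_to_parsing_function, and the parsed value is then placed by position into the dygraph function call argument list.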
@@ -86,11 +86,10 @@ def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map, # Get Attributes for name, atype, _, pos in forward_attrs_list: parsing_function = FindParsingFunctionFromAttributeType(atype) - cxx_type = GetCxxType(atype) key = f"{name}" parse_attributes_str += f" PyObject* {name}_obj = PyTuple_GET_ITEM(args, {pos});\n" - parse_attributes_str += f" {cxx_type} {name} = {parsing_function}({name}_obj, \"{fwd_api_name}\", {pos});\n" + parse_attributes_str += f" {atype} {name} = {parsing_function}({name}_obj, \"{fwd_api_name}\", {pos});\n" dygraph_function_call_list[pos] = f"{name}" dygraph_function_call_str = ",".join(dygraph_function_call_list) @@ -127,9 +126,14 @@ static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObj }} """ + if is_forward_only: + fwd_function_name = fwd_api_name + else: + fwd_function_name = GetForwardFunctionName(fwd_api_name) + python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( fwd_api_name, fwd_api_name, get_eager_tensor_str, parse_attributes_str, - GetForwardFunctionName(fwd_api_name), dygraph_function_call_str) + fwd_function_name, dygraph_function_call_str) python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void))eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n" @@ -213,6 +217,11 @@ def GeneratePythonCWrappers(python_c_function_str, python_c_function_reg_str): #pragma once #include "pybind11/detail/common.h" +#include "paddle/phi/api/all.h" +#include "paddle/phi/common/backend.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/common/scalar_array.h" #include "paddle/fluid/pybind/op_function_common.h" #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/pybind/exception.h" @@ -251,19 +260,23 @@ if __name__ == "__main__": python_c_function_list = [] python_c_function_reg_list = [] for fwd_api in fwd_api_list: + # We only generate Ops with grad + is_forward_only = False if 'backward' not in fwd_api.keys(): - continue + is_forward_only = True assert 'api' in fwd_api.keys() assert 'args' in fwd_api.keys() assert 'output' in fwd_api.keys() - assert 'backward' in fwd_api.keys() fwd_api_name = fwd_api['api'] fwd_args_str = fwd_api['args'] fwd_returns_str = fwd_api['output'] + if fwd_api_name in skipped_fwd_api_names: + continue + # Parse Dispensable Inputs optional_inputs = [] if 'optional' in fwd_api.keys(): @@ -285,7 +298,7 @@ if __name__ == "__main__": python_c_function_str, python_c_function_reg_str = GeneratePythonCFunction( fwd_api_name, forward_inputs_position_map, forward_attrs_list, - forward_outputs_position_map, optional_inputs) + forward_outputs_position_map, optional_inputs, is_forward_only) python_c_function_list.append(python_c_function_str) python_c_function_reg_list.append(python_c_function_reg_str) print("Generated Python-C Function: ", python_c_function_str) diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 7647930ef07..0cfb08345b6 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -757,7 +757,7 @@ paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj, if (obj == Py_None) { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument (position %d) must be " - "bool, but got %s", + "int, float, bool or Tensor, but got %s", op_type, arg_pos + 1, 
((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT } @@ -784,7 +784,7 @@ paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj, } else { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument (position %d) must be " - "bool, but got %s", + "int, float, bool or Tensor, but got %s", op_type, arg_pos + 1, ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT } @@ -801,7 +801,7 @@ paddle::experimental::ScalarArray CastPyArg2ScalarArray( if (obj == Py_None) { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument (position %d) must be " - "bool, but got %s", + "list or Tensor, but got %s", op_type, arg_pos + 1, ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT } @@ -821,7 +821,7 @@ paddle::experimental::ScalarArray CastPyArg2ScalarArray( } else { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument (position %d) must be " - "bool, but got %s", + "list or Tensor, but got %s", op_type, arg_pos + 1, ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT } @@ -830,5 +830,44 @@ paddle::experimental::ScalarArray CastPyArg2ScalarArray( return paddle::experimental::ScalarArray({1}); } +paddle::experimental::Backend CastPyArg2Backend(PyObject* obj, + const std::string& op_type, + ssize_t arg_pos) { + if (obj == Py_None) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "int or place, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } + + PyTypeObject* type = obj->ob_type; + auto type_name = std::string(type->tp_name); + if (type_name == "int") { + int value = CastPyArg2Int(obj, op_type, arg_pos); + return static_cast(value); + } else { + platform::Place place = CastPyArg2Place(obj, arg_pos); + return phi::TransToPhiBackend(place); + } + + return paddle::experimental::Backend::CPU; +} + +paddle::experimental::DataType CastPyArg2DataType(PyObject* obj, + const std::string& op_type, + ssize_t arg_pos) { + if (obj == Py_None) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "data_type, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } + + framework::proto::VarType::Type type = CastPyArg2ProtoType(obj, arg_pos); + return framework::TransToPhiDataType(type); +} + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 6e990691776..c5da1bb37af 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -11,6 +11,8 @@ limitations under the License. 
*/ #pragma once #include +#include "paddle/phi/common/backend.h" +#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/dense_tensor.h" @@ -100,6 +102,14 @@ paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj, paddle::experimental::ScalarArray CastPyArg2ScalarArray( PyObject* obj, const std::string& op_type, ssize_t arg_pos); +paddle::experimental::Backend CastPyArg2Backend(PyObject* obj, + const std::string& op_type, + ssize_t arg_pos); + +paddle::experimental::DataType CastPyArg2DataType(PyObject* obj, + const std::string& op_type, + ssize_t arg_pos); + paddle::optional GetOptionalTensorFromArgs( const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable = false); -- GitLab From 14e98a0fb0ff3aeb36b3061d55d70d4b71f95d79 Mon Sep 17 00:00:00 2001 From: chenjian Date: Fri, 4 Mar 2022 11:51:30 +0800 Subject: [PATCH 041/261] fix warning (#40133) --- paddle/fluid/platform/profiler/profiler.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc index 35dbc96874d..46cbb3358c6 100644 --- a/paddle/fluid/platform/profiler/profiler.cc +++ b/paddle/fluid/platform/profiler/profiler.cc @@ -95,7 +95,7 @@ std::unique_ptr Profiler::Stop() { collector.ThreadNames(); for (const auto& kv : thread_names) { extrainfo.AddExtraInfo(string_format(std::string("%llu"), kv.first), - kv.second); + std::string("%s"), kv.second.c_str()); } return std::unique_ptr( new platform::ProfilerResult(std::move(tree), extrainfo)); -- GitLab From abacc4cb1275abd5e942db3a849fcd0d83f9f9f8 Mon Sep 17 00:00:00 2001 From: xiongkun Date: Fri, 4 Mar 2022 11:52:22 +0800 Subject: [PATCH 042/261] transfer selu infershape (#40137) --- paddle/fluid/operators/selu_op.cc | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/selu_op.cc b/paddle/fluid/operators/selu_op.cc index 88ef1f3ea4a..0372a79b967 100644 --- a/paddle/fluid/operators/selu_op.cc +++ b/paddle/fluid/operators/selu_op.cc @@ -16,7 +16,10 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -28,10 +31,6 @@ class SeluOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - return UnaryOpUnchangedInferShape(ctx); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -121,7 +120,12 @@ class SeluGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(selu, SeluInferShapeFunctor, + PT_INFER_META(phi::UnchangedInferMeta)); + REGISTER_OPERATOR(selu, ops::SeluOp, ops::SeluOpMaker, ops::SeluOpInferVarType, ops::SeluGradMaker, - ops::SeluGradMaker); + ops::SeluGradMaker, + SeluInferShapeFunctor); + REGISTER_OPERATOR(selu_grad, ops::SeluGradOp); -- GitLab From 3ac9bc9521a7c0914bdaa1c8b27014153a001f03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Fri, 4 Mar 2022 12:33:10 +0800 Subject: [PATCH 043/261] [infrt] add ir for convert pd dilect to phi dialect. test=develop (#40104) --- paddle/infrt/dialect/infrt/infrt_ops.td | 7 ++ paddle/infrt/dialect/phi/CMakeLists.txt | 3 + paddle/infrt/dialect/phi/ir/infrt_phi_base.td | 1 + .../infrt/dialect/phi/pass/kernel_op_desc.cc | 63 ++++++++++- .../infrt/dialect/phi/pass/kernel_op_desc.h | 4 + .../infrt/dialect/phi/pass/phi_op_cvt_pass.cc | 100 ++++++++++++++++-- .../dialect/phi/pass/proto_arg_map_context.h | 2 +- paddle/infrt/dialect/phi/phi_exec.cc | 67 +++++++----- paddle/infrt/dialect/phi/phi_ir_exec.cc | 47 ++++++++ paddle/infrt/host_context/CMakeLists.txt | 3 +- paddle/infrt/pass/CMakeLists.txt | 1 - paddle/infrt/tests/CMakeLists.txt | 2 +- .../infrt/tests/dialect/pten/pten_pass.mlir | 2 +- paddle/infrt/tests/lit.cfg.py.in | 3 +- paddle/scripts/infrt_build.sh | 2 +- tools/infrt/fake_models/multi_fc.py | 1 - tools/infrt/generate_phi_kernel_dialect.py | 5 +- 17 files changed, 263 insertions(+), 50 deletions(-) create mode 100644 paddle/infrt/dialect/phi/phi_ir_exec.cc delete mode 100755 paddle/infrt/pass/CMakeLists.txt diff --git a/paddle/infrt/dialect/infrt/infrt_ops.td b/paddle/infrt/dialect/infrt/infrt_ops.td index 00f94805c7d..ecd7093e72b 100644 --- a/paddle/infrt/dialect/infrt/infrt_ops.td +++ b/paddle/infrt/dialect/infrt/infrt_ops.td @@ -17,3 +17,10 @@ def Infrt_KernelOp : Infrt_Op<"kernel", [NoSideEffect]> { OptionalAttr:$attrs); let results = (outs Variadic); } + +def Infrt_CvtTensorOp : Infrt_Op<"cvt_tensor", [NoSideEffect]> { + let summary = "convert tensor type op"; + let description = [{convert tensor type op!}]; + let arguments = (ins AnyType:$input); + let results = (outs AnyType:$output); +} diff --git a/paddle/infrt/dialect/phi/CMakeLists.txt b/paddle/infrt/dialect/phi/CMakeLists.txt index d477b6b9bdc..a2677a946cb 100644 --- a/paddle/infrt/dialect/phi/CMakeLists.txt +++ b/paddle/infrt/dialect/phi/CMakeLists.txt @@ -5,5 +5,8 @@ endif() add_subdirectory(ir) add_subdirectory(pass) +add_executable(phi-ir-exec phi_ir_exec.cc) +target_link_libraries(phi-ir-exec infrt) + add_executable(phi-exec phi_exec.cc) target_link_libraries(phi-exec infrt) diff --git 
a/paddle/infrt/dialect/phi/ir/infrt_phi_base.td b/paddle/infrt/dialect/phi/ir/infrt_phi_base.td index e9591e7f6d7..671646b9259 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_base.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_base.td @@ -3,6 +3,7 @@ include "mlir/IR/OpBase.td" include "paddle/infrt/dialect/infrt_base.td" +include "mlir/Interfaces/InferTypeOpInterface.td" def PHI_Dialect : Dialect { let name = "phi"; diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc index 6c0f6df8921..12a6cfcc3e4 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc @@ -16,8 +16,10 @@ #include #include "paddle/phi/core/kernel_factory.h" #include "paddle/phi/core/kernel_registry.h" -namespace infrt { +#include "paddle/phi/kernels/declarations.h" +namespace infrt { +namespace { phi::Backend cvtTarget2Phi(TargetType target) { switch (target) { case TargetType::CPU: @@ -124,19 +126,76 @@ Place cvtPlaceFromPhi(phi::TensorArgDef tensor_arg) { cvtLayoutFromPhi(tensor_arg.layout)); } +} // namespace + +std::string getPhiTargetPrefix(TargetType target) { + switch (target) { + case TargetType::CPU: + return "phi_cpu."; + case TargetType::GPU: + return "phi_gpu."; + default: + LOG(FATAL) << "UnSupported target type !"; + return std::string(); + } +} +std::string getPhiPrecisionSuffix(PrecisionType precision) { + switch (precision) { + case PrecisionType::FLOAT32: + return ".float32"; + case PrecisionType::FLOAT16: + return ".float16"; + case PrecisionType::FLOAT64: + return ".float64"; + case PrecisionType::UINT8: + return ".uint8"; + case PrecisionType::INT8: + return ".int8"; + case PrecisionType::INT16: + return ".int16"; + case PrecisionType::INT32: + return ".int32"; + case PrecisionType::INT64: + return ".int64"; + case PrecisionType::COMPLEX64: + return ".complex64"; + case PrecisionType::COMPLEX128: + return ".complex128"; + case PrecisionType::BOOL: + return ".bool"; + default: + LOG(FATAL) << "UnSupported precision type !"; + return std::string(); + } +} +std::string getPhiLayoutSuffix(LayoutType layout) { + switch (layout) { + case LayoutType::NCHW: + return ".nchw"; + case LayoutType::NHWC: + return ".nhwc"; + case LayoutType::ANY: + return ".any"; + default: + LOG(FATAL) << "UnSupported layout type !"; + return std::string(); + } +} + std::vector getCandidateKernels( std::string name, const std::vector& valid_palces) { std::vector candidate_kernels; PhiKernelDesc phi_kernel_desc; phi::KernelKeyMap kernel_key_map = phi::KernelFactory::Instance().SelectKernelMap(name); - for (const Place& place : valid_palces) { + for (Place place : valid_palces) { phi::KernelKey kernel_key = cvtPlace2Phi(place); if (kernel_key_map.find(kernel_key) == kernel_key_map.end()) { kernel_key = phi::KernelKey(kernel_key.backend(), phi::DataLayout::ALL_LAYOUT, kernel_key.dtype()); if (kernel_key_map.find(kernel_key) == kernel_key_map.end()) continue; + place.layout = LayoutType::ANY; } phi_kernel_desc.kernelType = place; phi_kernel_desc.inputsType.clear(); diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.h b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h index b74107f674e..34fd2f0f62d 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.h +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h @@ -26,6 +26,10 @@ struct PhiKernelDesc { Place kernelType; // kernel place }; +std::string getPhiTargetPrefix(TargetType target); +std::string getPhiPrecisionSuffix(PrecisionType precision); +std::string 
getPhiLayoutSuffix(LayoutType layout); + std::vector getCandidateKernels( std::string name, const std::vector& valid_palces); diff --git a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc index df3472aa01d..376ab31938a 100644 --- a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc +++ b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc @@ -18,11 +18,14 @@ #include #include #include +#include +#include #include #include #include #include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" #include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" #include "paddle/infrt/dialect/phi/pass/proto_arg_map_context.h" #include "paddle/phi/core/compat/op_utils.h" @@ -58,8 +61,8 @@ void phiOpCvtPass::convertStage() { continue; } - phi::KernelSignature kernel_sign = - phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_name)( + ::phi::KernelSignature kernel_sign = + ::phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_name)( ProtoArgumentMappingContext(op)); // resort input&output according to kernel_sign ::llvm::SmallVector inputs, ori_output; @@ -104,13 +107,92 @@ void phiOpCvtPass::diapatchStage() { infrt::KernelOp kernel_op = ::llvm::dyn_cast_or_null(&op); if (nullptr != kernel_op) worklist.push_back(kernel_op); } - // ToDo: implementation in the next PR - while (!worklist.empty()) { - // infrt::KernelOp kernel_op = worklist.back(); - worklist.pop_back(); - // std::string kernel_name = kernel_op.name().str(); - // std::vector candidates = - // getCandidateKernels(kernel_name, valid_places_); + + mlir::OpBuilder builder(&block, block.begin()); + std::map phi_context; + for (infrt::KernelOp kernel_op : worklist) { + std::string kernel_name = kernel_op.name().str(); + std::vector candidates = + getCandidateKernels(kernel_name, valid_places_); + if (candidates.empty()) { + LOG(FATAL) << "No candidate kernels for op:" << kernel_name; + continue; + } + builder.setInsertionPoint(kernel_op); + + // Todo: Implimentation the concrete pass pick strategy + const PhiKernelDesc &phi_kernel_desc = candidates.front(); + + kernel_name = getPhiTargetPrefix(phi_kernel_desc.kernelType.target) + + kernel_name + + getPhiLayoutSuffix(phi_kernel_desc.kernelType.layout) + + getPhiPrecisionSuffix(phi_kernel_desc.kernelType.precision); + + // mlir::OperationName operation_name = kernel_op.getOperation()->getName(); + + mlir::OperationName operation_name(kernel_name, kernel_op.getContext()); + mlir::OperationState operation_state(kernel_op.getLoc(), operation_name); + + if (phi_context.find(phi_kernel_desc.kernelType.target) == + phi_context.end()) { + switch (phi_kernel_desc.kernelType.target) { + case TargetType::CPU: { + auto alloctor_value = + builder + .create( + kernel_op.getLoc(), + phi::AllocatorType::get(kernel_op.getContext(), + TargetType::CPU)) + .output(); + auto context_value = + builder + .create( + kernel_op.getLoc(), + phi::ContextType::get(kernel_op.getContext(), + TargetType::CPU), + alloctor_value) + .output(); + phi_context[TargetType::CPU] = context_value; + } break; + case TargetType::GPU: + case TargetType::UNK: + default: + LOG(FATAL) << "Unsupported TargetType"; + break; + } + } + operation_state.addOperands( + phi_context.at(phi_kernel_desc.kernelType.target)); + for (size_t index = 0; index < phi_kernel_desc.inputsType.size(); ++index) { + mlir::Value input = kernel_op.getOperand(index); + auto cvt_tensor_type_op = builder.create( + kernel_op.getLoc(), + DenseTensorType::get(kernel_op.getContext(), 
+ phi_kernel_desc.inputsType[index].target, + phi_kernel_desc.inputsType[index].precision, + phi_kernel_desc.inputsType[index].layout), + input); + operation_state.addOperands(cvt_tensor_type_op.output()); + } + for (size_t index = 0; index < phi_kernel_desc.outputsType.size(); + ++index) { + operation_state.addTypes( + DenseTensorType::get(kernel_op.getContext(), + phi_kernel_desc.outputsType[index].target, + phi_kernel_desc.outputsType[index].precision, + phi_kernel_desc.outputsType[index].layout)); + } + operation_state.addAttributes(kernel_op.attrsAttr().getValue()); + mlir::Operation *phi_operation = builder.createOperation(operation_state); + for (size_t index = 0; index < phi_kernel_desc.outputsType.size(); + ++index) { + mlir::Value input = phi_operation->getResult(index); + auto cvt_tensor_type_op = builder.create( + kernel_op.getLoc(), kernel_op.getResultTypes()[index], input); + kernel_op.getResult(index).replaceAllUsesWith( + cvt_tensor_type_op.output()); + } + kernel_op.erase(); } } } // namespace infrt diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h index ca8a22a7e75..e4e9b5c3ff8 100644 --- a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h +++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/phi/core/compat/arg_map_context.h" namespace infrt { -class ProtoArgumentMappingContext : public phi::ArgumentMappingContext { +class ProtoArgumentMappingContext : public ::phi::ArgumentMappingContext { public: // only support op in pd dialect explicit ProtoArgumentMappingContext(mlir::Operation* op) diff --git a/paddle/infrt/dialect/phi/phi_exec.cc b/paddle/infrt/dialect/phi/phi_exec.cc index 4e99661a6a2..a2808a00cb6 100644 --- a/paddle/infrt/dialect/phi/phi_exec.cc +++ b/paddle/infrt/dialect/phi/phi_exec.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -11,37 +11,46 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
-#include -#include -#include -#include -#include "paddle/infrt/common/global.h" -#include "paddle/infrt/dialect/mlir_loader.h" -#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h" -int main(int argc, char** argv) { - static llvm::cl::opt input_file( - llvm::cl::Positional, - llvm::cl::desc("Specify input filename"), - llvm::cl::init("-")); - - llvm::cl::ParseCommandLineOptions(argc, argv); +#include "paddle/infrt/host_context/paddle_mlir.h" - mlir::MLIRContext* context = infrt::Global::getMLIRContext(); - auto module = infrt::dialect::LoadMlirFile(input_file.c_str(), context); +void print_usage() { + std::cout << "Error inputs format, two kinds of inputs are supported:\n"; + std::cout << " [1] ./paddle-mlir-convert $path_to_model_file " + "$path_to_params_file\n"; + std::cout << " [2] ./paddle-mlir-convert $path_to_model_dir(__model__ + " + "params)\n"; +} - module->dump(); - mlir::PassManager pm(context); +bool parse_inputs(int argc, + char** argv, + std::string* model_file_name, + std::string* params_file_name) { + switch (argc) { + case 1: { + print_usage(); + return false; + } + case 2: { + *model_file_name = std::string(argv[1]) + std::string("/__model__"); + *params_file_name = std::string(argv[1]) + std::string("/params"); + return true; + } + case 3: { + *model_file_name = argv[1]; + *params_file_name = argv[2]; + return true; + } + default: { return false; } + } +} - mlir::OpPassManager& phi_pass_manager = pm.nest(); - std::vector valid_places = {{infrt::TargetType::CPU, - infrt::PrecisionType::FLOAT32, - infrt::LayoutType::NCHW}}; - phi_pass_manager.addPass(std::make_unique(valid_places)); - if (mlir::failed(pm.run(*module))) { - std::cout << "\npass failed!\n" << std::endl; - return 4; +int main(int argc, char** argv) { + std::string model_file_name; + std::string params_file_name; + if (parse_inputs(argc, argv, &model_file_name, ¶ms_file_name)) { + MLIRModelGenImpl myGen; + auto module_ = myGen.ImportPaddleModel(model_file_name, params_file_name); + module_.dump(); } - module->dump(); - return 0; } diff --git a/paddle/infrt/dialect/phi/phi_ir_exec.cc b/paddle/infrt/dialect/phi/phi_ir_exec.cc new file mode 100644 index 00000000000..1df929895b1 --- /dev/null +++ b/paddle/infrt/dialect/phi/phi_ir_exec.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include +#include +#include +#include +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/mlir_loader.h" +#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h" + +int main(int argc, char** argv) { + static llvm::cl::opt input_file( + llvm::cl::Positional, + llvm::cl::desc("Specify input filename"), + llvm::cl::init("-")); + + llvm::cl::ParseCommandLineOptions(argc, argv); + + mlir::MLIRContext* context = infrt::Global::getMLIRContext(); + auto module = infrt::dialect::LoadMlirFile(input_file.c_str(), context); + context->loadAllAvailableDialects(); + module->dump(); + mlir::PassManager pm(context); + + mlir::OpPassManager& phi_pass_manager = pm.nest(); + std::vector valid_places = {{infrt::TargetType::CPU, + infrt::PrecisionType::FLOAT32, + infrt::LayoutType::NCHW}}; + phi_pass_manager.addPass(std::make_unique(valid_places)); + if (mlir::failed(pm.run(*module))) { + std::cout << "\npass failed!\n" << std::endl; + return 4; + } + module->dump(); + return 0; +} diff --git a/paddle/infrt/host_context/CMakeLists.txt b/paddle/infrt/host_context/CMakeLists.txt index 11304742ecd..14cbea70ca8 100644 --- a/paddle/infrt/host_context/CMakeLists.txt +++ b/paddle/infrt/host_context/CMakeLists.txt @@ -12,6 +12,7 @@ gather_srcs(infrt_src SRCS function.cc mlir_function_executable.cc mlir_program_executor.cc + paddle_mlir.cc ) cc_test_tiny(test_infrt_host_context_value SRCS value_test.cc DEPS infrt ${MLIR_IR_LIBS}) @@ -21,7 +22,7 @@ cc_test_tiny(test_infrt_op_executable SRCS op_executable_test.cc DEPS infrt ${ML cc_test_tiny(test_infrt_core_runtime SRCS core_runtime_test.cc DEPS infrt ${MLIR_IR_LIBS}) cc_test_tiny(test_infrt_mlir_to_runtime_translate SRCS mlir_to_runtime_translate_test.cc DEPS infrt ${MLIR_IR_LIBS}) -add_executable(paddle-mlir-convert paddle_mlir.cc paddle_mlir_converter.cc) +add_executable(paddle-mlir-convert paddle_mlir_converter.cc) target_link_libraries(paddle-mlir-convert infrt ${MLIR_IR_LIBS}) add_executable(infrtexec mlir_exec.cc) target_link_libraries(infrtexec infrt ${MLIR_IR_LIBS}) diff --git a/paddle/infrt/pass/CMakeLists.txt b/paddle/infrt/pass/CMakeLists.txt deleted file mode 100755 index 51fecdf9077..00000000000 --- a/paddle/infrt/pass/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_subdirectory(phi) diff --git a/paddle/infrt/tests/CMakeLists.txt b/paddle/infrt/tests/CMakeLists.txt index e5cc1ec1121..5ce6d867342 100644 --- a/paddle/infrt/tests/CMakeLists.txt +++ b/paddle/infrt/tests/CMakeLists.txt @@ -1,6 +1,6 @@ configure_file(lit.cfg.py.in "${CMAKE_SOURCE_DIR}/paddle/infrt/tests/lit.cfg.py") add_test(NAME test_infrt_by_lit COMMAND sh -c "lit -v ${CMAKE_SOURCE_DIR}/paddle/infrt/tests --filter-out \"disabled_*\"" - DEPENDS infrtopt infrtexec) + DEPENDS infrtopt infrtexec phi-ir-exec) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir) diff --git a/paddle/infrt/tests/dialect/pten/pten_pass.mlir b/paddle/infrt/tests/dialect/pten/pten_pass.mlir index 30ff2636ae5..61a66cb3d71 100644 --- a/paddle/infrt/tests/dialect/pten/pten_pass.mlir +++ b/paddle/infrt/tests/dialect/pten/pten_pass.mlir @@ -1,4 +1,4 @@ -// RUN: infrtopt %s | FileCheck %s +// RUN: phi-ir-exec %s // CHECK-LABEL: @ops func @ops() { %a = pd.feed() {name="input0"} : !infrt.lod_tensor diff --git a/paddle/infrt/tests/lit.cfg.py.in b/paddle/infrt/tests/lit.cfg.py.in index d47957dac92..fe35dc4b8b3 100644 --- a/paddle/infrt/tests/lit.cfg.py.in +++ b/paddle/infrt/tests/lit.cfg.py.in @@ -23,9 +23,10 @@ 
config.llvm_tools_dir = os.path.join(build_dir, "/third_party/install/llvm/lib") infrtopt_bin = os.path.join(build_dir, "paddle/infrt/dialect/") trtexec_bin = os.path.join(build_dir, "paddle/infrt/dialect/tensorrt/") infrtexec_bin = os.path.join(build_dir, "paddle/infrt/host_context/") +phi_ir_exec_bin = os.path.join(build_dir, "paddle/infrt/dialect/phi") llvm_bin = os.path.join(build_dir, "third_party/install/llvm/bin/") config.environment['PATH'] = os.path.pathsep.join( - (infrtopt_bin, infrtexec_bin, trtexec_bin, llvm_bin, config.environment['PATH'])) + (infrtopt_bin, infrtexec_bin, trtexec_bin, phi_ir_exec_bin, llvm_bin, config.environment['PATH'])) config.suffixes = ['.mlir'] diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh index 75b27e4165d..fb7be82d1c5 100755 --- a/paddle/scripts/infrt_build.sh +++ b/paddle/scripts/infrt_build.sh @@ -92,7 +92,7 @@ function infrt_gen_and_build() { exit 7; fi - make -j ${parallel_number} infrt infrtopt infrtexec test_infrt_exec trt-exec phi-exec infrt_lib_dist paddle-mlir-convert;build_error=$? + make -j ${parallel_number} infrt infrtopt infrtexec test_infrt_exec trt-exec phi-ir-exec phi-exec infrt_lib_dist paddle-mlir-convert;build_error=$? if [ "$build_error" != 0 ];then exit 7; fi diff --git a/tools/infrt/fake_models/multi_fc.py b/tools/infrt/fake_models/multi_fc.py index 03cf6828cc7..0d633cfc60a 100644 --- a/tools/infrt/fake_models/multi_fc.py +++ b/tools/infrt/fake_models/multi_fc.py @@ -19,7 +19,6 @@ import sys, os import numpy as np import paddle import paddle.fluid as fluid -from paddle.fluid.backward import append_backward size = 2 num_layers = 4 diff --git a/tools/infrt/generate_phi_kernel_dialect.py b/tools/infrt/generate_phi_kernel_dialect.py index 8efa03306fb..f3a78a8d4e8 100644 --- a/tools/infrt/generate_phi_kernel_dialect.py +++ b/tools/infrt/generate_phi_kernel_dialect.py @@ -16,7 +16,7 @@ import json import sys attr_type_converter = {"i": 'SI32Attr', "b": 'BoolAttr', "l": 'SI64Attr'} -supported_kernels = ['sign', 'dot', 'digamma', 'conj'] +supported_kernels = ['sign', 'dot', 'digamma', 'conj', 'abs', 'add_raw'] target_type_converter = {"CPU": "CPU", "GPU": "GPU"} layout_type_converter = { @@ -66,7 +66,8 @@ def generate_attrs_info(op_name, attrs_info): 'digamma': [], 'lerp': [], 'cast': ['out_dtype', 'in_dtype'], - 'abs': [] + 'abs': [], + 'add_raw': ['axis'], } attrs_args_ = "" if len(kernel_attrs_names[op_name]) == len(attrs_info): -- GitLab From e2e2d53142a71ec35d82eb7c7630543572bc531b Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Fri, 4 Mar 2022 12:38:58 +0800 Subject: [PATCH 044/261] [phi]move reduce gpu impl funcs into pten/kernels/funcs (#39990) * move reduce gpu impl funcs into pten/kernels/funcs * change reduce header name and namespace * fix spell word error * change mutable_data to dev_ctx.Alloc * modify place to devcontex * format code style * fix build error * fix build error * fix conflict --- .../fluid/operators/fused/attn_bias_add.cu.h | 4 +- .../reduce_ops/check_reduce_rank_test.cu | 4 +- .../fluid/operators/reduce_ops/reduce_op.cu.h | 5 +- paddle/phi/kernels/funcs/reduce_function.h | 1240 +++++++++++++++++ .../gpu/broadcast_tensors_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/compare_kernel.cu | 2 +- paddle/phi/kernels/gpu/elementwise_grad.h | 10 +- paddle/phi/kernels/gpu/reduce.h | 1234 +--------------- paddle/phi/kernels/gpu/trace_kernel.cu | 4 +- .../kernels/impl/matmul_grad_kernel_impl.h | 2 +- 10 files changed, 1264 insertions(+), 1245 deletions(-) create mode 100644 
paddle/phi/kernels/funcs/reduce_function.h diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index 20801d2243f..51cf3bce1ce 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -191,9 +191,9 @@ void SetConfigForColumnReduce(const int max_threads, const int reduce_num, int num_block = (max_threads / left_num); if (num_block > 1 && reduce_num >= REDUCE_SPLIT_BOUNDARY) { - *blocking_size = phi::kernels::details::GetLastPow2(reduce_num / num_block); + *blocking_size = phi::funcs::details::GetLastPow2(reduce_num / num_block); if (*blocking_size <= 1) { - *blocking_size = phi::kernels::details::GetLastPow2(sqrt(reduce_num)); + *blocking_size = phi::funcs::details::GetLastPow2(sqrt(reduce_num)); } else if (*blocking_size * 2 < reduce_num) { *blocking_size *= 2; } diff --git a/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu b/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu index e8e4ff7010d..a724524716b 100644 --- a/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu +++ b/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu @@ -39,9 +39,9 @@ TEST(test_reduce_rank_check, all) { } if (is_valid) { - phi::kernels::details::CheckReduceRank(reduce_rank, rank); + phi::funcs::details::CheckReduceRank(reduce_rank, rank); } else { - ASSERT_THROW(phi::kernels::details::CheckReduceRank(reduce_rank, rank), + ASSERT_THROW(phi::funcs::details::CheckReduceRank(reduce_rank, rank), paddle::platform::EnforceNotMet); } } diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 3aab906804f..eb76eee1048 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -23,8 +23,7 @@ #include "paddle/fluid/framework/tensor.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/gpu/reduce.h" - +#include "paddle/phi/kernels/funcs/reduce_function.h" namespace paddle { namespace operators { @@ -37,7 +36,7 @@ void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx, gpuStream_t stream) { y->mutable_data(x.place()); - phi::kernels::TensorReduceImpl( + phi::funcs::TensorReduceImpl( static_cast(dev_ctx), x, y, transform, origin_reduce_dims, stream); } diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h new file mode 100644 index 00000000000..7df772682ec --- /dev/null +++ b/paddle/phi/kernels/funcs/reduce_function.h @@ -0,0 +1,1240 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +// CUDA and HIP use same api +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#include +#include +#include +#include +#include + +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif + +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/fast_divmod.h" +#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/utils/array.h" +#include "paddle/phi/kernels/cast_kernel.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/primitive/kernel_primitives.h" +#include "paddle/utils/string/string_helper.h" + +// Reduce split or not, Whether to use ReduceHigherDim +#define REDUCE_SPLIT_BOUNDARY 512 +#define REDUCE_VEC_SIZE 4 + +namespace kps = phi::kps; + +namespace phi { +namespace funcs { + +namespace details { + +static inline int GetLastPow2(int n) { + n |= (n >> 1); + n |= (n >> 2); + n |= (n >> 4); + n |= (n >> 8); + n |= (n >> 16); + return std::max(1, n - (n >> 1)); +} + +static inline int64_t AlignUp(int64_t a, int64_t b) { return (a + b - 1) / b; } + +// get strides of x_dim, reduce_dim and left_dim for reduceLastDim and reduceAny +static inline std::vector GetDimStrides(const std::vector& dims, + const std::vector& idx) { + int n = static_cast(idx.size()); + if (n == 0) return std::vector(); + std::vector strides(n); + strides.back() = 1; + for (int i = n - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * dims[idx[i + 1]]; + } + return strides; +} + +// get blockDim for reduceLastDim and reduceAny +static inline int GetBlockDim(int block_dim) { + return block_dim >= kps::details::kReduceMaxThread + ? kps::details::kReduceMaxThread + : GetLastPow2(block_dim); +} + +// check reduce rand is valid +static inline void CheckReduceRank(int reduce_rank, int rank) { + if (rank % 2 == 0) { + PADDLE_ENFORCE_EQ(reduce_rank, + rank / 2, + phi::errors::InvalidArgument( + "ReduceOp: invalid reduce rank. When rank = %d, " + "reduce_rank must be %d, but got %d.", + rank, + rank / 2, + reduce_rank)); + } else { + auto lower_rank = (rank - 1) / 2; + auto upper_rank = (rank + 1) / 2; + PADDLE_ENFORCE_EQ( + reduce_rank == lower_rank || reduce_rank == upper_rank, + true, + phi::errors::InvalidArgument( + "ReduceOp: invalid reduce rank. When rank = %d, reduce_rank " + "must be %d or %d, but got %d.", + rank, + lower_rank, + upper_rank, + reduce_rank)); + } +} + +// convert dims from vector to array +template +static inline phi::Array VectorToArray( + const VectorLikeType& vec) { + PADDLE_ENFORCE_LE( + vec.size(), + ElementCount, + phi::errors::InvalidArgument("Cub reduce Array: size not match. 
Received " + "vec.size() %d > ElementCount %d.", + vec.size(), + ElementCount)); + size_t n = static_cast(vec.size()); + phi::Array ret; + for (size_t i = 0; i < n; ++i) { + ret[i] = vec[i]; + } + return ret; +} + +static inline std::vector GetReduceDim(const std::vector& dims, + int dim_size, + bool reduce_all) { + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(dim_size); + int reduce_size = reduce_dims.size(); + for (int i = 0; i < reduce_size; ++i) { + reduce_dims[i] = i; + } + } else { + for (auto e : dims) { + PADDLE_ENFORCE_LT(e, + dim_size, + phi::errors::InvalidArgument( + "ReduceOp: invalid axis, when x_dims is %d, " + "axis[i] should less than x_dims, but got %d.", + dim_size, + e)); + reduce_dims.push_back(e >= 0 ? e : e + dim_size); + } + } + return reduce_dims; +} + +} // namespace details + +constexpr int kMaxRank = phi::DDim::kMaxRank; + +enum ReduceType { + kReduceLastDim = 0x01, // when reduce_dim[0] == x_dim.size() - 1; + kReduceHigherDim = 0x02, // ReduceFirstDim or reduceSecondDim + kReduceAny = 0x03, // when reduce_dim.size() > 1 +}; + +struct IndexCalculator { + IndexCalculator(int dim, + const std::vector& cal_dims, + const std::vector& cal_strides, + const std::vector& full_strides) + : dim(dim) { + dims = details::VectorToArray(cal_dims); + strides = details::VectorToArray(full_strides); + reduce_strides = details::VectorToArray(cal_strides); +#ifndef PADDLE_WITH_XPU_KP + std::vector cal_divmoders; + // fast divmod + for (auto i : cal_strides) { + cal_divmoders.push_back(paddle::platform::FastDivMod(i)); + } + divmoders = details::VectorToArray( + cal_divmoders); +#endif + } + + __device__ inline int operator()(int offset) const { +#ifdef PADDLE_WITH_XPU_KP + int index = 0; +#pragma unroll + for (int i = 0; i < kMaxRank; ++i) { + if (i == dim) { + break; + } + index += (offset / reduce_strides[i]) * strides[dims[i]]; + offset = offset % reduce_strides[i]; + } + return index; +#else + int index = 0; +#pragma unroll + for (int i = 0; i < kMaxRank; ++i) { + if (i == dim) { + break; + } + auto divmod = divmoders[i].Divmod(offset); + index += (divmod.val[0] * strides[dims[i]]); + offset = divmod.val[1]; + } + return index; +#endif + } + + int dim; + phi::Array dims; + phi::Array strides; + phi::Array reduce_strides; +#ifndef PADDLE_WITH_XPU2 + phi::Array divmoders; +#endif +}; + +template +struct ReduceIndexMapping { + const kps::DimConfig dim; + HOSTDEVICE explicit ReduceIndexMapping(const kps::DimConfig& dims) + : dim(dims) {} + + __device__ __forceinline__ int BlockIdX() { +#ifdef PADDLE_WITH_XPU2 + if (ReduceLastDim) { + return (cluster_id() / dim.split_num_x % dim.split_num_y); + } else { + return cluster_id() % dim.split_num_x; + } +#else + return blockIdx.x; +#endif + } + + __device__ __forceinline__ int BlockIdY() { +#ifdef PADDLE_WITH_XPU2 + if (ReduceLastDim) { + return (cluster_id() % dim.split_num_x); + } else { + return (cluster_id() / dim.split_num_x % dim.split_num_y); + } +#else + return blockIdx.y; +#endif + } + + __device__ __forceinline__ int BlockDimX() { +#ifdef PADDLE_WITH_XPU2 + return dim.deal_size_x; +#else + return blockDim.x; +#endif + } + + __device__ __forceinline__ int BlockDimY() { +#ifdef PADDLE_WITH_XPU2 + return 1; +#else + return blockDim.y; +#endif + } + + __device__ __forceinline__ int GridDimX() { +#ifdef PADDLE_WITH_XPU2 + if (ReduceLastDim) { + return dim.split_num_y; + } else { + return dim.split_num_x; + } +#else + return gridDim.x; +#endif + } + + __device__ __forceinline__ int GridDimY() { +#ifdef 
PADDLE_WITH_XPU2 + if (ReduceLastDim) { + return dim.split_num_x; + } else { + return dim.split_num_y; + } +#else + return gridDim.y; +#endif + } + + __device__ __forceinline__ int GetLoopSize() { +#ifdef PADDLE_WITH_XPU2 + if (ReduceLastDim) { + return dim.deal_size_y; + } else { + return dim.deal_size_x; + } +#else + return 1; +#endif + } +}; + +// when reduce_type == kReduceLastDim this struct will be used +// for higher performance +struct OneDimIndexCal { + explicit OneDimIndexCal(int num) : stride(num) {} + + __device__ inline int operator()(int index) const { return index * stride; } + int stride; +}; + +// reduce config +template +struct ReduceConfig { + ReduceConfig(const std::vector& origin_reduce_dims, + const std::vector& origin_x_dim) + : reduce_dims_origin(origin_reduce_dims), x_dim(origin_x_dim) {} + + // get the parameters of reduceKernel + void Run() { + // step1: update the reduce_dim left_dim and x_dim + SetReduceDim(); + + // step2: get the strides of dim for reduceAny and reduceLastDim + SetStrides(); + + // step3: get the type of reduce + SetReduceType(); + + // step4: set the block and grid for launch kernel + SetBlockDim(); + } + + // when should_reduce_again is true, we need malloc temp space for temp data + void SetOutputData(Ty* y_data, + const phi::GPUContext& dev_ctx, + phi::DenseTensor* tmp) { + if (should_reduce_again) { + tmp->ResizeAndAllocate(phi::make_ddim( + {static_cast(left_num * grid.z * grid.y * sizeof(Ty))})); + + output_data = dev_ctx.Alloc(tmp); + } else { + output_data = y_data; + } + } + + private: + // set reduce_dim, left_dim and update x_dim + // eg: x_dim = [2, 4, 6] origin_reduce_dims = [0, 1] + // --SetReduceDim--> x_dim = [8,6], reduce_dim = [0], left_dim = [1] + void SetReduceDim() { + std::set reduce_set; + for (auto e : reduce_dims_origin) { + auto pos = e >= 0 ? 
e : e + x_dim.size(); + reduce_set.insert(pos); + } + + std::vector reduce_dim_temp(reduce_set.begin(), reduce_set.end()); + std::sort(reduce_dim_temp.begin(), reduce_dim_temp.end()); + + // update reduce_dim and x_dim + std::vector x_new_dim; + + reduce_dim.push_back(reduce_dim_temp[0]); + x_new_dim.push_back(x_dim[0]); + + int idx_reduce = 1; + int num = 0; + + if (reduce_dim_temp.size() > 1) { + for (int i = 1; i < x_dim.size(); i++) { + if ((idx_reduce < reduce_dim_temp.size()) && + (i == reduce_dim_temp[idx_reduce])) { + int result = + reduce_dim_temp[idx_reduce] - reduce_dim[reduce_dim.size() - 1]; + bool is_equal = ((result - num) == 1); + if (is_equal) { + x_new_dim[x_new_dim.size() - 1] *= x_dim[i]; + num++; + } else { + reduce_dim.push_back(reduce_dim_temp[idx_reduce] - num); + x_new_dim.push_back(x_dim[i]); + } + idx_reduce++; + } else { + x_new_dim.push_back(x_dim[i]); + } + } + } else { + x_new_dim = x_dim; + } + + // update x_dim + x_dim = x_new_dim; + std::vector().swap(x_new_dim); + + std::vector reduce_dim_new; + int is_reduced = 0; + for (auto e : reduce_dim) { + is_reduced |= 1 << e; + } + + std::vector().swap(reduce_dim); + + for (int i = 0; i < x_dim.size(); i++) { + if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) { + x_new_dim.push_back(x_dim[i]); + if ((is_reduced >> i) & 1) + reduce_dim_new.push_back(x_new_dim.size() - 1); + } else { + x_new_dim[x_new_dim.size() - 1] *= x_dim[i]; + } + } + + x_dim = x_new_dim; + reduce_dim = reduce_dim_new; + + int x_rank = static_cast(x_dim.size()); + std::set left_set; + + for (int i = 0; i < x_rank; ++i) { + left_set.insert(i); + } + + for (auto e : reduce_dim) { + left_set.erase(e); + } + + left_dim.assign(left_set.begin(), left_set.end()); + + // if the last dim gets involved in reduction + reduce_last_dim = (reduce_dim.back() == x_dim.size() - 1); + } + + // set x_strides, reduce_strides, left_strides for reduceLastDim and reduceAny + // eg: x_dim = [8, 6], reduce_dim = [0], left_dim = [1] + // --SetStrides--> x_strides= [6,1], reduce_strides = [1], + // left_strides = [1] + void SetStrides() { + std::vector idx_dim; + for (int i = 0; i < x_dim.size(); i++) { + idx_dim.push_back(i); + } + + x_strides = details::GetDimStrides(x_dim, idx_dim); + reduce_strides = details::GetDimStrides(x_dim, reduce_dim); + left_strides = details::GetDimStrides(x_dim, left_dim); + reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]]; + + left_num = 1; + if (left_dim.size()) { + left_num = left_strides[0] * x_dim[left_dim[0]]; + } + } + + // get the reduceType + // eg: x_dim = [8, 6] reduce_dim = [0] --> ReduceHigherDim -->reduceFirstDim + // x_dim = [8, 6] reduce_dim = [1] --> reduceLastDim + // x_dim = [8] reduce_dim = [0] --> reduceAll + // x_dim = [8, 6, 4, 2] reduce_dim = [0, 2] --> reduceAny + void SetReduceType() { + int rank = x_dim.size(); + int reduce_rank = reduce_dim.size(); + bool is_last_dim = + (rank == 2) && (reduce_rank == 1) && (reduce_dim[0] == 1); + if (rank == reduce_rank || is_last_dim) { +#ifdef PADDLE_WITH_XPU_KP + reduce_type = static_cast(ReduceType::kReduceAny); +#else + reduce_type = static_cast(ReduceType::kReduceLastDim); +#endif + } else if (reduce_rank == 1) { +// ReduceFirstDim and reduceSecondDim +#ifdef PADDLE_WITH_XPU_KP + if (reduce_dim[0] == 0) { + reduce_type = static_cast(ReduceType::kReduceHigherDim); + } else { + reduce_type = static_cast(ReduceType::kReduceAny); + } +#else + reduce_type = static_cast(ReduceType::kReduceHigherDim); +#endif + } else { + reduce_type = 
static_cast<int>(ReduceType::kReduceAny); + } + } + +#ifndef PADDLE_WITH_XPU_KP + void SetBlockDimForReduceAny(dim3* block_dim, dim3* grid_dim) { + constexpr int min_reduce_num_per_thread = 16; + constexpr int max_reduce_num_per_thread = 256; + constexpr int max_num_threads = kps::details::kReduceMaxThread; + + // set block size. + // 1. If reduce_last_dim == true, all the threads whose threadIdx.y is the same + // will process the reduction for one output. + // The number of outputs for one block is blockDim.y; + // 2. If reduce_last_dim == false, different threadIdx.x will process + // different reductions and get the outputs separately. If it is + // necessary, it should reduce in block y. + // The number of outputs for one block is blockDim.x; + int block_x, block_y; + int grid_num, reduce_num_per_thread; + if (reduce_last_dim) { + block_x = details::GetBlockDim(reduce_num); + block_y = details::GetBlockDim(left_num); + block_dim->x = block_x; + block_dim->y = + std::min(block_y, static_cast<int>(max_num_threads / block_dim->x)); + grid_num = details::AlignUp(left_num, block_dim->y); + reduce_num_per_thread = details::AlignUp(reduce_num, block_dim->x); + } else { + block_x = details::GetBlockDim(left_num); + block_y = details::GetBlockDim(reduce_num); + block_dim->x = std::min(block_x, 32); + block_dim->y = + std::min(block_y, static_cast<int>(max_num_threads / block_dim->x)); + block_dim->x = + std::min(block_x, static_cast<int>(max_num_threads / block_dim->y)); + grid_num = details::AlignUp(left_num, block_dim->x); + reduce_num_per_thread = details::AlignUp(reduce_num, block_dim->y); + } + int device_id = paddle::platform::GetCurrentDeviceId(); + int max_mp = paddle::platform::GetGPUMultiProcessors(device_id); + int max_threads_per_mp = + paddle::platform::GetGPUMaxThreadsPerMultiProcessor(device_id); + int max_threads = max_threads_per_mp * max_mp; + int num_threads = block_dim->x * block_dim->y; + int max_num_blocks = max_threads / num_threads; + + // set grid size. + // Whether to set grid.y larger than 1 is decided by the following rules: + // 1. The number of elements each thread processes should be no less than + // min_reduce_num_per_thread but no more than max_reduce_num_per_thread; + // 2. It should maximize the utilization of the SMs. + // So we choose the minimum between input_split_num_1 and input_split_num_3 + // to make each thread process as much data as possible. Meanwhile, + // the number cannot be larger than max_reduce_num_per_thread, so we + // choose the maximum between the result above and input_split_num_2. + int input_split_num_1 = + details::AlignUp(reduce_num_per_thread, min_reduce_num_per_thread); + int input_split_num_2 = + details::AlignUp(reduce_num_per_thread, max_reduce_num_per_thread); + int input_split_num_3 = details::AlignUp(max_num_blocks, grid_num); + + grid_dim->x = grid_num; + grid_dim->y = std::max(std::min(input_split_num_1, input_split_num_3), + input_split_num_2); + // if grid.y > 1, we need to launch the reduce kernel again. 
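// Worked example of the grid.y rule above (illustrative numbers only, not taken from
// this patch): assume reduce_num_per_thread = 1024 and AlignUp(max_num_blocks, grid_num) = 8.
//   input_split_num_1 = AlignUp(1024, 16)  = 64  // most splits that still leave >= 16 elements per thread
//   input_split_num_2 = AlignUp(1024, 256) = 4   // fewest splits that bring it down to <= 256 per thread
//   input_split_num_3 = 8                        // splits needed to keep all SMs busy
//   grid_dim->y = max(min(64, 8), 4) = 8         // ~128 elements per thread, within [16, 256]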
+ if (grid_dim->y > 1) { + should_reduce_again = true; + } + } + + // set block and grid for launch kernel + // for ReduceHigherDim: if block is enough -> splite reduce_num + // else init block(32, 1) grid(block_num, 1) + // for others: block(block_num, 1) , grid(left_num, 1) + void SetBlockDimForHigher(dim3* block_dim, dim3* grid_dim) { + int last_dim_num = x_dim.back(); + // update left_num + int grid_z = left_num / last_dim_num; + left_num = last_dim_num; + grid_dim->z = grid_z; + int device_id = paddle::platform::GetCurrentDeviceId(); + int max_mp = paddle::platform::GetGPUMultiProcessors(device_id); + int max_threads_per_mp = + paddle::platform::GetGPUMaxThreadsPerMultiProcessor(device_id); + int max_threads = max_threads_per_mp * max_mp; + // init + int num_block = (max_threads / left_num); + block_dim->x = details::GetBlockDim(left_num); + grid_dim->x = details::AlignUp(left_num, block_dim->x); + blocking_size = reduce_num; + + if (num_block > 1 && reduce_num >= REDUCE_SPLIT_BOUNDARY) { + blocking_size = details::GetLastPow2(reduce_num / num_block); + if (blocking_size <= 1) { + blocking_size = details::GetLastPow2(sqrt(reduce_num)); + } else if (blocking_size * 2 < reduce_num) { + blocking_size *= 2; + } + should_reduce_again = true; + grid_dim->y = details::AlignUp(reduce_num, blocking_size); + } + } +#endif + + void SetBlockDim() { + // init + int block_num = details::GetBlockDim(reduce_num); + should_reduce_again = false; + dim3 block_dim(block_num, 1, 1); + dim3 grid_dim(left_num, 1, 1); + blocking_size = reduce_num; +#ifdef PADDLE_WITH_XPU_KP + if (reduce_last_dim) { + block_dim.x = 64; + block_dim.y = reduce_num; + grid_dim.x = 1; + grid_dim.y = 8; + } else { + block_dim.x = 64; + block_dim.y = left_num; + grid_dim.x = 8; + grid_dim.y = 1; + } +#else + if (reduce_type == ReduceType::kReduceHigherDim) { + SetBlockDimForHigher(&block_dim, &grid_dim); + } else { + SetBlockDimForReduceAny(&block_dim, &grid_dim); + } +#endif + + block = block_dim; + grid = grid_dim; + } + + public: + std::vector reduce_dims_origin; + std::vector reduce_dim; + std::vector x_dim; + std::vector left_dim; + std::vector x_strides; + std::vector left_strides; + std::vector reduce_strides; + + int reduce_type; + int reduce_num; + int left_num; + int blocking_size; + bool should_reduce_again; + bool reduce_last_dim; + + Ty* output_data; + + dim3 block; + dim3 grid; +}; + +// when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or +// when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this +// function will be used +template +__global__ void ReduceAnyKernel(const Tx* x, + Ty* y, + ReduceOp reducer, + TransformOp transformer, + MPType init, + int reduce_num, + int left_num, + bool reduce_last_dim, + const Calculator reduce_index_calculator, + const Calculator left_index_calculator, + const kps::DimConfig dim) { + int input_idx, left_idx, stride; + int block_size = 0; + bool need_store = true; + int loop_left = 0; + int tid = 0; + // the last dim gets involved in reduction + int store_offset = 0; + int stride_left = 0; + if (reduce_last_dim) { + auto block = ReduceIndexMapping(dim); + input_idx = block.BlockIdY() * block.BlockDimX(); + left_idx = block.BlockIdX() * block.BlockDimY() + THREAD_ID_Y; + stride = block.GridDimY() * block.BlockDimX(); + block_size = block.BlockDimX(); + need_store = (THREAD_ID_X == 0) && (left_idx < left_num); + store_offset = block.BlockIdY() * left_num + left_idx; + loop_left = min(block.GetLoopSize(), left_num - left_idx); + stride_left = 1; + tid 
= THREAD_ID_X; + } else { + auto block = ReduceIndexMapping(dim); + input_idx = block.BlockIdY() * block.BlockDimY(); + left_idx = block.BlockIdX() * block.BlockDimX() + THREAD_ID_X; + stride = block.GridDimY() * block.BlockDimY(); + block_size = block.BlockDimY(); + need_store = (THREAD_ID_Y == 0) && (left_idx < left_num); + loop_left = min(block.GetLoopSize(), left_num - left_idx); + stride_left = block.BlockDimX() * block.GridDimX(); + store_offset = block.BlockIdY() * left_num + left_idx; + tid = THREAD_ID_Y; + } + // calculate the offset, means the addr where each thread really start. + // 1. reduce for each thread + MPType input_compute[REDUCE_VEC_SIZE]; + Tx input_reg[REDUCE_VEC_SIZE]; + int input_idx_tmp = input_idx; + for (int i = 0; i < loop_left; i += stride_left) { + int input_offset = left_index_calculator(left_idx + i); + const _ptr_ Tx* input = x + input_offset; + MPType reduce_var = init; + // load REDUCE_VEC_SIZE data once, and then compute + int bound = reduce_num - (REDUCE_VEC_SIZE - 1) * stride; + input_idx = input_idx_tmp; + for (; input_idx + block_size < bound; + input_idx += REDUCE_VEC_SIZE * stride) { + kps::ReadDataReduce, + false>(&input_reg[0], + input, + input_idx, + reduce_index_calculator, + 1, + reduce_num, + 1, + stride, + kps::IdentityFunctor(), + reduce_last_dim); + kps::ElementwiseUnary( + &input_compute[0], &input_reg[0], transformer); + kps::Reduce( + &reduce_var, &input_compute[0], reducer, reduce_last_dim); + } + + kps::Init(&input_compute[0], init); + kps::ReadDataReduce(&input_compute[0], + input, + input_idx, + reduce_index_calculator, + 1, + reduce_num - input_idx, + 1, + stride, + transformer, + reduce_last_dim); + kps::Reduce( + &reduce_var, &input_compute[0], reducer, reduce_last_dim); + + kps::Reduce( + &reduce_var, &reduce_var, reducer, reduce_last_dim); + if (need_store) { + y[store_offset + i] = static_cast(reduce_var); + } + } +} + +template +__global__ void ReduceHigherDimKernel(const Tx* x, + Ty* y, + ReduceOp reducer, + TransformOp transformer, + MPType init, + int reduce_num, + int left_num, + int blocking_size, + const kps::DimConfig dim) { + // when reduce_dim.size() == 1 and reduce_dim[0] != x_dim.size() - 1, this + // function will be used + auto block = ReduceIndexMapping(dim); + int idy = block.BlockIdY() * blocking_size; + int idx = block.BlockIdX() * block.BlockDimX(); + int idz = BLOCK_ID_Z * left_num; + int stride = dim.split_num_x * dim.deal_size_x; + int size = left_num - dim.rem_x; + int loop_size = min(reduce_num - idy, blocking_size); + int store_offset = block.BlockIdY() * left_num + idz * block.GridDimY(); + int block_offset = idy * left_num + idz * reduce_num; + const _ptr_ Tx* input = x + block_offset; + Tx reduce_input; + for (; idx < size; idx += stride) { + MPType reduce_var = init; + MPType reduce_compute = init; + for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) { + kps::ReadData(&reduce_input, + input + loop_idx * left_num + idx, + block.BlockDimX(), + 1, + 1, + left_num); + kps::ElementwiseUnary( + &reduce_compute, &reduce_input, transformer); + kps::Reduce( + &reduce_var, &reduce_compute, reducer, false); + } + Ty result = static_cast(reduce_var); + kps::WriteData( + y + store_offset + idx, &result, block.BlockDimX()); + } + + if (idx < left_num) { + MPType reduce_var = init; + MPType reduce_compute = init; + for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) { + kps::ReadData(&reduce_input, + input + loop_idx * left_num + idx, + dim.rem_x, + 1, + 1, + left_num); + kps::ElementwiseUnary( + 
&reduce_compute, &reduce_input, transformer); + kps::Reduce( + &reduce_var, &reduce_compute, reducer, false); + } + Ty result = static_cast(reduce_var); + kps::WriteData( + y + store_offset + idx, &result, dim.rem_x); + } +} + +template +static void LaunchReduceKernel(const Tx* x_data, + Ty* y_data, + const ReduceOp& reducer, + const TransformOp& transform, + MPType init, + KPStream stream, + ReduceConfig config) { + if (config.reduce_type == kReduceLastDim) { + int stride_reduce = 1; + int stride_left = config.reduce_num; + // for higher performance + auto reduce_index_calculator = OneDimIndexCal(stride_reduce); + auto left_index_calculator = OneDimIndexCal(stride_left); + + kps::DimConfig dim = kps::DimConfig(config.grid.x, + config.grid.y, + config.grid.z, + config.block.x, + config.block.y, + 0); + dim.SetRem(config.reduce_num % config.block.x, 0, 0); + +#ifdef PADDLE_WITH_XPU_KP + ReduceAnyKernel<<<8, 64, 0, stream>>>( + x_data, + config.output_data, + reducer, + transform, + init, + config.reduce_num, + config.left_num, + config.reduce_last_dim, + reduce_index_calculator, + left_index_calculator, + dim); +#else + ReduceAnyKernel<<>>( + x_data, + config.output_data, + reducer, + transform, + init, + config.reduce_num, + config.left_num, + config.reduce_last_dim, + reduce_index_calculator, + left_index_calculator, + dim); +#endif + + } else { + int reduce_rank = config.reduce_strides.size(); + int left_rank = config.left_strides.size(); + auto reduce_index_calculator = IndexCalculator(reduce_rank, + config.reduce_dim, + config.reduce_strides, + config.x_strides); + auto left_index_calculator = IndexCalculator( + left_rank, config.left_dim, config.left_strides, config.x_strides); + + kps::DimConfig dim = kps::DimConfig(config.grid.x, + config.grid.y, + config.grid.z, + config.block.x, + config.block.y, + 0); + dim.SetRem(config.reduce_num % config.block.x, 0, 0); + +#ifdef PADDLE_WITH_XPU_KP + ReduceAnyKernel<<<8, 64, 0, stream>>>( + x_data, + config.output_data, + reducer, + transform, + init, + config.reduce_num, + config.left_num, + config.reduce_last_dim, + reduce_index_calculator, + left_index_calculator, + dim); +#else + ReduceAnyKernel<<>>( + x_data, + config.output_data, + reducer, + transform, + init, + config.reduce_num, + config.left_num, + config.reduce_last_dim, + reduce_index_calculator, + left_index_calculator, + dim); +#endif + } + + if (config.should_reduce_again) { + dim3 block; + dim3 grid; + if (config.reduce_last_dim) { + block = dim3(32, 1, 1); + grid = dim3(details::AlignUp(config.left_num, 32), 1, 1); + } else { + block = dim3(config.block.x, 1, 1); + grid = dim3(config.grid.x, 1, config.grid.z); + } + + auto last_index = OneDimIndexCal(1); + auto first_index = OneDimIndexCal(config.left_num); + kps::DimConfig dim = + kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0); + dim.SetRem(config.left_num % block.x, 0, 0); +#ifdef PADDLE_WITH_XPU_KP + ReduceHigherDimKernel< + Ty, + Ty, + MPType, + ReduceOp, + kps::IdentityFunctor><<<8, 64, 0, stream>>>( + config.output_data, + y_data, + reducer, + kps::IdentityFunctor(), + init, + config.grid.y, + config.left_num, + config.grid.y, + dim); +#else + ReduceHigherDimKernel< + Ty, + Ty, + MPType, + ReduceOp, + kps::IdentityFunctor><<>>( + config.output_data, + y_data, + reducer, + kps::IdentityFunctor(), + init, + config.grid.y, + config.left_num, + config.grid.y, + dim); +#endif + } +} + +template class ReduceOp, + typename TransformOp> +static typename std::enable_if::value, + void>::type 
+CubTensorReduceImpl(const Tx* x_data, + Ty* y_data, + const TransformOp& transform, + int reduce_num, + const phi::GPUContext& dev_ctx, + KPStream stream) { + auto reducer = ReduceOp(); + cub::TransformInputIterator trans_x(x_data, + transform); + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Reduce(nullptr, + temp_storage_bytes, + trans_x, + y_data, + reduce_num, + reducer, + reducer.initial(), + stream); + phi::DenseTensor tmp = + phi::Empty(dev_ctx, {static_cast(temp_storage_bytes)}); + + auto* temp_storage = dev_ctx.Alloc(&tmp); + + cub::DeviceReduce::Reduce(temp_storage, + temp_storage_bytes, + trans_x, + y_data, + reduce_num, + reducer, + reducer.initial(), + stream); +} + +template class ReduceOp, + typename TransformOp> +static typename std::enable_if::value, + void>::type +CubTensorReduceImpl(const Tx* x_data, + Ty* y_data, + const TransformOp& transform, + int reduce_num, + const phi::GPUContext& dev_ctx, + KPStream stream) { + PADDLE_THROW(phi::errors::InvalidArgument( + "Tx should not be float16 when using cub::DeviceReduce::Reduce().")); +} + +template class ReduceOp, + typename TransformOp> +void TensorReduceImpl(const phi::GPUContext& dev_ctx, + const phi::DenseTensor& x, + phi::DenseTensor* y, + const TransformOp& transform, + const std::vector& origin_reduce_dims, + KPStream stream) { + dev_ctx.Alloc(y); + + auto x_dim = phi::vectorize(x.dims()); + auto config = ReduceConfig(origin_reduce_dims, x_dim); + config.Run(); + int numel = x.numel(); + // after config.run() + // SetOutputData for ReduceHigherDim when should_reduce_again is true, + // temp_output should be stored temp_data in output_data space or stored in + // y_data; + + phi::DDim tmp_ddim; + phi::DenseTensor tmp = phi::Empty(dev_ctx); + + auto x_data = x.data(); + auto y_data = y->data(); + + if (config.reduce_num == 1) { + std::vector inputs = {&x}; + std::vector outputs = {y}; + funcs::ElementwiseKernel(dev_ctx, inputs, &outputs, transform); + return; + } + + config.SetOutputData(y_data, dev_ctx, &tmp); + constexpr bool kIsTxFP16 = std::is_same::value; + bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16; +#ifndef PADDLE_WITH_XPU_KP + if (use_cub_reduce) { + CubTensorReduceImpl( + x_data, y_data, transform, config.reduce_num, dev_ctx, stream); + return; + } +#endif + + using MPType = typename kps::details::MPTypeTrait::Type; + auto reducer = ReduceOp(); + // launch ReduceHigherDimKernel + // when reduce_dim.size() == 1 and reduce_dim[0] != x_dim.size() - 1, this + // function will be used + // eg: x_dim = {nz, ny, nx}, nx != 1, axis can be 0 or 1 + // if axis = 1 then grid.z = nz, grid.y = ny / block_size, grid.x = nx / + // 32 + // else grid.z = 1, grid.y = ny / block_size, grid.x = nx /32 + if (config.reduce_type == ReduceType::kReduceHigherDim) { + kps::DimConfig dim = kps::DimConfig(config.grid.x, + config.grid.y, + config.grid.z, + config.block.x, + config.blocking_size, + 0); + dim.SetRem(config.left_num % config.block.x, + config.reduce_num % config.blocking_size, + 0); + +#ifdef PADDLE_WITH_XPU_KP + ReduceHigherDimKernel, + TransformOp><<<8, 64, 0, stream>>>( + x_data, + config.output_data, + reducer, + transform, + reducer.initial(), + config.reduce_num, + config.left_num, + config.blocking_size, + dim); +#else + ReduceHigherDimKernel< + Tx, + Ty, + MPType, + ReduceOp, + TransformOp><<>>( + x_data, + config.output_data, + reducer, + transform, + reducer.initial(), + config.reduce_num, + config.left_num, + config.blocking_size, + dim); +#endif + + if (config.should_reduce_again) { + 
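// Second pass: the launch above wrote one partial result per grid.y split into the
// temporary buffer (config.output_data), laid out roughly as [grid.z, grid.y, left_num].
// The kernel launched below folds those grid.y partials into y_data, reusing the same
// reducer with an identity transform; logically (a sketch, indices assumed from the
// layout above):
//   for each (z, j): y[z * left_num + j] = reduce(partial[z][0][j], ..., partial[z][grid.y - 1][j])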
dim3 block = dim3(config.block.x, 1, 1); + dim3 grid = dim3(config.grid.x, 1, config.grid.z); + kps::DimConfig dim2 = + kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0); + dim2.SetRem(config.left_num % config.block.x, 0, 0); + +#ifdef PADDLE_WITH_XPU_KP + ReduceHigherDimKernel< + Ty, + Ty, + MPType, + ReduceOp, + kps::IdentityFunctor><<<8, 64, 0, stream>>>( + config.output_data, + y_data, + reducer, + kps::IdentityFunctor(config.grid.y), + reducer.initial(), + config.grid.y, + config.left_num, + config.grid.y, + dim2); +#else + ReduceHigherDimKernel< + Ty, + Ty, + MPType, + ReduceOp, + kps::IdentityFunctor><<>>( + config.output_data, + y_data, + reducer, + kps::IdentityFunctor(config.grid.y), + reducer.initial(), + config.grid.y, + config.left_num, + config.grid.y, + dim2); +#endif + } + return; + } + + // when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or + // when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this + // function will be used + LaunchReduceKernel, TransformOp>( + x_data, y_data, reducer, transform, reducer.initial(), stream, config); +} + +} // namespace funcs + +} // namespace phi + +#endif diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu index 275b8411ccc..926dffc7450 100644 --- a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu @@ -20,7 +20,7 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/gpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" #include "paddle/phi/kernels/primitive/functor_primitives.h" namespace phi { @@ -87,7 +87,7 @@ void BroadcastTensorsGradKernel(const Context& ctx, *input_tensor, ctx.GetPlace(), ctx, output_tensor); } else { // reduce_sum implementation on CUDA - kernels::TensorReduceImpl>( + funcs::TensorReduceImpl>( ctx, *input_tensor, output_tensor, diff --git a/paddle/phi/kernels/gpu/compare_kernel.cu b/paddle/phi/kernels/gpu/compare_kernel.cu index 272448504ac..9c02627e546 100644 --- a/paddle/phi/kernels/gpu/compare_kernel.cu +++ b/paddle/phi/kernels/gpu/compare_kernel.cu @@ -80,7 +80,7 @@ inline void CompareAllKernelImpl(const Context& ctx, for (int i = 0; i < reduce_dims.size(); ++i) { reduce_dims[i] = i; } - kernels::TensorReduceImpl>( + funcs::TensorReduceImpl>( ctx, tmp, out, kps::IdentityFunctor(), reduce_dims, ctx.stream()); } diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h index b17196b6b11..20799f4e37b 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad.h +++ b/paddle/phi/kernels/gpu/elementwise_grad.h @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/elementwise_grad_base.h" -#include "paddle/phi/kernels/gpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" namespace phi { @@ -84,7 +84,7 @@ void DefaultElementwiseAddGrad(const GPUContext &ctx, std::vector reduce_dims = funcs::GetReduceDim(x.dims(), out.dims(), axis); gpuStream_t stream = ctx.stream(); - kernels::TensorReduceImpl>( + funcs::TensorReduceImpl>( ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); } } @@ -99,7 +99,7 @@ void DefaultElementwiseAddGrad(const GPUContext &ctx, std::vector reduce_dims = funcs::GetReduceDim(y.dims(), out.dims(), axis); gpuStream_t stream = ctx.stream(); - kernels::TensorReduceImpl>( + funcs::TensorReduceImpl>( ctx, dout, dy, kps::IdentityFunctor(), reduce_dims, stream); } } @@ -197,7 +197,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx, std::vector reduce_dims = funcs::GetReduceDim(x.dims(), out.dims(), axis); gpuStream_t stream = ctx.stream(); - kernels::TensorReduceImpl>( + funcs::TensorReduceImpl>( ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); } } @@ -218,7 +218,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx, std::vector reduce_dims = funcs::GetReduceDim(y.dims(), out.dims(), axis); gpuStream_t stream = ctx.stream(); - kernels::TensorReduceImpl>( + funcs::TensorReduceImpl>( ctx, dout, dy, kps::InverseFunctor(), reduce_dims, stream); } } diff --git a/paddle/phi/kernels/gpu/reduce.h b/paddle/phi/kernels/gpu/reduce.h index 94c2e980e36..0319de7558e 100644 --- a/paddle/phi/kernels/gpu/reduce.h +++ b/paddle/phi/kernels/gpu/reduce.h @@ -17,1229 +17,9 @@ // CUDA and HIP use same api #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include -#include -#include -#include -#include - -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif - -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/fast_divmod.h" -#include "paddle/fluid/string/string_helper.h" -#include "paddle/phi/api/ext/dispatch.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/utils/array.h" -#include "paddle/phi/kernels/cast_kernel.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" -#include "paddle/phi/kernels/primitive/kernel_primitives.h" - -// Reduce split or not, Whether to use ReduceHigherDim -#define REDUCE_SPLIT_BOUNDARY 512 -#define REDUCE_VEC_SIZE 4 - -namespace kps = phi::kps; +#include "paddle/phi/kernels/funcs/reduce_function.h" namespace phi { -namespace kernels { - -namespace details { - -static inline int GetLastPow2(int n) { - n |= (n >> 1); - n |= (n >> 2); - n |= (n >> 4); - n |= (n >> 8); - n |= (n >> 16); - return std::max(1, n - (n >> 1)); -} - -static inline int64_t AlignUp(int64_t a, int64_t b) { return (a + b - 1) / b; } - -// get strides of x_dim, reduce_dim and left_dim for reduceLastDim and reduceAny -static inline std::vector GetDimStrides(const std::vector& dims, - const std::vector& idx) { - int n = static_cast(idx.size()); - if (n == 0) return std::vector(); - std::vector strides(n); - strides.back() = 1; - for (int i = n - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * dims[idx[i + 1]]; - } - return strides; -} - -// get 
blockDim for reduceLastDim and reduceAny -static inline int GetBlockDim(int block_dim) { - return block_dim >= kps::details::kReduceMaxThread - ? kps::details::kReduceMaxThread - : GetLastPow2(block_dim); -} - -// check reduce rand is valid -static inline void CheckReduceRank(int reduce_rank, int rank) { - if (rank % 2 == 0) { - PADDLE_ENFORCE_EQ(reduce_rank, - rank / 2, - phi::errors::InvalidArgument( - "ReduceOp: invalid reduce rank. When rank = %d, " - "reduce_rank must be %d, but got %d.", - rank, - rank / 2, - reduce_rank)); - } else { - auto lower_rank = (rank - 1) / 2; - auto upper_rank = (rank + 1) / 2; - PADDLE_ENFORCE_EQ( - reduce_rank == lower_rank || reduce_rank == upper_rank, - true, - phi::errors::InvalidArgument( - "ReduceOp: invalid reduce rank. When rank = %d, reduce_rank " - "must be %d or %d, but got %d.", - rank, - lower_rank, - upper_rank, - reduce_rank)); - } -} - -// convert dims from vector to array -template -static inline phi::Array VectorToArray( - const VectorLikeType& vec) { - PADDLE_ENFORCE_LE( - vec.size(), - ElementCount, - phi::errors::InvalidArgument("Cub reduce Array: size not match. Received " - "vec.size() %d > ElementCount %d.", - vec.size(), - ElementCount)); - size_t n = static_cast(vec.size()); - phi::Array ret; - for (size_t i = 0; i < n; ++i) { - ret[i] = vec[i]; - } - return ret; -} - -static inline std::vector GetReduceDim(const std::vector& dims, - int dim_size, - bool reduce_all) { - std::vector reduce_dims; - if (reduce_all) { - reduce_dims.resize(dim_size); - int reduce_size = reduce_dims.size(); - for (int i = 0; i < reduce_size; ++i) { - reduce_dims[i] = i; - } - } else { - for (auto e : dims) { - PADDLE_ENFORCE_LT(e, - dim_size, - phi::errors::InvalidArgument( - "ReduceOp: invalid axis, when x_dims is %d, " - "axis[i] should less than x_dims, but got %d.", - dim_size, - e)); - reduce_dims.push_back(e >= 0 ? 
e : e + dim_size); - } - } - return reduce_dims; -} - -} // namespace details - -constexpr int kMaxRank = phi::DDim::kMaxRank; - -enum ReduceType { - kReduceLastDim = 0x01, // when reduce_dim[0] == x_dim.size() - 1; - kReduceHigherDim = 0x02, // ReduceFirstDim or reduceSecondDim - kReduceAny = 0x03, // when reduce_dim.size() > 1 -}; - -struct IndexCalculator { - IndexCalculator(int dim, - const std::vector& cal_dims, - const std::vector& cal_strides, - const std::vector& full_strides) - : dim(dim) { - dims = details::VectorToArray(cal_dims); - strides = details::VectorToArray(full_strides); - reduce_strides = details::VectorToArray(cal_strides); -#ifndef PADDLE_WITH_XPU_KP - std::vector cal_divmoders; - // fast divmod - for (auto i : cal_strides) { - cal_divmoders.push_back(paddle::platform::FastDivMod(i)); - } - divmoders = details::VectorToArray( - cal_divmoders); -#endif - } - - __device__ inline int operator()(int offset) const { -#ifdef PADDLE_WITH_XPU_KP - int index = 0; -#pragma unroll - for (int i = 0; i < kMaxRank; ++i) { - if (i == dim) { - break; - } - index += (offset / reduce_strides[i]) * strides[dims[i]]; - offset = offset % reduce_strides[i]; - } - return index; -#else - int index = 0; -#pragma unroll - for (int i = 0; i < kMaxRank; ++i) { - if (i == dim) { - break; - } - auto divmod = divmoders[i].Divmod(offset); - index += (divmod.val[0] * strides[dims[i]]); - offset = divmod.val[1]; - } - return index; -#endif - } - - int dim; - phi::Array dims; - phi::Array strides; - phi::Array reduce_strides; -#ifndef PADDLE_WITH_XPU2 - phi::Array divmoders; -#endif -}; - -template -struct ReduceIndexMapping { - const kps::DimConfig dim; - HOSTDEVICE explicit ReduceIndexMapping(const kps::DimConfig& dims) - : dim(dims) {} - - __device__ __forceinline__ int BlockIdX() { -#ifdef PADDLE_WITH_XPU2 - if (ReduceLastDim) { - return (cluster_id() / dim.split_num_x % dim.split_num_y); - } else { - return cluster_id() % dim.split_num_x; - } -#else - return blockIdx.x; -#endif - } - - __device__ __forceinline__ int BlockIdY() { -#ifdef PADDLE_WITH_XPU2 - if (ReduceLastDim) { - return (cluster_id() % dim.split_num_x); - } else { - return (cluster_id() / dim.split_num_x % dim.split_num_y); - } -#else - return blockIdx.y; -#endif - } - - __device__ __forceinline__ int BlockDimX() { -#ifdef PADDLE_WITH_XPU2 - return dim.deal_size_x; -#else - return blockDim.x; -#endif - } - - __device__ __forceinline__ int BlockDimY() { -#ifdef PADDLE_WITH_XPU2 - return 1; -#else - return blockDim.y; -#endif - } - - __device__ __forceinline__ int GridDimX() { -#ifdef PADDLE_WITH_XPU2 - if (ReduceLastDim) { - return dim.split_num_y; - } else { - return dim.split_num_x; - } -#else - return gridDim.x; -#endif - } - - __device__ __forceinline__ int GridDimY() { -#ifdef PADDLE_WITH_XPU2 - if (ReduceLastDim) { - return dim.split_num_x; - } else { - return dim.split_num_y; - } -#else - return gridDim.y; -#endif - } - - __device__ __forceinline__ int GetLoopSize() { -#ifdef PADDLE_WITH_XPU2 - if (ReduceLastDim) { - return dim.deal_size_y; - } else { - return dim.deal_size_x; - } -#else - return 1; -#endif - } -}; - -// when reduce_type == kReduceLastDim this struct will be used -// for higher performance -struct OneDimIndexCal { - explicit OneDimIndexCal(int num) : stride(num) {} - - __device__ inline int operator()(int index) const { return index * stride; } - int stride; -}; - -// reduce config -template -struct ReduceConfig { - ReduceConfig(const std::vector& origin_reduce_dims, - const std::vector& origin_x_dim) - : 
reduce_dims_origin(origin_reduce_dims), x_dim(origin_x_dim) {} - - // get the parameters of reduceKernel - void Run() { - // step1: update the reduce_dim left_dim and x_dim - SetReduceDim(); - - // step2: get the strides of dim for reduceAny and reduceLastDim - SetStrides(); - - // step3: get the type of reduce - SetReduceType(); - - // step4: set the block and grid for launch kernel - SetBlockDim(); - } - - // when should_reduce_again is true, we need malloc temp space for temp data - void SetOutputData(Ty* y_data, - const paddle::platform::Place& place, - phi::DenseTensor* tmp) { - if (should_reduce_again) { - tmp->ResizeAndAllocate(phi::make_ddim( - {static_cast(left_num * grid.z * grid.y * sizeof(Ty))})); - output_data = tmp->mutable_data(place); - } else { - output_data = y_data; - } - } - - private: - // set reduce_dim, left_dim and update x_dim - // eg: x_dim = [2, 4, 6] origin_reduce_dims = [0, 1] - // --SetReduceDim--> x_dim = [8,6], reduce_dim = [0], left_dim = [1] - void SetReduceDim() { - std::set reduce_set; - for (auto e : reduce_dims_origin) { - auto pos = e >= 0 ? e : e + x_dim.size(); - reduce_set.insert(pos); - } - - std::vector reduce_dim_temp(reduce_set.begin(), reduce_set.end()); - std::sort(reduce_dim_temp.begin(), reduce_dim_temp.end()); - - // update reduce_dim and x_dim - std::vector x_new_dim; - - reduce_dim.push_back(reduce_dim_temp[0]); - x_new_dim.push_back(x_dim[0]); - - int idx_reduce = 1; - int num = 0; - - if (reduce_dim_temp.size() > 1) { - for (int i = 1; i < x_dim.size(); i++) { - if ((idx_reduce < reduce_dim_temp.size()) && - (i == reduce_dim_temp[idx_reduce])) { - int result = - reduce_dim_temp[idx_reduce] - reduce_dim[reduce_dim.size() - 1]; - bool is_equal = ((result - num) == 1); - if (is_equal) { - x_new_dim[x_new_dim.size() - 1] *= x_dim[i]; - num++; - } else { - reduce_dim.push_back(reduce_dim_temp[idx_reduce] - num); - x_new_dim.push_back(x_dim[i]); - } - idx_reduce++; - } else { - x_new_dim.push_back(x_dim[i]); - } - } - } else { - x_new_dim = x_dim; - } - - // update x_dim - x_dim = x_new_dim; - std::vector().swap(x_new_dim); - - std::vector reduce_dim_new; - int is_reduced = 0; - for (auto e : reduce_dim) { - is_reduced |= 1 << e; - } - - std::vector().swap(reduce_dim); - - for (int i = 0; i < x_dim.size(); i++) { - if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) { - x_new_dim.push_back(x_dim[i]); - if ((is_reduced >> i) & 1) - reduce_dim_new.push_back(x_new_dim.size() - 1); - } else { - x_new_dim[x_new_dim.size() - 1] *= x_dim[i]; - } - } - - x_dim = x_new_dim; - reduce_dim = reduce_dim_new; - - int x_rank = static_cast(x_dim.size()); - std::set left_set; - - for (int i = 0; i < x_rank; ++i) { - left_set.insert(i); - } - - for (auto e : reduce_dim) { - left_set.erase(e); - } - - left_dim.assign(left_set.begin(), left_set.end()); - - // if the last dim gets involved in reduction - reduce_last_dim = (reduce_dim.back() == x_dim.size() - 1); - } - - // set x_strides, reduce_strides, left_strides for reduceLastDim and reduceAny - // eg: x_dim = [8, 6], reduce_dim = [0], left_dim = [1] - // --SetStrides--> x_strides= [6,1], reduce_strides = [1], - // left_strides = [1] - void SetStrides() { - std::vector idx_dim; - for (int i = 0; i < x_dim.size(); i++) { - idx_dim.push_back(i); - } - - x_strides = details::GetDimStrides(x_dim, idx_dim); - reduce_strides = details::GetDimStrides(x_dim, reduce_dim); - left_strides = details::GetDimStrides(x_dim, left_dim); - reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]]; - - left_num = 
1; - if (left_dim.size()) { - left_num = left_strides[0] * x_dim[left_dim[0]]; - } - } - - // get the reduceType - // eg: x_dim = [8, 6] reduce_dim = [0] --> ReduceHigherDim -->reduceFirstDim - // x_dim = [8, 6] reduce_dim = [1] --> reduceLastDim - // x_dim = [8] reduce_dim = [0] --> reduceAll - // x_dim = [8, 6, 4, 2] reduce_dim = [0, 2] --> reduceAny - void SetReduceType() { - int rank = x_dim.size(); - int reduce_rank = reduce_dim.size(); - bool is_last_dim = - (rank == 2) && (reduce_rank == 1) && (reduce_dim[0] == 1); - if (rank == reduce_rank || is_last_dim) { -#ifdef PADDLE_WITH_XPU_KP - reduce_type = static_cast(ReduceType::kReduceAny); -#else - reduce_type = static_cast(ReduceType::kReduceLastDim); -#endif - } else if (reduce_rank == 1) { -// ReduceFirstDim and reduceSecondDim -#ifdef PADDLE_WITH_XPU_KP - if (reduce_dim[0] == 0) { - reduce_type = static_cast(ReduceType::kReduceHigherDim); - } else { - reduce_type = static_cast(ReduceType::kReduceAny); - } -#else - reduce_type = static_cast(ReduceType::kReduceHigherDim); -#endif - } else { - reduce_type = static_cast(ReduceType::kReduceAny); - } - } - -#ifndef PADDLE_WITH_XPU_KP - void SetBlockDimForReduceAny(dim3* block_dim, dim3* grid_dim) { - constexpr int min_reduce_num_per_thread = 16; - constexpr int max_reduce_num_per_thread = 256; - constexpr int max_num_threads = kps::details::kReduceMaxThread; - - // set block size. - // 1. If reduce_last_dim == true, all the threads whose threadIdx.y are same - // will process the reduction for one output. - // The number of output for one block is blockDim.y; - // 2. If reduce_last_dim == false, different threadIdx.x will process - // different reduction and gets the output separately. If it is - // necessary, it should reduce in block y. - // The number of output for one block is blockDim.x; - int block_x, block_y; - int grid_num, reduce_num_per_thread; - if (reduce_last_dim) { - block_x = details::GetBlockDim(reduce_num); - block_y = details::GetBlockDim(left_num); - block_dim->x = block_x; - block_dim->y = - std::min(block_y, static_cast(max_num_threads / block_dim->x)); - grid_num = details::AlignUp(left_num, block_dim->y); - reduce_num_per_thread = details::AlignUp(reduce_num, block_dim->x); - } else { - block_x = details::GetBlockDim(left_num); - block_y = details::GetBlockDim(reduce_num); - block_dim->x = std::min(block_x, 32); - block_dim->y = - std::min(block_y, static_cast(max_num_threads / block_dim->x)); - block_dim->x = - std::min(block_x, static_cast(max_num_threads / block_dim->y)); - grid_num = details::AlignUp(left_num, block_dim->x); - reduce_num_per_thread = details::AlignUp(reduce_num, block_dim->y); - } - int device_id = paddle::platform::GetCurrentDeviceId(); - int max_mp = paddle::platform::GetGPUMultiProcessors(device_id); - int max_threads_per_mp = - paddle::platform::GetGPUMaxThreadsPerMultiProcessor(device_id); - int max_threads = max_threads_per_mp * max_mp; - int num_threads = block_dim->x * block_dim->y; - int max_num_blocks = max_threads / num_threads; - - // set grid size. - // Whether to set grid.y larger than 1, there are 3 following rules: - // 1. The number that each thread process should no less than - // min_reduce_num_per_threadbut no more than max_reduce_num_per_thread; - // 2. It should maximize the utilization of SM. - // So we choose the minimum between input_split_num_1 and input_split_num_3 - // to make each thread process as mush data as possible. 
Meanwhile, - // the number cannot be larger than max_reduce_num_per_thread, so we - // choose the maximum between the result above and input_split_num_2. - int input_split_num_1 = - details::AlignUp(reduce_num_per_thread, min_reduce_num_per_thread); - int input_split_num_2 = - details::AlignUp(reduce_num_per_thread, max_reduce_num_per_thread); - int input_split_num_3 = details::AlignUp(max_num_blocks, grid_num); - - grid_dim->x = grid_num; - grid_dim->y = std::max(std::min(input_split_num_1, input_split_num_3), - input_split_num_2); - // if grid.y > 1, we need launch reduce kernel again. - if (grid_dim->y > 1) { - should_reduce_again = true; - } - } - - // set block and grid for launch kernel - // for ReduceHigherDim: if block is enough -> splite reduce_num - // else init block(32, 1) grid(block_num, 1) - // for others: block(block_num, 1) , grid(left_num, 1) - void SetBlockDimForHigher(dim3* block_dim, dim3* grid_dim) { - int last_dim_num = x_dim.back(); - // update left_num - int grid_z = left_num / last_dim_num; - left_num = last_dim_num; - grid_dim->z = grid_z; - int device_id = paddle::platform::GetCurrentDeviceId(); - int max_mp = paddle::platform::GetGPUMultiProcessors(device_id); - int max_threads_per_mp = - paddle::platform::GetGPUMaxThreadsPerMultiProcessor(device_id); - int max_threads = max_threads_per_mp * max_mp; - // init - int num_block = (max_threads / left_num); - block_dim->x = details::GetBlockDim(left_num); - grid_dim->x = details::AlignUp(left_num, block_dim->x); - blocking_size = reduce_num; - - if (num_block > 1 && reduce_num >= REDUCE_SPLIT_BOUNDARY) { - blocking_size = details::GetLastPow2(reduce_num / num_block); - if (blocking_size <= 1) { - blocking_size = details::GetLastPow2(sqrt(reduce_num)); - } else if (blocking_size * 2 < reduce_num) { - blocking_size *= 2; - } - should_reduce_again = true; - grid_dim->y = details::AlignUp(reduce_num, blocking_size); - } - } -#endif - - void SetBlockDim() { - // init - int block_num = details::GetBlockDim(reduce_num); - should_reduce_again = false; - dim3 block_dim(block_num, 1, 1); - dim3 grid_dim(left_num, 1, 1); - blocking_size = reduce_num; -#ifdef PADDLE_WITH_XPU_KP - if (reduce_last_dim) { - block_dim.x = 64; - block_dim.y = reduce_num; - grid_dim.x = 1; - grid_dim.y = 8; - } else { - block_dim.x = 64; - block_dim.y = left_num; - grid_dim.x = 8; - grid_dim.y = 1; - } -#else - if (reduce_type == ReduceType::kReduceHigherDim) { - SetBlockDimForHigher(&block_dim, &grid_dim); - } else { - SetBlockDimForReduceAny(&block_dim, &grid_dim); - } -#endif - - block = block_dim; - grid = grid_dim; - } - - public: - std::vector reduce_dims_origin; - std::vector reduce_dim; - std::vector x_dim; - std::vector left_dim; - std::vector x_strides; - std::vector left_strides; - std::vector reduce_strides; - - int reduce_type; - int reduce_num; - int left_num; - int blocking_size; - bool should_reduce_again; - bool reduce_last_dim; - - Ty* output_data; - - dim3 block; - dim3 grid; -}; - -// when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or -// when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this -// function will be used -template -__global__ void ReduceAnyKernel(const Tx* x, - Ty* y, - ReduceOp reducer, - TransformOp transformer, - MPType init, - int reduce_num, - int left_num, - bool reduce_last_dim, - const Calculator reduce_index_calculator, - const Calculator left_index_calculator, - const kps::DimConfig dim) { - int input_idx, left_idx, stride; - int block_size = 0; - bool need_store = true; - 
int loop_left = 0; - int tid = 0; - // the last dim gets involved in reduction - int store_offset = 0; - int stride_left = 0; - if (reduce_last_dim) { - auto block = ReduceIndexMapping(dim); - input_idx = block.BlockIdY() * block.BlockDimX(); - left_idx = block.BlockIdX() * block.BlockDimY() + THREAD_ID_Y; - stride = block.GridDimY() * block.BlockDimX(); - block_size = block.BlockDimX(); - need_store = (THREAD_ID_X == 0) && (left_idx < left_num); - store_offset = block.BlockIdY() * left_num + left_idx; - loop_left = min(block.GetLoopSize(), left_num - left_idx); - stride_left = 1; - tid = THREAD_ID_X; - } else { - auto block = ReduceIndexMapping(dim); - input_idx = block.BlockIdY() * block.BlockDimY(); - left_idx = block.BlockIdX() * block.BlockDimX() + THREAD_ID_X; - stride = block.GridDimY() * block.BlockDimY(); - block_size = block.BlockDimY(); - need_store = (THREAD_ID_Y == 0) && (left_idx < left_num); - loop_left = min(block.GetLoopSize(), left_num - left_idx); - stride_left = block.BlockDimX() * block.GridDimX(); - store_offset = block.BlockIdY() * left_num + left_idx; - tid = THREAD_ID_Y; - } - // calculate the offset, means the addr where each thread really start. - // 1. reduce for each thread - MPType input_compute[REDUCE_VEC_SIZE]; - Tx input_reg[REDUCE_VEC_SIZE]; - int input_idx_tmp = input_idx; - for (int i = 0; i < loop_left; i += stride_left) { - int input_offset = left_index_calculator(left_idx + i); - const _ptr_ Tx* input = x + input_offset; - MPType reduce_var = init; - // load REDUCE_VEC_SIZE data once, and then compute - int bound = reduce_num - (REDUCE_VEC_SIZE - 1) * stride; - input_idx = input_idx_tmp; - for (; input_idx + block_size < bound; - input_idx += REDUCE_VEC_SIZE * stride) { - kps::ReadDataReduce, - false>(&input_reg[0], - input, - input_idx, - reduce_index_calculator, - 1, - reduce_num, - 1, - stride, - kps::IdentityFunctor(), - reduce_last_dim); - kps::ElementwiseUnary( - &input_compute[0], &input_reg[0], transformer); - kps::Reduce( - &reduce_var, &input_compute[0], reducer, reduce_last_dim); - } - - kps::Init(&input_compute[0], init); - kps::ReadDataReduce(&input_compute[0], - input, - input_idx, - reduce_index_calculator, - 1, - reduce_num - input_idx, - 1, - stride, - transformer, - reduce_last_dim); - kps::Reduce( - &reduce_var, &input_compute[0], reducer, reduce_last_dim); - - kps::Reduce( - &reduce_var, &reduce_var, reducer, reduce_last_dim); - if (need_store) { - y[store_offset + i] = static_cast(reduce_var); - } - } -} - -template -__global__ void ReduceHigherDimKernel(const Tx* x, - Ty* y, - ReduceOp reducer, - TransformOp transformer, - MPType init, - int reduce_num, - int left_num, - int blocking_size, - const kps::DimConfig dim) { - // when reduce_dim.size() == 1 and reduce_dim[0] != x_dim.size() - 1, this - // function will be used - auto block = ReduceIndexMapping(dim); - int idy = block.BlockIdY() * blocking_size; - int idx = block.BlockIdX() * block.BlockDimX(); - int idz = BLOCK_ID_Z * left_num; - int stride = dim.split_num_x * dim.deal_size_x; - int size = left_num - dim.rem_x; - int loop_size = min(reduce_num - idy, blocking_size); - int store_offset = block.BlockIdY() * left_num + idz * block.GridDimY(); - int block_offset = idy * left_num + idz * reduce_num; - const _ptr_ Tx* input = x + block_offset; - Tx reduce_input; - for (; idx < size; idx += stride) { - MPType reduce_var = init; - MPType reduce_compute = init; - for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) { - kps::ReadData(&reduce_input, - input + loop_idx * 
left_num + idx, - block.BlockDimX(), - 1, - 1, - left_num); - kps::ElementwiseUnary( - &reduce_compute, &reduce_input, transformer); - kps::Reduce( - &reduce_var, &reduce_compute, reducer, false); - } - Ty result = static_cast(reduce_var); - kps::WriteData( - y + store_offset + idx, &result, block.BlockDimX()); - } - - if (idx < left_num) { - MPType reduce_var = init; - MPType reduce_compute = init; - for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) { - kps::ReadData(&reduce_input, - input + loop_idx * left_num + idx, - dim.rem_x, - 1, - 1, - left_num); - kps::ElementwiseUnary( - &reduce_compute, &reduce_input, transformer); - kps::Reduce( - &reduce_var, &reduce_compute, reducer, false); - } - Ty result = static_cast(reduce_var); - kps::WriteData( - y + store_offset + idx, &result, dim.rem_x); - } -} - -template -static void LaunchReduceKernel(const Tx* x_data, - Ty* y_data, - const ReduceOp& reducer, - const TransformOp& transform, - MPType init, - KPStream stream, - ReduceConfig config) { - if (config.reduce_type == kReduceLastDim) { - int stride_reduce = 1; - int stride_left = config.reduce_num; - // for higher performance - auto reduce_index_calculator = OneDimIndexCal(stride_reduce); - auto left_index_calculator = OneDimIndexCal(stride_left); - - kps::DimConfig dim = kps::DimConfig(config.grid.x, - config.grid.y, - config.grid.z, - config.block.x, - config.block.y, - 0); - dim.SetRem(config.reduce_num % config.block.x, 0, 0); - -#ifdef PADDLE_WITH_XPU_KP - ReduceAnyKernel<<<8, 64, 0, stream>>>( - x_data, - config.output_data, - reducer, - transform, - init, - config.reduce_num, - config.left_num, - config.reduce_last_dim, - reduce_index_calculator, - left_index_calculator, - dim); -#else - ReduceAnyKernel<<>>( - x_data, - config.output_data, - reducer, - transform, - init, - config.reduce_num, - config.left_num, - config.reduce_last_dim, - reduce_index_calculator, - left_index_calculator, - dim); -#endif - - } else { - int reduce_rank = config.reduce_strides.size(); - int left_rank = config.left_strides.size(); - auto reduce_index_calculator = IndexCalculator(reduce_rank, - config.reduce_dim, - config.reduce_strides, - config.x_strides); - auto left_index_calculator = IndexCalculator( - left_rank, config.left_dim, config.left_strides, config.x_strides); - - kps::DimConfig dim = kps::DimConfig(config.grid.x, - config.grid.y, - config.grid.z, - config.block.x, - config.block.y, - 0); - dim.SetRem(config.reduce_num % config.block.x, 0, 0); - -#ifdef PADDLE_WITH_XPU_KP - ReduceAnyKernel<<<8, 64, 0, stream>>>( - x_data, - config.output_data, - reducer, - transform, - init, - config.reduce_num, - config.left_num, - config.reduce_last_dim, - reduce_index_calculator, - left_index_calculator, - dim); -#else - ReduceAnyKernel<<>>( - x_data, - config.output_data, - reducer, - transform, - init, - config.reduce_num, - config.left_num, - config.reduce_last_dim, - reduce_index_calculator, - left_index_calculator, - dim); -#endif - } - - if (config.should_reduce_again) { - dim3 block; - dim3 grid; - if (config.reduce_last_dim) { - block = dim3(32, 1, 1); - grid = dim3(details::AlignUp(config.left_num, 32), 1, 1); - } else { - block = dim3(config.block.x, 1, 1); - grid = dim3(config.grid.x, 1, config.grid.z); - } - - auto last_index = OneDimIndexCal(1); - auto first_index = OneDimIndexCal(config.left_num); - kps::DimConfig dim = - kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0); - dim.SetRem(config.left_num % block.x, 0, 0); -#ifdef PADDLE_WITH_XPU_KP - 
ReduceHigherDimKernel< - Ty, - Ty, - MPType, - ReduceOp, - kps::IdentityFunctor><<<8, 64, 0, stream>>>( - config.output_data, - y_data, - reducer, - kps::IdentityFunctor(), - init, - config.grid.y, - config.left_num, - config.grid.y, - dim); -#else - ReduceHigherDimKernel< - Ty, - Ty, - MPType, - ReduceOp, - kps::IdentityFunctor><<>>( - config.output_data, - y_data, - reducer, - kps::IdentityFunctor(), - init, - config.grid.y, - config.left_num, - config.grid.y, - dim); -#endif - } -} - -template class ReduceOp, - typename TransformOp> -static typename std::enable_if::value, - void>::type -CubTensorReduceImpl(const Tx* x_data, - Ty* y_data, - const TransformOp& transform, - int reduce_num, - const paddle::platform::Place& place, - KPStream stream) { - auto reducer = ReduceOp(); - cub::TransformInputIterator trans_x(x_data, - transform); - size_t temp_storage_bytes = 0; - cub::DeviceReduce::Reduce(nullptr, - temp_storage_bytes, - trans_x, - y_data, - reduce_num, - reducer, - reducer.initial(), - stream); - - phi::DenseTensor tmp = phi::DenseTensor( - phi::make_intrusive(place), - phi::DenseTensorMeta( - phi::DataType::UINT8, - phi::make_ddim({static_cast(temp_storage_bytes)}))); - - auto* temp_storage = tmp.mutable_data(place); - - cub::DeviceReduce::Reduce(temp_storage, - temp_storage_bytes, - trans_x, - y_data, - reduce_num, - reducer, - reducer.initial(), - stream); -} - -template class ReduceOp, - typename TransformOp> -static typename std::enable_if::value, - void>::type -CubTensorReduceImpl(const Tx* x_data, - Ty* y_data, - const TransformOp& transform, - int reduce_num, - const paddle::platform::Place& place, - KPStream stream) { - PADDLE_THROW(phi::errors::InvalidArgument( - "Tx should not be float16 when using cub::DeviceReduce::Reduce().")); -} - -template class ReduceOp, - typename TransformOp> -void TensorReduceImpl(const phi::GPUContext& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* y, - const TransformOp& transform, - const std::vector& origin_reduce_dims, - KPStream stream) { - y->mutable_data(x.place()); - - auto x_dim = phi::vectorize(x.dims()); - auto config = ReduceConfig(origin_reduce_dims, x_dim); - config.Run(); - int numel = x.numel(); - // after config.run() - // SetOutputData for ReduceHigherDim when should_reduce_again is true, - // temp_output should be stored temp_data in output_data space or stored in - // y_data; - - phi::DDim tmp_ddim; - phi::DenseTensor tmp = phi::DenseTensor( - phi::make_intrusive(y->place()), - phi::DenseTensorMeta(y->dtype(), tmp_ddim, y->layout())); - - auto x_data = x.data(); - auto y_data = y->data(); - - if (config.reduce_num == 1) { - std::vector inputs = {&x}; - std::vector outputs = {y}; - funcs::ElementwiseKernel(dev_ctx, inputs, &outputs, transform); - return; - } - - config.SetOutputData(y_data, x.place(), &tmp); - constexpr bool kIsTxFP16 = std::is_same::value; - bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16; -#ifndef PADDLE_WITH_XPU_KP - if (use_cub_reduce) { - CubTensorReduceImpl( - x_data, y_data, transform, config.reduce_num, x.place(), stream); - return; - } -#endif - - using MPType = typename kps::details::MPTypeTrait::Type; - auto reducer = ReduceOp(); - // launch ReduceHigherDimKernel - // when reduce_dim.size() == 1 and reduce_dim[0] != x_dim.size() - 1, this - // function will be used - // eg: x_dim = {nz, ny, nx}, nx != 1, axis can be 0 or 1 - // if axis = 1 then grid.z = nz, grid.y = ny / block_size, grid.x = nx / - // 32 - // else grid.z = 1, grid.y = ny / block_size, grid.x = nx /32 - 
if (config.reduce_type == ReduceType::kReduceHigherDim) { - kps::DimConfig dim = kps::DimConfig(config.grid.x, - config.grid.y, - config.grid.z, - config.block.x, - config.blocking_size, - 0); - dim.SetRem(config.left_num % config.block.x, - config.reduce_num % config.blocking_size, - 0); - -#ifdef PADDLE_WITH_XPU_KP - ReduceHigherDimKernel, - TransformOp><<<8, 64, 0, stream>>>( - x_data, - config.output_data, - reducer, - transform, - reducer.initial(), - config.reduce_num, - config.left_num, - config.blocking_size, - dim); -#else - ReduceHigherDimKernel< - Tx, - Ty, - MPType, - ReduceOp, - TransformOp><<>>( - x_data, - config.output_data, - reducer, - transform, - reducer.initial(), - config.reduce_num, - config.left_num, - config.blocking_size, - dim); -#endif - - if (config.should_reduce_again) { - dim3 block = dim3(config.block.x, 1, 1); - dim3 grid = dim3(config.grid.x, 1, config.grid.z); - kps::DimConfig dim2 = - kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0); - dim2.SetRem(config.left_num % config.block.x, 0, 0); - -#ifdef PADDLE_WITH_XPU_KP - ReduceHigherDimKernel< - Ty, - Ty, - MPType, - ReduceOp, - kps::IdentityFunctor><<<8, 64, 0, stream>>>( - config.output_data, - y_data, - reducer, - kps::IdentityFunctor(config.grid.y), - reducer.initial(), - config.grid.y, - config.left_num, - config.grid.y, - dim2); -#else - ReduceHigherDimKernel< - Ty, - Ty, - MPType, - ReduceOp, - kps::IdentityFunctor><<>>( - config.output_data, - y_data, - reducer, - kps::IdentityFunctor(config.grid.y), - reducer.initial(), - config.grid.y, - config.left_num, - config.grid.y, - dim2); -#endif - } - return; - } - - // when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or - // when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this - // function will be used - LaunchReduceKernel, TransformOp>( - x_data, y_data, reducer, transform, reducer.initial(), stream, config); -} - -} // namespace kernels template class ReduceOp, @@ -1252,7 +32,7 @@ void Reduce(const KPDevice& dev_ctx, DataType out_dtype, DenseTensor* out) { std::vector reduce_dims = - phi::kernels::details::GetReduceDim(dims, x.dims().size(), reduce_all); + phi::funcs::details::GetReduceDim(dims, x.dims().size(), reduce_all); int reduce_num = 1; for (auto i : reduce_dims) { @@ -1271,10 +51,10 @@ void Reduce(const KPDevice& dev_ctx, "TensorReduceImpl", ([&] { using MPType = typename kps::details::MPTypeTrait::Type; - phi::kernels::TensorReduceImpl>( + phi::funcs::TensorReduceImpl>( dev_ctx, tmp_tensor, out, @@ -1284,7 +64,7 @@ void Reduce(const KPDevice& dev_ctx, })); } else { using MPType = typename kps::details::MPTypeTrait::Type; - phi::kernels::TensorReduceImpl>( + phi::funcs::TensorReduceImpl>( dev_ctx, x, out, diff --git a/paddle/phi/kernels/gpu/trace_kernel.cu b/paddle/phi/kernels/gpu/trace_kernel.cu index 7ac7c451b00..4266f0174ff 100644 --- a/paddle/phi/kernels/gpu/trace_kernel.cu +++ b/paddle/phi/kernels/gpu/trace_kernel.cu @@ -17,7 +17,7 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/diagonal.h" -#include "paddle/phi/kernels/gpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" namespace phi { @@ -34,7 +34,7 @@ void TraceKernel(const Context& ctx, auto stream = ctx.stream(); std::vector reduce_dims; reduce_dims.push_back(out->dims().size()); - kernels::TensorReduceImpl>( + funcs::TensorReduceImpl>( ctx, diag, out, kps::IdentityFunctor(), reduce_dims, stream); } else { phi::funcs::SetConstant 
functor; diff --git a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h index f2549c171dd..7c8d10e0565 100644 --- a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h @@ -60,7 +60,7 @@ struct ReduceSumForMatmulGrad { DenseTensor* output, const std::vector& reduce_dims) { auto stream = dev_ctx.stream(); - kernels::TensorReduceImpl>( + funcs::TensorReduceImpl>( dev_ctx, input, output, kps::IdentityFunctor(), reduce_dims, stream); } }; -- GitLab From b9672a1eeef8495313efcd5d4e8be2383571881c Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Fri, 4 Mar 2022 13:11:09 +0800 Subject: [PATCH 045/261] clean distribution_helper, index_impl, aligned_vector code in fluid (#40071) * clean distribution_helper, index_impl, aligned_vector code in fluid * fix conflicts --- paddle/fluid/operators/distribution_helper.h | 244 ------------------ paddle/fluid/operators/dropout_impl.cu.h | 24 +- paddle/fluid/operators/exponential_op.cc | 2 +- paddle/fluid/operators/exponential_op.cu | 6 +- paddle/fluid/operators/exponential_op.h | 2 +- .../fluid/operators/fused/attn_bias_add.cu.h | 6 +- .../operators/fused/fused_dropout_act_bias.h | 30 +-- .../operators/fused/fused_dropout_common.h | 2 +- .../fused_layernorm_residual_dropout_bias.h | 46 ++-- .../fused/fused_residual_dropout_bias.h | 41 ++- paddle/fluid/operators/gaussian_random_op.cu | 9 +- paddle/fluid/operators/gelu_op.cu | 8 +- paddle/fluid/operators/index_impl.cu.h | 6 +- paddle/fluid/operators/layer_norm_kernel.cu.h | 44 ++-- .../operators/optimizers/cast_with_ptr.h | 3 +- .../optimizers/distributed_fused_lamb_op.cu | 64 ++--- .../operators/optimizers/lars_momentum_op.cu | 4 +- paddle/fluid/operators/uniform_random_op.h | 13 +- paddle/fluid/platform/fast_divmod.h | 4 +- paddle/phi/kernels/funcs/broadcast_function.h | 8 +- .../phi/kernels/funcs/distribution_helper.h | 1 + paddle/phi/kernels/funcs/elementwise_base.h | 11 +- paddle/phi/kernels/gpu/bernoulli_kernel.cu | 3 +- paddle/phi/kernels/gpu/cast_kernel.cu | 2 +- 24 files changed, 166 insertions(+), 417 deletions(-) delete mode 100644 paddle/fluid/operators/distribution_helper.h diff --git a/paddle/fluid/operators/distribution_helper.h b/paddle/fluid/operators/distribution_helper.h deleted file mode 100644 index c13bf687af2..00000000000 --- a/paddle/fluid/operators/distribution_helper.h +++ /dev/null @@ -1,244 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
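The trace and matmul-grad hunks above only move TensorReduceImpl from phi::kernels to phi::funcs, but the call sites are easier to follow with the reduce-with-transform contract spelled out: apply a per-element transform, then reduce over the chosen dimensions. Below is a self-contained CPU sketch of that contract, under the assumption that it mirrors what the GPU launcher does; the function is illustrative, not the phi implementation.

    // Illustrative reduce-with-transform over the last axis of a row-major
    // [rows, cols] buffer: out[r] = sum_c transform(in[r * cols + c]).
    #include <cstdio>
    #include <vector>

    template <typename T, typename Transform>
    std::vector<T> ReduceLastAxis(const std::vector<T>& in, int rows, int cols,
                                  Transform transform) {
      std::vector<T> out(rows, T(0));
      for (int r = 0; r < rows; ++r) {
        for (int c = 0; c < cols; ++c) {
          out[r] += transform(in[r * cols + c]);
        }
      }
      return out;
    }

    int main() {
      const std::vector<float> x = {1, 2, 3, 4, 5, 6};  // shape [2, 3]
      // Identity transform, like the kps::IdentityFunctor used by the trace kernel above.
      const auto sums = ReduceLastAxis(x, 2, 3, [](float v) { return v; });
      std::printf("%g %g\n", sums[0], sums[1]);  // 6 15
      return 0;
    }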
*/ - -#pragma once - -#ifdef __NVCC__ -#include -#endif -#ifdef __HIPCC__ -#include -#endif - -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/core/hostdevice.h" - -#if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/phi/kernels/primitive/kernel_primitives.h" -#endif - -#if !defined(_WIN32) -#define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) -#else -// there is no equivalent intrinsics in msvc. -#define UNLIKELY(condition) (condition) -#endif - -namespace paddle { -namespace distribution { - -using Tensor = framework::Tensor; - -/********************* Transformation Function **********************/ -template -struct exponential_transform { - explicit exponential_transform(T lambda) : lambda_(lambda) {} - - HOSTDEVICE inline T operator()(T val) const { -#if defined(__NVCC__) || defined(__HIPCC__) - if (std::is_same::value) { - return static_cast(-1.0) / lambda_ * log(val); - } else { - return static_cast(-1.0) / lambda_ * __logf(val); - } -#else - return static_cast(-1.0) / lambda_ * std::log(static_cast(1.0) - val); -#endif - } - - private: - T lambda_; -}; - -template -struct uniform_transform { - explicit uniform_transform(T min, T max) : range_(max - min), min_(min) {} - - HOSTDEVICE inline T operator()(T val) const { - if (UNLIKELY(val == static_cast(1.0))) { - return min_; - } else { - return val * range_ + min_; - } - } - - private: - T range_; - T min_; -}; - -template -struct normal_transform { - explicit normal_transform(T mean, T std) : mean_(mean), std_(std) {} - - HOSTDEVICE inline T operator()(T val) const { return val * std_ + mean_; } - - private: - T mean_; - T std_; -}; - -#if defined(__NVCC__) || defined(__HIPCC__) - -namespace kps = phi::kps; - -/*********************** Distribution Function *************************/ -template -struct uniform_distribution; - -template -struct normal_distribution; - -#if defined(__NVCC__) -template <> -struct uniform_distribution { - __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const { - return curand_uniform4(state); - } - static constexpr int kReturnsCount = 4; -}; - -template <> -struct uniform_distribution { - __device__ inline double2 operator()( - curandStatePhilox4_32_10_t *state) const { - return curand_uniform2_double(state); - } - static constexpr int kReturnsCount = 2; -}; - -template <> -struct normal_distribution { - __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const { - return curand_normal4(state); - } - static constexpr int kReturnsCount = 4; -}; - -template <> -struct normal_distribution { - __device__ inline double2 operator()( - curandStatePhilox4_32_10_t *state) const { - return curand_normal2_double(state); - } - static constexpr int kReturnsCount = 2; -}; - -#else -template <> -struct uniform_distribution { - __device__ inline float4 operator()( - hiprandStatePhilox4_32_10_t *state) const { - return hiprand_uniform4(state); - } - static constexpr int kReturnsCount = 4; -}; - -template <> -struct uniform_distribution { - __device__ inline double2 operator()( - hiprandStatePhilox4_32_10_t *state) const { - return hiprand_uniform2_double(state); - } - static constexpr int kReturnsCount = 2; -}; - -template <> -struct normal_distribution { - __device__ inline float4 operator()( - hiprandStatePhilox4_32_10_t *state) const { - 
return hiprand_normal4(state); - } - static constexpr int kReturnsCount = 4; -}; - -template <> -struct normal_distribution { - __device__ inline double2 operator()( - hiprandStatePhilox4_32_10_t *state) const { - return hiprand_normal2_double(state); - } - static constexpr int kReturnsCount = 2; -}; -#endif - -/******** Launch GPU function of distribution and transformation *********/ -template -__global__ void DistributionKernel(size_t size, uint64_t seed, uint64_t offset, - DistOp dist, TransformOp trans, T *out_data, - size_t stride) { - size_t idx = static_cast(BLOCK_ID_X * BLOCK_NUM_X); - static constexpr int kCount = DistOp::kReturnsCount; -#if defined(__NVCC__) - curandStatePhilox4_32_10_t state; - curand_init(seed, idx + THREAD_ID_X, offset, &state); - using SType = curandStatePhilox4_32_10_t; -#else - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed, idx + THREAD_ID_X, offset, &state); - using SType = hiprandStatePhilox4_32_10_t; -#endif - size_t total_thread = GRID_NUM_X * BLOCK_NUM_X; - T args[kCount]; - T result[kCount]; - for (size_t i = idx; i < size; i += total_thread * kCount) { - kps::ElementwiseRandom(&args[0], dist, &state); - kps::ElementwiseUnary(&result[0], &args[0], - trans); - kps::WriteData(out_data + i, &result[0], size - i, - 1, stride, 1); - __syncthreads(); - } -} - -template -void distribution_and_transform(const platform::CUDADeviceContext &dev_ctx, - Tensor *out, DistOp dist, TransformOp trans) { - T *out_data = out->mutable_data(dev_ctx.GetPlace()); - auto size = out->numel(); - - int64_t device_id = dev_ctx.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - - size_t block_size = 256; - size_t expect_grid_size = (size + block_size - 1) / block_size; - const auto &prop = platform::GetDeviceProperties(device_id); - size_t max_grid_size = (prop.maxThreadsPerMultiProcessor / block_size) * - prop.multiProcessorCount; - size_t grid_size = - expect_grid_size > max_grid_size ? max_grid_size : expect_grid_size; - - size_t total_thread = block_size * grid_size; - size_t curand4_loop_times = - (size + 4 * total_thread - 1) / (4 * total_thread); - // 'increment' shoulde be multiple of 4 - uint64_t increment = curand4_loop_times * 4; - - auto seed_offset = gen_cuda->IncrementOffset(increment); - uint64_t seed = seed_offset.first; - uint64_t offset = seed_offset.second; - - DistributionKernel< - T, DistOp, TransformOp><<>>( - size, seed, offset, dist, trans, out_data, total_thread); -} - -#endif - -} // namespace distribution -} // namespace paddle diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index cdcf683fb92..dcdab033e8f 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -34,8 +34,8 @@ limitations under the License. 
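The distribution_and_transform helper removed above lives on as phi::funcs::distribution_and_transform (the exponential_op.cu hunk further down switches to it), and its exponential path is ordinary inverse-CDF sampling: if U is Uniform(0, 1), then -ln(1 - U) / lambda is Exp(lambda) distributed, which is what exponential_transform computes on the CPU branch. A minimal host-side sketch of that transform, using only the standard library (illustrative, not the phi code path):

    #include <cmath>
    #include <cstdio>
    #include <random>

    int main() {
      const double lambda = 2.0;
      std::mt19937_64 engine(42);
      std::uniform_real_distribution<double> uniform(0.0, 1.0);
      double sum = 0.0;
      const int n = 100000;
      for (int i = 0; i < n; ++i) {
        const double u = uniform(engine);
        // Inverse CDF of Exp(lambda); matches the -1/lambda * log(1 - u)
        // form used by exponential_transform on the CPU.
        sum += -std::log(1.0 - u) / lambda;
      }
      std::printf("sample mean %.4f vs 1/lambda %.4f\n", sum / n, 1.0 / lambda);
      return 0;
    }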
*/ #include "paddle/fluid/operators/dropout_impl_util.h" #include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/functors.h" namespace paddle { @@ -86,8 +86,8 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed, bool is_upscale_in_train, uint64_t increment) { using MT = typename details::MPTypeTrait::Type; - using LoadT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; #ifdef PADDLE_WITH_HIP int64_t idx = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; @@ -102,7 +102,7 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed, MT factor = static_cast(1.0f / (1.0f - dropout_prob)); for (int i = idx * VecSize; i < n; i += blockDim.x * gridDim.x * VecSize) { LoadT src_val; - platform::Load(&src[i], &src_val); + phi::Load(&src[i], &src_val); #ifdef PADDLE_WITH_HIP float4 rand = hiprand_uniform4(&state); @@ -126,8 +126,8 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed, } } - platform::Store(dst_val, &dst[i]); - platform::Store(mask_val, &mask[i]); + phi::Store(dst_val, &dst[i]); + phi::Store(mask_val, &mask[i]); } } @@ -153,16 +153,16 @@ __global__ void DropoutGradCUDAKernel( const typename details::MPTypeTrait::Type factor, const int64_t size, T* dx) { using MT = typename details::MPTypeTrait::Type; - using LoadT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) { LoadT dout_val; - platform::Load(&dout[i], &dout_val); + phi::Load(&dout[i], &dout_val); MaskLoadT mask_val; - platform::Load(&mask[i], &mask_val); + phi::Load(&mask[i], &mask_val); LoadT dx_val; @@ -172,7 +172,7 @@ __global__ void DropoutGradCUDAKernel( static_cast(mask_val[j]) * factor); } - platform::Store(dx_val, &dx[i]); + phi::Store(dx_val, &dx[i]); } } @@ -219,7 +219,7 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, uint64_t increment; // VectorizedRandomGenerator use curand_uniform4, so we only support // vec_size is 4; - int vec_size = (platform::GetVectorizedSize(x_data) == 4) ? 4 : 1; + int vec_size = (phi::GetVectorizedSize(x_data) == 4) ? 
4 : 1; auto gpu_config = GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size); auto offset = ((x_numel - 1) / (gpu_config.GetThreadNum() * vec_size) + 1) * vec_size; diff --git a/paddle/fluid/operators/exponential_op.cc b/paddle/fluid/operators/exponential_op.cc index ee456dcdafb..1a48a676785 100644 --- a/paddle/fluid/operators/exponential_op.cc +++ b/paddle/fluid/operators/exponential_op.cc @@ -76,7 +76,7 @@ class ExponentialKernel auto engine = gen->GetCPUEngine(); std::uniform_real_distribution uniform(0.0, 1.0); - distribution::exponential_transform trans(lambda); + phi::funcs::exponential_transform trans(lambda); for (int64_t i = 0; i < size; ++i) { out_data[i] = trans(uniform(*engine)); } diff --git a/paddle/fluid/operators/exponential_op.cu b/paddle/fluid/operators/exponential_op.cu index 8b989501e4f..d5abbf9a26a 100644 --- a/paddle/fluid/operators/exponential_op.cu +++ b/paddle/fluid/operators/exponential_op.cu @@ -26,9 +26,9 @@ class ExponentialKernel auto& dev_cxt = ctx.template device_context(); T lambda = static_cast(ctx.Attr("lambda")); - distribution::uniform_distribution dist; - distribution::exponential_transform trans(lambda); - distribution::distribution_and_transform(dev_cxt, out, dist, trans); + phi::funcs::uniform_distribution dist; + phi::funcs::exponential_transform trans(lambda); + phi::funcs::distribution_and_transform(dev_cxt, out, dist, trans); } }; diff --git a/paddle/fluid/operators/exponential_op.h b/paddle/fluid/operators/exponential_op.h index fbcabc594db..7ded174a9f4 100644 --- a/paddle/fluid/operators/exponential_op.h +++ b/paddle/fluid/operators/exponential_op.h @@ -17,7 +17,7 @@ #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/distribution_helper.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index 51cf3bce1ce..3a2de0c4a09 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -89,9 +89,9 @@ __global__ void BroadcastKernelBinary( template void LaunchBiasAddFwKernel(const platform::CUDADeviceContext& ctx, int m, int n, const T* in0, const T* in1, T* out) { - int in_vec_size = std::min(platform::GetVectorizedSize(in0), - platform::GetVectorizedSize(in1)); - int out_vec_size = std::min(4, platform::GetVectorizedSize(out)); + int in_vec_size = + std::min(phi::GetVectorizedSize(in0), phi::GetVectorizedSize(in1)); + int out_vec_size = std::min(4, phi::GetVectorizedSize(out)); int vec_size = std::min(out_vec_size, in_vec_size); int numel = m * n; diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias.h b/paddle/fluid/operators/fused/fused_dropout_act_bias.h index 994601a2f06..9f5a1bad047 100755 --- a/paddle/fluid/operators/fused/fused_dropout_act_bias.h +++ b/paddle/fluid/operators/fused/fused_dropout_act_bias.h @@ -130,17 +130,17 @@ __global__ void FusedDropoutActGrad(Functor act_grad, const T *dout, const T factor, const int64_t size, T *dx) { int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; for (int i = idx * VecSize; i < size; i += 
blockDim.x * gridDim.x * VecSize) { LoadT dout_vec; LoadT src_vec; MaskLoadT mask_vec; - platform::Load(&dout[i], &dout_vec); - platform::Load(&mask[i], &mask_vec); - platform::Load(&src[i], &src_vec); + phi::Load(&dout[i], &dout_vec); + phi::Load(&mask[i], &mask_vec); + phi::Load(&src[i], &src_vec); StoreT dx_vec; #pragma unroll @@ -148,7 +148,7 @@ __global__ void FusedDropoutActGrad(Functor act_grad, const T *dout, T tmp = dout_vec[ii] * static_cast(mask_vec[ii]) * factor; dx_vec[ii] = tmp * act_grad.UseOut(src_vec[ii]); } - platform::Store(dx_vec, &dx[i]); + phi::Store(dx_vec, &dx[i]); } } @@ -167,9 +167,9 @@ __global__ void FusedDropoutActBiasGrad(Functor act_grad, const T *dout, T *dx, T *dbias) { int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x; - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; T tmp_sum[VecSize] = {static_cast(0)}; // calculate the dx and temporary sum if (col_id * VecSize < cols) { @@ -180,10 +180,10 @@ __global__ void FusedDropoutActBiasGrad(Functor act_grad, const T *dout, LoadT bias_vec; MaskLoadT mask_vec; - platform::Load(&dout[index], &dout_vec); - platform::Load(&src[index], &src_vec); - platform::Load(&mask[index], &mask_vec); - platform::Load(&bias[col_id * VecSize], &bias_vec); + phi::Load(&dout[index], &dout_vec); + phi::Load(&src[index], &src_vec); + phi::Load(&mask[index], &mask_vec); + phi::Load(&bias[col_id * VecSize], &bias_vec); StoreT dx_vec; #pragma unroll @@ -194,7 +194,7 @@ __global__ void FusedDropoutActBiasGrad(Functor act_grad, const T *dout, dx_vec[i] = val; tmp_sum[i] += val; } - platform::Store(dx_vec, &dx[index]); + phi::Store(dx_vec, &dx[index]); } } diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index f79277e4e8f..6bf3a7114f4 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -21,11 +21,11 @@ limitations under the License. 
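Nearly every hunk in this patch rewrites the same pattern: load a fixed-width vector with phi::Load, transform each lane, write it back with phi::Store, so that one aligned transaction replaces several scalar accesses. A stripped-down CUDA sketch of that pattern follows; VecType is a hand-rolled stand-in for phi::AlignedVector, not the library type, and VecSize is assumed to be 1, 2 or 4 so the alignment stays a power of two.

    #include <cuda_runtime.h>

    // Minimal vectorized elementwise scale, in the spirit of the
    // phi::Load / phi::Store kernels above.
    template <typename T, int VecSize>
    struct alignas(sizeof(T) * VecSize) VecType {
      T val[VecSize];
    };

    template <typename T, int VecSize>
    __global__ void VectorizedScale(const T* src, T* dst, T factor, int64_t n) {
      int64_t idx =
          (static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x) * VecSize;
      int64_t stride = static_cast<int64_t>(blockDim.x) * gridDim.x * VecSize;
      for (int64_t i = idx; i + VecSize <= n; i += stride) {
        // One VecSize-wide load instead of VecSize scalar loads.
        VecType<T, VecSize> v =
            *reinterpret_cast<const VecType<T, VecSize>*>(src + i);
    #pragma unroll
        for (int j = 0; j < VecSize; ++j) {
          v.val[j] *= factor;
        }
        *reinterpret_cast<VecType<T, VecSize>*>(dst + i) = v;
      }
      // The n % VecSize tail is assumed to be handled by a scalar pass,
      // as the real kernels do through their vec_size == 1 dispatch.
    }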
*/ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" -#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/functors.h" namespace paddle { diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index ceba3accca7..d53a24a57e3 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -42,12 +42,12 @@ __device__ void CalcLayernormY( const LayerNormScaleBiasT *bias, const T *x, T *y, const int row_id, const int col_id, const int cols, const LayerNormParamType mean_val, const LayerNormParamType invvar) { - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using LoadU = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using LoadU = phi::AlignedVector; using LoadScaleOrBias = - platform::AlignedVector, - VecSize>; + phi::AlignedVector, + VecSize>; for (int i = col_id * VecSize; i < cols; i += blockDim.x * VecSize) { LoadScaleOrBias scale_vec; LoadScaleOrBias bias_vec; @@ -60,15 +60,15 @@ __device__ void CalcLayernormY( static_cast>(0); } // vectorize load data from global - platform::Load(&x[row_id * cols + i], &x_vec); + phi::Load(&x[row_id * cols + i], &x_vec); if (scale != nullptr) { - platform::Load, - VecSize>(&scale[i], &scale_vec); + phi::Load, VecSize>( + &scale[i], &scale_vec); } if (bias != nullptr) { - platform::Load, - VecSize>(&bias[i], &bias_vec); + phi::Load, VecSize>( + &bias[i], &bias_vec); } StoreT y_vec; @@ -78,7 +78,7 @@ __device__ void CalcLayernormY( (static_cast(x_vec[ii]) - mean_val) * invvar + static_cast(bias_vec[ii])); } - platform::Store(y_vec, &y[row_id * cols + i]); + phi::Store(y_vec, &y[row_id * cols + i]); } } @@ -190,9 +190,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( const ScaleT *__restrict__ beta_ptr, MaskType *__restrict__ mask_out_ptr, U *__restrict__ mean_out_ptr, U *__restrict__ var_out_ptr, T *__restrict__ residual_out_ptr, T *__restrict__ y_ptr) { - using Vec = platform::AlignedVector; - using Vec_scale = platform::AlignedVector; - using MaskStoreT = platform::AlignedVector; + using Vec = phi::AlignedVector; + using Vec_scale = phi::AlignedVector; + using MaskStoreT = phi::AlignedVector; const int tidx = threadIdx.x; const int bidx = blockIdx.x; @@ -214,8 +214,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( Vec_scale beta[LDGS]; #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Load(gamma_ptr + col * VecSize, &gamma[it]); - platform::Load(beta_ptr + col * VecSize, &beta[it]); + phi::Load(gamma_ptr + col * VecSize, &gamma[it]); + phi::Load(beta_ptr + col * VecSize, &beta[it]); col += THREADS_PER_ROW; } @@ -225,10 +225,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( Vec residual[LDGS]; #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, - &x[it]); - platform::Load( - residual_ptr + 
row * LN_NUM_COLS + col * VecSize, &residual[it]); + phi::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]); + phi::Load(residual_ptr + row * LN_NUM_COLS + col * VecSize, + &residual[it]); col += THREADS_PER_ROW; } @@ -270,9 +269,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( // store dropout_residual_out and mask_out #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Store( + phi::Store( x[it], residual_out_ptr + row * LN_NUM_COLS + col * VecSize); - platform::Store( + phi::Store( mask_vec[it], mask_out_ptr + row * LN_NUM_COLS + col * VecSize); col += THREADS_PER_ROW; } @@ -333,8 +332,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Store(x[it], - y_ptr + row * LN_NUM_COLS + col * VecSize); + phi::Store(x[it], y_ptr + row * LN_NUM_COLS + col * VecSize); col += THREADS_PER_ROW; } } diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index 1b135ad6098..1d3085a013f 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -32,9 +32,9 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( const T *__restrict__ bias, T *dst, MaskType *mask, const bool is_test, typename details::MPTypeTrait::Type *mean_val, typename details::MPTypeTrait::Type *var_val, Functor act_func) { - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using MaskStoreT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using MaskStoreT = phi::AlignedVector; using U = typename details::MPTypeTrait::Type; LoadT src_vec; @@ -46,14 +46,13 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( residual_vec[ii] = static_cast(0); } // vectorize load data from global - platform::Load(&src[row_id * cols + col_id], &src_vec); + phi::Load(&src[row_id * cols + col_id], &src_vec); if (residual) { - platform::Load(&residual[row_id * cols + col_id], - &residual_vec); + phi::Load(&residual[row_id * cols + col_id], &residual_vec); } if (bias) { - platform::Load(&bias[col_id], &bias_vec); + phi::Load(&bias[col_id], &bias_vec); } MaskStoreT mask_vec; @@ -89,9 +88,9 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( } // store result to global - platform::Store(dest_vec, &dst[row_id * cols + col_id]); + phi::Store(dest_vec, &dst[row_id * cols + col_id]); if (!is_test) { - platform::Store(mask_vec, &mask[row_id * cols + col_id]); + phi::Store(mask_vec, &mask[row_id * cols + col_id]); } } @@ -176,21 +175,21 @@ __global__ void FusedResidualDropoutGrad(const T *dout, const MaskType *mask, T *dx) { int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) { LoadT dout_vec; MaskLoadT mask_vec; - platform::Load(&dout[i], &dout_vec); - platform::Load(&mask[i], &mask_vec); + phi::Load(&dout[i], &dout_vec); + phi::Load(&mask[i], &mask_vec); StoreT dx_vec; #pragma unroll for (int ii = 0; ii < VecSize; ii++) { dx_vec[ii] = dout_vec[ii] * static_cast(mask_vec[ii]) * factor; } - 
platform::Store(dx_vec, &dx[i]); + phi::Store(dx_vec, &dx[i]); } } @@ -209,9 +208,9 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout, T *dbias) { int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x; - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; T tmp_sum[VecSize] = {static_cast(0)}; // calculate the dx and temporary sum @@ -221,8 +220,8 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout, LoadT out_vec; MaskLoadT mask_vec; StoreT dx_vec; - platform::Load(&dout[index], &out_vec); - platform::Load(&mask[index], &mask_vec); + phi::Load(&dout[index], &out_vec); + phi::Load(&mask[index], &mask_vec); #pragma unroll for (int i = 0; i < VecSize; i++) { @@ -230,7 +229,7 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout, tmp_sum[i] += out_vec[i]; } - platform::Store(dx_vec, &dx[index]); + phi::Store(dx_vec, &dx[index]); } } diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index d419bd70e67..717ec774414 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -19,9 +19,10 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/distribution_helper.h" #include "paddle/fluid/operators/fill_constant_op.h" -#include "paddle/fluid/operators/index_impl.cu.h" + +#include "paddle/phi/kernels/funcs/distribution_helper.h" +#include "paddle/phi/kernels/funcs/index_impl.cu.h" DECLARE_bool(use_curand); @@ -79,10 +80,10 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { int64_t gen_offset = size * seed_offset.second; auto func = GaussianGenerator(mean, std, seed_offset.first, seed_offset.second); - IndexKernel>(dev_cxt, tensor, func); + phi::IndexKernel>(dev_cxt, tensor, func); } else { auto func = GaussianGenerator(mean, std, seed); - IndexKernel>(dev_cxt, tensor, func); + phi::IndexKernel>(dev_cxt, tensor, func); } } }; diff --git a/paddle/fluid/operators/gelu_op.cu b/paddle/fluid/operators/gelu_op.cu index 6b778eee434..ef836ab72f0 100644 --- a/paddle/fluid/operators/gelu_op.cu +++ b/paddle/fluid/operators/gelu_op.cu @@ -58,7 +58,7 @@ static __global__ void FP16FastGeluFwdCUDAKernel(const __half* x, __half* y, static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; for (; offset < n; offset += stride) { - using ArrT = platform::AlignedVector<__half, VecSize>; + using ArrT = phi::AlignedVector<__half, VecSize>; ArrT in_arr = *reinterpret_cast(x + offset); #pragma unroll for (int i = 0; i < VecSize; ++i) { @@ -77,7 +77,7 @@ static __global__ void FP16FastGeluBwdCUDAKernel(const __half* x, static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; for (; offset < n; offset += stride) { - using ArrT = platform::AlignedVector<__half, VecSize>; + using ArrT = phi::AlignedVector<__half, VecSize>; ArrT x_in_arr = *reinterpret_cast(x + offset); ArrT y_g_in_arr = *reinterpret_cast(y_g + offset); #pragma unroll @@ -103,7 +103,7 @@ static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel( #define PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(__vec_size, __use_fast_math) \ do { \ 
constexpr auto kAlignment = \ - alignof(platform::AlignedVector<__half, __vec_size>); \ + alignof(phi::AlignedVector<__half, __vec_size>); \ if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ is_aligned(y, kAlignment)) { \ size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ @@ -138,7 +138,7 @@ static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel( #define PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(__vec_size, __use_fast_math) \ do { \ constexpr auto kAlignment = \ - alignof(platform::AlignedVector<__half, __vec_size>); \ + alignof(phi::AlignedVector<__half, __vec_size>); \ if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ is_aligned(x, kAlignment) && is_aligned(y_g, kAlignment) && \ is_aligned(x_g, kAlignment)) { \ diff --git a/paddle/fluid/operators/index_impl.cu.h b/paddle/fluid/operators/index_impl.cu.h index 2e3e6569ef5..bb26e2f445e 100644 --- a/paddle/fluid/operators/index_impl.cu.h +++ b/paddle/fluid/operators/index_impl.cu.h @@ -19,11 +19,11 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/distribution_helper.h" #include "paddle/fluid/operators/fill_constant_op.h" -#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/primitive/kernel_primitives.h" namespace paddle { @@ -58,7 +58,7 @@ void IndexKernel(const KPDevice &dev_ctx, Tensor *out, Functor func) { int numel = out->numel(); T *out_data = out->mutable_data(dev_ctx.GetPlace()); if (numel <= 0) return; - int vec_size = paddle::platform::GetVectorizedSize(out_data); + int vec_size = phi::GetVectorizedSize(out_data); #ifdef PADDLE_WITH_XPU_KP int block = 64; int grid = 8; diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index 62c21dd2eee..412ae3c49b5 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -22,10 +22,10 @@ limitations under the License. 
*/ namespace cub = hipcub; #endif -#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" namespace paddle { namespace operators { @@ -186,8 +186,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel( const ScaleT *__restrict__ gamma_ptr, const ScaleT *__restrict__ beta_ptr, U *__restrict__ mean_out_ptr, U *__restrict__ var_out_ptr, T *__restrict__ y_ptr) { - using Vec = platform::AlignedVector; - using Vec_scale = platform::AlignedVector; + using Vec = phi::AlignedVector; + using Vec_scale = phi::AlignedVector; const int tidx = threadIdx.x; const int bidx = blockIdx.x; @@ -203,8 +203,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel( Vec_scale beta[LDGS]; #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Load(gamma_ptr + col * VecSize, &gamma[it]); - platform::Load(beta_ptr + col * VecSize, &beta[it]); + phi::Load(gamma_ptr + col * VecSize, &gamma[it]); + phi::Load(beta_ptr + col * VecSize, &beta[it]); col += THREADS_PER_ROW; } @@ -213,8 +213,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel( Vec x[LDGS]; #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, - &x[it]); + phi::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]); col += THREADS_PER_ROW; } U xf[LDGS * VecSize]; @@ -276,8 +275,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel( #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Store(x[it], - y_ptr + row * LN_NUM_COLS + col * VecSize); + phi::Store(x[it], y_ptr + row * LN_NUM_COLS + col * VecSize); col += THREADS_PER_ROW; } } @@ -401,9 +399,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( U *__restrict__ dgamma_temp_ptr, U *__restrict__ dbeta_temp_ptr, T *__restrict__ dx_ptr, const MaskType *mask_ptr = nullptr, T factor = static_cast(0), T *d_dropout_src_ptr = nullptr) { - using Vec = platform::AlignedVector; - using Vec_scale = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using Vec = phi::AlignedVector; + using Vec_scale = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; const int tidx = threadIdx.x; const int bidx = blockIdx.x; @@ -439,7 +437,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( int col = c; #pragma unroll for (int it = 0; it < LDGS; it++) { - platform::Load(gamma_ptr + col * VecSize, &gamma[it]); + phi::Load(gamma_ptr + col * VecSize, &gamma[it]); col += THREADS_PER_ROW; } @@ -452,12 +450,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( int col = c; #pragma unroll for (int it = 0; it < LDGS; it++) { - platform::Load(dout_ptr + row * LN_NUM_COLS + col * VecSize, - &dout[it]); - platform::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, - &x[it]); + phi::Load(dout_ptr + row * LN_NUM_COLS + col * VecSize, + &dout[it]); + phi::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]); if (isFusedDropoutResidualLn) { - platform::Load( + phi::Load( mask_ptr + row * LN_NUM_COLS + col * VecSize, &mask_vec[it]); } @@ -552,10 +549,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( col = c; #pragma unroll for (int it = 0; it < LDGS; it++) { - platform::Store(x[it], - dx_ptr + row * LN_NUM_COLS + col * 
VecSize); + phi::Store(x[it], dx_ptr + row * LN_NUM_COLS + col * VecSize); if (isFusedDropoutResidualLn) { - platform::Store( + phi::Store( dout[it], d_dropout_src_ptr + row * LN_NUM_COLS + col * VecSize); } col += THREADS_PER_ROW; @@ -641,7 +637,7 @@ template < __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel( const int rows, U *__restrict__ dg_part_, U *__restrict__ db_part_, ScaleT *__restrict__ dg_, ScaleT *__restrict__ db_) { - using Vec = platform::AlignedVector; + using Vec = phi::AlignedVector; static_assert(VEC_COLS == LN_NUM_COLS / VecSize, ""); const int tidx = threadIdx.x; @@ -669,8 +665,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel( for (int row = r; row < rows; row += ROWS_PER_CTA) { Vec dg; Vec db; - platform::Load(dg_part_ptr, &dg); - platform::Load(db_part_ptr, &db); + phi::Load(dg_part_ptr, &dg); + phi::Load(db_part_ptr, &db); dg_part_ptr += ROWS_PER_CTA * LN_NUM_COLS; db_part_ptr += ROWS_PER_CTA * LN_NUM_COLS; diff --git a/paddle/fluid/operators/optimizers/cast_with_ptr.h b/paddle/fluid/operators/optimizers/cast_with_ptr.h index ab8b4f2b8f4..a3fbb0e59e2 100644 --- a/paddle/fluid/operators/optimizers/cast_with_ptr.h +++ b/paddle/fluid/operators/optimizers/cast_with_ptr.h @@ -57,8 +57,7 @@ static void LaunchCastKernel(const platform::CUDADeviceContext &ctx, PADDLE_ENFORCE_NE( static_cast(x), static_cast(y), platform::errors::InvalidArgument("Inplace cast is not supported yet.")); - int vec_size = - std::min(platform::GetVectorizedSize(x), platform::GetVectorizedSize(y)); + int vec_size = std::min(phi::GetVectorizedSize(x), phi::GetVectorizedSize(y)); switch (vec_size) { case 4: return details::VecCastKernel(ctx, x, y, n); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index 8bb4606ffff..5b60f65442b 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -19,11 +19,11 @@ #include "paddle/fluid/operators/optimizers/distributed_fused_lamb_op.h" #include "paddle/fluid/operators/optimizers/multi_tensor_apply.h" #include "paddle/fluid/operators/tensor_to_string.h" -#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/string/string_helper.h" #include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" #ifdef __NVCC__ #include "cub/cub.cuh" @@ -66,8 +66,8 @@ struct L2NormFunctor { int i; for (i = threadIdx.x * VecSize; i + VecSize <= size; i += (BlockDim * VecSize)) { - platform::AlignedVector tmp_vec; - platform::Load(ptr + i, &tmp_vec); + phi::AlignedVector tmp_vec; + phi::Load(ptr + i, &tmp_vec); #pragma unroll for (int j = 0; j < VecSize; ++j) { auto tmp = static_cast(tmp_vec[j]); @@ -111,9 +111,9 @@ static int GetChunkedVecSize(const T *ptr, int chunk_size) { constexpr int max_load_bits = 128; int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T); auto address = reinterpret_cast(ptr); - constexpr int vec8 = alignof(platform::AlignedVector); - constexpr int vec4 = alignof(platform::AlignedVector); - constexpr int vec2 = alignof(platform::AlignedVector); + constexpr int vec8 = alignof(phi::AlignedVector); + constexpr int vec4 = alignof(phi::AlignedVector); + constexpr int vec2 = alignof(phi::AlignedVector); chunk_size *= sizeof(T); if (address % vec8 == 0 && chunk_size % vec8 
== 0) { return std::min(8, valid_vec_size); @@ -316,15 +316,15 @@ static __global__ void ScaleCUDAKernel(const T1 *__restrict__ x, int stride = blockDim.x * gridDim.x * VecSize; for (; i + VecSize <= num; i += stride) { - platform::AlignedVector x_vec; - platform::AlignedVector y_vec; + phi::AlignedVector x_vec; + phi::AlignedVector y_vec; - platform::Load(x + i, &x_vec); + phi::Load(x + i, &x_vec); #pragma unroll for (int j = 0; j < VecSize; ++j) { y_vec[j] = static_cast(static_cast(x_vec[j]) * s); } - platform::Store(y_vec, y + i); + phi::Store(y_vec, y + i); } for (; i < num; ++i) { @@ -410,24 +410,24 @@ static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel( int stride = blockDim.x * gridDim.x * VecSize; for (; i + VecSize <= num; i += stride) { - platform::AlignedVector param_vec; - platform::AlignedVector grad_vec; - platform::AlignedVector mom1_vec; - platform::AlignedVector mom2_vec; - platform::AlignedVector trust_ratio_div_vec; + phi::AlignedVector param_vec; + phi::AlignedVector grad_vec; + phi::AlignedVector mom1_vec; + phi::AlignedVector mom2_vec; + phi::AlignedVector trust_ratio_div_vec; T cur_weight_decay = (i < weight_decay_end_numel) * weight_decay; if (cur_weight_decay != static_cast(0.0)) { - platform::Load(param_p + i, ¶m_vec); + phi::Load(param_p + i, ¶m_vec); } else { #pragma unroll for (int j = 0; j < VecSize; ++j) { param_vec[j] = static_cast(0); } } - platform::Load(grad_p + i, &grad_vec); - platform::Load(mom1_p + i, &mom1_vec); - platform::Load(mom2_p + i, &mom2_vec); + phi::Load(grad_p + i, &grad_vec); + phi::Load(mom1_p + i, &mom1_vec); + phi::Load(mom2_p + i, &mom2_vec); #define PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(__param, __grad, __mom1, __mom2, \ __trust_ratio_div, __idx) \ @@ -450,9 +450,9 @@ static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel( mom2_vec, trust_ratio_div_vec, j); } - platform::Store(mom1_vec, mom1_p + i); - platform::Store(mom2_vec, mom2_p + i); - platform::Store(trust_ratio_div_vec, trust_ratio_div_p + i); + phi::Store(mom1_vec, mom1_p + i); + phi::Store(mom2_vec, mom2_p + i); + phi::Store(trust_ratio_div_vec, trust_ratio_div_p + i); } for (; i < num; ++i) { @@ -632,29 +632,29 @@ struct LambUpdateParamAndBetaPowsFunctor { trust_ratio_div += offset; for (i = threadIdx.x * VecSize; i + VecSize <= size; i += stride) { - platform::AlignedVector trust_ratio_div_vec; - platform::Load(trust_ratio_div + i, &trust_ratio_div_vec); + phi::AlignedVector trust_ratio_div_vec; + phi::Load(trust_ratio_div + i, &trust_ratio_div_vec); if (HasMasterParam) { - platform::AlignedVector master_param_vec; - platform::Load(master_param + i, &master_param_vec); - platform::AlignedVector param_vec; + phi::AlignedVector master_param_vec; + phi::Load(master_param + i, &master_param_vec); + phi::AlignedVector param_vec; #pragma unroll for (int j = 0; j < VecSize; ++j) { MT p = master_param_vec[j] - ratio * trust_ratio_div_vec[j]; master_param_vec[j] = p; param_vec[j] = static_cast(p); } - platform::Store(master_param_vec, master_param + i); - platform::Store(param_vec, param + i); + phi::Store(master_param_vec, master_param + i); + phi::Store(param_vec, param + i); } else { - platform::AlignedVector param_vec; - platform::Load(param + i, ¶m_vec); + phi::AlignedVector param_vec; + phi::Load(param + i, ¶m_vec); #pragma unroll for (int j = 0; j < VecSize; ++j) { MT p = static_cast(param_vec[j]) - ratio * trust_ratio_div_vec[j]; param_vec[j] = static_cast(p); } - platform::Store(param_vec, param + i); + phi::Store(param_vec, param + i); } } diff 
--git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu index df5da1b7953..fe5cd066864 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -88,8 +88,8 @@ __device__ inline void VectorizeLarsUpdate( T* param_out, MT* velocity_out, const MT mu, MT local_lr, const MT lars_weight_decay, const MT rescale_grad, const int tid, const int grid_stride, const int numel, MT* master_param_out = nullptr) { - using VecType = paddle::platform::AlignedVector; - using VecMType = paddle::platform::AlignedVector; + using VecType = phi::AlignedVector; + using VecMType = phi::AlignedVector; int main = numel >> (VecSize >> 1); int tail_offset = main * VecSize; diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index a864c48ad75..b941dc21c3a 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -25,8 +25,9 @@ DECLARE_bool(use_curand); #include #include #include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/operators/index_impl.cu.h" #include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" +#include "paddle/phi/kernels/funcs/index_impl.cu.h" #endif namespace paddle { @@ -206,21 +207,21 @@ void UniformRandom(const framework::ExecutionContext& context, if (gen_cuda->GetIsInitPy() && seed_flag) { if (FLAGS_use_curand) { using MT = typename details::MPTypeTrait::Type; - distribution::uniform_distribution dist; - distribution::uniform_transform trans(min, max); - distribution::distribution_and_transform(dev_cxt, tensor, dist, trans); + phi::funcs::uniform_distribution dist; + phi::funcs::uniform_real_transform trans(min, max); + phi::funcs::distribution_and_transform(dev_cxt, tensor, dist, trans); } else { auto seed_offset = gen_cuda->IncrementOffset(1); int64_t gen_offset = size * seed_offset.second; auto func = UniformGeneratorOffset(min, max, seed_offset.first, diag_num, diag_step, diag_val, gen_offset); - IndexKernel>(dev_cxt, tensor, func); + phi::IndexKernel>(dev_cxt, tensor, func); } } else { auto func = UniformGenerator(min, max, seed, diag_num, diag_step, diag_val); - IndexKernel>(dev_cxt, tensor, func); + phi::IndexKernel>(dev_cxt, tensor, func); } } #endif diff --git a/paddle/fluid/platform/fast_divmod.h b/paddle/fluid/platform/fast_divmod.h index f26c4fdd17a..39eefab774d 100644 --- a/paddle/fluid/platform/fast_divmod.h +++ b/paddle/fluid/platform/fast_divmod.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once #include -#include "paddle/fluid/platform/aligned_vector.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" #define INT_BITS 32 @@ -25,7 +25,7 @@ namespace platform { struct FastDivMod { // 1st value represents the result of input number divides by recorded divisor // 2nd value represents the result of input number modulo by recorded divisor - using DivModT = AlignedVector; + using DivModT = phi::AlignedVector; FastDivMod() {} HOSTDEVICE FastDivMod(uint32_t d) : divisor(d) { diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index e9fd4cf47b8..aab31cfbd55 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -493,16 +493,14 @@ void BroadcastKernelForDifferentVecSize( "%d-th output tensor`s shape is not.", i)); out_vec_size = std::min( - paddle::platform::GetVectorizedSize((*outs)[i]->data()), - out_vec_size); + phi::GetVectorizedSize((*outs)[i]->data()), out_vec_size); } } else { - out_vec_size = - paddle::platform::GetVectorizedSize((*outs)[0]->data()); + out_vec_size = phi::GetVectorizedSize((*outs)[0]->data()); } for (auto *in : ins) { - auto temp_size = paddle::platform::GetVectorizedSize(in->data()); + auto temp_size = phi::GetVectorizedSize(in->data()); in_vec_size = in->dims() == (*outs)[0]->dims() ? std::min(temp_size, in_vec_size) : in_vec_size; diff --git a/paddle/phi/kernels/funcs/distribution_helper.h b/paddle/phi/kernels/funcs/distribution_helper.h index f0793fb9d27..3ef39dc55d1 100644 --- a/paddle/phi/kernels/funcs/distribution_helper.h +++ b/paddle/phi/kernels/funcs/distribution_helper.h @@ -28,6 +28,7 @@ limitations under the License. */ #include "paddle/phi/core/hostdevice.h" #if defined(__NVCC__) || defined(__HIPCC__) +#include "paddle/phi/kernels/funcs/index_impl.cu.h" #include "paddle/phi/kernels/primitive/kernel_primitives.h" #endif diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index 235dbdd40f6..332ec0b0312 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -23,9 +23,9 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/math_function.h" #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) -#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/function_traits.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/primitive/kernel_primitives.h" #define HOSTDEVICE __host__ __device__ @@ -546,9 +546,8 @@ struct VecSizeGetter { const ArgsT &args, int *vec_size) { using Type = std::tuple_element_t; - *vec_size = std::min( - *vec_size, - paddle::platform::GetVectorizedSize(ins[Index]->data())); + *vec_size = std::min(*vec_size, + phi::GetVectorizedSize(ins[Index]->data())); } }; @@ -563,8 +562,8 @@ int GetVectorizedSizeForTensors(const std::vector &ins, // The Arg VecSize=1 is to match the Unroller template. 
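The broadcast and elementwise launchers in these hunks pick a vector width by probing the alignment of every input and output pointer and taking the minimum, so a single misaligned tensor forces a narrower or scalar kernel. A small standalone illustration of that selection rule (the helper below is a simplification, not phi::GetVectorizedSize itself):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Pick the widest vector a pointer's alignment allows.
    template <typename T>
    int VecWidthFor(const T* ptr) {
      auto addr = reinterpret_cast<std::uintptr_t>(ptr);
      if (addr % (sizeof(T) * 4) == 0) return 4;
      if (addr % (sizeof(T) * 2) == 0) return 2;
      return 1;
    }

    int main() {
      std::vector<float> in(1024), out(1024);
      // Minimum across all tensors involved, as the launchers above do for ins and outs.
      int aligned = std::min(VecWidthFor(in.data()), VecWidthFor(out.data()));
      // Offsetting one pointer by a single element usually drops the width.
      int offset = std::min(VecWidthFor(in.data() + 1), VecWidthFor(out.data()));
      std::printf("aligned width: %d, offset-by-one width: %d\n", aligned, offset);
      return 0;
    }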
Unroller::step(ins, arg, &vec_size); for (auto iter = outs.begin(); iter != outs.end(); ++iter) { - vec_size = std::min( - vec_size, paddle::platform::GetVectorizedSize((*iter)->data())); + vec_size = + std::min(vec_size, phi::GetVectorizedSize((*iter)->data())); } return vec_size; } diff --git a/paddle/phi/kernels/gpu/bernoulli_kernel.cu b/paddle/phi/kernels/gpu/bernoulli_kernel.cu index 2b6140d2fde..79d8a7b0f34 100644 --- a/paddle/phi/kernels/gpu/bernoulli_kernel.cu +++ b/paddle/phi/kernels/gpu/bernoulli_kernel.cu @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/phi/kernels/bernoulli_kernel.h" + #include #include #ifdef __NVCC__ @@ -28,7 +30,6 @@ #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/bernoulli_kernel.h" #include "paddle/phi/kernels/funcs/distribution_helper.h" // See Note [ Why still include the fluid headers? ] diff --git a/paddle/phi/kernels/gpu/cast_kernel.cu b/paddle/phi/kernels/gpu/cast_kernel.cu index 569a46f56d5..542234c80b5 100644 --- a/paddle/phi/kernels/gpu/cast_kernel.cu +++ b/paddle/phi/kernels/gpu/cast_kernel.cu @@ -20,11 +20,11 @@ #include "paddle/phi/kernels/funcs/elementwise_base.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_helper.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" namespace phi { -- GitLab From d9dd840f09d53ba3c1f25f90ff5a6a333c9f4a31 Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Fri, 4 Mar 2022 13:28:48 +0800 Subject: [PATCH 046/261] Publish header files for out user (#40150) --- paddle/fluid/memory/detail/buddy_allocator.cc | 1 + paddle/phi/backends/callback_manager.h | 2 -- paddle/phi/backends/device_base.cc | 1 + paddle/phi/backends/event.h | 3 ++- paddle/phi/backends/stream.h | 3 ++- python/setup.py.in | 6 +++++- 6 files changed, 11 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index cdaa2b7b1df..076a9613961 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -26,6 +26,7 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb); #endif #include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace memory { diff --git a/paddle/phi/backends/callback_manager.h b/paddle/phi/backends/callback_manager.h index a15cb075668..359958b7c93 100644 --- a/paddle/phi/backends/callback_manager.h +++ b/paddle/phi/backends/callback_manager.h @@ -30,8 +30,6 @@ #include #include // NOLINT -#include "paddle/fluid/platform/enforce.h" - namespace phi { namespace stream { diff --git a/paddle/phi/backends/device_base.cc b/paddle/phi/backends/device_base.cc index 6f634c58af0..14fe90192e5 100644 --- a/paddle/phi/backends/device_base.cc +++ b/paddle/phi/backends/device_base.cc @@ -14,6 +14,7 @@ #include "paddle/phi/backends/device_base.h" #include "gflags/gflags.h" +#include "paddle/phi/core/enforce.h" DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_uint64(initial_gpu_memory_in_mb); diff --git a/paddle/phi/backends/event.h b/paddle/phi/backends/event.h index f2e86343f8f..0866adcf39a 100644 
--- a/paddle/phi/backends/event.h
+++ b/paddle/phi/backends/event.h
@@ -13,7 +13,8 @@ // limitations under the License.
 #pragma once
-#include "paddle/fluid/platform/place.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/macros.h"
 namespace phi {
diff --git a/paddle/phi/backends/stream.h b/paddle/phi/backends/stream.h
index 6c26ab3c2d5..d1578c90ec1 100644
--- a/paddle/phi/backends/stream.h
+++ b/paddle/phi/backends/stream.h
@@ -14,8 +14,9 @@
 #pragma once
-#include "paddle/fluid/platform/place.h"
 #include "paddle/phi/backends/callback_manager.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/macros.h"
 namespace phi {
diff --git a/python/setup.py.in b/python/setup.py.in
index 91580614fa9..0bc32cfbc00 100755
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -576,8 +576,12 @@ headers = (
     list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/api/include')) +  # phi api
     list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/common')) +  # phi common headers
     # phi level api headers (low level api)
-    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/core', recursive=True)) +  # phi core headers
+    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi')) +  # phi extension header
+    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/include', recursive=True)) +  # phi include headers
     list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/backends', recursive=True)) +  # phi backends headers
+    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/core', recursive=True)) +  # phi core headers
+    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/infermeta', recursive=True)) +  # phi infermeta headers
+    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/kernels', recursive=True)) +  # phi kernels headers
     # utila api headers
     list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/utils', recursive=True)))  # paddle utils headers
-- 
GitLab

From 03eb792dcdb212f96c951fb48d13b6d9147c466c Mon Sep 17 00:00:00 2001
From: Zhou Wei <1183042833@qq.com>
Date: Fri, 4 Mar 2022 13:37:09 +0800
Subject: [PATCH 047/261] 【Phi】Migrate bitwise_and/bitwise_or/bitwise_xor/bitwise_not op into phi (#40031)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Migrate bitwise_and/or/xor/not op into phi

* fix CI
---
 .../operators/controlflow/CMakeLists.txt      |   2 +-
 .../fluid/operators/controlflow/bitwise_op.cc |  43 ++++---
 .../fluid/operators/controlflow/bitwise_op.cu |  74 ------------
 .../fluid/operators/controlflow/bitwise_op.h  | 112 ------------------
 paddle/phi/kernels/bitwise_kernel.h           |  44 +++++++
 paddle/phi/kernels/cpu/bitwise_kernel.cc      |  99 ++++++++++++++++
 paddle/phi/kernels/funcs/bitwise_functors.h   |  51 ++++++++
 paddle/phi/kernels/gpu/bitwise_kernel.cu      |  98 +++++++++++++++
 8 files changed, 313 insertions(+), 210 deletions(-)
 delete mode 100644 paddle/fluid/operators/controlflow/bitwise_op.cu
 delete mode 100644 paddle/fluid/operators/controlflow/bitwise_op.h
 create mode 100644 paddle/phi/kernels/bitwise_kernel.h
 create mode 100644 paddle/phi/kernels/cpu/bitwise_kernel.cc
 create mode 100644 paddle/phi/kernels/funcs/bitwise_functors.h
 create mode 100644 paddle/phi/kernels/gpu/bitwise_kernel.cu

diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt
index 70937069d97..0c18522fa32 100644
--- a/paddle/fluid/operators/controlflow/CMakeLists.txt
+++ b/paddle/fluid/operators/controlflow/CMakeLists.txt
@@ -21,4 +21,4 @@
endif() file(APPEND ${pybind_file} "USE_OP_ITSELF(less_than);\nUSE_OP_ITSELF(equal_all);\nUSE_NO_KERNEL_OP(read_from_array);\n") file(APPEND ${pybind_file} "USE_OP_ITSELF(logical_and);\nUSE_OP_ITSELF(logical_or);\nUSE_OP_ITSELF(logical_xor);\nUSE_OP_ITSELF(logical_not);\n") -file(APPEND ${pybind_file} "USE_OP(bitwise_and);\nUSE_OP(bitwise_or);\nUSE_OP(bitwise_xor);\nUSE_OP(bitwise_not);\n") +file(APPEND ${pybind_file} "USE_OP_ITSELF(bitwise_and);\nUSE_OP_ITSELF(bitwise_or);\nUSE_OP_ITSELF(bitwise_xor);\nUSE_OP_ITSELF(bitwise_not);\n") diff --git a/paddle/fluid/operators/controlflow/bitwise_op.cc b/paddle/fluid/operators/controlflow/bitwise_op.cc index 55cab03ea9e..4dcbbc8568f 100644 --- a/paddle/fluid/operators/controlflow/bitwise_op.cc +++ b/paddle/fluid/operators/controlflow/bitwise_op.cc @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/controlflow/bitwise_op.h" #include #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" namespace paddle { namespace operators { @@ -75,11 +75,19 @@ It operates ``%s`` on Tensor ``X`` . } }; -class BitwiseOp : public framework::OperatorWithKernel { +template +class UnaryBitwiseOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; protected: + void InferShape(framework::InferShapeContext *context) const override { + OpComment comment; + OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", comment.type); + context->SetOutputDim("Out", context->GetInputDim("X")); + context->ShareLoD("X", "Out"); + } + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); @@ -90,23 +98,9 @@ class BitwiseOp : public framework::OperatorWithKernel { }; template -class UnaryBitwiseOp : public BitwiseOp { - public: - using BitwiseOp::BitwiseOp; - - protected: - void InferShape(framework::InferShapeContext *context) const override { - OpComment comment; - OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", comment.type); - context->SetOutputDim("Out", context->GetInputDim("X")); - context->ShareLoD("X", "Out"); - } -}; - -template -class BinaryBitwiseOp : public BitwiseOp { +class BinaryBitwiseOp : public framework::OperatorWithKernel { public: - using BitwiseOp::BitwiseOp; + using framework::OperatorWithKernel::OperatorWithKernel; protected: void InferShape(framework::InferShapeContext *context) const override { @@ -130,6 +124,14 @@ class BinaryBitwiseOp : public BitwiseOp { } context->ShareLoD("X", "Out"); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); + // BitwiseOp kernel's device type is decided by input tensor place + kt.place_ = ctx.Input("X")->place(); + return kt; + } }; } // namespace operators @@ -167,8 +169,3 @@ REGISTER_BINARY_BITWISE_OP(bitwise_and, "Out = X \\& Y"); REGISTER_BINARY_BITWISE_OP(bitwise_or, "Out = X | Y"); REGISTER_BINARY_BITWISE_OP(bitwise_xor, "Out = X ^\\wedge Y"); REGISTER_UNARY_BITWISE_OP(bitwise_not, "Out = \\sim X"); - -REGISTER_BINARY_BITWISE_KERNEL(bitwise_and, CPU, ops::BitwiseAndFunctor); -REGISTER_BINARY_BITWISE_KERNEL(bitwise_or, CPU, ops::BitwiseOrFunctor); 
-REGISTER_BINARY_BITWISE_KERNEL(bitwise_xor, CPU, ops::BitwiseXorFunctor); -REGISTER_UNARY_BITWISE_KERNEL(bitwise_not, CPU, ops::BitwiseNotFunctor); diff --git a/paddle/fluid/operators/controlflow/bitwise_op.cu b/paddle/fluid/operators/controlflow/bitwise_op.cu deleted file mode 100644 index 5d98da2c027..00000000000 --- a/paddle/fluid/operators/controlflow/bitwise_op.cu +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/controlflow/bitwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" - -namespace paddle { -namespace operators { - -template -class BinaryBitwiseOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using T = typename Functor::ELEM_TYPE; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - auto functor = Functor(); - std::vector ins = {x, y}; - std::vector outs = {out}; - const auto& cuda_ctx = - ctx.template device_context(); - paddle::operators::LaunchElementwiseCudaKernel(cuda_ctx, ins, &outs, -1, - functor); - } -}; - -template -class UnaryBitwiseOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using T = typename Functor::ELEM_TYPE; - - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - auto functor = Functor(); - std::vector ins = {x}; - std::vector outs = {out}; - const auto& cuda_ctx = - ctx.template device_context(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(cuda_ctx, ins, - &outs, functor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = ::paddle::operators; -namespace plat = ::paddle::platform; - -REGISTER_BINARY_BITWISE_KERNEL(bitwise_and, CUDA, ops::BitwiseAndFunctor); -REGISTER_BINARY_BITWISE_KERNEL(bitwise_or, CUDA, ops::BitwiseOrFunctor); -REGISTER_BINARY_BITWISE_KERNEL(bitwise_xor, CUDA, ops::BitwiseXorFunctor); -REGISTER_UNARY_BITWISE_KERNEL(bitwise_not, CUDA, ops::BitwiseNotFunctor); diff --git a/paddle/fluid/operators/controlflow/bitwise_op.h b/paddle/fluid/operators/controlflow/bitwise_op.h deleted file mode 100644 index 9e652f92007..00000000000 --- a/paddle/fluid/operators/controlflow/bitwise_op.h +++ /dev/null @@ -1,112 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/platform/transform.h" - -namespace paddle { -namespace operators { - -#define BITWISE_BINARY_FUNCTOR(func, expr, bool_expr) \ - template \ - struct Bitwise##func##Functor { \ - using ELEM_TYPE = T; \ - HOSTDEVICE T operator()(const T a, const T b) const { return a expr b; } \ - }; \ - \ - template <> \ - struct Bitwise##func##Functor { \ - using ELEM_TYPE = bool; \ - HOSTDEVICE bool operator()(const bool a, const bool b) const { \ - return a bool_expr b; \ - } \ - }; - -BITWISE_BINARY_FUNCTOR(And, &, &&) -BITWISE_BINARY_FUNCTOR(Or, |, ||) -BITWISE_BINARY_FUNCTOR(Xor, ^, !=) -#undef BITWISE_BINARY_FUNCTOR - -template -struct BitwiseNotFunctor { - using ELEM_TYPE = T; - HOSTDEVICE T operator()(const T a) const { return ~a; } -}; - -template <> -struct BitwiseNotFunctor { - using ELEM_TYPE = bool; - HOSTDEVICE bool operator()(const bool a) const { return !a; } -}; - -template -class BinaryBitwiseOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; - auto func = Functor(); - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* out = context.Output("Out"); - ElementwiseComputeEx(context, x, y, -1, func, - out); - } -}; - -template -class UnaryBitwiseOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; - auto func = Functor(); - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - platform::Transform trans; - trans(context.template device_context(), x->data(), - x->data() + x->numel(), out->mutable_data(context.GetPlace()), - func); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = ::paddle::operators; -namespace plat = ::paddle::platform; - -#define REGISTER_BINARY_BITWISE_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, \ - ops::BinaryBitwiseOpKernel>, \ - ops::BinaryBitwiseOpKernel>, \ - ops::BinaryBitwiseOpKernel>, \ - ops::BinaryBitwiseOpKernel>, \ - ops::BinaryBitwiseOpKernel>, \ - ops::BinaryBitwiseOpKernel>); - -#define REGISTER_UNARY_BITWISE_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, \ - ops::UnaryBitwiseOpKernel>, \ - ops::UnaryBitwiseOpKernel>, \ - ops::UnaryBitwiseOpKernel>, \ - ops::UnaryBitwiseOpKernel>, \ - ops::UnaryBitwiseOpKernel>, \ - ops::UnaryBitwiseOpKernel>); diff --git a/paddle/phi/kernels/bitwise_kernel.h b/paddle/phi/kernels/bitwise_kernel.h new file mode 100644 index 00000000000..17307004f36 --- /dev/null +++ b/paddle/phi/kernels/bitwise_kernel.h @@ -0,0 +1,44 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BitwiseAndKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +void BitwiseOrKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +void BitwiseXorKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +void BitwiseNotKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/bitwise_kernel.cc b/paddle/phi/kernels/cpu/bitwise_kernel.cc new file mode 100644 index 00000000000..69f52790f77 --- /dev/null +++ b/paddle/phi/kernels/cpu/bitwise_kernel.cc @@ -0,0 +1,99 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/bitwise_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/bitwise_functors.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" + +// See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/platform/transform.h" + +namespace phi { + +#define DEFINE_BITWISE_KERNEL(op_type) \ + template \ + void Bitwise##op_type##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + DenseTensor* out) { \ + funcs::Bitwise##op_type##Functor func; \ + funcs::ElementwiseCompute, T, T>( \ + dev_ctx, x, y, -1, func, out); \ + } + +DEFINE_BITWISE_KERNEL(And) +DEFINE_BITWISE_KERNEL(Or) +DEFINE_BITWISE_KERNEL(Xor) +#undef DEFINE_BITWISE_KERNEL + +template +void BitwiseNotKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + const T* x_data = x.data(); + T* out_data = dev_ctx.template Alloc(out); + size_t numel = x.numel(); + funcs::BitwiseNotFunctor func; + paddle::platform::Transform trans; + trans(dev_ctx, x_data, x_data + numel, out_data, func); +} + +} // namespace phi + +PD_REGISTER_KERNEL(bitwise_and, + CPU, + ALL_LAYOUT, + phi::BitwiseAndKernel, + bool, + uint8_t, + int8_t, + int16_t, + int, + int64_t) {} + +PD_REGISTER_KERNEL(bitwise_or, + CPU, + ALL_LAYOUT, + phi::BitwiseOrKernel, + bool, + uint8_t, + int8_t, + int16_t, + int, + int64_t) {} + +PD_REGISTER_KERNEL(bitwise_xor, + CPU, + ALL_LAYOUT, + phi::BitwiseXorKernel, + bool, + uint8_t, + int8_t, + int16_t, + int, + int64_t) {} + +PD_REGISTER_KERNEL(bitwise_not, + CPU, + ALL_LAYOUT, + phi::BitwiseNotKernel, + bool, + uint8_t, + int8_t, + int16_t, + int, + int64_t) {} diff --git a/paddle/phi/kernels/funcs/bitwise_functors.h b/paddle/phi/kernels/funcs/bitwise_functors.h new file mode 100644 index 00000000000..db1fc59f534 --- /dev/null +++ b/paddle/phi/kernels/funcs/bitwise_functors.h @@ -0,0 +1,51 @@ +// Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace phi { +namespace funcs { + +#define BITWISE_BINARY_FUNCTOR(func, expr, bool_expr) \ + template \ + struct Bitwise##func##Functor { \ + HOSTDEVICE T operator()(const T a, const T b) const { return a expr b; } \ + }; \ + \ + template <> \ + struct Bitwise##func##Functor { \ + HOSTDEVICE bool operator()(const bool a, const bool b) const { \ + return a bool_expr b; \ + } \ + }; + +BITWISE_BINARY_FUNCTOR(And, &, &&) +BITWISE_BINARY_FUNCTOR(Or, |, ||) +BITWISE_BINARY_FUNCTOR(Xor, ^, !=) +#undef BITWISE_BINARY_FUNCTOR + +template +struct BitwiseNotFunctor { + using ELEM_TYPE = T; + HOSTDEVICE T operator()(const T a) const { return ~a; } +}; + +template <> +struct BitwiseNotFunctor { + using ELEM_TYPE = bool; + HOSTDEVICE bool operator()(const bool a) const { return !a; } +}; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/bitwise_kernel.cu b/paddle/phi/kernels/gpu/bitwise_kernel.cu new file mode 100644 index 00000000000..e88ecef318a --- /dev/null +++ b/paddle/phi/kernels/gpu/bitwise_kernel.cu @@ -0,0 +1,98 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/bitwise_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/bitwise_functors.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +namespace phi { + +#define DEFINE_BITWISE_KERNEL(op_type) \ + template \ + void Bitwise##op_type##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + DenseTensor* out) { \ + dev_ctx.template Alloc(out); \ + funcs::Bitwise##op_type##Functor func; \ + std::vector ins = {&x, &y}; \ + std::vector outs = {out}; \ + funcs::BroadcastKernel( \ + dev_ctx, ins, &outs, -1, func); \ + } + +DEFINE_BITWISE_KERNEL(And) +DEFINE_BITWISE_KERNEL(Or) +DEFINE_BITWISE_KERNEL(Xor) +#undef DEFINE_BITWISE_KERNEL + +template +void BitwiseNotKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + dev_ctx.template Alloc(out); + std::vector ins = {&x}; + std::vector outs = {out}; + funcs::BitwiseNotFunctor func; + funcs::BroadcastKernel( + dev_ctx, ins, &outs, -1, func); +} + +} // namespace phi + +PD_REGISTER_KERNEL(bitwise_and, + GPU, + ALL_LAYOUT, + phi::BitwiseAndKernel, + bool, + uint8_t, + int8_t, + int16_t, + int, + int64_t) {} + +PD_REGISTER_KERNEL(bitwise_or, + GPU, + ALL_LAYOUT, + phi::BitwiseOrKernel, + bool, + uint8_t, + int8_t, + int16_t, + int, + int64_t) {} + +PD_REGISTER_KERNEL(bitwise_xor, + GPU, + ALL_LAYOUT, + phi::BitwiseXorKernel, + bool, + uint8_t, + int8_t, + int16_t, + int, + int64_t) {} + +PD_REGISTER_KERNEL(bitwise_not, + GPU, + ALL_LAYOUT, + phi::BitwiseNotKernel, + bool, + uint8_t, + int8_t, + int16_t, + int, + int64_t) {} -- GitLab From 5435459a81fe68a170ce3ad08c588c3793bea773 Mon Sep 17 00:00:00 2001 From: lilong12 Date: Fri, 4 Mar 2022 14:03:00 +0800 Subject: [PATCH 048/261] add communication api for ProcessGroupGloo (#40100) * add pg_gloo apis --- .../collective/ProcessGroupGloo.cc | 194 ++++++++++++++++++ .../distributed/collective/ProcessGroupGloo.h | 14 ++ paddle/fluid/distributed/store/tcp_store.cc | 68 +++--- .../tests/unittests/process_group_gloo.py | 83 +++++++- 4 files changed, 327 insertions(+), 32 deletions(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc index 03ad48f560a..5dc43af1178 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc @@ -25,6 +25,8 @@ #endif #include +#include +#include #include "paddle/fluid/distributed/collective/ProcessGroupGloo.h" #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #include "paddle/fluid/platform/enforce.h" @@ -144,6 +146,22 @@ void set_inputs(P& opts, const std::vector& tensors) { // NOLINT opts.setInputs(get_multi_data(tensors), tensors[0].numel()); } +template +void set_inputs_for_scatter(P& opts, // NOLINT + const std::vector& tensors, // NOLINT + int nranks) { + std::vector ret(nranks); + auto raw_tensor = + std::dynamic_pointer_cast(tensors[0].impl()); + T* raw_pointer = reinterpret_cast(raw_tensor->data()); + size_t offset = 0; + for (int i = 0; i < nranks; i++) { + ret[i] = raw_pointer + offset; + offset += tensors[0].numel() / nranks; + } + opts.setInputs(ret, tensors[0].numel() / nranks); +} + ProcessGroupGloo::GlooTask::GlooTask(int rank, const std::vector& inputs, CommType comm_type) @@ -257,6 +275,182 @@ std::shared_ptr ProcessGroupGloo::AllReduce( return task; } +class BarrierGlooTask : public ProcessGroupGloo::GlooTask { + public: + 
BarrierGlooTask(int rank, const std::shared_ptr& context) + : ProcessGroupGloo::GlooTask(rank, std::vector{}, + CommType::BARRIER), + _context(context) {} + + void Run() override { _do_barrier(); } + + private: + std::shared_ptr _context; + + void _do_barrier() { + gloo::BarrierOptions opts(_context); + gloo::barrier(opts); + } +}; + +std::shared_ptr ProcessGroupGloo::Barrier( + const BarrierOptions& opts) { + std::shared_ptr task; + auto context = get_context(); + task = std::make_shared(rank_, context); + task->Run(); + return task; +} + +class AllgatherGlooTask : public ProcessGroupGloo::GlooTask { + public: + AllgatherGlooTask(int rank, const std::shared_ptr& context, + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT + uint32_t tag) + : ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLGATHER), + _context(context), + _inputs(inputs), + _outputs(outputs), + _tag(tag) {} + + void Run() override { _do_allgather(_inputs, _outputs); } + + private: + std::shared_ptr _context; + std::vector _inputs; + std::vector _outputs; + uint32_t _tag; + + void _do_allgather(std::vector& in, // NOLINT + std::vector& out) { // NOLINT + const auto& dtype = in[0].type(); + gloo::AllgatherOptions opts(_context); + GENERATE_FUNC(dtype, set_input, opts, in[0]); + GENERATE_FUNC(dtype, set_output, opts, out[0]); + opts.setTag(_tag); + gloo::allgather(opts); + } +}; + +std::shared_ptr ProcessGroupGloo::AllGather( + std::vector& in_tensors, std::vector& out_tensors) { + std::shared_ptr task; + auto tag = next_tag(); + auto context = get_context(); + task = std::make_shared(rank_, context, in_tensors, + out_tensors, tag); + task->Run(); + return task; +} + +class ReduceGlooTask : public ProcessGroupGloo::GlooTask { + public: + ReduceGlooTask(int rank, const std::shared_ptr& context, + std::vector& in, ReduceOp reduce_op, // NOLINT + int dst, uint32_t tag) + : ProcessGroupGloo::GlooTask(rank, in, CommType::REDUCE), + _context(context), + _inputs(in), + _reduce_op(reduce_op), + _dst(dst), + _tag(tag) {} + + void Run() override { _do_reduce(_inputs, _dst); } + + private: + std::shared_ptr _context; + std::vector _inputs; + const ReduceOp _reduce_op; + int _dst; + uint32_t _tag; + + gloo::ReduceOptions::Func _get_function(const experimental::DataType type, + const ReduceOp op) { + gloo::ReduceOptions::Func fn; + GENERATE_FUNC(type, _get_function_impl, fn, op); + return fn; + } + + template + void _get_function_impl(gloo::ReduceOptions::Func& fn, // NOLINT + const ReduceOp op) { + fn = get_function(op); + } + + void _do_reduce(std::vector& tensors, int dst) { // NOLINT + const auto& dtype = tensors[0].type(); + gloo::ReduceOptions opts(_context); + GENERATE_FUNC(dtype, set_input, opts, tensors[0]); + GENERATE_FUNC(dtype, set_output, opts, tensors[0]); + opts.setReduceFunction(_get_function(dtype, _reduce_op)); + opts.setTag(_tag); + opts.setRoot(dst); + gloo::reduce(opts); + } +}; + +std::shared_ptr ProcessGroupGloo::Reduce( + std::vector& tensors, const ReduceOptions& opts) { + std::shared_ptr task; + auto tag = next_tag(); + auto context = get_context(); + task = std::make_shared(rank_, context, tensors, + opts.reduce_op, opts.root_rank, tag); + task->Run(); + return task; +} + +class ScatterGlooTask : public ProcessGroupGloo::GlooTask { + public: + ScatterGlooTask(int rank, const std::shared_ptr& context, + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT + int src, int size, uint32_t tag) + : ProcessGroupGloo::GlooTask(rank, inputs, CommType::SCATTER), + _context(context), + 
_inputs(inputs), + _outputs(outputs), + _src(src), + _size(size), + _tag(tag) {} + + void Run() override { _do_scatter(_inputs, _outputs, _src); } + + private: + std::shared_ptr _context; + std::vector _inputs; + std::vector _outputs; + int _src; + int _size; + uint32_t _tag; + + void _do_scatter(std::vector& in, std::vector& out, // NOLINT + int src) { + const auto& dtype = in[0].type(); + gloo::ScatterOptions opts(_context); + if (rank_ == src) { + GENERATE_FUNC(dtype, set_inputs_for_scatter, opts, in, _size); + } + GENERATE_FUNC(dtype, set_output, opts, out[0]); + opts.setRoot(src); + opts.setTag(_tag); + gloo::scatter(opts); + } +}; + +std::shared_ptr ProcessGroupGloo::Scatter( + std::vector& in_tensors, std::vector& out_tensors, + const ScatterOptions& opts) { + std::shared_ptr task; + auto tag = next_tag(); + auto context = get_context(); + task = std::make_shared( + rank_, context, in_tensors, out_tensors, opts.root_rank, size_, tag); + task->Run(); + return task; +} + std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo::createDeviceForInterface(const std::string& ifname) { ::gloo::transport::tcp::attr attr; diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.h b/paddle/fluid/distributed/collective/ProcessGroupGloo.h index d989939fcb8..24f156571a4 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.h +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h @@ -114,6 +114,20 @@ class ProcessGroupGloo : public ProcessGroup { std::vector& inputs, const AllreduceOptions& opts = AllreduceOptions()) override; + std::shared_ptr Barrier( + const BarrierOptions& = BarrierOptions()) override; + + std::shared_ptr AllGather( + std::vector& in_tensors, + std::vector& out_tensors) override; + + std::shared_ptr Reduce( + std::vector& tensors, const ReduceOptions& opts) override; + + std::shared_ptr Scatter(std::vector& in_tensors, + std::vector& out_tensors, + const ScatterOptions&) override; + std::shared_ptr<::gloo::Context> get_context() { return _context; } uint64_t next_tag() { return _tag++; } diff --git a/paddle/fluid/distributed/store/tcp_store.cc b/paddle/fluid/distributed/store/tcp_store.cc index 8675981955d..eb98c89c99e 100644 --- a/paddle/fluid/distributed/store/tcp_store.cc +++ b/paddle/fluid/distributed/store/tcp_store.cc @@ -74,6 +74,7 @@ void MasterDaemon::_do_set(SocketType socket) { } void MasterDaemon::_do_get(SocketType socket) { + VLOG(3) << "MasterDaemon::_do_get"; std::string key = tcputils::receive_string(socket); auto iter = _store.find(key); PADDLE_ENFORCE_NE( @@ -86,13 +87,14 @@ void MasterDaemon::_do_get(SocketType socket) { void MasterDaemon::_do_stop(SocketType socket) { VLOG(3) << "MasterDaemon::_do_stop"; ReplyType value = ReplyType::STOP_WAIT; + tcputils::send_value(socket, value); if (--_nranks == 0) { _stop = true; } - tcputils::send_value(socket, value); } void MasterDaemon::_do_wait(SocketType socket) { + VLOG(3) << "MasterDaemon::_do_wait"; std::string key = tcputils::receive_string(socket); auto iter = _store.find(key); auto reply = ReplyType::STOP_WAIT; @@ -134,32 +136,42 @@ void MasterDaemon::run() { } for (size_t i = 1; i < fds.size(); i++) { - if (fds[i].revents == 0) { - continue; - } - - Command command = tcputils::receive_value(fds[i].fd); - VLOG(3) << "TCPStore: recv command: " << static_cast(command) << "."; - - switch (command) { - case Command::ADD: - _do_add(fds[i].fd); - break; - case Command::GET: - _do_get(fds[i].fd); - break; - case Command::SET: - _do_set(fds[i].fd); - break; - case Command::WAIT: - 
_do_wait(fds[i].fd); - break; - case Command::STOP: - _do_stop(fds[i].fd); - break; - default: - VLOG(0) << "Unknow command: " << static_cast(command); - exit(-1); + VLOG(0) << "fds.size:" << fds.size(); + VLOG(0) << "fds.size-i:" << i; + VLOG(0) << "fds[i].revents:" << fds[i].revents; + + try { + if (fds[i].revents == 0) { + continue; + } + + Command command = tcputils::receive_value(fds[i].fd); + VLOG(3) << "TCPStore: recv command: " << static_cast(command) + << "."; + + switch (command) { + case Command::ADD: + _do_add(fds[i].fd); + break; + case Command::GET: + _do_get(fds[i].fd); + break; + case Command::SET: + _do_set(fds[i].fd); + break; + case Command::WAIT: + _do_wait(fds[i].fd); + break; + case Command::STOP: + _do_stop(fds[i].fd); + break; + default: + VLOG(0) << "Unknow command: " << static_cast(command); + exit(-1); + } + } catch (...) { + fds.erase(fds.begin() + i); + _sockets.erase(_sockets.begin() + i - 1); } } } @@ -281,8 +293,8 @@ void TCPStore::wait(const std::string& key) { } TCPStore::~TCPStore() { - _client->send_command_for_key(Command::STOP, ""); VLOG(3) << "~TCPStore"; + _client->send_command_for_key(Command::STOP, ""); ReplyType ret = _client->receive_value(); PADDLE_ENFORCE_EQ(ret, ReplyType::STOP_WAIT, platform::errors::InvalidArgument( diff --git a/python/paddle/fluid/tests/unittests/process_group_gloo.py b/python/paddle/fluid/tests/unittests/process_group_gloo.py index 5420e1d36b3..c62c4615f74 100644 --- a/python/paddle/fluid/tests/unittests/process_group_gloo.py +++ b/python/paddle/fluid/tests/unittests/process_group_gloo.py @@ -104,16 +104,91 @@ class TestProcessGroupFp32(unittest.TestCase): broadcast_result = paddle.assign(tensor_x) if rank == 0: task = pg.broadcast(tensor_x, 0) - task.synchronize() - assert task.is_completed() assert np.array_equal(broadcast_result, tensor_x) else: task = pg.broadcast(tensor_y, 0) - task.synchronize() - assert task.is_completed() assert np.array_equal(broadcast_result, tensor_y) print("test broadcast api ok") + # test barrier + # rank 0 + if pg.rank() == 0: + task = pg.barrier() + task.wait() + # rank 1 + else: + task = pg.barrier() + task.wait() + + print("test barrier api ok\n") + + # test allgather + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + out_shape = list(self.shape) + out_shape[0] *= 2 + out = np.random.random(out_shape).astype(self.dtype) + tensor_out = paddle.to_tensor(out) + if pg.rank() == 0: + task = pg.all_gather(tensor_x, tensor_out) + task.wait() + paddle.device.cuda.synchronize() + # rank 1 + else: + task = pg.all_gather(tensor_y, tensor_out) + task.wait() + out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2]) + out_2 = paddle.slice(tensor_out, [0], [out_shape[0] // 2], + [out_shape[0]]) + assert np.array_equal(tensor_x, out_1) + assert np.array_equal(tensor_y, out_2) + print("test allgather api ok\n") + + # test Reduce + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + sum_result = tensor_x + tensor_y + if pg.rank() == 0: + task = pg.reduce(tensor_x, 0) + task.wait() + # rank 1 + else: + task = pg.reduce(tensor_y, 0) + task.wait() + if pg.rank() == 0: + assert np.array_equal(tensor_x, sum_result) + print("test reduce sum api ok\n") + + # test Scatter + # rank 0 + in_shape = list(self.shape) + in_shape[0] *= 2 + x = 
np.random.random(in_shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + if pg.rank() == 0: + task = pg.scatter(tensor_x, tensor_y, 0) + task.wait() + # rank 1 + else: + task = pg.scatter(tensor_x, tensor_y, 0) + task.wait() + out1 = paddle.slice(tensor_x, [0], [0], [self.shape[0]]) + out2 = paddle.slice(tensor_x, [0], [self.shape[0]], + [self.shape[0] * 2]) + if pg.rank() == 0: + assert np.array_equal(tensor_y, out1) + else: + assert np.array_equal(tensor_y, out2) + print("test scatter api ok\n") + if __name__ == "__main__": unittest.main() -- GitLab From 28fd30cda4f2b39fdf770fed22a67f8db5130979 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 4 Mar 2022 14:04:39 +0800 Subject: [PATCH 049/261] [Phi] Remove cholsky solve deps with svd helper (#40119) * remove cholsky solve deps with svd helper * fix shape infer bug --- paddle/fluid/operators/cholesky_solve_op.h | 20 ++++---- paddle/phi/infermeta/unary.cc | 53 ++++++++++++++++++++++ paddle/phi/infermeta/unary.h | 4 ++ paddle/phi/kernels/transpose_kernel.h | 25 ++++++++++ 4 files changed, 94 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/cholesky_solve_op.h b/paddle/fluid/operators/cholesky_solve_op.h index f25fbbb0c69..74b961d4e55 100644 --- a/paddle/fluid/operators/cholesky_solve_op.h +++ b/paddle/fluid/operators/cholesky_solve_op.h @@ -16,11 +16,11 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/solve_op.h" -#include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/operators/triangular_solve_op.h" #include "paddle/fluid/platform/complex.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" namespace paddle { namespace operators { // namespace operators @@ -59,7 +59,9 @@ void cholesky_solve_fn(const paddle::framework::ExecutionContext &ctx, framework::Tensor b_bst(bin.type()); TensorExpand(dev_ctx, bin, &b_bst, b_bst_dims_vec); - math::DeviceIndependenceTensorOperations helper(ctx); + auto &phi_dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE &>( + dev_ctx); // calculate u's conjugate for complex framework::Tensor u_conj(u_bst.type()); @@ -68,7 +70,7 @@ void cholesky_solve_fn(const paddle::framework::ExecutionContext &ctx, u_bst.data(), u_bst.numel(), u_conj.mutable_data(u_bst.dims(), dev_ctx.GetPlace())); u_for_range(u_functor); - u_conj = helper.Transpose(u_conj); + u_conj = phi::TransposeLast2Dim(phi_dev_ctx, u_conj); // calculate b's conjugate for complex framework::Tensor b_conj(b_bst.type()); @@ -77,7 +79,7 @@ void cholesky_solve_fn(const paddle::framework::ExecutionContext &ctx, b_bst.data(), b_bst.numel(), b_conj.mutable_data(b_bst.dims(), dev_ctx.GetPlace())); b_for_range(b_functor); - b_conj = helper.Transpose(b_conj); + b_conj = phi::TransposeLast2Dim(phi_dev_ctx, b_conj); auto ut_data = u_conj.mutable_data(dev_ctx.GetPlace()); auto uindims = u_bst.dims(); @@ -117,7 +119,7 @@ void cholesky_solve_fn(const paddle::framework::ExecutionContext &ctx, out->data(), out->numel(), out->mutable_data(out->dims(), dev_ctx.GetPlace())); out_for_range(out_functor); - *out = helper.Transpose(*out); + *out = phi::TransposeLast2Dim(phi_dev_ctx, *out); } template @@ -145,7 +147,9 @@ class CholeskySolveGradKernel : public framework::OpKernel { auto upper = ctx.Attr("upper"); const auto &dev_ctx = ctx.template 
device_context(); - math::DeviceIndependenceTensorOperations helper(ctx); + auto &phi_dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE &>( + dev_ctx); std::vector u_bst_dims_vec; std::vector b_bst_dims_vec; @@ -177,7 +181,7 @@ class CholeskySolveGradKernel : public framework::OpKernel { out->data(), out->numel(), out_conj.mutable_data(out->dims(), dev_ctx.GetPlace())); out_for_range(out_functor); - out_conj = helper.Transpose(out_conj); + out_conj = phi::TransposeLast2Dim(phi_dev_ctx, out_conj); framework::Tensor commonterm(out->type()); auto outdims = out_conj.dims(); @@ -200,7 +204,7 @@ class CholeskySolveGradKernel : public framework::OpKernel { commonterm_conj.mutable_data(commonterm.dims(), dev_ctx.GetPlace())); commonterm_for_range(commonterm_functor); - commonterm_conj = helper.Transpose(commonterm_conj); + commonterm_conj = phi::TransposeLast2Dim(phi_dev_ctx, commonterm_conj); phi::AddRawKernel( static_castset_dims(output_dims); } +void TransposeInferMeta(const MetaTensor& x, + const std::vector& axis, + MetaTensor* out) { + auto x_dims = x.dims(); + size_t x_rank = x_dims.size(); + size_t axis_size = axis.size(); + + PADDLE_ENFORCE_EQ( + x_rank, + axis_size, + errors::InvalidArgument("The input tensor's dimension " + "should be equal to the axis's size. " + "But received input tensor's dimension is %d, " + "axis's size is %d", + x_rank, + axis_size)); + + std::vector count(axis_size, 0); + for (size_t i = 0; i < axis_size; i++) { + PADDLE_ENFORCE_GE( + axis[i], + 0, + errors::InvalidArgument("The axis should be greater than or equal to 0." + "But received %d of axis[%d]", + axis[i], + i)); + + PADDLE_ENFORCE_EQ( + axis[i] < static_cast(axis_size) && ++count[axis[i]] == 1, + true, + errors::InvalidArgument( + "Each element of Attribute axis should " + "be a unique value range from 0 to (dims - 1), " + "where the dims is the axis's size, " + "unique value means this axis value can appear only once. 
" + "But received axis[%d] is %d, axis_size is %d, " + "count[axis[%d]] is %d", + i, + axis[i], + axis_size, + i, + count[axis[i]])); + } + + phi::DDim out_dims(x_dims); + for (size_t i = 0; i < axis_size; ++i) { + out_dims[i] = x_dims[axis[i]]; + } + + out->set_dims(out_dims); + out->set_dtype(x.dtype()); +} + } // namespace phi PD_REGISTER_INFER_META_FN(copy_to, phi::CopyToInferMeta); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 3c0628981af..97ec6f7fa58 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -145,4 +145,8 @@ void PixelShuffleInferMeta(const MetaTensor& x, const std::string& data_format, MetaTensor* out); +void TransposeInferMeta(const MetaTensor& x, + const std::vector& axis, + MetaTensor* out); + } // namespace phi diff --git a/paddle/phi/kernels/transpose_kernel.h b/paddle/phi/kernels/transpose_kernel.h index 303b4a9a8f0..3d89b324bab 100644 --- a/paddle/phi/kernels/transpose_kernel.h +++ b/paddle/phi/kernels/transpose_kernel.h @@ -15,7 +15,10 @@ #pragma once #include + #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/unary.h" +#include "paddle/phi/kernels/empty_kernel.h" namespace phi { @@ -25,4 +28,26 @@ void TransposeKernel(const Context& dev_ctx, const std::vector& axis, DenseTensor* out); +template +DenseTensor Transpose(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis) { + auto dense_out = Empty(dev_ctx); + MetaTensor meta_out(&dense_out); + TransposeInferMeta(x, axis, &meta_out); + TransposeKernel(dev_ctx, x, axis, &dense_out); + return dense_out; +} + +template +DenseTensor TransposeLast2Dim(const Context& dev_ctx, const DenseTensor& x) { + size_t rank = x.dims().size(); + std::vector axis(rank); + for (size_t i = 0; i < rank; ++i) { + axis[i] = i; + } + std::swap(axis[rank - 1], axis[rank - 2]); + return Transpose(dev_ctx, x, axis); +} + } // namespace phi -- GitLab From 8dbfc2aec8427dc414e3f129d4057b1f2fbef0d9 Mon Sep 17 00:00:00 2001 From: ceci3 Date: Fri, 4 Mar 2022 14:09:17 +0800 Subject: [PATCH 050/261] [paddle-inference]support setting fully connected in multi-head attention static shape branch to int8 (#39660) * fix inference int * update * add unittest --- .../tensorrt/convert/multihead_matmul_op.cc | 32 +- .../test_trt_convert_multihead_matmul.py | 384 ++++++++++++++++++ 2 files changed, 413 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index a432ff62810..f19b21d3e63 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -335,15 +335,37 @@ class MultiheadMatMulOpConverter : public OpConverter { reshape_before_fc_dim.d[4] = 1; auto* reshape_before_fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + if (enable_int8) { + engine_->SetTensorDynamicRange(reshape_before_fc_layer->getOutput(0), + in_scale); + } reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); reshape_before_fc_layer->setName( ("shuffle_before_multihead_mamul(Output: " + output_name + ")") .c_str()); // add layer fc - auto* fc_layer = TRT_ENGINE_ADD_LAYER( - engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), n, - weight.get(), bias.get()); + nvinfer1::ILayer* fc_layer = nullptr; + if (enable_int8) { + nvinfer1::DimsHW nv_ksize(1, 1); + fc_layer = TRT_ENGINE_ADD_LAYER( + engine_, Convolution, *reshape_before_fc_layer->getOutput(0), 
n, + nv_ksize, weight.get(), bias.get()); + } else { + fc_layer = TRT_ENGINE_ADD_LAYER( + engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), + n, weight.get(), bias.get()); + } + + if (enable_int8) { + PADDLE_ENFORCE_EQ( + op_desc.HasAttr("fc_out_threshold"), true, + platform::errors::InvalidArgument( + "must have out threshold in multihead layers in int8 mode")); + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("fc_out_threshold")); + engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale); + } fc_layer->setName( ("multihead_mamul_fc(Output: " + output_name + ")").c_str()); @@ -359,6 +381,10 @@ class MultiheadMatMulOpConverter : public OpConverter { plugin_inputs.push_back(input_bias_qk); bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + + if (enable_int8) { + with_fp16 = 1; + } plugin::DynamicPluginTensorRT* plugin = new plugin::QkvToContextPluginDynamic(hidden_in, head_number, head_size, scale, with_fp16); diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py index 2d2072d277e..97a94ef348a 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py @@ -451,10 +451,394 @@ class TrtConvertMultiHeadMatmulTest(TrtLayerAutoScanTest): "The output has diff between gpu and trt when dynamic fp32 mode and batch size > 2." ) + def teller3(program_config, predictor_config): + if self.trt_param.precision == paddle_infer.PrecisionType.Int8: + return True + return False + + self.add_skip_case( + teller3, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output has diff between gpu and trt in int8 mode.") + def test(self): self.add_skip_trt_case() self.run_test() +class TrtConvertMultiHeadMatmulTestInt8(TrtConvertMultiHeadMatmulTest): + def sample_program_configs(self): + def generate_input1(batch, dim1): + return np.random.random((batch, dim1, 768)).astype(np.float32) + + def generate_input2(shape): + return np.random.random(shape).astype(np.float32) + + def generate_weight1(): + return np.random.random((768, 768)).astype(np.float32) + + def generate_weight2(): + return np.random.random(768).astype(np.float32) + + for batch in [1, 2, 4]: + self.batch = batch + for reshape_shape in [[0, 0, 12, 64]]: + for dim1 in [128]: + input2_shapes = [[batch, reshape_shape[2], dim1, dim1], + [batch, 1, 1, dim1]] + for input2_shape in input2_shapes: + for axis in [0]: + dics = [{ + "x_num_col_dims": 2, + "y_num_col_dims": 1, + "enable_int8": True, + "X_scale": 1.0, + "weight_scale": [1.0], + }, { + "axis": 2, + "out_threshold": 1.0, + }, { + "shape": reshape_shape + }, { + "axis": [0, 2, 1, 3] + }, { + "x_num_col_dims": 2, + "y_num_col_dims": 1, + "enable_int8": True, + "X_scale": 1.0, + "weight_scale": [1.0], + }, { + "axis": 2, + "out_threshold": 1.0, + }, { + "shape": reshape_shape + }, { + "axis": [0, 2, 1, 3] + }, { + "x_num_col_dims": 2, + "y_num_col_dims": 1, + "enable_int8": True, + "X_scale": 1.0, + "weight_scale": [1.0], + }, { + "axis": 2, + "out_threshold": 1.0, + }, { + "shape": reshape_shape + }, { + "axis": [0, 2, 1, 3] + }, { + "scale": 0.125, + "bias": 0.0, + "bias_after_scale": True + }, { + "alpha": 1.0, + "transpose_X": False, + "transpose_Y": True, + "fused_reshape_X": [], + "fused_reshape_Y": [], + "fused_transpose_X": [], + "fused_transpose_Y": [], + "fused_reshape_Out": [], + 
"fused_transpose_Out": [] + }, { + "axis": axis + }, { + "axis": -1, + "is_test": True + }, { + "seed": 0, + "dropout_prob": 0.10000000149011612, + "dropout_implementation": "upscale_in_train", + "fix_seed": False, + "is_test": True + }, { + "alpha": 1.0, + "transpose_X": False, + "transpose_Y": False, + "fused_reshape_X": [], + "fused_reshape_Y": [], + "fused_transpose_X": [], + "fused_transpose_Y": [], + "fused_reshape_Out": [], + "fused_transpose_Out": [] + }, { + "axis": [0, 2, 1, 3] + }, { + "shape": [0, 0, 768] + }, { + "x_num_col_dims": 2, + "y_num_col_dims": 1 + }] + + ops_config = [ + { + "op_type": "mul", + "op_inputs": { + "X": ["input_data1"], + "Y": ["mul1_weight"] + }, + "op_outputs": { + "Out": ["mul1_output"] + }, + "op_attrs": dics[0] + }, + { + "op_type": "elementwise_add", + "op_inputs": { + "X": ["mul1_output"], + "Y": ["elementwise_add1_weight"] + }, + "op_outputs": { + "Out": ["elementwise_add1_output"] + }, + "op_attrs": dics[1] + }, + { + "op_type": "reshape2", + "op_inputs": { + "X": ["elementwise_add1_output"], + }, + "op_outputs": { + "Out": ["reshape21_output"], + "XShape": ["reshape21_output_xshape"] + }, + "op_attrs": dics[2] + }, + { + "op_type": "transpose2", + "op_inputs": { + "X": ["reshape21_output"] + }, + "op_outputs": { + "Out": ["transpose21_output"], + "XShape": + ["transpose21_output_xshape"] + }, + "op_attrs": dics[3] + }, + { + "op_type": "mul", + "op_inputs": { + "X": ["input_data1"], + "Y": ["mul2_weight"] + }, + "op_outputs": { + "Out": ["mul2_output"] + }, + "op_attrs": dics[4] + }, + { + "op_type": "elementwise_add", + "op_inputs": { + "X": ["mul2_output"], + "Y": ["elementwise_add2_weight"] + }, + "op_outputs": { + "Out": ["elementwise_add2_output"] + }, + "op_attrs": dics[5] + }, + { + "op_type": "reshape2", + "op_inputs": { + "X": ["elementwise_add2_output"] + }, + "op_outputs": { + "Out": ["reshape22_output"], + "XShape": ["reshape22_output_xshape"] + }, + "op_attrs": dics[6] + }, + { + "op_type": "transpose2", + "op_inputs": { + "X": ["reshape22_output"] + }, + "op_outputs": { + "Out": ["transpose22_output"], + "XShape": + ["transpose22_output_xshape"] + }, + "op_attrs": dics[7] + }, + { + "op_type": "mul", + "op_inputs": { + "X": ["input_data1"], + "Y": ["mul3_weight"] + }, + "op_outputs": { + "Out": ["mul3_output"] + }, + "op_attrs": dics[8] + }, + { + "op_type": "elementwise_add", + "op_inputs": { + "X": ["mul3_output"], + "Y": ["elementwise_add3_weight"] + }, + "op_outputs": { + "Out": ["elementwise_add3_output"] + }, + "op_attrs": dics[9] + }, + { + "op_type": "reshape2", + "op_inputs": { + "X": ["elementwise_add3_output"] + }, + "op_outputs": { + "Out": ["reshape23_output"], + "XShape": ["reshape23_output_xshape"] + }, + "op_attrs": dics[10] + }, + { + "op_type": "transpose2", + "op_inputs": { + "X": ["reshape23_output"] + }, + "op_outputs": { + "Out": ["transpose23_output"], + "XShape": + ["transpose23_output_xshape"] + }, + "op_attrs": dics[11] + }, + { + "op_type": "scale", + "op_inputs": { + "X": ["transpose23_output"], + }, + "op_outputs": { + "Out": ["scale_output"] + }, + "op_attrs": dics[12] + }, + { + "op_type": "matmul", + "op_inputs": { + "X": ["scale_output"], + "Y": ["transpose22_output"], + }, + "op_outputs": { + "Out": ["matmul1_output"] + }, + "op_attrs": dics[13] + }, + { + "op_type": "elementwise_add", + "op_inputs": { + "X": ["matmul1_output"], + "Y": ["input_data2"] + }, + "op_outputs": { + "Out": ["elementwise_add4_output"] + }, + "op_attrs": dics[14] + }, + { + "op_type": "softmax", + "op_inputs": { + "X": 
["elementwise_add4_output"] + }, + "op_outputs": { + "Out": ["softmax_output"] + }, + "op_attrs": dics[15] + }, + { + "op_type": "dropout", + "op_inputs": { + "X": ["softmax_output"], + }, + "op_outputs": { + "Out": ["dropout3_output"] + }, + "op_attrs": dics[16] + }, + { + "op_type": "matmul", + "op_inputs": { + "X": ["dropout3_output"], + "Y": ["transpose21_output"], + }, + "op_outputs": { + "Out": ["matmul2_output"] + }, + "op_attrs": dics[17] + }, + { + "op_type": "transpose2", + "op_inputs": { + "X": ["matmul2_output"] + }, + "op_outputs": { + "Out": ["transpose24_output"], + "XShape": + ["transpose24_output_xshape"] + }, + "op_attrs": dics[18] + }, + { + "op_type": "reshape2", + "op_inputs": { + "X": ["transpose24_output"] + }, + "op_outputs": { + "Out": ["reshape24_output"], + "XShape": ["reshape24_output_xshape"] + }, + "op_attrs": dics[19] + }, + # In order to fuse ops with + # multihead_matmul_fuse_pass_v2, the last op + # must be mul. + { + "op_type": "mul", + "op_inputs": { + "X": ["reshape24_output"], + "Y": ["mul4_weight"] + }, + "op_outputs": { + "Out": ["mul4_output"] + }, + "op_attrs": dics[20] + } + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "mul1_weight": TensorConfig( + data_gen=partial(generate_weight1)), + "mul2_weight": TensorConfig( + data_gen=partial(generate_weight1)), + "mul3_weight": TensorConfig( + data_gen=partial(generate_weight1)), + "mul4_weight": TensorConfig( + data_gen=partial(generate_weight1)), + "elementwise_add1_weight": TensorConfig( + data_gen=partial(generate_weight2)), + "elementwise_add2_weight": TensorConfig( + data_gen=partial(generate_weight2)), + "elementwise_add3_weight": TensorConfig( + data_gen=partial(generate_weight2)), + }, + inputs={ + "input_data1": TensorConfig( + data_gen=partial(generate_input1, batch, + dim1)), + "input_data2": TensorConfig( + data_gen=partial(generate_input2, + input2_shape)), + }, + outputs=["mul4_output"]) + + yield program_config + + if __name__ == "__main__": unittest.main() -- GitLab From 45385371ab0171034dcba0e26db118cc777282b7 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Fri, 4 Mar 2022 14:17:23 +0800 Subject: [PATCH 051/261] Fix bug caused by split infershape (#40116) * fix bug caused by split infershape * revert infer_shape of split * revert split --- paddle/fluid/operators/split_op.cc | 52 +++++++++++++++-- paddle/phi/infermeta/unary.cc | 80 ++++++++++++-------------- paddle/phi/kernels/cpu/split_kernel.cc | 17 ++++++ paddle/phi/kernels/gpu/split_kernel.cu | 17 ++++++ 4 files changed, 117 insertions(+), 49 deletions(-) diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc index 6678320f9ff..5b8922505cc 100644 --- a/paddle/fluid/operators/split_op.cc +++ b/paddle/fluid/operators/split_op.cc @@ -26,6 +26,52 @@ class SplitOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + platform::errors::InvalidArgument( + "Input(X) of SplitOp should not be null.")); + PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL, + platform::errors::InvalidArgument( + "Outputs(Out) of SplitOp should not be empty.")); + auto in_dims = ctx->GetInputDim("X"); + auto outs_names = ctx->Outputs("Out"); + size_t axis = static_cast(ctx->Attrs().Get("axis")); + size_t num = static_cast(ctx->Attrs().Get("num")); + std::vector sections = static_cast>( + 
ctx->Attrs().Get>("sections")); + const size_t outs_number = outs_names.size(); + + if (sections.size() > 0) { + PADDLE_ENFORCE_EQ( + sections.size(), outs_number, + platform::errors::InvalidArgument("tensor split sections size " + "should be equal to output size.")); + } + + if (ctx->HasInput("AxisTensor")) { + auto out_dims = phi::make_ddim(std::vector(in_dims.size(), -1)); + std::vector outs_dims(outs_number, out_dims); + ctx->SetOutputsDim("Out", outs_dims); + for (size_t i = 0; i < outs_number; ++i) { + ctx->ShareLoD("X", "Out", 0, i); + } + return; + } + + bool each_section_is_known = + (sections.size() > 0 && !ctx->HasInputs("SectionsTensorList")); + + auto outs_dims = UpdateOutsDims(ctx->IsRuntime(), each_section_is_known, + in_dims, num, sections, axis, outs_number); + ctx->SetOutputsDim("Out", outs_dims); + if (axis != 0) { + // Only pass LoD when not spliting along the first dim. + for (size_t i = 0; i < outs_number; ++i) { + ctx->ShareLoD("X", "Out", 0, i); + } + } + } + protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -125,10 +171,6 @@ Example: namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(split, SplitInferShapeFunctor, - PT_INFER_META(phi::SplitInferMeta)); - REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker, ops::SplitGradMaker, - ops::SplitGradMaker, - SplitInferShapeFunctor); + ops::SplitGradMaker); diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index ed95c8ff677..ff58c53ad9b 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -508,17 +508,6 @@ void SplitInferMeta(const MetaTensor& x, const Scalar& axis, std::vector out, MetaConfig config) { - if (!config.is_runtime) { - if (axis.FromTensor() || num_or_sections.FromTensor()) { - auto out_dims = phi::make_ddim(std::vector(x.dims().size(), -1)); - for (auto* item : out) { - item->set_dims(out_dims); - item->share_lod(x); - } - return; - } - } - int axis_value = axis.to(); int rank = x.dims().size(); PADDLE_ENFORCE_EQ( @@ -533,34 +522,27 @@ void SplitInferMeta(const MetaTensor& x, axis_value = axis_value + rank; } - std::vector out_dims(out.size(), x.dims()); - auto input_axis_dim = x.dims().at(axis_value); auto num_or_sections_data = num_or_sections.GetData(); + // step1: get formated sections + std::vector sections; // num_or_sections is a number if (num_or_sections_data.size() == 1) { - if (config.is_runtime || input_axis_dim > 0) { - int num = num_or_sections_data.at(0); - PADDLE_ENFORCE_EQ( - input_axis_dim % num, - 0, - phi::errors::InvalidArgument( - "The input's size along the split dimension " - "must be evenly divisible by Attr(num_or_sections). " - "But received Attr(num_or_sections) " - "= %d, input(X)'s shape = [%s], Attr(dim) = %d.", - num, - x.dims(), - axis_value)); + int num = num_or_sections_data.at(0); - size_t out_axis_dim = input_axis_dim / num; - for (auto& out_dim : out_dims) { - out_dim[axis_value] = out_axis_dim; - } - } else { - for (auto& out_dim : out_dims) { - out_dim[axis_value] = -1; - } + PADDLE_ENFORCE_EQ(input_axis_dim % num, + 0, + phi::errors::InvalidArgument( + "The input's size along the split dimension " + "must be evenly divisible by Attr(num_or_sections). 
" + "But received Attr(num_or_sections) " + "= %d, input(X)'s shape = [%s], Attr(dim) = %d.", + num, + x.dims(), + axis_value)); + + for (int i = 0; i < num; ++i) { + sections.push_back(input_axis_dim / num); } } else { // num_or_sections is a sections @@ -568,9 +550,10 @@ void SplitInferMeta(const MetaTensor& x, int unknow_dim_idx = -1; int num_of_unknow = 0; int sum_of_section = 0; - std::vector sections = num_or_sections_data; for (size_t i = 0; i < num_or_sections_data.size(); ++i) { + sections.push_back(num_or_sections_data[i]); + if (num_or_sections_data[i] == unknow_dim_val) { num_of_unknow++; unknow_dim_idx = i; @@ -622,22 +605,31 @@ void SplitInferMeta(const MetaTensor& x, x.dims(), axis_value)); } - for (size_t i = 0; i < out_dims.size(); ++i) { + } + + // setp2: fill out dims + std::vector out_dims(sections.size(), x.dims()); + if (config.is_runtime || input_axis_dim > 0) { + for (size_t i = 0; i < sections.size(); ++i) { out_dims[i][axis_value] = sections[i]; } + } else { + for (size_t i = 0; i < sections.size(); ++i) { + out_dims[i][axis_value] = -1; + } } - for (size_t i = 0; i < out.size(); ++i) { + for (size_t i = 0; i < sections.size(); ++i) { if (axis_value != 0) { // Only pass LoD when not spliting along the first dim. - out.at(i)->set_dtype(x.dtype()); - out.at(i)->set_dims(out_dims[i]); - out.at(i)->set_layout(x.layout()); + out[i]->set_dtype(x.dtype()); + out[i]->set_dims(out_dims[i]); + out[i]->set_layout(x.layout()); } else { - out.at(i)->set_dtype(x.dtype()); - out.at(i)->set_dims(out_dims[i]); - out.at(i)->set_layout(x.layout()); - out.at(i)->share_lod(x); + out[i]->set_dtype(x.dtype()); + out[i]->set_dims(out_dims[i]); + out[i]->set_layout(x.layout()); + out[i]->share_lod(x); } } } diff --git a/paddle/phi/kernels/cpu/split_kernel.cc b/paddle/phi/kernels/cpu/split_kernel.cc index 4acf9b02028..324798effbe 100644 --- a/paddle/phi/kernels/cpu/split_kernel.cc +++ b/paddle/phi/kernels/cpu/split_kernel.cc @@ -28,6 +28,23 @@ void SplitKernel(const Context& dev_ctx, const ScalarArray& num_or_sections, const Scalar& axis_scalar, std::vector outs) { + // need to infershape output + if (num_or_sections.FromTensor() || axis_scalar.FromTensor()) { + std::vector out_metas; + out_metas.reserve(outs.size()); + std::vector out_metas_ptr; + for (size_t i = 0; i < outs.size(); ++i) { + out_metas.push_back(outs[i]); + out_metas_ptr.push_back(&out_metas.back()); + } + + phi::SplitInferMeta(x, num_or_sections, axis_scalar, out_metas_ptr, true); + + for (size_t i = 0; i < out_metas.size(); ++i) { + outs[i]->Resize(out_metas[i].dims()); + } + } + std::vector shape_refer; for (size_t j = 0; j < outs.size(); ++j) { dev_ctx.template Alloc(outs[j]); diff --git a/paddle/phi/kernels/gpu/split_kernel.cu b/paddle/phi/kernels/gpu/split_kernel.cu index d2473d5b0b1..c28fc3794f0 100644 --- a/paddle/phi/kernels/gpu/split_kernel.cu +++ b/paddle/phi/kernels/gpu/split_kernel.cu @@ -27,6 +27,23 @@ void SplitKernel(const Context& dev_ctx, const ScalarArray& num_or_sections, const Scalar& axis_scalar, std::vector outs) { + // need to infershape output + if (num_or_sections.FromTensor() || axis_scalar.FromTensor()) { + std::vector out_metas; + out_metas.reserve(outs.size()); + std::vector out_metas_ptr; + for (size_t i = 0; i < outs.size(); ++i) { + out_metas.push_back(outs[i]); + out_metas_ptr.push_back(&out_metas.back()); + } + + phi::SplitInferMeta(x, num_or_sections, axis_scalar, out_metas_ptr, true); + + for (size_t i = 0; i < out_metas.size(); ++i) { + outs[i]->Resize(out_metas[i].dims()); + } + } + 
std::vector shape_refer; for (size_t j = 0; j < outs.size(); ++j) { dev_ctx.template Alloc(outs[j]); -- GitLab From f3161c507b1f9c729cb24fe0dc5b6b7f7f5e1e6c Mon Sep 17 00:00:00 2001 From: zyfncg Date: Fri, 4 Mar 2022 14:26:45 +0800 Subject: [PATCH 052/261] [PHI] Remove emtpy kernel and infershape in fluid (#40146) * remove emtpy kernel and infershape in fluid * fix bug of infershape_utils --- paddle/fluid/framework/infershape_utils.cc | 4 ++ paddle/fluid/operators/empty_op.cc | 57 +++------------------- paddle/fluid/operators/empty_op.cu.cc | 26 ---------- paddle/fluid/operators/empty_op.h | 47 ------------------ 4 files changed, 12 insertions(+), 122 deletions(-) delete mode 100644 paddle/fluid/operators/empty_op.cu.cc delete mode 100644 paddle/fluid/operators/empty_op.h diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index 57fb68e8042..7232a707916 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -381,6 +381,10 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, std::type_index(typeid(std::vector))) { infer_meta_context.EmplaceBackAttr(std::move( phi::ScalarArray(BOOST_GET_CONST(std::vector, attr)))); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + infer_meta_context.EmplaceBackAttr(std::move( + phi::ScalarArray(BOOST_GET_CONST(std::vector, attr)))); } else if (std::type_index(attr.type()) == std::type_index(typeid(int))) { infer_meta_context.EmplaceBackAttr( diff --git a/paddle/fluid/operators/empty_op.cc b/paddle/fluid/operators/empty_op.cc index e23342ebb5d..6baa504562e 100644 --- a/paddle/fluid/operators/empty_op.cc +++ b/paddle/fluid/operators/empty_op.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/empty_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/infermeta/nullary.h" + namespace paddle { namespace operators { @@ -51,46 +53,6 @@ class EmptyOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* context) const override { - OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "empty"); - - if (context->HasInput("ShapeTensor")) { - auto shape_dims = context->GetInputDim("ShapeTensor"); - int num_ele = 1; - for (int i = 0; i < shape_dims.size(); ++i) { - num_ele *= shape_dims[i]; - } - auto vec_dims = std::vector(num_ele, -1); - context->SetOutputDim("Out", phi::make_ddim(vec_dims)); - } else if (context->HasInputs("ShapeTensorList")) { - std::vector out_dims; - auto dims_list = context->GetInputsDim("ShapeTensorList"); - for (size_t i = 0; i < dims_list.size(); ++i) { - auto& dims = dims_list[i]; - PADDLE_ENFORCE_EQ(dims, phi::make_ddim({1}), - platform::errors::InvalidArgument( - "The shape of Tensor in list must be [1]. " - "But received the shape is [%s]", - dims)); - - out_dims.push_back(-1); - } - - context->SetOutputDim("Out", phi::make_ddim(out_dims)); - } else { - auto& shape = context->Attrs().Get>("shape"); - for (size_t i = 0; i < shape.size(); ++i) { - PADDLE_ENFORCE_GE( - shape[i], 0, - platform::errors::InvalidArgument( - "Each value of attribute 'shape' is expected to be no less " - "than 0. 
But recieved: shape[%u] = %d; shape = [%s].", - i, shape[i], phi::make_ddim(shape))); - } - context->SetOutputDim("Out", phi::make_ddim(shape)); - } - } - protected: framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, const framework::Tensor& tensor, @@ -126,14 +88,11 @@ class EmptyOpVarTypeInference : public framework::VarTypeInference { namespace ops = paddle::operators; namespace plat = paddle::platform; +DELCARE_INFER_SHAPE_FUNCTOR(empty, EmptyInferShapeFunctor, + PT_INFER_META(phi::CreateInferMeta)); + REGISTER_OPERATOR( empty, ops::EmptyOp, ops::EmptyOpMaker, ops::EmptyOpVarTypeInference, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL(empty, ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel); + paddle::framework::EmptyGradOpMaker, + EmptyInferShapeFunctor); diff --git a/paddle/fluid/operators/empty_op.cu.cc b/paddle/fluid/operators/empty_op.cu.cc deleted file mode 100644 index 22799e507ae..00000000000 --- a/paddle/fluid/operators/empty_op.cu.cc +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/empty_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - empty, ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel); diff --git a/paddle/fluid/operators/empty_op.h b/paddle/fluid/operators/empty_op.h deleted file mode 100644 index cb466fffcd7..00000000000 --- a/paddle/fluid/operators/empty_op.h +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
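With the hand-written InferShape above removed, the empty op gets its output shape from phi::CreateInferMeta via the registered EmptyInferShapeFunctor, and its kernels now live in phi rather than fluid. A minimal usage sketch covering a plain list attribute and a shape tensor, two of the shape sources the old code special-cased (assuming a current Paddle build; the example itself is not part of the patch):

import paddle

x = paddle.empty([2, 3], dtype='float32')        # shape given as a list attribute
print(x.shape)                                   # [2, 3]; values are uninitialized

shape = paddle.to_tensor([4, 5], dtype='int64')  # shape supplied as a tensor
y = paddle.empty(shape, dtype='float32')
print(y.shape)                                   # [4, 5]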
- -#pragma once - -#include -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class EmptyKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto dtype = static_cast( - context.Attr("dtype")); - - Tensor *out_tensor = context.Output("Out"); - - auto shape = GetShape(context); - out_tensor->Resize(shape); - - out_tensor->mutable_data(context.GetPlace(), - framework::TransToPhiDataType(dtype)); - } -}; - -} // namespace operators -} // namespace paddle -- GitLab From 0bfba16b66ca5c496760b01ffac56c788444decb Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Fri, 4 Mar 2022 14:51:57 +0800 Subject: [PATCH 053/261] Add digamma abs trunc yaml (#40024) * add digamma, abs, trunc; test=develop * fix bug and add diagonal; test=develop * add name coverter; test=develop * update tracer.py; test=develop * add test case; test=develop * fix bugs; test=develop --- paddle/fluid/operators/diagonal_op.cc | 77 ++----------------- paddle/phi/infermeta/backward.h | 1 + paddle/phi/infermeta/unary.cc | 75 ++++++++++++++++++ paddle/phi/infermeta/unary.h | 3 + paddle/phi/kernels/cpu/norm_grad_kernel.cc | 2 +- paddle/phi/kernels/digamma_grad_kernel.h | 2 +- paddle/phi/kernels/gpu/norm_grad_kernel.cu | 2 +- .../kernels/impl/digamma_grad_kernel_impl.h | 2 +- paddle/phi/kernels/norm_grad_kernel.h | 2 +- paddle/phi/ops/compat/digamma_sig.cc | 2 +- paddle/phi/ops/compat/norm_sig.cc | 2 +- python/paddle/fluid/dygraph/tracer.py | 23 ++++++ .../fluid/layers/layer_function_generator.py | 2 +- .../tests/unittests/test_activation_op.py | 2 +- .../fluid/tests/unittests/test_diagonal_op.py | 19 +++++ .../fluid/tests/unittests/test_trunc_op.py | 10 +++ python/paddle/tensor/math.py | 8 +- python/paddle/utils/code_gen/api.yaml | 46 +++++++++++ python/paddle/utils/code_gen/backward.yaml | 55 +++++++++++++ 19 files changed, 256 insertions(+), 79 deletions(-) diff --git a/paddle/fluid/operators/diagonal_op.cc b/paddle/fluid/operators/diagonal_op.cc index b419f629a1e..20813f8bb44 100644 --- a/paddle/fluid/operators/diagonal_op.cc +++ b/paddle/fluid/operators/diagonal_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,74 +23,6 @@ namespace operators { class DiagonalOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "diagonal"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "diagonal"); - - int offset_ = ctx->Attrs().Get("offset"); - int axis1 = ctx->Attrs().Get("axis1"); - int axis2 = ctx->Attrs().Get("axis2"); - - auto x_dims = ctx->GetInputDim("Input"); - int axis1_ = axis1 < 0 ? x_dims.size() + axis1 : axis1; - int axis2_ = axis2 < 0 ? 
x_dims.size() + axis2 : axis2; - - PADDLE_ENFORCE_GE( - x_dims.size(), 2, - platform::errors::OutOfRange("Input's dim is out of range (expected at " - "least 2 dimensions, but got %ld).", - x_dims.size())); - PADDLE_ENFORCE_LT( - axis1_, x_dims.size(), - platform::errors::OutOfRange( - "Attr(axis1) is out of range (expected to be in range of [%ld, " - "%ld], but got %ld).", - -(x_dims.size()), (x_dims.size() - 1), axis1)); - PADDLE_ENFORCE_LT( - axis2_, x_dims.size(), - platform::errors::OutOfRange( - "Attr(axis2) is out of range (expected to be in range of [%ld, " - "%ld], but got %ld).", - -(x_dims.size()), (x_dims.size() - 1), axis2)); - PADDLE_ENFORCE_NE(axis1_, axis2_, - platform::errors::InvalidArgument( - "The dimensions should not be identical " - "%d vs %d.", - axis1, axis2)); - - auto out_dims = vectorize(x_dims); - // from out_dims get the dim size of axis1_. - auto axis1_size = out_dims[axis1_]; - auto axis2_size = out_dims[axis2_]; - // delete two dims by attr axis1 and axis2 from out_dims. - /* example: - out_dim = [2, 3, 4]; - axis1 = 0; - axis2 = 1; - according to the attr of axis1 and axis2, we get: - out_dim = [4]. - */ - out_dims.erase(out_dims.begin() + std::max(axis1_, axis2_)); - out_dims.erase(out_dims.begin() + std::min(axis1_, axis2_)); - - if (offset_ == 0) { - out_dims.push_back(std::min(axis1_size, axis2_size)); - } else if (offset_ > 0) { - if ((axis2_size - offset_) > 0) { - out_dims.push_back(std::min(axis1_size, axis2_size - offset_)); - } else { - out_dims.push_back(0); - } - } else { - if ((axis1_size + offset_) > 0) { - out_dims.push_back(std::min(axis1_size + offset_, axis2_size)); - } else { - out_dims.push_back(0); - } - } - ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); - } }; class DiagonalOpMaker : public framework::OpProtoAndCheckerMaker { @@ -170,9 +105,13 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(DiagonalGradNoNeedBufferVarsInferer, namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(diagonal, DiagonalInferShapeFunctor, + PT_INFER_META(phi::DiagonalInferMeta)); + REGISTER_OPERATOR(diagonal, ops::DiagonalOp, ops::DiagonalOpMaker, ops::DiagonalGradOpMaker, - ops::DiagonalGradOpMaker); + ops::DiagonalGradOpMaker, + DiagonalInferShapeFunctor); REGISTER_OPERATOR(diagonal_grad, ops::DiagonalGradOp, ops::DiagonalGradNoNeedBufferVarsInferer) diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index c7090ed664b..f2c0cf8a689 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include "paddle/phi/core/meta_tensor.h" +#include "paddle/phi/infermeta/unary.h" namespace phi { diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index ff58c53ad9b..85db1547f16 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -706,6 +706,81 @@ void TraceInferMeta( out->set_dims(phi::make_ddim(sizes)); } +void DiagonalInferMeta(const MetaTensor& input, + int offset, + int axis1, + int axis2, + MetaTensor* out) { + auto x_dims = input.dims(); + int offset_ = offset; + int axis1_ = axis1 < 0 ? x_dims.size() + axis1 : axis1; + int axis2_ = axis2 < 0 ? 
x_dims.size() + axis2 : axis2; + + PADDLE_ENFORCE_GE( + x_dims.size(), + 2, + phi::errors::OutOfRange("Input's dim is out of range (expected at " + "least 2 dimensions, but got %ld).", + x_dims.size())); + PADDLE_ENFORCE_LT( + axis1_, + x_dims.size(), + phi::errors::OutOfRange( + "Attr(axis1) is out of range (expected to be in range of [%ld, " + "%ld], but got %ld).", + -(x_dims.size()), + (x_dims.size() - 1), + axis1)); + PADDLE_ENFORCE_LT( + axis2_, + x_dims.size(), + phi::errors::OutOfRange( + "Attr(axis2) is out of range (expected to be in range of [%ld, " + "%ld], but got %ld).", + -(x_dims.size()), + (x_dims.size() - 1), + axis2)); + PADDLE_ENFORCE_NE( + axis1_, + axis2_, + phi::errors::InvalidArgument("The dimensions should not be identical " + "%d vs %d.", + axis1, + axis2)); + + auto out_dims = vectorize(x_dims); + // from out_dims get the dim size of axis1_. + auto axis1_size = out_dims[axis1_]; + auto axis2_size = out_dims[axis2_]; + // delete two dims by attr axis1 and axis2 from out_dims. + /* example: + out_dim = [2, 3, 4]; + axis1 = 0; + axis2 = 1; + according to the attr of axis1 and axis2, we get: + out_dim = [4]. + */ + out_dims.erase(out_dims.begin() + std::max(axis1_, axis2_)); + out_dims.erase(out_dims.begin() + std::min(axis1_, axis2_)); + + if (offset_ == 0) { + out_dims.push_back(std::min(axis1_size, axis2_size)); + } else if (offset_ > 0) { + if ((axis2_size - offset_) > 0) { + out_dims.push_back(std::min(axis1_size, axis2_size - offset_)); + } else { + out_dims.push_back(0); + } + } else { + if ((axis1_size + offset_) > 0) { + out_dims.push_back(std::min(axis1_size + offset_, axis2_size)); + } else { + out_dims.push_back(0); + } + } + out->set_dims(phi::make_ddim(out_dims)); +} + void UnfoldInferMeta(const MetaTensor& x, const std::vector& kernel_sizes, const std::vector& strides, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 97ec6f7fa58..d4e21fbd824 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -140,6 +140,9 @@ void DiagInferMeta(const MetaTensor& x, void SizeInferMeta(const MetaTensor& input, MetaTensor* out); +void DiagonalInferMeta( + const MetaTensor& input, int offset, int axis1, int axis2, MetaTensor* out); + void PixelShuffleInferMeta(const MetaTensor& x, int upscale_factor, const std::string& data_format, diff --git a/paddle/phi/kernels/cpu/norm_grad_kernel.cc b/paddle/phi/kernels/cpu/norm_grad_kernel.cc index 597207a05a2..bd05e2c4c6e 100644 --- a/paddle/phi/kernels/cpu/norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/norm_grad_kernel.cc @@ -26,9 +26,9 @@ namespace phi { template void NormGradKernel(const Context& ctx, - const DenseTensor& out_grad, const DenseTensor& x, const DenseTensor& norm, + const DenseTensor& out_grad, int axis, float epsilon, bool is_test, diff --git a/paddle/phi/kernels/digamma_grad_kernel.h b/paddle/phi/kernels/digamma_grad_kernel.h index 38912a5ccc4..ae5346080d3 100644 --- a/paddle/phi/kernels/digamma_grad_kernel.h +++ b/paddle/phi/kernels/digamma_grad_kernel.h @@ -20,8 +20,8 @@ namespace phi { template void DigammaGradKernel(const Context& ctx, - const DenseTensor& out_grad, const DenseTensor& x, + const DenseTensor& out_grad, DenseTensor* x_grad); } // namepsace phi diff --git a/paddle/phi/kernels/gpu/norm_grad_kernel.cu b/paddle/phi/kernels/gpu/norm_grad_kernel.cu index ab38a82eceb..43a08b0603e 100644 --- a/paddle/phi/kernels/gpu/norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/norm_grad_kernel.cu @@ -75,9 +75,9 @@ __global__ void NormalizeGradient(const T* 
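The output-shape rule coded in DiagonalInferMeta above fits in a few lines of Python. The helper below is an illustrative sketch (the name is mine, not from the patch) and is checked against numpy.diagonal, which follows the same convention of dropping axis1/axis2 and appending the diagonal length as the last dimension:

import numpy as np

def diagonal_out_shape(x_shape, offset=0, axis1=0, axis2=1):
    ndim = len(x_shape)
    axis1 = axis1 + ndim if axis1 < 0 else axis1
    axis2 = axis2 + ndim if axis2 < 0 else axis2
    assert ndim >= 2 and axis1 != axis2
    out = list(x_shape)
    a1, a2 = out[axis1], out[axis2]
    for i in sorted((axis1, axis2), reverse=True):  # drop the two reduced axes
        out.pop(i)
    if offset >= 0:
        diag_len = max(min(a1, a2 - offset), 0)
    else:
        diag_len = max(min(a1 + offset, a2), 0)
    return out + [diag_len]

x = np.zeros((2, 3, 4))
assert diagonal_out_shape(x.shape, offset=1, axis1=0, axis2=1) == list(
    np.diagonal(x, offset=1, axis1=0, axis2=1).shape)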
x, template void NormGradKernel(const Context& ctx, - const DenseTensor& out_grad, const DenseTensor& x, const DenseTensor& norm, + const DenseTensor& out_grad, int axis, float epsilon, bool is_test, diff --git a/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h b/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h index 74ded1569eb..92550de1800 100644 --- a/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h @@ -38,8 +38,8 @@ struct DigammaGradFunctor { template void DigammaGradKernel(const Context& ctx, - const DenseTensor& out_grad, const DenseTensor& x, + const DenseTensor& out_grad, DenseTensor* x_grad) { x_grad->mutable_data(ctx.GetPlace()); diff --git a/paddle/phi/kernels/norm_grad_kernel.h b/paddle/phi/kernels/norm_grad_kernel.h index 7b09d6463d0..55714b8a4a0 100644 --- a/paddle/phi/kernels/norm_grad_kernel.h +++ b/paddle/phi/kernels/norm_grad_kernel.h @@ -20,9 +20,9 @@ namespace phi { template void NormGradKernel(const Context& ctx, - const DenseTensor& out_grad, const DenseTensor& x, const DenseTensor& out, + const DenseTensor& out_grad, int axis, float epsilon, bool is_test, diff --git a/paddle/phi/ops/compat/digamma_sig.cc b/paddle/phi/ops/compat/digamma_sig.cc index fa693f92c6f..12ef3056f1e 100644 --- a/paddle/phi/ops/compat/digamma_sig.cc +++ b/paddle/phi/ops/compat/digamma_sig.cc @@ -19,7 +19,7 @@ namespace phi { KernelSignature DigammaGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( - "digamma_grad", {GradVarName("Out"), "X"}, {}, {GradVarName("X")}); + "digamma_grad", {"X", GradVarName("Out")}, {}, {GradVarName("X")}); } } // namespace phi diff --git a/paddle/phi/ops/compat/norm_sig.cc b/paddle/phi/ops/compat/norm_sig.cc index 81d294b8424..a74db9b5686 100644 --- a/paddle/phi/ops/compat/norm_sig.cc +++ b/paddle/phi/ops/compat/norm_sig.cc @@ -23,7 +23,7 @@ KernelSignature NormOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature NormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("norm_grad", - {GradVarName("Out"), "X", "Norm"}, + {"X", "Norm", GradVarName("Out")}, {"axis", "epsilon", "is_test"}, {GradVarName("X")}); } diff --git a/python/paddle/fluid/dygraph/tracer.py b/python/paddle/fluid/dygraph/tracer.py index e0c594b07ae..563cd433910 100644 --- a/python/paddle/fluid/dygraph/tracer.py +++ b/python/paddle/fluid/dygraph/tracer.py @@ -29,6 +29,29 @@ final_state_name_mapping = { "x": "X", "y": "Y", "out": "Out", + }, + "trunc": { + "final_op_name": "final_state_trunc", + "x": "X", + "out": "Out", + }, + "abs": { + "final_op_name": "final_state_abs", + "x": "X", + "out": "Out", + }, + "digamma": { + "final_op_name": "final_state_digamma", + "x": "X", + "out": "Out", + }, + "diagonal": { + "final_op_name": "final_state_diagonal", + "x": "Input", + "offset": "offset", + "axis1": "axis1", + "axis2": "axis2", + "out": "Out", } } diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index 56af7e341fd..676ee3e3c77 100755 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -20,7 +20,7 @@ import string from six.moves import cStringIO from ..proto import framework_pb2 -from ..framework import OpProtoHolder, Variable, core, convert_np_dtype_to_dtype_, in_dygraph_mode +from ..framework import OpProtoHolder, Variable, core, convert_np_dtype_to_dtype_, in_dygraph_mode, _in_eager_mode from ..layer_helper 
import LayerHelper from ..data_feeder import check_variable_and_dtype from paddle import _C_ops diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index d3d8fdd7031..b4b5944e27c 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -983,7 +983,7 @@ class TestAbs(TestActivation): def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) class TestCeil(TestActivation): diff --git a/python/paddle/fluid/tests/unittests/test_diagonal_op.py b/python/paddle/fluid/tests/unittests/test_diagonal_op.py index 4dab7c0df40..b4854aea52a 100644 --- a/python/paddle/fluid/tests/unittests/test_diagonal_op.py +++ b/python/paddle/fluid/tests/unittests/test_diagonal_op.py @@ -124,6 +124,25 @@ class TestDiagonalAPI(unittest.TestCase): self.assertEqual(np.allclose(out.numpy(), out_ref, rtol=1e-08), True) paddle.enable_static() + def test_api_eager(self): + paddle.disable_static(self.place) + with _test_eager_guard(): + x_tensor = paddle.to_tensor(self.x) + out = paddle.diagonal(x_tensor) + out2 = paddle.diagonal(x_tensor, offset=0, axis1=2, axis2=1) + out3 = paddle.diagonal(x_tensor, offset=1, axis1=0, axis2=1) + out4 = paddle.diagonal(x_tensor, offset=0, axis1=1, axis2=2) + out_ref = np.diagonal(self.x) + self.assertEqual(np.allclose(out.numpy(), out_ref, rtol=1e-08), True) + out2_ref = np.diagonal(self.x, offset=0, axis1=2, axis2=1) + self.assertEqual(np.allclose(out2.numpy(), out2_ref, rtol=1e-08), True) + out3_ref = np.diagonal(self.x, offset=1, axis1=0, axis2=1) + self.assertEqual(np.allclose(out3.numpy(), out3_ref, rtol=1e-08), True) + out4_ref = np.diagonal(self.x, offset=0, axis1=1, axis2=2) + self.assertEqual(np.allclose(out4.numpy(), out4_ref, rtol=1e-08), True) + + paddle.enable_static() + def test_api_eager_dygraph(self): with _test_eager_guard(): self.test_api_dygraph() diff --git a/python/paddle/fluid/tests/unittests/test_trunc_op.py b/python/paddle/fluid/tests/unittests/test_trunc_op.py index 08a35db3ac4..b70fa04adc1 100644 --- a/python/paddle/fluid/tests/unittests/test_trunc_op.py +++ b/python/paddle/fluid/tests/unittests/test_trunc_op.py @@ -79,6 +79,16 @@ class TestTruncAPI(unittest.TestCase): self.assertEqual(np.allclose(out.numpy(), out_ref, rtol=1e-08), True) paddle.enable_static() + def test_api_eager(self): + paddle.disable_static(self.place) + + with _test_eager_guard(): + x_tensor = paddle.to_tensor(self.x) + out = paddle.trunc(x_tensor) + out_ref = np.trunc(self.x) + self.assertEqual(np.allclose(out.numpy(), out_ref, rtol=1e-08), True) + paddle.enable_static() + def test_api_eager_dygraph(self): with _test_eager_guard(): self.test_api_dygraph() diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index ce29e9dce81..9a013910565 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -27,7 +27,7 @@ from paddle.tensor import cast from paddle.tensor.attribute import _complex_to_real_dtype import paddle from paddle.static import Variable -from ..framework import core +from ..framework import core, _in_eager_mode from ..framework import _varbase_creator, convert_np_dtype_to_dtype_ from ..fluid.layer_helper import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype @@ -1083,6 +1083,8 @@ def trunc(input, name=None): # [0., 0.]])) ''' if 
paddle.in_dynamic_mode(): + if _in_eager_mode(): + return _C_ops.final_state_trunc(input) return _C_ops.trunc(input) else: inputs = {"X": input} @@ -2425,6 +2427,8 @@ def diagonal(x, offset=0, axis1=0, axis2=1, name=None): """ if paddle.in_dynamic_mode(): + if _in_eager_mode(): + return _C_ops.final_state_diagonal(x, offset, axis1, axis2) return _C_ops.diagonal(x, 'offset', offset, 'axis1', axis1, 'axis2', axis2) def __check_input(input, offset, dim1, dim2): @@ -3184,6 +3188,8 @@ def digamma(x, name=None): """ if paddle.in_dynamic_mode(): + if _in_eager_mode(): + return _C_ops.final_state_digamma(x) return _C_ops.digamma(x) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'digamma') diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 45a6aae5e6d..699e42f2373 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -193,3 +193,49 @@ args : (Tensor x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED) output : Tensor invoke : full_like(x, 0, dtype, place) + +- api : digamma + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : digamma + backward : digamma_grad + +- api : abs + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : abs + backward : abs_grad + +- api : trunc + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : trunc + backward : trunc_grad + +# - api : norm +# args : (Tensor x, int axis, float epsilon, bool is_test) +# output : Tensor(out), Tensor(norm) +# infer_meta : +# func : NormInferMeta +# kernel : +# func : norm +# intermediate : norm +# backward : norm_grad + +- api : diagonal + args : (Tensor x, int offset, int axis1, int axis2) + output : Tensor + infer_meta : + func : DiagonalInferMeta + kernel : + func : diagonal + backward : diagonal_grad diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index cdda5cb1f05..c69bbf35b97 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -25,6 +25,61 @@ output : Tensor(x_grad) invoke : scale(out_grad, scale, bias, bias_after_scale) +- backward_api : digamma_grad + forward : digamma (Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : digamma_grad + +- backward_api : abs_grad + forward : abs (Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : abs_grad + +- backward_api : trunc_grad + forward : trunc (Tensor x) -> Tensor(out) + args : (Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [out_grad] + kernel : + func : trunc_grad + +# - backward_api : norm_grad +# forward : norm (Tensor x, int axis, float epsilon, bool is_test) -> Tensor(out), Tensor(norm) +# args : (Tensor out_grad, Tensor x, Tensor norm, int axis, float epsilon, bool is_test) +# output : Tensor(x_grad) +# infer_meta : +# func : UnchangedInferMeta +# param : [x] +# kernel : +# func : norm_grad + +- backward_api : diagonal_grad + forward : diagonal (Tensor x, int offset, int axis1, int axis2) -> Tensor(out) + args : (Tensor x, Tensor out_grad, int offset = 0, int axis1 = 0, int axis2 = 1) + output : Tensor(x_grad) + infer_meta : + func : 
UnchangedInferMeta + param : [x] + kernel : + func : diagonal_grad + +# - backward_api : split_grad +# forward : split (Tensor x, ScalarArray num_or_sections, Scalar axis) -> Tensor[](out) +# args : (Tensor[] out_grad, Scalar axis) +# output : Tensor(x_grad) +# invoke : concat( out_grad, axis) # TODO(zhangyunfei) The config of double grad and triple grad will be supported in the future. # - backward_api : matmul_triple_grad -- GitLab From b7bbe39c9fac0867e1e129e2958b33fd958d5206 Mon Sep 17 00:00:00 2001 From: Linjie Chen <40840292+linjieccc@users.noreply.github.com> Date: Fri, 4 Mar 2022 15:40:03 +0800 Subject: [PATCH 054/261] [phi] move sigmoid_cross_entopy_with_logits log_loss cumsum auc kernel to phi (#39976) * move sigmoid cross entopy with logits to phi * fix ci * move log_loss to phi * move cumsum to phi * revert infershape * fix xpu ci * move auc to phi * remove comment * update sigmoid_cross_entropy_with_logits_op.cu * update sigmoid_cross_entropy_with_logits_op * Update log_loss --- paddle/fluid/operators/cum_op.h | 115 ------ paddle/fluid/operators/cumsum_op.cc | 7 +- paddle/fluid/operators/cumsum_op.cu | 325 ----------------- paddle/fluid/operators/cumsum_op_npu.cc | 2 +- paddle/fluid/operators/log_loss_op.cc | 12 +- paddle/fluid/operators/log_loss_op.h | 74 ---- paddle/fluid/operators/log_loss_op_npu.cc | 2 +- paddle/fluid/operators/log_loss_op_xpu.cc | 4 +- paddle/fluid/operators/metrics/auc_op.cc | 3 +- paddle/fluid/operators/metrics/auc_op.cu | 232 ------------ paddle/fluid/operators/metrics/auc_op.h | 186 ---------- .../sigmoid_cross_entropy_with_logits_op.cc | 14 +- .../sigmoid_cross_entropy_with_logits_op.cu | 264 -------------- .../sigmoid_cross_entropy_with_logits_op.h | 114 ------ ...igmoid_cross_entropy_with_logits_op_npu.cc | 3 +- ...igmoid_cross_entropy_with_logits_op_xpu.cc | 4 +- paddle/phi/kernels/auc_kernel.h | 36 ++ paddle/phi/kernels/cpu/auc_kernel.cc | 190 ++++++++++ paddle/phi/kernels/cpu/cumsum_kernel.cc | 143 ++++++++ .../phi/kernels/cpu/log_loss_grad_kernel.cc | 22 ++ paddle/phi/kernels/cpu/log_loss_kernel.cc | 21 ++ ...d_cross_entropy_with_logits_grad_kernel.cc | 70 ++++ ...igmoid_cross_entropy_with_logits_kernel.cc | 71 ++++ paddle/phi/kernels/cumsum_kernel.h | 30 ++ paddle/phi/kernels/gpu/auc_kernel.cu | 258 ++++++++++++++ paddle/phi/kernels/gpu/cumsum_kernel.cu | 336 ++++++++++++++++++ .../phi/kernels/gpu/log_loss_grad_kernel.cu | 22 ++ paddle/phi/kernels/gpu/log_loss_kernel.cu | 21 ++ .../gpu/sigmoid_cross_entropy_with_logits.h | 69 ++++ ...d_cross_entropy_with_logits_grad_kernel.cu | 126 +++++++ ...igmoid_cross_entropy_with_logits_kernel.cu | 128 +++++++ .../kernels/impl/log_loss_grad_kernel_impl.h | 43 +++ .../phi/kernels/impl/log_loss_kernel_impl.h | 40 +++ paddle/phi/kernels/log_loss_grad_kernel.h | 29 ++ paddle/phi/kernels/log_loss_kernel.h | 28 ++ ...id_cross_entropy_with_logits_grad_kernel.h | 30 ++ ...sigmoid_cross_entropy_with_logits_kernel.h | 29 ++ paddle/phi/ops/compat/log_loss_sig.cc | 29 ++ .../sigmoid_cross_entropy_with_logits_sig.cc | 31 ++ 39 files changed, 1817 insertions(+), 1346 deletions(-) delete mode 100644 paddle/fluid/operators/cum_op.h delete mode 100644 paddle/fluid/operators/cumsum_op.cu delete mode 100644 paddle/fluid/operators/log_loss_op.h delete mode 100644 paddle/fluid/operators/metrics/auc_op.cu delete mode 100644 paddle/fluid/operators/metrics/auc_op.h delete mode 100644 paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu delete mode 100644 paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h 
create mode 100644 paddle/phi/kernels/auc_kernel.h create mode 100644 paddle/phi/kernels/cpu/auc_kernel.cc create mode 100644 paddle/phi/kernels/cpu/cumsum_kernel.cc create mode 100644 paddle/phi/kernels/cpu/log_loss_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/log_loss_kernel.cc create mode 100644 paddle/phi/kernels/cpu/sigmoid_cross_entropy_with_logits_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/sigmoid_cross_entropy_with_logits_kernel.cc create mode 100644 paddle/phi/kernels/cumsum_kernel.h create mode 100644 paddle/phi/kernels/gpu/auc_kernel.cu create mode 100644 paddle/phi/kernels/gpu/cumsum_kernel.cu create mode 100644 paddle/phi/kernels/gpu/log_loss_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/log_loss_kernel.cu create mode 100644 paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h create mode 100644 paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu create mode 100644 paddle/phi/kernels/impl/log_loss_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/log_loss_kernel_impl.h create mode 100644 paddle/phi/kernels/log_loss_grad_kernel.h create mode 100644 paddle/phi/kernels/log_loss_kernel.h create mode 100644 paddle/phi/kernels/sigmoid_cross_entropy_with_logits_grad_kernel.h create mode 100644 paddle/phi/kernels/sigmoid_cross_entropy_with_logits_kernel.h create mode 100644 paddle/phi/ops/compat/log_loss_sig.cc create mode 100644 paddle/phi/ops/compat/sigmoid_cross_entropy_with_logits_sig.cc diff --git a/paddle/fluid/operators/cum_op.h b/paddle/fluid/operators/cum_op.h deleted file mode 100644 index ab3860ecafc..00000000000 --- a/paddle/fluid/operators/cum_op.h +++ /dev/null @@ -1,115 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { - -template -class CumKernel : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - - void Compute(const framework::ExecutionContext& context) const override { - auto& X = GET_DATA_SAFELY(context.Input("X"), "Input", - "X", "Cum"); - - auto& Out = GET_DATA_SAFELY(context.Output("Out"), - "Output", "Out", "Cum"); - int axis = context.Attr("axis"); - bool exclusive = context.Attr("exclusive"); - bool reverse = context.Attr("reverse"); - auto out_dims = Out.dims(); - - PADDLE_ENFORCE_EQ( - axis < out_dims.size() && axis >= (0 - out_dims.size()), true, - platform::errors::OutOfRange( - "Attr(axis) is out of range, It's expected " - "to be in range of [-%d, %d]. 
But received Attr(axis) = %d.", - out_dims.size(), out_dims.size() - 1, axis)); - if (axis < 0) { - axis += out_dims.size(); - } - - Out.template mutable_data(context.GetPlace()); - - int pre = 1; - int post = 1; - int mid = out_dims[axis]; - for (int i = 0; i < axis; ++i) { - pre *= out_dims[i]; - } - for (int i = axis + 1; i < out_dims.size(); ++i) { - post *= out_dims[i]; - } - - auto x = framework::EigenVector::Flatten(X); - auto out = framework::EigenVector::Flatten(Out); - auto* place = - context.template device_context().eigen_device(); - - using IndexT = Eigen::DenseIndex; - if (pre == 1) { - if (post == 1) { - ComputeImp(*place, Eigen::DSizes(mid), x, out, - /* axis= */ 0, reverse, exclusive); - } else { - ComputeImp(*place, Eigen::DSizes(mid, post), x, out, - /* axis= */ 0, reverse, exclusive); - } - } else { - if (post == 1) { - ComputeImp(*place, Eigen::DSizes(pre, mid), x, out, - /* axis= */ 1, reverse, exclusive); - } else { - ComputeImp(*place, Eigen::DSizes(pre, mid, post), x, out, - /* axis= */ 1, reverse, exclusive); - } - } - } - - private: - template - void ComputeImp(Device d, const Dim& dims, X x, Out out, int axis, - bool reverse, bool exclusive) const { - if (!reverse) { - out.reshape(dims).device(d) = Functor()(x.reshape(dims), axis, exclusive); - } else { - std::array rev; - rev.fill(false); - rev[axis] = reverse; - out.reshape(dims).device(d) = - Functor()(x.reshape(dims).reverse(rev), axis, exclusive).reverse(rev); - } - } -}; - -template -struct CumsumFunctor { - using ELEMENT_TYPE = T; - template - const typename X::TensorScanSumOp operator()(X x, int axis, - bool exclusive) const { - return x.cumsum(axis, exclusive); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc index 9fa355a9246..7c80917a713 100644 --- a/paddle/fluid/operators/cumsum_op.cc +++ b/paddle/fluid/operators/cumsum_op.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/cum_op.h" namespace paddle { namespace operators { @@ -91,11 +91,6 @@ using CPU = paddle::platform::CPUDeviceContext; REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker, ops::CumsumGradMaker, ops::CumsumGradMaker); -REGISTER_OP_CPU_KERNEL(cumsum, ops::CumKernel>, - ops::CumKernel>, - ops::CumKernel>, - ops::CumKernel>, - ops::CumKernel>); REGISTER_OP_VERSION(cumsum) .AddCheckpoint( diff --git a/paddle/fluid/operators/cumsum_op.cu b/paddle/fluid/operators/cumsum_op.cu deleted file mode 100644 index 3402f42521f..00000000000 --- a/paddle/fluid/operators/cumsum_op.cu +++ /dev/null @@ -1,325 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include -#ifdef __NVCC__ -#include -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include "paddle/fluid/operators/cum_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" - -using Tensor = paddle::framework::Tensor; -using LoDTensor = paddle::framework::LoDTensor; - -namespace paddle { -namespace operators { - -template -__device__ void BlockReverse(const T* idata, T* odata, int src_base, - int dst_base, int valid_item) { - __shared__ T sh_mem[BLOCK_SIZE]; - int tx = threadIdx.x; - - int offset = tx; - int in_index = src_base + offset; - if (offset >= valid_item) { - sh_mem[offset] = 0; - } else { - int sh_mem_index = BLOCK_SIZE - offset - 1; - T data = idata[in_index]; - sh_mem[sh_mem_index] = data; - } - - __syncthreads(); - int out_index = dst_base - offset; - if (offset < valid_item) { - int sh_mem_index = BLOCK_SIZE - offset - 1; - odata[out_index] = sh_mem[sh_mem_index]; - } -} - -template -__global__ void MatrixRowReverse(const T* matrix_data, T* reverse_data, - int reverse_size, int outer_size, - int inner_size) { - int bx = blockIdx.x; - int by = blockIdx.y; - int item_per_block = 1024; - - for (int block_offset = 0; block_offset < reverse_size; - block_offset += item_per_block) { - int valid_item = (reverse_size - block_offset > item_per_block) - ? item_per_block - : reverse_size - block_offset; - int src_offset = - bx * reverse_size + block_offset + by * (inner_size * reverse_size); - int dst_offset = bx * reverse_size + by * (inner_size * reverse_size) + - reverse_size - 1 - block_offset; - if (reverse_size < item_per_block) { - valid_item = reverse_size; - } - - BlockReverse(matrix_data, reverse_data, src_offset, dst_offset, - valid_item); - } -} - -template -struct BlockPrefixCallbackOp { - // Running prefix - T running_total; - // Constructor - __device__ BlockPrefixCallbackOp(T running_total) - : running_total(running_total) {} - // Callback operator to be entered by the first warp of threads in the block. - // Thread-0 is responsible for returning a value for seeding the block-wide - // scan. 
- __device__ T operator()(T block_aggregate) { - T old_prefix = running_total; - running_total = old_prefix + block_aggregate; - return old_prefix; - } -}; - -// No bank-conflict transpose -template -__global__ void MatrixTranspose(T* odata, const T* idata, size_t height, - size_t width) { - __shared__ T tile[TILE_DIM][TILE_DIM + 1]; - - int x = blockIdx.x * TILE_DIM + threadIdx.x; - int y = blockIdx.y * TILE_DIM + threadIdx.y; - for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { - if (x < width && (y + j) < height) { - tile[threadIdx.y + j][threadIdx.x] = idata[(y + j) * width + x]; - } else { - tile[threadIdx.y + j][threadIdx.x] = 0; - } - } - - __syncthreads(); - - x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset - y = blockIdx.x * TILE_DIM + threadIdx.y; - - for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { - if (x < height && (y + j) < width) { - odata[(y + j) * height + x] = tile[threadIdx.x][threadIdx.y + j]; - } - } -} - -template -__global__ void BlockScanKernel(T* d_out, const T* d_in, int inner_size, - int outer_size, int scan_size, bool exclusive) { - // Specialize BlockLoad, BlockStore, and BlockRadixSort collective types - typedef cub::BlockLoad - BlockLoadT; - typedef cub::BlockStore - BlockStoreT; - typedef cub::BlockScan BlockScanT; - // Allocate type-safe, repurposable shared memory for collectives - __shared__ union { - typename BlockLoadT::TempStorage load; - typename BlockStoreT::TempStorage store; - typename BlockScanT::TempStorage scan; - } temp_storage; - - int bx = blockIdx.x; - int by = blockIdx.y; - - BlockPrefixCallbackOp prefix_op(0); - T block_aggregate = static_cast(0); - - // Obtain this block's segment of consecutive keys (blocked across threads) - int item_per_block = BLOCK_THREADS * ITEMS_PER_THREAD; - for (int block_offset = 0; block_offset < scan_size; - block_offset += BLOCK_THREADS * ITEMS_PER_THREAD) { - int valid_item = (scan_size - block_offset > item_per_block) - ? item_per_block - : (scan_size - block_offset); - if (scan_size < item_per_block) { - valid_item = scan_size; - } - - int offset = bx * scan_size + block_offset + by * (inner_size * scan_size); - - T thread_keys[ITEMS_PER_THREAD]; - BlockLoadT(temp_storage.load) - .Load(d_in + offset, thread_keys, valid_item, 0); - - __syncthreads(); - if (exclusive) { - T init_value = static_cast(0); - BlockScanT(temp_storage.scan) - .ExclusiveScan(thread_keys, thread_keys, cub::Sum(), prefix_op); - } else { - BlockScanT(temp_storage.scan) - .InclusiveScan(thread_keys, thread_keys, cub::Sum(), prefix_op); - } - __syncthreads(); - - BlockStoreT(temp_storage.store) - .Store(d_out + offset, thread_keys, valid_item); - } -} - -template -class CumCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - - int axis = context.Attr("axis"); - bool exclusive = context.Attr("exclusive"); - bool reverse = context.Attr("reverse"); - auto out_dims = out->dims(); - auto size = in->numel(); - - PADDLE_ENFORCE_EQ( - axis < out_dims.size() && axis >= (0 - out_dims.size()), true, - platform::errors::OutOfRange( - "Attr(axis) is out of range, It's expected " - "to be in range of [-%d, %d]. 
But received Attr(axis) = %d.", - out_dims.size(), out_dims.size() - 1, axis)); - if (axis < 0) { - axis += out_dims.size(); - } - - T* out_data = out->mutable_data(context.GetPlace()); - const T* in_data = in->data(); - - // Use thrust for parallel acceleration when the input size is equal to the - // length of the ‘axis’ dimension. - if (size == out_dims[axis]) { - if (reverse) { - thrust::device_ptr dev_ptr = - thrust::device_pointer_cast(in_data); - thrust::device_vector vec(dev_ptr, dev_ptr + size); - if (exclusive) { - thrust::exclusive_scan(thrust::device, vec.rbegin(), vec.rend(), - out_data); - } else { - thrust::inclusive_scan(thrust::device, vec.rbegin(), vec.rend(), - out_data); - } - thrust::reverse(thrust::device, out_data, out_data + size); - } else { - if (exclusive) { - thrust::exclusive_scan(thrust::device, in_data, in_data + size, - out_data); - } else { - thrust::inclusive_scan(thrust::device, in_data, in_data + size, - out_data); - } - } - return; - } - - size_t height = 1; - size_t width = 1; - for (size_t i = 0; i <= axis; i++) { - height *= out_dims[i]; - } - - for (size_t i = axis + 1; i < out_dims.size(); i++) { - width *= out_dims[i]; - } - int scan_size = out_dims[axis]; - bool transpose = (axis != out_dims.size() - 1); - - int tile_size = 32; - dim3 blocks(32, 8); - dim3 transpose_grids((width + tile_size - 1) / tile_size, - (height + tile_size - 1) / tile_size); - auto& dev_ctx = context.template device_context(); - framework::Tensor tmp; - tmp.Resize(out_dims); - auto* tmp_data = tmp.mutable_data(context.GetPlace()); - T* next_in_data = out_data; - T* next_out_data = tmp_data; - if (transpose) { - MatrixTranspose<<>>( - out_data, in_data, height, width); - next_in_data = out_data; - next_out_data = tmp_data; - } - auto swap_ptr = [](T*& ptr1, T*& ptr2) { - T* tmp = ptr2; - ptr2 = ptr1; - ptr1 = tmp; - }; - int outer_size = height / scan_size; - int inner_size = width; - // Consider the size of shared memory, here block size is 128 - dim3 scan_grid(outer_size, inner_size); - dim3 reverse_grid = scan_grid; - if (reverse) { - if (transpose) { - reverse_grid.x = scan_grid.y; - reverse_grid.y = scan_grid.x; - MatrixRowReverse<<>>( - next_in_data, next_out_data, scan_size, outer_size, inner_size); - if (!transpose) next_in_data = tmp_data; - swap_ptr(next_in_data, next_out_data); - } else { - MatrixRowReverse<<>>( - in_data, out_data, scan_size, outer_size, inner_size); - } - } - if (!transpose && !reverse) { - BlockScanKernel<<>>( - out_data, in_data, outer_size, inner_size, scan_size, exclusive); - - } else { - BlockScanKernel<<>>( - next_out_data, next_in_data, outer_size, inner_size, scan_size, - exclusive); - } - swap_ptr(next_in_data, next_out_data); - if (reverse) { - MatrixRowReverse<<>>( - next_in_data, next_out_data, scan_size, outer_size, inner_size); - swap_ptr(next_in_data, next_out_data); - } - if (transpose) { - transpose_grids.x = (height + tile_size - 1) / tile_size; - transpose_grids.y = (width + tile_size - 1) / tile_size; - MatrixTranspose<<>>( - next_out_data, next_in_data, width, height); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - cumsum, ops::CumCUDAKernel, - ops::CumCUDAKernel, - ops::CumCUDAKernel, - ops::CumCUDAKernel, - ops::CumCUDAKernel); diff --git a/paddle/fluid/operators/cumsum_op_npu.cc b/paddle/fluid/operators/cumsum_op_npu.cc index 38bf53ca0aa..d197e4362e9 100644 --- a/paddle/fluid/operators/cumsum_op_npu.cc +++ 
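The CumKernel / CumCUDAKernel pair deleted above (now served by phi's cumsum kernels) computes an inclusive scan with optional exclusive and reverse flags, falling back to thrust when the whole input is a single scan and to a tiled block scan otherwise. A NumPy reference for just the flag semantics, offered as a sketch rather than a drop-in test:

import numpy as np

def cumsum_ref(x, axis, exclusive=False, reverse=False):
    x = np.asarray(x)
    if reverse:
        x = np.flip(x, axis)
    out = np.cumsum(x, axis)
    if exclusive:
        out = out - x            # shift so each element excludes itself
    if reverse:
        out = np.flip(out, axis)
    return out

a = np.array([[1, 2, 3], [4, 5, 6]])
print(cumsum_ref(a, axis=1))                    # [[1 3 6], [4 9 15]]
print(cumsum_ref(a, axis=1, exclusive=True))    # [[0 1 3], [0 4 9]]
print(cumsum_ref(a, axis=1, reverse=True))      # [[6 5 3], [15 11 6]]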
b/paddle/fluid/operators/cumsum_op_npu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/cum_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc index df4d0ebbccd..2e596ff3e62 100644 --- a/paddle/fluid/operators/log_loss_op.cc +++ b/paddle/fluid/operators/log_loss_op.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/log_loss_op.h" #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -149,13 +149,3 @@ REGISTER_OPERATOR(log_loss, ops::LogLossOp, ops::LogLossOpMaker, ops::LogLossGradMaker, ops::LogLossGradMaker); REGISTER_OPERATOR(log_loss_grad, ops::LogLossGradOp); -REGISTER_OP_CPU_KERNEL( - log_loss, ops::LogLossKernel); -REGISTER_OP_CPU_KERNEL( - log_loss_grad, - ops::LogLossGradKernel); -REGISTER_OP_CUDA_KERNEL( - log_loss, ops::LogLossKernel); -REGISTER_OP_CUDA_KERNEL( - log_loss_grad, - ops::LogLossGradKernel); diff --git a/paddle/fluid/operators/log_loss_op.h b/paddle/fluid/operators/log_loss_op.h deleted file mode 100644 index e7985ab810b..00000000000 --- a/paddle/fluid/operators/log_loss_op.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -template -using EigenVector = framework::EigenVector; - -template -class LogLossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* loss_out = ctx.Output("Loss"); - - loss_out->mutable_data(ctx.GetPlace()); - - auto epsilon = static_cast(ctx.Attr("epsilon")); - - auto prediction = EigenVector::Flatten(*ctx.Input("Predicted")); - auto label = EigenVector::Flatten(*ctx.Input("Labels")); - - auto loss = EigenVector::Flatten(*loss_out); - auto& place = *ctx.template device_context().eigen_device(); - - EigenLogLoss, T>::Eval( - place, loss, prediction, label, epsilon); - } -}; - -template -class LogLossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto epsilon = static_cast(ctx.Attr("epsilon")); - - auto prediction = EigenVector::Flatten(*ctx.Input("Predicted")); - auto label = EigenVector::Flatten(*ctx.Input("Labels")); - - auto* dloss = ctx.Input(framework::GradVarName("Loss")); - auto* dpred = ctx.Output(framework::GradVarName("Predicted")); - - auto dl = EigenVector::Flatten(*dloss); - auto& place = *ctx.template device_context().eigen_device(); - - if (dpred) { - dpred->mutable_data(ctx.GetPlace()); - auto dx = framework::EigenVector::Flatten(*dpred); - EigenLogLossGrad, T>::Eval( - place, dx, dl, prediction, label, epsilon); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/log_loss_op_npu.cc b/paddle/fluid/operators/log_loss_op_npu.cc index 9775910bba5..f103a69707a 100644 --- a/paddle/fluid/operators/log_loss_op_npu.cc +++ b/paddle/fluid/operators/log_loss_op_npu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/log_loss_op.h" #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/log_loss_op_xpu.cc b/paddle/fluid/operators/log_loss_op_xpu.cc index b2e68e9870d..aa5fdd86745 100644 --- a/paddle/fluid/operators/log_loss_op_xpu.cc +++ b/paddle/fluid/operators/log_loss_op_xpu.cc @@ -10,11 +10,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/log_loss_op.h" #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class LogLossXPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/metrics/auc_op.cc b/paddle/fluid/operators/metrics/auc_op.cc index 2a3a0fa5d1f..54ecba08a82 100644 --- a/paddle/fluid/operators/metrics/auc_op.cc +++ b/paddle/fluid/operators/metrics/auc_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
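The LogLossKernel deleted above evaluates the binary log loss through an Eigen expression; as I read the functor, it adds the epsilon attribute inside both logarithms for numerical stability. A NumPy sketch of that assumed formula (illustrative, not copied from the Eigen code):

import numpy as np

def log_loss_ref(pred, label, epsilon=1e-4):
    # negative log-likelihood of a 0/1 label under the predicted probability
    return (-label * np.log(pred + epsilon)
            - (1.0 - label) * np.log(1.0 - pred + epsilon))

pred = np.array([0.9, 0.2, 0.7], dtype='float32')
label = np.array([1.0, 0.0, 1.0], dtype='float32')
print(log_loss_ref(pred, label))   # small losses for confident, correct predictions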
*/ -#include "paddle/fluid/operators/metrics/auc_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -146,4 +146,3 @@ There are two types of possible curves: namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(auc, ops::AucOp, ops::AucOpMaker); -REGISTER_OP_CPU_KERNEL(auc, ops::AucKernel); diff --git a/paddle/fluid/operators/metrics/auc_op.cu b/paddle/fluid/operators/metrics/auc_op.cu deleted file mode 100644 index 1cb7eba8775..00000000000 --- a/paddle/fluid/operators/metrics/auc_op.cu +++ /dev/null @@ -1,232 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/metrics/auc_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -__global__ void ClearObsoleteDataKernel(int64_t *pos, int64_t *neg, - const int bucket_length, - const int slide_steps) { - int cur_step_index = - static_cast(pos[(slide_steps + 1) * bucket_length]) % slide_steps; - int cur_step_begin = cur_step_index * bucket_length; - int sum_step_begin = slide_steps * bucket_length; - CUDA_KERNEL_LOOP(i, bucket_length) { - pos[sum_step_begin + i] -= pos[cur_step_begin + i]; - neg[sum_step_begin + i] -= neg[cur_step_begin + i]; - pos[cur_step_begin + i] = neg[cur_step_begin + i] = 0; - } -} - -__global__ void UpdateSumDataKernel(int64_t *pos, int64_t *neg, - const int bucket_length, - const int slide_steps) { - int cur_step_index = - static_cast(pos[(slide_steps + 1) * bucket_length]) % slide_steps; - int cur_step_begin = cur_step_index * bucket_length; - int sum_step_begin = slide_steps * bucket_length; - CUDA_KERNEL_LOOP(i, bucket_length) { - pos[sum_step_begin + i] += pos[cur_step_begin + i]; - neg[sum_step_begin + i] += neg[cur_step_begin + i]; - } -} - -template -__global__ void AddDataKernel(const int64_t *label_data, const T *pred_data, - const int inference_width, - const int num_thresholds, int64_t *pos, - int64_t *neg, const int numel, - const int slide_steps) { - int cur_step_begin = 0; - if (slide_steps > 0) { - int cur_step_index = - static_cast(pos[(slide_steps + 1) * (1 + num_thresholds)]) % - slide_steps; - cur_step_begin = cur_step_index * (1 + num_thresholds); - } - CUDA_KERNEL_LOOP(i, numel) { - auto predict_data = pred_data[i * inference_width + (inference_width - 1)]; - PADDLE_ENFORCE(predict_data <= 1, "The predict data must less or equal 1."); - PADDLE_ENFORCE(predict_data >= 0, - "The predict data must gather or equal 0."); - uint32_t binIdx = static_cast(predict_data * num_thresholds); - if (label_data[i]) { - paddle::platform::CudaAtomicAdd(pos + cur_step_begin + binIdx, 1); - } else { - paddle::platform::CudaAtomicAdd(neg + cur_step_begin + binIdx, 1); - } - } -} -__global__ void 
CalcAucKernel(int64_t *stat_pos, int64_t *stat_neg, - int num_thresholds, double *auc, - bool need_add_batch_num) { - *auc = 0.0f; - double totPos = 0.0; - double totNeg = 0.0; - double totPosPrev = 0.0; - double totNegPrev = 0.0; - - int idx = num_thresholds; - - while (idx >= 0) { - totPosPrev = totPos; - totNegPrev = totNeg; - totPos += stat_pos[idx]; - totNeg += stat_neg[idx]; - *auc += (totNeg - totNegPrev) * (totPos + totPosPrev) / 2.0; - --idx; - } - - if (totPos > 0.0 && totNeg > 0.0) { - *auc = *auc / totPos / totNeg; - } - if (need_add_batch_num) { - stat_pos[num_thresholds + 1] += 1; - stat_neg[num_thresholds + 1] += 1; - } -} - -template -class AucCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *predict = ctx.Input("Predict"); - auto *label = ctx.Input("Label"); - - int num_thresholds = ctx.Attr("num_thresholds"); - int slide_steps = ctx.Attr("slide_steps"); - - // Only use output var for now, make sure it's persistable and - // not cleaned up for each batch. - auto *auc_tensor = ctx.Output("AUC"); - auto *stat_pos = ctx.Output("StatPosOut"); - auto *stat_neg = ctx.Output("StatNegOut"); - - auto *origin_stat_pos = stat_pos->mutable_data(ctx.GetPlace()); - auto *origin_stat_neg = stat_neg->mutable_data(ctx.GetPlace()); - auto *auc_value = auc_tensor->mutable_data(ctx.GetPlace()); - - auto *stat_pos_in_tensor = ctx.Input("StatPos"); - auto *pos_in_data = stat_pos_in_tensor->data(); - auto *stat_neg_in_tensor = ctx.Input("StatNeg"); - auto *neg_in_data = stat_neg_in_tensor->data(); -#ifdef PADDLE_WITH_CUDA - if (stat_pos_in_tensor != stat_pos) { - cudaMemcpy(origin_stat_pos, pos_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t), - cudaMemcpyDeviceToDevice); - } - if (stat_neg_in_tensor != stat_neg) { - cudaMemcpy(origin_stat_neg, neg_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t), - cudaMemcpyDeviceToDevice); - } -#else - if (stat_pos_in_tensor != stat_pos) { - hipMemcpy(origin_stat_pos, pos_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t), - hipMemcpyDeviceToDevice); - } - if (stat_neg_in_tensor != stat_neg) { - hipMemcpy(origin_stat_neg, neg_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t), - hipMemcpyDeviceToDevice); - } -#endif - - statAuc(ctx, label, predict, num_thresholds, slide_steps, origin_stat_pos, - origin_stat_neg); - int sum_offset = slide_steps * (num_thresholds + 1); - auto stream = - ctx.template device_context().stream(); - CalcAucKernel<<<1, 1, 0, stream>>>( - origin_stat_pos + sum_offset, origin_stat_neg + sum_offset, - num_thresholds, auc_value, slide_steps > 0); - } - - private: - inline static double trapezoidArea(double X1, double X2, double Y1, - double Y2) { - return (X1 > X2 ? 
(X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0; - } - - inline static void statAuc(const framework::ExecutionContext &ctx, - const framework::Tensor *label, - const framework::Tensor *predict, - const int num_thresholds, const int slide_steps, - int64_t *origin_stat_pos, - int64_t *origin_stat_neg) { - size_t batch_size = predict->dims()[0]; - size_t inference_width = predict->dims()[1]; - const T *inference_data = predict->data(); - const auto *label_data = label->data(); - const int bucket_length = num_thresholds + 1; - auto stream = - ctx.template device_context().stream(); - if (slide_steps == 0) { - AddDataKernel<<<(batch_size + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - label_data, inference_data, inference_width, num_thresholds, - origin_stat_pos, origin_stat_neg, batch_size, slide_steps); - return; - } - // the last number of origin_stat_pos store the index should be used in - // current step - int cur_step_index = - static_cast(origin_stat_pos[(slide_steps + 1) * bucket_length]) % - slide_steps; - int cur_step_begin = cur_step_index * bucket_length; - int sum_step_begin = slide_steps * bucket_length; - - ClearObsoleteDataKernel<<<(bucket_length + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - origin_stat_pos, origin_stat_neg, bucket_length, slide_steps); - - AddDataKernel<<<(batch_size + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - label_data, inference_data, inference_width, num_thresholds, - origin_stat_pos, origin_stat_neg, batch_size, slide_steps); - UpdateSumDataKernel<<<(bucket_length + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - origin_stat_pos, origin_stat_neg, bucket_length, slide_steps); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(auc, - ops::AucCUDAKernel); diff --git a/paddle/fluid/operators/metrics/auc_op.h b/paddle/fluid/operators/metrics/auc_op.h deleted file mode 100644 index 10403472c69..00000000000 --- a/paddle/fluid/operators/metrics/auc_op.h +++ /dev/null @@ -1,186 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class AucKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *predict = ctx.Input("Predict"); - auto *label = ctx.Input("Label"); - - int num_thresholds = ctx.Attr("num_thresholds"); - int slide_steps = ctx.Attr("slide_steps"); - - // Only use output var for now, make sure it's persistable and - // not cleaned up for each batch. 
- auto *auc_tensor = ctx.Output("AUC"); - auto *stat_pos = ctx.Output("StatPosOut"); - auto *stat_neg = ctx.Output("StatNegOut"); - - auto *origin_stat_pos = stat_pos->mutable_data(ctx.GetPlace()); - auto *origin_stat_neg = stat_neg->mutable_data(ctx.GetPlace()); - auto *auc_value = auc_tensor->mutable_data(ctx.GetPlace()); - - // Just for pass UT, since UT's input & output connot be set same var - auto *stat_pos_in_tensor = ctx.Input("StatPos"); - auto *pos_in_data = stat_pos_in_tensor->data(); - auto *stat_neg_in_tensor = ctx.Input("StatNeg"); - auto *neg_in_data = stat_neg_in_tensor->data(); - if (stat_pos_in_tensor != stat_pos) { - memcpy(origin_stat_pos, pos_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t)); - } - if (stat_neg_in_tensor != stat_neg) { - memcpy(origin_stat_neg, neg_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t)); - } - statAuc(label, predict, num_thresholds, slide_steps, origin_stat_pos, - origin_stat_neg); - - int sum_offset = slide_steps * (num_thresholds + 1); - calcAuc(origin_stat_pos + sum_offset, origin_stat_neg + sum_offset, - num_thresholds, auc_value); - if (slide_steps) { - origin_stat_pos[(slide_steps + 1) * (num_thresholds + 1)] += 1; - origin_stat_neg[(slide_steps + 1) * (num_thresholds + 1)] += 1; - } - } - - private: - inline static double trapezoidArea(double X1, double X2, double Y1, - double Y2) { - return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0; - } - - inline static void statAuc(const framework::Tensor *label, - const framework::Tensor *predict, - const int num_thresholds, const int slide_steps, - int64_t *origin_stat_pos, - int64_t *origin_stat_neg) { - size_t batch_size = predict->dims()[0]; - size_t inference_width = predict->dims()[1]; - const T *inference_data = predict->data(); - const auto *label_data = label->data(); - const int bucket_length = num_thresholds + 1; - if (slide_steps == 0) { - for (size_t i = 0; i < batch_size; i++) { - // if predict_data[i] has dim of 2, then predict_data[i][1] is pos prob - // if predict_data[i] has dim of 1, then predict_data[i][0] is pos prob - auto predict_data = - inference_data[i * inference_width + (inference_width - 1)]; - PADDLE_ENFORCE_LE(predict_data, 1, - platform::errors::PreconditionNotMet( - "The predict data must less or equal 1.")); - PADDLE_ENFORCE_GE(predict_data, 0, - platform::errors::PreconditionNotMet( - "The predict data must gather or equal 0.")); - - uint32_t binIdx = static_cast(predict_data * num_thresholds); - if (label_data[i] > 0) { - origin_stat_pos[binIdx] += 1; - } else if (label_data[i] == 0) { - origin_stat_neg[binIdx] += 1; - } - } - return; - } - // the last number of origin_stat_pos store the index should be used in - // current step - int cur_step_index = - static_cast(origin_stat_pos[(slide_steps + 1) * bucket_length]) % - slide_steps; - int cur_step_begin = cur_step_index * bucket_length; - int sum_step_begin = slide_steps * bucket_length; - for (int i = 0; i < bucket_length; ++i) { - origin_stat_pos[sum_step_begin + i] -= - origin_stat_pos[cur_step_begin + i]; - origin_stat_neg[sum_step_begin + i] -= - origin_stat_neg[cur_step_begin + i]; - } - - std::memset(origin_stat_pos + cur_step_begin, 0, - bucket_length * sizeof(int64_t)); - std::memset(origin_stat_neg + cur_step_begin, 0, - bucket_length * sizeof(int64_t)); - - for (size_t i = 0; i < batch_size; i++) { - // if predict_data[i] has dim of 2, then predict_data[i][1] is pos prob - // if 
predict_data[i] has dim of 1, then predict_data[i][0] is pos prob - auto predict_data = - inference_data[i * inference_width + (inference_width - 1)]; - PADDLE_ENFORCE_LE(predict_data, 1, - platform::errors::PreconditionNotMet( - "The predict data must less or equal 1.")); - PADDLE_ENFORCE_GE(predict_data, 0, - platform::errors::PreconditionNotMet( - "The predict data must gather or equal 0.")); - - uint32_t binIdx = static_cast(predict_data * num_thresholds); - if (label_data[i] > 0) { - origin_stat_pos[cur_step_begin + binIdx] += 1; - } else if (label_data[i] == 0) { - origin_stat_neg[cur_step_begin + binIdx] += 1; - } - } - for (int i = 0; i < bucket_length; ++i) { - origin_stat_pos[sum_step_begin + i] += - origin_stat_pos[cur_step_begin + i]; - origin_stat_neg[sum_step_begin + i] += - origin_stat_neg[cur_step_begin + i]; - } - } - - inline static void calcAuc(const int64_t *stat_pos, const int64_t *stat_neg, - int num_thresholds, double *auc) { - *auc = 0.0f; - - double totPos = 0.0; - double totNeg = 0.0; - double totPosPrev = 0.0; - double totNegPrev = 0.0; - - int idx = num_thresholds; - - while (idx >= 0) { - totPosPrev = totPos; - totNegPrev = totNeg; - totPos += stat_pos[idx]; - totNeg += stat_neg[idx]; - *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev); - --idx; - } - - if (totPos > 0.0 && totNeg > 0.0) { - *auc = *auc / totPos / totNeg; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index a4e80343903..8e502fc04db 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" #include #include #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { using framework::Tensor; +const int kIgnoreIndex = -100; class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel { public: @@ -209,14 +210,3 @@ REGISTER_OPERATOR( REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits_grad, ops::SigmoidCrossEntropyWithLogitsGradOp, ops::SigmoidCrossEntropyWithLogitsGradInplaceInferer); -REGISTER_OP_CPU_KERNEL( - sigmoid_cross_entropy_with_logits, - ops::SigmoidCrossEntropyWithLogitsKernel, - ops::SigmoidCrossEntropyWithLogitsKernel); -REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits_grad, - ops::SigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CPUDeviceContext, float>, - ops::SigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CPUDeviceContext, double>); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu deleted file mode 100644 index 18402d908c4..00000000000 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu +++ /dev/null @@ -1,264 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/operators/math.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -#ifdef __HIPCC__ -static constexpr int kNumCUDAThreads = 256; -#else -static constexpr int kNumCUDAThreads = 512; -#endif -static constexpr int kNumMaxinumNumBlocks = 4096; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaxinumNumBlocks); -} - -template -struct NonzeroFunctor { - HOSTDEVICE explicit inline NonzeroFunctor() {} - HOSTDEVICE inline T operator()(const T x) const { - return static_cast(static_cast(x) != 0); - } -}; - -template -struct SigmoidFwdFunctor { - T ignore_index_; - T eps = static_cast(1e-5); - - HOSTDEVICE inline SigmoidFwdFunctor(const T ignore_index) - : ignore_index_(ignore_index) {} - - HOSTDEVICE inline phi::Array operator()(const T x, const T label) { - T counts; - T out_data; - - T diff = label - static_cast(ignore_index_); - if ((diff > -eps) && (diff < eps)) { - out_data = static_cast(0.); - counts = 0; - } else { - T term1 = (x > 0) ? 
x : 0; - T term2 = x * label; - T term3 = real_log(static_cast(1) + real_exp(static_cast(-abs(x)))); - - out_data = term1 - term2 + term3; - counts = 1; - } - phi::Array outs; - - outs[0] = out_data; - outs[1] = counts; - return outs; - } -}; - -template -struct SigmoidBwdFunctor { - T ignore_index_; - T eps = static_cast(1e-5); - - HOSTDEVICE inline SigmoidBwdFunctor(const T ignore_index) - : ignore_index_(ignore_index) {} - - HOSTDEVICE inline phi::Array operator()(const T x, const T label, - const T dout) { - T counts; - T dx_data; - - T diff = label - static_cast(ignore_index_); - if ((diff > -eps) && (diff < eps)) { - dx_data = static_cast(0.); - counts = 0; - } else { - T simoid_x = static_cast(1) / (static_cast(1) + real_exp(-x)); - T diff = simoid_x - label; - dx_data = dout * diff; - counts = 1; - } - phi::Array outs; - - outs[0] = dx_data; - outs[1] = counts; - return outs; - } -}; - -template -struct DivFunctor { - const T norm_; - HOSTDEVICE inline DivFunctor(const T norm) : norm_(norm) {} - - HOSTDEVICE inline T operator()(T loss) { - loss /= norm_; - return loss; - } -}; - -// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) -template -class GPUSigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - Tensor *Out = context.Output("Out"); - int ignore_index = context.Attr("ignore_index"); - auto out_data = Out->mutable_data(context.GetPlace()); - - auto &dev_ctx = context.cuda_device_context(); - bool normalize = context.Attr("normalize"); - - // Temporary memory - Tensor *counts_tensor = new Tensor(); - counts_tensor->mutable_data(context.GetPlace(), - Labels->numel() * sizeof(T)); - counts_tensor->Resize(Out->dims()); - int limit = Out->numel(); - int blocks = NumBlocks(limit); - int threads = kNumCUDAThreads; - std::vector ins = {X, Labels}; - std::vector outs = {Out, counts_tensor}; - auto functor = SigmoidFwdFunctor(ignore_index); - constexpr int Size = 2; - phi::funcs::ElementwiseKernel(dev_ctx, ins, - &outs, functor); - if (normalize) { - T *counts = counts_tensor->mutable_data(context.GetPlace()); - Tensor *norm_tensor = new Tensor(); - norm_tensor->mutable_data(context.GetPlace(), sizeof(T)); - auto dims = phi::vectorize(counts_tensor->dims()); - std::vector reduce_dim = {}; - for (int i = 0; i < dims.size(); i++) { - reduce_dim.push_back(i); - } - - TensorReduceImpl>( - context.cuda_device_context(), *counts_tensor, norm_tensor, - NonzeroFunctor(), reduce_dim, dev_ctx.stream()); - T *norm = norm_tensor->mutable_data(context.GetPlace()); - auto norm_cpu_mem = memory::Alloc(platform::CPUPlace(), sizeof(T)); - T *norm_cpu_ptr = reinterpret_cast(norm_cpu_mem->ptr()); - memory::Copy(platform::CPUPlace(), norm_cpu_ptr, dev_ctx.GetPlace(), norm, - sizeof(T), dev_ctx.stream()); - auto eps = static_cast(1e-5); - *norm_cpu_ptr = *norm_cpu_ptr > eps ? 
*norm_cpu_ptr : eps; - - std::vector div_ins = {Out}; - std::vector div_outs = {Out}; - auto div_functor = DivFunctor(*norm_cpu_ptr); - phi::funcs::ElementwiseKernel(dev_ctx, div_ins, &div_outs, - div_functor); - - delete norm_tensor; - delete counts_tensor; - } - } -}; - -// dX = sigmoid(X) - labels -template -class GPUSigmoidCrossEntropyWithLogitsGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - const Tensor *dOut = context.Input(framework::GradVarName("Out")); - Tensor *dX = context.Output(framework::GradVarName("X")); - auto dx_data = dX->mutable_data(context.GetPlace()); - - int ignore_index = context.Attr("ignore_index"); - - auto &dev_ctx = context.cuda_device_context(); - // Temporary memory - Tensor *counts_tensor = new Tensor(); - counts_tensor->mutable_data(context.GetPlace(), - Labels->numel() * sizeof(T)); - counts_tensor->Resize(dX->dims()); - - int limit = dX->numel(); - int blocks = NumBlocks(limit); - int threads = kNumCUDAThreads; - std::vector ins = {X, Labels, dOut}; - std::vector outs = {dX, counts_tensor}; - auto functor = SigmoidBwdFunctor(ignore_index); - constexpr int Size = 2; - phi::funcs::ElementwiseKernel(dev_ctx, ins, - &outs, functor); - bool normalize = context.Attr("normalize"); - if (normalize) { - T *counts = counts_tensor->mutable_data(context.GetPlace()); - Tensor *norm_tensor = new Tensor(); - norm_tensor->mutable_data(context.GetPlace(), sizeof(T)); - auto dims = phi::vectorize(counts_tensor->dims()); - std::vector reduce_dim = {}; - for (int i = 0; i < dims.size(); i++) { - reduce_dim.push_back(i); - } - - TensorReduceImpl>( - context.cuda_device_context(), *counts_tensor, norm_tensor, - NonzeroFunctor(), reduce_dim, dev_ctx.stream()); - T *norm = norm_tensor->mutable_data(context.GetPlace()); - auto norm_cpu_mem = memory::Alloc(platform::CPUPlace(), sizeof(T)); - T *norm_cpu_ptr = reinterpret_cast(norm_cpu_mem->ptr()); - memory::Copy(platform::CPUPlace(), norm_cpu_ptr, dev_ctx.GetPlace(), norm, - sizeof(T), dev_ctx.stream()); - auto eps = static_cast(1e-5); - *norm_cpu_ptr = *norm_cpu_ptr > eps ? *norm_cpu_ptr : eps; - - std::vector div_ins = {dX}; - std::vector div_outs = {dX}; - auto div_functor = DivFunctor(*norm_cpu_ptr); - phi::funcs::ElementwiseKernel(dev_ctx, div_ins, &div_outs, - div_functor); - delete norm_tensor; - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits, - ops::GPUSigmoidCrossEntropyWithLogitsKernel< - paddle::platform::CUDADeviceContext, float>, - ops::GPUSigmoidCrossEntropyWithLogitsKernel< - paddle::platform::CUDADeviceContext, double>); -REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits_grad, - ops::GPUSigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CUDADeviceContext, float>, - ops::GPUSigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CUDADeviceContext, double>); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h deleted file mode 100644 index d2ced490cef..00000000000 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h +++ /dev/null @@ -1,114 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -const int kIgnoreIndex = -100; - -// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) -template -class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - Tensor *Out = context.Output("Out"); - int ignore_index = context.Attr("ignore_index"); - auto out_data = Out->mutable_data(context.GetPlace()); - int limit = Out->numel(); - auto x_data = X->data(); - auto label_data = Labels->data(); - for (int idx = 0; idx < limit; ++idx) { - T x = x_data[idx]; - T label = label_data[idx]; - if (static_cast(label) == ignore_index) { - out_data[idx] = static_cast(0.); - } else { - T term1 = (x > 0) ? x : 0; - T term2 = x * label; - T term3 = std::log(static_cast(1) + std::exp(-std::abs(x))); - out_data[idx] = term1 - term2 + term3; - } - } - bool normalize = context.Attr("normalize"); - if (normalize) { - int norm = 0; - T eps = static_cast(1e-6); - for (int idx = 0; idx < limit; ++idx) { - T diff = label_data[idx] - static_cast(ignore_index); - if ((diff < -eps) || (diff > eps)) { - norm += 1; - } - } - eps = static_cast(1e-5); - norm = norm > eps ? norm : eps; - std::for_each(out_data, out_data + limit, [norm](T &v) { v = v / norm; }); - } - } -}; - -// dX = sigmoid(X) - labels -template -class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - const Tensor *dOut = context.Input(framework::GradVarName("Out")); - Tensor *dX = context.Output(framework::GradVarName("X")); - auto dx_data = dX->mutable_data(context.GetPlace()); - - int ignore_index = context.Attr("ignore_index"); - int limit = dX->numel(); - auto x_data = X->data(); - auto label_data = Labels->data(); - auto dout_data = dOut->data(); - for (int idx = 0; idx < limit; ++idx) { - T x = x_data[idx]; - T label = label_data[idx]; - T dout = dout_data[idx]; - if (static_cast(label) == ignore_index) { - dx_data[idx] = static_cast(0.); - } else { - T simoid_x = static_cast(1) / (static_cast(1) + std::exp(-x)); - T diff = simoid_x - label; - dx_data[idx] = dout * diff; - } - } - bool normalize = context.Attr("normalize"); - if (normalize) { - int norm = 0; - T eps = static_cast(1e-6); - for (int idx = 0; idx < limit; ++idx) { - T diff = label_data[idx] - static_cast(ignore_index); - if ((diff < -eps) || (diff > eps)) { - norm += 1; - } - } - eps = static_cast(1e-5); - norm = norm > eps ? 
norm : eps; - std::for_each(dx_data, dx_data + limit, [norm](T &v) { v = v / norm; }); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc index 40852425997..f186f95a2b9 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc @@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; +const int kIgnoreIndex = -100; void CheckAttrs(const framework::ExecutionContext& ctx) { // Add this check is is due to Ascend SigmoidCrossEntropyWithLogits diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc index 6395aa1caa0..c37731580d1 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc @@ -17,13 +17,15 @@ #include #include -#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class SigmoidCrossEntropyWithLogitsXPUKernel : public framework::OpKernel { using XPUType = typename XPUTypeTrait::Type; diff --git a/paddle/phi/kernels/auc_kernel.h b/paddle/phi/kernels/auc_kernel.h new file mode 100644 index 00000000000..acbd17c7801 --- /dev/null +++ b/paddle/phi/kernels/auc_kernel.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void AucKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + const DenseTensor& stat_pos, + const DenseTensor& stat_neg, + const std::string& curve, + int num_thresholds, + int slide_steps, + DenseTensor* auc, + DenseTensor* stat_pos_out, + DenseTensor* stat_neg_out); + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/auc_kernel.cc b/paddle/phi/kernels/cpu/auc_kernel.cc new file mode 100644 index 00000000000..bc25091de75 --- /dev/null +++ b/paddle/phi/kernels/cpu/auc_kernel.cc @@ -0,0 +1,190 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/auc_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +inline static double trapezoidArea(double X1, double X2, double Y1, double Y2) { + return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0; +} + +template +void statAuc(const DenseTensor &label, + const DenseTensor &predict, + const int num_thresholds, + const int slide_steps, + int64_t *origin_stat_pos, + int64_t *origin_stat_neg) { + size_t batch_size = predict.dims()[0]; + size_t inference_width = predict.dims()[1]; + const T *inference_data = predict.data(); + const auto *label_data = label.data(); + const int bucket_length = num_thresholds + 1; + if (slide_steps == 0) { + for (size_t i = 0; i < batch_size; i++) { + // if predict_data[i] has dim of 2, then predict_data[i][1] is pos prob + // if predict_data[i] has dim of 1, then predict_data[i][0] is pos prob + auto predict_data = + inference_data[i * inference_width + (inference_width - 1)]; + PADDLE_ENFORCE_LE(predict_data, + 1, + phi::errors::PreconditionNotMet( + "The predict data must less or equal 1.")); + PADDLE_ENFORCE_GE(predict_data, + 0, + phi::errors::PreconditionNotMet( + "The predict data must gather or equal 0.")); + + uint32_t binIdx = static_cast(predict_data * num_thresholds); + if (label_data[i] > 0) { + origin_stat_pos[binIdx] += 1; + } else if (label_data[i] == 0) { + origin_stat_neg[binIdx] += 1; + } + } + return; + } + // the last number of origin_stat_pos store the index should be used in + // current step + int cur_step_index = + static_cast(origin_stat_pos[(slide_steps + 1) * bucket_length]) % + slide_steps; + int cur_step_begin = cur_step_index * bucket_length; + int sum_step_begin = slide_steps * bucket_length; + for (int i = 0; i < bucket_length; ++i) { + origin_stat_pos[sum_step_begin + i] -= origin_stat_pos[cur_step_begin + i]; + origin_stat_neg[sum_step_begin + i] -= origin_stat_neg[cur_step_begin + i]; + } + + std::memset( + origin_stat_pos + cur_step_begin, 0, bucket_length * sizeof(int64_t)); + std::memset( + origin_stat_neg + cur_step_begin, 0, bucket_length * sizeof(int64_t)); + + for (size_t i = 0; i < batch_size; i++) { + // if predict_data[i] has dim of 2, then predict_data[i][1] is pos prob + // if predict_data[i] has dim of 1, then predict_data[i][0] is pos prob + auto predict_data = + inference_data[i * inference_width + (inference_width - 1)]; + PADDLE_ENFORCE_LE(predict_data, + 1, + phi::errors::PreconditionNotMet( + "The predict data must less or equal 1.")); + PADDLE_ENFORCE_GE(predict_data, + 0, + phi::errors::PreconditionNotMet( + "The predict data must gather or equal 0.")); + + uint32_t binIdx = static_cast(predict_data * num_thresholds); + if (label_data[i] > 0) { + origin_stat_pos[cur_step_begin + binIdx] += 1; + } else if (label_data[i] == 0) { + origin_stat_neg[cur_step_begin + binIdx] += 1; + } + } + for (int i = 0; i < bucket_length; ++i) { + 
origin_stat_pos[sum_step_begin + i] += origin_stat_pos[cur_step_begin + i]; + origin_stat_neg[sum_step_begin + i] += origin_stat_neg[cur_step_begin + i]; + } +} + +inline static void calcAuc(const int64_t *stat_pos, + const int64_t *stat_neg, + int num_thresholds, + double *auc) { + *auc = 0.0f; + + double totPos = 0.0; + double totNeg = 0.0; + double totPosPrev = 0.0; + double totNegPrev = 0.0; + + int idx = num_thresholds; + + while (idx >= 0) { + totPosPrev = totPos; + totNegPrev = totNeg; + totPos += stat_pos[idx]; + totNeg += stat_neg[idx]; + *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev); + --idx; + } + + if (totPos > 0.0 && totNeg > 0.0) { + *auc = *auc / totPos / totNeg; + } +} + +template +void AucKernel(const Context &dev_ctx, + const DenseTensor &input, + const DenseTensor &label, + const DenseTensor &stat_pos, + const DenseTensor &stat_neg, + const std::string &curve, + int num_thresholds, + int slide_steps, + DenseTensor *auc, + DenseTensor *stat_pos_out, + DenseTensor *stat_neg_out) { + // Only use output var for now, make sure it's persistable and + // not cleaned up for each batch. + auto *origin_stat_pos = dev_ctx.template Alloc(stat_pos_out); + auto *origin_stat_neg = dev_ctx.template Alloc(stat_neg_out); + auto *auc_value = dev_ctx.template Alloc(auc); + + // Just for pass UT, since UT's input & output connot be set same var + auto *stat_pos_in_tensor = &stat_pos; + auto *stat_neg_in_tensor = &stat_neg; + auto *pos_in_data = stat_pos.data(); + auto *neg_in_data = stat_neg.data(); + if (stat_pos_in_tensor != stat_pos_out) { + memcpy( + origin_stat_pos, + pos_in_data, + ((1 + slide_steps) * (num_thresholds + 1) + (slide_steps > 0 ? 1 : 0)) * + sizeof(int64_t)); + } + if (stat_neg_in_tensor != stat_neg_out) { + memcpy( + origin_stat_neg, + neg_in_data, + ((1 + slide_steps) * (num_thresholds + 1) + (slide_steps > 0 ? 1 : 0)) * + sizeof(int64_t)); + } + statAuc(label, + input, + num_thresholds, + slide_steps, + origin_stat_pos, + origin_stat_neg); + + int sum_offset = slide_steps * (num_thresholds + 1); + calcAuc(origin_stat_pos + sum_offset, + origin_stat_neg + sum_offset, + num_thresholds, + auc_value); + if (slide_steps) { + origin_stat_pos[(slide_steps + 1) * (num_thresholds + 1)] += 1; + origin_stat_neg[(slide_steps + 1) * (num_thresholds + 1)] += 1; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(auc, CPU, ALL_LAYOUT, phi::AucKernel, float) {} diff --git a/paddle/phi/kernels/cpu/cumsum_kernel.cc b/paddle/phi/kernels/cpu/cumsum_kernel.cc new file mode 100644 index 00000000000..d32e18479aa --- /dev/null +++ b/paddle/phi/kernels/cpu/cumsum_kernel.cc @@ -0,0 +1,143 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
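For readers following the AUC migration above, here is a minimal standalone sketch of the bucketed computation that calcAuc (CPU) and CalcAucKernel (GPU) perform: sweep the histogram buckets from the highest threshold down, accumulate the positive and negative counts, sum trapezoid areas under the ROC curve, and normalize by the product of total positives and negatives. When slide_steps > 0 the stat buffers also hold one bucket array per step plus a running-sum array and a trailing step counter; the sketch only covers the summed buckets that the final AUC reads. It is an editor's illustration, not part of the patch, and the function name and container types are the editor's own.

// Editor's illustration (not part of the patch): standalone CPU sketch of the
// bucketed AUC computation used by calcAuc / CalcAucKernel above. Assumes
// stat_pos[i] / stat_neg[i] hold the number of positive / negative samples
// whose predicted probability maps to bucket i (0 <= i <= num_thresholds).
#include <cstdint>
#include <vector>

double BucketedAuc(const std::vector<int64_t>& stat_pos,
                   const std::vector<int64_t>& stat_neg,
                   int num_thresholds) {
  double auc = 0.0;
  double tot_pos = 0.0, tot_neg = 0.0;
  double tot_pos_prev = 0.0, tot_neg_prev = 0.0;
  // Sweep thresholds from high to low; each step adds one trapezoid of the
  // ROC curve with width (FP - FP_prev) and mean height (TP + TP_prev) / 2.
  for (int idx = num_thresholds; idx >= 0; --idx) {
    tot_pos_prev = tot_pos;
    tot_neg_prev = tot_neg;
    tot_pos += stat_pos[idx];
    tot_neg += stat_neg[idx];
    auc += (tot_neg - tot_neg_prev) * (tot_pos + tot_pos_prev) / 2.0;
  }
  // Normalize the accumulated area from raw counts to the unit square.
  return (tot_pos > 0.0 && tot_neg > 0.0) ? auc / tot_pos / tot_neg : 0.0;
}

Note that the GPU kernel runs this same loop inside a single-thread launch (CalcAucKernel<<<1, 1, 0, stream>>>), which keeps the result on the device and avoids copying the stat buffers back to the host.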
+ +#include "paddle/phi/kernels/cumsum_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +struct CumsumFunctor { + template + const typename X::TensorScanSumOp operator()(X x, + int axis, + bool exclusive) const { + return x.cumsum(axis, exclusive); + } +}; + +template +void ComputeImp(Device d, + const Dim& dims, + X x, + Out out, + int axis, + bool reverse, + bool exclusive) { + if (!reverse) { + out.reshape(dims).device(d) = + CumsumFunctor()(x.reshape(dims), axis, exclusive); + } else { + std::array rev; + rev.fill(false); + rev[axis] = reverse; + out.reshape(dims).device(d) = + CumsumFunctor()(x.reshape(dims).reverse(rev), axis, exclusive) + .reverse(rev); + } +} + +template +void CumsumKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + bool flatten, + bool exclusive, + bool reverse, + DenseTensor* out) { + auto out_dims = out->dims(); + + PADDLE_ENFORCE_EQ( + axis < out_dims.size() && axis >= (0 - out_dims.size()), + true, + phi::errors::OutOfRange( + "Attr(axis) is out of range, It's expected " + "to be in range of [-%d, %d]. But received Attr(axis) = %d.", + out_dims.size(), + out_dims.size() - 1, + axis)); + if (axis < 0) { + axis += out_dims.size(); + } + + dev_ctx.template Alloc(out); + + int pre = 1; + int post = 1; + int mid = out_dims[axis]; + for (int i = 0; i < axis; ++i) { + pre *= out_dims[i]; + } + for (int i = axis + 1; i < out_dims.size(); ++i) { + post *= out_dims[i]; + } + + auto x0 = EigenVector::Flatten(x); + auto out0 = EigenVector::Flatten(*out); + auto& place = *dev_ctx.eigen_device(); + + using IndexT = Eigen::DenseIndex; + if (pre == 1) { + if (post == 1) { + ComputeImp(place, + Eigen::DSizes(mid), + x0, + out0, + /* axis= */ 0, + reverse, + exclusive); + } else { + ComputeImp(place, + Eigen::DSizes(mid, post), + x0, + out0, + /* axis= */ 0, + reverse, + exclusive); + } + } else { + if (post == 1) { + ComputeImp(place, + Eigen::DSizes(pre, mid), + x0, + out0, + /* axis= */ 1, + reverse, + exclusive); + } else { + ComputeImp(place, + Eigen::DSizes(pre, mid, post), + x0, + out0, + /* axis= */ 1, + reverse, + exclusive); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(cumsum, + CPU, + ALL_LAYOUT, + phi::CumsumKernel, + float, + double, + int16_t, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/log_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/log_loss_grad_kernel.cc new file mode 100644 index 00000000000..2e2d94df59e --- /dev/null +++ b/paddle/phi/kernels/cpu/log_loss_grad_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/log_loss_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/log_loss_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + log_loss_grad, CPU, ALL_LAYOUT, phi::LogLossGradKernel, float) {} diff --git a/paddle/phi/kernels/cpu/log_loss_kernel.cc b/paddle/phi/kernels/cpu/log_loss_kernel.cc new file mode 100644 index 00000000000..38e93486f7b --- /dev/null +++ b/paddle/phi/kernels/cpu/log_loss_kernel.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/log_loss_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/log_loss_kernel_impl.h" + +PD_REGISTER_KERNEL(log_loss, CPU, ALL_LAYOUT, phi::LogLossKernel, float) {} diff --git a/paddle/phi/kernels/cpu/sigmoid_cross_entropy_with_logits_grad_kernel.cc b/paddle/phi/kernels/cpu/sigmoid_cross_entropy_with_logits_grad_kernel.cc new file mode 100644 index 00000000000..468db18aa21 --- /dev/null +++ b/paddle/phi/kernels/cpu/sigmoid_cross_entropy_with_logits_grad_kernel.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/sigmoid_cross_entropy_with_logits_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void SigmoidCrossEntropyWithLogitsGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& label, + const DenseTensor& out_grad, + bool normalize, + int ignore_index, + DenseTensor* in_grad) { + auto dx_data = dev_ctx.template Alloc(in_grad); + + int limit = in_grad->numel(); + auto x_data = x.data(); + auto label_data = label.data(); + auto dout_data = out_grad.data(); + for (int idx = 0; idx < limit; ++idx) { + T x = x_data[idx]; + T label = label_data[idx]; + T dout = dout_data[idx]; + if (static_cast(label) == ignore_index) { + dx_data[idx] = static_cast(0.); + } else { + T simoid_x = static_cast(1) / (static_cast(1) + std::exp(-x)); + T diff = simoid_x - label; + dx_data[idx] = dout * diff; + } + } + if (normalize) { + int norm = 0; + T eps = static_cast(1e-6); + for (int idx = 0; idx < limit; ++idx) { + T diff = label_data[idx] - static_cast(ignore_index); + if ((diff < -eps) || (diff > eps)) { + norm += 1; + } + } + eps = static_cast(1e-5); + norm = norm > eps ? norm : eps; + std::for_each(dx_data, dx_data + limit, [norm](T& v) { v = v / norm; }); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(sigmoid_cross_entropy_with_logits_grad, + CPU, + ALL_LAYOUT, + phi::SigmoidCrossEntropyWithLogitsGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/sigmoid_cross_entropy_with_logits_kernel.cc b/paddle/phi/kernels/cpu/sigmoid_cross_entropy_with_logits_kernel.cc new file mode 100644 index 00000000000..366d300320b --- /dev/null +++ b/paddle/phi/kernels/cpu/sigmoid_cross_entropy_with_logits_kernel.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/sigmoid_cross_entropy_with_logits_kernel.h" + +#include +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void SigmoidCrossEntropyWithLogitsKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& label, + bool normalize, + int ignore_index, + DenseTensor* out) { + auto out_data = dev_ctx.template Alloc(out); + int limit = out->numel(); + auto x_data = x.data(); + auto label_data = label.data(); + for (int idx = 0; idx < limit; ++idx) { + T x = x_data[idx]; + T label = label_data[idx]; + if (static_cast(label) == ignore_index) { + out_data[idx] = static_cast(0.); + } else { + T term1 = (x > 0) ? 
x : 0; + T term2 = x * label; + T term3 = std::log(static_cast(1) + std::exp(-std::abs(x))); + out_data[idx] = term1 - term2 + term3; + } + } + + if (normalize) { + int norm = 0; + T eps = static_cast(1e-6); + for (int idx = 0; idx < limit; ++idx) { + T diff = label_data[idx] - static_cast(ignore_index); + if ((diff < -eps) || (diff > eps)) { + norm += 1; + } + } + eps = static_cast(1e-5); + norm = norm > eps ? norm : eps; + std::for_each(out_data, out_data + limit, [norm](T& v) { v = v / norm; }); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(sigmoid_cross_entropy_with_logits, + CPU, + ALL_LAYOUT, + phi::SigmoidCrossEntropyWithLogitsKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cumsum_kernel.h b/paddle/phi/kernels/cumsum_kernel.h new file mode 100644 index 00000000000..fd90c7b8f5e --- /dev/null +++ b/paddle/phi/kernels/cumsum_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void CumsumKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + bool flatten, + bool exclusive, + bool reverse, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/auc_kernel.cu b/paddle/phi/kernels/gpu/auc_kernel.cu new file mode 100644 index 00000000000..5a1bb9874fe --- /dev/null +++ b/paddle/phi/kernels/gpu/auc_kernel.cu @@ -0,0 +1,258 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
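The formula implemented by these kernels, and spelled out in comments in the removed operator files, max(x, 0) - x * label + log(1 + exp(-|x|)), is the numerically stable rewrite of the binary cross entropy on logits, -label*log(sigmoid(x)) - (1 - label)*log(1 - sigmoid(x)), and its gradient collapses to sigmoid(x) - label, which is exactly what the grad kernel computes. A scalar reference sketch (editor's illustration, not part of the patch; ignore_index and normalize are omitted, and std::log1p is the editor's choice for the log(1 + .) term):

// Editor's illustration (not part of the patch): scalar reference for the
// stable sigmoid cross-entropy-with-logits loss and its gradient, matching
// the elementwise formulas in the kernels above (ignore_index and the
// normalize option are omitted for brevity).
#include <algorithm>
#include <cmath>

float SigmoidCeWithLogits(float x, float label) {
  // max(x, 0) - x * label + log(1 + exp(-|x|)) avoids overflow in exp(x)
  // for large |x| while staying equal to -label*log(p) - (1-label)*log(1-p).
  return std::max(x, 0.0f) - x * label + std::log1p(std::exp(-std::abs(x)));
}

float SigmoidCeWithLogitsGrad(float x, float label, float dout) {
  // d(loss)/dx = sigmoid(x) - label, scaled by the upstream gradient.
  float sigmoid_x = 1.0f / (1.0f + std::exp(-x));
  return dout * (sigmoid_x - label);
}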
+ +#include "paddle/phi/kernels/auc_kernel.h" + +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +__global__ void ClearObsoleteDataKernel(int64_t *pos, + int64_t *neg, + const int bucket_length, + const int slide_steps) { + int cur_step_index = + static_cast(pos[(slide_steps + 1) * bucket_length]) % slide_steps; + int cur_step_begin = cur_step_index * bucket_length; + int sum_step_begin = slide_steps * bucket_length; + CUDA_KERNEL_LOOP(i, bucket_length) { + pos[sum_step_begin + i] -= pos[cur_step_begin + i]; + neg[sum_step_begin + i] -= neg[cur_step_begin + i]; + pos[cur_step_begin + i] = neg[cur_step_begin + i] = 0; + } +} + +__global__ void UpdateSumDataKernel(int64_t *pos, + int64_t *neg, + const int bucket_length, + const int slide_steps) { + int cur_step_index = + static_cast(pos[(slide_steps + 1) * bucket_length]) % slide_steps; + int cur_step_begin = cur_step_index * bucket_length; + int sum_step_begin = slide_steps * bucket_length; + CUDA_KERNEL_LOOP(i, bucket_length) { + pos[sum_step_begin + i] += pos[cur_step_begin + i]; + neg[sum_step_begin + i] += neg[cur_step_begin + i]; + } +} + +template +__global__ void AddDataKernel(const int64_t *label_data, + const T *pred_data, + const int inference_width, + const int num_thresholds, + int64_t *pos, + int64_t *neg, + const int numel, + const int slide_steps) { + int cur_step_begin = 0; + if (slide_steps > 0) { + int cur_step_index = + static_cast(pos[(slide_steps + 1) * (1 + num_thresholds)]) % + slide_steps; + cur_step_begin = cur_step_index * (1 + num_thresholds); + } + CUDA_KERNEL_LOOP(i, numel) { + auto predict_data = pred_data[i * inference_width + (inference_width - 1)]; + PADDLE_ENFORCE(predict_data <= 1, "The predict data must less or equal 1."); + PADDLE_ENFORCE(predict_data >= 0, + "The predict data must gather or equal 0."); + uint32_t binIdx = static_cast(predict_data * num_thresholds); + if (label_data[i]) { + paddle::platform::CudaAtomicAdd(pos + cur_step_begin + binIdx, 1); + } else { + paddle::platform::CudaAtomicAdd(neg + cur_step_begin + binIdx, 1); + } + } +} + +__global__ void CalcAucKernel(int64_t *stat_pos, + int64_t *stat_neg, + int num_thresholds, + double *auc, + bool need_add_batch_num) { + *auc = 0.0f; + double totPos = 0.0; + double totNeg = 0.0; + double totPosPrev = 0.0; + double totNegPrev = 0.0; + + int idx = num_thresholds; + + while (idx >= 0) { + totPosPrev = totPos; + totNegPrev = totNeg; + totPos += stat_pos[idx]; + totNeg += stat_neg[idx]; + *auc += (totNeg - totNegPrev) * (totPos + totPosPrev) / 2.0; + --idx; + } + + if (totPos > 0.0 && totNeg > 0.0) { + *auc = *auc / totPos / totNeg; + } + if (need_add_batch_num) { + stat_pos[num_thresholds + 1] += 1; + stat_neg[num_thresholds + 1] += 1; + } +} + +inline static double trapezoidArea(double X1, double X2, double Y1, double Y2) { + return (X1 > X2 ? 
(X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0; +} + +template +void statAuc(const Context &dev_ctx, + const DenseTensor &label, + const DenseTensor &predict, + const int num_thresholds, + const int slide_steps, + int64_t *origin_stat_pos, + int64_t *origin_stat_neg) { + size_t batch_size = predict.dims()[0]; + size_t inference_width = predict.dims()[1]; + const T *inference_data = predict.data(); + const auto *label_data = label.data(); + const int bucket_length = num_thresholds + 1; + + if (slide_steps == 0) { + AddDataKernel<<<(batch_size + PADDLE_CUDA_NUM_THREADS - 1) / + PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + dev_ctx.stream()>>>(label_data, + inference_data, + inference_width, + num_thresholds, + origin_stat_pos, + origin_stat_neg, + batch_size, + slide_steps); + return; + } + // the last number of origin_stat_pos store the index should be used in + // current step + int cur_step_index = + static_cast(origin_stat_pos[(slide_steps + 1) * bucket_length]) % + slide_steps; + int cur_step_begin = cur_step_index * bucket_length; + int sum_step_begin = slide_steps * bucket_length; + + ClearObsoleteDataKernel<<<(bucket_length + PADDLE_CUDA_NUM_THREADS - 1) / + PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + dev_ctx.stream()>>>( + origin_stat_pos, origin_stat_neg, bucket_length, slide_steps); + + AddDataKernel<<<(batch_size + PADDLE_CUDA_NUM_THREADS - 1) / + PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + dev_ctx.stream()>>>(label_data, + inference_data, + inference_width, + num_thresholds, + origin_stat_pos, + origin_stat_neg, + batch_size, + slide_steps); + UpdateSumDataKernel<<<(bucket_length + PADDLE_CUDA_NUM_THREADS - 1) / + PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + dev_ctx.stream()>>>( + origin_stat_pos, origin_stat_neg, bucket_length, slide_steps); +} + +template +void AucKernel(const Context &dev_ctx, + const DenseTensor &input, + const DenseTensor &label, + const DenseTensor &stat_pos, + const DenseTensor &stat_neg, + const std::string &curve, + int num_thresholds, + int slide_steps, + DenseTensor *auc, + DenseTensor *stat_pos_out, + DenseTensor *stat_neg_out) { + // Only use output var for now, make sure it's persistable and + // not cleaned up for each batch. + auto *origin_stat_pos = dev_ctx.template Alloc(stat_pos_out); + auto *origin_stat_neg = dev_ctx.template Alloc(stat_neg_out); + auto *auc_value = dev_ctx.template Alloc(auc); + + auto *stat_pos_in_tensor = &stat_pos; + auto *stat_neg_in_tensor = &stat_neg; + auto *pos_in_data = stat_pos.data(); + auto *neg_in_data = stat_neg.data(); +#ifdef PADDLE_WITH_CUDA + if (stat_pos_in_tensor != stat_pos_out) { + cudaMemcpy( + origin_stat_pos, + pos_in_data, + ((1 + slide_steps) * (num_thresholds + 1) + (slide_steps > 0 ? 1 : 0)) * + sizeof(int64_t), + cudaMemcpyDeviceToDevice); + } + if (stat_neg_in_tensor != stat_neg_out) { + cudaMemcpy( + origin_stat_neg, + neg_in_data, + ((1 + slide_steps) * (num_thresholds + 1) + (slide_steps > 0 ? 1 : 0)) * + sizeof(int64_t), + cudaMemcpyDeviceToDevice); + } +#else + if (stat_pos_in_tensor != stat_pos_out) { + hipMemcpy( + origin_stat_pos, + pos_in_data, + ((1 + slide_steps) * (num_thresholds + 1) + (slide_steps > 0 ? 1 : 0)) * + sizeof(int64_t), + hipMemcpyDeviceToDevice); + } + if (stat_neg_in_tensor != stat_neg_out) { + hipMemcpy( + origin_stat_neg, + neg_in_data, + ((1 + slide_steps) * (num_thresholds + 1) + (slide_steps > 0 ? 
1 : 0)) * + sizeof(int64_t), + hipMemcpyDeviceToDevice); + } +#endif + + statAuc(dev_ctx, + label, + input, + num_thresholds, + slide_steps, + origin_stat_pos, + origin_stat_neg); + int sum_offset = slide_steps * (num_thresholds + 1); + CalcAucKernel<<<1, 1, 0, dev_ctx.stream()>>>(origin_stat_pos + sum_offset, + origin_stat_neg + sum_offset, + num_thresholds, + auc_value, + slide_steps > 0); +} + +} // namespace phi + +PD_REGISTER_KERNEL(auc, GPU, ALL_LAYOUT, phi::AucKernel, float) {} diff --git a/paddle/phi/kernels/gpu/cumsum_kernel.cu b/paddle/phi/kernels/gpu/cumsum_kernel.cu new file mode 100644 index 00000000000..a253e6f4ad2 --- /dev/null +++ b/paddle/phi/kernels/gpu/cumsum_kernel.cu @@ -0,0 +1,336 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/cumsum_kernel.h" + +#include +#include +#include +#include +#ifdef __NVCC__ +#include +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +__device__ void BlockReverse( + const T* idata, T* odata, int src_base, int dst_base, int valid_item) { + __shared__ T sh_mem[BLOCK_SIZE]; + int tx = threadIdx.x; + + int offset = tx; + int in_index = src_base + offset; + if (offset >= valid_item) { + sh_mem[offset] = 0; + } else { + int sh_mem_index = BLOCK_SIZE - offset - 1; + T data = idata[in_index]; + sh_mem[sh_mem_index] = data; + } + + __syncthreads(); + int out_index = dst_base - offset; + if (offset < valid_item) { + int sh_mem_index = BLOCK_SIZE - offset - 1; + odata[out_index] = sh_mem[sh_mem_index]; + } +} + +template +__global__ void MatrixRowReverse(const T* matrix_data, + T* reverse_data, + int reverse_size, + int outer_size, + int inner_size) { + int bx = blockIdx.x; + int by = blockIdx.y; + int item_per_block = 1024; + + for (int block_offset = 0; block_offset < reverse_size; + block_offset += item_per_block) { + int valid_item = (reverse_size - block_offset > item_per_block) + ? item_per_block + : reverse_size - block_offset; + int src_offset = + bx * reverse_size + block_offset + by * (inner_size * reverse_size); + int dst_offset = bx * reverse_size + by * (inner_size * reverse_size) + + reverse_size - 1 - block_offset; + if (reverse_size < item_per_block) { + valid_item = reverse_size; + } + + BlockReverse( + matrix_data, reverse_data, src_offset, dst_offset, valid_item); + } +} + +template +struct BlockPrefixCallbackOp { + // Running prefix + T running_total; + // Constructor + __device__ BlockPrefixCallbackOp(T running_total) + : running_total(running_total) {} + // Callback operator to be entered by the first warp of threads in the block. + // Thread-0 is responsible for returning a value for seeding the block-wide + // scan. 
+ __device__ T operator()(T block_aggregate) { + T old_prefix = running_total; + running_total = old_prefix + block_aggregate; + return old_prefix; + } +}; + +// No bank-conflict transpose +template +__global__ void MatrixTranspose(T* odata, + const T* idata, + size_t height, + size_t width) { + __shared__ T tile[TILE_DIM][TILE_DIM + 1]; + + int x = blockIdx.x * TILE_DIM + threadIdx.x; + int y = blockIdx.y * TILE_DIM + threadIdx.y; + for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { + if (x < width && (y + j) < height) { + tile[threadIdx.y + j][threadIdx.x] = idata[(y + j) * width + x]; + } else { + tile[threadIdx.y + j][threadIdx.x] = 0; + } + } + + __syncthreads(); + + x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset + y = blockIdx.x * TILE_DIM + threadIdx.y; + + for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { + if (x < height && (y + j) < width) { + odata[(y + j) * height + x] = tile[threadIdx.x][threadIdx.y + j]; + } + } +} + +template +__global__ void BlockScanKernel(T* d_out, + const T* d_in, + int inner_size, + int outer_size, + int scan_size, + bool exclusive) { + // Specialize BlockLoad, BlockStore, and BlockRadixSort collective types + typedef cub:: + BlockLoad + BlockLoadT; + typedef cub:: + BlockStore + BlockStoreT; + typedef cub::BlockScan BlockScanT; + // Allocate type-safe, repurposable shared memory for collectives + __shared__ union { + typename BlockLoadT::TempStorage load; + typename BlockStoreT::TempStorage store; + typename BlockScanT::TempStorage scan; + } temp_storage; + + int bx = blockIdx.x; + int by = blockIdx.y; + + BlockPrefixCallbackOp prefix_op(0); + T block_aggregate = static_cast(0); + + // Obtain this block's segment of consecutive keys (blocked across threads) + int item_per_block = BLOCK_THREADS * ITEMS_PER_THREAD; + for (int block_offset = 0; block_offset < scan_size; + block_offset += BLOCK_THREADS * ITEMS_PER_THREAD) { + int valid_item = (scan_size - block_offset > item_per_block) + ? item_per_block + : (scan_size - block_offset); + if (scan_size < item_per_block) { + valid_item = scan_size; + } + + int offset = bx * scan_size + block_offset + by * (inner_size * scan_size); + + T thread_keys[ITEMS_PER_THREAD]; + BlockLoadT(temp_storage.load) + .Load(d_in + offset, thread_keys, valid_item, 0); + + __syncthreads(); + if (exclusive) { + T init_value = static_cast(0); + BlockScanT(temp_storage.scan) + .ExclusiveScan(thread_keys, thread_keys, cub::Sum(), prefix_op); + } else { + BlockScanT(temp_storage.scan) + .InclusiveScan(thread_keys, thread_keys, cub::Sum(), prefix_op); + } + __syncthreads(); + + BlockStoreT(temp_storage.store) + .Store(d_out + offset, thread_keys, valid_item); + } +} + +template +void CumsumKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + bool flatten, + bool exclusive, + bool reverse, + DenseTensor* out) { + auto out_dims = out->dims(); + auto size = x.numel(); + + PADDLE_ENFORCE_EQ( + axis < out_dims.size() && axis >= (0 - out_dims.size()), + true, + phi::errors::OutOfRange( + "Attr(axis) is out of range, It's expected " + "to be in range of [-%d, %d]. But received Attr(axis) = %d.", + out_dims.size(), + out_dims.size() - 1, + axis)); + if (axis < 0) { + axis += out_dims.size(); + } + + T* out_data = dev_ctx.template Alloc(out); + const T* in_data = x.data(); + + // Use thrust for parallel acceleration when the input size is equal to the + // length of the ‘axis’ dimension. 
+ if (size == out_dims[axis]) { + if (reverse) { + thrust::device_ptr dev_ptr = + thrust::device_pointer_cast(in_data); + thrust::device_vector vec(dev_ptr, dev_ptr + size); + if (exclusive) { + thrust::exclusive_scan( + thrust::device, vec.rbegin(), vec.rend(), out_data); + } else { + thrust::inclusive_scan( + thrust::device, vec.rbegin(), vec.rend(), out_data); + } + thrust::reverse(thrust::device, out_data, out_data + size); + } else { + if (exclusive) { + thrust::exclusive_scan( + thrust::device, in_data, in_data + size, out_data); + } else { + thrust::inclusive_scan( + thrust::device, in_data, in_data + size, out_data); + } + } + return; + } + + size_t height = 1; + size_t width = 1; + for (size_t i = 0; i <= axis; i++) { + height *= out_dims[i]; + } + + for (size_t i = axis + 1; i < out_dims.size(); i++) { + width *= out_dims[i]; + } + int scan_size = out_dims[axis]; + bool transpose = (axis != out_dims.size() - 1); + + int tile_size = 32; + dim3 blocks(32, 8); + dim3 transpose_grids((width + tile_size - 1) / tile_size, + (height + tile_size - 1) / tile_size); + out->Resize(out_dims); + auto* tmp_data = out->data(); + + T* next_in_data = out_data; + T* next_out_data = tmp_data; + if (transpose) { + MatrixTranspose<<>>( + out_data, in_data, height, width); + next_in_data = out_data; + next_out_data = tmp_data; + } + auto swap_ptr = [](T*& ptr1, T*& ptr2) { + T* tmp = ptr2; + ptr2 = ptr1; + ptr1 = tmp; + }; + int outer_size = height / scan_size; + int inner_size = width; + // Consider the size of shared memory, here block size is 128 + dim3 scan_grid(outer_size, inner_size); + dim3 reverse_grid = scan_grid; + if (reverse) { + if (transpose) { + reverse_grid.x = scan_grid.y; + reverse_grid.y = scan_grid.x; + MatrixRowReverse<<>>( + next_in_data, next_out_data, scan_size, outer_size, inner_size); + if (!transpose) next_in_data = tmp_data; + swap_ptr(next_in_data, next_out_data); + } else { + MatrixRowReverse<<>>( + in_data, out_data, scan_size, outer_size, inner_size); + } + } + if (!transpose && !reverse) { + BlockScanKernel<<>>( + out_data, in_data, outer_size, inner_size, scan_size, exclusive); + + } else { + BlockScanKernel<<>>( + next_out_data, + next_in_data, + outer_size, + inner_size, + scan_size, + exclusive); + } + swap_ptr(next_in_data, next_out_data); + if (reverse) { + MatrixRowReverse<<>>( + next_in_data, next_out_data, scan_size, outer_size, inner_size); + swap_ptr(next_in_data, next_out_data); + } + if (transpose) { + transpose_grids.x = (height + tile_size - 1) / tile_size; + transpose_grids.y = (width + tile_size - 1) / tile_size; + MatrixTranspose<<>>( + next_out_data, next_in_data, width, height); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(cumsum, + GPU, + ALL_LAYOUT, + phi::CumsumKernel, + float, + double, + int16_t, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/log_loss_grad_kernel.cu b/paddle/phi/kernels/gpu/log_loss_grad_kernel.cu new file mode 100644 index 00000000000..3bb256ad032 --- /dev/null +++ b/paddle/phi/kernels/gpu/log_loss_grad_kernel.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
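The GPU cumsum kernel above takes two paths: a thrust scan when the whole tensor is a single run along the scan axis, and a transpose plus tiled CUB BlockScan otherwise. Both reduce to the same inclusive or exclusive prefix-sum semantics, optionally applied back to front when reverse is set. A small host-side reference using the C++17 <numeric> algorithms (an editor's sketch, not part of the patch):

// Editor's illustration (not part of the patch): host-side reference for the
// scan semantics the GPU kernel realizes with thrust/CUB (C++17 <numeric>).
#include <numeric>
#include <vector>

int main() {
  std::vector<int> x = {1, 2, 3, 4};
  std::vector<int> inc(4), exc(4);
  // Inclusive: each output includes the current element -> 1 3 6 10
  std::inclusive_scan(x.begin(), x.end(), inc.begin());
  // Exclusive: each output sums only the preceding elements -> 0 1 3 6
  std::exclusive_scan(x.begin(), x.end(), exc.begin(), 0);
  // The `reverse` flag corresponds to scanning the axis back to front,
  // e.g. reversing the input, scanning, and reversing the result again.
  return 0;
}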
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/log_loss_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/log_loss_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + log_loss_grad, GPU, ALL_LAYOUT, phi::LogLossGradKernel, float) {} diff --git a/paddle/phi/kernels/gpu/log_loss_kernel.cu b/paddle/phi/kernels/gpu/log_loss_kernel.cu new file mode 100644 index 00000000000..0934520ea4a --- /dev/null +++ b/paddle/phi/kernels/gpu/log_loss_kernel.cu @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/log_loss_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/log_loss_kernel_impl.h" + +PD_REGISTER_KERNEL(log_loss, GPU, ALL_LAYOUT, phi::LogLossKernel, float) {} diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h new file mode 100644 index 00000000000..6f9cda83a9a --- /dev/null +++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h @@ -0,0 +1,69 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
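For reference, the two log_loss kernels registered above evaluate the epsilon-stabilized binary log loss and its gradient through Eigen expressions defined later in this patch. A scalar restatement is given below; the helper names are hypothetical and the placement of epsilon inside both logarithms is assumed to follow the conventional definition of this loss.

    // Scalar reference for the epsilon-stabilized binary log loss and its
    // gradient; illustrative only, not part of the patch.
    #include <cmath>

    float LogLossRef(float pred, float label, float eps) {
      return -label * std::log(pred + eps) -
             (1.0f - label) * std::log(1.0f - pred + eps);
    }

    float LogLossGradRef(float pred, float label, float eps, float dloss) {
      // d(loss)/d(pred), scaled by the upstream gradient dloss.
      return dloss * (-label / (pred + eps) +
                      (1.0f - label) / (1.0f - pred + eps));
    }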
+ +#pragma once + +#include +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/operators/math.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_helper.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/gpu/reduce.h" + +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +namespace phi { + +#ifdef __HIPCC__ +static constexpr int kNumCUDAThreads = 256; +#else +static constexpr int kNumCUDAThreads = 512; +#endif +static constexpr int kNumMaxinumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +struct NonzeroFunctor { + HOSTDEVICE explicit inline NonzeroFunctor() {} + HOSTDEVICE inline T operator()(const T x) const { + return static_cast(static_cast(x) != 0); + } +}; + +template +struct DivFunctor { + const T norm_; + HOSTDEVICE inline DivFunctor(const T norm) : norm_(norm) {} + + HOSTDEVICE inline T operator()(T loss) { + loss /= norm_; + return loss; + } +}; + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu new file mode 100644 index 00000000000..ae3cefd9e82 --- /dev/null +++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu @@ -0,0 +1,126 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
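The shared header above sizes one-dimensional launches with the usual capped ceiling division (NumBlocks). A minimal sketch of how such a helper is typically paired with a grid-stride loop follows; the constants mirror the header, but the kernel and its launch line are illustrative assumptions rather than code from this patch.

    // Capped 1-D launch configuration plus a grid-stride loop; sketch only.
    #include <algorithm>

    constexpr int kThreads = 512;
    constexpr int kMaxBlocks = 4096;

    inline int NumBlocksFor(int n) {
      return std::min((n + kThreads - 1) / kThreads, kMaxBlocks);
    }

    __global__ void ScaleKernel(const float* x, float* y, float a, int n) {
      // The grid may be capped at kMaxBlocks, so stride by the whole grid
      // to make sure all n elements are still covered.
      for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
           i += blockDim.x * gridDim.x) {
        y[i] = a * x[i];
      }
    }

    // Launch: ScaleKernel<<<NumBlocksFor(n), kThreads, 0, stream>>>(x, y, a, n);

In the kernels of this patch the element-wise work itself is dispatched through phi::funcs::ElementwiseKernel, so the sketch is only meant to illustrate the sizing pattern, not the actual launch path.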
+ +#include "paddle/phi/kernels/sigmoid_cross_entropy_with_logits_grad_kernel.h" + +#include "paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h" + +namespace phi { + +template +struct SigmoidBwdFunctor { + T ignore_index_; + T eps = static_cast(1e-5); + + HOSTDEVICE inline SigmoidBwdFunctor(const T ignore_index) + : ignore_index_(ignore_index) {} + + HOSTDEVICE inline phi::Array operator()(const T x, + const T label, + const T dout) { + T counts; + T dx_data; + + T diff = label - static_cast(ignore_index_); + if ((diff > -eps) && (diff < eps)) { + dx_data = static_cast(0.); + counts = 0; + } else { + T simoid_x = static_cast(1) / + (static_cast(1) + paddle::operators::real_exp(-x)); + T diff = simoid_x - label; + dx_data = dout * diff; + counts = 1; + } + phi::Array outs; + + outs[0] = dx_data; + outs[1] = counts; + return outs; + } +}; + +template +void SigmoidCrossEntropyWithLogitsGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &label, + const DenseTensor &out_grad, + bool normalize, + int ignore_index, + DenseTensor *in_grad) { + auto dx_data = dev_ctx.template Alloc(in_grad); + + // Temporary memory + DenseTensor *counts_tensor = new DenseTensor(); + + int64_t out_dims = label.numel() * sizeof(T); + counts_tensor->Resize({out_dims}); + dev_ctx.template Alloc(counts_tensor); + counts_tensor->Resize(in_grad->dims()); + + int limit = in_grad->numel(); + int blocks = NumBlocks(limit); + int threads = kNumCUDAThreads; + std::vector ins = {&x, &label, &out_grad}; + std::vector outs = {in_grad, counts_tensor}; + auto functor = SigmoidBwdFunctor(ignore_index); + constexpr int Size = 2; + phi::funcs::ElementwiseKernel( + dev_ctx, ins, &outs, functor); + if (normalize) { + T *counts = dev_ctx.template Alloc(counts_tensor); + DenseTensor *norm_tensor = new DenseTensor(); + norm_tensor->Resize({sizeof(T)}); + dev_ctx.template Alloc(norm_tensor); + auto dims = phi::vectorize(counts_tensor->dims()); + std::vector reduce_dim = {}; + for (int i = 0; i < dims.size(); i++) { + reduce_dim.push_back(i); + } + + kernels::TensorReduceImpl>( + dev_ctx, + *counts_tensor, + norm_tensor, + NonzeroFunctor(), + reduce_dim, + dev_ctx.stream()); + T *norm = dev_ctx.template Alloc(norm_tensor); + auto norm_cpu_mem = paddle::memory::Alloc(phi::CPUPlace(), sizeof(T)); + T *norm_cpu_ptr = reinterpret_cast(norm_cpu_mem->ptr()); + paddle::memory::Copy(phi::CPUPlace(), + norm_cpu_ptr, + dev_ctx.GetPlace(), + norm, + sizeof(T), + dev_ctx.stream()); + auto eps = static_cast(1e-5); + *norm_cpu_ptr = *norm_cpu_ptr > eps ? *norm_cpu_ptr : eps; + + std::vector div_ins = {in_grad}; + std::vector div_outs = {in_grad}; + auto div_functor = DivFunctor(*norm_cpu_ptr); + phi::funcs::ElementwiseKernel(dev_ctx, div_ins, &div_outs, div_functor); + delete norm_tensor; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(sigmoid_cross_entropy_with_logits_grad, + GPU, + ALL_LAYOUT, + phi::SigmoidCrossEntropyWithLogitsGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu new file mode 100644 index 00000000000..fb63badf56a --- /dev/null +++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu @@ -0,0 +1,128 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/sigmoid_cross_entropy_with_logits_kernel.h" + +#include "paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h" + +namespace phi { + +template +struct SigmoidFwdFunctor { + T ignore_index_; + T eps = static_cast(1e-5); + + HOSTDEVICE inline SigmoidFwdFunctor(const T ignore_index) + : ignore_index_(ignore_index) {} + + HOSTDEVICE inline phi::Array operator()(const T x, const T label) { + T counts; + T out_data; + + T diff = label - static_cast(ignore_index_); + if ((diff > -eps) && (diff < eps)) { + out_data = static_cast(0.); + counts = 0; + } else { + T term1 = (x > 0) ? x : 0; + T term2 = x * label; + T term3 = paddle::operators::real_log( + static_cast(1) + + paddle::operators::real_exp(static_cast(-abs(x)))); + + out_data = term1 - term2 + term3; + counts = 1; + } + phi::Array outs; + + outs[0] = out_data; + outs[1] = counts; + return outs; + } +}; + +template +void SigmoidCrossEntropyWithLogitsKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &label, + bool normalize, + int ignore_index, + DenseTensor *out) { + auto out_data = dev_ctx.template Alloc(out); + + // Temporary memory + DenseTensor *counts_tensor = new DenseTensor(); + + int64_t out_dims = label.numel() * sizeof(T); + counts_tensor->Resize({out_dims}); + dev_ctx.template Alloc(counts_tensor); + counts_tensor->Resize(out->dims()); + + int limit = out->numel(); + int blocks = NumBlocks(limit); + int threads = kNumCUDAThreads; + std::vector ins = {&x, &label}; + std::vector outs = {out, counts_tensor}; + auto functor = SigmoidFwdFunctor(ignore_index); + constexpr int Size = 2; + phi::funcs::ElementwiseKernel( + dev_ctx, ins, &outs, functor); + if (normalize) { + T *counts = dev_ctx.template Alloc(counts_tensor); + DenseTensor *norm_tensor = new DenseTensor(); + norm_tensor->Resize({sizeof(T)}); + dev_ctx.template Alloc(norm_tensor); + auto dims = phi::vectorize(counts_tensor->dims()); + std::vector reduce_dim = {}; + for (int i = 0; i < dims.size(); i++) { + reduce_dim.push_back(i); + } + + kernels::TensorReduceImpl>( + dev_ctx, + *counts_tensor, + norm_tensor, + NonzeroFunctor(), + reduce_dim, + dev_ctx.stream()); + T *norm = dev_ctx.template Alloc(norm_tensor); + auto norm_cpu_mem = paddle::memory::Alloc(phi::CPUPlace(), sizeof(T)); + T *norm_cpu_ptr = reinterpret_cast(norm_cpu_mem->ptr()); + paddle::memory::Copy(phi::CPUPlace(), + norm_cpu_ptr, + dev_ctx.GetPlace(), + norm, + sizeof(T), + dev_ctx.stream()); + auto eps = static_cast(1e-5); + *norm_cpu_ptr = *norm_cpu_ptr > eps ? 
*norm_cpu_ptr : eps; + + std::vector div_ins = {out}; + std::vector div_outs = {out}; + auto div_functor = DivFunctor(*norm_cpu_ptr); + phi::funcs::ElementwiseKernel(dev_ctx, div_ins, &div_outs, div_functor); + + delete norm_tensor; + delete counts_tensor; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(sigmoid_cross_entropy_with_logits, + GPU, + ALL_LAYOUT, + phi::SigmoidCrossEntropyWithLogitsKernel, + float, + double) {} diff --git a/paddle/phi/kernels/impl/log_loss_grad_kernel_impl.h b/paddle/phi/kernels/impl/log_loss_grad_kernel_impl.h new file mode 100644 index 00000000000..6f84133d5f4 --- /dev/null +++ b/paddle/phi/kernels/impl/log_loss_grad_kernel_impl.h @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +template +void LogLossGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + const DenseTensor& out_grad, + float epsilon, + DenseTensor* in_grad) { + auto prediction = EigenVector::Flatten(input); + auto label_out = EigenVector::Flatten(label); + + auto dl = EigenVector::Flatten(out_grad); + auto& place = *dev_ctx.eigen_device(); + + if (in_grad) { + dev_ctx.template Alloc(in_grad); + auto dx = EigenVector::Flatten(*in_grad); + phi::funcs::EigenLogLossGrad, T>::Eval( + place, dx, dl, prediction, label_out, epsilon); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/log_loss_kernel_impl.h b/paddle/phi/kernels/impl/log_loss_kernel_impl.h new file mode 100644 index 00000000000..d49144c8354 --- /dev/null +++ b/paddle/phi/kernels/impl/log_loss_kernel_impl.h @@ -0,0 +1,40 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
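The forward and backward functors earlier in this patch use the numerically stable form of sigmoid cross entropy with logits, max(x, 0) - x * label + log(1 + exp(-|x|)), with labels equal to ignore_index contributing neither loss nor a normalization count. A scalar restatement, with hypothetical helper names rather than code from the patch:

    // Stable BCE-with-logits as evaluated element-wise by the functors above;
    // sketch only, not part of the patch.
    #include <cmath>

    float SigmoidCeRef(float x, float label, int ignore_index, int* count) {
      if (std::fabs(label - static_cast<float>(ignore_index)) < 1e-5f) {
        *count = 0;  // ignored element: contributes no loss and no count
        return 0.0f;
      }
      *count = 1;
      float term1 = x > 0.0f ? x : 0.0f;  // max(x, 0)
      float term3 = std::log(1.0f + std::exp(-std::fabs(x)));
      return term1 - x * label + term3;
    }

When normalize is set, the counts are summed on the device, clamped below by a small epsilon on the host, and the loss (or its gradient) is divided element-wise by that sum, which is what the TensorReduceImpl followed by DivFunctor sequence above implements.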
+ +#pragma once + +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +template +void LogLossKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + float epsilon, + DenseTensor* out) { + dev_ctx.template Alloc(out); + + auto prediction = EigenVector::Flatten(input); + auto label_out = EigenVector::Flatten(label); + + auto loss = EigenVector::Flatten(*out); + auto& place = *dev_ctx.eigen_device(); + + phi::funcs::EigenLogLoss, T>::Eval( + place, loss, prediction, label_out, epsilon); +} + +} // namespace phi diff --git a/paddle/phi/kernels/log_loss_grad_kernel.h b/paddle/phi/kernels/log_loss_grad_kernel.h new file mode 100644 index 00000000000..6853140b19b --- /dev/null +++ b/paddle/phi/kernels/log_loss_grad_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LogLossGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + const DenseTensor& out_grad, + float epsilon, + DenseTensor* in_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/log_loss_kernel.h b/paddle/phi/kernels/log_loss_kernel.h new file mode 100644 index 00000000000..cd16c0f2c7c --- /dev/null +++ b/paddle/phi/kernels/log_loss_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LogLossKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + float epsilon, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/sigmoid_cross_entropy_with_logits_grad_kernel.h b/paddle/phi/kernels/sigmoid_cross_entropy_with_logits_grad_kernel.h new file mode 100644 index 00000000000..6bc75b7670f --- /dev/null +++ b/paddle/phi/kernels/sigmoid_cross_entropy_with_logits_grad_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void SigmoidCrossEntropyWithLogitsGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& label, + const DenseTensor& out_grad, + bool normalize, + int ignore_index, + DenseTensor* in_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/sigmoid_cross_entropy_with_logits_kernel.h b/paddle/phi/kernels/sigmoid_cross_entropy_with_logits_kernel.h new file mode 100644 index 00000000000..7ea3e6589f7 --- /dev/null +++ b/paddle/phi/kernels/sigmoid_cross_entropy_with_logits_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void SigmoidCrossEntropyWithLogitsKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& label, + bool normalize, + int ignore_index, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/log_loss_sig.cc b/paddle/phi/ops/compat/log_loss_sig.cc new file mode 100644 index 00000000000..c4ae746e975 --- /dev/null +++ b/paddle/phi/ops/compat/log_loss_sig.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature LogLossGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("log_loss_grad", + {"Predicted", "Labels", GradVarName("Loss")}, + {"epsilon"}, + {GradVarName("Predicted")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(log_loss_grad, phi::LogLossGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/sigmoid_cross_entropy_with_logits_sig.cc b/paddle/phi/ops/compat/sigmoid_cross_entropy_with_logits_sig.cc new file mode 100644 index 00000000000..61ad9627a96 --- /dev/null +++ b/paddle/phi/ops/compat/sigmoid_cross_entropy_with_logits_sig.cc @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature SigmoidCrossEntropyWithLogitsKernelGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("sigmoid_cross_entropy_with_logits_grad", + {"X", "Label", GradVarName("Out")}, + {"normalize", "ignore_index"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN( + sigmoid_cross_entropy_with_logits_grad, + phi::SigmoidCrossEntropyWithLogitsKernelGradOpArgumentMapping); -- GitLab From 837406551260414ab18689251e3b2422a10faf69 Mon Sep 17 00:00:00 2001 From: furnace <34057289+windstamp@users.noreply.github.com> Date: Fri, 4 Mar 2022 15:47:46 +0800 Subject: [PATCH 055/261] [Phi] move gaussian_random, fix fp16 (#40122) [Phi] move gaussian_random, fix fp16 --- paddle/phi/kernels/gpu/gaussian_random_kernel.cu | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/paddle/phi/kernels/gpu/gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/gaussian_random_kernel.cu index d5acc60a360..da16800ad02 100644 --- a/paddle/phi/kernels/gpu/gaussian_random_kernel.cu +++ b/paddle/phi/kernels/gpu/gaussian_random_kernel.cu @@ -81,22 +81,25 @@ void GaussianRandomKernel(const Context& dev_ctx, int device_id = dev_ctx.GetPlace().GetDeviceId(); auto gen_cuda = paddle::framework::GetDefaultCUDAGenerator(device_id); - using MT = typename phi::kps::details::MPTypeTrait::Type; if (gen_cuda->GetIsInitPy() && seed_flag) { if (FLAGS_use_curand) { + using MT = typename phi::kps::details::MPTypeTrait::Type; funcs::normal_distribution dist; funcs::normal_transform trans(mean, std); funcs::distribution_and_transform(dev_ctx, tensor, dist, trans); } else { auto seed_offset = gen_cuda->IncrementOffset(1); int64_t gen_offset = size * seed_offset.second; - auto func = - GaussianGenerator(mean, std, seed_offset.first, gen_offset); - IndexKernel>(dev_ctx, tensor, func); + auto func = GaussianGenerator(static_cast(mean), + static_cast(std), + seed_offset.first, + gen_offset); + IndexKernel>(dev_ctx, tensor, func); } } else { - auto func = GaussianGenerator(mean, std, seed); - IndexKernel>(dev_ctx, tensor, func); + auto func = + GaussianGenerator(static_cast(mean), static_cast(std), seed); + IndexKernel>(dev_ctx, tensor, func); } } -- GitLab From a7e4cdaf658697b50cd2be3616e017b6e1c49cb0 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Fri, 4 Mar 2022 16:03:30 +0800 Subject: [PATCH 056/261] [ROCm] fix hip test to update LD_LIBRARY_PATH, test=develop (#40153) --- cmake/generic.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index da81575188f..ba59eae392c 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -651,6 +651,7 @@ function(hip_test TARGET_NAME) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) set_property(TEST ${TARGET_NAME} 
PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT "LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/python/paddle/libs:$LD_LIBRARY_PATH") endif() endfunction(hip_test) -- GitLab From 880dec0fef853f9aed034d7686d5a11fed9673d6 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Fri, 4 Mar 2022 16:55:08 +0800 Subject: [PATCH 057/261] Enable eager model test (#40154) * enable eager model; test=develop * set bs = 5; test=develop --- .../test_imperative_ocr_attention_model.py | 26 +++++++++++- .../test_imperative_reinforcement.py | 23 ++++++++++- .../unittests/test_imperative_se_resnext.py | 41 ++++++++++++++++++- ..._imperative_transformer_sorted_gradient.py | 36 ++++++++++++++-- 4 files changed, 119 insertions(+), 7 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py index 973c5598579..09868520b4c 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py @@ -22,6 +22,7 @@ from paddle.fluid import core from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear, BatchNorm, Embedding, GRUUnit from paddle.fluid.dygraph.base import to_variable from test_imperative_base import new_program_scope +from paddle.fluid.framework import _test_eager_guard class Config(object): @@ -371,7 +372,7 @@ class OCRAttention(fluid.dygraph.Layer): class TestDygraphOCRAttention(unittest.TestCase): - def test_while_op(self): + def test_ocr_test(self): seed = 90 epoch_num = 1 if core.is_compiled_with_cuda(): @@ -400,7 +401,7 @@ class TestDygraphOCRAttention(unittest.TestCase): i * Config.max_length, dtype='int64').reshape([1, Config.max_length]))) - with fluid.dygraph.guard(): + def run_dygraph(): fluid.set_flags({'FLAGS_sort_sum_gradient': True}) paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) @@ -452,6 +453,16 @@ class TestDygraphOCRAttention(unittest.TestCase): for param in ocr_attention.parameters(): dy_param_value[param.name] = param.numpy() + return dy_out, dy_param_init_value, dy_param_value + + with fluid.dygraph.guard(): + dy_out, dy_param_init_value, dy_param_value = run_dygraph() + + with fluid.dygraph.guard(): + with _test_eager_guard(): + eager_out, eager_param_init_value, eager_param_value = run_dygraph( + ) + with new_program_scope(): paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) @@ -537,6 +548,17 @@ class TestDygraphOCRAttention(unittest.TestCase): for key, value in six.iteritems(static_param_value): self.assertTrue(np.allclose(value, dy_param_value[key], rtol=1e-05)) + # check eager here + self.assertTrue(np.allclose(static_out, eager_out)) + + for key, value in six.iteritems(static_param_init_value): + self.assertTrue(np.array_equal(value, eager_param_init_value[key])) + + for key, value in six.iteritems(static_param_value): + self.assertTrue( + np.allclose( + value, eager_param_value[key], rtol=1e-05)) + if __name__ == '__main__': paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py b/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py index a89628c594d..08320d04d99 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py @@ -27,6 +27,7 @@ from paddle.fluid.dygraph.nn import Conv2D, 
Pool2D, Linear import paddle.fluid.dygraph.nn as nn from paddle.fluid.dygraph.base import to_variable from test_imperative_base import new_program_scope +from paddle.fluid.framework import _test_eager_guard class Policy(fluid.dygraph.Layer): @@ -63,7 +64,7 @@ class TestImperativeMnist(unittest.TestCase): mask_list = [[0, 1]] mask = np.array(mask_list).astype("float32") - with fluid.dygraph.guard(): + def run_dygraph(): paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) @@ -104,6 +105,16 @@ class TestImperativeMnist(unittest.TestCase): for param in policy.parameters(): dy_param_value[param.name] = param.numpy() + return dy_out, dy_param_init_value, dy_param_value + + with fluid.dygraph.guard(): + dy_out, dy_param_init_value, dy_param_value = run_dygraph() + + with fluid.dygraph.guard(): + with _test_eager_guard(): + eager_out, eager_param_init_value, eager_param_value = run_dygraph( + ) + with new_program_scope(): paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) @@ -171,6 +182,16 @@ class TestImperativeMnist(unittest.TestCase): for key, value in six.iteritems(static_param_value): self.assertTrue(np.equal(value, dy_param_value[key]).all()) + # check eager + for key, value in six.iteritems(static_param_init_value): + self.assertTrue(np.equal(value, eager_param_init_value[key]).all()) + + self.assertTrue(np.equal(static_out, eager_out).all()) + + for key, value in six.iteritems(static_param_value): + self.assertTrue(np.equal(value, eager_param_value[key]).all()) + if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py index 8f8890557ad..3fbb7f4cf7b 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py @@ -24,6 +24,7 @@ from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear from paddle.fluid.dygraph.base import to_variable from test_imperative_base import new_program_scope +from paddle.fluid.framework import _test_eager_guard if fluid.is_compiled_with_cuda(): fluid.set_flags({'FLAGS_cudnn_deterministic': True}) @@ -310,7 +311,8 @@ class TestImperativeResneXt(unittest.TestCase): batch_size = train_parameters["batch_size"] batch_num = 1 epoch_num = 1 - with fluid.dygraph.guard(): + + def run_dygraph(): paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) @@ -371,6 +373,17 @@ class TestImperativeResneXt(unittest.TestCase): for param in se_resnext.parameters(): dy_param_value[param.name] = param.numpy() + return dy_out, dy_param_init_value, dy_param_value, dy_grad_value + + with fluid.dygraph.guard(): + dy_out, dy_param_init_value, dy_param_value, dy_grad_value = run_dygraph( + ) + + with fluid.dygraph.guard(): + with _test_eager_guard(): + eager_out, eager_param_init_value, eager_param_value, eager_grad_value = run_dygraph( + ) + with new_program_scope(): paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) @@ -479,6 +492,32 @@ class TestImperativeResneXt(unittest.TestCase): self.assertTrue(np.isfinite(value.all())) self.assertFalse(np.isnan(value.any())) + # check eager + self.assertTrue( + np.allclose(static_out, eager_out), + "\nstatic_out: {}\neager_out: {}".format(static_out, eager_out)) + + self.assertEqual( + len(eager_param_init_value), len(static_param_init_value)) + + for key, value in 
six.iteritems(static_param_init_value): + self.assertTrue(np.allclose(value, eager_param_init_value[key])) + + self.assertEqual(len(eager_grad_value), len(static_grad_value)) + + for key, value in six.iteritems(static_grad_value): + self.assertTrue( + np.allclose(value, eager_grad_value[key]), + "\nstatic_grad_value: {}\neager_grad_value: {}".format( + value, eager_grad_value[key])) + + self.assertEqual(len(eager_param_value), len(static_param_value)) + for key, value in six.iteritems(static_param_value): + self.assertTrue( + np.allclose(value, eager_param_value[key]), + "\nstatic_param_value: {}\neagear_param_value: {}".format( + value, eager_param_value[key])) + if __name__ == '__main__': paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py index 3f129cae44a..010c8aeccac 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py @@ -21,6 +21,7 @@ from paddle.fluid import Embedding, LayerNorm, Linear, Layer from paddle.fluid.dygraph import to_variable, guard from paddle.fluid.dygraph import TracedLayer from test_imperative_base import new_program_scope +from paddle.fluid.framework import _test_eager_guard from paddle.fluid import core import numpy as np import six @@ -949,8 +950,7 @@ class TestDygraphTransformerSortGradient(unittest.TestCase): def transformer_sort_gradient_float32(self, is_sparse): seed = 90 - with guard(): - fluid.set_flags({'FLAGS_sort_sum_gradient': True}) + def run_dygraph(): # NOTE(xiongkun03): In new executor, the inplace strategy is on by default, which will cause result of sumop have some differences. So we disable inplace. 
fluid.set_flags({'FLAGS_new_executor_use_inplace': False}) paddle.seed(seed) @@ -998,7 +998,7 @@ class TestDygraphTransformerSortGradient(unittest.TestCase): for i in range(batch_num): enc_inputs, dec_inputs, label, weights = create_data() - if i % 2 == 0: + if False: outs, traced_layer = TracedLayer.trace( transformer, [enc_inputs, dec_inputs, label, weights]) @@ -1036,6 +1036,14 @@ class TestDygraphTransformerSortGradient(unittest.TestCase): dy_predict_value = dy_predict.numpy() dy_token_num_value = dy_token_num.numpy() + return dy_avg_cost_value, dy_sum_cost_value, dy_predict_value, dy_token_num_value, \ + dy_param_init, dy_param_updated + + with guard(): + fluid.set_flags({'FLAGS_sort_sum_gradient': True}) + dy_avg_cost_value, dy_sum_cost_value, dy_predict_value, dy_token_num_value, \ + dy_param_init, dy_param_updated = run_dygraph() + with new_program_scope(): paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) @@ -1122,6 +1130,28 @@ class TestDygraphTransformerSortGradient(unittest.TestCase): for key, value in six.iteritems(static_param_updated): self.assertTrue(np.array_equal(value, dy_param_updated[key])) + # check eager result + with guard(): + fluid.set_flags({'FLAGS_sort_sum_gradient': False}) + dy_avg_cost_value, dy_sum_cost_value, dy_predict_value, dy_token_num_value, \ + dy_param_init, dy_param_updated = run_dygraph() + + with guard(): + with _test_eager_guard(): + eager_avg_cost_value, eager_sum_cost_value, eager_predict_value, eager_token_num_value, \ + eager_param_init, eager_param_updated = run_dygraph() + self.assertTrue(np.allclose(dy_avg_cost_value, eager_avg_cost_value)) + self.assertTrue(np.allclose(dy_sum_cost_value, eager_sum_cost_value)) + + self.assertTrue(np.allclose(dy_predict_value, eager_predict_value)) + self.assertTrue(np.allclose(dy_token_num_value, eager_token_num_value)) + + for key, value in six.iteritems(static_param_init): + self.assertTrue(np.array_equal(value, eager_param_init[key])) + for key, value in six.iteritems(dy_param_updated): + self.assertTrue(np.allclose(value, eager_param_updated[key])) + if __name__ == '__main__': + paddle.enable_static() unittest.main() -- GitLab From 70540b2684c5bef920f3bd0c445b391ce9f9fb49 Mon Sep 17 00:00:00 2001 From: Feiyu Chan Date: Fri, 4 Mar 2022 17:02:51 +0800 Subject: [PATCH 058/261] [phi] move cpu_vec (#39714) move cpu_vec.h to phi/kernels/funcs. --- paddle/fluid/operators/attention_lstm_op.cc | 18 +- .../fused/fused_embedding_fc_lstm_op.cc | 6 +- .../fused/fusion_seqexpand_concat_fc_op.cc | 6 +- paddle/fluid/operators/math/CMakeLists.txt | 1 - paddle/phi/kernels/funcs/cpu_vec.h | 675 ++++++++++++++++++ paddle/phi/tests/kernels/CMakeLists.txt | 2 + .../tests/kernels/test_cpu_vec.cc} | 112 +-- 7 files changed, 756 insertions(+), 64 deletions(-) create mode 100644 paddle/phi/kernels/funcs/cpu_vec.h rename paddle/{fluid/operators/math/cpu_vec_test.cc => phi/tests/kernels/test_cpu_vec.cc} (75%) diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index a23e484d0a8..78ea8b6b6fb 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -14,10 +14,10 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/attention_lstm_op.h" #include -#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/cpu_vec.h" namespace paddle { namespace operators { @@ -269,10 +269,10 @@ use lstm_x_t as input and compute as standard LSTM. template inline void bias_relu(const int n, const T* x, const T* bias, T* y) { if (bias) { - math::vec_add_bias(n, *bias, x, y); - math::vec_relu(n, y, y); + phi::funcs::vec_add_bias(n, *bias, x, y); + phi::funcs::vec_relu(n, y, y); } else { - math::vec_relu(n, x, y); + phi::funcs::vec_relu(n, x, y); } } @@ -283,14 +283,14 @@ inline void vec_softmax(const int n, const T* x, T* y) { for (int i = 1; i < n; ++i) { scalar = scalar < x[i] ? x[i] : scalar; } - math::vec_add_bias(n, -scalar, x, y); // sub - math::vec_exp(n, y, y); // exp + phi::funcs::vec_add_bias(n, -scalar, x, y); // sub + phi::funcs::vec_exp(n, y, y); // exp // sum scalar = T(0); for (int i = 0; i < n; ++i) { scalar += y[i]; } - math::vec_scal(n, static_cast(1) / scalar, y); // scale + phi::funcs::vec_scal(n, static_cast(1) / scalar, y); // scale } template @@ -344,12 +344,12 @@ class AttentionLSTMKernel : public framework::OpKernel { auto& act_cell_str = ctx.Attr("cell_activation"); auto& act_cand_str = ctx.Attr("candidate_activation"); if (platform::MayIUse(platform::avx)) { - math::VecActivations act_functor; + phi::funcs::VecActivations act_functor; act_gate = act_functor(act_gate_str); act_cell = act_functor(act_cell_str); act_cand = act_functor(act_cand_str); } else { - math::VecActivations act_functor; + phi::funcs::VecActivations act_functor; act_gate = act_functor(act_gate_str); act_cell = act_functor(act_cell_str); act_cand = act_functor(act_cand_str); diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index 0c83c36b475..7308f307792 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -14,9 +14,9 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h" #include -#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/cpu_vec.h" #include "paddle/phi/kernels/funcs/sequence2batch.h" namespace paddle { @@ -243,12 +243,12 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { auto& act_cell_str = ctx.Attr("cell_activation"); \ auto& act_cand_str = ctx.Attr("candidate_activation"); \ if (platform::MayIUse(platform::avx)) { \ - math::VecActivations act_functor; \ + phi::funcs::VecActivations act_functor; \ act_gate = act_functor(act_gate_str); \ act_cell = act_functor(act_cell_str); \ act_cand = act_functor(act_cand_str); \ } else { \ - math::VecActivations act_functor; \ + phi::funcs::VecActivations act_functor; \ act_gate = act_functor(act_gate_str); \ act_cell = act_functor(act_cell_str); \ act_cand = act_functor(act_cand_str); \ diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc index 88fb7349d53..1000d0488dc 100644 --- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc @@ -14,10 +14,10 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h" #include -#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/cpu_vec.h" namespace paddle { namespace operators { @@ -196,10 +196,10 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel { std::function fc_act; auto& fc_act_str = ctx.Attr("fc_activation"); if (platform::MayIUse(platform::avx)) { - math::VecActivations act_functor; + phi::funcs::VecActivations act_functor; fc_act = act_functor(fc_act_str); } else { - math::VecActivations act_functor; + phi::funcs::VecActivations act_functor; fc_act = act_functor(fc_act_str); } diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index ba047355ad7..14b12ca3acb 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -70,7 +70,6 @@ if(WITH_GPU AND (NOT WITH_ROCM)) endif() endif() -cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) if(WITH_TESTING AND TEST im2col_test) set_tests_properties(im2col_test PROPERTIES TIMEOUT 120) endif() diff --git a/paddle/phi/kernels/funcs/cpu_vec.h b/paddle/phi/kernels/funcs/cpu_vec.h new file mode 100644 index 00000000000..7bb2a5fcfb3 --- /dev/null +++ b/paddle/phi/kernels/funcs/cpu_vec.h @@ -0,0 +1,675 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include + +#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/enforce.h" + +#ifdef PADDLE_WITH_MKLML +#include "paddle/fluid/platform/dynload/mklml.h" +#endif + +namespace phi { +namespace funcs { + +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 + +#define YMM_FLOAT_BLOCK 8 +#define AVX_DOUBLE_BLOCK 4 +#define YMM_FLOAT_BLOCK 8 +#define AVX2_DOUBLE_BLOCK 4 +#define ZMM_FLOAT_BLOCK 16 +#define AVX512_DOUBLE_BLOCK 8 + +template +inline void vec_exp(const int n, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = std::exp(x[i]); + } +} + +template +inline void vec_scal(const int n, const T a, T* x) { + for (int i = 0; i < n; ++i) { + x[i] = a * x[i]; + } +} + +#ifdef PADDLE_WITH_MKLML +template <> +inline void vec_exp(const int n, const float* x, float* y) { + constexpr int small_enough = 128; + if (n < small_enough) { + for (int i = 0; i < n; ++i) { + y[i] = std::exp(x[i]); + } + } else { + paddle::platform::dynload::vsExp(n, x, y); + } +} + +template <> +inline void vec_exp(const int n, const double* x, double* y) { + paddle::platform::dynload::vdExp(n, x, y); +} + +template <> +inline void vec_scal(const int n, const float a, float* x) { + paddle::platform::dynload::cblas_sscal(n, a, x, 1); +} + +template <> +inline void vec_scal(const int n, const double a, double* x) { + paddle::platform::dynload::cblas_dscal(n, a, x, 1); +} +#endif + +// MKL scal only support inplace, choose this if src and dst are not equal +template +inline void vec_scal(const int n, const T a, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = a * x[i]; + } +} + +template <> +inline void vec_scal(const int n, + const float a, + const float* x, + float* y) { +#ifdef __AVX__ + constexpr int block = YMM_FLOAT_BLOCK; + if (n < block) { + vec_scal(n, a, x, y); + return; + } + const int rest = n % block; + const int end = n - rest; + int i = 0; + __m256 scalar = _mm256_set1_ps(a); + __m256 tmp; +#define MOVE_ONE_STEP \ + tmp = _mm256_loadu_ps(x + i); \ + tmp = _mm256_mul_ps(tmp, scalar); \ + _mm256_storeu_ps(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } +#undef MOVE_ONE_STEP + if (rest == 0) { + return; + } + // can not continue move step if src and dst are inplace + for (i = n - rest; i < n; ++i) { + y[i] = a * x[i]; + } +#else + vec_scal(n, a, x, y); +#endif +} + +template <> +inline void vec_scal(const int n, + const float a, + const float* x, + float* y) { + vec_scal(n, a, x, y); +} + +template <> +inline void vec_scal(const int n, + const float a, + const float* x, + float* y) { + // TODO(TJ): enable me + vec_scal(n, a, x, y); +} + +template +inline void vec_sum(const size_t n, const T* x, T* s) { + s[0] = x[0]; + for (size_t i = 1; i < n; ++i) { + s[0] += x[i]; + } +} + +template <> +inline void vec_sum(const size_t n, + const float* x, + float* s) { +#ifdef __AVX__ + constexpr unsigned int block = YMM_FLOAT_BLOCK; + if (n < block) { + vec_sum(n, x, s); + return; + } + + unsigned int i, end; + i = end = 0; + s[0] = 0.f; + + end = n & ~(block - 1); + __m256 tmp = _mm256_setzero_ps(); + for (i = 0; i < end; i += block) { + tmp = _mm256_add_ps(tmp, _mm256_loadu_ps(x + i)); + } + + __m256 hsum = _mm256_hadd_ps(tmp, tmp); + hsum = _mm256_add_ps(hsum, _mm256_permute2f128_ps(hsum, hsum, 0x1)); + _mm_store_ss( + s, + _mm_hadd_ps(_mm256_castps256_ps128(hsum), _mm256_castps256_ps128(hsum))); + + for (; i < n; i++) { + s[0] += x[i]; + } +#else + vec_sum(n, x, s); +#endif +} + +template +inline 
void vec_mul(const size_t n, const T* x, const T* y, T* z) { + for (size_t i = 0; i < n; ++i) { + z[i] = x[i] * y[i]; + } +} + +template <> +inline void vec_mul(const size_t n, + const float* x, + const float* y, + float* z) { +#ifdef __AVX__ + constexpr unsigned int block = YMM_FLOAT_BLOCK; + if (n < block) { + vec_mul(n, x, y, z); + return; + } + + unsigned int i = 0, end = 0; + end = n & ~(block - 1); + for (i = 0; i < end; i += block) { + _mm256_storeu_ps( + z + i, _mm256_mul_ps(_mm256_loadu_ps(x + i), _mm256_loadu_ps(y + i))); + } + + for (; i < n; i++) { + z[i] = x[i] * y[i]; + } +#else + vec_mul(n, x, y, z); +#endif +} + +template +inline void vec_mul_reduce(const size_t n, const T* x, const T* y, T* z) { + z[0] = x[0] * y[0]; + for (size_t i = 1; i < n; ++i) { + z[0] += x[i] * y[i]; + } +} + +template <> +inline void vec_mul_reduce(const size_t n, + const float* x, + const float* y, + float* z) { +#ifdef __AVX__ + constexpr unsigned int block = YMM_FLOAT_BLOCK; + if (n < block) { + vec_mul_reduce(n, x, y, z); + return; + } + + unsigned int i = 0, end = 0; + z[0] = 0.f; + + end = n & ~(block - 1); + __m256 tmp = _mm256_setzero_ps(); + for (i = 0; i < end; i += block) { + tmp = _mm256_add_ps( + tmp, _mm256_mul_ps(_mm256_loadu_ps(x + i), _mm256_loadu_ps(y + i))); + } + + __m256 hsum = _mm256_hadd_ps(tmp, tmp); + hsum = _mm256_add_ps(hsum, _mm256_permute2f128_ps(hsum, hsum, 0x1)); + _mm_store_ss( + z, + _mm_hadd_ps(_mm256_castps256_ps128(hsum), _mm256_castps256_ps128(hsum))); + + for (; i < n; i++) { + z[0] += x[i] * y[i]; + } +#else + vec_mul_reduce(n, x, y, z); +#endif +} + +template +inline void vec_bias_sub(const int n, const T a, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = a - x[i]; + } +} + +template <> +inline void vec_bias_sub(const int n, + const float a, + const float* x, + float* y) { +#ifdef __AVX__ + constexpr int block = YMM_FLOAT_BLOCK; + if (n < block) { + vec_bias_sub(n, a, x, y); + return; + } + const int rest = n % block; + const int end = n - rest; + int i = 0; + __m256 bias = _mm256_set1_ps(a); + __m256 tmp; +#define MOVE_ONE_STEP \ + tmp = _mm256_loadu_ps(x + i); \ + tmp = _mm256_sub_ps(bias, tmp); \ + _mm256_storeu_ps(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } +#undef MOVE_ONE_STEP + if (rest == 0) { + return; + } + // can not continue move step if src and dst are inplace + for (i = n - rest; i < n; ++i) { + y[i] = a - x[i]; + } +#else + vec_bias_sub(n, a, x, y); +#endif +} + +template <> +inline void vec_bias_sub(const int n, + const float a, + const float* x, + float* y) { + vec_bias_sub(n, a, x, y); +} + +template <> +inline void vec_bias_sub(const int n, + const float a, + const float* x, + float* y) { + // TODO(TJ): enable me + vec_bias_sub(n, a, x, y); +} + +// out = x*y + (1-x)*z +template +inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) { + for (int i = 0; i < n; ++i) { + out[i] = x[i] * y[i] + (static_cast(1) - x[i]) * z[i]; + } +} + +template <> +inline void vec_cross( + const int n, const float* x, const float* y, const float* z, float* out) { +#ifdef __AVX__ + constexpr int block = YMM_FLOAT_BLOCK; + if (n < block) { + vec_cross(n, x, y, z, out); + return; + } + const int rest = n % block; + const int end = n - rest; + int i = 0; + __m256 bias = _mm256_set1_ps(1.f); + __m256 tmpx, tmpy, tmpz; + for (i = 0; i < end; i += block) { + tmpx = _mm256_loadu_ps(x + i); + tmpy = _mm256_loadu_ps(y + i); + tmpz = _mm256_loadu_ps(z + i); + tmpy = _mm256_mul_ps(tmpx, tmpy); + tmpx = 
_mm256_sub_ps(bias, tmpx); + tmpz = _mm256_mul_ps(tmpx, tmpz); + tmpz = _mm256_add_ps(tmpy, tmpz); + _mm256_storeu_ps(out + i, tmpz); + } + if (rest == 0) { + return; + } + // can not continue move step if src and dst are inplace + for (i = n - rest; i < n; ++i) { + out[i] = x[i] * y[i] + (1.f - x[i]) * z[i]; + } +#else + vec_cross(n, x, y, z, out); +#endif +} + +template <> +inline void vec_cross( + const int n, const float* x, const float* y, const float* z, float* out) { + vec_cross(n, x, y, z, out); +} + +template <> +inline void vec_cross( + const int n, const float* x, const float* y, const float* z, float* out) { + // TODO(TJ): enable me + vec_cross(n, x, y, z, out); +} + +template +inline void vec_clip(const size_t n, const T a, const T* x, T* y) { + for (size_t i = 0; i < n; ++i) { + y[i] = x[i] < a ? a : x[i]; + } +} + +template <> +inline void vec_clip(const size_t n, + const float a, + const float* x, + float* y) { +#ifdef __AVX__ + constexpr unsigned int block = YMM_FLOAT_BLOCK; + if (n < block) { + vec_clip(n, a, x, y); + return; + } + + unsigned int i = 0, end = 0; + end = n & ~(block - 1); + __m256 threshold = _mm256_set1_ps(a); + + for (i = 0; i < end; i += block) { + _mm256_storeu_ps(y + i, _mm256_max_ps(_mm256_loadu_ps(x + i), threshold)); + } + + for (; i < n; i++) { + y[i] = x[i] < a ? a : x[i]; + } +#else + vec_clip(n, a, x, y); +#endif +} + +template +inline void vec_add_bias(const int n, const T a, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] + a; + } +} + +template <> +inline void vec_add_bias(const int n, + const float a, + const float* x, + float* y) { +#ifdef __AVX__ + constexpr int block = YMM_FLOAT_BLOCK; + if (n < block) { + vec_add_bias(n, a, x, y); + return; + } + const int rest = n % block; + const int end = n - rest; + int i = 0; + __m256 bias = _mm256_set1_ps(a); + __m256 tmp; +#define MOVE_ONE_STEP \ + tmp = _mm256_loadu_ps(x + i); \ + tmp = _mm256_add_ps(tmp, bias); \ + _mm256_storeu_ps(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } +#undef MOVE_ONE_STEP + if (rest == 0) { + return; + } + // can not continue move step if src and dst are inplace + for (i = n - rest; i < n; ++i) { + y[i] = x[i] + a; + } +#else + vec_add_bias(n, a, x, y); +#endif +} + +template <> +inline void vec_add_bias(const int n, + const float a, + const float* x, + float* y) { + vec_add_bias(n, a, x, y); +} + +template <> +inline void vec_add_bias(const int n, + const float a, + const float* x, + float* y) { + // TODO(TJ): enable me + vec_add_bias(n, a, x, y); +} + +template +inline void vec_identity(const int n, const T* x, T* y) { + // do nothing + return; +} + +template +inline void vec_sigmoid(const int n, const T* x, T* y) { + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < n; ++i) { + y[i] = (x[i] < min) ? min : ((x[i] > max) ? 
max : x[i]); + y[i] = static_cast(0) - y[i]; + } + vec_exp(n, y, y); + for (int i = 0; i < n; ++i) { + y[i] = static_cast(1) / (static_cast(1) + y[i]); + } +} + +template <> +inline void vec_sigmoid(const int n, + const float* x, + float* y) { +#ifdef __AVX__ + constexpr int block = YMM_FLOAT_BLOCK; + if (n < block) { + vec_sigmoid(n, x, y); + return; + } + const int rest = n % block; + const int end = n - rest; + int i = 0; + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); + __m256 zeros = _mm256_setzero_ps(); + __m256 tmp; +#define MOVE_ONE_STEP \ + tmp = _mm256_loadu_ps(x + i); \ + tmp = _mm256_max_ps(tmp, min); \ + tmp = _mm256_min_ps(tmp, max); \ + tmp = _mm256_sub_ps(zeros, tmp); \ + _mm256_storeu_ps(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } +#undef MOVE_ONE_STEP + if (rest != 0) { + // can not continue move step since the src and dst address could be equal + const float xmin = SIGMOID_THRESHOLD_MIN; + const float xmax = SIGMOID_THRESHOLD_MAX; + for (i = n - rest; i < n; ++i) { + y[i] = 0.f - ((x[i] < xmin) ? xmin : ((x[i] > xmax) ? xmax : x[i])); + } + } + + vec_exp(n, y, y); + + __m256 ones = _mm256_set1_ps(1.0f); +#define MOVE_ONE_STEP \ + tmp = _mm256_loadu_ps(y + i); \ + tmp = _mm256_add_ps(ones, tmp); \ + tmp = _mm256_div_ps(ones, tmp); \ + _mm256_storeu_ps(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } +#undef MOVE_ONE_STEP + if (rest == 0) { + return; + } + // can not continue move step + for (i = n - rest; i < n; ++i) { + y[i] = 1.f / (1.f + y[i]); + } +#else + vec_sigmoid(n, x, y); +#endif +} + +template <> +inline void vec_sigmoid(const int n, + const float* x, + float* y) { + vec_sigmoid(n, x, y); +} + +template <> +inline void vec_sigmoid(const int n, + const float* x, + float* y) { + // TODO(TJ): enable me + vec_sigmoid(n, x, y); +} + +template +inline void vec_tanh(const int n, const T* x, T* y) { + vec_scal(n, static_cast(2), x, y); + vec_sigmoid(n, y, y); + vec_scal(n, static_cast(2), y); + vec_add_bias(n, static_cast(-1), y, y); +} + +// TODO(TJ): make relu clip +template +inline void vec_relu(const int n, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0 ? 
x[i] : 0; + } +} + +template <> +inline void vec_relu(const int n, + const float* x, + float* y) { +#ifdef __AVX__ + constexpr int block = YMM_FLOAT_BLOCK; + if (n < block * 4) { + vec_relu(n, x, y); + return; + } + + const int rest = n % block; + const int end = n - rest; + int i = 0; + __m256 zeros = _mm256_setzero_ps(); + __m256 tmp; +#define MOVE_ONE_STEP \ + tmp = _mm256_loadu_ps(x + i); \ + tmp = _mm256_max_ps(tmp, zeros); \ + _mm256_storeu_ps(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } + if (rest == 0) { + return; + } + i = n - block; + MOVE_ONE_STEP; +#undef MOVE_ONE_STEP + +#else + vec_relu(n, x, y); +#endif +} + +template <> +inline void vec_relu(const int n, + const float* x, + float* y) { + vec_relu(n, x, y); +} + +template <> +inline void vec_relu(const int n, + const float* x, + float* y) { + // TODO(TJ): enable me + vec_relu(n, x, y); +} + +// TODO(TJ): optimize double of sigmoid, tanh and relu if necessary + +template +class VecActivations { + public: + std::function operator()( + const std::string& type) { + if (type == "sigmoid") { + return vec_sigmoid; + } else if (type == "relu") { + return vec_relu; + } else if (type == "tanh") { + return vec_tanh; + } else if (type == "identity" || type == "") { + return vec_identity; + } + PADDLE_THROW(phi::errors::InvalidArgument( + "Expected type should be one of sigmod, relu, tanh, identity. But got " + "not support type: %s.", + type)); + } +}; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/tests/kernels/CMakeLists.txt b/paddle/phi/tests/kernels/CMakeLists.txt index c92e10f8dd7..317dcce92c8 100644 --- a/paddle/phi/tests/kernels/CMakeLists.txt +++ b/paddle/phi/tests/kernels/CMakeLists.txt @@ -22,3 +22,5 @@ endif() if(WITH_ROCM) hip_test(test_math_function_gpu SRCS test_math_function.cu DEPS math_function) endif() + +cc_test(test_cpu_vec SRCS test_cpu_vec.cc DEPS blas cpu_info) diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/phi/tests/kernels/test_cpu_vec.cc similarity index 75% rename from paddle/fluid/operators/math/cpu_vec_test.cc rename to paddle/phi/tests/kernels/test_cpu_vec.cc index 859afec3781..271143f9f6f 100644 --- a/paddle/fluid/operators/math/cpu_vec_test.cc +++ b/paddle/phi/tests/kernels/test_cpu_vec.cc @@ -18,7 +18,10 @@ limitations under the License. 
*/ #include "glog/logging.h" #include "gtest/gtest.h" -#include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/phi/kernels/funcs/cpu_vec.h" + +namespace phi { +namespace tests { inline double GetCurrentUS() { struct timeval time; @@ -62,7 +65,9 @@ void ref_relu(const int n, const T* x, T* y) { } template -void RandomVec(const int n, T* a, const T lower = static_cast(-20.f), +void RandomVec(const int n, + T* a, + const T lower = static_cast(-20.f), const T upper = static_cast(20.f)) { static unsigned int seed = 100; std::mt19937 rng(seed++); @@ -73,7 +78,8 @@ void RandomVec(const int n, T* a, const T lower = static_cast(-20.f), } template -void TestAndBench(const int n, std::function tgt, +void TestAndBench(const int n, + std::function tgt, std::function ref) { std::vector x(n); std::vector ytgt(n), yref(n); @@ -101,47 +107,48 @@ void TestAndBench(const int n, std::function tgt, TEST(CpuVecTest, sigmoid) { namespace platform = paddle::platform; - using namespace paddle::operators::math; // NOLINT + using namespace phi::funcs; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestAndBench(sz, vec_sigmoid, ref_sigmoid); - TestAndBench(sz, vec_sigmoid, - ref_sigmoid); - TestAndBench(sz, vec_sigmoid, - ref_sigmoid); - TestAndBench(sz, vec_sigmoid, - ref_sigmoid); + TestAndBench( + sz, vec_sigmoid, ref_sigmoid); + TestAndBench( + sz, vec_sigmoid, ref_sigmoid); + TestAndBench( + sz, vec_sigmoid, ref_sigmoid); } TestAndBench(30, vec_sigmoid, ref_sigmoid); } TEST(CpuVecTest, tanh) { namespace platform = paddle::platform; - using namespace paddle::operators::math; // NOLINT + using namespace phi::funcs; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestAndBench(sz, vec_tanh, ref_tanh); TestAndBench(sz, vec_tanh, ref_tanh); TestAndBench(sz, vec_tanh, ref_tanh); - TestAndBench(sz, vec_tanh, - ref_tanh); + TestAndBench( + sz, vec_tanh, ref_tanh); } TestAndBench(30, vec_tanh, ref_tanh); } TEST(CpuVecTest, relu) { namespace platform = paddle::platform; - using namespace paddle::operators::math; // NOLINT + using namespace phi::funcs; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestAndBench(sz, vec_relu, ref_relu); TestAndBench(sz, vec_relu, ref_relu); TestAndBench(sz, vec_relu, ref_relu); - TestAndBench(sz, vec_relu, - ref_relu); + TestAndBench( + sz, vec_relu, ref_relu); } TestAndBench(30, vec_relu, ref_relu); } template -void compare_sum(size_t n, std::function tgt, +void compare_sum(size_t n, + std::function tgt, std::function ref) { std::vector x(n); T ytgt_data, yref_data; @@ -155,18 +162,19 @@ void compare_sum(size_t n, std::function tgt, TEST(CpuVecTest, vec_sum) { namespace platform = paddle::platform; - using namespace paddle::operators::math; // NOLINT + using namespace phi::funcs; // NOLINT for (size_t sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { compare_sum(sz, vec_sum, vec_sum); - compare_sum(sz, vec_sum, - vec_sum); + compare_sum( + sz, vec_sum, vec_sum); } compare_sum(30U, vec_sum, vec_sum); } template void compare_clip( - size_t n, T threshold, + size_t n, + T threshold, std::function tgt, std::function ref) { std::vector x(n); @@ -185,20 +193,23 @@ void compare_clip( TEST(CpuVecTest, vec_clip) { namespace platform = paddle::platform; - using namespace paddle::operators::math; // NOLINT + using namespace phi::funcs; // NOLINT for (size_t sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { - compare_clip(sz, -4.f, vec_clip, - vec_clip); - compare_clip(sz, -1.1f, vec_clip, + compare_clip( + sz, -4.f, vec_clip, vec_clip); + 
compare_clip(sz, + -1.1f, + vec_clip, vec_clip); } - compare_clip(30U, 1.0, vec_clip, - vec_clip); + compare_clip( + 30U, 1.0, vec_clip, vec_clip); } template void compare_mul( - size_t n, std::function tgt, + size_t n, + std::function tgt, std::function ref) { std::vector x(n), y(n); std::vector ztgt(n), zref(n); @@ -220,18 +231,19 @@ void compare_mul( TEST(CpuVecTest, vec_mul) { namespace platform = paddle::platform; - using namespace paddle::operators::math; // NOLINT + using namespace phi::funcs; // NOLINT for (size_t sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { compare_mul(sz, vec_mul, vec_mul); - compare_mul(sz, vec_mul, - vec_mul); + compare_mul( + sz, vec_mul, vec_mul); } compare_mul(30U, vec_mul, vec_mul); } template void compare_mul_reduce( - size_t n, std::function tgt, + size_t n, + std::function tgt, std::function ref) { std::vector x(n), y(n); T ztgt_data, zref_data; @@ -249,19 +261,21 @@ void compare_mul_reduce( TEST(CpuVecTest, vec_mul_reduce) { namespace platform = paddle::platform; - using namespace paddle::operators::math; // NOLINT + using namespace phi::funcs; // NOLINT for (size_t sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { - compare_mul_reduce(sz, vec_mul_reduce, - vec_mul_reduce); - compare_mul_reduce(sz, vec_mul_reduce, + compare_mul_reduce( + sz, vec_mul_reduce, vec_mul_reduce); + compare_mul_reduce(sz, + vec_mul_reduce, vec_mul_reduce); } - compare_mul_reduce(30U, vec_mul_reduce, - vec_mul_reduce); + compare_mul_reduce( + 30U, vec_mul_reduce, vec_mul_reduce); } template -void TestInplace(const int n, std::function tgt, +void TestInplace(const int n, + std::function tgt, std::function ref) { std::vector x(n); std::vector ytgt(n), yref(n); @@ -283,22 +297,22 @@ void TestInplace(const int n, std::function tgt, TEST(CpuVecTest, inplace_sigmoid) { namespace platform = paddle::platform; - using namespace paddle::operators::math; // NOLINT + using namespace phi::funcs; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestInplace(sz, vec_sigmoid, ref_sigmoid); - TestInplace(sz, vec_sigmoid, - ref_sigmoid); - TestInplace(sz, vec_sigmoid, - ref_sigmoid); - TestInplace(sz, vec_sigmoid, - ref_sigmoid); + TestInplace( + sz, vec_sigmoid, ref_sigmoid); + TestInplace( + sz, vec_sigmoid, ref_sigmoid); + TestInplace( + sz, vec_sigmoid, ref_sigmoid); } TestInplace(30, vec_sigmoid, ref_sigmoid); } TEST(CpuVecTest, inplace_tanh) { namespace platform = paddle::platform; - using namespace paddle::operators::math; // NOLINT + using namespace phi::funcs; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestInplace(sz, vec_tanh, ref_tanh); TestInplace(sz, vec_tanh, ref_tanh); @@ -310,7 +324,7 @@ TEST(CpuVecTest, inplace_tanh) { TEST(CpuVecTest, inplace_relu) { namespace platform = paddle::platform; - using namespace paddle::operators::math; // NOLINT + using namespace phi::funcs; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestInplace(sz, vec_relu, ref_relu); TestInplace(sz, vec_relu, ref_relu); @@ -319,3 +333,5 @@ TEST(CpuVecTest, inplace_relu) { } TestInplace(30, vec_relu, ref_relu); } +} // namespace tests +} // namespace phi -- GitLab From 1ca379bf27af4bb4044c11e736881ebe9385c9f4 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Fri, 4 Mar 2022 17:04:15 +0800 Subject: [PATCH 059/261] Move gather_nd/scatter/scatter_nd_add op to the phi library (#40090) * move gather_nd/scatter/scatter_nd_add * fix npu/xpu ci * follow comments * small fix --- paddle/fluid/operators/gather_nd_op.cc | 81 
+++--------- paddle/fluid/operators/gather_nd_op.cu | 109 ---------------- paddle/fluid/operators/gather_nd_op.h | 97 --------------- paddle/fluid/operators/gather_nd_op_npu.cc | 5 +- paddle/fluid/operators/gather_nd_op_xpu.cc | 11 +- paddle/fluid/operators/scatter_nd_add_op.cc | 112 +++-------------- paddle/fluid/operators/scatter_nd_add_op.cu | 101 --------------- paddle/fluid/operators/scatter_nd_add_op.h | 89 -------------- paddle/fluid/operators/scatter_op.cc | 74 ++--------- paddle/fluid/operators/scatter_op.cu | 116 ------------------ paddle/fluid/operators/scatter_op.h | 113 ----------------- paddle/fluid/operators/scatter_op_npu.cc | 1 - paddle/fluid/operators/scatter_op_xpu.cc | 5 +- paddle/phi/infermeta/backward.cc | 45 +++++++ paddle/phi/infermeta/backward.h | 14 +++ paddle/phi/infermeta/binary.cc | 33 +++++ paddle/phi/infermeta/binary.h | 4 + paddle/phi/infermeta/ternary.cc | 103 ++++++++++++++++ paddle/phi/infermeta/ternary.h | 16 +++ .../phi/kernels/cpu/gather_nd_grad_kernel.cc | 64 ++++++++++ paddle/phi/kernels/cpu/gather_nd_kernel.cc | 60 +++++++++ paddle/phi/kernels/cpu/scatter_grad_kernel.cc | 73 +++++++++++ paddle/phi/kernels/cpu/scatter_kernel.cc | 63 ++++++++++ .../kernels/cpu/scatter_nd_add_grad_kernel.cc | 55 +++++++++ .../phi/kernels/cpu/scatter_nd_add_kernel.cc | 60 +++++++++ paddle/phi/kernels/gather_nd_grad_kernel.h | 28 +++++ paddle/phi/kernels/gather_nd_kernel.h | 27 ++++ .../phi/kernels/gpu/gather_nd_grad_kernel.cu | 65 ++++++++++ paddle/phi/kernels/gpu/gather_nd_kernel.cu | 60 +++++++++ paddle/phi/kernels/gpu/scatter_grad_kernel.cu | 74 +++++++++++ paddle/phi/kernels/gpu/scatter_kernel.cu | 62 ++++++++++ .../kernels/gpu/scatter_nd_add_grad_kernel.cu | 55 +++++++++ .../phi/kernels/gpu/scatter_nd_add_kernel.cu | 58 +++++++++ paddle/phi/kernels/scatter_grad_kernel.h | 29 +++++ paddle/phi/kernels/scatter_kernel.h | 29 +++++ .../phi/kernels/scatter_nd_add_grad_kernel.h | 29 +++++ paddle/phi/kernels/scatter_nd_add_kernel.h | 28 +++++ paddle/phi/ops/compat/gather_scatter_sig.cc | 46 +++++++ 38 files changed, 1241 insertions(+), 853 deletions(-) delete mode 100644 paddle/fluid/operators/gather_nd_op.cu delete mode 100644 paddle/fluid/operators/gather_nd_op.h delete mode 100644 paddle/fluid/operators/scatter_nd_add_op.cu delete mode 100644 paddle/fluid/operators/scatter_nd_add_op.h delete mode 100644 paddle/fluid/operators/scatter_op.cu delete mode 100644 paddle/fluid/operators/scatter_op.h create mode 100644 paddle/phi/kernels/cpu/gather_nd_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/gather_nd_kernel.cc create mode 100644 paddle/phi/kernels/cpu/scatter_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/scatter_kernel.cc create mode 100644 paddle/phi/kernels/cpu/scatter_nd_add_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/scatter_nd_add_kernel.cc create mode 100644 paddle/phi/kernels/gather_nd_grad_kernel.h create mode 100644 paddle/phi/kernels/gather_nd_kernel.h create mode 100644 paddle/phi/kernels/gpu/gather_nd_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/gather_nd_kernel.cu create mode 100644 paddle/phi/kernels/gpu/scatter_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/scatter_kernel.cu create mode 100644 paddle/phi/kernels/gpu/scatter_nd_add_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/scatter_nd_add_kernel.cu create mode 100644 paddle/phi/kernels/scatter_grad_kernel.h create mode 100644 paddle/phi/kernels/scatter_kernel.h create mode 100644 paddle/phi/kernels/scatter_nd_add_grad_kernel.h create 
mode 100644 paddle/phi/kernels/scatter_nd_add_kernel.h create mode 100644 paddle/phi/ops/compat/gather_scatter_sig.cc diff --git a/paddle/fluid/operators/gather_nd_op.cc b/paddle/fluid/operators/gather_nd_op.cc index 8da900d84f9..fcd3384ac24 100644 --- a/paddle/fluid/operators/gather_nd_op.cc +++ b/paddle/fluid/operators/gather_nd_op.cc @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/gather_nd_op.h" -#include -#include -#include -#include "paddle/phi/core/ddim.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/binary.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -25,48 +25,10 @@ class GatherNdOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of GatherNdOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true, - platform::errors::InvalidArgument( - "Input(Index) of GatherNdOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of GatherNdOp should not be null.")); - - auto x_dims = ctx->GetInputDim("X"); - auto x_dims_size = x_dims.size(); - auto index_dims = ctx->GetInputDim("Index"); - auto index_dims_size = index_dims.size(); - - PADDLE_ENFORCE_LE( - index_dims[index_dims_size - 1], x_dims_size, - platform::errors::InvalidArgument( - "Input(Index).shape[-1] should be no greater than Input(X).rank")); - PADDLE_ENFORCE_GE(index_dims_size, 1UL, - platform::errors::InvalidArgument( - "The rank of Input(Index) should be greater than 1")); - - std::vector result_dims; - // The result dims is - // Index.shape[:-1] + X.shape[Index.shape[-1]:] - for (int i = 0; i < index_dims_size - 1; ++i) { - result_dims.emplace_back(index_dims[i]); - } - for (int i = index_dims[index_dims_size - 1]; i < x_dims_size; ++i) { - result_dims.emplace_back(x_dims[i]); - } - - ctx->SetOutputDim("Out", phi::make_ddim(result_dims)); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); + auto* x = ctx.Input("X"); const auto& x_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); return framework::OpKernelType( x_type, @@ -80,11 +42,6 @@ class GatherNdGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -173,23 +130,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(GatherNdGradNoNeedBufferVarInferer, "X"); namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(gather_nd, GatherNdInferShapeFunctor, + PT_INFER_META(phi::GatherNdInferMeta)); + +DELCARE_INFER_SHAPE_FUNCTOR(gather_nd_grad, GatherNdGradInferShapeFunctor, + 
PT_INFER_META(phi::GatherNdGradInferMeta)); + REGISTER_OPERATOR(gather_nd, ops::GatherNdOp, ops::GatherNdOpMaker, ops::GatherNdGradOpMaker, - ops::GatherNdGradOpMaker); + ops::GatherNdGradOpMaker, + GatherNdInferShapeFunctor); REGISTER_OPERATOR(gather_nd_grad, ops::GatherNdGradOp, - ops::GatherNdGradNoNeedBufferVarInferer); - -REGISTER_OP_CPU_KERNEL(gather_nd, ops::GatherNdOpKernel, - ops::GatherNdOpKernel, - ops::GatherNdOpKernel, - ops::GatherNdOpKernel, - ops::GatherNdOpKernel, - ops::GatherNdOpKernel, - ops::GatherNdOpKernel); - -REGISTER_OP_CPU_KERNEL(gather_nd_grad, ops::GatherNdGradOpKernel, - ops::GatherNdGradOpKernel, - ops::GatherNdGradOpKernel, - ops::GatherNdGradOpKernel, - ops::GatherNdGradOpKernel); + ops::GatherNdGradNoNeedBufferVarInferer, + GatherNdGradInferShapeFunctor); diff --git a/paddle/fluid/operators/gather_nd_op.cu b/paddle/fluid/operators/gather_nd_op.cu deleted file mode 100644 index 338c4411618..00000000000 --- a/paddle/fluid/operators/gather_nd_op.cu +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/gather_nd_op.h" -#include "paddle/phi/kernels/funcs/gather.cu.h" -#include "paddle/phi/kernels/funcs/scatter.cu.h" - -namespace paddle { -namespace operators { - -template -class GatherNdOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *output = ctx.Output("Out"); - - output->mutable_data(ctx.GetPlace()); - if (x->numel() == 0) return; - const auto &index_type = index->dtype(); - bool index_type_match = index_type == phi::DataType::INT32 || - index_type == phi::DataType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s], but " - "desires to be [%s] or [%s].", - index_type, phi::DataType::INT32, phi::DataType::INT64)); - auto &dev_ctx = ctx.cuda_device_context(); - if (index_type == phi::DataType::INT32) { - phi::funcs::GPUGatherNd(dev_ctx, *x, *index, output); - } else if (index_type == phi::DataType::INT64) { - phi::funcs::GPUGatherNd(dev_ctx, *x, *index, output); - } - } -}; - -template -class GatherNdGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *index = ctx.Input("Index"); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dO = ctx.Input(framework::GradVarName("Out")); - - dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto &place = *ctx.template 
device_context() - .eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); - if (dO->numel() == 0) return; - - const auto &index_type = index->dtype(); - bool index_type_match = index_type == phi::DataType::INT32 || - index_type == phi::DataType::INT64; - - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s].", - index_type, phi::DataType::INT32, phi::DataType::INT64)); - - auto &dev_ctx = ctx.cuda_device_context(); - if (index_type == phi::DataType::INT32) { - phi::funcs::GPUScatterNdAdd(dev_ctx, *dO, *index, dX); - } else if (index_type == phi::DataType::INT64) { - phi::funcs::GPUScatterNdAdd(dev_ctx, *dO, *index, dX); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(gather_nd, ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL(gather_nd_grad, ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel); diff --git a/paddle/fluid/operators/gather_nd_op.h b/paddle/fluid/operators/gather_nd_op.h deleted file mode 100644 index d54261008e4..00000000000 --- a/paddle/fluid/operators/gather_nd_op.h +++ /dev/null @@ -1,97 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/gather.h" -#include "paddle/phi/kernels/funcs/scatter.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class GatherNdOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *output = ctx.Output("Out"); - - output->mutable_data(ctx.GetPlace()); - if (x->numel() == 0) return; - - auto index_type = index->dtype(); - bool index_type_match = index_type == phi::DataType::INT32 || - index_type == phi::DataType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s]", - index_type, phi::DataType::INT32, phi::DataType::INT64)); - auto &dev_ctx = ctx.template device_context(); - if (index_type == phi::DataType::INT32) { - phi::funcs::CPUGatherNd(dev_ctx, *x, *index, output); - } else if (index_type == phi::DataType::INT64) { - phi::funcs::CPUGatherNd(dev_ctx, *x, *index, output); - } - } -}; - -template -class GatherNdGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto *index = ctx.Input("Index"); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dO = ctx.Input(framework::GradVarName("Out")); - dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto &place = *ctx.template device_context() - .eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); - if (dO->numel() == 0) return; - - auto index_type = index->dtype(); - bool index_type_match = index_type == phi::DataType::INT32 || - index_type == phi::DataType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s]", - index_type, phi::DataType::INT32, phi::DataType::INT64)); - - auto &dev_ctx = ctx.template device_context(); - if (index_type == phi::DataType::INT32) { - phi::funcs::ScatterNdAdd(dev_ctx, *dO, *index, dX); - } else if (index_type == phi::DataType::INT64) { - phi::funcs::ScatterNdAdd(dev_ctx, *dO, *index, dX); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/gather_nd_op_npu.cc b/paddle/fluid/operators/gather_nd_op_npu.cc index 995ab5d0ddf..c916f44b874 100644 --- a/paddle/fluid/operators/gather_nd_op_npu.cc +++ b/paddle/fluid/operators/gather_nd_op_npu.cc @@ -12,8 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
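For reference, the gather_nd kernels deleted above (and their phi replacements later in this patch) compute out[i] = x[index[i][0], ..., index[i][k-1]]; the last axis of index is read as a coordinate tuple into x. A scalar sketch of the 2-D case, a hypothetical illustration rather than code from the patch:

#include <cstdio>
#include <utility>
#include <vector>

// Reference gather_nd for a 2-D x and an index of shape [m, 2]:
// out[i] = x[index[i][0]][index[i][1]]
std::vector<float> GatherNd2D(const std::vector<std::vector<float>>& x,
                              const std::vector<std::pair<int, int>>& index) {
  std::vector<float> out;
  out.reserve(index.size());
  for (const auto& ij : index) out.push_back(x[ij.first][ij.second]);
  return out;
}

int main() {
  std::vector<std::vector<float>> x = {{1.f, 2.f}, {3.f, 4.f}};
  // index shape [2, 2] -> out shape [2] (index.shape[:-1] + x.shape[2:])
  std::vector<std::pair<int, int>> index = {{0, 1}, {1, 0}};
  for (float v : GatherNd2D(x, index)) std::printf("%g ", v);  // prints: 2 3
  std::printf("\n");
  return 0;
}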
*/ -#include "paddle/fluid/operators/gather_nd_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/gather_nd_op_xpu.cc b/paddle/fluid/operators/gather_nd_op_xpu.cc index 9f4c522bd14..d4cb799e825 100644 --- a/paddle/fluid/operators/gather_nd_op_xpu.cc +++ b/paddle/fluid/operators/gather_nd_op_xpu.cc @@ -11,7 +11,10 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/gather_nd_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { @@ -20,9 +23,9 @@ template class GatherNdXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *out = ctx.Output("Out"); + auto *x = ctx.Input("X"); + auto *index = ctx.Input("Index"); + auto *out = ctx.Output("Out"); out->template mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; diff --git a/paddle/fluid/operators/scatter_nd_add_op.cc b/paddle/fluid/operators/scatter_nd_add_op.cc index bb02bb541e1..b7be4cfb2a3 100644 --- a/paddle/fluid/operators/scatter_nd_add_op.cc +++ b/paddle/fluid/operators/scatter_nd_add_op.cc @@ -12,10 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/scatter_nd_add_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -24,73 +27,6 @@ class ScatterNdAddOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of ScatterNdAddOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Index"), true, - platform::errors::InvalidArgument( - "Input(Index) of ScatterNdAddOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Updates"), true, - platform::errors::InvalidArgument( - "Input(Updates) of ScatterNdAddOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of ScatterNdAddOp should not be null.")); - - auto ref_dims = ctx->GetInputDim("X"); - auto ref_dims_size = ref_dims.size(); - auto index_dims = ctx->GetInputDim("Index"); - auto index_dims_size = index_dims.size(); - auto updates_dims = ctx->GetInputDim("Updates"); - auto updates_dims_size = updates_dims.size(); - - PADDLE_ENFORCE_LE( - index_dims[index_dims_size - 1], ref_dims_size, - platform::errors::InvalidArgument( - "The last dimension of Input(Index)'s shape should be no greater " - "than the rank of Input(X), but received the last dimension of " - "Input(Index)'s shape is %d, the rank of Input(X) is %d.", - index_dims[index_dims_size - 1], ref_dims_size)); - PADDLE_ENFORCE_GE(index_dims_size, 2UL, - platform::errors::InvalidArgument( - "The rank of Input(Index) should be greater than 1, " - "but received the rank of Input(Index) is %d.", - index_dims_size)); - - // update.shape = index.shape[:-1] + output.shape[index.shape[-1]:] - std::vector r_updates_dims; - for (int64_t i = 0; i < index_dims_size - 1; ++i) { - r_updates_dims.emplace_back(index_dims[i]); - } - for (int64_t i = index_dims[index_dims_size - 1]; i < ref_dims_size; ++i) { - r_updates_dims.emplace_back(ref_dims[i]); - } - - PADDLE_ENFORCE_EQ( - r_updates_dims.size(), updates_dims_size, - platform::errors::InvalidArgument( - "Updates has wrong shape. The shape of Updates and Input(Updates) " - "should be same, but received the shape of Updates is %d, " - "the shape of Input(Updates) is %d.", - r_updates_dims.size(), updates_dims_size)); - - for (int64_t i = 0; i < updates_dims_size; ++i) { - PADDLE_ENFORCE_EQ( - r_updates_dims[i], updates_dims[i], - platform::errors::InvalidArgument( - "Updates has wrong shape. 
The dimensions of Updates and " - "Input(Updates) should match, but received Updates's" - "%d-th dimension is %d, Input(Updates)'s %d-th " - "dimension is %d.", - i, r_updates_dims[i], i, updates_dims[i])); - } - ctx->SetOutputDim("Out", ref_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -99,7 +35,8 @@ class ScatterNdAddOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "Ref and Updates must have same type")); return framework::OpKernelType( - framework::TransToProtoVarType(ctx.Input("X")->type()), + framework::TransToProtoVarType( + ctx.Input("X")->type()), ctx.device_context()); } }; @@ -108,17 +45,6 @@ class ScatterNdAddGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - if (ctx->HasOutput(framework::GradVarName("Updates"))) { - ctx->SetOutputDim(framework::GradVarName("Updates"), - ctx->GetInputDim("Updates")); - } - if (ctx->HasOutput(framework::GradVarName("X"))) { - ctx->SetOutputDim(framework::GradVarName("X"), - ctx->GetInputDim(framework::GradVarName("Out"))); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -193,22 +119,18 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(ScatterNdAddGradNoNeedBufferVarsInferer, namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(scatter_nd_add, ScatterNdAddInferShapeFunctor, + PT_INFER_META(phi::ScatterNdAddInferMeta)); + +DELCARE_INFER_SHAPE_FUNCTOR(scatter_nd_add_grad, + ScatterNdAddGradInferShapeFunctor, + PT_INFER_META(phi::ScatterNdAddGradInferMeta)); + REGISTER_OPERATOR(scatter_nd_add, ops::ScatterNdAddOp, ops::ScatterNdAddOpMaker, ops::ScatterNdAddGradMaker, - ops::ScatterNdAddGradMaker); + ops::ScatterNdAddGradMaker, + ScatterNdAddInferShapeFunctor); REGISTER_OPERATOR(scatter_nd_add_grad, ops::ScatterNdAddGradOp, - ops::ScatterNdAddGradNoNeedBufferVarsInferer); - -REGISTER_OP_CPU_KERNEL(scatter_nd_add, ops::ScatterNdAddOpKernel, - ops::ScatterNdAddOpKernel, - ops::ScatterNdAddOpKernel, - ops::ScatterNdAddOpKernel, - ops::ScatterNdAddOpKernel); - -REGISTER_OP_CPU_KERNEL(scatter_nd_add_grad, - ops::ScatterNdAddGradientOpKernel, - ops::ScatterNdAddGradientOpKernel, - ops::ScatterNdAddGradientOpKernel, - ops::ScatterNdAddGradientOpKernel, - ops::ScatterNdAddGradientOpKernel); + ops::ScatterNdAddGradNoNeedBufferVarsInferer, + ScatterNdAddGradInferShapeFunctor); diff --git a/paddle/fluid/operators/scatter_nd_add_op.cu b/paddle/fluid/operators/scatter_nd_add_op.cu deleted file mode 100644 index 2fe3fcb759d..00000000000 --- a/paddle/fluid/operators/scatter_nd_add_op.cu +++ /dev/null @@ -1,101 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
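The scatter_nd_add kernels being removed from fluid in this commit (and re-added under phi) first copy X into Out and then apply ScatterNdAdd, so out = x and out[index[i]] += updates[i] for every row of index, with repeated indices accumulating. A small worked instance, written as a plain loop for illustration only:

#include <cstdio>
#include <vector>

int main() {
  // x: shape [6], index: shape [4, 1], updates: shape [4]
  std::vector<float> out = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f};  // out starts as a copy of x
  std::vector<int> index = {1, 2, 2, 5};
  std::vector<float> updates = {10.f, 20.f, 30.f, 40.f};
  for (size_t i = 0; i < index.size(); ++i) {
    out[index[i]] += updates[i];  // the two rows hitting index 2 accumulate
  }
  for (float v : out) std::printf("%g ", v);  // prints: 0 11 52 3 4 45
  std::printf("\n");
  return 0;
}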
*/ - -#include "paddle/fluid/operators/gather_op.h" -#include "paddle/fluid/operators/scatter_nd_add_op.h" -#include "paddle/phi/kernels/funcs/gather.cu.h" -#include "paddle/phi/kernels/funcs/scatter.cu.h" - -namespace paddle { -namespace operators { - -template -class ScatterNdAddOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *X = ctx.Input("X"); - auto *Ids = ctx.Input("Index"); - auto *Updates = ctx.Input("Updates"); - auto *Out = ctx.Output("Out"); - - framework::TensorCopySync(*X, ctx.GetPlace(), Out); - const auto &index_type = Ids->dtype(); - bool index_type_match = index_type == phi::DataType::INT32 || - index_type == phi::DataType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s], but " - "desires to be [%s] or [%s].", - index_type, phi::DataType::INT32, phi::DataType::INT64)); - auto &dev_ctx = ctx.cuda_device_context(); - if (index_type == phi::DataType::INT32) { - phi::funcs::GPUScatterNdAdd(dev_ctx, *Updates, *Ids, Out); - } else { - phi::funcs::GPUScatterNdAdd(dev_ctx, *Updates, *Ids, Out); - } - } -}; - -template -class ScatterNdAddGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dUpdates = ctx.Output(framework::GradVarName("Updates")); - auto *Ids = ctx.Input("Index"); - auto *dOut = ctx.Input(framework::GradVarName("Out")); - if (dX) { - framework::TensorCopy(*dOut, ctx.GetPlace(), dX); - } - if (dUpdates) { - dUpdates->mutable_data(ctx.GetPlace()); - auto &dev_ctx = ctx.cuda_device_context(); - // Gradient by Gather - const auto &index_type = Ids->dtype(); - if (index_type == phi::DataType::INT32) { - phi::funcs::GPUGatherNd(dev_ctx, *dOut, *Ids, dUpdates); - } else { - phi::funcs::GPUGatherNd(dev_ctx, *dOut, *Ids, dUpdates); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(scatter_nd_add, - ops::ScatterNdAddOpCUDAKernel, - ops::ScatterNdAddOpCUDAKernel, - ops::ScatterNdAddOpCUDAKernel, - ops::ScatterNdAddOpCUDAKernel, - ops::ScatterNdAddOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL(scatter_nd_add_grad, - ops::ScatterNdAddGradOpCUDAKernel, - ops::ScatterNdAddGradOpCUDAKernel, - ops::ScatterNdAddGradOpCUDAKernel, - ops::ScatterNdAddGradOpCUDAKernel, - ops::ScatterNdAddGradOpCUDAKernel); diff --git a/paddle/fluid/operators/scatter_nd_add_op.h b/paddle/fluid/operators/scatter_nd_add_op.h deleted file mode 100644 index 81c95fe55ab..00000000000 --- a/paddle/fluid/operators/scatter_nd_add_op.h +++ /dev/null @@ -1,89 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/gather.h" -#include "paddle/phi/kernels/funcs/scatter.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class ScatterNdAddOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - auto *X = ctx.Input("X"); - auto *Ids = ctx.Input("Index"); - auto *Updates = ctx.Input("Updates"); - auto *Out = ctx.Output("Out"); - - // In place output: Out = X - framework::TensorCopySync(*X, ctx.GetPlace(), Out); - const auto &index_type = Ids->dtype(); - bool index_type_match = index_type == phi::DataType::INT32 || - index_type == phi::DataType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s], but " - "desires to be [%s] or [%s].", - index_type, phi::DataType::INT32, phi::DataType::INT64)); - - auto &dev_ctx = ctx.template device_context(); - if (index_type == phi::DataType::INT32) { - phi::funcs::ScatterNdAdd(dev_ctx, *Updates, *Ids, Out); - } else { - phi::funcs::ScatterNdAdd(dev_ctx, *Updates, *Ids, Out); - } - } -}; - -template -class ScatterNdAddGradientOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dUpdates = ctx.Output(framework::GradVarName("Updates")); - auto *Ids = ctx.Input("Index"); - auto *dOut = ctx.Input(framework::GradVarName("Out")); - - if (dX) { - framework::TensorCopy(*dOut, ctx.GetPlace(), dX); - } - if (dUpdates) { - dUpdates->mutable_data(ctx.GetPlace()); - // Gradient by Gather: dUpdates = dO[Ids] - const auto &index_type = Ids->dtype(); - auto &dev_ctx = ctx.template device_context(); - if (index_type == phi::DataType::INT32) { - phi::funcs::CPUGatherNd(dev_ctx, *dOut, *Ids, dUpdates); - } else { - phi::funcs::CPUGatherNd(dev_ctx, *dOut, *Ids, dUpdates); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc index 3174f07e96e..fec003305fd 100644 --- a/paddle/fluid/operators/scatter_op.cc +++ b/paddle/fluid/operators/scatter_op.cc @@ -12,9 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/scatter_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -23,46 +26,6 @@ class ScatterOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of ScatterOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Ids"), true, - platform::errors::InvalidArgument( - "Input(Ids) of ScatterOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Updates"), true, - platform::errors::InvalidArgument( - "Input(Updates) of ScatterOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of ScatterOp should not be null.")); - - auto updates_dims = ctx->GetInputDim("Updates"); - auto ref_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - ctx->GetInputDim("Ids").size(), 1, - platform::errors::InvalidArgument( - "The size of Input(Ids)'s shape should be equal to 1, but " - "received the rank of Input(Ids) is %d.", - ctx->GetInputDim("Ids").size())); - PADDLE_ENFORCE_EQ( - ref_dims.size(), updates_dims.size(), - platform::errors::InvalidArgument( - "Input(X) and Input(Updates) should have the same shape size, " - "but received the size of Input(x)'s shape is %d, the size of " - "Input(Updates)'s shape is %d.", - ref_dims.size(), updates_dims.size())); - PADDLE_ENFORCE_EQ( - ctx->GetInputDim("Updates")[0], ctx->GetInputDim("Ids")[0], - platform::errors::InvalidArgument( - "Input(Updates) and Input(Ids) should have same batch-size, but" - " received Input(Updates)'s batch-size is %d, Input(Ids)'s " - "batch-size is %d.", - ctx->GetInputDim("Updates")[0], ctx->GetInputDim("Ids")[0])); - ctx->SetOutputDim("Out", ref_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -76,17 +39,6 @@ class ScatterGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - if (ctx->HasOutput(framework::GradVarName("Updates"))) { - ctx->SetOutputDim(framework::GradVarName("Updates"), - ctx->GetInputDim("Updates")); - } - if (ctx->HasOutput(framework::GradVarName("X"))) { - ctx->SetOutputDim(framework::GradVarName("X"), - ctx->GetInputDim(framework::GradVarName("Out"))); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -151,17 +103,17 @@ DECLARE_INPLACE_OP_INFERER(ScatterInplaceInferer, {"X", "Out"}); } // namespace operators } // namespace paddle +DELCARE_INFER_SHAPE_FUNCTOR(scatter, ScatterInferShapeFunctor, + PT_INFER_META(phi::ScatterInferMeta)); + +DELCARE_INFER_SHAPE_FUNCTOR(scatter_grad, ScatterGradInferShapeFunctor, + PT_INFER_META(phi::ScatterGradInferMeta)); + namespace ops = paddle::operators; REGISTER_OPERATOR(scatter, ops::ScatterOp, ops::ScatterOpMaker, ops::ScatterGradMaker, ops::ScatterGradMaker, - ops::ScatterInplaceInferer); + ops::ScatterInplaceInferer, ScatterInferShapeFunctor); REGISTER_OPERATOR(scatter_grad, ops::ScatterGradOp, - 
ops::ScatterGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel, - ops::ScatterOpKernel, ops::ScatterOpKernel, - ops::ScatterOpKernel); -REGISTER_OP_CPU_KERNEL(scatter_grad, ops::ScatterGradientOpKernel, - ops::ScatterGradientOpKernel, - ops::ScatterGradientOpKernel, - ops::ScatterGradientOpKernel); + ops::ScatterGradNoNeedBufferVarsInferer, + ScatterGradInferShapeFunctor); diff --git a/paddle/fluid/operators/scatter_op.cu b/paddle/fluid/operators/scatter_op.cu deleted file mode 100644 index 7755e376bc1..00000000000 --- a/paddle/fluid/operators/scatter_op.cu +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/gather_op.h" -#include "paddle/fluid/operators/scatter_op.h" -#include "paddle/phi/kernels/funcs/gather.cu.h" -#include "paddle/phi/kernels/funcs/scatter.cu.h" - -namespace paddle { -namespace operators { - -template -class ScatterOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *X = ctx.Input("X"); - auto *Ids = ctx.Input("Ids"); - auto *Updates = ctx.Input("Updates"); - auto *Out = ctx.Output("Out"); - bool overwrite = ctx.Attr("overwrite"); - - framework::TensorCopy(*X, ctx.GetPlace(), Out); - // use template class to support int32_t and int64_t - auto index_type = Ids->dtype(); - bool index_type_match = index_type == phi::DataType::INT32 || - index_type == phi::DataType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "scatter_op Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s].", - index_type, phi::DataType::INT32, phi::DataType::INT64)); - auto &dev_ctx = ctx.cuda_device_context(); - if (index_type == phi::DataType::INT32) { - phi::funcs::GPUScatterAssign(dev_ctx, *Updates, *Ids, Out, - overwrite); - } else { - phi::funcs::GPUScatterAssign(dev_ctx, *Updates, *Ids, Out, - overwrite); - } - } -}; - -template -class ScatterGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dUpdates = ctx.Output(framework::GradVarName("Updates")); - auto *Ids = ctx.Input("Ids"); - auto *dOut = ctx.Input(framework::GradVarName("Out")); - - auto index_type = Ids->dtype(); - bool index_type_match = index_type == phi::DataType::INT32 || - index_type == phi::DataType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "scatter_op index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s]", - index_type, 
phi::DataType::INT32, phi::DataType::INT64)); - - auto &dev_ctx = ctx.cuda_device_context(); - if (dX) { - framework::TensorCopy(*dOut, ctx.GetPlace(), dX); - if (index_type == phi::DataType::INT32) { - phi::funcs::GPUScatterGradForX(dev_ctx, *Ids, dX); - } else { - phi::funcs::GPUScatterGradForX(dev_ctx, *Ids, dX); - } - } - - if (dUpdates) { - dUpdates->mutable_data(ctx.GetPlace()); - // Gradient by Gather: dUpdates = dO[Ids] - if (index_type == phi::DataType::INT32) { - phi::funcs::GPUGather(dev_ctx, *dOut, *Ids, dUpdates); - } else { - phi::funcs::GPUGather(dev_ctx, *dOut, *Ids, dUpdates); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(scatter, ops::ScatterOpCUDAKernel, - ops::ScatterOpCUDAKernel, - ops::ScatterOpCUDAKernel, - ops::ScatterOpCUDAKernel, - ops::ScatterOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL( - scatter_grad, ops::ScatterGradOpCUDAKernel, - ops::ScatterGradOpCUDAKernel, ops::ScatterOpCUDAKernel, - ops::ScatterOpCUDAKernel, - ops::ScatterGradOpCUDAKernel); diff --git a/paddle/fluid/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h deleted file mode 100644 index 7733181a93f..00000000000 --- a/paddle/fluid/operators/scatter_op.h +++ /dev/null @@ -1,113 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/gather.h" -#include "paddle/phi/kernels/funcs/scatter.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class ScatterOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - auto *X = ctx.Input("X"); - auto *Ids = ctx.Input("Ids"); - auto *Updates = ctx.Input("Updates"); - auto *Out = ctx.Output("Out"); - double overwrite = ctx.Attr("overwrite"); - - // In place output: Out = X, Out[Ids] = Updates - framework::TensorCopy(*X, ctx.GetPlace(), Out); - // Apply ScatterUpdate: Out[index] = Updates[:] - const auto &index_type = Ids->dtype(); - bool index_type_match = index_type == phi::DataType::INT32 || - index_type == phi::DataType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s].", - index_type, phi::DataType::INT32, phi::DataType::INT64)); - auto &dev_ctx = ctx.template device_context(); - if (overwrite) { - if (index_type == phi::DataType::INT32) { - phi::funcs::ScatterAssign(dev_ctx, *Updates, *Ids, Out); - } else { - phi::funcs::ScatterAssign(dev_ctx, *Updates, *Ids, Out); - } - } else { - if (index_type == phi::DataType::INT32) { - phi::funcs::ScatterAssignAdd(dev_ctx, *Updates, *Ids, Out); - } else { - phi::funcs::ScatterAssignAdd(dev_ctx, *Updates, *Ids, Out); - } - } - } -}; - -template -class ScatterGradientOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dUpdates = ctx.Output(framework::GradVarName("Updates")); - auto *Ids = ctx.Input("Ids"); - auto *dOut = ctx.Input(framework::GradVarName("Out")); - - const auto &index_type = Ids->dtype(); - bool index_type_match = index_type == phi::DataType::INT32 || - index_type == phi::DataType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "scatter_op index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s]", - index_type, phi::DataType::INT32, phi::DataType::INT64)); - - auto &dev_ctx = ctx.template device_context(); - if (dX) { - framework::TensorCopy(*dOut, ctx.GetPlace(), dX); - if (index_type == phi::DataType::INT32) { - phi::funcs::CPUScatterGradForX(dev_ctx, *Ids, dX); - } else { - phi::funcs::CPUScatterGradForX(dev_ctx, *Ids, dX); - } - } - - if (dUpdates) { - dUpdates->mutable_data(ctx.GetPlace()); - // Gradient by Gather: dUpdates = dO[Ids] - if (index_type == phi::DataType::INT32) { - phi::funcs::CPUGather(dev_ctx, *dOut, *Ids, dUpdates); - } else { - phi::funcs::CPUGather(dev_ctx, *dOut, *Ids, dUpdates); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/scatter_op_npu.cc b/paddle/fluid/operators/scatter_op_npu.cc index fa5f03a0928..815984ac307 100644 --- a/paddle/fluid/operators/scatter_op_npu.cc +++ b/paddle/fluid/operators/scatter_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. 
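The scatter gradient kernels deleted above compute d_updates by gathering d_out at Ids, and d_x as a copy of d_out whose rows at Ids are then cleared by ScatterGradForX; that reading of ScatterGradForX (matching an overwrite-style forward pass, where x's values at those rows never reach out) is an assumption based on the calls shown, not something spelled out in the patch. A plain-loop sketch of that relationship:

#include <cstdio>
#include <vector>

int main() {
  // Forward (overwrite): out = x; out[ids[i]] = updates[i]
  // Backward:            d_updates[i] = d_out[ids[i]]        (gather)
  //                      d_x = d_out with rows ids zeroed    (assumed ScatterGradForX behaviour)
  std::vector<float> d_out = {1.f, 2.f, 3.f, 4.f};
  std::vector<int> ids = {1, 3};

  std::vector<float> d_updates(ids.size());
  std::vector<float> d_x = d_out;
  for (size_t i = 0; i < ids.size(); ++i) {
    d_updates[i] = d_out[ids[i]];
    d_x[ids[i]] = 0.f;
  }
  for (float v : d_updates) std::printf("%g ", v);  // 2 4
  std::printf("| ");
  for (float v : d_x) std::printf("%g ", v);        // 1 0 3 0
  std::printf("\n");
  return 0;
}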
*/ #include #include "paddle/fluid/operators/kron_op.h" -#include "paddle/fluid/operators/scatter_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/scatter_op_xpu.cc b/paddle/fluid/operators/scatter_op_xpu.cc index 9f0b74e8a3f..07dd2f2d85f 100644 --- a/paddle/fluid/operators/scatter_op_xpu.cc +++ b/paddle/fluid/operators/scatter_op_xpu.cc @@ -16,7 +16,10 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/scatter_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 7d403fee943..4ddef5b0002 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -105,4 +105,49 @@ void GumbelSoftmaxGradInferMeta(const MetaTensor& out, dx->share_meta(dout); } +void GatherNdGradInferMeta(const MetaTensor& x, + const MetaTensor& index, + const MetaTensor& out_grad, + MetaTensor* x_grad) { + const auto& dtype = out_grad.dtype(); + x_grad->set_dims(x.dims()); + x_grad->share_lod(x); + x_grad->set_dtype(dtype); +} + +void ScatterGradInferMeta(const MetaTensor& index, + const MetaTensor& updates, + const MetaTensor& out_grad, + bool overwrite, + MetaTensor* x_grad, + MetaTensor* updates_grad) { + const auto& dtype = out_grad.dtype(); + if (updates_grad) { + updates_grad->set_dims(updates.dims()); + updates_grad->set_dtype(dtype); + } + + if (x_grad) { + x_grad->set_dims(out_grad.dims()); + x_grad->set_dtype(dtype); + } +} + +void ScatterNdAddGradInferMeta(const MetaTensor& index, + const MetaTensor& updates, + const MetaTensor& out_grad, + MetaTensor* x_grad, + MetaTensor* updates_grad) { + const auto& dtype = out_grad.dtype(); + if (updates_grad) { + updates_grad->set_dims(updates.dims()); + updates_grad->set_dtype(dtype); + } + + if (x_grad) { + x_grad->set_dims(out_grad.dims()); + x_grad->set_dtype(dtype); + } +} + } // namespace phi diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index f2c0cf8a689..f7b0eed5dd9 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -46,4 +46,18 @@ void GumbelSoftmaxGradInferMeta(const MetaTensor& out, const MetaTensor& dout, int axis, MetaTensor* dx); + +void ScatterGradInferMeta(const MetaTensor& index, + const MetaTensor& updates, + const MetaTensor& out_grad, + bool overwrite, + MetaTensor* x_grad, + MetaTensor* updates_grad); + +void ScatterNdAddGradInferMeta(const MetaTensor& index, + const MetaTensor& updates, + const MetaTensor& out_grad, + MetaTensor* x_grad, + MetaTensor* updates_grad); + } // namespace phi diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 1f6f0b211b6..745ddffabbe 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -397,6 +397,39 @@ void BCELossInferMeta(const MetaTensor& input, out->share_lod(input); } +void GatherNdInferMeta(const MetaTensor& x, + const MetaTensor& index, + MetaTensor* out) { + auto x_dims = x.dims(); + auto x_dims_size = x_dims.size(); + auto index_dims = index.dims(); + auto index_dims_size = index_dims.size(); + + PADDLE_ENFORCE_LE( + index_dims[index_dims_size - 1], + x_dims_size, + phi::errors::InvalidArgument( + "Input(Index).shape[-1] should be no greater than Input(X).rank")); + 
PADDLE_ENFORCE_GE(index_dims_size, + 1UL, + phi::errors::InvalidArgument( + "The rank of Input(Index) should be greater than 1")); + + std::vector result_dims; + // The result dims is + // Index.shape[:-1] + X.shape[Index.shape[-1]:] + for (int i = 0; i < index_dims_size - 1; ++i) { + result_dims.emplace_back(index_dims[i]); + } + for (int i = index_dims[index_dims_size - 1]; i < x_dims_size; ++i) { + result_dims.emplace_back(x_dims[i]); + } + + out->set_dims(phi::make_ddim(result_dims)); + out->share_lod(x); + out->set_dtype(x.dtype()); +} + void GatherTreeMeta(const MetaTensor& ids, const MetaTensor& parents, MetaTensor* out) { diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 47745f8ce13..2ec74463698 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -78,6 +78,10 @@ void BCELossInferMeta(const MetaTensor& input, MetaTensor* out, MetaConfig config = MetaConfig()); +void GatherNdInferMeta(const MetaTensor& x, + const MetaTensor& index, + MetaTensor* out); + void GatherTreeMeta(const MetaTensor& ids, const MetaTensor& parents, MetaTensor* out); diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index 1c1497fb0e4..c3472a24801 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -89,6 +89,109 @@ void AddmmInferMeta(const MetaTensor& input, out->set_dtype(input.dtype()); } +void ScatterInferMeta(const MetaTensor& x, + const MetaTensor& index, + const MetaTensor& updates, + bool overwrite, + MetaTensor* out) { + const auto& updates_dims = updates.dims(); + const auto& ref_dims = x.dims(); + const auto& index_dims = index.dims(); + PADDLE_ENFORCE_EQ( + index_dims.size(), + 1, + phi::errors::InvalidArgument( + "The size of Input(Ids)'s shape should be equal to 1, but " + "received the rank of Input(Ids) is %d.", + index_dims.size())); + PADDLE_ENFORCE_EQ( + ref_dims.size(), + updates_dims.size(), + phi::errors::InvalidArgument( + "Input(X) and Input(Updates) should have the same shape size, " + "but received the size of Input(x)'s shape is %d, the size of " + "Input(Updates)'s shape is %d.", + ref_dims.size(), + updates_dims.size())); + PADDLE_ENFORCE_EQ( + updates_dims[0], + index_dims[0], + phi::errors::InvalidArgument( + "Input(Updates) and Input(Ids) should have same batch-size, but" + " received Input(Updates)'s batch-size is %d, Input(Ids)'s " + "batch-size is %d.", + updates_dims[0], + index_dims[0])); + out->set_dims(ref_dims); + out->share_lod(x); + out->set_dtype(x.dtype()); +} + +void ScatterNdAddInferMeta(const MetaTensor& x, + const MetaTensor& index, + const MetaTensor& updates, + MetaTensor* out) { + const auto& ref_dims = x.dims(); + auto ref_dims_size = ref_dims.size(); + const auto& index_dims = index.dims(); + auto index_dims_size = index_dims.size(); + const auto& updates_dims = updates.dims(); + auto updates_dims_size = updates_dims.size(); + + PADDLE_ENFORCE_LE( + index_dims[index_dims_size - 1], + ref_dims_size, + phi::errors::InvalidArgument( + "The last dimension of Input(Index)'s shape should be no greater " + "than the rank of Input(X), but received the last dimension of " + "Input(Index)'s shape is %d, the rank of Input(X) is %d.", + index_dims[index_dims_size - 1], + ref_dims_size)); + PADDLE_ENFORCE_GE(index_dims_size, + 2UL, + phi::errors::InvalidArgument( + "The rank of Input(Index) should be greater than 1, " + "but received the rank of Input(Index) is %d.", + index_dims_size)); + + // update.shape = index.shape[:-1] + 
output.shape[index.shape[-1]:] + std::vector r_updates_dims; + for (int64_t i = 0; i < index_dims_size - 1; ++i) { + r_updates_dims.emplace_back(index_dims[i]); + } + for (int64_t i = index_dims[index_dims_size - 1]; i < ref_dims_size; ++i) { + r_updates_dims.emplace_back(ref_dims[i]); + } + + PADDLE_ENFORCE_EQ( + r_updates_dims.size(), + updates_dims_size, + phi::errors::InvalidArgument( + "Updates has wrong shape. The shape of Updates and Input(Updates) " + "should be same, but received the shape of Updates is %d, " + "the shape of Input(Updates) is %d.", + r_updates_dims.size(), + updates_dims_size)); + + for (int64_t i = 0; i < updates_dims_size; ++i) { + PADDLE_ENFORCE_EQ( + r_updates_dims[i], + updates_dims[i], + phi::errors::InvalidArgument( + "Updates has wrong shape. The dimensions of Updates and " + "Input(Updates) should match, but received Updates's" + "%d-th dimension is %d, Input(Updates)'s %d-th " + "dimension is %d.", + i, + r_updates_dims[i], + i, + updates_dims[i])); + } + out->set_dims(ref_dims); + out->share_lod(x); + out->set_dtype(x.dtype()); +} + void LerpInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index 5679c5b533f..cff57e1ba70 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -37,6 +37,22 @@ void AddmmInferMeta(const MetaTensor& input, float beta, MetaTensor* out); +void GatherNdGradInferMeta(const MetaTensor& x, + const MetaTensor& index, + const MetaTensor& out_grad, + MetaTensor* x_grad); + +void ScatterInferMeta(const MetaTensor& x, + const MetaTensor& index, + const MetaTensor& updates, + bool overwrite, + MetaTensor* out); + +void ScatterNdAddInferMeta(const MetaTensor& x, + const MetaTensor& index, + const MetaTensor& updates, + MetaTensor* out); + void LerpInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, diff --git a/paddle/phi/kernels/cpu/gather_nd_grad_kernel.cc b/paddle/phi/kernels/cpu/gather_nd_grad_kernel.cc new file mode 100644 index 00000000000..b375a7ec469 --- /dev/null +++ b/paddle/phi/kernels/cpu/gather_nd_grad_kernel.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/gather_nd_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/scatter.h" + +namespace phi { + +template +void GatherNdGradKernel(const Context &ctx, + const DenseTensor &x, + const DenseTensor &index, + const DenseTensor &out_grad, + DenseTensor *x_grad) { + ctx.template Alloc(x_grad); + auto dxt = phi::EigenVector::Flatten(*x_grad); + auto &place = *ctx.eigen_device(); + dxt.device(place) = dxt.constant(static_cast(0)); + if (out_grad.numel() == 0) return; + + auto index_type = index.dtype(); + bool index_type_match = + index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, + true, + phi::errors::InvalidArgument("Index holds the wrong type, it holds [%s]," + "but desires to be [%s] or [%s]", + index_type, + phi::DataType::INT32, + phi::DataType::INT64)); + + if (index_type == phi::DataType::INT32) { + phi::funcs::ScatterNdAdd(ctx, out_grad, index, x_grad); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::ScatterNdAdd(ctx, out_grad, index, x_grad); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gather_nd_grad, + CPU, + ALL_LAYOUT, + phi::GatherNdGradKernel, + float, + double, + int64_t, + int, + uint8_t) {} diff --git a/paddle/phi/kernels/cpu/gather_nd_kernel.cc b/paddle/phi/kernels/cpu/gather_nd_kernel.cc new file mode 100644 index 00000000000..aa32d036934 --- /dev/null +++ b/paddle/phi/kernels/cpu/gather_nd_kernel.cc @@ -0,0 +1,60 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/gather_nd_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/gather.h" + +namespace phi { + +template +void GatherNdKernel(const Context &ctx, + const DenseTensor &x, + const DenseTensor &index, + DenseTensor *out) { + ctx.template Alloc(out); + if (x.numel() == 0) return; + + auto index_type = index.dtype(); + bool index_type_match = + index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, + true, + phi::errors::InvalidArgument("Index holds the wrong type, it holds [%s]," + "but desires to be [%s] or [%s]", + index_type, + phi::DataType::INT32, + phi::DataType::INT64)); + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUGatherNd(ctx, x, index, out); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::CPUGatherNd(ctx, x, index, out); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gather_nd, + CPU, + ALL_LAYOUT, + phi::GatherNdKernel, + float, + double, + int64_t, + int, + int16_t, + bool, + uint8_t) {} diff --git a/paddle/phi/kernels/cpu/scatter_grad_kernel.cc b/paddle/phi/kernels/cpu/scatter_grad_kernel.cc new file mode 100644 index 00000000000..62fd58704c4 --- /dev/null +++ b/paddle/phi/kernels/cpu/scatter_grad_kernel.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/scatter_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/gather.h" +#include "paddle/phi/kernels/funcs/scatter.h" + +namespace phi { + +template +void ScatterGradKernel(const Context &ctx, + const DenseTensor &index, + const DenseTensor &updates, + const DenseTensor &out_grad, + bool overwrite, + DenseTensor *x_grad, + DenseTensor *updates_grad) { + const auto &index_type = index.dtype(); + bool index_type_match = + index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ(index_type_match, + true, + phi::errors::InvalidArgument( + "scatter_op index holds the wrong type, it holds [%s]," + "but desires to be [%s] or [%s]", + index_type, + phi::DataType::INT32, + phi::DataType::INT64)); + + if (x_grad) { + phi::Copy(ctx, out_grad, ctx.GetPlace(), false, x_grad); + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUScatterGradForX(ctx, index, x_grad); + } else { + phi::funcs::CPUScatterGradForX(ctx, index, x_grad); + } + } + + if (updates_grad) { + ctx.template Alloc(updates_grad); + // Gradient by Gather: dUpdates = dO[Ids] + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUGather(ctx, out_grad, index, updates_grad); + } else { + phi::funcs::CPUGather(ctx, out_grad, index, updates_grad); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(scatter_grad, + CPU, + ALL_LAYOUT, + phi::ScatterGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/scatter_kernel.cc b/paddle/phi/kernels/cpu/scatter_kernel.cc new file mode 100644 index 00000000000..d48ceaf29a0 --- /dev/null +++ b/paddle/phi/kernels/cpu/scatter_kernel.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/scatter_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/scatter.h" + +namespace phi { + +template +void ScatterKernel(const Context &ctx, + const DenseTensor &x, + const DenseTensor &index, + const DenseTensor &updates, + bool overwrite, + DenseTensor *out) { + // In place output: Out = X, Out[Ids] = Updates + phi::Copy(ctx, x, ctx.GetPlace(), false, out); + // Apply ScatterUpdate: Out[index] = Updates[:] + const auto &index_type = index.dtype(); + bool index_type_match = + index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, + true, + phi::errors::InvalidArgument("Index holds the wrong type, it holds [%s]," + "but desires to be [%s] or [%s].", + index_type, + phi::DataType::INT32, + phi::DataType::INT64)); + if (overwrite) { + if (index_type == phi::DataType::INT32) { + phi::funcs::ScatterAssign(ctx, updates, index, out); + } else { + phi::funcs::ScatterAssign(ctx, updates, index, out); + } + } else { + if (index_type == phi::DataType::INT32) { + phi::funcs::ScatterAssignAdd(ctx, updates, index, out); + } else { + phi::funcs::ScatterAssignAdd(ctx, updates, index, out); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + scatter, CPU, ALL_LAYOUT, phi::ScatterKernel, float, double, int, int64_t) { +} diff --git a/paddle/phi/kernels/cpu/scatter_nd_add_grad_kernel.cc b/paddle/phi/kernels/cpu/scatter_nd_add_grad_kernel.cc new file mode 100644 index 00000000000..cc143ba8d0e --- /dev/null +++ b/paddle/phi/kernels/cpu/scatter_nd_add_grad_kernel.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/scatter_nd_add_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/gather.h" + +namespace phi { + +template +void ScatterNdAddGradKernel(const Context &ctx, + const DenseTensor &index, + const DenseTensor &updates, + const DenseTensor &out_grad, + DenseTensor *x_grad, + DenseTensor *updates_grad) { + if (x_grad) { + Copy(ctx, out_grad, ctx.GetPlace(), false, x_grad); + } + if (updates_grad) { + ctx.template Alloc(updates_grad); + // Gradient by Gather: dUpdates = dO[Ids] + const auto &index_type = index.dtype(); + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUGatherNd(ctx, out_grad, index, updates_grad); + } else { + phi::funcs::CPUGatherNd(ctx, out_grad, index, updates_grad); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(scatter_nd_add_grad, + CPU, + ALL_LAYOUT, + phi::ScatterNdAddGradKernel, + float, + double, + int64_t, + int, + uint8_t) {} diff --git a/paddle/phi/kernels/cpu/scatter_nd_add_kernel.cc b/paddle/phi/kernels/cpu/scatter_nd_add_kernel.cc new file mode 100644 index 00000000000..04ae10f5e8b --- /dev/null +++ b/paddle/phi/kernels/cpu/scatter_nd_add_kernel.cc @@ -0,0 +1,60 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/scatter_nd_add_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/scatter.h" + +namespace phi { + +template +void ScatterNdAddKernel(const Context &ctx, + const DenseTensor &x, + const DenseTensor &index, + const DenseTensor &updates, + DenseTensor *out) { + // In place output: Out = X + Copy(ctx, x, ctx.GetPlace(), true, out); + const auto &index_type = index.dtype(); + bool index_type_match = + index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ(index_type_match, + true, + phi::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s], but " + "desires to be [%s] or [%s].", + index_type, + phi::DataType::INT32, + phi::DataType::INT64)); + + if (index_type == phi::DataType::INT32) { + phi::funcs::ScatterNdAdd(ctx, updates, index, out); + } else { + phi::funcs::ScatterNdAdd(ctx, updates, index, out); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(scatter_nd_add, + CPU, + ALL_LAYOUT, + phi::ScatterNdAddKernel, + float, + double, + int64_t, + int, + uint8_t) {} diff --git a/paddle/phi/kernels/gather_nd_grad_kernel.h b/paddle/phi/kernels/gather_nd_grad_kernel.h new file mode 100644 index 00000000000..05003471495 --- /dev/null +++ b/paddle/phi/kernels/gather_nd_grad_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void GatherNdGradKernel(const Context &ctx,
+                        const DenseTensor &x,
+                        const DenseTensor &index,
+                        const DenseTensor &out_grad,
+                        DenseTensor *x_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/gather_nd_kernel.h b/paddle/phi/kernels/gather_nd_kernel.h
new file mode 100644
index 00000000000..d2393eb3b07
--- /dev/null
+++ b/paddle/phi/kernels/gather_nd_kernel.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void GatherNdKernel(const Context &ctx,
+                    const DenseTensor &x,
+                    const DenseTensor &index,
+                    DenseTensor *out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/gpu/gather_nd_grad_kernel.cu b/paddle/phi/kernels/gpu/gather_nd_grad_kernel.cu
new file mode 100644
index 00000000000..5273902804a
--- /dev/null
+++ b/paddle/phi/kernels/gpu/gather_nd_grad_kernel.cu
@@ -0,0 +1,65 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" +#include "paddle/phi/kernels/gather_nd_grad_kernel.h" + +namespace phi { + +template +void GatherNdGradKernel(const Context &ctx, + const DenseTensor &x, + const DenseTensor &index, + const DenseTensor &out_grad, + DenseTensor *x_grad) { + ctx.template Alloc(x_grad); + auto dxt = phi::EigenVector::Flatten(*x_grad); + auto &place = *ctx.eigen_device(); + dxt.device(place) = dxt.constant(static_cast(0)); + if (out_grad.numel() == 0) return; + + const auto &index_type = index.dtype(); + bool index_type_match = + index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64; + + PADDLE_ENFORCE_EQ( + index_type_match, + true, + phi::errors::InvalidArgument("Index holds the wrong type, it holds [%s]," + "but desires to be [%s] or [%s].", + index_type, + phi::DataType::INT32, + phi::DataType::INT64)); + + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUScatterNdAdd(ctx, out_grad, index, x_grad); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GPUScatterNdAdd(ctx, out_grad, index, x_grad); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gather_nd_grad, + GPU, + ALL_LAYOUT, + phi::GatherNdGradKernel, + float, + double, + int64_t, + int, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/gather_nd_kernel.cu b/paddle/phi/kernels/gpu/gather_nd_kernel.cu new file mode 100644 index 00000000000..33745ef5f07 --- /dev/null +++ b/paddle/phi/kernels/gpu/gather_nd_kernel.cu @@ -0,0 +1,60 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" +#include "paddle/phi/kernels/gather_nd_kernel.h" + +namespace phi { + +template +void GatherNdKernel(const Context &ctx, + const DenseTensor &x, + const DenseTensor &index, + DenseTensor *out) { + ctx.template Alloc(out); + if (x.numel() == 0) return; + const auto &index_type = index.dtype(); + bool index_type_match = + index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ(index_type_match, + true, + phi::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s], but " + "desires to be [%s] or [%s].", + index_type, + phi::DataType::INT32, + phi::DataType::INT64)); + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUGatherNd(ctx, x, index, out); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GPUGatherNd(ctx, x, index, out); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gather_nd, + GPU, + ALL_LAYOUT, + phi::GatherNdKernel, + float, + double, + int64_t, + int, + int16_t, + bool, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/scatter_grad_kernel.cu b/paddle/phi/kernels/gpu/scatter_grad_kernel.cu new file mode 100644 index 00000000000..75506e2a0a1 --- /dev/null +++ b/paddle/phi/kernels/gpu/scatter_grad_kernel.cu @@ -0,0 +1,74 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" +#include "paddle/phi/kernels/scatter_grad_kernel.h" + +namespace phi { + +template +void ScatterGradKernel(const Context &ctx, + const DenseTensor &index, + const DenseTensor &updates, + const DenseTensor &out_grad, + bool overwrite, + DenseTensor *x_grad, + DenseTensor *updates_grad) { + auto index_type = index.dtype(); + bool index_type_match = + index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ(index_type_match, + true, + phi::errors::InvalidArgument( + "scatter_op index holds the wrong type, it holds [%s]," + "but desires to be [%s] or [%s]", + index_type, + phi::DataType::INT32, + phi::DataType::INT64)); + + if (x_grad) { + phi::Copy(ctx, out_grad, ctx.GetPlace(), false, x_grad); + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUScatterGradForX(ctx, index, x_grad); + } else { + phi::funcs::GPUScatterGradForX(ctx, index, x_grad); + } + } + + if (updates_grad) { + ctx.template Alloc(updates_grad); + // Gradient by Gather: dUpdates = dO[Ids] + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUGather(ctx, out_grad, index, updates_grad); + } else { + phi::funcs::GPUGather(ctx, out_grad, index, updates_grad); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(scatter_grad, + GPU, + ALL_LAYOUT, + phi::ScatterGradKernel, + float, + double, + int, + int64_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/scatter_kernel.cu b/paddle/phi/kernels/gpu/scatter_kernel.cu new file mode 100644 index 00000000000..811eae1bc02 --- /dev/null +++ b/paddle/phi/kernels/gpu/scatter_kernel.cu @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" +#include "paddle/phi/kernels/scatter_kernel.h" + +namespace phi { + +template +void ScatterKernel(const Context &ctx, + const DenseTensor &x, + const DenseTensor &index, + const DenseTensor &updates, + bool overwrite, + DenseTensor *out) { + phi::Copy(ctx, x, ctx.GetPlace(), false, out); + // use template class to support int32_t and int64_t + auto index_type = index.dtype(); + bool index_type_match = + index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ(index_type_match, + true, + phi::errors::InvalidArgument( + "scatter_op Index holds the wrong type, it holds [%s]," + "but desires to be [%s] or [%s].", + index_type, + phi::DataType::INT32, + phi::DataType::INT64)); + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUScatterAssign( + ctx, updates, index, out, overwrite); + } else { + phi::funcs::GPUScatterAssign( + ctx, updates, index, out, overwrite); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(scatter, + GPU, + ALL_LAYOUT, + phi::ScatterKernel, + float, + double, + int, + int64_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/scatter_nd_add_grad_kernel.cu b/paddle/phi/kernels/gpu/scatter_nd_add_grad_kernel.cu new file mode 100644 index 00000000000..71924befe8c --- /dev/null +++ b/paddle/phi/kernels/gpu/scatter_nd_add_grad_kernel.cu @@ -0,0 +1,55 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" +#include "paddle/phi/kernels/scatter_nd_add_grad_kernel.h" + +namespace phi { + +template +void ScatterNdAddGradKernel(const Context &ctx, + const DenseTensor &index, + const DenseTensor &updates, + const DenseTensor &out_grad, + DenseTensor *x_grad, + DenseTensor *updates_grad) { + if (x_grad) { + Copy(ctx, out_grad, ctx.GetPlace(), false, x_grad); + } + if (updates_grad) { + ctx.template Alloc(updates_grad); + // Gradient by Gather + const auto &index_type = index.dtype(); + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUGatherNd(ctx, out_grad, index, updates_grad); + } else { + phi::funcs::GPUGatherNd(ctx, out_grad, index, updates_grad); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(scatter_nd_add_grad, + GPU, + ALL_LAYOUT, + phi::ScatterNdAddGradKernel, + float, + double, + int64_t, + int, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/scatter_nd_add_kernel.cu b/paddle/phi/kernels/gpu/scatter_nd_add_kernel.cu new file mode 100644 index 00000000000..eadd91773c0 --- /dev/null +++ b/paddle/phi/kernels/gpu/scatter_nd_add_kernel.cu @@ -0,0 +1,58 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" +#include "paddle/phi/kernels/scatter_nd_add_kernel.h" + +namespace phi { + +template +void ScatterNdAddKernel(const Context &ctx, + const DenseTensor &x, + const DenseTensor &index, + const DenseTensor &updates, + DenseTensor *out) { + Copy(ctx, x, ctx.GetPlace(), true, out); + const auto &index_type = index.dtype(); + bool index_type_match = + index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ(index_type_match, + true, + phi::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s], but " + "desires to be [%s] or [%s].", + index_type, + phi::DataType::INT32, + phi::DataType::INT64)); + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUScatterNdAdd(ctx, updates, index, out); + } else { + phi::funcs::GPUScatterNdAdd(ctx, updates, index, out); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(scatter_nd_add, + GPU, + ALL_LAYOUT, + phi::ScatterNdAddKernel, + float, + double, + int64_t, + int, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/scatter_grad_kernel.h b/paddle/phi/kernels/scatter_grad_kernel.h new file mode 100644 index 00000000000..cf1482fca7f --- /dev/null +++ b/paddle/phi/kernels/scatter_grad_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ScatterGradKernel(const Context &ctx, + const DenseTensor &index, + const DenseTensor &updates, + const DenseTensor &out_grad, + bool overwrite, + DenseTensor *x_grad, + DenseTensor *updates_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/scatter_kernel.h b/paddle/phi/kernels/scatter_kernel.h new file mode 100644 index 00000000000..5191d6bce45 --- /dev/null +++ b/paddle/phi/kernels/scatter_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ScatterKernel(const Context &ctx, + const DenseTensor &x, + const DenseTensor &index, + const DenseTensor &updates, + bool overwrite, + DenseTensor *out); + +} // namespace phi diff --git a/paddle/phi/kernels/scatter_nd_add_grad_kernel.h b/paddle/phi/kernels/scatter_nd_add_grad_kernel.h new file mode 100644 index 00000000000..bcfdb2cdb2f --- /dev/null +++ b/paddle/phi/kernels/scatter_nd_add_grad_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ScatterNdAddGradKernel(const Context &ctx, + const DenseTensor &index, + const DenseTensor &updates, + const DenseTensor &out_grad, + DenseTensor *x_grad, + DenseTensor *updates_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/scatter_nd_add_kernel.h b/paddle/phi/kernels/scatter_nd_add_kernel.h new file mode 100644 index 00000000000..c20709dccc0 --- /dev/null +++ b/paddle/phi/kernels/scatter_nd_add_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ScatterNdAddKernel(const Context &ctx, + const DenseTensor &x, + const DenseTensor &index, + const DenseTensor &updates, + DenseTensor *out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/gather_scatter_sig.cc b/paddle/phi/ops/compat/gather_scatter_sig.cc new file mode 100644 index 00000000000..f71e30f85b0 --- /dev/null +++ b/paddle/phi/ops/compat/gather_scatter_sig.cc @@ -0,0 +1,46 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature GatherNdGradArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("gather_nd_grad", + {"X", "Index", GradVarName("Out")}, + {}, + {GradVarName("X")}); +} + +KernelSignature ScatterGradArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("scatter_grad", + {"Ids", "Updates", GradVarName("Out")}, + {"overwrite"}, + {GradVarName("X"), GradVarName("Updates")}); +} + +KernelSignature ScatterNdAddGradArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("scatter_nd_add_grad", + {"Index", "Updates", GradVarName("Out")}, + {}, + {GradVarName("X"), GradVarName("Updates")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(gather_nd_grad, phi::GatherNdGradArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(scatter_grad, phi::ScatterGradArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(scatter_nd_add_grad, + phi::ScatterNdAddGradArgumentMapping); -- GitLab From 5496a7ab3beb60e908a2deb6eb5bca9834ac7a8b Mon Sep 17 00:00:00 2001 From: Thunderbrook <52529258+Thunderbrook@users.noreply.github.com> Date: Fri, 4 Mar 2022 17:20:26 +0800 Subject: [PATCH 060/261] Dump cpu xingneng (#40068) * dump cpu * code format --- .../framework/fleet/heter_ps/hashtable_inl.h | 76 ++++++++++++------- 1 file changed, 47 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h index b7cb2ce0f01..59220fc9cda 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h @@ -186,45 +186,63 @@ void HashTable::insert(const KeyType* d_keys, size_t len, template void HashTable::dump_to_cpu(int devid, cudaStream_t stream) { container_->prefetch(cudaCpuDeviceId, stream); + std::vector threads; size_t num = container_->size(); KeyType unuse_key = std::numeric_limits::max(); thrust::pair* kv = container_->data(); - for (size_t i = 0; i < num; ++i) { - if (kv[i].first == unuse_key) { - continue; - } - ValType& gpu_val = kv[i].second; + + int thread_num = 8; + int len_per_thread = num / thread_num; + int remain = num % thread_num; + int begin = 0; + + auto dump_func = [unuse_key, kv](int left, int right) { + for (int i = left; i < right; i++) { + if (kv[i].first == unuse_key) { + continue; + } + ValType& gpu_val = kv[i].second; #ifdef PADDLE_WITH_PSLIB - auto* downpour_value = - (paddle::ps::DownpourFixedFeatureValue*)(gpu_val.cpu_ptr); - int downpour_value_size = downpour_value->size(); - if (gpu_val.mf_size > 0 && downpour_value_size == 7) { - downpour_value->resize(gpu_val.mf_size + downpour_value_size); - } - float* cpu_val = downpour_value->data(); - // cpu_val[0] = 0; - cpu_val[1] = gpu_val.delta_score; - cpu_val[2] = gpu_val.show; - cpu_val[3] = gpu_val.clk; - cpu_val[4] = gpu_val.lr; - cpu_val[5] = gpu_val.lr_g2sum; - cpu_val[6] = gpu_val.slot; - if (gpu_val.mf_size > 0) { - for (int x = 0; x < gpu_val.mf_size; x++) { - cpu_val[x + 7] = gpu_val.mf[x]; + auto* downpour_value = + 
(paddle::ps::DownpourFixedFeatureValue*)(gpu_val.cpu_ptr); + int downpour_value_size = downpour_value->size(); + if (gpu_val.mf_size > 0 && downpour_value_size == 7) { + downpour_value->resize(gpu_val.mf_size + downpour_value_size); + } + float* cpu_val = downpour_value->data(); + // cpu_val[0] = 0; + cpu_val[1] = gpu_val.delta_score; + cpu_val[2] = gpu_val.show; + cpu_val[3] = gpu_val.clk; + cpu_val[4] = gpu_val.lr; + cpu_val[5] = gpu_val.lr_g2sum; + cpu_val[6] = gpu_val.slot; + if (gpu_val.mf_size > 0) { + for (int x = 0; x < gpu_val.mf_size; x++) { + cpu_val[x + 7] = gpu_val.mf[x]; + } } - } #endif #ifdef PADDLE_WITH_PSCORE - auto* downpour_value = (paddle::distributed::VALUE*)(gpu_val.cpu_ptr); - downpour_value->count_ = gpu_val.show; - for (int x = 0; x < gpu_val.mf_size; x++) { - downpour_value->data_[x] = gpu_val.mf[x]; - } + auto* downpour_value = (paddle::distributed::VALUE*)(gpu_val.cpu_ptr); + downpour_value->count_ = gpu_val.show; + for (int x = 0; x < gpu_val.mf_size; x++) { + downpour_value->data_[x] = gpu_val.mf[x]; + } #endif + } + }; + + for (int i = 0; i < thread_num; i++) { + threads.push_back(std::thread( + dump_func, begin, begin + len_per_thread + (i < remain ? 1 : 0))); + begin += len_per_thread + (i < remain ? 1 : 0); + } + for (std::thread& t : threads) { + t.join(); } - container_->prefetch(devid, stream); + // container_->prefetch(devid, stream); } template -- GitLab From 0f9259684ac40d6dbd429ccd6dd43d5fbc9cf27d Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Fri, 4 Mar 2022 17:27:54 +0800 Subject: [PATCH 061/261] Fixed GradNode default attributes issues (#40132) * Fixed GradNode default attributes issues * Reverted changes on yaml files --- .../final_state_generator/eager_gen.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 7de7747ebf0..d1e20854153 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -509,11 +509,18 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, set_attribute_methods_str += SET_ATTR_METHOD_TEMPLATE.format( aname, GetConstReference(atype), aname, saved_attr_name, aname) - ATTRIBUTE_MEMBER_TEMPLATE = """ - {} {} = {}; -""" - attribute_members_str += ATTRIBUTE_MEMBER_TEMPLATE.format( - RemoveConstAndReference(atype), saved_attr_name, default_val) + if default_val: + ATTRIBUTE_MEMBER_TEMPLATE = """ + {} {} = {}; + """ + attribute_members_str += ATTRIBUTE_MEMBER_TEMPLATE.format( + RemoveConstAndReference(atype), saved_attr_name, default_val) + else: + ATTRIBUTE_MEMBER_TEMPLATE = """ + {} {}; + """ + attribute_members_str += ATTRIBUTE_MEMBER_TEMPLATE.format( + RemoveConstAndReference(atype), saved_attr_name) # End: SetAttributes & Attribute Members grad_node_name = GetGradNodeName(fwd_api_name) -- GitLab From 5dc766371cfe6d1fdc16ece613799f7b4f734eae Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Fri, 4 Mar 2022 18:42:29 +0800 Subject: [PATCH 062/261] change namespace to fix conflict (#40164) --- .../gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu | 2 +- .../phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu 
b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu index ae3cefd9e82..598b0138fb3 100644 --- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu @@ -89,7 +89,7 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context &dev_ctx, reduce_dim.push_back(i); } - kernels::TensorReduceImpl>( + funcs::TensorReduceImpl>( dev_ctx, *counts_tensor, norm_tensor, diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu index fb63badf56a..13d63f8d97e 100644 --- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu +++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu @@ -89,7 +89,7 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context &dev_ctx, reduce_dim.push_back(i); } - kernels::TensorReduceImpl>( + funcs::TensorReduceImpl>( dev_ctx, *counts_tensor, norm_tensor, -- GitLab From 12346cdce77fb97af708d6b8f04ff9231a311229 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Fri, 4 Mar 2022 20:07:16 +0800 Subject: [PATCH 063/261] [PHI] Move header of selected_rows kernel to selected_rows dir (#40128) * move selected_rows kernel head to selected_rows dir * update license * add sr namespace * refacter selected_rows kernel funciton name * fix bug --- paddle/phi/kernels/full_kernel.h | 8 --- paddle/phi/kernels/scale_kernel.h | 9 ---- .../phi/kernels/selected_rows/full_kernel.cc | 19 ++++--- .../phi/kernels/selected_rows/full_kernel.h | 32 ++++++++++++ .../phi/kernels/selected_rows/scale_kernel.cc | 21 ++++---- .../phi/kernels/selected_rows/scale_kernel.h | 32 ++++++++++++ .../selected_rows/uniform_random_kernel.cc | 50 +++++++++++-------- .../selected_rows/uniform_random_kernel.h | 45 +++++++++++++++++ paddle/phi/kernels/uniform_random_kernel.h | 22 -------- 9 files changed, 160 insertions(+), 78 deletions(-) create mode 100644 paddle/phi/kernels/selected_rows/full_kernel.h create mode 100644 paddle/phi/kernels/selected_rows/scale_kernel.h create mode 100644 paddle/phi/kernels/selected_rows/uniform_random_kernel.h diff --git a/paddle/phi/kernels/full_kernel.h b/paddle/phi/kernels/full_kernel.h index c7b1f9af0e3..05929ba83f3 100644 --- a/paddle/phi/kernels/full_kernel.h +++ b/paddle/phi/kernels/full_kernel.h @@ -17,7 +17,6 @@ #include "paddle/phi/common/scalar.h" #include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/selected_rows.h" #include "paddle/phi/infermeta/nullary.h" #include "paddle/phi/kernels/empty_kernel.h" @@ -31,13 +30,6 @@ void FullKernel(const Context& dev_ctx, DataType dtype, DenseTensor* out); -template -void FullSR(const Context& dev_ctx, - const ScalarArray& shape, - const Scalar& val, - DataType dtype, - SelectedRows* out); - template void FullLikeKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/scale_kernel.h b/paddle/phi/kernels/scale_kernel.h index b3958343698..22e6efb03ac 100644 --- a/paddle/phi/kernels/scale_kernel.h +++ b/paddle/phi/kernels/scale_kernel.h @@ -16,7 +16,6 @@ limitations under the License. 
*/ #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/selected_rows.h" #include "paddle/phi/infermeta/unary.h" #include "paddle/phi/kernels/empty_kernel.h" namespace phi { @@ -29,14 +28,6 @@ void ScaleKernel(const Context& dev_ctx, bool bias_after_scale, DenseTensor* out); -template -void ScaleSR(const Context& dev_ctx, - const SelectedRows& x, - const Scalar& scale, - float bias, - bool bias_after_scale, - SelectedRows* out); - template DenseTensor Scale(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/selected_rows/full_kernel.cc b/paddle/phi/kernels/selected_rows/full_kernel.cc index 02231867fdd..39fd009cd65 100644 --- a/paddle/phi/kernels/selected_rows/full_kernel.cc +++ b/paddle/phi/kernels/selected_rows/full_kernel.cc @@ -12,34 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/selected_rows/full_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/phi/backends/gpu/gpu_context.h" #endif #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" namespace phi { +namespace sr { template -void FullSR(const Context& dev_ctx, - const ScalarArray& shape, - const Scalar& val, - DataType dtype, - SelectedRows* out) { +void FullKernel(const Context& dev_ctx, + const ScalarArray& shape, + const Scalar& val, + DataType dtype, + SelectedRows* out) { phi::FullKernel(dev_ctx, shape, val, dtype, out->mutable_value()); } +} // namespace sr } // namespace phi PD_REGISTER_KERNEL(full_sr, CPU, ALL_LAYOUT, - phi::FullSR, + phi::sr::FullKernel, float, double, uint8_t, @@ -56,7 +59,7 @@ PD_REGISTER_KERNEL(full_sr, PD_REGISTER_KERNEL(full_sr, GPU, ALL_LAYOUT, - phi::FullSR, + phi::sr::FullKernel, float, double, uint8_t, diff --git a/paddle/phi/kernels/selected_rows/full_kernel.h b/paddle/phi/kernels/selected_rows/full_kernel.h new file mode 100644 index 00000000000..d84ddcc0d3f --- /dev/null +++ b/paddle/phi/kernels/selected_rows/full_kernel.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/selected_rows.h" + +namespace phi { +namespace sr { + +template +void FullKernel(const Context& dev_ctx, + const ScalarArray& shape, + const Scalar& val, + DataType dtype, + SelectedRows* out); + +} // namespace sr +} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/scale_kernel.cc b/paddle/phi/kernels/selected_rows/scale_kernel.cc index 094b6f4d120..38a0cb75101 100644 --- a/paddle/phi/kernels/selected_rows/scale_kernel.cc +++ b/paddle/phi/kernels/selected_rows/scale_kernel.cc @@ -12,21 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/scale_kernel.h" +#include "paddle/phi/kernels/selected_rows/scale_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/scale_kernel.h" namespace phi { +namespace sr { template -void ScaleSR(const Context& dev_ctx, - const SelectedRows& x, - const Scalar& scale, - float bias, - bool bias_after_scale, - SelectedRows* out) { +void ScaleKernel(const Context& dev_ctx, + const SelectedRows& x, + const Scalar& scale, + float bias, + bool bias_after_scale, + SelectedRows* out) { if (x.value().Holder() != out->value().Holder() || x.value().data() != out->value().data()) { out->set_rows(x.rows()); @@ -36,12 +38,13 @@ void ScaleSR(const Context& dev_ctx, dev_ctx, x.value(), scale, bias, bias_after_scale, out->mutable_value()); } +} // namespace sr } // namespace phi PD_REGISTER_KERNEL(scale_sr, CPU, ALL_LAYOUT, - phi::ScaleSR, + phi::sr::ScaleKernel, float, double, phi::dtype::bfloat16, @@ -55,7 +58,7 @@ PD_REGISTER_KERNEL(scale_sr, PD_REGISTER_KERNEL(scale_sr, GPU, ALL_LAYOUT, - phi::ScaleSR, + phi::sr::ScaleKernel, float, double, phi::dtype::float16, diff --git a/paddle/phi/kernels/selected_rows/scale_kernel.h b/paddle/phi/kernels/selected_rows/scale_kernel.h new file mode 100644 index 00000000000..85c2c4ddff0 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/scale_kernel.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/selected_rows.h" + +namespace phi { +namespace sr { + +template +void ScaleKernel(const Context& dev_ctx, + const SelectedRows& x, + const Scalar& scale, + float bias, + bool bias_after_scale, + SelectedRows* out); + +} // namespace sr +} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc b/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc index 881180b71b1..b3dd1d1b7d2 100644 --- a/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc +++ b/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc @@ -12,22 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/uniform_random_kernel.h" +#include "paddle/phi/kernels/selected_rows/uniform_random_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/uniform_random_kernel.h" namespace phi { +namespace sr { template -void UniformRandomRawSRKernel(const Context& dev_ctx, - const ScalarArray& shape, - DataType dtype, - float min, - float max, - int seed, - int diag_num, - int diag_step, - float diag_val, - SelectedRows* out) { +void UniformRandomRawKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + int diag_num, + int diag_step, + float diag_val, + SelectedRows* out) { phi::UniformRandomRawKernel(dev_ctx, shape, dtype, @@ -41,23 +46,24 @@ void UniformRandomRawSRKernel(const Context& dev_ctx, } template -void UniformRandomSRKernel(const Context& dev_ctx, - const ScalarArray& shape, - DataType dtype, - float min, - float max, - int seed, - SelectedRows* out) { +void UniformRandomKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + SelectedRows* out) { phi::UniformRandomKernel( dev_ctx, shape, dtype, min, max, seed, out->mutable_value()); } +} // namespace sr } // namespace phi PD_REGISTER_KERNEL(uniform_random_raw_sr, CPU, ALL_LAYOUT, - phi::UniformRandomRawSRKernel, + phi::sr::UniformRandomRawKernel, float, double, phi::dtype::bfloat16) {} @@ -65,7 +71,7 @@ PD_REGISTER_KERNEL(uniform_random_raw_sr, PD_REGISTER_KERNEL(uniform_random_sr, CPU, ALL_LAYOUT, - phi::UniformRandomSRKernel, + phi::sr::UniformRandomKernel, float, double, phi::dtype::bfloat16) {} @@ -75,14 +81,14 @@ PD_REGISTER_KERNEL(uniform_random_sr, PD_REGISTER_KERNEL(uniform_random_raw_sr, GPU, ALL_LAYOUT, - phi::UniformRandomRawSRKernel, + phi::sr::UniformRandomRawKernel, float, double) {} PD_REGISTER_KERNEL(uniform_random_sr, GPU, ALL_LAYOUT, - phi::UniformRandomSRKernel, + phi::sr::UniformRandomKernel, float, double) {} #endif diff --git a/paddle/phi/kernels/selected_rows/uniform_random_kernel.h b/paddle/phi/kernels/selected_rows/uniform_random_kernel.h new file mode 100644 index 00000000000..aee7a4c7aaf --- /dev/null +++ b/paddle/phi/kernels/selected_rows/uniform_random_kernel.h @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/selected_rows.h" + +namespace phi { +namespace sr { + +template +void UniformRandomRawKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + int diag_num, + int diag_step, + float diag_val, + SelectedRows* out); + +template +void UniformRandomKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + SelectedRows* out); + +} // namespace sr +} // namespace phi diff --git a/paddle/phi/kernels/uniform_random_kernel.h b/paddle/phi/kernels/uniform_random_kernel.h index 5bba1272785..36ce4c3f9ee 100644 --- a/paddle/phi/kernels/uniform_random_kernel.h +++ b/paddle/phi/kernels/uniform_random_kernel.h @@ -17,7 +17,6 @@ #include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/device_context.h" -#include "paddle/phi/core/selected_rows.h" namespace phi { @@ -42,25 +41,4 @@ void UniformRandomKernel(const Context& dev_ctx, int seed, DenseTensor* out); -template -void UniformRandomRawSRKernel(const Context& dev_ctx, - const ScalarArray& shape, - DataType dtype, - float min, - float max, - int seed, - int diag_num, - int diag_step, - float diag_val, - SelectedRows* out); - -template -void UniformRandomSRKernel(const Context& dev_ctx, - const ScalarArray& shape, - DataType dtype, - float min, - float max, - int seed, - SelectedRows* out); - } // namespace phi -- GitLab From faece3829ae14c9256313f3ff31ce9adfd41ddf9 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Fri, 4 Mar 2022 22:09:40 +0800 Subject: [PATCH 064/261] Move yolo box to phi (#40112) * add yolo box kernel; test=develop * fix comile error; test=develop --- .../tensorrt/plugin/yolo_box_op_plugin.cu | 1 - .../fluid/operators/detection/CMakeLists.txt | 2 +- .../fluid/operators/detection/yolo_box_op.cc | 3 - .../fluid/operators/detection/yolo_box_op.cu | 143 -------------- .../fluid/operators/detection/yolo_box_op.h | 180 ----------------- paddle/phi/kernels/cpu/yolo_box_kernel.cc | 128 ++++++++++++ paddle/phi/kernels/funcs/yolo_box_util.h | 112 +++++++++++ paddle/phi/kernels/gpu/yolo_box_kernel.cu | 182 ++++++++++++++++++ paddle/phi/kernels/yolo_box_kernel.h | 36 ++++ paddle/phi/ops/compat/yolo_box_sig.cc | 35 ++++ .../fluid/tests/unittests/test_yolo_box_op.py | 3 +- 11 files changed, 496 insertions(+), 329 deletions(-) delete mode 100644 paddle/fluid/operators/detection/yolo_box_op.cu delete mode 100644 paddle/fluid/operators/detection/yolo_box_op.h create mode 100644 paddle/phi/kernels/cpu/yolo_box_kernel.cc create mode 100644 paddle/phi/kernels/funcs/yolo_box_util.h create mode 100644 paddle/phi/kernels/gpu/yolo_box_kernel.cu create mode 100644 paddle/phi/kernels/yolo_box_kernel.h create mode 100644 paddle/phi/ops/compat/yolo_box_sig.cc diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu index 57177cfa8b4..336005d883b 100644 --- 
a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu @@ -16,7 +16,6 @@ #include #include "paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h" -#include "paddle/fluid/operators/detection/yolo_box_op.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 1ebafa54598..568c7982cfc 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -62,7 +62,7 @@ detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc DEPS gpc) detection_library(matrix_nms_op SRCS matrix_nms_op.cc DEPS gpc) detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu) detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc) -detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op.cu) +detection_library(yolo_box_op SRCS yolo_box_op.cc) detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu) detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_focal_loss_op.cu) detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc) diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc index 511d8e0eed1..0d9fbf612f7 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cc +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -9,7 +9,6 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/detection/yolo_box_op.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -240,8 +239,6 @@ REGISTER_OPERATOR( yolo_box, ops::YoloBoxOp, ops::YoloBoxOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(yolo_box, ops::YoloBoxKernel, - ops::YoloBoxKernel); REGISTER_OP_VERSION(yolo_box) .AddCheckpoint( diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu deleted file mode 100644 index fb5c214a59e..00000000000 --- a/paddle/fluid/operators/detection/yolo_box_op.cu +++ /dev/null @@ -1,143 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/operators/detection/yolo_box_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/phi/kernels/funcs/math_function.h" -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -__global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, - T* scores, const float conf_thresh, - const int* anchors, const int n, const int h, - const int w, const int an_num, const int class_num, - const int box_num, int input_size_h, - int input_size_w, bool clip_bbox, const float scale, - const float bias, bool iou_aware, - const float iou_aware_factor) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - T box[4]; - for (; tid < n * box_num; tid += stride) { - int grid_num = h * w; - int i = tid / box_num; - int j = (tid % box_num) / grid_num; - int k = (tid % grid_num) / w; - int l = tid % w; - - int an_stride = (5 + class_num) * grid_num; - int img_height = imgsize[2 * i]; - int img_width = imgsize[2 * i + 1]; - - int obj_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4, - iou_aware); - T conf = sigmoid(input[obj_idx]); - if (iou_aware) { - int iou_idx = GetIoUIndex(i, j, k * w + l, an_num, an_stride, grid_num); - T iou = sigmoid(input[iou_idx]); - conf = pow(conf, static_cast(1. - iou_aware_factor)) * - pow(iou, static_cast(iou_aware_factor)); - } - if (conf < conf_thresh) { - continue; - } - - int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0, - iou_aware); - GetYoloBox(box, input, anchors, l, k, j, h, w, input_size_h, - input_size_w, box_idx, grid_num, img_height, img_width, scale, - bias); - box_idx = (i * box_num + j * grid_num + k * w + l) * 4; - CalcDetectionBox(boxes, box, box_idx, img_height, img_width, clip_bbox); - - int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, - 5, iou_aware); - int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num; - CalcLabelScore(scores, input, label_idx, score_idx, class_num, conf, - grid_num); - } -} - -template -class YoloBoxOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* img_size = ctx.Input("ImgSize"); - auto* boxes = ctx.Output("Boxes"); - auto* scores = ctx.Output("Scores"); - - auto anchors = ctx.Attr>("anchors"); - int class_num = ctx.Attr("class_num"); - float conf_thresh = ctx.Attr("conf_thresh"); - int downsample_ratio = ctx.Attr("downsample_ratio"); - bool clip_bbox = ctx.Attr("clip_bbox"); - bool iou_aware = ctx.Attr("iou_aware"); - float iou_aware_factor = ctx.Attr("iou_aware_factor"); - float scale = ctx.Attr("scale_x_y"); - float bias = -0.5 * (scale - 1.); - - const int n = input->dims()[0]; - const int h = input->dims()[2]; - const int w = input->dims()[3]; - const int box_num = boxes->dims()[1]; - const int an_num = anchors.size() / 2; - int input_size_h = downsample_ratio * h; - int input_size_w = downsample_ratio * w; - - auto& dev_ctx = ctx.cuda_device_context(); - int bytes = sizeof(int) * anchors.size(); - auto anchors_ptr = memory::Alloc(dev_ctx, sizeof(int) * anchors.size()); - int* anchors_data = reinterpret_cast(anchors_ptr->ptr()); - const auto gplace = ctx.GetPlace(); - const auto cplace = platform::CPUPlace(); - memory::Copy(gplace, anchors_data, cplace, anchors.data(), bytes, - dev_ctx.stream()); - - const T* input_data = input->data(); - const 
int* imgsize_data = img_size->data(); - T* boxes_data = boxes->mutable_data({n, box_num, 4}, ctx.GetPlace()); - T* scores_data = - scores->mutable_data({n, box_num, class_num}, ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, boxes, static_cast(0)); - set_zero(dev_ctx, scores, static_cast(0)); - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), n * box_num); - - dim3 thread_num = config.thread_per_block; -#ifdef WITH_NV_JETSON - if (config.compute_capability == 53 || config.compute_capability == 62) { - thread_num = 512; - } -#endif - - KeYoloBoxFw<<>>( - input_data, imgsize_data, boxes_data, scores_data, conf_thresh, - anchors_data, n, h, w, an_num, class_num, box_num, input_size_h, - input_size_w, clip_bbox, scale, bias, iou_aware, iou_aware_factor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(yolo_box, ops::YoloBoxOpCUDAKernel, - ops::YoloBoxOpCUDAKernel); diff --git a/paddle/fluid/operators/detection/yolo_box_op.h b/paddle/fluid/operators/detection/yolo_box_op.h deleted file mode 100644 index 2cd69c60b7c..00000000000 --- a/paddle/fluid/operators/detection/yolo_box_op.h +++ /dev/null @@ -1,180 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -HOSTDEVICE inline T sigmoid(T x) { - return 1.0 / (1.0 + std::exp(-x)); -} - -template -HOSTDEVICE inline void GetYoloBox(T* box, const T* x, const int* anchors, int i, - int j, int an_idx, int grid_size_h, - int grid_size_w, int input_size_h, - int input_size_w, int index, int stride, - int img_height, int img_width, float scale, - float bias) { - box[0] = (i + sigmoid(x[index]) * scale + bias) * img_width / grid_size_w; - box[1] = (j + sigmoid(x[index + stride]) * scale + bias) * img_height / - grid_size_h; - box[2] = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_width / - input_size_w; - box[3] = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] * - img_height / input_size_h; -} - -HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx, - int an_num, int an_stride, int stride, - int entry, bool iou_aware) { - if (iou_aware) { - return (batch * an_num + an_idx) * an_stride + - (batch * an_num + an_num + entry) * stride + hw_idx; - } else { - return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; - } -} - -HOSTDEVICE inline int GetIoUIndex(int batch, int an_idx, int hw_idx, int an_num, - int an_stride, int stride) { - return batch * an_num * an_stride + (batch * an_num + an_idx) * stride + - hw_idx; -} - -template -HOSTDEVICE inline void CalcDetectionBox(T* boxes, T* box, const int box_idx, - const int img_height, - const int img_width, bool clip_bbox) { - boxes[box_idx] = box[0] - box[2] / 2; - boxes[box_idx + 1] = box[1] - box[3] / 2; - boxes[box_idx + 2] = box[0] + box[2] / 2; - boxes[box_idx + 3] = box[1] + box[3] / 2; - - if (clip_bbox) { - boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : static_cast(0); - boxes[box_idx + 1] = - boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : static_cast(0); - boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1 - ? boxes[box_idx + 2] - : static_cast(img_width - 1); - boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1 - ? 
boxes[box_idx + 3] - : static_cast(img_height - 1); - } -} - -template -HOSTDEVICE inline void CalcLabelScore(T* scores, const T* input, - const int label_idx, const int score_idx, - const int class_num, const T conf, - const int stride) { - for (int i = 0; i < class_num; i++) { - scores[score_idx + i] = conf * sigmoid(input[label_idx + i * stride]); - } -} - -template -class YoloBoxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* imgsize = ctx.Input("ImgSize"); - auto* boxes = ctx.Output("Boxes"); - auto* scores = ctx.Output("Scores"); - auto anchors = ctx.Attr>("anchors"); - int class_num = ctx.Attr("class_num"); - float conf_thresh = ctx.Attr("conf_thresh"); - int downsample_ratio = ctx.Attr("downsample_ratio"); - bool clip_bbox = ctx.Attr("clip_bbox"); - bool iou_aware = ctx.Attr("iou_aware"); - float iou_aware_factor = ctx.Attr("iou_aware_factor"); - float scale = ctx.Attr("scale_x_y"); - float bias = -0.5 * (scale - 1.); - - const int n = input->dims()[0]; - const int h = input->dims()[2]; - const int w = input->dims()[3]; - const int box_num = boxes->dims()[1]; - const int an_num = anchors.size() / 2; - int input_size_h = downsample_ratio * h; - int input_size_w = downsample_ratio * w; - - const int stride = h * w; - const int an_stride = (class_num + 5) * stride; - - Tensor anchors_; - auto anchors_data = - anchors_.mutable_data({an_num * 2}, ctx.GetPlace()); - std::copy(anchors.begin(), anchors.end(), anchors_data); - - const T* input_data = input->data(); - const int* imgsize_data = imgsize->data(); - T* boxes_data = boxes->mutable_data({n, box_num, 4}, ctx.GetPlace()); - memset(boxes_data, 0, boxes->numel() * sizeof(T)); - T* scores_data = - scores->mutable_data({n, box_num, class_num}, ctx.GetPlace()); - memset(scores_data, 0, scores->numel() * sizeof(T)); - - T box[4]; - for (int i = 0; i < n; i++) { - int img_height = imgsize_data[2 * i]; - int img_width = imgsize_data[2 * i + 1]; - - for (int j = 0; j < an_num; j++) { - for (int k = 0; k < h; k++) { - for (int l = 0; l < w; l++) { - int obj_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, - stride, 4, iou_aware); - T conf = sigmoid(input_data[obj_idx]); - if (iou_aware) { - int iou_idx = - GetIoUIndex(i, j, k * w + l, an_num, an_stride, stride); - T iou = sigmoid(input_data[iou_idx]); - conf = pow(conf, static_cast(1. - iou_aware_factor)) * - pow(iou, static_cast(iou_aware_factor)); - } - if (conf < conf_thresh) { - continue; - } - - int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, - stride, 0, iou_aware); - GetYoloBox(box, input_data, anchors_data, l, k, j, h, w, - input_size_h, input_size_w, box_idx, stride, - img_height, img_width, scale, bias); - box_idx = (i * box_num + j * stride + k * w + l) * 4; - CalcDetectionBox(boxes_data, box, box_idx, img_height, img_width, - clip_bbox); - - int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, - stride, 5, iou_aware); - int score_idx = (i * box_num + j * stride + k * w + l) * class_num; - CalcLabelScore(scores_data, input_data, label_idx, score_idx, - class_num, conf, stride); - } - } - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/kernels/cpu/yolo_box_kernel.cc b/paddle/phi/kernels/cpu/yolo_box_kernel.cc new file mode 100644 index 00000000000..a83bc019fc3 --- /dev/null +++ b/paddle/phi/kernels/cpu/yolo_box_kernel.cc @@ -0,0 +1,128 @@ +// Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/yolo_box_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/yolo_box_util.h" + +namespace phi { + +template +void YoloBoxKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& img_size, + const std::vector& anchors, + int class_num, + float conf_thresh, + int downsample_ratio, + bool clip_bbox, + float scale_x_y, + bool iou_aware, + float iou_aware_factor, + DenseTensor* boxes, + DenseTensor* scores) { + auto* input = &x; + auto* imgsize = &img_size; + float scale = scale_x_y; + float bias = -0.5 * (scale - 1.); + + const int n = input->dims()[0]; + const int h = input->dims()[2]; + const int w = input->dims()[3]; + const int box_num = boxes->dims()[1]; + const int an_num = anchors.size() / 2; + int input_size_h = downsample_ratio * h; + int input_size_w = downsample_ratio * w; + + const int stride = h * w; + const int an_stride = (class_num + 5) * stride; + + DenseTensor anchors_; + auto anchors_data = + anchors_.mutable_data({an_num * 2}, dev_ctx.GetPlace()); + std::copy(anchors.begin(), anchors.end(), anchors_data); + + const T* input_data = input->data(); + const int* imgsize_data = imgsize->data(); + T* boxes_data = boxes->mutable_data({n, box_num, 4}, dev_ctx.GetPlace()); + memset(boxes_data, 0, boxes->numel() * sizeof(T)); + T* scores_data = + scores->mutable_data({n, box_num, class_num}, dev_ctx.GetPlace()); + memset(scores_data, 0, scores->numel() * sizeof(T)); + + T box[4]; + for (int i = 0; i < n; i++) { + int img_height = imgsize_data[2 * i]; + int img_width = imgsize_data[2 * i + 1]; + + for (int j = 0; j < an_num; j++) { + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + int obj_idx = funcs::GetEntryIndex( + i, j, k * w + l, an_num, an_stride, stride, 4, iou_aware); + T conf = funcs::sigmoid(input_data[obj_idx]); + if (iou_aware) { + int iou_idx = + funcs::GetIoUIndex(i, j, k * w + l, an_num, an_stride, stride); + T iou = funcs::sigmoid(input_data[iou_idx]); + conf = pow(conf, static_cast(1. 
- iou_aware_factor)) * + pow(iou, static_cast(iou_aware_factor)); + } + if (conf < conf_thresh) { + continue; + } + + int box_idx = funcs::GetEntryIndex( + i, j, k * w + l, an_num, an_stride, stride, 0, iou_aware); + funcs::GetYoloBox(box, + input_data, + anchors_data, + l, + k, + j, + h, + w, + input_size_h, + input_size_w, + box_idx, + stride, + img_height, + img_width, + scale, + bias); + box_idx = (i * box_num + j * stride + k * w + l) * 4; + funcs::CalcDetectionBox( + boxes_data, box, box_idx, img_height, img_width, clip_bbox); + + int label_idx = funcs::GetEntryIndex( + i, j, k * w + l, an_num, an_stride, stride, 5, iou_aware); + int score_idx = (i * box_num + j * stride + k * w + l) * class_num; + funcs::CalcLabelScore(scores_data, + input_data, + label_idx, + score_idx, + class_num, + conf, + stride); + } + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + yolo_box, CPU, ALL_LAYOUT, phi::YoloBoxKernel, float, double) {} diff --git a/paddle/phi/kernels/funcs/yolo_box_util.h b/paddle/phi/kernels/funcs/yolo_box_util.h new file mode 100644 index 00000000000..337af2d7a23 --- /dev/null +++ b/paddle/phi/kernels/funcs/yolo_box_util.h @@ -0,0 +1,112 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +namespace phi { +namespace funcs { + +template <typename T> +HOSTDEVICE inline T sigmoid(T x) { + return 1.0 / (1.0 + std::exp(-x)); +} + +template <typename T> +HOSTDEVICE inline void GetYoloBox(T* box, + const T* x, + const int* anchors, + int i, + int j, + int an_idx, + int grid_size_h, + int grid_size_w, + int input_size_h, + int input_size_w, + int index, + int stride, + int img_height, + int img_width, + float scale, + float bias) { + box[0] = (i + sigmoid(x[index]) * scale + bias) * img_width / grid_size_w; + box[1] = (j + sigmoid(x[index + stride]) * scale + bias) * img_height / + grid_size_h; + box[2] = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_width / + input_size_w; + box[3] = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] * + img_height / input_size_h; +} + +HOSTDEVICE inline int GetEntryIndex(int batch, + int an_idx, + int hw_idx, + int an_num, + int an_stride, + int stride, + int entry, + bool iou_aware) { + if (iou_aware) { + return (batch * an_num + an_idx) * an_stride + + (batch * an_num + an_num + entry) * stride + hw_idx; + } else { + return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; + } +} + +HOSTDEVICE inline int GetIoUIndex( + int batch, int an_idx, int hw_idx, int an_num, int an_stride, int stride) { + return batch * an_num * an_stride + (batch * an_num + an_idx) * stride + + hw_idx; +} + +template <typename T> +HOSTDEVICE inline void CalcDetectionBox(T* boxes, + T* box, + const int box_idx, + const int img_height, + const int img_width, + bool clip_bbox) { + boxes[box_idx] = box[0] - box[2] / 2; + boxes[box_idx + 1] = box[1] - box[3] / 2; + boxes[box_idx + 2] = box[0] + box[2] / 2; + boxes[box_idx + 3] = box[1] + box[3] / 2; + + if (clip_bbox) { + boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : static_cast<T>(0); + boxes[box_idx + 1] = + boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : static_cast<T>(0); + boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1 + ? boxes[box_idx + 2] + : static_cast<T>(img_width - 1); + boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1 + ? boxes[box_idx + 3] + : static_cast<T>(img_height - 1); + } +} + +template <typename T> +HOSTDEVICE inline void CalcLabelScore(T* scores, + const T* input, + const int label_idx, + const int score_idx, + const int class_num, + const T conf, + const int stride) { + for (int i = 0; i < class_num; i++) { + scores[score_idx + i] = conf * sigmoid(input[label_idx + i * stride]); + } +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/yolo_box_kernel.cu b/paddle/phi/kernels/gpu/yolo_box_kernel.cu new file mode 100644 index 00000000000..2719dcd9e54 --- /dev/null +++ b/paddle/phi/kernels/gpu/yolo_box_kernel.cu @@ -0,0 +1,182 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/yolo_box_util.h" +#include "paddle/phi/kernels/yolo_box_kernel.h" + +namespace phi { + +template +__global__ void KeYoloBoxFw(const T* input, + const int* imgsize, + T* boxes, + T* scores, + const float conf_thresh, + const int* anchors, + const int n, + const int h, + const int w, + const int an_num, + const int class_num, + const int box_num, + int input_size_h, + int input_size_w, + bool clip_bbox, + const float scale, + const float bias, + bool iou_aware, + const float iou_aware_factor) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + T box[4]; + for (; tid < n * box_num; tid += stride) { + int grid_num = h * w; + int i = tid / box_num; + int j = (tid % box_num) / grid_num; + int k = (tid % grid_num) / w; + int l = tid % w; + + int an_stride = (5 + class_num) * grid_num; + int img_height = imgsize[2 * i]; + int img_width = imgsize[2 * i + 1]; + + int obj_idx = funcs::GetEntryIndex( + i, j, k * w + l, an_num, an_stride, grid_num, 4, iou_aware); + T conf = funcs::sigmoid(input[obj_idx]); + if (iou_aware) { + int iou_idx = + funcs::GetIoUIndex(i, j, k * w + l, an_num, an_stride, grid_num); + T iou = funcs::sigmoid(input[iou_idx]); + conf = pow(conf, static_cast(1. - iou_aware_factor)) * + pow(iou, static_cast(iou_aware_factor)); + } + if (conf < conf_thresh) { + continue; + } + + int box_idx = funcs::GetEntryIndex( + i, j, k * w + l, an_num, an_stride, grid_num, 0, iou_aware); + funcs::GetYoloBox(box, + input, + anchors, + l, + k, + j, + h, + w, + input_size_h, + input_size_w, + box_idx, + grid_num, + img_height, + img_width, + scale, + bias); + box_idx = (i * box_num + j * grid_num + k * w + l) * 4; + funcs::CalcDetectionBox( + boxes, box, box_idx, img_height, img_width, clip_bbox); + + int label_idx = funcs::GetEntryIndex( + i, j, k * w + l, an_num, an_stride, grid_num, 5, iou_aware); + int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num; + funcs::CalcLabelScore( + scores, input, label_idx, score_idx, class_num, conf, grid_num); + } +} + +template +void YoloBoxKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& img_size, + const std::vector& anchors, + int class_num, + float conf_thresh, + int downsample_ratio, + bool clip_bbox, + float scale_x_y, + bool iou_aware, + float iou_aware_factor, + DenseTensor* boxes, + DenseTensor* scores) { + auto* input = &x; + float scale = scale_x_y; + float bias = -0.5 * (scale - 1.); + + const int n = input->dims()[0]; + const int h = input->dims()[2]; + const int w = input->dims()[3]; + const int box_num = boxes->dims()[1]; + const int an_num = anchors.size() / 2; + int input_size_h = downsample_ratio * h; + int input_size_w = downsample_ratio * w; + + int bytes = sizeof(int) * anchors.size(); + auto anchors_ptr = + paddle::memory::Alloc(dev_ctx, sizeof(int) * anchors.size()); + int* anchors_data = reinterpret_cast(anchors_ptr->ptr()); + const auto gplace = dev_ctx.GetPlace(); + const auto cplace = phi::CPUPlace(); + paddle::memory::Copy( + gplace, anchors_data, cplace, anchors.data(), bytes, dev_ctx.stream()); + + const T* input_data = input->data(); + const int* imgsize_data = img_size.data(); + T* boxes_data = boxes->mutable_data({n, box_num, 
4}, dev_ctx.GetPlace()); + T* scores_data = + scores->mutable_data({n, box_num, class_num}, dev_ctx.GetPlace()); + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, boxes, static_cast(0)); + set_zero(dev_ctx, scores, static_cast(0)); + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * box_num); + + dim3 thread_num = config.thread_per_block; +#ifdef WITH_NV_JETSON + if (config.compute_capability == 53 || config.compute_capability == 62) { + thread_num = 512; + } +#endif + + KeYoloBoxFw<<>>( + input_data, + imgsize_data, + boxes_data, + scores_data, + conf_thresh, + anchors_data, + n, + h, + w, + an_num, + class_num, + box_num, + input_size_h, + input_size_w, + clip_bbox, + scale, + bias, + iou_aware, + iou_aware_factor); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + yolo_box, GPU, ALL_LAYOUT, phi::YoloBoxKernel, float, double) {} diff --git a/paddle/phi/kernels/yolo_box_kernel.h b/paddle/phi/kernels/yolo_box_kernel.h new file mode 100644 index 00000000000..9553d300cad --- /dev/null +++ b/paddle/phi/kernels/yolo_box_kernel.h @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void YoloBoxKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& img_size, + const std::vector& anchors, + int class_num, + float conf_thresh, + int downsample_ratio, + bool clip_bbox, + float scale_x_y, + bool iou_aware, + float iou_aware_factor, + DenseTensor* boxes, + DenseTensor* scores); + +} // namespace phi diff --git a/paddle/phi/ops/compat/yolo_box_sig.cc b/paddle/phi/ops/compat/yolo_box_sig.cc new file mode 100644 index 00000000000..bb39e72a64f --- /dev/null +++ b/paddle/phi/ops/compat/yolo_box_sig.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature YoloBoxOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("yolo_box", + {"X", "ImgSize"}, + {"anchors", + "class_num", + "conf_thresh", + "downsample_ratio", + "clip_bbox", + "scale_x_y", + "iou_aware", + "iou_aware_factor"}, + {"Boxes", "Scores"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(yolo_box, phi::YoloBoxOpArgumentMapping); diff --git a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py index 043c5c1651a..f210d97362c 100644 --- a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py @@ -260,5 +260,6 @@ class TestYoloBoxOpHW(TestYoloBoxOp): self.iou_aware_factor = 0.5 -if (__name__ == '__main__'): +if __name__ == '__main__': + paddle.enable_static() unittest.main() -- GitLab From bcaf88d20db551f53fbe7672000cbc4053fcae03 Mon Sep 17 00:00:00 2001 From: wangguanqun Date: Sat, 5 Mar 2022 00:11:53 +0800 Subject: [PATCH 065/261] Ps optimizer multi programs (#39883) * fix benchmark and communicator config * fix bugs of the_one_ps * multi program and fix bug in optimizer * multi program in the_one_ps * public commcontext * ps optimizer multi programs * the one ps merge * fix bug in test --- python/paddle/distributed/collective.py | 2 +- .../fleet/meta_optimizers/ps_optimizer.py | 43 +++++++++++++--- .../distributed/passes/ps_trainer_pass.py | 5 ++ python/paddle/distributed/ps/the_one_ps.py | 51 ++++++++++--------- .../ps/utils/ps_program_builder.py | 12 +++-- python/paddle/distributed/ps/utils/public.py | 5 +- .../test_ps_trainer_pass.py | 4 ++ .../tests/unittests/ps/ps_dnn_trainer.py | 1 + 8 files changed, 85 insertions(+), 38 deletions(-) diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 3731332d1e7..8042aced6bb 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -29,7 +29,6 @@ from ..fluid.layers import utils from ..fluid.dygraph import layers from ..fluid.dygraph.parallel import prepare_context import paddle -from .fleet import fleet import paddle.fluid as fluid import paddle.fluid.core as core from paddle import _C_ops @@ -1422,6 +1421,7 @@ def split(x, "graph mode, plese use ParallelEmbedding, ParallelRowLinear, " "ParallelColumnLinear instead.") else: + from .fleet import fleet assert fleet._role_maker, ("To use paddle.distributed.split, " "you must call fleet.init() firstly.") rank = fleet.worker_index() diff --git a/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py index 00937dbe7a4..f786f665ad4 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py @@ -31,14 +31,19 @@ class ParameterServerOptimizer(MetaOptimizerBase): self.inner_opt = optimizer # we do not allow meta optimizer to be inner optimizer currently self.meta_optimizers_white_list = [] - self.pass_ctx = PassContext() def _set_basic_info(self, loss, role_maker, user_defined_optimizer, user_defined_strategy): super(ParameterServerOptimizer, self)._set_basic_info( loss, role_maker, user_defined_optimizer, user_defined_strategy) + def _set_origin_programs(self, losses): + self.origin_main_programs = [] + for loss in losses: + self.origin_main_programs.append(loss.block.program) + def 
_init_ps_pass_context(self, loss, startup_program): + self.pass_ctx = PassContext() attrs = {} # trainer attrs["env"] = get_dist_env() @@ -46,9 +51,9 @@ class ParameterServerOptimizer(MetaOptimizerBase): attrs['loss'] = loss attrs['min_block_size'] = 81920 attrs['origin_main_program'] = loss.block.program - attrs['origin_main_programs'] = [loss.block.program] attrs['origin_startup_program'] = startup_program - attrs['origin_startup_programs'] = [startup_program] + + attrs['origin_main_programs'] = self.origin_main_programs attrs['cloned_main'] = attrs['origin_main_program'].clone() attrs['cloned_startup'] = attrs['origin_startup_program'].clone() @@ -90,10 +95,11 @@ class ParameterServerOptimizer(MetaOptimizerBase): return False def _can_apply(self): - if self._attrs['role_maker']._is_collective or self._attrs[ - 'k_steps'] < 0: + if self.role_maker._is_collective: return False - return True + + k_steps = self.user_defined_strategy.a_sync_configs["k_steps"] + return True if k_steps >= 0 else False def minimize_impl(self, loss, @@ -104,12 +110,37 @@ class ParameterServerOptimizer(MetaOptimizerBase): no_grad_set) if startup_program == None: startup_program = paddle.static.default_startup_program() + print("program after inner optimizer minimize:", + str(loss.block.program)) + self._set_origin_programs([loss]) self._init_ps_pass_context(loss, startup_program) ps_builder = PsProgramBuilderFactory()._create_ps_program_builder( self.pass_ctx) ps_builder._build_programs() return None, None + def minimize_losses_impl(self, + losses, + startup_program=None, + parameter_list=None, + no_grad_set=None): + if parameter_list is None: + parameter_list = [None] * len(losses) + for idx, loss in enumerate(losses): + startup_prog = startup_program[idx] + parameters = parameter_list[idx] + self.inner_opt.minimize(loss, startup_prog, parameters, no_grad_set) + self._set_origin_programs(losses) + for idx, loss in enumerate(losses): + print("ps_optimizer idx loss:", idx, loss) + startup_prog = startup_program[idx] + self._init_ps_pass_context(loss, startup_prog) + ps_builder = PsProgramBuilderFactory()._create_ps_program_builder( + self.pass_ctx) + ps_builder._build_programs() + startup_program[idx] = self.pass_ctx._attrs['cloned_startup'] + return None, None + def _can_apply_geo(self, program): def get_sys_free_mem(): plat = platform.system() diff --git a/python/paddle/distributed/passes/ps_trainer_pass.py b/python/paddle/distributed/passes/ps_trainer_pass.py index 284365ce066..6f72cf1b159 100755 --- a/python/paddle/distributed/passes/ps_trainer_pass.py +++ b/python/paddle/distributed/passes/ps_trainer_pass.py @@ -74,6 +74,8 @@ class AppendSendOpsPass(PassBase): # 该 pass 被多种模式复用 def _apply_single_impl(self, main_program, startup_program, pass_ctx): attrs = pass_ctx._attrs + print("pass loss program id:", id(attrs['loss'].block.program)) + print("pass main program id:", id(main_program)) ps_mode = attrs['ps_mode'] if ps_mode == DistributedMode.GEO: send_ctx = get_geo_trainer_send_context(attrs) # geo 模式 @@ -84,6 +86,8 @@ class AppendSendOpsPass(PassBase): # 该 pass 被多种模式复用 for merged_name, send in send_ctx.items(): if send.is_sparse() and ps_mode != DistributedMode.GEO: continue + if send.program_id() != id(attrs['loss'].block.program): + continue logger.info('merged_name, send: {}, {}'.format(merged_name, send)) is_sparse = 1 if send.is_sparse() else 0 is_sparse = 2 if send.is_distributed() else is_sparse @@ -496,6 +500,7 @@ class DeleteOptimizesPass(PassBase): persistable=True) def _apply_single_impl(self, 
main_program, startup_program, pass_ctx): + print("delete_optimizer_pass") attrs = pass_ctx._attrs optimizer_ops = get_optimize_ops(main_program) lr_ops = get_lr_ops(main_program) diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py index cc744bc9d9e..5170684b432 100755 --- a/python/paddle/distributed/ps/the_one_ps.py +++ b/python/paddle/distributed/ps/the_one_ps.py @@ -40,12 +40,12 @@ def get_program_by_id(context, program_id): programs = context["origin_main_programs"] for i, program in enumerate(programs): if id(program) == program_id: - return program, context["origin_startup_programs"][i] - return None, None + return program, context["origin_startup_programs"][i], i + return None, None, None def parse_table_class(varname, program_id, context): - main_program, startup_program = get_program_by_id(context, program_id) + main_program, startup_program, idx = get_program_by_id(context, program_id) for op in main_program.global_block().ops: if not is_distributed_sparse_op(op) and not is_sparse_op(op): continue @@ -60,7 +60,7 @@ def parse_table_class(varname, program_id, context): def check_embedding_dim(accessor_proto, varname, program_id, context): - main_program, startup_program = get_program_by_id(context, program_id) + main_program, startup_program, idx = get_program_by_id(context, program_id) embedding_dim = 0 for var in main_program.list_vars(): if var.name == varname: @@ -94,10 +94,9 @@ class Service: class GpuService(Service): def __init__(self): - super(GpuService).__init__(self) + super(GpuService, self).__init__() def _set(self, service_proto): - super(GpuService)._set(service_proto) service_proto.server_class = 'PsLocalServer' service_proto.client_class = 'PsLocalClient' @@ -111,7 +110,8 @@ class Accessor: # TableAccessorParameter accessor def _set(self, accessor_proto, varname, program_id, context): - main_program, startup_program = get_program_by_id(context, program_id) + main_program, startup_program, idx = get_program_by_id(context, + program_id) embedding_dim = 0 for var in main_program.list_vars(): if var.name == varname: @@ -236,7 +236,8 @@ class CommonAccessor(Accessor): self.opt_init_map = opt_init_map def parse_entry(self, varname, program_id, context): - main_program, startup_program = get_program_by_id(context, program_id) + main_program, startup_program, idx = get_program_by_id(context, + program_id) for op in main_program.global_block().ops: if not is_distributed_sparse_op(op) and not is_sparse_op(op): continue @@ -290,8 +291,8 @@ class CommonAccessor(Accessor): print("parse_by_optimizer table_id:{} is_datanorm:{}".format( ctx.table_id(), ctx.is_datanorm_table())) - main_program, startup_program = get_program_by_id(context, - ctx.program_id()) + main_program, startup_program, idx = get_program_by_id(context, + ctx.program_id()) pserver_id = get_role_id(context['role_maker']) pserver_num = len(get_ps_endpoints(context['role_maker'])) optimizer_ops = get_optimize_ops(main_program) @@ -359,10 +360,11 @@ class CommonAccessor(Accessor): param = main_program.global_block().vars[oop.input( formal_name)[0]] #TODO: for dense learning_rate, can be different from sparse lr - if formal_name == "LearningRate" and param.name != "learning_rate_0": + if formal_name == "LearningRate" and param.name != "learning_rate_" + str( + idx): warnings.warn("will support decay soon") param = main_program.global_block().vars[ - "learning_rate_0"] + "learning_rate_" + str(idx)] initializer = self.get_initializer_attr(param.name, startup_program) @@ 
-404,10 +406,11 @@ class CommonAccessor(Accessor): else: param = main_program.global_block().vars[oop.input( formal_name)[0]] - if formal_name == "LearningRate" and param.name != "learning_rate_0": + if formal_name == "LearningRate" and param.name != "learning_rate_" + str( + idx): warnings.warn("will support decay soon") param = main_program.global_block().vars[ - "learning_rate_0"] + "learning_rate_" + str(idx)] if shape is None: if is_sparse: @@ -707,6 +710,7 @@ class PsDescBuilder(object): self.ps_mode = context['ps_mode'] self.is_heter_ps_mode = context['is_heter_ps_mode'] self.use_ps_gpu = context['use_ps_gpu'] + self.barrier_table_id = None self.send_ctx = get_the_one_send_context( self.context, use_origin_program=True, @@ -767,6 +771,8 @@ class PsDescBuilder(object): table_proto = self.ps_desc.server_param.downpour_server_param.downpour_table_param.add( ) table._set(table_proto) + if type(table) == BarrierTable and self.barrier_table_id is None: + self.barrier_table_id = table.idx self.service._set( self.ps_desc.server_param.downpour_server_param.service_param) return text_format.MessageToString(self.ps_desc) @@ -820,9 +826,9 @@ class TheOnePSRuntime(RuntimeBase): self.context['tensor_table'] = {} build_var_distributed(self.context) - endpoints = get_ps_endpoints(self.role_maker) + self.endpoints = get_ps_endpoints(self.role_maker) self.string_hosts = [] - for idx, ep in enumerate(endpoints): + for idx, ep in enumerate(self.endpoints): host, port = ep.split(":") pshost = fluid.core.PSHost(host, int(port), idx) self.string_hosts.append(pshost.serialize_to_string()) @@ -848,7 +854,7 @@ class TheOnePSRuntime(RuntimeBase): kwargs["trainer_id"] = self.role_maker._worker_index() return kwargs - proto_txt = worker_desc + "\n" + server_desc + proto_txt = worker_desc debug = bool(int(os.getenv("PSERVER_DEBUG", "0"))) if debug: print("worker: \n{}".format(proto_txt)) @@ -859,7 +865,7 @@ class TheOnePSRuntime(RuntimeBase): self.context, split_dense_table=self.is_heter_ps_mode, use_origin_program=self.is_heter_ps_mode, - ep_list=endpoints) + ep_list=self.endpoints) trainer_config = self.context['trainer'] debug = bool(int(os.getenv("PSERVER_DEBUG", "0"))) @@ -876,10 +882,7 @@ class TheOnePSRuntime(RuntimeBase): kwargs["trainer_id"] = self.role_maker._role_id() kwargs["trainers"] = self.role_maker._worker_num() - for table in server.servers[0].tables: #TODO - if table.table_class == "BarrierTable": - kwargs["barrier_table_id"] = table.id - break + kwargs["barrier_table_id"] = self.ps_desc_builder.barrier_table_id if self.context['ps_mode'] == DistributedMode.SYNC: sync_kwargs = sync_strategy_envs() @@ -1009,7 +1012,7 @@ class TheOnePSRuntime(RuntimeBase): if origin_varname.endswith("@GRAD"): return False - if origin_varname == "learning_rate_0": + if origin_varname.startswith("learning_rate_"): return False if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ @@ -1113,7 +1116,7 @@ class TheOnePSRuntime(RuntimeBase): "in fleet.save() function, executor must be as Executor type") if main_program is None: - main_program = self.context['origin_ps_main_program'] + main_program = self.context['origin_main_program'] if isinstance(main_program, CompiledProgram): raise TypeError( diff --git a/python/paddle/distributed/ps/utils/ps_program_builder.py b/python/paddle/distributed/ps/utils/ps_program_builder.py index d737542f323..ff99f9d071e 100755 --- a/python/paddle/distributed/ps/utils/ps_program_builder.py +++ b/python/paddle/distributed/ps/utils/ps_program_builder.py @@ -88,7 +88,7 @@ class 
GeoPsProgramBuilder(PsProgramBuilder): # 仅 CPU 模式 self.attrs['origin_main_program'] = self.cloned_main if self.launch_barrier and self.launch_barrier_flag: - wait_server_ready(server_endpoints) + wait_server_ready(self.server_endpoints) return @@ -103,10 +103,13 @@ class CpuSyncPsProgramBuilder(PsProgramBuilder): format(self.ps_mode, "PsProgramBuilder")) def _build_trainer_programs(self): + print("build trainer program entry") + print("before ps program builder program:", self.cloned_main) add_lr_decay_table_pass = new_pass("add_lr_decay_table_pass", self.attrs) add_lr_decay_table_pass.apply([], [], self.pass_ctx) + print("before distributed op pass") distributed_ops_pass = new_pass("distributed_ops_pass", self.attrs) distributed_ops_pass.apply([self.cloned_main], [None], self.pass_ctx) @@ -126,9 +129,10 @@ class CpuSyncPsProgramBuilder(PsProgramBuilder): self.attrs['origin_main_program'] = self.cloned_main self.attrs['origin_startup_program'] = self.cloned_startup + print("after ps program builder program:", self.cloned_main) if self.launch_barrier and self.launch_barrier_flag: - wait_server_ready(server_endpoints) + wait_server_ready(self.server_endpoints) return @@ -167,7 +171,7 @@ class GpuPsProgramBuilder(PsProgramBuilder): self.attrs['origin_startup_program'] = self.cloned_startup if self.launch_barrier and self.launch_barrier_flag: - wait_server_ready(server_endpoints) + wait_server_ready(self.server_endpoints) return @@ -220,7 +224,7 @@ class HeterAsyncPsProgramBuilder(PsProgramBuilder): [self.cloned_startup], self.pass_ctx) if self.launch_barrier and self.launch_barrier_flag: - wait_server_ready(server_endpoints) + wait_server_ready(self.server_endpoints) return diff --git a/python/paddle/distributed/ps/utils/public.py b/python/paddle/distributed/ps/utils/public.py index ab5bd7da09d..7839c8520c6 100755 --- a/python/paddle/distributed/ps/utils/public.py +++ b/python/paddle/distributed/ps/utils/public.py @@ -450,9 +450,8 @@ def get_the_one_send_context(context, idx = 0 for i, program in enumerate(origin_programs): merged_dense_pairs = context['merged_dense_pairs'][i] - idx += get_dense_send_context(program, send_ctx, idx, - merged_dense_pairs, trainer_id, - split_dense_table) + idx = get_dense_send_context(program, send_ctx, idx, merged_dense_pairs, + trainer_id, split_dense_table) distibuted_varnames = get_sparse_tablenames(origin_programs, True) print("public distibuted_varnames:", distibuted_varnames) for i, program in enumerate(origin_programs): diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py index fd558ef0403..877136cf6ed 100755 --- a/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py @@ -146,9 +146,13 @@ class TestPsTrainerPass(PsPassTestBase): self.config['ps_mode_config'] = "../ps/gpu_ps_config.yaml" self.config['debug_new_minimize'] = '0' + self.config['log_dir'] = ps_log_root_dir + "gpubox_log_old_minimize" + remove_path_if_exists(self.config['log_dir']) self.ps_launch("gpu-ps") self.config['debug_new_minimize'] = '1' + self.config['log_dir'] = ps_log_root_dir + "gpubox_log_new_minimize" + remove_path_if_exists(self.config['log_dir']) self.ps_launch("gpu-ps") file1 = '/ps_log/gpubox_run_minimize_debug:_0_worker_main.prototxt' diff --git a/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py 
b/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py index bc87fc255a5..0fd64b0d923 100755 --- a/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py +++ b/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py @@ -382,6 +382,7 @@ class DnnTrainer(object): ps_optimizer = ParameterServerOptimizer(inner_optimizer) ps_optimizer._set_basic_info(loss, self.role_maker, inner_optimizer, user_defined_strategy) + ps_optimizer._set_origin_programs([loss]) ps_optimizer._init_ps_pass_context(loss, startup_program) _main = ps_optimizer.pass_ctx._attrs['cloned_main'] -- GitLab From 86eafde9d3c9b9e95f9f7b3594a86217aa2c1a55 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Sat, 5 Mar 2022 08:34:37 +0800 Subject: [PATCH 066/261] fix test jit save load failed (#40180) --- python/paddle/fluid/dygraph/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py index aad77373509..f58952d3036 100644 --- a/python/paddle/fluid/dygraph/io.py +++ b/python/paddle/fluid/dygraph/io.py @@ -885,7 +885,7 @@ def _run_dygraph(instance, input, program_holder): 'start_op_index': 0, 'end_op_index': end_op_index, 'is_test': instance._is_test, - 'program_id': _hash_with_id(trace_program) + 'program_id': _hash_with_id(trace_program, instance) }) # NOTE: [ why need set param's gradient type here ] # if user set sparse gradient mode, the param's gradient -- GitLab From 94f03dc24581ee038d84c53209ff2a67e808d407 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Sat, 5 Mar 2022 13:25:25 +0800 Subject: [PATCH 067/261] support add infershape for no grad op (#40182) --- paddle/fluid/framework/op_registry.h | 4 ++-- paddle/fluid/operators/empty_op.cc | 9 +++------ 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index c45bf32d8b7..eb40a49b406 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -286,8 +286,8 @@ struct OpKernelRegistrarFunctorEx, \ paddle::framework::EmptyGradOpMaker) diff --git a/paddle/fluid/operators/empty_op.cc b/paddle/fluid/operators/empty_op.cc index 6baa504562e..96fa3282d06 100644 --- a/paddle/fluid/operators/empty_op.cc +++ b/paddle/fluid/operators/empty_op.cc @@ -90,9 +90,6 @@ namespace plat = paddle::platform; DELCARE_INFER_SHAPE_FUNCTOR(empty, EmptyInferShapeFunctor, PT_INFER_META(phi::CreateInferMeta)); - -REGISTER_OPERATOR( - empty, ops::EmptyOp, ops::EmptyOpMaker, ops::EmptyOpVarTypeInference, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker, - EmptyInferShapeFunctor); +REGISTER_OP_WITHOUT_GRADIENT(empty, ops::EmptyOp, ops::EmptyOpMaker, + ops::EmptyOpVarTypeInference, + EmptyInferShapeFunctor); -- GitLab From 4be5448b2eba3f49cd964339060f085fc267fe3b Mon Sep 17 00:00:00 2001 From: furnace <34057289+windstamp@users.noreply.github.com> Date: Sat, 5 Mar 2022 17:31:52 +0800 Subject: [PATCH 068/261] [Phi] move infershape for mv (#39954) * [Phi] move infershape for mv * [Phi] delete extra codes for mv --- paddle/fluid/operators/mv_op.cc | 36 ++++++++------------------------- paddle/phi/infermeta/binary.cc | 30 +++++++++++++++++++++++++++ paddle/phi/infermeta/binary.h | 3 +++ paddle/phi/ops/compat/mv_sig.cc | 5 ----- 4 files changed, 41 insertions(+), 33 deletions(-) diff --git a/paddle/fluid/operators/mv_op.cc b/paddle/fluid/operators/mv_op.cc index ab9f10070fc..d34a1ebf82c 100644 --- a/paddle/fluid/operators/mv_op.cc +++ b/paddle/fluid/operators/mv_op.cc @@ -16,8 
+16,11 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -42,33 +45,6 @@ class MVOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(framework::InferShapeContext *context) const override { - OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "mv"); - OP_INOUT_CHECK(context->HasInput("Vec"), "Input", "Vec", "mv"); - OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "mv"); - - auto dim_x = context->GetInputDim("X"); - auto dim_vec = context->GetInputDim("Vec"); - PADDLE_ENFORCE_EQ( - dim_x.size(), 2, - platform::errors::InvalidArgument( - "The rank of input X should be 2, but is %d", dim_x.size())); - PADDLE_ENFORCE_EQ( - dim_vec.size(), 1, - platform::errors::InvalidArgument( - "The rank of input Vec should be 1, but is %d", dim_vec.size())); - PADDLE_ENFORCE_EQ(dim_x[1], dim_vec[0], - platform::errors::InvalidArgument( - "X's second dimension is expected to be equal to " - "Vec's first dimension" - "but recieved X'shape = [%s], Vec's shape = [%s]", - dim_x, dim_vec)); - - framework::DDim dim_out = phi::make_ddim({dim_x[0]}); - - context->SetOutputDim("Out", dim_out); - context->ShareLoD("X", /*->*/ "Out"); - } }; template @@ -118,7 +94,11 @@ class MVOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; +DELCARE_INFER_SHAPE_FUNCTOR(mv, MvInferShapeFunctor, + PT_INFER_META(phi::MvInferMeta)); + REGISTER_OPERATOR(mv, ops::MVOp, ops::MVOpMaker, ops::MVOpGradMaker, - ops::MVOpGradMaker); + ops::MVOpGradMaker, + MvInferShapeFunctor); REGISTER_OPERATOR(mv_grad, ops::MVOpGrad); diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 745ddffabbe..03128e96a83 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -443,4 +443,34 @@ void GatherTreeMeta(const MetaTensor& ids, out->set_dims(ids_dims); } +void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out) { + auto dim_x = x.dims(); + auto dim_vec = vec.dims(); + PADDLE_ENFORCE_EQ( + dim_x.size(), + 2, + phi::errors::InvalidArgument("The rank of input X should be 2, but is %d", + dim_x.size())); + PADDLE_ENFORCE_EQ( + dim_vec.size(), + 1, + phi::errors::InvalidArgument( + "The rank of input Vec should be 1, but is %d", dim_vec.size())); + PADDLE_ENFORCE_EQ(dim_x[1], + dim_vec[0], + phi::errors::InvalidArgument( + "X's second dimension is expected to be equal to " + "Vec's first dimension" + "but recieved X'shape = [%s], Vec's shape = [%s]", + dim_x, + dim_vec)); + + auto dim_out = phi::make_ddim({dim_x[0]}); + + out->set_dims(dim_out); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + out->share_lod(x); +} + } // namespace phi diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 2ec74463698..f397c0def8a 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -85,4 +85,7 @@ void GatherNdInferMeta(const MetaTensor& x, void GatherTreeMeta(const MetaTensor& ids, const MetaTensor& parents, MetaTensor* out); + +void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out); + } // namespace phi diff --git a/paddle/phi/ops/compat/mv_sig.cc 
b/paddle/phi/ops/compat/mv_sig.cc index ab0d31ee31d..0012f8e1ccb 100644 --- a/paddle/phi/ops/compat/mv_sig.cc +++ b/paddle/phi/ops/compat/mv_sig.cc @@ -16,10 +16,6 @@ namespace phi { -KernelSignature MvOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("mv", {"X", "Vec"}, {}, {"Out"}); -} - KernelSignature MvGradOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("mv_grad", {"X", "Vec", GradVarName("Out")}, @@ -29,5 +25,4 @@ KernelSignature MvGradOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi -PD_REGISTER_ARG_MAPPING_FN(mv, phi::MvOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(mv_grad, phi::MvGradOpArgumentMapping); -- GitLab From e7afa3917799b67e44044c95c10603bf626133cf Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Sat, 5 Mar 2022 19:02:26 +0800 Subject: [PATCH 069/261] [Phi] Remove eig op depend for svd_helper (#40174) * remove eig dep for svd helper * fix win failed --- paddle/fluid/operators/eig_op.h | 92 ++++++++++------- paddle/phi/kernels/complex_kernel.h | 60 ++++++++++- paddle/phi/kernels/funcs/diag_functor.h | 99 ++++++++++++++++++ paddle/phi/kernels/funcs/slice.h | 127 ++++++++++++++++++++++++ paddle/phi/kernels/funcs/unsqueeze.h | 41 ++++++++ paddle/phi/kernels/matmul_kernel.h | 4 +- 6 files changed, 379 insertions(+), 44 deletions(-) create mode 100644 paddle/phi/kernels/funcs/slice.h create mode 100644 paddle/phi/kernels/funcs/unsqueeze.h diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h index e9c6c1eb7ec..5e4c83e1a45 100644 --- a/paddle/fluid/operators/eig_op.h +++ b/paddle/fluid/operators/eig_op.h @@ -18,12 +18,19 @@ #include #include #include "paddle/fluid/operators/math/matrix_solve.h" -#include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/complex_kernel.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/diag_functor.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/slice.h" +#include "paddle/phi/kernels/funcs/unsqueeze.h" +#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/matmul_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" + #define EPSILON 1e-6 namespace paddle { @@ -214,12 +221,17 @@ class EigKernel : public framework::OpKernel { ApplyEigKernel>( *x, &real_values, &real_vectors, context); - auto dito = math::DeviceIndependenceTensorOperations< - DeviceContext, phi::dtype::Real, Tout>(context); + + auto& orig_dev_ctx = context.template device_context(); + auto& dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE&>( + orig_dev_ctx); // 1. extract real part & imag part from real_values - Tensor real_part = dito.Slice(real_values, {-1}, {0}, {order}); - Tensor imag_part = dito.Slice(real_values, {-1}, {order}, {order * 2}); + Tensor real_part = + phi::funcs::Slice(dev_ctx, real_values, {-1}, {0}, {order}); + Tensor imag_part = phi::funcs::Slice(dev_ctx, real_values, {-1}, + {order}, {order * 2}); // 2. construct complex values auto* real_part_data = real_part.data>(); @@ -233,7 +245,8 @@ class EigKernel : public framework::OpKernel { for_range(functor); // 3. 
construct complex vectors - Tensor real_vector_trans = dito.Transpose(real_vectors); + Tensor real_vector_trans = + phi::TransposeLast2Dim(dev_ctx, real_vectors); Tensor out_vectors_trans; out_vectors_trans.mutable_data(x->dims(), context.GetPlace()); ConstructComplexVectors, Tout>( @@ -251,45 +264,48 @@ class EigKernel : public framework::OpKernel { } }; -template +template void ComputeBackwardForComplexInput( const Tensor& V, const Tensor& L, const Tensor& gL, const Tensor& gV, - Tout* x_grad_data, int batch_count, int order, + T* x_grad_data, int batch_count, int order, const framework::ExecutionContext& context) { - auto dito = - math::DeviceIndependenceTensorOperations( - context); - - Tensor trans_v = dito.Transpose(V); - Tensor Vh = dito.Conj(trans_v); - Tensor Lconj = dito.Conj(L); - Tensor Econj = dito.Sub(dito.Unsqueeze(Lconj, -2), dito.Unsqueeze(Lconj, -1)); - Tensor VhgV = dito.Matmul(Vh, gV); - Tensor diag_real = dito.Real(VhgV); - Tensor diag_res = dito.BatchDiag(diag_real, batch_count); - Tensor diag_unsqueezed = dito.Unsqueeze(diag_res, -2); + auto& orig_dev_ctx = context.template device_context(); + auto& dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE&>( + orig_dev_ctx); + + Tensor trans_v = phi::TransposeLast2Dim(dev_ctx, V); + Tensor Vh = phi::Conj(dev_ctx, trans_v); + Tensor Lconj = phi::Conj(dev_ctx, L); + Tensor Econj = phi::Subtract(dev_ctx, phi::funcs::Unsqueeze(Lconj, -2), + phi::funcs::Unsqueeze(Lconj, -1)); + Tensor VhgV = phi::Matmul(dev_ctx, Vh, gV); + Tensor diag_real = phi::Real(dev_ctx, VhgV); + Tensor diag_res = phi::funcs::BatchDiag(dev_ctx, diag_real, batch_count); + Tensor diag_unsqueezed = phi::funcs::Unsqueeze(diag_res, -2); // turn diag_unsqueezed into complex auto numel = diag_unsqueezed.numel(); Tensor diag_unsqueezed_complex; - auto* data_diag_un = diag_unsqueezed.data>(); - auto* data_diag_un_com = diag_unsqueezed_complex.mutable_data( + auto* data_diag_un = diag_unsqueezed.data>(); + auto* data_diag_un_com = diag_unsqueezed_complex.mutable_data( diag_unsqueezed.dims(), context.GetPlace(), - static_cast(numel * sizeof(Tout))); - auto& dev_ctx = context.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - phi::funcs::RealToComplexFunctor functor(data_diag_un, data_diag_un_com, - numel); + static_cast(numel * sizeof(T))); + + platform::ForRange for_range(orig_dev_ctx, numel); + phi::funcs::RealToComplexFunctor functor(data_diag_un, data_diag_un_com, + numel); for_range(functor); // real tensor multiply complex tensor in broadcast manner - Tensor res1 = dito.RealMulComplex(V, diag_unsqueezed_complex); - Tensor res2 = dito.Matmul(Vh, res1); - Tensor result = dito.Sub(VhgV, res2); + Tensor res1 = phi::Multiply(dev_ctx, V, diag_unsqueezed_complex); + Tensor res2 = phi::Matmul(dev_ctx, Vh, res1); + Tensor result = phi::Subtract(dev_ctx, VhgV, res2); - result.mutable_data(V.dims(), context.GetPlace()); - result = dito.Div(result, Econj); - result = dito.DiagFill(order, order, order, 0, gL, result); - Tensor rhs = dito.Matmul(result, Vh); + result.mutable_data(V.dims(), context.GetPlace()); + result = phi::Divide(dev_ctx, result, Econj); + result = + phi::funcs::DiagFill(dev_ctx, order, order, order, 0, gL, result); + Tensor rhs = phi::Matmul(dev_ctx, result, Vh); // solve linear system // solve(Vh, rhs, out, m, k) @@ -298,10 +314,10 @@ void ComputeBackwardForComplexInput( // x_grad: out int m = Vh.dims()[Vh.dims().size() - 1]; int k = rhs.dims()[rhs.dims().size() - 1]; - auto* matrix_data = Vh.data(); 
- auto* rhs_data = rhs.data(); - math::SolveLinearSystem(matrix_data, rhs_data, x_grad_data, m, k, - batch_count); + auto* matrix_data = Vh.data(); + auto* rhs_data = rhs.data(); + math::SolveLinearSystem(matrix_data, rhs_data, x_grad_data, m, k, + batch_count); } template diff --git a/paddle/phi/kernels/complex_kernel.h b/paddle/phi/kernels/complex_kernel.h index 3b3003392d3..2c52001ece1 100644 --- a/paddle/phi/kernels/complex_kernel.h +++ b/paddle/phi/kernels/complex_kernel.h @@ -24,6 +24,12 @@ namespace phi { template void ConjKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); +template +void RealKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); + +template +void ImagKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); + // If T is complex template < typename T, @@ -50,10 +56,56 @@ DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) { return x; } -template -void RealKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); +// If T is complex +template < + typename T, + typename Context, + std::enable_if_t>::value || + std::is_same>::value, + bool> = true> +DenseTensor Real(const Context& dev_ctx, const DenseTensor& x) { + auto dense_out = phi::Empty(dev_ctx); + MetaTensor meta_out(&dense_out); + RealAndImagInferMeta(x, &meta_out); + RealKernel(dev_ctx, x, &dense_out); + return dense_out; +} -template -void ImagKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); +// If T is not complex +template < + typename T, + typename Context, + std::enable_if_t>::value && + !std::is_same>::value, + bool> = true> +DenseTensor Real(const Context& dev_ctx, const DenseTensor& x) { + return x; +} + +// If T is complex +template < + typename T, + typename Context, + std::enable_if_t>::value || + std::is_same>::value, + bool> = true> +DenseTensor Imag(const Context& dev_ctx, const DenseTensor& x) { + auto dense_out = phi::Empty(dev_ctx); + MetaTensor meta_out(&dense_out); + RealAndImagInferMeta(x, &meta_out); + ImagKernel(dev_ctx, x, &dense_out); + return dense_out; +} + +// If T is not complex +template < + typename T, + typename Context, + std::enable_if_t>::value && + !std::is_same>::value, + bool> = true> +DenseTensor Imag(const Context& dev_ctx, const DenseTensor& x) { + return x; +} } // namespace phi diff --git a/paddle/phi/kernels/funcs/diag_functor.h b/paddle/phi/kernels/funcs/diag_functor.h index a806d1583a0..1862f5ec91b 100644 --- a/paddle/phi/kernels/funcs/diag_functor.h +++ b/paddle/phi/kernels/funcs/diag_functor.h @@ -14,6 +14,14 @@ #pragma once +#include "paddle/phi/common/type_traits.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/for_range.h" + +// TODO(paddle-dev): Remove this file when we can call related Kernel directly + namespace phi { namespace funcs { @@ -25,5 +33,96 @@ inline int ComputeStride(int axis, phi::DDim dims) { return size; } +template +struct DiagAndFillFunctor { + DiagAndFillFunctor(const int m, + const int n, + const int num_lower_diags, + const int num_upper_diags, + const ValueType* scale, + const T* input, + T* output) + : m_(m), + n_(n), + num_lower_diags_(num_lower_diags), + num_upper_diags_(num_upper_diags), + scale_(scale), + input_(input), + output_(output) {} + + HOSTDEVICE void operator()(size_t index) const { + const int col = index % n_; + const int row = (index / n_) % m_; + const int band_start = (num_lower_diags_ < 0 ? 
0 : row - num_lower_diags_); + const int band_end = + (num_upper_diags_ < 0 ? n_ : row + num_upper_diags_ + 1); + if (col < band_start || col >= band_end) { + output_[index] = input_[index]; + } else if (col == band_end - 1) { + output_[index] = static_cast(scale_[index % m_]); + } else { + output_[index] = input_[index]; + } + } + + private: + const int m_, n_, num_lower_diags_, num_upper_diags_; + const ValueType* scale_; + const T* input_; + T* output_; +}; + +template +DenseTensor DiagFill(const Context& dev_ctx, + const int m, + const int n, + const int num_lower_diags, + const int num_upper_diags, + const DenseTensor& scale, + const DenseTensor& input) { + DenseTensor out; + out.Resize(input.dims()); + dev_ctx.template Alloc(&out); + funcs::ForRange for_range(dev_ctx, input.numel()); + DiagAndFillFunctor diag_and_copy_functor( + m, + n, + num_lower_diags, + num_upper_diags, + scale.data(), + input.data(), + out.data()); + for_range(diag_and_copy_functor); + return out; +} + +template +DenseTensor BatchDiag(const Context& dev_ctx, const DenseTensor& x, int batch) { + DenseTensor out; + auto* x_data = x.data>(); + auto numel = x.numel(); + out.Resize(x.dims()); + auto* out_data = dev_ctx.template HostAlloc>( + &out, static_cast(numel * sizeof(phi::dtype::Real))); + + auto x_dims = x.dims(); + int num_dims = x_dims.size(); + std::vector out_shape; + + for (int i = 0; i < num_dims - 1; ++i) { + out_shape.push_back(x.dims()[i]); + } + out.Resize(phi::make_ddim(out_shape)); + int order = x.dims()[num_dims - 1]; + int stride_out = order * order; + int stride_in = order + 1; + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < order; ++j) { + out_data[i * order + j] = x_data[stride_out * i + stride_in * j]; + } + } + return out; +} + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/slice.h b/paddle/phi/kernels/funcs/slice.h new file mode 100644 index 00000000000..0a50dceb0a0 --- /dev/null +++ b/paddle/phi/kernels/funcs/slice.h @@ -0,0 +1,127 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
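// Standalone sketch (not part of the patch) of the indexing that BatchDiag above
// performs: it walks the main diagonal of each [order x order] matrix in a flat batch
// buffer. Plain arrays stand in for DenseTensor so the example stays self-contained.
#include <cstdio>
#include <vector>

int main() {
  const int batch = 2, order = 3;
  std::vector<float> x(batch * order * order);
  for (size_t i = 0; i < x.size(); ++i) x[i] = static_cast<float>(i);
  std::vector<float> out(batch * order);
  const int stride_out = order * order;  // offset between consecutive matrices
  const int stride_in = order + 1;       // offset between consecutive diagonal entries
  for (int i = 0; i < batch; ++i)
    for (int j = 0; j < order; ++j)
      out[i * order + j] = x[stride_out * i + stride_in * j];  // x[i][j][j]
  for (float v : out) std::printf("%g ", v);  // prints: 0 4 8 9 13 17
  return 0;
}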
+ +#pragma once + +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +// TODO(paddle-dev): Remove this file when we can call related Kernel directly + +namespace phi { +namespace funcs { + +template +void EigenSliceWrapper(const Context& dev_ctx, + const DenseTensor* in, + const std::vector& start, + const std::vector& end, + DenseTensor* out) { + // Slice by call Eigen Tensor Function `.slice()` + size_t rank = in->dims().size(); + PADDLE_ENFORCE_EQ(start.size(), + rank, + errors::InvalidArgument( + "EigenSliceWrapper function start " + "argument must have the same length as input rank.")); + PADDLE_ENFORCE_EQ(end.size(), + rank, + errors::InvalidArgument( + "EigenSliceWrapper function end " + "argument must have the same length as input rank.")); + auto eigen_place_ptr = dev_ctx.eigen_device(); + auto eigen_place = *eigen_place_ptr; + auto out_t = phi::EigenTensor::From(*out, out->dims()); + auto in_t = phi::EigenTensor::From(*in, in->dims()); + Eigen::DSizes offsets_32bit, extents_32bit; + for (size_t i = 0; i < D; i++) { + offsets_32bit[i] = start[i]; + extents_32bit[i] = end[i]; + } + EigenSlice, T, D>::Eval( + eigen_place, + phi::To32BitIndex(out_t), + phi::To32BitIndex(in_t), + offsets_32bit, + extents_32bit); +} + +#define SLICE_RANK_CASE(N) \ + case N: { \ + EigenSliceWrapper(dev_ctx, &x, offset, extends, &ret); \ + break; \ + } + +template +DenseTensor Slice(const Context& dev_ctx, + const DenseTensor& x, + std::vector axes, + std::vector starts, + std::vector ends) { + DenseTensor ret; + std::vector new_axes = axes; + std::vector out_shape = phi::vectorize(x.dims()); + size_t rank = out_shape.size(); + PADDLE_ENFORCE_EQ( + axes.size(), + starts.size(), + errors::InvalidArgument("Slice Operator Argument Invalided")); + PADDLE_ENFORCE_EQ( + ends.size(), + starts.size(), + errors::InvalidArgument("Slice Operator Argument Invalided")); + for (unsigned int i = 0; i < axes.size(); ++i) { + int axis = axes[i]; + if (axis < 0) axis = rank + axis; + new_axes[i] = axis; // change negative to positive + int st = starts[i]; + int ed = ends[i]; + PADDLE_ENFORCE_GT( + ed, + st, + errors::InvalidArgument("C++ Slice Operation Not Support End < Start")); + out_shape[axis] = ed - st; + } + std::vector offset(rank), extends(rank); + for (size_t i = 0; i < rank; ++i) { + offset[i] = 0; + extends[i] = x.dims()[i]; + } + for (size_t i = 0; i < new_axes.size(); ++i) { + offset[new_axes[i]] = starts[i]; + extends[new_axes[i]] = ends[i] - starts[i]; + } + ret.Resize(phi::make_ddim(out_shape)); + dev_ctx.template Alloc(&ret); + switch (rank) { + SLICE_RANK_CASE(1); + SLICE_RANK_CASE(2); + SLICE_RANK_CASE(3); + SLICE_RANK_CASE(4); + SLICE_RANK_CASE(5); + SLICE_RANK_CASE(6); + default: { + PADDLE_THROW( + errors::InvalidArgument("Invalid Rank number, " + "currently only support rank between 2~6")); + } + } + return ret; +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/unsqueeze.h b/paddle/phi/kernels/funcs/unsqueeze.h new file mode 100644 index 00000000000..7b8a81471ef --- /dev/null +++ b/paddle/phi/kernels/funcs/unsqueeze.h @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/dense_tensor.h" + +// TODO(paddle-dev): Remove this file when we can call related Kernel directly + +namespace phi { +namespace funcs { + +inline const DenseTensor Unsqueeze(const DenseTensor& x, int axis = 0) { + // don't copy data, only change the dims + DenseTensor out(x); + std::vector out_shape = phi::vectorize(x.dims()); + if (axis >= 0) { + auto index = (out_shape.begin() + axis); + out_shape.insert(index, 1); + } else if (axis < 0) { + auto index = (out_shape.end() + axis + 1); + out_shape.insert(index, 1); + } + out.Resize(phi::make_ddim(out_shape)); + return out; +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/matmul_kernel.h b/paddle/phi/kernels/matmul_kernel.h index 8fc060d2e3d..1f1cb22c271 100644 --- a/paddle/phi/kernels/matmul_kernel.h +++ b/paddle/phi/kernels/matmul_kernel.h @@ -33,8 +33,8 @@ template DenseTensor Matmul(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, - bool transpose_x, - bool transpose_y) { + bool transpose_x = false, + bool transpose_y = false) { auto dense_out = Empty(dev_ctx); MetaTensor meta_out(&dense_out); MatmulInferMeta(x, y, transpose_x, transpose_y, &meta_out); -- GitLab From a3f28a31839b00b7b822ec90cdcce88687ed8fac Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Sun, 6 Mar 2022 00:33:44 +0800 Subject: [PATCH 070/261] =?UTF-8?q?=E3=80=90Phi=E3=80=91Migrate=20triangul?= =?UTF-8?q?ar=5Fsolve=20op=20into=20phi=20(#40093)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Migrate triangular_solve op into phi * fix CI * move MatrixReduceSum to phi funcs * move MatrixReduceSum to phi funcs * fix comment * fic CI --- paddle/fluid/operators/triangular_solve_op.cc | 72 ++------- paddle/fluid/operators/triangular_solve_op.cu | 65 -------- paddle/fluid/operators/triangular_solve_op.h | 139 +----------------- paddle/phi/infermeta/binary.cc | 59 ++++++++ paddle/phi/infermeta/binary.h | 7 + paddle/phi/kernels/CMakeLists.txt | 3 +- .../cpu/triangular_solve_grad_kernel.cc | 23 +++ .../kernels/cpu/triangular_solve_kernel.cc | 84 +++++++++++ paddle/phi/kernels/funcs/CMakeLists.txt | 1 + paddle/phi/kernels/funcs/common_shape.h | 66 +++++++++ paddle/phi/kernels/funcs/matrix_reduce.cc | 59 ++++++++ paddle/phi/kernels/funcs/matrix_reduce.cu | 62 ++++++++ paddle/phi/kernels/funcs/matrix_reduce.h | 34 +++++ .../gpu/triangular_solve_grad_kernel.cu | 23 +++ .../kernels/gpu/triangular_solve_kernel.cu | 132 +++++++++++++++++ .../impl/triangular_solve_grad_kernel_impl.h | 138 +++++++++++++++++ .../kernels/triangular_solve_grad_kernel.h | 36 +++++ paddle/phi/kernels/triangular_solve_kernel.h | 30 ++++ paddle/phi/ops/compat/triangular_solve_sig.cc | 30 ++++ 19 files changed, 802 insertions(+), 261 deletions(-) delete mode 100644 paddle/fluid/operators/triangular_solve_op.cu create mode 100644 paddle/phi/kernels/cpu/triangular_solve_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/triangular_solve_kernel.cc create mode 100644 
paddle/phi/kernels/funcs/matrix_reduce.cc create mode 100644 paddle/phi/kernels/funcs/matrix_reduce.cu create mode 100644 paddle/phi/kernels/funcs/matrix_reduce.h create mode 100644 paddle/phi/kernels/gpu/triangular_solve_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/triangular_solve_kernel.cu create mode 100644 paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/triangular_solve_grad_kernel.h create mode 100644 paddle/phi/kernels/triangular_solve_kernel.h create mode 100644 paddle/phi/ops/compat/triangular_solve_sig.cc diff --git a/paddle/fluid/operators/triangular_solve_op.cc b/paddle/fluid/operators/triangular_solve_op.cc index 9233917b093..179f818104c 100644 --- a/paddle/fluid/operators/triangular_solve_op.cc +++ b/paddle/fluid/operators/triangular_solve_op.cc @@ -13,7 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/triangular_solve_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/solve_op.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -22,58 +25,6 @@ class TriangularSolveOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "TriangularSolve"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "TriangularSolve"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "TriangularSolve"); - - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - - auto x_dims_n = x_dims.size(); - auto y_dims_n = y_dims.size(); - - PADDLE_ENFORCE_GE( - x_dims_n, 2, platform::errors::InvalidArgument( - "The input tensor X's dimensions of TriangularSolveOp " - "should be >= 2. But received X's " - "dimensions = %d, X's shape = [%s]", - x_dims.size(), x_dims)); - - PADDLE_ENFORCE_GE( - y_dims_n, 2, platform::errors::InvalidArgument( - "The input tensor Y's dimensions of TriangularSolveOp " - "should be >=2. 
But received Y's " - "dimensions = %d, Y's shape = [%s]", - y_dims.size(), y_dims)); - - PADDLE_ENFORCE_EQ(x_dims[x_dims_n - 2], x_dims[x_dims_n - 1], - platform::errors::InvalidArgument( - "The inner-most 2 dimensions of Input(X) all should " - "be square matrices " - "But received X's shape[-2] = %d and shape[-1] = %d.", - x_dims[x_dims_n - 2], x_dims[x_dims_n - 1])); - - std::vector x_dims_vec = phi::vectorize(x_dims); - std::vector y_dims_vec = phi::vectorize(y_dims); - - std::vector x_dims_vec_cut(x_dims_vec.begin(), - x_dims_vec.end() - 2); - std::vector y_dims_vec_cut(y_dims_vec.begin(), - y_dims_vec.end() - 2); - - std::vector expand_batch_portion = - get_broadcast_batch_portion(x_dims_vec_cut, y_dims_vec_cut); - - std::vector y_broadcast_dims({expand_batch_portion}); - y_broadcast_dims.insert(y_broadcast_dims.end(), {y_dims_vec[y_dims_n - 2], - y_dims_vec[y_dims_n - 1]}); - - // dim of 'Out' is the same with 'Y' after broadcast - ctx->SetOutputDim("Out", phi::make_ddim(y_broadcast_dims)); - ctx->ShareLoD("X", /*->*/ "Out"); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const { return framework::OpKernelType( @@ -168,20 +119,15 @@ class TriangularSolveOpGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DELCARE_INFER_SHAPE_FUNCTOR(triangular_solve, TriangularSolveInferShapeFunctor, + PT_INFER_META(phi::TriangularSolveInferMeta)); + REGISTER_OPERATOR(triangular_solve, ops::TriangularSolveOp, ops::TriangularSolveOpMaker, ops::TriangularSolveOpInferVarType, ops::TriangularSolveOpGradMaker, - ops::TriangularSolveOpGradMaker); + ops::TriangularSolveOpGradMaker, + TriangularSolveInferShapeFunctor); REGISTER_OPERATOR(triangular_solve_grad, ops::TriangularSolveGradOp); - -REGISTER_OP_CPU_KERNEL( - triangular_solve, - ops::TriangularSolveKernel, - ops::TriangularSolveKernel); - -REGISTER_OP_CPU_KERNEL( - triangular_solve_grad, - ops::TriangularSolveGradKernel, - ops::TriangularSolveGradKernel); diff --git a/paddle/fluid/operators/triangular_solve_op.cu b/paddle/fluid/operators/triangular_solve_op.cu deleted file mode 100644 index 7df98517e84..00000000000 --- a/paddle/fluid/operators/triangular_solve_op.cu +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -#include "paddle/fluid/operators/triangular_solve_op.h" - -namespace paddle { -namespace operators { - -template -class MatrixReduceSumFunctor { - public: - void operator()(const Tensor& in, Tensor* out, - const framework::ExecutionContext& ctx) { - // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3] - // out_reduce_dim should be [0, 2] - const std::vector in_dims = phi::vectorize(in.dims()); - auto in_size = in_dims.size(); - const std::vector out_dims = phi::vectorize(out->dims()); - auto out_size = out_dims.size(); - - std::vector out_bst_dims(in_size); - - std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1); - std::copy(out_dims.data(), out_dims.data() + out_size, - out_bst_dims.data() + in_size - out_size); - - std::vector out_reduce_dims; - for (size_t idx = 0; idx <= in_size - 3; idx++) { - if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) { - out_reduce_dims.push_back(idx); - } - } - gpuStream_t stream = ctx.cuda_device_context().stream(); - TensorReduceImpl>( - ctx.cuda_device_context(), in, out, kps::IdentityFunctor(), - out_reduce_dims, stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - triangular_solve, - ops::TriangularSolveKernel, - ops::TriangularSolveKernel); - -REGISTER_OP_CUDA_KERNEL( - triangular_solve_grad, - ops::TriangularSolveGradKernel, - ops::TriangularSolveGradKernel); diff --git a/paddle/fluid/operators/triangular_solve_op.h b/paddle/fluid/operators/triangular_solve_op.h index 4e68add096f..315847b4d80 100644 --- a/paddle/fluid/operators/triangular_solve_op.h +++ b/paddle/fluid/operators/triangular_solve_op.h @@ -21,7 +21,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/solve_op.h" #include "paddle/fluid/operators/tril_triu_op.h" #include "paddle/phi/core/ddim.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/complex_functors.h" namespace paddle { @@ -30,10 +29,10 @@ namespace operators { using Tensor = framework::Tensor; template -static void triangular_solve(const DeviceContext& context, const Tensor& x, - const Tensor& y, Tensor* out, bool upper, +static void triangular_solve(const DeviceContext &context, const Tensor &x, + const Tensor &y, Tensor *out, bool upper, bool transpose, bool unitriangular) { - // Tensor broadcast use eigen + // Tensor broadcast use eigen library std::vector x_bst_dims_vec; std::vector y_bst_dims_vec; std::tie(x_bst_dims_vec, y_bst_dims_vec) = get_broadcast_dims(x, y); @@ -64,15 +63,15 @@ static void triangular_solve(const DeviceContext& context, const Tensor& x, template class MatrixReduceSumFunctor { public: - void operator()(const Tensor& input, Tensor* output, - const framework::ExecutionContext& ctx); + void operator()(const Tensor &input, Tensor *output, + const framework::ExecutionContext &ctx); }; template class MatrixReduceSumFunctor { public: - void operator()(const Tensor& in, Tensor* out, - const framework::ExecutionContext& ctx) { + void operator()(const Tensor &in, Tensor *out, + const framework::ExecutionContext &ctx) { // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3] // out_reduce_dim should be [0, 2] const std::vector in_dims = phi::vectorize(in.dims()); @@ -101,129 +100,5 @@ class MatrixReduceSumFunctor { } }; -template -class TriangularSolveKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* x = ctx.Input("X"); - const auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - - bool upper = ctx.template Attr("upper"); - bool transpose = ctx.template Attr("transpose"); - bool unitriangular = ctx.template Attr("unitriangular"); - - const auto& dev_ctx = ctx.template device_context(); - triangular_solve(dev_ctx, *x, *y, out, upper, transpose, - unitriangular); - } -}; - -template -class TriangularSolveGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* x = ctx.Input("X"); - const auto* y = ctx.Input("Y"); - const auto* out = ctx.Input("Out"); - const auto* dout = - ctx.Input(framework::GradVarName("Out")); - - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - bool upper = ctx.template Attr("upper"); - bool transpose = ctx.template Attr("transpose"); - bool unitriangular = ctx.template Attr("unitriangular"); - - auto& dev_ctx = ctx.template device_context(); - - std::vector x_bst_dims_vec; - std::vector y_bst_dims_vec; - std::tie(x_bst_dims_vec, y_bst_dims_vec) = get_broadcast_dims(*x, *y); - - Tensor dy_bst(y->type()); - if (dy) { - dy->mutable_data(y->dims(), dev_ctx.GetPlace()); - dy_bst.Resize(phi::make_ddim(y_bst_dims_vec)); - dy_bst.mutable_data(dev_ctx.GetPlace()); - - // calculate x's conjugate for complex - Tensor x_conj(x->type()); - platform::ForRange x_for_range(dev_ctx, x->numel()); - phi::funcs::ConjFunctor x_functor( - x->data(), x->numel(), - x_conj.mutable_data(x->dims(), dev_ctx.GetPlace())); - x_for_range(x_functor); - - // reuse forward to get dy_bst, and the result has been broadcated. 
- triangular_solve(dev_ctx, x_conj, *dout, &dy_bst, upper, - !transpose, unitriangular); - - if (dy_bst.dims() == dy->dims()) { - framework::TensorCopy(dy_bst, dev_ctx.GetPlace(), dev_ctx, dy); - } else { - MatrixReduceSumFunctor functor; - functor(dy_bst, dy, ctx); - dy->Resize(y->dims()); - } - } - - Tensor dx_bst(x->type()); - if (dx) { - dx->mutable_data(x->dims(), dev_ctx.GetPlace()); - dx_bst.Resize(phi::make_ddim(x_bst_dims_vec)); - dx_bst.mutable_data(dev_ctx.GetPlace()); - - // calculate out's conjugate for complex - Tensor out_conj(out->type()); - platform::ForRange out_for_range(dev_ctx, out->numel()); - phi::funcs::ConjFunctor out_functor( - out->data(), out->numel(), - out_conj.mutable_data(out->dims(), dev_ctx.GetPlace())); - out_for_range(out_functor); - - auto blas = phi::funcs::GetBlas(ctx); - if (transpose) { - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(out_conj.dims(), 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(dy_bst.dims(), 0, true); - blas.MatMul(out_conj, mat_dim_a, dy_bst, mat_dim_b, static_cast(-1), - &dx_bst, static_cast(0)); - } else { - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(dy_bst.dims(), 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(out_conj.dims(), 0, true); - blas.MatMul(dy_bst, mat_dim_a, out_conj, mat_dim_b, static_cast(-1), - &dx_bst, static_cast(0)); - } - - Tensor dx_bst_upper(x->type()); - // get upper or lower triangular - dx_bst_upper.Resize(dx_bst.dims()); - dx_bst_upper.mutable_data(dev_ctx.GetPlace()); - - const auto& dims = dx_bst.dims(); - const auto H = dims[dims.size() - 2]; - const auto W = dims[dims.size() - 1]; - platform::ForRange x_for_range(dev_ctx, dx_bst.numel()); - TrilTriuCompute tril_triu_computer(dx_bst.data(), unitriangular, - !upper, H, W, - dx_bst_upper.data()); - x_for_range(tril_triu_computer); - - if (dx_bst_upper.dims() == dx->dims()) { - framework::TensorCopy(dx_bst_upper, dev_ctx.GetPlace(), dev_ctx, dx); - } else { - MatrixReduceSumFunctor functor; - functor(dx_bst_upper, dx, ctx); - dx->Resize(x->dims()); - } - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 03128e96a83..c017e5864aa 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -274,6 +274,65 @@ void HuberLossInferMeta(const MetaTensor& input, out->share_lod(input); } +void TriangularSolveInferMeta(const MetaTensor& x, + const MetaTensor& y, + bool upper, + bool transpose, + bool unitriangular, + MetaTensor* out) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + + auto x_dims_n = x_dims.size(); + auto y_dims_n = y_dims.size(); + + PADDLE_ENFORCE_GE(x_dims_n, + 2, + phi::errors::InvalidArgument( + "The input tensor X's dimensions of TriangularSolveOp " + "should be >= 2. But received X's " + "dimensions = %d, X's shape = [%s]", + x_dims.size(), + x_dims)); + + PADDLE_ENFORCE_GE(y_dims_n, + 2, + phi::errors::InvalidArgument( + "The input tensor Y's dimensions of TriangularSolveOp " + "should be >=2. 
But received Y's " + "dimensions = %d, Y's shape = [%s]", + y_dims.size(), + y_dims)); + + PADDLE_ENFORCE_EQ(x_dims[x_dims_n - 2], + x_dims[x_dims_n - 1], + phi::errors::InvalidArgument( + "The inner-most 2 dimensions of Input(X) all should " + "be square matrices " + "But received X's shape[-2] = %d and shape[-1] = %d.", + x_dims[x_dims_n - 2], + x_dims[x_dims_n - 1])); + + std::vector x_dims_vec = phi::vectorize(x_dims); + std::vector y_dims_vec = phi::vectorize(y_dims); + + std::vector x_dims_vec_cut(x_dims_vec.begin(), x_dims_vec.end() - 2); + std::vector y_dims_vec_cut(y_dims_vec.begin(), y_dims_vec.end() - 2); + + std::vector expand_batch_portion = + funcs::MatrixGetBroadcastBatchPortion(x_dims_vec_cut, y_dims_vec_cut); + + std::vector y_broadcast_dims({expand_batch_portion}); + y_broadcast_dims.insert(y_broadcast_dims.end(), + {y_dims_vec[y_dims_n - 2], y_dims_vec[y_dims_n - 1]}); + + // dim of 'out' is the same with 'Y' after broadcast + out->set_dims(phi::make_ddim(y_broadcast_dims)); + out->set_dtype(y.dtype()); + out->set_layout(y.layout()); + out->share_lod(y); +} + void IndexSampleInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index f397c0def8a..976c17cd8d9 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -62,6 +62,13 @@ void HuberLossInferMeta(const MetaTensor& input_meta, MetaTensor* residual, MetaConfig config = MetaConfig()); +void TriangularSolveInferMeta(const MetaTensor& x, + const MetaTensor& y, + bool upper, + bool transpose, + bool unitriangular, + MetaTensor* out); + void IndexSampleInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out, diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 4ffa1826a29..e9108787082 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -18,10 +18,11 @@ set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) # NOTE: Some kernels depend on some targets that are not commonly used. # These targets are not suitable for common dependencies. # In this case, you need to manually generate them here. -set(MANUAL_BUILD_KERNELS math_kernel softmax_kernel softmax_grad_kernel) +set(MANUAL_BUILD_KERNELS math_kernel softmax_kernel softmax_grad_kernel triangular_solve_grad_kernel) kernel_library(math_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel) kernel_library(softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) +kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_reduce) # auto parse and build kernel targets by cmake register_kernels(EXCLUDES ${MANUAL_BUILD_KERNELS} DEPS ${COMMON_KERNEL_DEPS}) diff --git a/paddle/phi/kernels/cpu/triangular_solve_grad_kernel.cc b/paddle/phi/kernels/cpu/triangular_solve_grad_kernel.cc new file mode 100644 index 00000000000..80b2015f731 --- /dev/null +++ b/paddle/phi/kernels/cpu/triangular_solve_grad_kernel.cc @@ -0,0 +1,23 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(triangular_solve_grad, + CPU, + ALL_LAYOUT, + phi::TriangularSolveGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/triangular_solve_kernel.cc b/paddle/phi/kernels/cpu/triangular_solve_kernel.cc new file mode 100644 index 00000000000..5aca5be1279 --- /dev/null +++ b/paddle/phi/kernels/cpu/triangular_solve_kernel.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/triangular_solve_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/expand_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/common_shape.h" + +namespace phi { + +template +void TriangularSolveKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + bool upper, + bool transpose, + bool unitriangular, + DenseTensor* out) { + // get broadcast dim + std::vector x_bst_dims_vec; + std::vector y_bst_dims_vec; + std::tie(x_bst_dims_vec, y_bst_dims_vec) = + funcs::MatrixGetBroadcastDims(x, y); + int x_bst_ndim = x_bst_dims_vec.size(); + int y_bst_ndim = y_bst_dims_vec.size(); + + // Tensor broadcast to 'out' and temp 'x_bst' + ScalarArray x_bst_dims(x_bst_dims_vec); + DenseTensor x_bst = phi::Empty(dev_ctx, x_bst_dims); + const T* x_bst_data = x_bst.data(); + ExpandKernel(dev_ctx, x, x_bst_dims, &x_bst); + + out->Resize(phi::make_ddim(y_bst_dims_vec)); + T* out_data = dev_ctx.template Alloc(out); + ScalarArray y_bst_dims(y_bst_dims_vec); + ExpandKernel(dev_ctx, y, y_bst_dims, out); + + // Calculate use blas library + int M = static_cast(y_bst_dims_vec[y_bst_ndim - 2]); + int N = static_cast(y_bst_dims_vec[y_bst_ndim - 1]); + int batch_size = 1; + for (int i = 0; i < x_bst_ndim - 2; i++) { + batch_size *= x_bst_dims_vec[i]; + } + + auto blas = phi::funcs::GetBlas(dev_ctx); + for (int i = 0; i < batch_size; i++) { + blas.TRSM(CblasLeft, + upper ? CblasUpper : CblasLower, + transpose ? CblasTrans : CblasNoTrans, + unitriangular ? 
CblasUnit : CblasNonUnit, + M, + N, + T(1), + x_bst_data + i * M * M, + std::max(1, M), + out_data + i * N * M, + std::max(1, N)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(triangular_solve, + CPU, + ALL_LAYOUT, + phi::TriangularSolveKernel, + float, + double) {} diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index 8b8697b6df1..02cba6009c4 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -8,3 +8,4 @@ math_library(sequence2batch) math_library(gru_compute DEPS activation_functions math_function) math_library(lstm_compute DEPS activation_functions) math_library(concat_and_split_functor DEPS dense_tensor) +math_library(matrix_reduce DEPS dense_tensor) diff --git a/paddle/phi/kernels/funcs/common_shape.h b/paddle/phi/kernels/funcs/common_shape.h index dce80caab72..139341536de 100644 --- a/paddle/phi/kernels/funcs/common_shape.h +++ b/paddle/phi/kernels/funcs/common_shape.h @@ -140,6 +140,72 @@ inline bool CheckDims(const DDim &dims_x, const DDim &dims_y) { return true; } +// Just For Matrix OP, for example: +// x's dim = [5, 3, 2, M, M] ; y's dim = [3, 1, M, N] +// out [5, 3, 2], which is batch_size of matrix +static inline std::vector MatrixGetBroadcastBatchPortion( + std::vector x, std::vector y) { + size_t size_x = x.size(); + size_t size_y = y.size(); + size_t size = std::max(size_x, size_y); + std::vector batchPortion(size); + + ptrdiff_t i = (ptrdiff_t)size - 1; + for (; i >= 0; --i) { + ptrdiff_t offset = size - i - 1; + ptrdiff_t dim_x = size_x - offset - 1; + ptrdiff_t dim_y = size_y - offset - 1; + int64_t x_size = (dim_x >= 0) ? x[dim_x] : 1; + int64_t y_size = (dim_y >= 0) ? y[dim_y] : 1; + + PADDLE_ENFORCE_EQ( + (x_size == y_size || x_size == 1 || y_size == 1), + true, + phi::errors::PreconditionNotMet( + "The size of tensor x (%d) must match the size of tensor y " + "(%d) at non-singleton dimension %d.", + x_size, + y_size, + i)); + + batchPortion[i] = x_size != 1 ? 
x_size : y_size; + } + return batchPortion; +} + +// Just For Matrix OP, for example: +// x's dim = [5, 3, 2, M, M] ; y's dim = [3, 1, M, N] +// out shoule be [5, 3, 2, M, M] + [5, 3, 2, M, N], and [5, 3, 2] is +// batch_size of matrix +static inline std::tuple, std::vector> +MatrixGetBroadcastDims(const DenseTensor &x, const DenseTensor &y) { + std::vector x_dims_vec = phi::vectorize(x.dims()); + std::vector y_dims_vec = phi::vectorize(y.dims()); + + std::vector::const_iterator f1 = x_dims_vec.begin(); + std::vector::const_iterator l1 = x_dims_vec.end() - 2; + std::vector x_dims_vec_cut(f1, l1); + + std::vector::const_iterator f2 = y_dims_vec.begin(); + std::vector::const_iterator l2 = y_dims_vec.end() - 2; + std::vector y_dims_vec_cut(f2, l2); + + std::vector expand_batch_portion = + MatrixGetBroadcastBatchPortion(x_dims_vec_cut, y_dims_vec_cut); + + std::vector x_expand_size({expand_batch_portion}); + x_expand_size.insert(x_expand_size.end(), + {x_dims_vec[static_cast(x_dims_vec.size()) - 2], + x_dims_vec[static_cast(x_dims_vec.size()) - 1]}); + + std::vector y_expand_size({expand_batch_portion}); + y_expand_size.insert(y_expand_size.end(), + {y_dims_vec[static_cast(y_dims_vec.size()) - 2], + y_dims_vec[static_cast(y_dims_vec.size()) - 1]}); + + return std::make_tuple(x_expand_size, y_expand_size); +} + inline DDim GetOutputDims(const DDim &s_dims, const DDim &l_dims) { if (s_dims.size() > l_dims.size()) { return GetOutputDims(l_dims, s_dims); diff --git a/paddle/phi/kernels/funcs/matrix_reduce.cc b/paddle/phi/kernels/funcs/matrix_reduce.cc new file mode 100644 index 00000000000..849fd7a0075 --- /dev/null +++ b/paddle/phi/kernels/funcs/matrix_reduce.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
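// Standalone sketch (not patch code) of the broadcast rule used by
// MatrixGetBroadcastBatchPortion above: batch dims are aligned from the right, each
// pair must match or be 1, and the non-1 size wins. The helper name is made up for
// the example and the error check on mismatched sizes is omitted.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<int64_t> BatchPortionSketch(const std::vector<int64_t>& x,
                                        const std::vector<int64_t>& y) {
  size_t n = std::max(x.size(), y.size());
  std::vector<int64_t> out(n);
  for (size_t i = 0; i < n; ++i) {  // i counts from the trailing dimension
    int64_t xs = i < x.size() ? x[x.size() - 1 - i] : 1;
    int64_t ys = i < y.size() ? y[y.size() - 1 - i] : 1;
    out[n - 1 - i] = xs != 1 ? xs : ys;
  }
  return out;
}

int main() {
  // X: [5, 3, 2, M, M], Y: [3, 1, M, N]  ->  batch dims [5, 3, 2] and [3, 1]
  auto b = BatchPortionSketch({5, 3, 2}, {3, 1});
  for (auto v : b) std::printf("%lld ", static_cast<long long>(v));  // 5 3 2
  return 0;
}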
+ +#include "paddle/phi/kernels/funcs/matrix_reduce.h" + +#include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" + +namespace phi { +namespace funcs { + +template +class MatrixReduceSumFunctor { + public: + void operator()(const CPUContext& dev_ctx, + const DenseTensor& in, + DenseTensor* out) { + // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3] + // out_reduce_dim should be [0, 2] + const std::vector in_dims = phi::vectorize(in.dims()); + auto in_size = in_dims.size(); + const std::vector out_dims = phi::vectorize(out->dims()); + auto out_size = out_dims.size(); + + std::vector out_bst_dims(in_size); + + std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1); + std::copy(out_dims.data(), + out_dims.data() + out_size, + out_bst_dims.data() + in_size - out_size); + out->Resize(phi::make_ddim(out_bst_dims)); + + std::vector out_reduce_dims; + for (size_t idx = 0; idx <= in_size - 3; idx++) { + if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) { + out_reduce_dims.push_back(idx); + } + } + phi::ReduceKernelImpl( + dev_ctx, in, out, out_reduce_dims, true, false); + } +}; + +template class MatrixReduceSumFunctor; +template class MatrixReduceSumFunctor; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/matrix_reduce.cu b/paddle/phi/kernels/funcs/matrix_reduce.cu new file mode 100644 index 00000000000..5e288c6e9c2 --- /dev/null +++ b/paddle/phi/kernels/funcs/matrix_reduce.cu @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
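// Standalone sketch (not patch code) of how the functor above picks its reduce axes:
// the 'out' dims are right-aligned against the 'in' dims, and every batch axis where
// 'in' is larger than 1 but the aligned 'out' is 1 gets summed over.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> in = {5, 3, 2, 7, 3};
  std::vector<int> out = {3, 1, 7, 3};
  std::vector<int> out_bst(in.size(), 1);
  std::copy(out.begin(), out.end(), out_bst.begin() + (in.size() - out.size()));
  // out_bst is now {1, 3, 1, 7, 3}
  for (size_t i = 0; i + 2 < in.size(); ++i) {  // batch axes only, skip the last two
    if (in[i] != 1 && out_bst[i] == 1) std::printf("reduce axis %zu\n", i);
  }
  // prints: reduce axis 0, reduce axis 2
  return 0;
}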
+ +#include "paddle/phi/kernels/funcs/matrix_reduce.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" + +namespace phi { +namespace funcs { + +template +class MatrixReduceSumFunctor { + public: + void operator()(const GPUContext& dev_ctx, + const DenseTensor& in, + DenseTensor* out) { + // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3] + // out_reduce_dim should be [0, 2] + const std::vector in_dims = phi::vectorize(in.dims()); + auto in_size = in_dims.size(); + const std::vector out_dims = phi::vectorize(out->dims()); + auto out_size = out_dims.size(); + + std::vector out_bst_dims(in_size); + + std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1); + std::copy(out_dims.data(), + out_dims.data() + out_size, + out_bst_dims.data() + in_size - out_size); + out->Resize(phi::make_ddim(out_bst_dims)); + + std::vector out_reduce_dims; + for (size_t idx = 0; idx <= in_size - 3; idx++) { + if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) { + out_reduce_dims.push_back(idx); + } + } + TensorReduceImpl>( + dev_ctx, + in, + out, + kps::IdentityFunctor(), + out_reduce_dims, + dev_ctx.stream()); + } +}; + +template class MatrixReduceSumFunctor; +template class MatrixReduceSumFunctor; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/matrix_reduce.h b/paddle/phi/kernels/funcs/matrix_reduce.h new file mode 100644 index 00000000000..22bddacd43d --- /dev/null +++ b/paddle/phi/kernels/funcs/matrix_reduce.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { +namespace funcs { + +// Use For Matrix OP, reduce_sum 'in' according to out's dim +// for example: in's dim = [5, 3, 2, M, N] ; out's dim = [3, 1, M, N] +// axis [0, 2] of DenseTensor 'in' will be reduced +template +class MatrixReduceSumFunctor { + public: + void operator()(const Context& dev_ctx, + const DenseTensor& in, + DenseTensor* out); +}; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/triangular_solve_grad_kernel.cu b/paddle/phi/kernels/gpu/triangular_solve_grad_kernel.cu new file mode 100644 index 00000000000..f7eaa485797 --- /dev/null +++ b/paddle/phi/kernels/gpu/triangular_solve_grad_kernel.cu @@ -0,0 +1,23 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
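// Hypothetical call-site sketch for the functor declared above (not part of the
// patch; the tensor names are invented, and a ready phi::CPUContext `dev_ctx` plus
// pre-allocated DenseTensors are assumed):
//
//   DenseTensor grad_bst;  // dims [5, 3, 2, M, N], gradient in the broadcast shape
//   DenseTensor grad;      // dims [3, 1, M, N], gradient in the parameter's shape
//   phi::funcs::MatrixReduceSumFunctor<float, phi::CPUContext> reduce_sum;
//   reduce_sum(dev_ctx, grad_bst, &grad);  // sums over broadcast axes 0 and 2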
+ +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(triangular_solve_grad, + GPU, + ALL_LAYOUT, + phi::TriangularSolveGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/triangular_solve_kernel.cu b/paddle/phi/kernels/gpu/triangular_solve_kernel.cu new file mode 100644 index 00000000000..f137d8e1c26 --- /dev/null +++ b/paddle/phi/kernels/gpu/triangular_solve_kernel.cu @@ -0,0 +1,132 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/triangular_solve_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/expand_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/common_shape.h" + +// See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/memory.h" + +namespace phi { + +template +void TriangularSolveKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + bool upper, + bool transpose, + bool unitriangular, + DenseTensor* out) { + // get broadcast dim + std::vector x_bst_dims_vec; + std::vector y_bst_dims_vec; + std::tie(x_bst_dims_vec, y_bst_dims_vec) = + funcs::MatrixGetBroadcastDims(x, y); + int x_bst_ndim = x_bst_dims_vec.size(); + int y_bst_ndim = y_bst_dims_vec.size(); + + // Tensor broadcast to 'out' and temp 'x_bst' + ScalarArray x_bst_dims(x_bst_dims_vec); + DenseTensor x_bst = phi::Empty(dev_ctx, x_bst_dims); + const T* x_bst_data = x_bst.data(); + ExpandKernel(dev_ctx, x, x_bst_dims, &x_bst); + + out->Resize(phi::make_ddim(y_bst_dims_vec)); + T* out_data = dev_ctx.template Alloc(out); + ScalarArray y_bst_dims(y_bst_dims_vec); + ExpandKernel(dev_ctx, y, y_bst_dims, out); + + // calculate use cublas library + CBLAS_UPLO uplo = upper ? CblasUpper : CblasLower; + CBLAS_TRANSPOSE transA = transpose ? CblasTrans : CblasNoTrans; + CBLAS_DIAG diag = unitriangular ? CblasUnit : CblasNonUnit; + + int M = static_cast(y_bst_dims_vec[y_bst_ndim - 2]); + int N = static_cast(y_bst_dims_vec[y_bst_ndim - 1]); + auto lda = std::max(1, M); + auto ldb = std::max(1, N); + + int batch_size = 1; + for (int i = 0; i < x_bst_ndim - 2; i++) { + batch_size *= x_bst_dims_vec[i]; + } + + auto blas = phi::funcs::GetBlas(dev_ctx); + if (batch_size <= 8 && M >= 64) { + for (auto i = 0; i < batch_size; i++) { + blas.TRSM(CblasLeft, + uplo, + transA, + diag, + M, + N, + T(1), + x_bst_data + i * M * M, + lda, + out_data + i * N * M, + ldb); + } + } else { + std::vector cpu_ptrs(batch_size * 2); + for (int i = 0; i < batch_size; ++i) { + cpu_ptrs[i] = x_bst_data + i * M * M; + cpu_ptrs[i + batch_size] = out_data + i * M * N; + } + + // Copy the addresses of A and tmp_b from host to device. 
+ paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data = + paddle::memory::Alloc(dev_ctx, cpu_ptrs.size() * sizeof(T*)); + + paddle::memory::Copy(dev_ctx.GetPlace(), + tmp_gpu_ptrs_data->ptr(), + paddle::platform::CPUPlace(), + static_cast(cpu_ptrs.data()), + cpu_ptrs.size() * sizeof(T*), + dev_ctx.stream()); + + const T** gpu_a_ptrs = + reinterpret_cast(tmp_gpu_ptrs_data->ptr()); + T** gpu_b_ptrs = + reinterpret_cast(tmp_gpu_ptrs_data->ptr()) + batch_size; + blas.BatchedTRSM(CblasLeft, + uplo, + transA, + diag, + M, + N, + static_cast(1.0), + gpu_a_ptrs, + lda, + gpu_b_ptrs, + ldb, + batch_size); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(triangular_solve, + GPU, + ALL_LAYOUT, + phi::TriangularSolveKernel, + float, + double) {} diff --git a/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h new file mode 100644 index 00000000000..a6868ebe6ca --- /dev/null +++ b/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h @@ -0,0 +1,138 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/triangular_solve_grad_kernel.h" + +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/common_shape.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/matrix_reduce.h" +#include "paddle/phi/kernels/triangular_solve_kernel.h" + +// See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/operators/tril_triu_op.h" + +namespace phi { + +template +void TriangularSolveGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + bool upper, + bool transpose, + bool unitriangular, + DenseTensor* dx, + DenseTensor* dy) { + std::vector x_bst_dims_vec; + std::vector y_bst_dims_vec; + std::tie(x_bst_dims_vec, y_bst_dims_vec) = + funcs::MatrixGetBroadcastDims(x, y); + + ScalarArray y_bst_dims_array(y_bst_dims_vec); + DenseTensor dy_bst = phi::Empty(dev_ctx, y_bst_dims_array); + if (dy) { + // calculate x's conjugate for complex + DenseTensor x_conj = phi::Empty(dev_ctx); + x_conj.Resize(x.dims()); + + phi::funcs::ForRange x_for_range(dev_ctx, x.numel()); + phi::funcs::ConjFunctor x_functor( + x.data(), x.numel(), dev_ctx.template Alloc(&x_conj)); + x_for_range(x_functor); + + // reuse forward to get dy_bst, and the result has been broadcated already. 
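// Why reusing the forward kernel is valid here (informal note): the op solves
// op(A) * X = B for X, so for fixed A the map B -> X is linear with matrix
// op(A)^{-1}. Its adjoint gives dL/dB = op(A)^{-H} * dL/dX, which is just another
// triangular solve with the conjugated factor and the transpose flag flipped:
// exactly the x_conj and !transpose arguments passed below.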
+ TriangularSolveKernel( + dev_ctx, x_conj, dout, upper, !transpose, unitriangular, &dy_bst); + + dy->Resize(y.dims()); + dev_ctx.template Alloc(dy); + if (dy_bst.dims() == y.dims()) { + Copy(dev_ctx, dy_bst, dev_ctx.GetPlace(), false, dy); + } else { + funcs::MatrixReduceSumFunctor functor; + functor(dev_ctx, dy_bst, dy); + dy->Resize(y.dims()); + } + } + + ScalarArray x_bst_dims_array(x_bst_dims_vec); + DenseTensor dx_bst = phi::Empty(dev_ctx, x_bst_dims_array); + if (dx) { + // calculate x's conjugate for complex + DenseTensor out_conj = phi::Empty(dev_ctx); + out_conj.Resize(out.dims()); + + phi::funcs::ForRange out_for_range(dev_ctx, out.numel()); + phi::funcs::ConjFunctor out_functor( + out.data(), out.numel(), dev_ctx.template Alloc(&out_conj)); + out_for_range(out_functor); + + auto blas = phi::funcs::GetBlas(dev_ctx); + if (transpose) { + auto mat_dim_a = + phi::funcs::CreateMatrixDescriptor(out_conj.dims(), 0, false); + auto mat_dim_b = + phi::funcs::CreateMatrixDescriptor(dy_bst.dims(), 0, true); + blas.MatMul(out_conj, + mat_dim_a, + dy_bst, + mat_dim_b, + static_cast(-1), + &dx_bst, + static_cast(0)); + } else { + auto mat_dim_a = + phi::funcs::CreateMatrixDescriptor(dy_bst.dims(), 0, false); + auto mat_dim_b = + phi::funcs::CreateMatrixDescriptor(out_conj.dims(), 0, true); + blas.MatMul(dy_bst, + mat_dim_a, + out_conj, + mat_dim_b, + static_cast(-1), + &dx_bst, + static_cast(0)); + } + + // get upper or lower triangular + DenseTensor dx_bst_upper = + phi::Empty(dev_ctx, x_bst_dims_array); + + const auto& dims = dx_bst.dims(); + const auto H = dims[dims.size() - 2]; + const auto W = dims[dims.size() - 1]; + phi::funcs::ForRange x_for_range(dev_ctx, dx_bst.numel()); + paddle::operators::TrilTriuCompute tril_triu_functor( + dx_bst.data(), unitriangular, !upper, H, W, dx_bst_upper.data()); + x_for_range(tril_triu_functor); + + dx->Resize(x.dims()); + dev_ctx.template Alloc(dx); + if (dx_bst.dims() == x.dims()) { + Copy(dev_ctx, dx_bst_upper, dev_ctx.GetPlace(), false, dx); + } else { + funcs::MatrixReduceSumFunctor functor; + functor(dev_ctx, dx_bst_upper, dx); + dx->Resize(x.dims()); + } + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/triangular_solve_grad_kernel.h b/paddle/phi/kernels/triangular_solve_grad_kernel.h new file mode 100644 index 00000000000..eb5a5ab461a --- /dev/null +++ b/paddle/phi/kernels/triangular_solve_grad_kernel.h @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
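// Informal summary of the dX path implemented above (not part of the patch): for the
// non-transposed case A * X = B one has dL/dA = -A^{-H} * dL/dX * X^H, and since
// dL/dB already equals A^{-H} * dL/dX (the dy_bst tensor), the code forms
// -dy_bst * out^H; the transposed case is handled analogously with the factors
// swapped. TrilTriuCompute then keeps only the relevant triangular part (zeroing the
// diagonal when unitriangular is set), and MatrixReduceSumFunctor sums away any
// broadcast batch axes so dX matches the original shape of x.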
+ +#pragma once + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void TriangularSolveGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + bool upper, + bool transpose, + bool unitriangular, + DenseTensor* dx, + DenseTensor* dy); + +} // namespace phi diff --git a/paddle/phi/kernels/triangular_solve_kernel.h b/paddle/phi/kernels/triangular_solve_kernel.h new file mode 100644 index 00000000000..833de3f8439 --- /dev/null +++ b/paddle/phi/kernels/triangular_solve_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void TriangularSolveKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + bool upper, + bool transpose, + bool unitriangular, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/triangular_solve_sig.cc b/paddle/phi/ops/compat/triangular_solve_sig.cc new file mode 100644 index 00000000000..c56af3e21e5 --- /dev/null +++ b/paddle/phi/ops/compat/triangular_solve_sig.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
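// Hypothetical direct call-site sketch for the kernel declared above (not part of
// the patch; variable names are invented and a ready phi::CPUContext `dev_ctx` with
// allocated inputs is assumed):
//
//   DenseTensor a;    // [..., M, M], triangular coefficient matrices
//   DenseTensor b;    // [..., M, N], right-hand sides
//   DenseTensor out;
//   phi::TriangularSolveKernel<float, phi::CPUContext>(
//       dev_ctx, a, b, /*upper=*/true, /*transpose=*/false,
//       /*unitriangular=*/false, &out);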
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature TriangularSolveGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("triangular_solve_grad", + {"X", "Y", "Out", GradVarName("Out")}, + {"upper", "transpose", "unitriangular"}, + {GradVarName("X"), GradVarName("Y")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(triangular_solve_grad, + phi::TriangularSolveGradOpArgumentMapping); -- GitLab From 7e076e7b750b20c27c1f230b14bc794c2231c897 Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Sun, 6 Mar 2022 09:36:54 +0800 Subject: [PATCH 071/261] [PHI] Move dist op to phi (#40178) * move dist op to phi * fix * fix * fix as reviews --- paddle/fluid/operators/dist_op.cc | 17 +- paddle/fluid/operators/dist_op.h | 304 ------------------ paddle/phi/infermeta/binary.cc | 23 ++ paddle/phi/infermeta/binary.h | 5 + paddle/phi/kernels/cpu/dist_grad_kernel.cc | 22 ++ paddle/phi/kernels/cpu/dist_kernel.cc | 21 ++ paddle/phi/kernels/dist_grad_kernel.h | 31 ++ paddle/phi/kernels/dist_kernel.h | 28 ++ paddle/phi/kernels/gpu/dist_grad_kernel.cu | 26 ++ .../kernels/gpu/dist_kernel.cu} | 21 +- .../phi/kernels/impl/dist_grad_kernel_impl.h | 223 +++++++++++++ paddle/phi/kernels/impl/dist_kernel_impl.h | 164 ++++++++++ paddle/phi/ops/compat/dist_sig.cc | 28 ++ 13 files changed, 588 insertions(+), 325 deletions(-) delete mode 100644 paddle/fluid/operators/dist_op.h create mode 100644 paddle/phi/kernels/cpu/dist_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/dist_kernel.cc create mode 100644 paddle/phi/kernels/dist_grad_kernel.h create mode 100644 paddle/phi/kernels/dist_kernel.h create mode 100644 paddle/phi/kernels/gpu/dist_grad_kernel.cu rename paddle/{fluid/operators/dist_op.cu => phi/kernels/gpu/dist_kernel.cu} (51%) create mode 100644 paddle/phi/kernels/impl/dist_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/dist_kernel_impl.h create mode 100644 paddle/phi/ops/compat/dist_sig.cc diff --git a/paddle/fluid/operators/dist_op.cc b/paddle/fluid/operators/dist_op.cc index 3a53f136556..10750574c45 100644 --- a/paddle/fluid/operators/dist_op.cc +++ b/paddle/fluid/operators/dist_op.cc @@ -12,10 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/dist_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -121,13 +124,11 @@ class DistGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(dist, DistInferShapeFunctor, + PT_INFER_META(phi::DistInferMeta)); + REGISTER_OPERATOR(dist, ops::DistOp, ops::DistOpMaker, ops::DistGradOpMaker, - ops::DistGradOpMaker); + ops::DistGradOpMaker, + DistInferShapeFunctor); REGISTER_OPERATOR(dist_grad, ops::DistOpGrad); -REGISTER_OP_CPU_KERNEL( - dist, ops::DistKernel, - ops::DistKernel); -REGISTER_OP_CPU_KERNEL( - dist_grad, ops::DistGradKernel, - ops::DistGradKernel) diff --git a/paddle/fluid/operators/dist_op.h b/paddle/fluid/operators/dist_op.h deleted file mode 100644 index dfd7e29a8d0..00000000000 --- a/paddle/fluid/operators/dist_op.h +++ /dev/null @@ -1,304 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -using EigenTensor = framework::EigenTensor; -using framework::Tensor; - -template -static void GetBraodcastDims(const framework::DDim& x_dims, - const framework::DDim& y_dims, - Eigen::DSizes* x_bcast_dims, - Eigen::DSizes* y_bcast_dims) { - int bcast_dims_remainder = 0; - for (int i = 0; i < x_dims.size(); ++i) { - if (x_dims[i] >= y_dims[i]) { - (*x_bcast_dims)[i] = 1; - (*y_bcast_dims)[i] = x_dims[i] / y_dims[i]; - bcast_dims_remainder += x_dims[i] % y_dims[i]; - } else { - (*y_bcast_dims)[i] = 1; - (*x_bcast_dims)[i] = y_dims[i] / x_dims[i]; - bcast_dims_remainder += y_dims[i] % x_dims[i]; - } - } - PADDLE_ENFORCE_EQ(bcast_dims_remainder, 0, - platform::errors::PreconditionNotMet( - "The input tensor of Op(dist) could not be broadcast, " - "X's shape is [%s], Y's shape is [%s].", - x_dims, y_dims)); -} - -static framework::DDim GetNewDims(const framework::DDim& in_dims, int rank) { - std::vector new_dims_vec(rank); - if (in_dims.size() < rank) { - for (int i = 0; i < rank - in_dims.size(); ++i) { - new_dims_vec[i] = 1; - } - for (int i = 0; i < in_dims.size(); ++i) { - new_dims_vec[i + rank - in_dims.size()] = in_dims[i]; - } - } else { - new_dims_vec = vectorize(in_dims); - } - return phi::make_ddim(new_dims_vec); -} - -template -static void DistFunction(const framework::ExecutionContext& context) { - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* out = context.Output("Out"); - auto p = context.Attr("p"); - out->mutable_data(context.GetPlace()); - - auto x_dims = context.Input("X")->dims(); - auto y_dims = context.Input("Y")->dims(); - - // new dims with same size as rank, e.g. 
(rank=3, (4, 3) => (1, 4, 3)) - framework::DDim x_new_dims = GetNewDims(x_dims, Rank); - framework::DDim y_new_dims = GetNewDims(y_dims, Rank); - - auto x_t = EigenTensor::From(*x, x_new_dims); - auto y_t = EigenTensor::From(*y, y_new_dims); - auto out_t = EigenTensor::From(*out); - auto& place = - *context.template device_context().eigen_device(); - - Eigen::DSizes x_bcast_dims; - Eigen::DSizes y_bcast_dims; - GetBraodcastDims(x_new_dims, y_new_dims, &x_bcast_dims, &y_bcast_dims); - // p=0 means number of non-zero elements of (x-y) - // p=inf means the maximum of |x-y| - // p=-inf means the minimum of |x-y| - // otherwise, Lp-norm = pow(sum(pow(|x-y|, p)), 1/p) - if (p == 0) { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) != y_t.broadcast(y_bcast_dims)) - .template cast() - .sum(); - } else if (p == INFINITY) { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims)) - .abs() - .maximum(); - } else if (p == -INFINITY) { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims)) - .abs() - .minimum(); - } else { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims)) - .abs() - .pow(p) - .sum() - .pow(1.0 / p); - } -} - -template -static void DistGradFunction(const framework::ExecutionContext& context) { - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* out = context.Input("Out"); - auto p = context.Attr("p"); - - auto x_grad = context.Output(framework::GradVarName("X")); - auto y_grad = context.Output(framework::GradVarName("Y")); - auto out_grad = context.Input(framework::GradVarName("Out")); - - auto x_dims = context.Input("X")->dims(); - auto y_dims = context.Input("Y")->dims(); - auto out_dims = context.Input("Out")->dims(); - - framework::DDim x_new_dims = GetNewDims(x_dims, Rank); - framework::DDim y_new_dims = GetNewDims(y_dims, Rank); - framework::DDim out_new_dims = GetNewDims(out_dims, Rank); - auto x_t = EigenTensor::From(*x, x_new_dims); - auto y_t = EigenTensor::From(*y, y_new_dims); - auto out_t = EigenTensor::From(*out, out_new_dims); - - Eigen::DSizes x_bcast_dims; - Eigen::DSizes y_bcast_dims; - Eigen::DSizes out_bcast_dims; - - GetBraodcastDims(x_new_dims, y_new_dims, &x_bcast_dims, &y_bcast_dims); - std::vector new_dims_vec(Rank); - for (int i = 0; i < Rank; ++i) { - new_dims_vec[i] = std::max(x_new_dims[i], y_new_dims[i]); - out_bcast_dims[i] = new_dims_vec[i]; - } - framework::DDim new_dims = phi::make_ddim(new_dims_vec); - - auto& place = - *context.template device_context().eigen_device(); - auto out_grad_t = EigenTensor::From(*out_grad, out_new_dims); - framework::Tensor grad; - grad.mutable_data(new_dims, context.GetPlace()); - auto grad_t = EigenTensor::From(grad); - - auto x_minux_y = x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims); - auto x_minux_y_abs = x_minux_y.abs(); - auto sign = - (x_minux_y > static_cast(0)).template cast() * static_cast(1.0) + - (x_minux_y < static_cast(0)).template cast() * static_cast(-1.0); - T epsilon = static_cast(1.0e-10f); - - // 1: Lp-norm(z), z = x-y, compute dz - if (p == 0) { - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - set_zero(dev_ctx, &grad, static_cast(0)); - } else if (p == INFINITY || p == -INFINITY) { - // p=inf or -inf, Lp-norm = |z_i|, the j-th element of dz tends to 0 if - // j!=i, or equals to sign(z_i) * dout if j=i. 
- if (platform::is_cpu_place(context.GetPlace())) { - grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims)) - .template cast() * - sign.eval() * out_grad_t.broadcast(out_bcast_dims); - } else { - grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims)) - .template cast() * - sign * out_grad_t.broadcast(out_bcast_dims); - } - } else { - // dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout - if (platform::is_cpu_place(context.GetPlace())) { - grad_t.device(place) = - (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) - .pow(p - 1) * - sign.eval() * out_grad_t.broadcast(out_bcast_dims); - } else { - grad_t.device(place) = - (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) - .pow(p - 1) * - sign * out_grad_t.broadcast(out_bcast_dims); - } - } - - Eigen::DSizes x_reshape_dims; - Eigen::DSizes y_reshape_dims; - Eigen::DSizes reduce_dims; - for (int i = 0; i < x_new_dims.size(); ++i) { - x_reshape_dims[2 * i] = x_bcast_dims[i]; - x_reshape_dims[2 * i + 1] = x_new_dims[i]; - y_reshape_dims[2 * i] = y_bcast_dims[i]; - y_reshape_dims[2 * i + 1] = y_new_dims[i]; - reduce_dims[i] = 2 * i; - } - - // 2: if x or y is broadcasted in forward function, - // the grad need to be sum along the broadcasted dimensions - if (x_grad) { - x_grad->mutable_data(context.GetPlace()); - auto x_grad_t = EigenTensor::From(*x_grad, x_new_dims); - x_grad_t.device(place) = grad_t.reshape(x_reshape_dims) - .sum(reduce_dims) - .reshape(x_grad_t.dimensions()); - } - if (y_grad) { - y_grad->mutable_data(context.GetPlace()); - auto y_grad_t = EigenTensor::From(*y_grad, y_new_dims); - y_grad_t.device(place) = -grad_t.reshape(y_reshape_dims) - .sum(reduce_dims) - .reshape(y_grad_t.dimensions()); - } -} - -template -class DistKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto x_rank = context.Input("X")->dims().size(); - auto y_rank = context.Input("Y")->dims().size(); - auto rank = std::max(x_rank, y_rank); - PADDLE_ENFORCE_LE(rank, 6, - platform::errors::Unimplemented( - "Op(dist) only support tensors with no more than 6 " - "dimensions, but X's rank is %d, Y's rank is %d.", - x_rank, y_rank)); - switch (rank) { - case 1: - DistFunction(context); - break; - case 2: - DistFunction(context); - break; - case 3: - DistFunction(context); - break; - case 4: - DistFunction(context); - break; - case 5: - DistFunction(context); - break; - case 6: - DistFunction(context); - break; - } - } -}; - -template -class DistGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto x_rank = context.Input("X")->dims().size(); - auto y_rank = context.Input("Y")->dims().size(); - auto rank = std::max(x_rank, y_rank); - PADDLE_ENFORCE_LE(rank, 6, - platform::errors::Unimplemented( - "Op(dist) only support tensors with no more than 6 " - "dimensions, but X's rank is %d, Y's rank is %d.", - x_rank, y_rank)); - switch (rank) { - case 1: - DistGradFunction(context); - break; - case 2: - DistGradFunction(context); - break; - case 3: - DistGradFunction(context); - break; - case 4: - DistGradFunction(context); - break; - case 5: - DistGradFunction(context); - break; - case 6: - DistGradFunction(context); - break; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index c017e5864aa..94b489906c6 100644 --- a/paddle/phi/infermeta/binary.cc +++ 
b/paddle/phi/infermeta/binary.cc @@ -456,6 +456,29 @@ void BCELossInferMeta(const MetaTensor& input, out->share_lod(input); } +void DistInferMeta(const MetaTensor& x, + const MetaTensor& y, + float p, + MetaTensor* out) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + + PADDLE_ENFORCE_NE(phi::product(x_dims), + 0, + phi::errors::InvalidArgument( + "The Input(X) has not been initialized properly. The " + "shape of Input(X) = [%s].", + x_dims)); + PADDLE_ENFORCE_NE(phi::product(y_dims), + 0, + phi::errors::InvalidArgument( + "The Input(Y) has not been initialized properly. The " + "shape of Input(Y) = [%s].", + y_dims)); + out->set_dims({1}); + out->set_dtype(x.dtype()); +} + void GatherNdInferMeta(const MetaTensor& x, const MetaTensor& index, MetaTensor* out) { diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 976c17cd8d9..caf9185c900 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -85,6 +85,11 @@ void BCELossInferMeta(const MetaTensor& input, MetaTensor* out, MetaConfig config = MetaConfig()); +void DistInferMeta(const MetaTensor& x, + const MetaTensor& y, + float p, + MetaTensor* out); + void GatherNdInferMeta(const MetaTensor& x, const MetaTensor& index, MetaTensor* out); diff --git a/paddle/phi/kernels/cpu/dist_grad_kernel.cc b/paddle/phi/kernels/cpu/dist_grad_kernel.cc new file mode 100644 index 00000000000..2b7f8f98f94 --- /dev/null +++ b/paddle/phi/kernels/cpu/dist_grad_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/dist_grad_kernel.h" +#include "paddle/phi/kernels/impl/dist_grad_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + dist_grad, CPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/dist_kernel.cc b/paddle/phi/kernels/cpu/dist_kernel.cc new file mode 100644 index 00000000000..ccf3d4be832 --- /dev/null +++ b/paddle/phi/kernels/cpu/dist_kernel.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
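The DistInferMeta added above only checks that both inputs are non-empty and then fixes the output to a single-element tensor of the input dtype; the CPU registrations below it cover float and double. A small shape check through the public API, assuming a build from around this point in the tree (where the result still carries shape [1]); illustrative only, not part of the diff:

    import paddle

    x = paddle.rand([2, 3, 4])
    y = paddle.rand([3, 4])     # lower-rank input; the kernel broadcasts it against x
    out = paddle.dist(x, y, p=2)
    print(out.shape)            # [1] -- a single-element result, per DistInferMeta above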
+ +#include "paddle/phi/kernels/dist_kernel.h" +#include "paddle/phi/kernels/impl/dist_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(dist, CPU, ALL_LAYOUT, phi::DistKernel, float, double) {} diff --git a/paddle/phi/kernels/dist_grad_kernel.h b/paddle/phi/kernels/dist_grad_kernel.h new file mode 100644 index 00000000000..1f8d7ff21f2 --- /dev/null +++ b/paddle/phi/kernels/dist_grad_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void DistGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& out_grad, + float p, + DenseTensor* x_grad, + DenseTensor* y_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/dist_kernel.h b/paddle/phi/kernels/dist_kernel.h new file mode 100644 index 00000000000..6cb3d6e0e8b --- /dev/null +++ b/paddle/phi/kernels/dist_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void DistKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + float p, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/dist_grad_kernel.cu b/paddle/phi/kernels/gpu/dist_grad_kernel.cu new file mode 100644 index 00000000000..c458f8cce3e --- /dev/null +++ b/paddle/phi/kernels/gpu/dist_grad_kernel.cu @@ -0,0 +1,26 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/dist_grad_kernel.h" +#include "paddle/phi/kernels/impl/dist_grad_kernel_impl.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(dist_grad, GPU, ALL_LAYOUT, phi::DistGradKernel, float) {} +#else +PD_REGISTER_KERNEL( + dist_grad, GPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {} +#endif diff --git a/paddle/fluid/operators/dist_op.cu b/paddle/phi/kernels/gpu/dist_kernel.cu similarity index 51% rename from paddle/fluid/operators/dist_op.cu rename to paddle/phi/kernels/gpu/dist_kernel.cu index 90674969e28..87e75e02754 100644 --- a/paddle/fluid/operators/dist_op.cu +++ b/paddle/phi/kernels/gpu/dist_kernel.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,21 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/dist_op.h" +#include "paddle/phi/kernels/dist_kernel.h" +#include "paddle/phi/kernels/impl/dist_kernel_impl.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" -namespace ops = paddle::operators; #ifdef PADDLE_WITH_HIP // Eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h:922 // do not support double in HIPCC platform (Eigen3 to be fixed) -REGISTER_OP_CUDA_KERNEL( - dist, ops::DistKernel); -REGISTER_OP_CUDA_KERNEL( - dist_grad, ops::DistGradKernel); +PD_REGISTER_KERNEL(dist, GPU, ALL_LAYOUT, phi::DistKernel, float) {} #else -REGISTER_OP_CUDA_KERNEL( - dist, ops::DistKernel, - ops::DistKernel); -REGISTER_OP_CUDA_KERNEL( - dist_grad, ops::DistGradKernel, - ops::DistGradKernel); +PD_REGISTER_KERNEL(dist, GPU, ALL_LAYOUT, phi::DistKernel, float, double) {} #endif diff --git a/paddle/phi/kernels/impl/dist_grad_kernel_impl.h b/paddle/phi/kernels/impl/dist_grad_kernel_impl.h new file mode 100644 index 00000000000..fc118a832dc --- /dev/null +++ b/paddle/phi/kernels/impl/dist_grad_kernel_impl.h @@ -0,0 +1,223 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
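The HIP branches above drop double because the Eigen reduction used on this path does not support it under HIPCC, as the inherited comment notes; CUDA builds keep both float and double. A quick dtype check, assuming a CUDA-enabled install (illustrative only, not part of the diff):

    import paddle

    paddle.set_device("gpu")                    # assumes a CUDA build; use "cpu" otherwise
    x = paddle.rand([8, 8], dtype="float64")
    y = paddle.rand([8, 8], dtype="float64")
    print(paddle.dist(x, y, p=2))               # float64 runs on CUDA; ROCm registers float32 only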
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +using ETensor = phi::EigenTensor; + +template +static void GetBraodcastDims(const phi::DDim& x_dims, + const phi::DDim& y_dims, + Eigen::DSizes* x_bcast_dims, + Eigen::DSizes* y_bcast_dims) { + int bcast_dims_remainder = 0; + for (int i = 0; i < x_dims.size(); ++i) { + if (x_dims[i] >= y_dims[i]) { + (*x_bcast_dims)[i] = 1; + (*y_bcast_dims)[i] = x_dims[i] / y_dims[i]; + bcast_dims_remainder += x_dims[i] % y_dims[i]; + } else { + (*y_bcast_dims)[i] = 1; + (*x_bcast_dims)[i] = y_dims[i] / x_dims[i]; + bcast_dims_remainder += y_dims[i] % x_dims[i]; + } + } + PADDLE_ENFORCE_EQ(bcast_dims_remainder, + 0, + phi::errors::PreconditionNotMet( + "The input tensor of Op(dist) could not be broadcast, " + "X's shape is [%s], Y's shape is [%s].", + x_dims, + y_dims)); +} + +static phi::DDim GetNewDims(const phi::DDim& in_dims, int rank) { + std::vector new_dims_vec(rank); + if (in_dims.size() < rank) { + for (int i = 0; i < rank - in_dims.size(); ++i) { + new_dims_vec[i] = 1; + } + for (int i = 0; i < in_dims.size(); ++i) { + new_dims_vec[i + rank - in_dims.size()] = in_dims[i]; + } + } else { + new_dims_vec = vectorize(in_dims); + } + return phi::make_ddim(new_dims_vec); +} + +template +static void DistGradFunction(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& out_grad, + float p, + DenseTensor* x_grad, + DenseTensor* y_grad) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + auto out_dims = out.dims(); + + phi::DDim x_new_dims = GetNewDims(x_dims, Rank); + phi::DDim y_new_dims = GetNewDims(y_dims, Rank); + phi::DDim out_new_dims = GetNewDims(out_dims, Rank); + auto x_t = ETensor::From(x, x_new_dims); + auto y_t = ETensor::From(y, y_new_dims); + auto out_t = ETensor::From(out, out_new_dims); + + Eigen::DSizes x_bcast_dims; + Eigen::DSizes y_bcast_dims; + Eigen::DSizes out_bcast_dims; + + GetBraodcastDims(x_new_dims, y_new_dims, &x_bcast_dims, &y_bcast_dims); + std::vector new_dims_vec(Rank); + for (int i = 0; i < Rank; ++i) { + new_dims_vec[i] = std::max(x_new_dims[i], y_new_dims[i]); + out_bcast_dims[i] = new_dims_vec[i]; + } + phi::DDim new_dims = phi::make_ddim(new_dims_vec); + + auto& place = *dev_ctx.eigen_device(); + auto out_grad_t = ETensor::From(out_grad, out_new_dims); + DenseTensor grad; + grad.Resize(new_dims); + dev_ctx.template Alloc(&grad); + auto grad_t = ETensor::From(grad); + + auto x_minux_y = x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims); + auto x_minux_y_abs = x_minux_y.abs(); + auto sign = + (x_minux_y > static_cast(0)).template cast() * static_cast(1.0) + + (x_minux_y < static_cast(0)).template cast() * static_cast(-1.0); + T epsilon = static_cast(1.0e-10f); + + // 1: Lp-norm(z), z = x-y, compute dz + if (p == 0) { + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, &grad, static_cast(0)); + } else if (p == INFINITY || p == -INFINITY) { + // p=inf or -inf, Lp-norm = |z_i|, the j-th element of dz tends to 0 if + // j!=i, or equals to sign(z_i) * dout if j=i. 
+ if (paddle::platform::is_cpu_place(dev_ctx.GetPlace())) { + grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims)) + .template cast() * + sign.eval() * out_grad_t.broadcast(out_bcast_dims); + } else { + grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims)) + .template cast() * + sign * out_grad_t.broadcast(out_bcast_dims); + } + } else { + // dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout + if (paddle::platform::is_cpu_place(dev_ctx.GetPlace())) { + grad_t.device(place) = + (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) + .pow(p - 1) * + sign.eval() * out_grad_t.broadcast(out_bcast_dims); + } else { + grad_t.device(place) = + (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) + .pow(p - 1) * + sign * out_grad_t.broadcast(out_bcast_dims); + } + } + + Eigen::DSizes x_reshape_dims; + Eigen::DSizes y_reshape_dims; + Eigen::DSizes reduce_dims; + for (int i = 0; i < x_new_dims.size(); ++i) { + x_reshape_dims[2 * i] = x_bcast_dims[i]; + x_reshape_dims[2 * i + 1] = x_new_dims[i]; + y_reshape_dims[2 * i] = y_bcast_dims[i]; + y_reshape_dims[2 * i + 1] = y_new_dims[i]; + reduce_dims[i] = 2 * i; + } + + // 2: if x or y is broadcasted in forward function, + // the grad need to be sum along the broadcasted dimensions + if (x_grad) { + dev_ctx.template Alloc(x_grad); + auto x_grad_t = ETensor::From(*x_grad, x_new_dims); + x_grad_t.device(place) = grad_t.reshape(x_reshape_dims) + .sum(reduce_dims) + .reshape(x_grad_t.dimensions()); + } + if (y_grad) { + dev_ctx.template Alloc(y_grad); + auto y_grad_t = ETensor::From(*y_grad, y_new_dims); + y_grad_t.device(place) = -grad_t.reshape(y_reshape_dims) + .sum(reduce_dims) + .reshape(y_grad_t.dimensions()); + } +} + +template +void DistGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& out_grad, + float p, + DenseTensor* x_grad, + DenseTensor* y_grad) { + auto x_rank = x.dims().size(); + auto y_rank = y.dims().size(); + auto rank = std::max(x_rank, y_rank); + PADDLE_ENFORCE_LE(rank, + 6, + phi::errors::Unimplemented( + "Op(dist) only support tensors with no more than 6 " + "dimensions, but X's rank is %d, Y's rank is %d.", + x_rank, + y_rank)); + switch (rank) { + case 1: + DistGradFunction( + dev_ctx, x, y, out, out_grad, p, x_grad, y_grad); + break; + case 2: + DistGradFunction( + dev_ctx, x, y, out, out_grad, p, x_grad, y_grad); + break; + case 3: + DistGradFunction( + dev_ctx, x, y, out, out_grad, p, x_grad, y_grad); + break; + case 4: + DistGradFunction( + dev_ctx, x, y, out, out_grad, p, x_grad, y_grad); + break; + case 5: + DistGradFunction( + dev_ctx, x, y, out, out_grad, p, x_grad, y_grad); + break; + case 6: + DistGradFunction( + dev_ctx, x, y, out, out_grad, p, x_grad, y_grad); + break; + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/dist_kernel_impl.h b/paddle/phi/kernels/impl/dist_kernel_impl.h new file mode 100644 index 00000000000..397fc1b9224 --- /dev/null +++ b/paddle/phi/kernels/impl/dist_kernel_impl.h @@ -0,0 +1,164 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
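The general-p branch of DistGradFunction above implements dz = (|x - y| / out)^(p - 1) * sign(x - y) * dout, i.e. the derivative of the Lp-norm, with a small epsilon guarding the division when the norm is zero. A short autograd check of that formula through the public API, assuming a current paddle install (illustrative, not part of the diff):

    import paddle

    x = paddle.to_tensor([1., 4., 6.], stop_gradient=False)
    y = paddle.to_tensor([2., 2., 2.])
    d = paddle.dist(x, y, p=3)
    d.backward()
    # x.grad should match (|x - y| / d)**(p - 1) * sign(x - y) elementwise (dout == 1 here)
    print(x.grad)
    print((paddle.abs(x - y) / d) ** 2 * paddle.sign(x - y))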
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +using ETensor = phi::EigenTensor; + +template +static void GetBraodcastDims(const phi::DDim& x_dims, + const phi::DDim& y_dims, + Eigen::DSizes* x_bcast_dims, + Eigen::DSizes* y_bcast_dims) { + int bcast_dims_remainder = 0; + for (int i = 0; i < x_dims.size(); ++i) { + if (x_dims[i] >= y_dims[i]) { + (*x_bcast_dims)[i] = 1; + (*y_bcast_dims)[i] = x_dims[i] / y_dims[i]; + bcast_dims_remainder += x_dims[i] % y_dims[i]; + } else { + (*y_bcast_dims)[i] = 1; + (*x_bcast_dims)[i] = y_dims[i] / x_dims[i]; + bcast_dims_remainder += y_dims[i] % x_dims[i]; + } + } + PADDLE_ENFORCE_EQ(bcast_dims_remainder, + 0, + phi::errors::PreconditionNotMet( + "The input tensor of Op(dist) could not be broadcast, " + "X's shape is [%s], Y's shape is [%s].", + x_dims, + y_dims)); +} + +static phi::DDim GetNewDims(const phi::DDim& in_dims, int rank) { + std::vector new_dims_vec(rank); + if (in_dims.size() < rank) { + for (int i = 0; i < rank - in_dims.size(); ++i) { + new_dims_vec[i] = 1; + } + for (int i = 0; i < in_dims.size(); ++i) { + new_dims_vec[i + rank - in_dims.size()] = in_dims[i]; + } + } else { + new_dims_vec = vectorize(in_dims); + } + return phi::make_ddim(new_dims_vec); +} + +template +static void DistFunction(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + float p, + DenseTensor* out) { + if (out) { + dev_ctx.template Alloc(out); + } + auto x_dims = x.dims(); + auto y_dims = y.dims(); + + // new dims with same size as rank, e.g. 
(rank=3, (4, 3) => (1, 4, 3)) + phi::DDim x_new_dims = GetNewDims(x_dims, Rank); + phi::DDim y_new_dims = GetNewDims(y_dims, Rank); + + auto x_t = ETensor::From(x, x_new_dims); + auto y_t = ETensor::From(y, y_new_dims); + auto out_t = ETensor::From(*out); + auto& place = *dev_ctx.eigen_device(); + + Eigen::DSizes x_bcast_dims; + Eigen::DSizes y_bcast_dims; + GetBraodcastDims(x_new_dims, y_new_dims, &x_bcast_dims, &y_bcast_dims); + // p=0 means number of non-zero elements of (x-y) + // p=inf means the maximum of |x-y| + // p=-inf means the minimum of |x-y| + // otherwise, Lp-norm = pow(sum(pow(|x-y|, p)), 1/p) + if (p == 0) { + out_t.device(place) = + (x_t.broadcast(x_bcast_dims) != y_t.broadcast(y_bcast_dims)) + .template cast() + .sum(); + } else if (p == INFINITY) { + out_t.device(place) = + (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims)) + .abs() + .maximum(); + } else if (p == -INFINITY) { + out_t.device(place) = + (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims)) + .abs() + .minimum(); + } else { + out_t.device(place) = + (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims)) + .abs() + .pow(p) + .sum() + .pow(1.0 / p); + } +} + +template +void DistKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + float p, + DenseTensor* out) { + auto x_rank = x.dims().size(); + auto y_rank = y.dims().size(); + auto rank = std::max(x_rank, y_rank); + PADDLE_ENFORCE_LE(rank, + 6, + phi::errors::Unimplemented( + "Op(dist) only support tensors with no more than 6 " + "dimensions, but X's rank is %d, Y's rank is %d.", + x_rank, + y_rank)); + switch (rank) { + case 1: + DistFunction(dev_ctx, x, y, p, out); + break; + case 2: + DistFunction(dev_ctx, x, y, p, out); + break; + case 3: + DistFunction(dev_ctx, x, y, p, out); + break; + case 4: + DistFunction(dev_ctx, x, y, p, out); + break; + case 5: + DistFunction(dev_ctx, x, y, p, out); + break; + case 6: + DistFunction(dev_ctx, x, y, p, out); + break; + } +} + +} // namespace phi diff --git a/paddle/phi/ops/compat/dist_sig.cc b/paddle/phi/ops/compat/dist_sig.cc new file mode 100644 index 00000000000..18a30b9b840 --- /dev/null +++ b/paddle/phi/ops/compat/dist_sig.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
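As the comments in DistFunction above spell out, p selects the reduction: p=0 counts the non-zero entries of x - y, p=inf and p=-inf take the maximum and minimum of |x - y|, and any other p gives the usual Lp-norm. A small sketch of those cases through the public API, assuming a current paddle install (illustrative, not part of the diff):

    import paddle

    x = paddle.to_tensor([[3., 3.], [3., 3.]])
    y = paddle.to_tensor([[1., 1.], [1., 1.]])
    print(paddle.dist(x, y, p=2))              # sqrt(4 * 2**2)  -> 4.0
    print(paddle.dist(x, y, p=0))              # non-zero count  -> 4.0
    print(paddle.dist(x, y, p=float("inf")))   # max |x - y|     -> 2.0
    print(paddle.dist(x, y, p=float("-inf")))  # min |x - y|     -> 2.0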
*/ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature DistGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("dist_grad", + {"X", "Y", "Out", GradVarName("Out")}, + {"p"}, + {GradVarName("X"), GradVarName("Y")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(dist_grad, phi::DistGradOpArgumentMapping); -- GitLab From d30d85dafb364a25807422bcd587e45917e09254 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Sun, 6 Mar 2022 13:31:36 +0800 Subject: [PATCH 072/261] [Phi] Replace all prefix PT by PD and fix typo (#40046) * replace prefix pt by pd * replace added kernel * revert util change * pd kernel to phi * resolve conflict * resolve conflict --- paddle/fluid/framework/infershape_utils.h | 2 +- .../fluid/framework/infershape_utils_test.cc | 4 +- paddle/fluid/operators/abs_op.cc | 4 +- paddle/fluid/operators/addmm_op.cc | 4 +- paddle/fluid/operators/atan2_op.cc | 4 +- paddle/fluid/operators/bce_loss_op.cc | 4 +- .../operators/bilinear_tensor_product_op.cc | 8 +- .../fluid/operators/broadcast_tensors_op.cc | 4 +- paddle/fluid/operators/cholesky_op.cc | 4 +- paddle/fluid/operators/concat_op.cc | 4 +- paddle/fluid/operators/conj_op.cc | 4 +- .../operators/controlflow/compare_all_op.cc | 4 +- .../fluid/operators/controlflow/compare_op.cc | 4 +- paddle/fluid/operators/cross_op.cc | 4 +- paddle/fluid/operators/diag_v2_op.cc | 4 +- paddle/fluid/operators/diagonal_op.cc | 4 +- paddle/fluid/operators/dist_op.cc | 4 +- paddle/fluid/operators/dot_op.cc | 4 +- paddle/fluid/operators/empty_op.cc | 4 +- paddle/fluid/operators/erfinv_op.cc | 4 +- paddle/fluid/operators/eye_op.cc | 4 +- paddle/fluid/operators/gather_nd_op.cc | 8 +- paddle/fluid/operators/gather_tree_op.cc | 4 +- paddle/fluid/operators/gumbel_softmax_op.cc | 8 +- paddle/fluid/operators/huber_loss_op.cc | 4 +- paddle/fluid/operators/imag_op.cc | 4 +- paddle/fluid/operators/increment_op.cc | 4 +- paddle/fluid/operators/index_sample_op.cc | 4 +- paddle/fluid/operators/lerp_op.cc | 4 +- paddle/fluid/operators/matmul_v2_op.cc | 4 +- paddle/fluid/operators/multinomial_op.cc | 4 +- paddle/fluid/operators/mv_op.cc | 4 +- paddle/fluid/operators/pixel_shuffle_op.cc | 4 +- paddle/fluid/operators/poisson_op.cc | 4 +- paddle/fluid/operators/real_op.cc | 4 +- .../operators/reduce_ops/reduce_mean_op.cc | 4 +- .../operators/reduce_ops/reduce_sum_op.cc | 4 +- paddle/fluid/operators/scale_op.cc | 4 +- paddle/fluid/operators/scatter_nd_add_op.cc | 8 +- paddle/fluid/operators/scatter_op.cc | 8 +- paddle/fluid/operators/selu_op.cc | 4 +- paddle/fluid/operators/sign_op.cc | 4 +- paddle/fluid/operators/size_op.cc | 4 +- paddle/fluid/operators/trace_op.cc | 4 +- paddle/fluid/operators/triangular_solve_op.cc | 4 +- paddle/fluid/operators/trunc_op.cc | 4 +- paddle/fluid/operators/unfold_op.cc | 4 +- paddle/fluid/operators/where_op.cc | 4 +- paddle/phi/common/data_type.h | 14 +- paddle/phi/core/compat/op_utils.h | 8 +- paddle/phi/core/infermeta_utils.h | 40 +-- paddle/phi/core/kernel_registry.h | 310 +++++++++--------- paddle/phi/core/kernel_utils.h | 99 +++--- paddle/phi/core/macros.h | 18 +- 54 files changed, 350 insertions(+), 349 deletions(-) diff --git a/paddle/fluid/framework/infershape_utils.h b/paddle/fluid/framework/infershape_utils.h index 64c8371d583..b692b6ffab0 100644 --- a/paddle/fluid/framework/infershape_utils.h +++ b/paddle/fluid/framework/infershape_utils.h @@ -29,7 +29,7 @@ namespace framework { phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, const 
std::string& op_type); -#define DELCARE_INFER_SHAPE_FUNCTOR(op_type, functor_name, fn) \ +#define DECLARE_INFER_SHAPE_FUNCTOR(op_type, functor_name, fn) \ struct functor_name : public paddle::framework::InferShapeBase { \ void operator()( \ paddle::framework::InferShapeContext* ctx) const override { \ diff --git a/paddle/fluid/framework/infershape_utils_test.cc b/paddle/fluid/framework/infershape_utils_test.cc index 53dcc19fcba..2eeefb19a1a 100644 --- a/paddle/fluid/framework/infershape_utils_test.cc +++ b/paddle/fluid/framework/infershape_utils_test.cc @@ -110,9 +110,9 @@ void InferShapeUtilsTestKernel( } // namespace framework } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(infer_shape_utils_test, +DECLARE_INFER_SHAPE_FUNCTOR(infer_shape_utils_test, InferShapeUtilsTestInferShapeFunctor, - PT_INFER_META(paddle::framework::TestInferMeta)); + PD_INFER_META(paddle::framework::TestInferMeta)); REGISTER_OPERATOR(infer_shape_utils_test, paddle::framework::InferShapeUtilsTestOp, paddle::framework::InferShapeUtilsTestOpMaker, diff --git a/paddle/fluid/operators/abs_op.cc b/paddle/fluid/operators/abs_op.cc index c28026a4bd4..e1460629fb1 100644 --- a/paddle/fluid/operators/abs_op.cc +++ b/paddle/fluid/operators/abs_op.cc @@ -141,8 +141,8 @@ class AbsDoubleGradOp : public framework::OperatorWithKernel { } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(abs, AbsInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(abs, AbsInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/addmm_op.cc b/paddle/fluid/operators/addmm_op.cc index de4d7818020..716a2e40179 100644 --- a/paddle/fluid/operators/addmm_op.cc +++ b/paddle/fluid/operators/addmm_op.cc @@ -147,8 +147,8 @@ class AddMMOpGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(addmm, AddmmInferShapeFunctor, - PT_INFER_META(phi::AddmmInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(addmm, AddmmInferShapeFunctor, + PD_INFER_META(phi::AddmmInferMeta)); REGISTER_OPERATOR(addmm, ops::AddMMOp, ops::AddMMOpMaker, ops::AddMMOpGradMaker, ops::AddMMOpGradMaker, diff --git a/paddle/fluid/operators/atan2_op.cc b/paddle/fluid/operators/atan2_op.cc index 71a895c244c..0783b30a858 100644 --- a/paddle/fluid/operators/atan2_op.cc +++ b/paddle/fluid/operators/atan2_op.cc @@ -105,8 +105,8 @@ class Atan2OpVarTypeInference : public framework::VarTypeInference { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(atan2, Atan2InferShapeFunctor, - PT_INFER_META(phi::Atan2InferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(atan2, Atan2InferShapeFunctor, + PD_INFER_META(phi::Atan2InferMeta)); REGISTER_OPERATOR(atan2, ops::Atan2Op, ops::Atan2OpMaker, ops::Atan2GradMaker, ops::Atan2GradMaker, diff --git a/paddle/fluid/operators/bce_loss_op.cc b/paddle/fluid/operators/bce_loss_op.cc index 55bb57466c7..bc9076f4d7c 100644 --- a/paddle/fluid/operators/bce_loss_op.cc +++ b/paddle/fluid/operators/bce_loss_op.cc @@ -138,8 +138,8 @@ DECLARE_INPLACE_OP_INFERER(BCELossGradInplaceInferer, } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(bce_loss, BCELossInferShapeFunctor, - PT_INFER_META(phi::BCELossInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(bce_loss, BCELossInferShapeFunctor, + PD_INFER_META(phi::BCELossInferMeta)); REGISTER_OPERATOR(bce_loss, ops::BCELossOp, ops::BCELossOpMaker, ops::BCELossGradOpMaker, diff --git 
a/paddle/fluid/operators/bilinear_tensor_product_op.cc b/paddle/fluid/operators/bilinear_tensor_product_op.cc index 4774c0a1dbc..9f6a78ab7a5 100644 --- a/paddle/fluid/operators/bilinear_tensor_product_op.cc +++ b/paddle/fluid/operators/bilinear_tensor_product_op.cc @@ -90,12 +90,12 @@ class BilinearTensorProductGradOpMaker namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(bilinear_tensor_product, +DECLARE_INFER_SHAPE_FUNCTOR(bilinear_tensor_product, BilinearTensorProductInferShapeFunctor, - PT_INFER_META(phi::BilinearTensorProductInferMeta)); -DELCARE_INFER_SHAPE_FUNCTOR( + PD_INFER_META(phi::BilinearTensorProductInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR( bilinear_tensor_product_grad, BilinearTensorProductGradInferShapeFunctor, - PT_INFER_META(phi::BilinearTensorProductGradInferMeta)); + PD_INFER_META(phi::BilinearTensorProductGradInferMeta)); REGISTER_OPERATOR( bilinear_tensor_product, ops::BilinearTensorProductOp, diff --git a/paddle/fluid/operators/broadcast_tensors_op.cc b/paddle/fluid/operators/broadcast_tensors_op.cc index c3917fad555..1063a8b7992 100644 --- a/paddle/fluid/operators/broadcast_tensors_op.cc +++ b/paddle/fluid/operators/broadcast_tensors_op.cc @@ -167,9 +167,9 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(BroadcastTensorsGradNoNeedBufVarsInferer, namespace ops = paddle::operators; namespace plat = paddle::platform; -DELCARE_INFER_SHAPE_FUNCTOR(broadcast_tensors, +DECLARE_INFER_SHAPE_FUNCTOR(broadcast_tensors, BroadcastTensorsInferShapeFunctor, - PT_INFER_META(phi::BroadcastTensorsInferMeta)); + PD_INFER_META(phi::BroadcastTensorsInferMeta)); REGISTER_OPERATOR(broadcast_tensors, ops::BroadcastTensorsOp, ops::BroadcastTensorsOpMaker, diff --git a/paddle/fluid/operators/cholesky_op.cc b/paddle/fluid/operators/cholesky_op.cc index 09e915a6baf..ed80ac076c0 100644 --- a/paddle/fluid/operators/cholesky_op.cc +++ b/paddle/fluid/operators/cholesky_op.cc @@ -90,8 +90,8 @@ class CholeskyGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(cholesky, CholeskyInferShapeFunctor, - PT_INFER_META(phi::CholeskyInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(cholesky, CholeskyInferShapeFunctor, + PD_INFER_META(phi::CholeskyInferMeta)); REGISTER_OPERATOR(cholesky, ops::CholeskyOp, ops::CholeskyOpMaker, ops::CholeskyGradOpMaker, ops::CholeskyGradOpMaker, diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 1da7798ea26..059fafa3e7f 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -205,8 +205,8 @@ class ConcatDoubleGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(concat, ConcatInferShapeFunctor, - PT_INFER_META(phi::ConcatInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(concat, ConcatInferShapeFunctor, + PD_INFER_META(phi::ConcatInferMeta)); REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker, ops::ConcatGradOpMaker, diff --git a/paddle/fluid/operators/conj_op.cc b/paddle/fluid/operators/conj_op.cc index 95135ba3b1a..cbec1182f20 100644 --- a/paddle/fluid/operators/conj_op.cc +++ b/paddle/fluid/operators/conj_op.cc @@ -66,8 +66,8 @@ class ConjGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(conj, ConjInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(conj, ConjInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(conj, 
ops::ConjOp, ops::ConjOpMaker, ops::ConjGradMaker, ops::ConjGradMaker, diff --git a/paddle/fluid/operators/controlflow/compare_all_op.cc b/paddle/fluid/operators/controlflow/compare_all_op.cc index 9f229e6f15c..dd407f4f6f3 100644 --- a/paddle/fluid/operators/controlflow/compare_all_op.cc +++ b/paddle/fluid/operators/controlflow/compare_all_op.cc @@ -58,8 +58,8 @@ class CompareReduceOp : public framework::OperatorWithKernel { }; \ char _##op_type##Comment::type[]{#op_type}; \ char _##op_type##Comment::equation[]{_equation}; \ - DELCARE_INFER_SHAPE_FUNCTOR(op_type, op_type##_InferShapeFunctor, \ - PT_INFER_META(phi::CompareAllInferMeta)); \ + DECLARE_INFER_SHAPE_FUNCTOR(op_type, op_type##_InferShapeFunctor, \ + PD_INFER_META(phi::CompareAllInferMeta)); \ REGISTER_OPERATOR( \ op_type, ::paddle::operators::CompareReduceOp<_##op_type##Comment>, \ ::paddle::operators::CompareReduceOpProtoMaker<_##op_type##Comment>, \ diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc index 5d9cdc61769..72d81d8c3fd 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cc +++ b/paddle/fluid/operators/controlflow/compare_op.cc @@ -96,8 +96,8 @@ class CompareOp : public framework::OperatorWithKernel { }; \ char _##op_type##Comment::type[]{#op_type}; \ char _##op_type##Comment::equation[]{_equation}; \ - DELCARE_INFER_SHAPE_FUNCTOR(op_type, op_type##_InferShapeFunctor, \ - PT_INFER_META(phi::CompareInferMeta)); \ + DECLARE_INFER_SHAPE_FUNCTOR(op_type, op_type##_InferShapeFunctor, \ + PD_INFER_META(phi::CompareInferMeta)); \ REGISTER_OPERATOR( \ op_type, ::paddle::operators::CompareOp<_##op_type##Comment>, \ ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \ diff --git a/paddle/fluid/operators/cross_op.cc b/paddle/fluid/operators/cross_op.cc index fe00ee06603..674b75625d1 100644 --- a/paddle/fluid/operators/cross_op.cc +++ b/paddle/fluid/operators/cross_op.cc @@ -109,8 +109,8 @@ class CrossGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(cross, CrossInferShapeFunctor, - PT_INFER_META(phi::CrossInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(cross, CrossInferShapeFunctor, + PD_INFER_META(phi::CrossInferMeta)); REGISTER_OPERATOR(cross, ops::CrossOp, ops::CrossOpMaker, ops::CrossGradMaker, ops::CrossGradMaker, diff --git a/paddle/fluid/operators/diag_v2_op.cc b/paddle/fluid/operators/diag_v2_op.cc index 0160277dc79..93fbff67e22 100644 --- a/paddle/fluid/operators/diag_v2_op.cc +++ b/paddle/fluid/operators/diag_v2_op.cc @@ -62,8 +62,8 @@ class DiagV2OpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(diag_v2, DiagInferShapeFunctor, - PT_INFER_META(phi::DiagInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(diag_v2, DiagInferShapeFunctor, + PD_INFER_META(phi::DiagInferMeta)); REGISTER_OPERATOR( diag_v2, ops::DiagV2Op, ops::DiagV2OpMaker, diff --git a/paddle/fluid/operators/diagonal_op.cc b/paddle/fluid/operators/diagonal_op.cc index 20813f8bb44..bf3cc941539 100644 --- a/paddle/fluid/operators/diagonal_op.cc +++ b/paddle/fluid/operators/diagonal_op.cc @@ -105,8 +105,8 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(DiagonalGradNoNeedBufferVarsInferer, namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(diagonal, DiagonalInferShapeFunctor, - PT_INFER_META(phi::DiagonalInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(diagonal, DiagonalInferShapeFunctor, + PD_INFER_META(phi::DiagonalInferMeta)); 
REGISTER_OPERATOR(diagonal, ops::DiagonalOp, ops::DiagonalOpMaker, ops::DiagonalGradOpMaker, diff --git a/paddle/fluid/operators/dist_op.cc b/paddle/fluid/operators/dist_op.cc index 10750574c45..55b24849412 100644 --- a/paddle/fluid/operators/dist_op.cc +++ b/paddle/fluid/operators/dist_op.cc @@ -124,8 +124,8 @@ class DistGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(dist, DistInferShapeFunctor, - PT_INFER_META(phi::DistInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(dist, DistInferShapeFunctor, + PD_INFER_META(phi::DistInferMeta)); REGISTER_OPERATOR(dist, ops::DistOp, ops::DistOpMaker, ops::DistGradOpMaker, diff --git a/paddle/fluid/operators/dot_op.cc b/paddle/fluid/operators/dot_op.cc index a86a3bb3592..8efdd15781a 100644 --- a/paddle/fluid/operators/dot_op.cc +++ b/paddle/fluid/operators/dot_op.cc @@ -101,8 +101,8 @@ class DotOpGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(dot, DotInferShapeFunctor, - PT_INFER_META(phi::DotInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(dot, DotInferShapeFunctor, + PD_INFER_META(phi::DotInferMeta)); REGISTER_OPERATOR(dot, ops::DotOp, ops::DotOpMaker, ops::DotOpGradMaker, diff --git a/paddle/fluid/operators/empty_op.cc b/paddle/fluid/operators/empty_op.cc index 96fa3282d06..9e0e4e7fe1c 100644 --- a/paddle/fluid/operators/empty_op.cc +++ b/paddle/fluid/operators/empty_op.cc @@ -88,8 +88,8 @@ class EmptyOpVarTypeInference : public framework::VarTypeInference { namespace ops = paddle::operators; namespace plat = paddle::platform; -DELCARE_INFER_SHAPE_FUNCTOR(empty, EmptyInferShapeFunctor, - PT_INFER_META(phi::CreateInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(empty, EmptyInferShapeFunctor, + PD_INFER_META(phi::CreateInferMeta)); REGISTER_OP_WITHOUT_GRADIENT(empty, ops::EmptyOp, ops::EmptyOpMaker, ops::EmptyOpVarTypeInference, EmptyInferShapeFunctor); diff --git a/paddle/fluid/operators/erfinv_op.cc b/paddle/fluid/operators/erfinv_op.cc index 3d409b4c4f6..374b0079262 100644 --- a/paddle/fluid/operators/erfinv_op.cc +++ b/paddle/fluid/operators/erfinv_op.cc @@ -73,8 +73,8 @@ DECLARE_INPLACE_OP_INFERER(ErfinvInplaceInferer, {"X", "Out"}); } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(erfinv, ErfinvInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(erfinv, ErfinvInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR( erfinv, paddle::operators::ErfinvOp, paddle::operators::ErfinvOpMaker, diff --git a/paddle/fluid/operators/eye_op.cc b/paddle/fluid/operators/eye_op.cc index f8c6b4eb8c5..537c218d357 100644 --- a/paddle/fluid/operators/eye_op.cc +++ b/paddle/fluid/operators/eye_op.cc @@ -67,8 +67,8 @@ Return an identity tensor whose shape is [num_rows, num_columns]. 
} // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(eye, EyeInferShapeFunctor, - PT_INFER_META(phi::EyeInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(eye, EyeInferShapeFunctor, + PD_INFER_META(phi::EyeInferMeta)); REGISTER_OPERATOR( eye, ops::EyeOp, ops::EyeOpMaker, ops::EyeOpVarTypeInference, diff --git a/paddle/fluid/operators/gather_nd_op.cc b/paddle/fluid/operators/gather_nd_op.cc index fcd3384ac24..e5ca15a39ef 100644 --- a/paddle/fluid/operators/gather_nd_op.cc +++ b/paddle/fluid/operators/gather_nd_op.cc @@ -130,11 +130,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(GatherNdGradNoNeedBufferVarInferer, "X"); namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(gather_nd, GatherNdInferShapeFunctor, - PT_INFER_META(phi::GatherNdInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(gather_nd, GatherNdInferShapeFunctor, + PD_INFER_META(phi::GatherNdInferMeta)); -DELCARE_INFER_SHAPE_FUNCTOR(gather_nd_grad, GatherNdGradInferShapeFunctor, - PT_INFER_META(phi::GatherNdGradInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(gather_nd_grad, GatherNdGradInferShapeFunctor, + PD_INFER_META(phi::GatherNdGradInferMeta)); REGISTER_OPERATOR(gather_nd, ops::GatherNdOp, ops::GatherNdOpMaker, ops::GatherNdGradOpMaker, diff --git a/paddle/fluid/operators/gather_tree_op.cc b/paddle/fluid/operators/gather_tree_op.cc index 7f6c82032fe..c84e94f5c71 100644 --- a/paddle/fluid/operators/gather_tree_op.cc +++ b/paddle/fluid/operators/gather_tree_op.cc @@ -61,8 +61,8 @@ selected ids. } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(gather_tree, GatherTreeInferShapeFunctor, - PT_INFER_META(phi::GatherTreeMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(gather_tree, GatherTreeInferShapeFunctor, + PD_INFER_META(phi::GatherTreeMeta)); REGISTER_OPERATOR(gather_tree, ops::GatherTreeOp, ops::GatherTreeOpMaker, GatherTreeInferShapeFunctor); diff --git a/paddle/fluid/operators/gumbel_softmax_op.cc b/paddle/fluid/operators/gumbel_softmax_op.cc index f8f8f3fd789..524f2d6c9d7 100644 --- a/paddle/fluid/operators/gumbel_softmax_op.cc +++ b/paddle/fluid/operators/gumbel_softmax_op.cc @@ -90,11 +90,11 @@ class GumbelSoftmaxGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(gumbel_softmax, GumbelSoftmaxInferShapeFunctor, - PT_INFER_META(phi::GumbelSoftmaxInferMeta)); -DELCARE_INFER_SHAPE_FUNCTOR(gumbel_softmax_grad, +DECLARE_INFER_SHAPE_FUNCTOR(gumbel_softmax, GumbelSoftmaxInferShapeFunctor, + PD_INFER_META(phi::GumbelSoftmaxInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(gumbel_softmax_grad, GumbelSoftmaxGradInferShapeFunctor, - PT_INFER_META(phi::GumbelSoftmaxGradInferMeta)); + PD_INFER_META(phi::GumbelSoftmaxGradInferMeta)); REGISTER_OPERATOR(gumbel_softmax, ops::GumbelSoftmaxOp, ops::GumbelSoftmaxOpMaker, diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc index 3915ce5809c..3c9bbc753f2 100644 --- a/paddle/fluid/operators/huber_loss_op.cc +++ b/paddle/fluid/operators/huber_loss_op.cc @@ -112,8 +112,8 @@ class HuberLossGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(huber_loss, HuberLossInferShapeFunctor, - PT_INFER_META(phi::HuberLossInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(huber_loss, HuberLossInferShapeFunctor, + PD_INFER_META(phi::HuberLossInferMeta)); REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker, ops::HuberLossGradOpMaker, diff --git a/paddle/fluid/operators/imag_op.cc 
b/paddle/fluid/operators/imag_op.cc index 567a69f383d..16968876ac9 100644 --- a/paddle/fluid/operators/imag_op.cc +++ b/paddle/fluid/operators/imag_op.cc @@ -82,8 +82,8 @@ DECLARE_INPLACE_OP_INFERER(ImagGradOpInplaceInferer, } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(imag, ImagInferShapeFunctor, - PT_INFER_META(phi::RealAndImagInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(imag, ImagInferShapeFunctor, + PD_INFER_META(phi::RealAndImagInferMeta)); namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc index 105d818e197..e2efaa1759b 100644 --- a/paddle/fluid/operators/increment_op.cc +++ b/paddle/fluid/operators/increment_op.cc @@ -87,8 +87,8 @@ class IncrementGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(increment, IncrementInferShapeFunctor, - PT_INFER_META(phi::IncrementInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(increment, IncrementInferShapeFunctor, + PD_INFER_META(phi::IncrementInferMeta)); REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker, ops::IncrementGradOpMaker, ops::IncrementGradOpMaker, diff --git a/paddle/fluid/operators/index_sample_op.cc b/paddle/fluid/operators/index_sample_op.cc index 68d002fceea..d17c6368c75 100644 --- a/paddle/fluid/operators/index_sample_op.cc +++ b/paddle/fluid/operators/index_sample_op.cc @@ -100,8 +100,8 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSampleGradNoNeedBufferVarInferer, "X"); } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(index_sample, IndexSampleInferShapeFunctor, - PT_INFER_META(phi::IndexSampleInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(index_sample, IndexSampleInferShapeFunctor, + PD_INFER_META(phi::IndexSampleInferMeta)); REGISTER_OPERATOR(index_sample, ops::IndexSampleOp, ops::IndexSampleOpMaker, ops::IndexSampleGradMaker, ops::IndexSampleGradMaker, diff --git a/paddle/fluid/operators/lerp_op.cc b/paddle/fluid/operators/lerp_op.cc index fef6fc5319e..5e053445379 100644 --- a/paddle/fluid/operators/lerp_op.cc +++ b/paddle/fluid/operators/lerp_op.cc @@ -85,8 +85,8 @@ DECLARE_INPLACE_OP_INFERER(LerpInplaceInferer, {"X", "Out"}); } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(lerp, LerpInferShapeFunctor, - PT_INFER_META(phi::LerpInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(lerp, LerpInferShapeFunctor, + PD_INFER_META(phi::LerpInferMeta)); REGISTER_OPERATOR( lerp, paddle::operators::LerpOp, paddle::operators::LerpOpMaker, paddle::operators::LerpOpGradMaker, diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index 788dbb22041..01fa01e3c6e 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -524,8 +524,8 @@ REGISTER_OPERATOR(matmul_v2, ops::MatMulV2Op, ops::MatMulV2OpMaker, ops::MatMulV2GradOpMaker, ops::MatMulV2GradOpMaker); -DELCARE_INFER_SHAPE_FUNCTOR(matmul_v2_grad, MatMulV2GradInferShapeFunctor, - PT_INFER_META(phi::GeneralBinaryGradInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(matmul_v2_grad, MatMulV2GradInferShapeFunctor, + PD_INFER_META(phi::GeneralBinaryGradInferMeta)); REGISTER_OPERATOR(matmul_v2_grad, ops::MatMulV2OpGrad, ops::MatMulV2OpDoubleGradMaker, ops::MatMulV2OpDoubleGradMaker, diff --git a/paddle/fluid/operators/multinomial_op.cc b/paddle/fluid/operators/multinomial_op.cc index 1143f9cb37a..0113f638b9a 100644 --- a/paddle/fluid/operators/multinomial_op.cc +++ 
b/paddle/fluid/operators/multinomial_op.cc @@ -53,8 +53,8 @@ class MultinomialOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -DELCARE_INFER_SHAPE_FUNCTOR(multinomial, MultinomialInferShapeFunctor, - PT_INFER_META(phi::MultinomialInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(multinomial, MultinomialInferShapeFunctor, + PD_INFER_META(phi::MultinomialInferMeta)); REGISTER_OPERATOR( multinomial, ops::MultinomialOp, ops::MultinomialOpMaker, paddle::framework::EmptyGradOpMaker, diff --git a/paddle/fluid/operators/mv_op.cc b/paddle/fluid/operators/mv_op.cc index d34a1ebf82c..bf7222fc45c 100644 --- a/paddle/fluid/operators/mv_op.cc +++ b/paddle/fluid/operators/mv_op.cc @@ -94,8 +94,8 @@ class MVOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -DELCARE_INFER_SHAPE_FUNCTOR(mv, MvInferShapeFunctor, - PT_INFER_META(phi::MvInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(mv, MvInferShapeFunctor, + PD_INFER_META(phi::MvInferMeta)); REGISTER_OPERATOR(mv, ops::MVOp, ops::MVOpMaker, ops::MVOpGradMaker, diff --git a/paddle/fluid/operators/pixel_shuffle_op.cc b/paddle/fluid/operators/pixel_shuffle_op.cc index 2a127d9ad1d..21ca26f49f6 100644 --- a/paddle/fluid/operators/pixel_shuffle_op.cc +++ b/paddle/fluid/operators/pixel_shuffle_op.cc @@ -124,8 +124,8 @@ class PixelShuffleGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(pixel_shuffle, PixelShuffleInferShapeFunctor, - PT_INFER_META(phi::PixelShuffleInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(pixel_shuffle, PixelShuffleInferShapeFunctor, + PD_INFER_META(phi::PixelShuffleInferMeta)); REGISTER_OPERATOR(pixel_shuffle, ops::PixelShuffleOp, ops::PixelShuffleOpMaker, ops::PixelShuffleGradMaker, diff --git a/paddle/fluid/operators/poisson_op.cc b/paddle/fluid/operators/poisson_op.cc index 0cecbf0b9cb..d5896c41059 100644 --- a/paddle/fluid/operators/poisson_op.cc +++ b/paddle/fluid/operators/poisson_op.cc @@ -87,8 +87,8 @@ class PoissonGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; namespace plat = paddle::platform; -DELCARE_INFER_SHAPE_FUNCTOR(poisson, PoissonInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(poisson, PoissonInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(poisson, ops::PoissonOp, ops::PoissonOpMaker, ops::PoissonOpInferVarType, diff --git a/paddle/fluid/operators/real_op.cc b/paddle/fluid/operators/real_op.cc index 28a8484f539..18e444702fb 100644 --- a/paddle/fluid/operators/real_op.cc +++ b/paddle/fluid/operators/real_op.cc @@ -82,8 +82,8 @@ DECLARE_INPLACE_OP_INFERER(RealGradOpInplaceInferer, } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(real, RealInferShapeFunctor, - PT_INFER_META(phi::RealAndImagInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(real, RealInferShapeFunctor, + PD_INFER_META(phi::RealAndImagInferMeta)); namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index 6157a3a925d..894106883cb 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -96,8 +96,8 @@ class __reduce_meanMaker__ : public ops::ReduceOpMaker { virtual std::string GetOpType() const { return "Reduce reduce_mean"; } }; -DELCARE_INFER_SHAPE_FUNCTOR(reduce_mean, 
ReduceMeanInferShapeFunctor, - PT_INFER_META(phi::MeanRawInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(reduce_mean, ReduceMeanInferShapeFunctor, + PD_INFER_META(phi::MeanRawInferMeta)); REGISTER_OPERATOR(reduce_mean, ops::ReduceOp, __reduce_meanMaker__, ops::ReduceMeanOpGradMaker, diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index 8ef0712dc7a..6559ed479c8 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -102,8 +102,8 @@ class ReduceSumOpMaker : public ops::ReduceOpMaker { virtual std::string GetOpType() const { return "Reduce reduce_sum"; } }; -DELCARE_INFER_SHAPE_FUNCTOR(reduce_sum, ReduceSumInferShapeFunctor, - PT_INFER_META(phi::ReduceInferMetaBase)); +DECLARE_INFER_SHAPE_FUNCTOR(reduce_sum, ReduceSumInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); REGISTER_OPERATOR(reduce_sum, ops::ReduceOp, ReduceSumOpMaker, ops::ReduceSumVarTypeInference, diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index e4410b21b54..cbf2b915207 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -121,8 +121,8 @@ DECLARE_INPLACE_OP_INFERER(ScaleOpInplaceInferer, {"X", "Out"}); namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(scale, ScaleInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(scale, ScaleInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker, ops::ScaleGradMaker, diff --git a/paddle/fluid/operators/scatter_nd_add_op.cc b/paddle/fluid/operators/scatter_nd_add_op.cc index b7be4cfb2a3..0ae0e1500c1 100644 --- a/paddle/fluid/operators/scatter_nd_add_op.cc +++ b/paddle/fluid/operators/scatter_nd_add_op.cc @@ -119,12 +119,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(ScatterNdAddGradNoNeedBufferVarsInferer, namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(scatter_nd_add, ScatterNdAddInferShapeFunctor, - PT_INFER_META(phi::ScatterNdAddInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(scatter_nd_add, ScatterNdAddInferShapeFunctor, + PD_INFER_META(phi::ScatterNdAddInferMeta)); -DELCARE_INFER_SHAPE_FUNCTOR(scatter_nd_add_grad, +DECLARE_INFER_SHAPE_FUNCTOR(scatter_nd_add_grad, ScatterNdAddGradInferShapeFunctor, - PT_INFER_META(phi::ScatterNdAddGradInferMeta)); + PD_INFER_META(phi::ScatterNdAddGradInferMeta)); REGISTER_OPERATOR(scatter_nd_add, ops::ScatterNdAddOp, ops::ScatterNdAddOpMaker, ops::ScatterNdAddGradMaker, diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc index fec003305fd..5f6b04cf59e 100644 --- a/paddle/fluid/operators/scatter_op.cc +++ b/paddle/fluid/operators/scatter_op.cc @@ -103,11 +103,11 @@ DECLARE_INPLACE_OP_INFERER(ScatterInplaceInferer, {"X", "Out"}); } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(scatter, ScatterInferShapeFunctor, - PT_INFER_META(phi::ScatterInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(scatter, ScatterInferShapeFunctor, + PD_INFER_META(phi::ScatterInferMeta)); -DELCARE_INFER_SHAPE_FUNCTOR(scatter_grad, ScatterGradInferShapeFunctor, - PT_INFER_META(phi::ScatterGradInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(scatter_grad, ScatterGradInferShapeFunctor, + PD_INFER_META(phi::ScatterGradInferMeta)); namespace ops = paddle::operators; REGISTER_OPERATOR(scatter, ops::ScatterOp, ops::ScatterOpMaker, diff --git a/paddle/fluid/operators/selu_op.cc 
b/paddle/fluid/operators/selu_op.cc index 0372a79b967..59c6e165357 100644 --- a/paddle/fluid/operators/selu_op.cc +++ b/paddle/fluid/operators/selu_op.cc @@ -120,8 +120,8 @@ class SeluGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(selu, SeluInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(selu, SeluInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(selu, ops::SeluOp, ops::SeluOpMaker, ops::SeluOpInferVarType, ops::SeluGradMaker, diff --git a/paddle/fluid/operators/sign_op.cc b/paddle/fluid/operators/sign_op.cc index e2381c76f7e..ceb42dcf3e5 100644 --- a/paddle/fluid/operators/sign_op.cc +++ b/paddle/fluid/operators/sign_op.cc @@ -60,8 +60,8 @@ class SignGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(sign, SignInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(sign, SignInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(sign, ops::SignOp, ops::SignOpMaker, ops::SignGradMaker, ops::SignGradMaker, diff --git a/paddle/fluid/operators/size_op.cc b/paddle/fluid/operators/size_op.cc index e584c1a4cce..84b0f403be0 100644 --- a/paddle/fluid/operators/size_op.cc +++ b/paddle/fluid/operators/size_op.cc @@ -44,8 +44,8 @@ Return the number of elements in the input. } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(size, SizeInferShapeFunctor, - PT_INFER_META(phi::SizeInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(size, SizeInferShapeFunctor, + PD_INFER_META(phi::SizeInferMeta)); REGISTER_OPERATOR( size, ops::SizeOp, ops::SizeOpMaker, paddle::framework::EmptyGradOpMaker, diff --git a/paddle/fluid/operators/trace_op.cc b/paddle/fluid/operators/trace_op.cc index 63b914a31a8..0590b66f6f8 100644 --- a/paddle/fluid/operators/trace_op.cc +++ b/paddle/fluid/operators/trace_op.cc @@ -107,8 +107,8 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(TraceGradNoNeedBufferVarsInferer, "Input"); } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(trace, TraceInferShapeFunctor, - PT_INFER_META(phi::TraceInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(trace, TraceInferShapeFunctor, + PD_INFER_META(phi::TraceInferMeta)); REGISTER_OPERATOR(trace, ops::TraceOp, ops::TraceOpMaker, ops::TraceGradOpMaker, ops::TraceGradOpMaker, diff --git a/paddle/fluid/operators/triangular_solve_op.cc b/paddle/fluid/operators/triangular_solve_op.cc index 179f818104c..df84659a00f 100644 --- a/paddle/fluid/operators/triangular_solve_op.cc +++ b/paddle/fluid/operators/triangular_solve_op.cc @@ -120,8 +120,8 @@ class TriangularSolveOpGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(triangular_solve, TriangularSolveInferShapeFunctor, - PT_INFER_META(phi::TriangularSolveInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(triangular_solve, TriangularSolveInferShapeFunctor, + PD_INFER_META(phi::TriangularSolveInferMeta)); REGISTER_OPERATOR(triangular_solve, ops::TriangularSolveOp, ops::TriangularSolveOpMaker, diff --git a/paddle/fluid/operators/trunc_op.cc b/paddle/fluid/operators/trunc_op.cc index 54f4deac80a..b77775f5a8c 100644 --- a/paddle/fluid/operators/trunc_op.cc +++ b/paddle/fluid/operators/trunc_op.cc @@ -69,8 +69,8 @@ class TruncGradOpMaker : public framework::SingleGradOpMaker { } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(trunc, 
TruncInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(trunc, TruncInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); namespace ops = paddle::operators; REGISTER_OPERATOR(trunc, ops::TruncOp, ops::TruncOpMaker, diff --git a/paddle/fluid/operators/unfold_op.cc b/paddle/fluid/operators/unfold_op.cc index c45b839d5b4..02fed3de6ce 100644 --- a/paddle/fluid/operators/unfold_op.cc +++ b/paddle/fluid/operators/unfold_op.cc @@ -119,8 +119,8 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(UnfoldGradOpNoNeedBufferVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(unfold, UnfoldInferShapeFunctor, - PT_INFER_META(phi::UnfoldInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(unfold, UnfoldInferShapeFunctor, + PD_INFER_META(phi::UnfoldInferMeta)); REGISTER_OPERATOR(unfold, ops::UnfoldOp, ops::UnfoldOpMaker, ops::UnfoldGradMaker, ops::UnfoldGradMaker, diff --git a/paddle/fluid/operators/where_op.cc b/paddle/fluid/operators/where_op.cc index 0f10efefa13..acbfee30670 100644 --- a/paddle/fluid/operators/where_op.cc +++ b/paddle/fluid/operators/where_op.cc @@ -117,8 +117,8 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(WhereGradNoNeedBufferVarsInferer, "X", "Y"); } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(where, WhereInferShapeFunctor, - PT_INFER_META(phi::WhereInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(where, WhereInferShapeFunctor, + PD_INFER_META(phi::WhereInferMeta)); REGISTER_OPERATOR(where, ops::WhereOp, ops::WhereOpMaker, ops::WhereOpGradMaker, ops::WhereOpGradMaker, diff --git a/paddle/phi/common/data_type.h b/paddle/phi/common/data_type.h index d9dc103e48e..38239f0fa9d 100644 --- a/paddle/phi/common/data_type.h +++ b/paddle/phi/common/data_type.h @@ -82,7 +82,7 @@ inline size_t SizeOf(DataType data_type) { return 0; } -#define PT_FOR_EACH_DATA_TYPE(_) \ +#define PD_FOR_EACH_DATA_TYPE(_) \ _(bool, DataType::BOOL) \ _(int8_t, DataType::INT8) \ _(uint8_t, DataType::UINT8) \ @@ -105,25 +105,25 @@ struct DataTypeToCppType; template struct CppTypeToDataType; -#define PT_SPECIALIZE_DataTypeToCppType(cpp_type, data_type) \ +#define PD_SPECIALIZE_DataTypeToCppType(cpp_type, data_type) \ template <> \ struct DataTypeToCppType { \ using type = cpp_type; \ }; -PT_FOR_EACH_DATA_TYPE(PT_SPECIALIZE_DataTypeToCppType) +PD_FOR_EACH_DATA_TYPE(PD_SPECIALIZE_DataTypeToCppType) -#undef PT_SPECIALIZE_DataTypeToCppType +#undef PD_SPECIALIZE_DataTypeToCppType -#define PT_SPECIALIZE_CppTypeToDataType(cpp_type, data_type) \ +#define PD_SPECIALIZE_CppTypeToDataType(cpp_type, data_type) \ template <> \ struct CppTypeToDataType { \ constexpr static DataType Type() { return data_type; } \ }; -PT_FOR_EACH_DATA_TYPE(PT_SPECIALIZE_CppTypeToDataType) +PD_FOR_EACH_DATA_TYPE(PD_SPECIALIZE_CppTypeToDataType) -#undef PT_SPECIALIZE_CppTypeToDataType +#undef PD_SPECIALIZE_CppTypeToDataType inline std::ostream& operator<<(std::ostream& os, DataType dtype) { switch (dtype) { diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index bbf634b4b09..8f64a7145ed 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -166,7 +166,7 @@ struct ArgumentMappingFnRegistrar { }; #define PD_REGISTER_BASE_KERNEL_NAME(op_type, base_kernel_name) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + PD_STATIC_ASSERT_GLOBAL_NAMESPACE( \ PD_REGISTER_base_kernel_name_ns_check_##op_type, \ "PD_REGISTER_BASE_KERNEL_NAME must be called in global namespace."); \ static const 
::phi::BaseKernelNameRegistrar \ @@ -174,7 +174,7 @@ struct ArgumentMappingFnRegistrar { int TouchBaseKernelNameSymbol_##op_type() { return 0; } #define PD_DECLARE_BASE_KERNEL_NAME(op_type) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + PD_STATIC_ASSERT_GLOBAL_NAMESPACE( \ PD_DECLARE_ai_name_ns_check_##op_type, \ "PD_DECLARE_BASE_KERNEL_NAME must be called in global namespace."); \ extern int TouchBaseKernelNameSymbol_##op_type(); \ @@ -182,7 +182,7 @@ struct ArgumentMappingFnRegistrar { TouchBaseKernelNameSymbol_##op_type() #define PD_REGISTER_ARG_MAPPING_FN(op_type, arg_mapping_fn) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + PD_STATIC_ASSERT_GLOBAL_NAMESPACE( \ PD_REGISTER_arg_map_fn_ns_check_##op_type, \ "PD_REGISTER_ARG_MAPPING_FN must be called in global namespace."); \ static const ::phi::ArgumentMappingFnRegistrar \ @@ -190,7 +190,7 @@ struct ArgumentMappingFnRegistrar { int TouchArgumentMappingFnSymbol_##op_type() { return 0; } #define PD_DECLARE_ARG_MAPPING_FN(op_type) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + PD_STATIC_ASSERT_GLOBAL_NAMESPACE( \ PD_DECLARE_arg_map_fn_ns_check_##op_type, \ "PD_DECLARE_ARG_MAPPING_FN must be called in global namespace."); \ extern int TouchArgumentMappingFnSymbol_##op_type(); \ diff --git a/paddle/phi/core/infermeta_utils.h b/paddle/phi/core/infermeta_utils.h index a5775db7438..9c351ce9063 100644 --- a/paddle/phi/core/infermeta_utils.h +++ b/paddle/phi/core/infermeta_utils.h @@ -86,10 +86,10 @@ class InferMetaContext { paddle::SmallVector> output_range_; }; -#define PT_INFER_META(...) \ +#define PD_INFER_META(...) \ ::phi::InferMetaFnImpl::Call -#define PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(attr_type) \ +#define PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(attr_type) \ template \ struct InferMetaFnCallHelper { \ template \ @@ -175,24 +175,24 @@ struct InferMetaFnImpl { }; // TODO(chenweihang): support other attr type later - PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(bool); - PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(int); - PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(int64_t); - PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(float); - PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::string&); - PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::vector&); - PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::vector&); - PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE( + PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(bool); + PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(int); + PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(int64_t); + PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(float); + PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::string&); + PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::vector&); + PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::vector&); + PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE( const std::vector&); - PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::vector&); - PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::vector&); - PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE( + PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::vector&); + PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const std::vector&); + PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE( const std::vector&); - PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(DataType); - PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(Backend); - 
PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(DataLayout); - PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const Scalar&); - PT_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const ScalarArray&); + PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(DataType); + PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(Backend); + PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(DataLayout); + PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const Scalar&); + PD_SPECIALIZE_InferMetaFnCallHelper_FOR_ATTRIBUTE(const ScalarArray&); // TODO(chenweihang): support vector input later @@ -304,11 +304,11 @@ struct InferMetaFnRegistrar { }; #define PD_REGISTER_INFER_META_FN(kernel_name_prefix, variadic_infer_meta_fn) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + PD_STATIC_ASSERT_GLOBAL_NAMESPACE( \ PD_REGISTER_infer_meta_fn_ns_check_##kernel_name_prefix, \ "PD_REGISTER_INFER_META_FN must be called in global namespace."); \ static const ::phi::InferMetaFnRegistrar \ __registrar_arg_map_fn_for_##kernel_name_prefix( \ - #kernel_name_prefix, PT_INFER_META(variadic_infer_meta_fn)) + #kernel_name_prefix, PD_INFER_META(variadic_infer_meta_fn)) } // namespace phi diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index 35e170a3fce..6a0c7bbc9b7 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -228,13 +228,13 @@ struct KernelRegistrar { * http://connect.microsoft.com/VisualStudio/feedback/details/380090/variadic-macro-replacement * http://cplusplus.co.il/2010/07/17/variadic-macro-to-count-number-of-arguments/#comment-644 */ -#define PT_NARGS(...) _PT_NARGS((__VA_ARGS__, _PT_RESQ_N())) -#define _PT_NARGS(...) _PT_ARG_N(__VA_ARGS__) -#define _PT_ARG_N_EXPAND( \ +#define PD_NARGS(...) _PD_NARGS((__VA_ARGS__, _PD_RESQ_N())) +#define _PD_NARGS(...) _PD_ARG_N(__VA_ARGS__) +#define _PD_ARG_N_EXPAND( \ _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, N, ...) \ N -#define _PT_ARG_N(args) _PT_ARG_N_EXPAND args -#define _PT_RESQ_N() 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +#define _PD_ARG_N(args) _PD_ARG_N_EXPAND args +#define _PD_RESQ_N() 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 /** PD_REGISTER_KERNEL * @@ -256,10 +256,10 @@ struct KernelRegistrar { #define _PD_REGISTER_KERNEL( \ reg_type, kernel_name, backend, context, layout, meta_kernel_fn, ...) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + PD_STATIC_ASSERT_GLOBAL_NAMESPACE( \ PD_REGISTER_tp_kernel_ns_check_##kernel_name##_##backend##_##layout, \ "PD_REGISTER_KERNEL must be called in global namespace."); \ - PT_EXPAND(_PD_REGISTER_2TA_KERNEL(reg_type, \ + PD_EXPAND(_PD_REGISTER_2TA_KERNEL(reg_type, \ kernel_name, \ backend, \ context, \ @@ -270,19 +270,19 @@ struct KernelRegistrar { #ifndef _WIN32 #define _PD_REGISTER_2TA_KERNEL( \ reg_type, kernel_name, backend, context, layout, meta_kernel_fn, ...) 
\ - PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, context, __VA_ARGS__); \ - static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ + PD_KERNEL_INSTANTIATION(meta_kernel_fn, backend, context, __VA_ARGS__); \ + static void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel); \ - PT_KERNEL_REGISTRAR_INIT( \ + PD_KERNEL_REGISTRAR_INIT( \ reg_type, \ kernel_name, \ backend, \ context, \ layout, \ - &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ + &__PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ meta_kernel_fn, \ __VA_ARGS__); \ - void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ + void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel) #else /** @@ -299,119 +299,119 @@ struct KernelRegistrar { */ #define _PD_REGISTER_2TA_KERNEL( \ reg_type, kernel_name, backend, context, layout, meta_kernel_fn, ...) \ - static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ + static void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel); \ - PT_EXPAND(PT_KERNEL_REGISTRAR_INIT( \ + PD_EXPAND(PD_KERNEL_REGISTRAR_INIT( \ reg_type, \ kernel_name, \ backend, \ context, \ layout, \ - &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ + &__PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ meta_kernel_fn, \ __VA_ARGS__)); \ - void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ + void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel) #endif -#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, context, ...) \ - _PT_KERNEL_INSTANTIATION( \ - PT_NARGS(__VA_ARGS__), meta_kernel_fn, backend, context, __VA_ARGS__) +#define PD_KERNEL_INSTANTIATION(meta_kernel_fn, backend, context, ...) \ + _PD_KERNEL_INSTANTIATION( \ + PD_NARGS(__VA_ARGS__), meta_kernel_fn, backend, context, __VA_ARGS__) -#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, backend, context, ...) \ - PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \ +#define _PD_KERNEL_INSTANTIATION(N, meta_kernel_fn, backend, context, ...) \ + PD_CONCATENATE(_PD_KERNEL_INSTANTIATION_, N) \ (meta_kernel_fn, backend, context, __VA_ARGS__) -#define _PT_KERNEL_INSTANTIATION_1( \ +#define _PD_KERNEL_INSTANTIATION_1( \ meta_kernel_fn, backend, context, cpp_dtype) \ template decltype( \ meta_kernel_fn) meta_kernel_fn -#define _PT_KERNEL_INSTANTIATION_2( \ +#define _PD_KERNEL_INSTANTIATION_2( \ meta_kernel_fn, backend, context, cpp_dtype, ...) \ template decltype( \ meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_1( \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_1( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_3( \ +#define _PD_KERNEL_INSTANTIATION_3( \ meta_kernel_fn, backend, context, cpp_dtype, ...) \ template decltype( \ meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_2( \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_2( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_4( \ +#define _PD_KERNEL_INSTANTIATION_4( \ meta_kernel_fn, backend, context, cpp_dtype, ...) 
\ template decltype( \ meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_3( \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_3( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_5( \ +#define _PD_KERNEL_INSTANTIATION_5( \ meta_kernel_fn, backend, context, cpp_dtype, ...) \ template decltype( \ meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_4( \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_4( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_6( \ +#define _PD_KERNEL_INSTANTIATION_6( \ meta_kernel_fn, backend, context, cpp_dtype, ...) \ template decltype( \ meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_5( \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_5( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_7( \ +#define _PD_KERNEL_INSTANTIATION_7( \ meta_kernel_fn, backend, context, cpp_dtype, ...) \ template decltype( \ meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_6( \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_6( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_8( \ +#define _PD_KERNEL_INSTANTIATION_8( \ meta_kernel_fn, backend, context, cpp_dtype, ...) \ template decltype( \ meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_7( \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_7( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_9( \ +#define _PD_KERNEL_INSTANTIATION_9( \ meta_kernel_fn, backend, context, cpp_dtype, ...) \ template decltype( \ meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_8( \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_8( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_10( \ +#define _PD_KERNEL_INSTANTIATION_10( \ meta_kernel_fn, backend, context, cpp_dtype, ...) \ template decltype( \ meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_9( \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_9( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_11( \ +#define _PD_KERNEL_INSTANTIATION_11( \ meta_kernel_fn, backend, context, cpp_dtype, ...) \ template decltype( \ meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_10( \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_10( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_12( \ +#define _PD_KERNEL_INSTANTIATION_12( \ meta_kernel_fn, backend, context, cpp_dtype, ...) \ template decltype( \ meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_11( \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_11( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_13( \ +#define _PD_KERNEL_INSTANTIATION_13( \ meta_kernel_fn, backend, context, cpp_dtype, ...) \ template decltype( \ meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_12( \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_12( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_14( \ +#define _PD_KERNEL_INSTANTIATION_14( \ meta_kernel_fn, backend, context, cpp_dtype, ...) \ template decltype( \ meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_13( \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_13( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_15( \ +#define _PD_KERNEL_INSTANTIATION_15( \ meta_kernel_fn, backend, context, cpp_dtype, ...) 
\ template decltype( \ meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_14( \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_14( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define PT_KERNEL_REGISTRAR_INIT(reg_type, \ +#define PD_KERNEL_REGISTRAR_INIT(reg_type, \ kernel_name, \ backend, \ context, \ @@ -419,7 +419,7 @@ struct KernelRegistrar { args_def_fn, \ meta_kernel_fn, \ ...) \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT(PT_NARGS(__VA_ARGS__), \ + PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT(PD_NARGS(__VA_ARGS__), \ reg_type, \ kernel_name, \ backend, \ @@ -433,7 +433,7 @@ struct KernelRegistrar { /* The =pre-commit always treats this macro into the wrong format, and multi-line macros cannot be skipped with NOLINT.*/ -#define _PT_KERNEL_REGISTRAR_INIT(N, \ +#define _PD_KERNEL_REGISTRAR_INIT(N, \ reg_type, \ kernel_name, \ backend, \ @@ -442,20 +442,20 @@ struct KernelRegistrar { args_def_fn, \ meta_kernel_fn, \ ...) \ - PT_EXPAND(PT_CONCATENATE(_PT_KERNEL_REGISTRAR_INIT_, N) ( \ + PD_EXPAND(PD_CONCATENATE(_PD_KERNEL_REGISTRAR_INIT_, N) ( \ reg_type, \ kernel_name, \ backend, \ context, \ layout, \ - PT_ID, \ + PD_ID, \ args_def_fn, \ meta_kernel_fn, \ __VA_ARGS__)) // clang-format on -#define _PT_KERNEL_REGISTRAR_INIT_1(reg_type, \ +#define _PD_KERNEL_REGISTRAR_INIT_1(reg_type, \ kernel_name, \ backend, \ context, \ @@ -464,7 +464,7 @@ struct KernelRegistrar { args_def_fn, \ meta_kernel_fn, \ cpp_dtype) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ + static const ::phi::KernelRegistrar PD_CONCATENATE( \ __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ reg_type, \ #kernel_name, \ @@ -474,10 +474,10 @@ struct KernelRegistrar { ::phi::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PHI_KERNEL(meta_kernel_fn), \ + PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ int TouchKernelSymbolFor_##kernel_name##_##backend##_##layout() { return 0; } -#define _PT_KERNEL_REGISTRAR_INIT_2(reg_type, \ +#define _PD_KERNEL_REGISTRAR_INIT_2(reg_type, \ kernel_name, \ backend, \ context, \ @@ -487,7 +487,7 @@ struct KernelRegistrar { meta_kernel_fn, \ cpp_dtype, \ ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ + static const ::phi::KernelRegistrar PD_CONCATENATE( \ __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ reg_type, \ #kernel_name, \ @@ -497,18 +497,18 @@ struct KernelRegistrar { ::phi::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_1(reg_type, \ + PHI_KERNEL(meta_kernel_fn), \ + PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ + PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_1(reg_type, \ kernel_name, \ backend, \ context, \ layout, \ - PT_ID, \ + PD_ID, \ args_def_fn, \ meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_3(reg_type, \ +#define _PD_KERNEL_REGISTRAR_INIT_3(reg_type, \ kernel_name, \ backend, \ context, \ @@ -518,7 +518,7 @@ struct KernelRegistrar { meta_kernel_fn, \ cpp_dtype, \ ...) 
\ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ + static const ::phi::KernelRegistrar PD_CONCATENATE( \ __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ reg_type, \ #kernel_name, \ @@ -528,18 +528,18 @@ struct KernelRegistrar { ::phi::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_2(reg_type, \ + PHI_KERNEL(meta_kernel_fn), \ + PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ + PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_2(reg_type, \ kernel_name, \ backend, \ context, \ layout, \ - PT_ID, \ + PD_ID, \ args_def_fn, \ meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_4(reg_type, \ +#define _PD_KERNEL_REGISTRAR_INIT_4(reg_type, \ kernel_name, \ backend, \ context, \ @@ -549,7 +549,7 @@ struct KernelRegistrar { meta_kernel_fn, \ cpp_dtype, \ ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ + static const ::phi::KernelRegistrar PD_CONCATENATE( \ __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ reg_type, \ #kernel_name, \ @@ -559,18 +559,18 @@ struct KernelRegistrar { ::phi::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_3(reg_type, \ + PHI_KERNEL(meta_kernel_fn), \ + PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ + PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_3(reg_type, \ kernel_name, \ backend, \ context, \ layout, \ - PT_ID, \ + PD_ID, \ args_def_fn, \ meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_5(reg_type, \ +#define _PD_KERNEL_REGISTRAR_INIT_5(reg_type, \ kernel_name, \ backend, \ context, \ @@ -580,7 +580,7 @@ struct KernelRegistrar { meta_kernel_fn, \ cpp_dtype, \ ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ + static const ::phi::KernelRegistrar PD_CONCATENATE( \ __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ reg_type, \ #kernel_name, \ @@ -590,18 +590,18 @@ struct KernelRegistrar { ::phi::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_4(reg_type, \ + PHI_KERNEL(meta_kernel_fn), \ + PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ + PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_4(reg_type, \ kernel_name, \ backend, \ context, \ layout, \ - PT_ID, \ + PD_ID, \ args_def_fn, \ meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_6(reg_type, \ +#define _PD_KERNEL_REGISTRAR_INIT_6(reg_type, \ kernel_name, \ backend, \ context, \ @@ -611,7 +611,7 @@ struct KernelRegistrar { meta_kernel_fn, \ cpp_dtype, \ ...) 
\ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ + static const ::phi::KernelRegistrar PD_CONCATENATE( \ __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ reg_type, \ #kernel_name, \ @@ -621,18 +621,18 @@ struct KernelRegistrar { ::phi::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_5(reg_type, \ + PHI_KERNEL(meta_kernel_fn), \ + PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ + PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_5(reg_type, \ kernel_name, \ backend, \ context, \ layout, \ - PT_ID, \ + PD_ID, \ args_def_fn, \ meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_7(reg_type, \ +#define _PD_KERNEL_REGISTRAR_INIT_7(reg_type, \ kernel_name, \ backend, \ context, \ @@ -642,7 +642,7 @@ struct KernelRegistrar { meta_kernel_fn, \ cpp_dtype, \ ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ + static const ::phi::KernelRegistrar PD_CONCATENATE( \ __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ reg_type, \ #kernel_name, \ @@ -652,18 +652,18 @@ struct KernelRegistrar { ::phi::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_6(reg_type, \ + PHI_KERNEL(meta_kernel_fn), \ + PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ + PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_6(reg_type, \ kernel_name, \ backend, \ context, \ layout, \ - PT_ID, \ + PD_ID, \ args_def_fn, \ meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_8(reg_type, \ +#define _PD_KERNEL_REGISTRAR_INIT_8(reg_type, \ kernel_name, \ backend, \ context, \ @@ -673,7 +673,7 @@ struct KernelRegistrar { meta_kernel_fn, \ cpp_dtype, \ ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ + static const ::phi::KernelRegistrar PD_CONCATENATE( \ __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ reg_type, \ #kernel_name, \ @@ -683,18 +683,18 @@ struct KernelRegistrar { ::phi::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_7(reg_type, \ + PHI_KERNEL(meta_kernel_fn), \ + PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ + PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_7(reg_type, \ kernel_name, \ backend, \ context, \ layout, \ - PT_ID, \ + PD_ID, \ args_def_fn, \ meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_9(reg_type, \ +#define _PD_KERNEL_REGISTRAR_INIT_9(reg_type, \ kernel_name, \ backend, \ context, \ @@ -704,7 +704,7 @@ struct KernelRegistrar { meta_kernel_fn, \ cpp_dtype, \ ...) 
\ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ + static const ::phi::KernelRegistrar PD_CONCATENATE( \ __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ reg_type, \ #kernel_name, \ @@ -714,18 +714,18 @@ struct KernelRegistrar { ::phi::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_8(reg_type, \ + PHI_KERNEL(meta_kernel_fn), \ + PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ + PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_8(reg_type, \ kernel_name, \ backend, \ context, \ layout, \ - PT_ID, \ + PD_ID, \ args_def_fn, \ meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_10(reg_type, \ +#define _PD_KERNEL_REGISTRAR_INIT_10(reg_type, \ kernel_name, \ backend, \ context, \ @@ -735,7 +735,7 @@ struct KernelRegistrar { meta_kernel_fn, \ cpp_dtype, \ ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ + static const ::phi::KernelRegistrar PD_CONCATENATE( \ __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ reg_type, \ #kernel_name, \ @@ -745,18 +745,18 @@ struct KernelRegistrar { ::phi::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_9(reg_type, \ + PHI_KERNEL(meta_kernel_fn), \ + PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ + PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_9(reg_type, \ kernel_name, \ backend, \ context, \ layout, \ - PT_ID, \ + PD_ID, \ args_def_fn, \ meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_11(reg_type, \ +#define _PD_KERNEL_REGISTRAR_INIT_11(reg_type, \ kernel_name, \ backend, \ context, \ @@ -766,7 +766,7 @@ struct KernelRegistrar { meta_kernel_fn, \ cpp_dtype, \ ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ + static const ::phi::KernelRegistrar PD_CONCATENATE( \ __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ reg_type, \ #kernel_name, \ @@ -776,18 +776,18 @@ struct KernelRegistrar { ::phi::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_10(reg_type, \ + PHI_KERNEL(meta_kernel_fn), \ + PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ + PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_10(reg_type, \ kernel_name, \ backend, \ context, \ layout, \ - PT_ID, \ + PD_ID, \ args_def_fn, \ meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_12(reg_type, \ +#define _PD_KERNEL_REGISTRAR_INIT_12(reg_type, \ kernel_name, \ backend, \ context, \ @@ -797,7 +797,7 @@ struct KernelRegistrar { meta_kernel_fn, \ cpp_dtype, \ ...) 
\ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ + static const ::phi::KernelRegistrar PD_CONCATENATE( \ __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ reg_type, \ #kernel_name, \ @@ -807,18 +807,18 @@ struct KernelRegistrar { ::phi::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_11(reg_type, \ + PHI_KERNEL(meta_kernel_fn), \ + PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ + PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_11(reg_type, \ kernel_name, \ backend, \ context, \ layout, \ - PT_ID, \ + PD_ID, \ args_def_fn, \ meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_13(reg_type, \ +#define _PD_KERNEL_REGISTRAR_INIT_13(reg_type, \ kernel_name, \ backend, \ context, \ @@ -828,7 +828,7 @@ struct KernelRegistrar { meta_kernel_fn, \ cpp_dtype, \ ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ + static const ::phi::KernelRegistrar PD_CONCATENATE( \ __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ reg_type, \ #kernel_name, \ @@ -838,18 +838,18 @@ struct KernelRegistrar { ::phi::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_12(reg_type, \ + PHI_KERNEL(meta_kernel_fn), \ + PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ + PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_12(reg_type, \ kernel_name, \ backend, \ context, \ layout, \ - PT_ID, \ + PD_ID, \ args_def_fn, \ meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_14(reg_type, \ +#define _PD_KERNEL_REGISTRAR_INIT_14(reg_type, \ kernel_name, \ backend, \ context, \ @@ -859,7 +859,7 @@ struct KernelRegistrar { meta_kernel_fn, \ cpp_dtype, \ ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ + static const ::phi::KernelRegistrar PD_CONCATENATE( \ __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ reg_type, \ #kernel_name, \ @@ -869,18 +869,18 @@ struct KernelRegistrar { ::phi::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_13(reg_type, \ + PHI_KERNEL(meta_kernel_fn), \ + PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ + PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_13(reg_type, \ kernel_name, \ backend, \ context, \ layout, \ - PT_ID, \ + PD_ID, \ args_def_fn, \ meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_15(reg_type, \ +#define _PD_KERNEL_REGISTRAR_INIT_15(reg_type, \ kernel_name, \ backend, \ context, \ @@ -890,7 +890,7 @@ struct KernelRegistrar { meta_kernel_fn, \ cpp_dtype, \ ...) 
\ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ + static const ::phi::KernelRegistrar PD_CONCATENATE( \ __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ reg_type, \ #kernel_name, \ @@ -900,14 +900,14 @@ struct KernelRegistrar { ::phi::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_14(reg_type, \ + PHI_KERNEL(meta_kernel_fn), \ + PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ + PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_14(reg_type, \ kernel_name, \ backend, \ context, \ layout, \ - PT_ID, \ + PD_ID, \ args_def_fn, \ meta_kernel_fn, \ __VA_ARGS__)) @@ -924,7 +924,7 @@ struct KernelRegistrar { #define _PD_REGISTER_GENERAL_KERNEL( \ reg_type, kernel_name, backend, layout, kernel_fn, dtype) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + PD_STATIC_ASSERT_GLOBAL_NAMESPACE( \ PD_REGISTER_no_t_kernel_ns_check_##kernel_name##_##backend##_##layout, \ "PD_REGISTER_NO_TEMPLATE_KERNEL must be called in global namespace."); \ __PD_REGISTER_GENERAL_KERNEL( \ @@ -934,7 +934,7 @@ struct KernelRegistrar { #define __PD_REGISTER_GENERAL_KERNEL( \ reg_type, kernel_name, backend, layout, kernel_fn, dtype) \ template decltype(kernel_fn) kernel_fn; \ - static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ + static void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel); \ static const ::phi::KernelRegistrar \ __reg_pt_kernel_##kernel_name##_##backend##_##layout( \ @@ -943,18 +943,18 @@ struct KernelRegistrar { #backend, \ DATALAYOUT(layout), \ ::phi::KernelArgsParseFunctor::Parse, \ - &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ - PT_KERNEL(kernel_fn), \ - PT_VARIADIC_KERNEL(kernel_fn)); \ + &__PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ + PHI_KERNEL(kernel_fn), \ + PHI_VARIADIC_KERNEL(kernel_fn)); \ int TouchKernelSymbolFor_##kernel_name##_##backend##_##layout() { \ return 0; \ } \ - void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ + void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel) #else #define __PD_REGISTER_GENERAL_KERNEL( \ reg_type, kernel_name, backend, layout, kernel_fn, dtype) \ - static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ + static void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel); \ static const ::phi::KernelRegistrar \ __reg_pt_kernel_##kernel_name##_##backend##_##layout( \ @@ -963,13 +963,13 @@ struct KernelRegistrar { #backend, \ DATALAYOUT(layout), \ ::phi::KernelArgsParseFunctor::Parse, \ - &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ - PT_KERNEL(kernel_fn), \ - PT_VARIADIC_KERNEL(kernel_fn)); \ + &__PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ + PHI_KERNEL(kernel_fn), \ + PHI_VARIADIC_KERNEL(kernel_fn)); \ int TouchKernelSymbolFor_##kernel_name##_##backend##_##layout() { \ return 0; \ } \ - void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ + void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel) #endif @@ -979,7 +979,7 @@ struct KernelRegistrar { * to avoid being removed by linker */ #define PD_DECLARE_KERNEL(kernel_name, backend, layout) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + 
PD_STATIC_ASSERT_GLOBAL_NAMESPACE( \ PD_DECLARE_tp_kernel_ns_check_##kernel_name##_##backend##_##layout, \ "PD_DECLARE_KERNEL must be called in global namespace."); \ extern int TouchKernelSymbolFor_##kernel_name##_##backend##_##layout(); \ diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index f7fa27b0744..baa549d7a66 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -30,14 +30,15 @@ namespace phi { -#define PT_KERNEL(...) \ +// PD_KERNEL has been used by custom op api +#define PHI_KERNEL(...) \ ::phi::KernelImpl::Compute -#define PT_VARIADIC_KERNEL(...) \ +#define PHI_VARIADIC_KERNEL(...) \ reinterpret_cast(&::phi::KernelImpl::VariadicCompute) -#define PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(dev_ctx) \ +#define PD_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(dev_ctx) \ template \ struct KernelCallHelper { \ template \ struct KernelCallHelper { \ template \ struct KernelCallHelper, Tail...> { \ template \ struct KernelCallHelper&, Tail...> { \ template \ struct KernelCallHelper { \ template \ struct KernelCallHelper { \ template \ struct KernelCallHelper, Tail...> { \ template { /* DeviceContext Helpers */ - PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(CPUContext); + PD_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(CPUContext); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(GPUContext); + PD_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(GPUContext); #endif #ifdef PADDLE_WITH_XPU - PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(XPUContext); + PD_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(XPUContext); #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE - PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(CustomContext); + PD_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(CustomContext); #endif /* Input Helpers */ - PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(DenseTensor); - PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(DenseTensor); - PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SelectedRows); - PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor); - PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRows); + PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(DenseTensor); + PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(DenseTensor); + PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SelectedRows); + PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor); + PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRows); - PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCooTensor); - PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCooTensor); - PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(SparseCooTensor); + PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCooTensor); + PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCooTensor); + PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(SparseCooTensor); - PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCsrTensor); - PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCsrTensor); - PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(SparseCsrTensor); + PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCsrTensor); + PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCsrTensor); + PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(SparseCsrTensor); /* Attribute Helpers */ - PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(bool); - PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(float); - PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(double); - PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int); - 
PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int64_t); - PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(phi::dtype::float16); - PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const Scalar&); - PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataType); - PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataLayout); - PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(Place); - PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); - PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const ScalarArray&); - PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); - PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::string&); - PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); - PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); - PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); - PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(bool); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(float); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(double); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int64_t); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(phi::dtype::float16); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const Scalar&); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataType); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataLayout); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(Place); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const ScalarArray&); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::string&); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); /* Output Helpers */ - PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(DenseTensor); - PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(DenseTensor); - PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SelectedRows); + PD_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(DenseTensor); + PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(DenseTensor); + PD_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SelectedRows); - PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SparseCooTensor); - PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(SparseCooTensor); + PD_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SparseCooTensor); + PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(SparseCooTensor); - PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SparseCsrTensor); - PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(SparseCsrTensor); + PD_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SparseCsrTensor); + PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(SparseCsrTensor); /* End case */ template diff --git a/paddle/phi/core/macros.h b/paddle/phi/core/macros.h index 97c5466e1de..8049d027a77 100644 --- a/paddle/phi/core/macros.h +++ b/paddle/phi/core/macros.h @@ -26,19 +26,19 @@ namespace phi { classname& operator=(classname&&) = delete #endif -#define PT_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) \ - _PT_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) +#define PD_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) \ + _PD_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) -#define _PT_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) \ +#define _PD_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, 
msg) \ struct __test_global_namespace_##uniq_name##__ {}; \ static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ __test_global_namespace_##uniq_name##__>::value, \ msg) #ifdef __COUNTER__ -#define PT_ID __COUNTER__ +#define PD_ID __COUNTER__ #else -#define PT_ID __LINE__ +#define PD_ID __LINE__ #endif #if defined(_WIN32) @@ -48,9 +48,9 @@ namespace phi { #define UNUSED __attribute__((unused)) #endif -#define PT_CONCATENATE(arg1, arg2) PT_CONCATENATE1(arg1, arg2) -#define PT_CONCATENATE1(arg1, arg2) PT_CONCATENATE2(arg1, arg2) -#define PT_CONCATENATE2(arg1, arg2) arg1##arg2 -#define PT_EXPAND(x) x +#define PD_CONCATENATE(arg1, arg2) PD_CONCATENATE1(arg1, arg2) +#define PD_CONCATENATE1(arg1, arg2) PD_CONCATENATE2(arg1, arg2) +#define PD_CONCATENATE2(arg1, arg2) arg1##arg2 +#define PD_EXPAND(x) x } // namespace phi -- GitLab From da3de72de952d9133df385cd5b98d729906db4a0 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Mon, 7 Mar 2022 10:05:49 +0800 Subject: [PATCH 073/261] [AMP] refine paddle.amp.decorate code example (#40159) * refine amp.decorate code example * refine code --- python/paddle/amp/auto_cast.py | 6 +++--- python/paddle/fluid/dygraph/amp/auto_cast.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py index 9ca29d509f6..5132f23079f 100644 --- a/python/paddle/amp/auto_cast.py +++ b/python/paddle/amp/auto_cast.py @@ -107,9 +107,9 @@ def decorate(models, import paddle model = paddle.nn.Conv2D(3, 2, 3, bias_attr=False) - optimzier = paddle.optimizer.SGD(parameters=model.parameters()) + optimizer = paddle.optimizer.SGD(parameters=model.parameters()) - model, optimizer = paddle.amp.decorate(models=model, optimizers=optimzier, level='O2') + model, optimizer = paddle.amp.decorate(models=model, optimizers=optimizer, level='O2') data = paddle.rand([10, 3, 32, 32]) @@ -122,7 +122,7 @@ def decorate(models, model2 = paddle.nn.Conv2D(3, 2, 3, bias_attr=False) optimizer2 = paddle.optimizer.Adam(parameters=model2.parameters()) - models, optimizers = paddle.amp.decorate(models=[model, model2], optimizers=[optimzier, optimizer2], level='O2') + models, optimizers = paddle.amp.decorate(models=[model, model2], optimizers=[optimizer, optimizer2], level='O2') data = paddle.rand([10, 3, 32, 32]) diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py index f43a51063b0..191661b7bf9 100644 --- a/python/paddle/fluid/dygraph/amp/auto_cast.py +++ b/python/paddle/fluid/dygraph/amp/auto_cast.py @@ -411,9 +411,9 @@ def amp_decorate(models, import paddle model = paddle.nn.Conv2D(3, 2, 3, bias_attr=False) - optimzier = paddle.optimizer.SGD(parameters=model.parameters()) + optimizer = paddle.optimizer.SGD(parameters=model.parameters()) - model, optimizer = paddle.fluid.dygraph.amp_decorate(models=model, optimizers=optimzier, level='O2') + model, optimizer = paddle.fluid.dygraph.amp_decorate(models=model, optimizers=optimizer, level='O2') data = paddle.rand([10, 3, 32, 32]) @@ -426,7 +426,7 @@ def amp_decorate(models, model2 = paddle.nn.Conv2D(3, 2, 3, bias_attr=False) optimizer2 = paddle.optimizer.Adam(parameters=model2.parameters()) - models, optimizers = paddle.fluid.dygraph.amp_decorate(models=[model, model2], optimizers=[optimzier, optimizer2], level='O2') + models, optimizers = paddle.fluid.dygraph.amp_decorate(models=[model, model2], optimizers=[optimizer, optimizer2], level='O2') data = paddle.rand([10, 3, 
32, 32]) -- GitLab From f5ec03147d51302d2f04a23f1b30addb8f93ab43 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Mon, 7 Mar 2022 10:35:35 +0800 Subject: [PATCH 074/261] [Phi]Migrate Adamax and Adadelta Optimizer Op into Phi (#40173) * [Phi]Migrate Adamax into phi * Add adadelta kernel --- .../fluid/operators/optimizers/adadelta_op.cc | 88 +++--------------- .../fluid/operators/optimizers/adadelta_op.cu | 19 ---- .../fluid/operators/optimizers/adadelta_op.h | 84 ----------------- .../fluid/operators/optimizers/adamax_op.cc | 78 +++------------- .../fluid/operators/optimizers/adamax_op.cu | 19 ---- paddle/fluid/operators/optimizers/adamax_op.h | 82 ----------------- paddle/phi/infermeta/multiary.cc | 92 +++++++++++++++++++ paddle/phi/infermeta/multiary.h | 24 +++++ paddle/phi/kernels/adadelta_kernel.h | 33 +++++++ paddle/phi/kernels/adamax_kernel.h | 36 ++++++++ paddle/phi/kernels/cpu/adadelta_kernel.cc | 22 +++++ paddle/phi/kernels/cpu/adamax_kernel.cc | 21 +++++ paddle/phi/kernels/gpu/adadelta_kernel.cu | 22 +++++ paddle/phi/kernels/gpu/adamax_kernel.cu | 21 +++++ .../phi/kernels/impl/adadelta_kernel_impl.h | 65 +++++++++++++ paddle/phi/kernels/impl/adamax_kernel_impl.h | 69 ++++++++++++++ 16 files changed, 429 insertions(+), 346 deletions(-) delete mode 100644 paddle/fluid/operators/optimizers/adadelta_op.cu delete mode 100644 paddle/fluid/operators/optimizers/adadelta_op.h delete mode 100644 paddle/fluid/operators/optimizers/adamax_op.cu delete mode 100644 paddle/fluid/operators/optimizers/adamax_op.h create mode 100644 paddle/phi/kernels/adadelta_kernel.h create mode 100644 paddle/phi/kernels/adamax_kernel.h create mode 100644 paddle/phi/kernels/cpu/adadelta_kernel.cc create mode 100644 paddle/phi/kernels/cpu/adamax_kernel.cc create mode 100644 paddle/phi/kernels/gpu/adadelta_kernel.cu create mode 100644 paddle/phi/kernels/gpu/adamax_kernel.cu create mode 100644 paddle/phi/kernels/impl/adadelta_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/adamax_kernel_impl.h diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cc b/paddle/fluid/operators/optimizers/adadelta_op.cc index ad7f93d73e9..3cafbce04d3 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.cc +++ b/paddle/fluid/operators/optimizers/adadelta_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/optimizers/adadelta_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -23,77 +26,6 @@ class AdadeltaOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true, - platform::errors::InvalidArgument( - "Input(Param) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true, - platform::errors::InvalidArgument( - "Input(Grad) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("AvgSquaredGrad"), true, - platform::errors::InvalidArgument( - "Input(AvgSquaredGrad) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("AvgSquaredUpdate"), true, - platform::errors::InvalidArgument( - "Input(AvgSquaredUpdate) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->GetInputsVarType("Param").front() == - framework::proto::VarType::LOD_TENSOR, - true, - platform::errors::InvalidArgument( - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Param").front(), - ctx->GetInputsVarType("Param").front())); - PADDLE_ENFORCE_EQ( - ctx->GetInputsVarType("Grad").front() == - framework::proto::VarType::LOD_TENSOR, - true, - platform::errors::InvalidArgument( - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Grad").front(), - ctx->GetInputsVarType("Grad").front())); - - PADDLE_ENFORCE_EQ( - ctx->HasOutput("ParamOut"), true, - platform::errors::InvalidArgument( - "Output(ParamOut) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("AvgSquaredGradOut"), true, - platform::errors::InvalidArgument( - "Output(AvgSquaredGradOut) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("AvgSquaredUpdateOut"), true, - platform::errors::InvalidArgument( - "Output(AvgSquaredUpdateOut) of AdadeltaOp should not be null.")); - - auto param_dim = ctx->GetInputDim("Param"); - PADDLE_ENFORCE_EQ( - param_dim, ctx->GetInputDim("Grad"), - platform::errors::InvalidArgument( - "Param and grad input of AdadeltaOp should have same dimension.")); - PADDLE_ENFORCE_NE( - phi::product(ctx->GetInputDim("AvgSquaredGrad")), 0, - platform::errors::InvalidArgument( - "Maybe the Input variable AvgSquaredGrad has not " - "been initialized. 
You may need to confirm if you put " - "exe.run(startup_program) after optimizer.minimize " - "function.")); - PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredGrad"), - platform::errors::InvalidArgument( - "Param and AvgSquaredGrad input of AdadeltaOp " - "should have same dimension")); - PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredUpdate"), - platform::errors::InvalidArgument( - "Param and AvgSquaredUpdate input of AdadeltaOp " - "should have same dimension")); - - ctx->SetOutputDim("ParamOut", param_dim); - ctx->SetOutputDim("AvgSquaredGradOut", param_dim); - ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( @@ -149,7 +81,11 @@ $$ } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker); -REGISTER_OP_CPU_KERNEL( - adadelta, ops::AdadeltaOpKernel, - ops::AdadeltaOpKernel); +namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(adadelta, AdadeltaInferMetaFunctor, + PT_INFER_META(phi::AdadeltaInferMeta)); +REGISTER_OPERATOR( + adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + AdadeltaInferMetaFunctor); diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cu b/paddle/fluid/operators/optimizers/adadelta_op.cu deleted file mode 100644 index 562a157f063..00000000000 --- a/paddle/fluid/operators/optimizers/adadelta_op.cu +++ /dev/null @@ -1,19 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/optimizers/adadelta_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - adadelta, ops::AdadeltaOpKernel, - ops::AdadeltaOpKernel); diff --git a/paddle/fluid/operators/optimizers/adadelta_op.h b/paddle/fluid/operators/optimizers/adadelta_op.h deleted file mode 100644 index 85cfad35858..00000000000 --- a/paddle/fluid/operators/optimizers/adadelta_op.h +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class AdadeltaOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE_EQ(param_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type()))); - const auto* grad_var = ctx.InputVar("Grad"); - PADDLE_ENFORCE_EQ(grad_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Grad").front(), - framework::ToTypeName(grad_var->Type()))); - - auto param_out_tensor = ctx.Output("ParamOut"); - auto avg_squared_grad_out_tensor = - ctx.Output("AvgSquaredGradOut"); - auto avg_squared_update_out_tensor = - ctx.Output("AvgSquaredUpdateOut"); - - param_out_tensor->mutable_data(ctx.GetPlace()); - avg_squared_grad_out_tensor->mutable_data(ctx.GetPlace()); - avg_squared_update_out_tensor->mutable_data(ctx.GetPlace()); - - T rho = static_cast(ctx.Attr("rho")); - T epsilon = static_cast(ctx.Attr("epsilon")); - - auto param = framework::EigenVector::Flatten( - *ctx.Input("Param")); - auto grad = framework::EigenVector::Flatten( - *ctx.Input("Grad")); - // Squared gradient accumulator - auto avg_squared_grad = framework::EigenVector::Flatten( - *ctx.Input("AvgSquaredGrad")); - // Squared updates accumulator - auto avg_squared_update = framework::EigenVector::Flatten( - *ctx.Input("AvgSquaredUpdate")); - auto param_out = framework::EigenVector::Flatten(*param_out_tensor); - auto avg_squared_grad_out = - framework::EigenVector::Flatten(*avg_squared_grad_out_tensor); - auto avg_squared_update_out = - framework::EigenVector::Flatten(*avg_squared_update_out_tensor); - auto& place = *ctx.template device_context().eigen_device(); - - avg_squared_grad_out.device(place) = - rho * avg_squared_grad + (1 - rho) * grad.square(); - auto update = - -((avg_squared_update + epsilon) / (avg_squared_grad_out + epsilon)) - .sqrt() * - grad; - avg_squared_update_out.device(place) = - rho * avg_squared_update + (1 - rho) * update.square(); - param_out.device(place) = param + update; - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/adamax_op.cc b/paddle/fluid/operators/optimizers/adamax_op.cc index a95a37c980c..29f3d3b09de 100644 --- a/paddle/fluid/operators/optimizers/adamax_op.cc +++ b/paddle/fluid/operators/optimizers/adamax_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
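
For reference, the element-wise Adadelta update computed by the removed AdadeltaOpKernel above (and by the phi::AdadeltaKernel added later in this patch) can be written as a small standalone scalar program. This is only an illustrative sketch; the variable values below are made up and are not part of the patch.

// Illustrative only: scalar Adadelta step, mirroring the Eigen expressions
// in the deleted AdadeltaOpKernel / new phi::AdadeltaKernel.
#include <cmath>
#include <cstdio>

int main() {
  float rho = 0.95f, epsilon = 1e-6f;        // attrs, sample values
  float param = 1.0f, grad = 0.2f;           // parameter and its gradient
  float avg_sq_grad = 0.01f;                 // E[g^2] accumulator
  float avg_sq_update = 0.005f;              // E[dx^2] accumulator

  // Update the squared-gradient accumulator first.
  avg_sq_grad = rho * avg_sq_grad + (1 - rho) * grad * grad;
  // RMS-scaled step.
  float update =
      -std::sqrt((avg_sq_update + epsilon) / (avg_sq_grad + epsilon)) * grad;
  // Update the squared-update accumulator and apply the step.
  avg_sq_update = rho * avg_sq_update + (1 - rho) * update * update;
  param += update;

  std::printf("param=%f update=%f\n", param, update);
  return 0;
}
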
*/ -#include "paddle/fluid/operators/optimizers/adamax_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -22,67 +25,6 @@ class AdamaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Param"), "Input", "Param", "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("Grad"), "Input", "Grad", "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("Moment"), "Input", "Moment", "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("InfNorm"), "Input", "InfNorm", "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("LearningRate"), "Input", "LearningRate", - "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("Beta1Pow"), "Input", "Beta1Pow", "Adamax"); - PADDLE_ENFORCE_EQ( - ctx->GetInputsVarType("Param").front(), - framework::proto::VarType::LOD_TENSOR, - platform::errors::InvalidArgument( - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Param").front(), - ctx->GetInputsVarType("Param").front())); - PADDLE_ENFORCE_EQ( - ctx->GetInputsVarType("Grad").front(), - framework::proto::VarType::LOD_TENSOR, - platform::errors::InvalidArgument( - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Grad").front(), - ctx->GetInputsVarType("Grad").front())); - - OP_INOUT_CHECK(ctx->HasOutput("ParamOut"), "Output", "ParamOut", "Adamax"); - OP_INOUT_CHECK(ctx->HasOutput("MomentOut"), "Output", "MomentOut", - "Adamax"); - OP_INOUT_CHECK(ctx->HasOutput("InfNormOut"), "Output", "InfNormOut", - "Adamax"); - - auto lr_dims = ctx->GetInputDim("LearningRate"); - PADDLE_ENFORCE_NE(phi::product(lr_dims), 0, - platform::errors::InvalidArgument( - "Maybe the Input variable LearningRate has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function.")); - PADDLE_ENFORCE_EQ(phi::product(lr_dims), 1, - platform::errors::InvalidArgument( - "Learning rate should have 1 dimension")); - auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow"); - PADDLE_ENFORCE_EQ(phi::product(beta1_pow_dims), 1, - platform::errors::InvalidArgument( - "Beta1 power accumulator should have 1 dimension")); - auto param_dims = ctx->GetInputDim("Param"); - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Grad"), - platform::errors::InvalidArgument( - "Param and Grad input of AdamaxOp should have same dimension")); - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Moment"), - platform::errors::InvalidArgument( - "Param and Moment input of AdamaxOp should have same dimension")); - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("InfNorm"), - platform::errors::InvalidArgument( - "Param and InfNorm input of AdamaxOp should have same dimension")); - - ctx->SetOutputDim("ParamOut", param_dims); - ctx->SetOutputDim("MomentOut", param_dims); - ctx->SetOutputDim("InfNormOut", param_dims); - } framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( @@ -150,7 +92,11 @@ division by 0 error. 
} // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker); -REGISTER_OP_CPU_KERNEL( - adamax, ops::AdamaxOpKernel, - ops::AdamaxOpKernel); +DELCARE_INFER_SHAPE_FUNCTOR(adamax, AdamaxInferMetaFunctor, + PT_INFER_META(phi::AdamaxInferMeta)); + +REGISTER_OPERATOR( + adamax, ops::AdamaxOp, ops::AdamaxOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + AdamaxInferMetaFunctor); diff --git a/paddle/fluid/operators/optimizers/adamax_op.cu b/paddle/fluid/operators/optimizers/adamax_op.cu deleted file mode 100644 index 80e0219d441..00000000000 --- a/paddle/fluid/operators/optimizers/adamax_op.cu +++ /dev/null @@ -1,19 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/optimizers/adamax_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - adamax, ops::AdamaxOpKernel, - ops::AdamaxOpKernel); diff --git a/paddle/fluid/operators/optimizers/adamax_op.h b/paddle/fluid/operators/optimizers/adamax_op.h deleted file mode 100644 index df0112448b1..00000000000 --- a/paddle/fluid/operators/optimizers/adamax_op.h +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class AdamaxOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE_EQ(param_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type()))); - const auto* grad_var = ctx.InputVar("Grad"); - PADDLE_ENFORCE_EQ(grad_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Grad").front(), - framework::ToTypeName(grad_var->Type()))); - - auto param_out_tensor = ctx.Output("ParamOut"); - auto moment_out_tensor = ctx.Output("MomentOut"); - auto inf_norm_out_tensor = ctx.Output("InfNormOut"); - - param_out_tensor->mutable_data(ctx.GetPlace()); - moment_out_tensor->mutable_data(ctx.GetPlace()); - inf_norm_out_tensor->mutable_data(ctx.GetPlace()); - - T beta1 = static_cast(ctx.Attr("beta1")); - T beta2 = static_cast(ctx.Attr("beta2")); - T epsilon = static_cast(ctx.Attr("epsilon")); - - auto param = framework::EigenVector::Flatten( - *ctx.Input("Param")); - auto grad = framework::EigenVector::Flatten( - *ctx.Input("Grad")); - auto moment = framework::EigenVector::Flatten( - *ctx.Input("Moment")); - auto inf_norm = framework::EigenVector::Flatten( - *ctx.Input("InfNorm")); - auto lr = framework::EigenVector::Flatten( - *ctx.Input("LearningRate")); - auto beta1_pow = framework::EigenVector::Flatten( - *ctx.Input("Beta1Pow")); - auto param_out = framework::EigenVector::Flatten(*param_out_tensor); - auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); - auto inf_norm_out = - framework::EigenVector::Flatten(*inf_norm_out_tensor); - auto* place = ctx.template device_context().eigen_device(); - - moment_out.device(*place) = beta1 * moment + (1 - beta1) * grad; - inf_norm_out.device(*place) = - grad.abs().cwiseMax((beta2 * inf_norm) + epsilon); - auto lr_t = lr / (1 - beta1_pow); - Eigen::DSizes m_dsize(moment_out_tensor->numel()); - param_out.device(*place) = - param - lr_t.broadcast(m_dsize) * (moment_out / inf_norm_out); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index dc5478e8afb..a21f077c09f 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -28,6 +28,98 @@ std::vector GetMetaTensorsDim(const std::vector& tensors) { return dims; } +void AdamaxInferMeta(const MetaTensor& param, + const MetaTensor& grad, + const MetaTensor& learning_rate, + const MetaTensor& moment, + const MetaTensor& inf_norm, + const MetaTensor& beta1_pow, + float beta1, + float beta2, + float epsilon, + MetaTensor* param_out, + MetaTensor* moment_out, + MetaTensor* inf_norm_out) { + auto lr_dims = learning_rate.dims(); + PADDLE_ENFORCE_NE( + product(lr_dims), + 0, + errors::InvalidArgument("Maybe the Input variable LearningRate has not " + "been initialized. 
You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); + PADDLE_ENFORCE_EQ( + product(lr_dims), + 1, + errors::InvalidArgument("Learning rate should have 1 dimension")); + auto beta1_pow_dims = beta1_pow.dims(); + PADDLE_ENFORCE_EQ(product(beta1_pow_dims), + 1, + errors::InvalidArgument( + "Beta1 power accumulator should have 1 dimension")); + auto param_dims = param.dims(); + PADDLE_ENFORCE_EQ( + param_dims, + grad.dims(), + errors::InvalidArgument( + "Param and Grad input of AdamaxOp should have same dimension")); + PADDLE_ENFORCE_EQ( + param_dims, + moment.dims(), + errors::InvalidArgument( + "Param and Moment input of AdamaxOp should have same dimension")); + PADDLE_ENFORCE_EQ( + param_dims, + inf_norm.dims(), + errors::InvalidArgument( + "Param and InfNorm input of AdamaxOp should have same dimension")); + + param_out->set_dims(param_dims); + param_out->set_dtype(param.dtype()); + + moment_out->set_dims(param_dims); + moment_out->set_dtype(moment.dtype()); + + inf_norm_out->set_dims(param_dims); + inf_norm_out->set_dtype(inf_norm.dtype()); +} + +void AdadeltaInferMeta(const MetaTensor& param, + const MetaTensor& grad, + const MetaTensor& avg_squared_grad, + const MetaTensor& avg_squared_update, + float rho, + float epsilon, + MetaTensor* param_out, + MetaTensor* avg_squared_grad_out, + MetaTensor* avg_squared_update_out) { + auto param_dims = param.dims(); + PADDLE_ENFORCE_EQ( + param_dims, + grad.dims(), + errors::InvalidArgument( + "Param and grad input of AdadeltaOp should have same dimension.")); + PADDLE_ENFORCE_EQ( + param_dims, + avg_squared_grad.dims(), + errors::InvalidArgument("Param and AvgSquaredGrad input of AdadeltaOp " + "should have same dimension")); + PADDLE_ENFORCE_EQ( + param_dims, + avg_squared_update.dims(), + errors::InvalidArgument("Param and AvgSquaredUpdate input of AdadeltaOp " + "should have same dimension")); + + param_out->set_dims(param_dims); + param_out->set_dtype(param.dtype()); + + avg_squared_grad_out->set_dims(param_dims); + avg_squared_grad_out->set_dtype(avg_squared_grad.dtype()); + + avg_squared_update_out->set_dims(param_dims); + avg_squared_update_out->set_dtype(avg_squared_update.dtype()); +} + void BilinearTensorProductInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 51738c5e08e..8cb6f70481d 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -39,4 +39,28 @@ void WhereInferMeta(const MetaTensor& condition, const MetaTensor& x, const MetaTensor& y, MetaTensor* out); + +void AdamaxInferMeta(const MetaTensor& param, + const MetaTensor& grad, + const MetaTensor& learning_rate, + const MetaTensor& moment, + const MetaTensor& inf_norm, + const MetaTensor& beta1_pow, + float beta1, + float beta2, + float epsilon, + MetaTensor* param_out, + MetaTensor* moment_out, + MetaTensor* inf_norm_out); + +void AdadeltaInferMeta(const MetaTensor& param, + const MetaTensor& grad, + const MetaTensor& avg_squared_grad, + const MetaTensor& avg_squared_update, + float rho, + float epsilon, + MetaTensor* param_out, + MetaTensor* avg_squared_grad_out, + MetaTensor* avg_squared_update_out); + } // namespace phi diff --git a/paddle/phi/kernels/adadelta_kernel.h b/paddle/phi/kernels/adadelta_kernel.h new file mode 100644 index 00000000000..65a6aad4151 --- /dev/null +++ b/paddle/phi/kernels/adadelta_kernel.h @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void AdadeltaKernel(const Context& dev_ctx, + const DenseTensor& param, + const DenseTensor& grad, + const DenseTensor& avg_squared_grad, + const DenseTensor& avg_squared_update, + float rho, + float epsilon, + DenseTensor* param_out, + DenseTensor* avg_squared_grad_out, + DenseTensor* avg_squared_update_out); + +} // namespace phi diff --git a/paddle/phi/kernels/adamax_kernel.h b/paddle/phi/kernels/adamax_kernel.h new file mode 100644 index 00000000000..feaf996f162 --- /dev/null +++ b/paddle/phi/kernels/adamax_kernel.h @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void AdamaxKernel(const Context& dev_ctx, + const DenseTensor& param, + const DenseTensor& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment, + const DenseTensor& inf_norm, + const DenseTensor& beta1_pow, + float beta1, + float beta2, + float epsilon, + DenseTensor* param_out, + DenseTensor* moment_out, + DenseTensor* inf_norm_out); + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/adadelta_kernel.cc b/paddle/phi/kernels/cpu/adadelta_kernel.cc new file mode 100644 index 00000000000..e9b5397b616 --- /dev/null +++ b/paddle/phi/kernels/cpu/adadelta_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/adadelta_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/adadelta_kernel_impl.h" + +PD_REGISTER_KERNEL( + adadelta, CPU, ALL_LAYOUT, phi::AdadeltaKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/adamax_kernel.cc b/paddle/phi/kernels/cpu/adamax_kernel.cc new file mode 100644 index 00000000000..867c900e70b --- /dev/null +++ b/paddle/phi/kernels/cpu/adamax_kernel.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/adamax_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/adamax_kernel_impl.h" + +PD_REGISTER_KERNEL(adamax, CPU, ALL_LAYOUT, phi::AdamaxKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/adadelta_kernel.cu b/paddle/phi/kernels/gpu/adadelta_kernel.cu new file mode 100644 index 00000000000..7516a277a74 --- /dev/null +++ b/paddle/phi/kernels/gpu/adadelta_kernel.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/adadelta_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/adadelta_kernel_impl.h" + +PD_REGISTER_KERNEL( + adadelta, GPU, ALL_LAYOUT, phi::AdadeltaKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/adamax_kernel.cu b/paddle/phi/kernels/gpu/adamax_kernel.cu new file mode 100644 index 00000000000..0817c531318 --- /dev/null +++ b/paddle/phi/kernels/gpu/adamax_kernel.cu @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/adamax_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/adamax_kernel_impl.h" + +PD_REGISTER_KERNEL(adamax, GPU, ALL_LAYOUT, phi::AdamaxKernel, float, double) {} diff --git a/paddle/phi/kernels/impl/adadelta_kernel_impl.h b/paddle/phi/kernels/impl/adadelta_kernel_impl.h new file mode 100644 index 00000000000..3fbdf435bab --- /dev/null +++ b/paddle/phi/kernels/impl/adadelta_kernel_impl.h @@ -0,0 +1,65 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/adadelta_kernel.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +template +void AdadeltaKernel(const Context& dev_ctx, + const DenseTensor& param, + const DenseTensor& grad, + const DenseTensor& avg_squared_grad, + const DenseTensor& avg_squared_update, + float rho, + float epsilon, + DenseTensor* param_out, + DenseTensor* avg_squared_grad_out, + DenseTensor* avg_squared_update_out) { + dev_ctx.template Alloc(param_out); + dev_ctx.template Alloc(avg_squared_grad_out); + dev_ctx.template Alloc(avg_squared_update_out); + + T rho_ = static_cast(rho); + T epsilon_ = static_cast(epsilon); + + auto eigen_param = EigenVector::Flatten(param); + auto eigen_grad = EigenVector::Flatten(grad); + // Squared gradient accumulator + auto eigen_avg_squared_grad = EigenVector::Flatten(avg_squared_grad); + // Squared updates accumulator + auto eigen_avg_squared_update = EigenVector::Flatten(avg_squared_update); + auto eigen_param_out = EigenVector::Flatten(*param_out); + auto eigen_avg_squared_grad_out = + EigenVector::Flatten(*avg_squared_grad_out); + auto eigen_avg_squared_update_out = + EigenVector::Flatten(*avg_squared_update_out); + auto& place = *dev_ctx.eigen_device(); + + eigen_avg_squared_grad_out.device(place) = + rho_ * eigen_avg_squared_grad + (1 - rho_) * eigen_grad.square(); + auto update = -((eigen_avg_squared_update + epsilon_) / + (eigen_avg_squared_grad_out + epsilon_)) + .sqrt() * + eigen_grad; + eigen_avg_squared_update_out.device(place) = + rho_ * eigen_avg_squared_update + (1 - rho_) * update.square(); + eigen_param_out.device(place) = eigen_param + update; +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/adamax_kernel_impl.h b/paddle/phi/kernels/impl/adamax_kernel_impl.h new file mode 100644 index 00000000000..bff553319a2 --- /dev/null +++ b/paddle/phi/kernels/impl/adamax_kernel_impl.h @@ -0,0 +1,69 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/adamax_kernel.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +template +void AdamaxKernel(const Context& dev_ctx, + const DenseTensor& param, + const DenseTensor& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment, + const DenseTensor& inf_norm, + const DenseTensor& beta1_pow, + float beta1, + float beta2, + float epsilon, + DenseTensor* param_out, + DenseTensor* moment_out, + DenseTensor* inf_norm_out) { + dev_ctx.template Alloc(param_out); + dev_ctx.template Alloc(moment_out); + dev_ctx.template Alloc(inf_norm_out); + + T beta1_ = static_cast(beta1); + T beta2_ = static_cast(beta2); + T epsilon_ = static_cast(epsilon); + + auto eigen_param = EigenVector::Flatten(param); + auto eigen_grad = EigenVector::Flatten(grad); + auto eigen_moment = EigenVector::Flatten(moment); + auto eigen_inf_norm = EigenVector::Flatten(inf_norm); + auto eigen_lr = EigenVector::Flatten(learning_rate); + auto eigen_beta1_pow = EigenVector::Flatten(beta1_pow); + + auto eigen_param_out = EigenVector::Flatten(*param_out); + auto eigen_moment_out = EigenVector::Flatten(*moment_out); + auto eigen_inf_norm_out = EigenVector::Flatten(*inf_norm_out); + + auto& place = *dev_ctx.eigen_device(); + + eigen_moment_out.device(place) = + beta1_ * eigen_moment + (1 - beta1_) * eigen_grad; + eigen_inf_norm_out.device(place) = + eigen_grad.abs().cwiseMax((beta2_ * eigen_inf_norm) + epsilon_); + auto lr_t = eigen_lr / (1 - eigen_beta1_pow); + Eigen::DSizes m_dsize(moment_out->numel()); + eigen_param_out.device(place) = + eigen_param - + lr_t.broadcast(m_dsize) * (eigen_moment_out / eigen_inf_norm_out); +} + +} // namespace phi -- GitLab From 0ad25fb923a7e8426e1c34e3e5d2bcabb90b2233 Mon Sep 17 00:00:00 2001 From: lilong12 Date: Mon, 7 Mar 2022 10:49:53 +0800 Subject: [PATCH 075/261] initialize processgroupnccl with store (#40181) --- .../collective/ProcessGroupNCCL.cc | 45 ++++++++----------- .../distributed/collective/ProcessGroupNCCL.h | 5 ++- paddle/fluid/distributed/store/store.h | 23 +++++++--- paddle/fluid/pybind/communication.cc | 36 ++++++++++++--- paddle/fluid/pybind/distributed_py.cc | 44 +----------------- .../tests/unittests/process_group_nccl.py | 19 +++----- 6 files changed, 75 insertions(+), 97 deletions(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 88d8fb69eb6..67715f410d4 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -156,36 +156,27 @@ bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) { // Same as Wait void ProcessGroupNCCL::NCCLTask::Synchronize() { Wait(kWaitTimeout); } -ProcessGroupNCCL::ProcessGroupNCCL(const ProcessGroupStrategy& strategy, +ProcessGroupNCCL::ProcessGroupNCCL(const std::shared_ptr& store, int rank, int size) - : ProcessGroup(rank, size), strategy_(strategy) {} - -void ProcessGroupNCCL::BcastNCCLId( - std::vector& nccl_ids, // 
NOLINT - int root, int server_fd) { - if (strategy_.local_rank_ == root) { - std::vector other_trainers; - for (auto& ep : strategy_.trainer_endpoints_) { - if (ep != strategy_.current_endpoint_) { - other_trainers.push_back(ep); - } - } - platform::SendBroadCastCommID(other_trainers, &nccl_ids); - } else { - platform::RecvBroadCastCommID(server_fd, strategy_.current_endpoint_, - &nccl_ids); - } -} + : ProcessGroup(rank, size), store_(store) {} void ProcessGroupNCCL::BroadcastUniqueNCCLID( std::vector& nccl_ids) { // NOLINT - - int server_fd = -1; - if (rank_ != 0) { - server_fd = platform::SocketServer::GetInstance(strategy_.current_endpoint_) - .socket(); + if (rank_ == 0) { + for (size_t i = 0; i < nccl_ids.size(); i++) { + auto key = "ProcessGroupNCCL/nccl_ids/" + std::to_string(i); + auto nccl_id = std::vector( + reinterpret_cast(&nccl_ids[i]), + reinterpret_cast(&nccl_ids[i]) + NCCL_UNIQUE_ID_BYTES); + store_->set(key, nccl_id); + } + } else { + for (size_t i = 0; i < nccl_ids.size(); i++) { + auto key = "ProcessGroupNCCL/nccl_ids/" + std::to_string(i); + auto ret = store_->get(key); + std::memcpy(&nccl_ids[i], ret.data(), ret.size()); + } } - BcastNCCLId(nccl_ids, 0, server_fd); } // create NCCLManager cache for places_key @@ -213,8 +204,8 @@ void ProcessGroupNCCL::CreateNCCLManagerCache( } BroadcastUniqueNCCLID(nccl_ids); - VLOG(3) << "init nccl rank: " << strategy_.local_rank_ - << ", nranks: " << strategy_.nranks_ << ", place: " << places_key + VLOG(3) << "init nccl rank: " << rank_ << ", nranks: " << size_ + << ", place: " << places_key << ", nccl uniqueid: " << SerializeNCCLUniqueId(nccl_id); std::vector> dev_ctx; diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index d63a5e76838..aa2a2b8fa20 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -25,6 +25,7 @@ #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/distributed/store/store.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" #include "paddle/fluid/platform/place.h" @@ -75,7 +76,7 @@ class ProcessGroupNCCL : public ProcessGroup { private: }; - ProcessGroupNCCL(const ProcessGroupStrategy& strategy, int rank, int size); + ProcessGroupNCCL(const std::shared_ptr& store, int rank, int size); const std::string GetBackendName() const override { return std::string(NCCL_BACKEND_NAME); @@ -118,7 +119,7 @@ class ProcessGroupNCCL : public ProcessGroup { const std::vector& inputs); protected: - ProcessGroupStrategy strategy_; + std::shared_ptr store_; std::shared_ptr nccl_comm_; std::mutex mutex_; std::unordered_map>> diff --git a/paddle/fluid/distributed/store/store.h b/paddle/fluid/distributed/store/store.h index 2581a74d7e8..7b4ae7e70ff 100644 --- a/paddle/fluid/distributed/store/store.h +++ b/paddle/fluid/distributed/store/store.h @@ -25,15 +25,26 @@ namespace distributed { class Store { public: - Store() = delete; + Store() : _timeout(tcputils::kNoTimeout) {} explicit Store(const std::chrono::seconds& timeout) : _timeout(timeout) {} virtual ~Store() = default; - virtual int64_t add(const std::string& key, int64_t value) = 0; - virtual std::vector get(const std::string& key) = 0; - virtual void wait(const std::string& key) = 0; - virtual void set(const std::string& key, - const std::vector& value) = 0; + virtual int64_t add(const std::string& key, int64_t 
value) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Implement the add method in the subclass.")); + } + virtual std::vector get(const std::string& key) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Implement the add method in the subclass.")); + } + virtual void wait(const std::string& key) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Implement the add method in the subclass.")); + } + virtual void set(const std::string& key, const std::vector& value) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Implement the add method in the subclass.")); + } virtual const std::chrono::seconds& timeout() const { return _timeout; } diff --git a/paddle/fluid/pybind/communication.cc b/paddle/fluid/pybind/communication.cc index c01accaf598..1a6a395545a 100644 --- a/paddle/fluid/pybind/communication.cc +++ b/paddle/fluid/pybind/communication.cc @@ -30,18 +30,42 @@ namespace pybind { using TCPStore = paddle::distributed::TCPStore; -void BindTCPStore(py::module* m) { - py::class_>(*m, "TCPStore") +void BindTCPStore(py::module *m) { + auto Store = + py::class_>( + *m, "Store") + .def(py::init<>()) + .def("set", + [](distributed::Store &self, const std::string &key, + const std::string &value) { + std::vector data(value.begin(), value.end()); + self.set(key, data); + }, + py::arg("key"), py::arg("value"), + py::call_guard()) + .def("get", + [](distributed::Store &self, + const std::string &key) -> py::bytes { + auto data = self.get(key); + return py::bytes(reinterpret_cast(data.data()), + data.size()); + }, + py::arg("key"), py::call_guard()) + .def("add", &distributed::Store::add, + py::call_guard()) + .def("wait", &distributed::Store::wait, + py::call_guard()); + + py::class_>(*m, "TCPStore", Store) .def(py::init([](std::string hostname, uint16_t port, bool is_master, size_t world_size, std::chrono::seconds timeout) { return std::make_shared(hostname, port, is_master, world_size, timeout); }), py::arg("hostname"), py::arg("port"), py::arg("is_master"), - py::arg("world_size"), py::arg("timeout"), - py::call_guard()) - .def("add", &TCPStore::add) - .def("get", &TCPStore::get); + py::arg("world_size"), + py::arg("timeout") = distributed::tcputils::kNoTimeout, + py::call_guard()); } } // namespace pybind diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 17512863357..9870eab8da9 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -197,7 +197,7 @@ void BindDistributed(py::module *m) { py::class_>( *m, "ProcessGroupNCCL", ProcessGroup) - .def(py::init(), + .def(py::init &, int, int>(), py::call_guard()); #endif @@ -210,44 +210,6 @@ void BindDistributed(py::module *m) { .def("synchronize", &distributed::ProcessGroup::Task::Synchronize, py::call_guard()); - // define parallel strategy, it will be removed - py::class_ pg_strategy( - *m, "ProcessGroupStrategy", ""); - pg_strategy.def(py::init()) - .def_property("nranks", - [](const distributed::ProcessGroupStrategy &self) { - return self.nranks_; - }, - [](distributed::ProcessGroupStrategy &self, int nranks) { - self.nranks_ = nranks; - }) - .def_property("local_rank", - [](const distributed::ProcessGroupStrategy &self) { - return self.local_rank_; - }, - [](distributed::ProcessGroupStrategy &self, - int local_rank) { self.local_rank_ = local_rank; }) - .def_property( - "trainer_endpoints", - [](const distributed::ProcessGroupStrategy &self) { - return self.trainer_endpoints_; - }, - [](distributed::ProcessGroupStrategy &self, - std::vector 
eps) { self.trainer_endpoints_ = eps; }) - .def_property("current_endpoint", - [](const distributed::ProcessGroupStrategy &self) { - return self.current_endpoint_; - }, - [](distributed::ProcessGroupStrategy &self, - const std::string &ep) { self.current_endpoint_ = ep; }) - .def_property("nrings", - [](const distributed::ProcessGroupStrategy &self) { - return self.nrings_; - }, - [](distributed::ProcessGroupStrategy &self, int nrings) { - self.nrings_ = nrings; - }); - #if defined(PADDLE_WITH_GLOO) py::class_(*m, "GlooOptions") .def(py::init<>()) @@ -279,9 +241,7 @@ void BindDistributed(py::module *m) { return std::make_shared(store, rank, world_size, opts); }), - py::arg("store"), py::arg("rank"), - py::arg("world_size"), // py::arg("timeout") = - // kProcessGroupDefaultTimeout, + py::arg("store"), py::arg("rank"), py::arg("world_size"), py::call_guard()) .def_static("create_default_device", &ProcessGroupGloo::createDefaultDevice); diff --git a/python/paddle/fluid/tests/unittests/process_group_nccl.py b/python/paddle/fluid/tests/unittests/process_group_nccl.py index 4833cea9a8d..b1da0777feb 100644 --- a/python/paddle/fluid/tests/unittests/process_group_nccl.py +++ b/python/paddle/fluid/tests/unittests/process_group_nccl.py @@ -27,22 +27,13 @@ import paddle.fluid.core as core from paddle.fluid.framework import _test_eager_guard from paddle.fluid.dygraph.parallel import ParallelEnv -ProcessGroupStrategy = core.ProcessGroupStrategy - def init_process_group(strategy=None): - # this will remove - if strategy is None: - strategy = ProcessGroupStrategy() - strategy.nranks = ParallelEnv().nranks - strategy.local_rank = ParallelEnv().local_rank - strategy.trainer_endpoints = ParallelEnv().trainer_endpoints - strategy.current_endpoint = ParallelEnv().current_endpoint - if strategy.nranks < 2: - return - - pg_group = core.ProcessGroupNCCL(strategy, strategy.local_rank, - strategy.nranks) + nranks = ParallelEnv().nranks + rank = ParallelEnv().local_rank + is_master = True if rank == 0 else False + store = paddle.fluid.core.TCPStore("127.0.0.1", 6173, is_master, nranks) + pg_group = core.ProcessGroupNCCL(store, rank, nranks) return pg_group -- GitLab From b4eb413efaa109359ca75abdb02bf27ccaf8deb9 Mon Sep 17 00:00:00 2001 From: zn <96479180+kangna-qi@users.noreply.github.com> Date: Mon, 7 Mar 2022 11:33:23 +0800 Subject: [PATCH 076/261] [MLU]support reduce tensors on mlu (#40000) * [MLU]support reduce tensors on mlu * [MLU]fix compiler options --- paddle/fluid/imperative/CMakeLists.txt | 3 +- paddle/fluid/imperative/reducer.cc | 87 ++++++++++++++- paddle/fluid/imperative/reducer.h | 2 +- paddle/fluid/imperative/tests/CMakeLists.txt | 2 +- paddle/fluid/imperative/tests/test_group.cc | 20 +++- paddle/fluid/operators/math/CMakeLists.txt | 2 + .../fluid/operators/math/concat_and_split.cc | 100 ++++++++++++++++++ paddle/fluid/operators/mlu/mlu_baseop.cc | 42 ++++++++ paddle/fluid/operators/mlu/mlu_baseop.h | 11 ++ paddle/fluid/operators/strided_memcpy.h | 5 + 10 files changed, 265 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index e1ce705533a..3d8a5ab21f0 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -33,6 +33,7 @@ if(NOT WIN32) endif() if(WITH_CNCL) cc_library(cncl_context SRCS cncl_context.cc DEPS collective_helper device_context tensor var_type_traits) + cc_library(reducer SRCS reducer.cc DEPS layer) endif() if(WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL) 
cc_library(heter_ccl_context SRCS heter_ccl_context.cc DEPS collective_helper device_context tensor var_type_traits) @@ -41,7 +42,7 @@ if(NOT WIN32) endif(NOT WIN32) if(WITH_GLOO) cc_library(imperative_gloo_context SRCS gloo_context.cc DEPS collective_helper device_context tensor var_type_traits) - if ( WIN32 OR (NOT (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL) )) + if ( WIN32 OR (NOT (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL OR WITH_CNCL) )) cc_library(reducer SRCS reducer.cc DEPS layer) endif() endif() diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 3a6365b2af2..fec9afbf3b4 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -31,7 +31,7 @@ namespace imperative { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ - defined(PADDLE_WITH_ASCEND_CL) + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CNCL) // div the nranks void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { framework::Tensor *tensor = @@ -67,6 +67,9 @@ void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { #ifdef PADDLE_WITH_XPU_BKCL // TODO(liuyuhui) support xpu about div nranks in the future #endif + } else if (platform::is_mlu_place(tensor->place())) { + // TODO(zhangna) + VLOG(4) << "divnrank for mlu not support yet"; } } @@ -222,6 +225,56 @@ void SplitTensorsWithType( } #endif +#ifdef PADDLE_WITH_CNCL +// context is used to select the stream for concat +template <> +void ConcatTensorsWithType( + const platform::MLUDeviceContext &context, + const std::vector &dense_tensors_, + framework::Variable *p_dense_contents, + framework::proto::VarType::Type type) { + switch (type) { + case framework::proto::VarType::FP16: + ConcatTensorsForAllReduce( + context, dense_tensors_, p_dense_contents); + break; + case framework::proto::VarType::FP32: + ConcatTensorsForAllReduce( + context, dense_tensors_, p_dense_contents); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it concats tensors for " + "allreduce.", + framework::DataTypeToString(type))); + } +} + +// context is used to select the stream for split +template <> +void SplitTensorsWithType( + const platform::MLUDeviceContext &context, + framework::Variable *p_dense_contents, + std::vector *p_dense_tensors, + framework::proto::VarType::Type type) { + switch (type) { + case framework::proto::VarType::FP16: + SplitTensorsForAllReduce( + context, p_dense_contents, p_dense_tensors); + break; + case framework::proto::VarType::FP32: + SplitTensorsForAllReduce( + context, p_dense_contents, p_dense_tensors); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it splits tensors for " + "allreduce.", + framework::DataTypeToString(type))); + } +} +#endif + void Group::ConcatTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { @@ -253,6 +306,16 @@ void Group::ConcatTensors(const platform::DeviceContext &context) { PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't concat npu grads since it's not compiled with HCCL," "Please recompile or reinstall Paddle with HCCL support.")); +#endif + } else if (platform::is_mlu_place(place)) { +#ifdef PADDLE_WITH_CNCL + ConcatTensorsWithType( + static_cast(context), + dense_tensors_, &dense_contents_, dtype_); +#else + 
PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't concat mlu grads since it's not compiled with CNCL," + "Please recompile or reinstall Paddle with CNCL support.")); #endif } else if (platform::is_cpu_place(place)) { ConcatTensorsWithType( @@ -295,6 +358,16 @@ void Group::SplitTensors(const platform::DeviceContext &context) { PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't split npu grad since it's not compiled with HCCL," "Please recompile or reinstall Paddle with HCCL support.")); +#endif + } else if (platform::is_mlu_place(place)) { +#ifdef PADDLE_WITH_CNCL + SplitTensorsWithType( + static_cast(context), + &dense_contents_, &dense_tensors_, dtype_); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't split mlu grad since it's not compiled with CNCL," + "Please recompile or reinstall Paddle with CNCL support.")); #endif } else if (platform::is_cpu_place(place)) { SplitTensorsWithType( @@ -746,6 +819,11 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { // TODO(liuyuhui) support XPU set constant VLOG(3) << "XPU doesn't support set_constant"; } +#elif defined(PADDLE_WITH_CNCL) + if (platform::is_mlu_place(group_tensor.place())) { + // TODO(liuyuhui) support MLU set constant + VLOG(3) << "MLU doesn't support set_constant"; + } #else auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_); if (HasGrad(var_index)) { @@ -846,12 +924,13 @@ void Reducer::MarkGroupReady(size_t group_index) { cv_.notify_all(); } }); -#elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) || \ - defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) +#elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) || \ + defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) || \ + defined(PADDLE_WITH_CNCL) FusedAllReduceSchedule(run_order, group, next_group_); #else PADDLE_THROW(platform::errors::PreconditionNotMet( - "Not compiled with BKCL or NCCL or GLOO.")); + "Not compiled with BKCL or NCCL or CNCL or GLOO.")); #endif } } diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index cca773b840c..9fac4b41cbd 100644 --- a/paddle/fluid/imperative/reducer.h +++ b/paddle/fluid/imperative/reducer.h @@ -45,7 +45,7 @@ namespace imperative { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ - defined(PADDLE_WITH_ASCEND_CL) + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CNCL) template struct DivNRanksFunctor { diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt index e4f1cfdb3ba..09de0106ed6 100644 --- a/paddle/fluid/imperative/tests/CMakeLists.txt +++ b/paddle/fluid/imperative/tests/CMakeLists.txt @@ -21,6 +21,6 @@ cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info s cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op reduce_sum_op elementwise_add_op memcpy) cc_test(test_hooks SRCS test_hooks.cc DEPS tracer basic_engine layer proto_desc operator op_registry variable_helper mul_op elementwise_add_op memcpy) cc_test(test_eager SRCS test_eager.cc DEPS tracer layer prepared_operator mul_op) -if (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL) +if (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_CNCL) cc_test(test_group SRCS test_group.cc DEPS reducer concat_and_split memcpy) endif() diff --git a/paddle/fluid/imperative/tests/test_group.cc 
b/paddle/fluid/imperative/tests/test_group.cc index 6c304278d21..5e674af1a08 100644 --- a/paddle/fluid/imperative/tests/test_group.cc +++ b/paddle/fluid/imperative/tests/test_group.cc @@ -72,8 +72,10 @@ void GroupConcatSplit(Place place, size_t size) { value.push_back(static_cast(1.0 * j)); } - if (std::is_same::value) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (std::is_same::value || + std::is_same::value) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_CNCL) paddle::memory::Copy(place, data, cpu_place, value.data(), sizeof(T) * value.size(), 0); #endif @@ -180,5 +182,19 @@ TEST(TestGroup, TestXPUConcatSplit) { } #endif +#if defined(PADDLE_WITH_CNCL) +TEST(TestGroup, TestMLUConcatSplit) { + platform::MLUPlace mlu_place(0); + platform::CPUPlace cpu_place; + + int size = 3; + GroupConcatSplit(cpu_place, size); + GroupConcatSplit(mlu_place, size); + + size = 15; + GroupConcatSplit(cpu_place, size); + GroupConcatSplit(mlu_place, size); +} +#endif } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 14b12ca3acb..bce927c32dd 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -5,6 +5,8 @@ endif() # please add new math_library in alphabetical order if (WITH_ASCEND_CL) math_library(concat_and_split DEPS concat_and_split_functor npu_op_runner) +elseif (WITH_MLU) +math_library(concat_and_split DEPS concat_and_split_functor mlu_baseop) else() math_library(concat_and_split DEPS concat_and_split_functor) endif() diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc index 46126ac59c8..c9308d27c0a 100644 --- a/paddle/fluid/operators/math/concat_and_split.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -18,6 +18,9 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #endif +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#endif #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" @@ -226,6 +229,90 @@ class SplitFunctor { }; #endif +#ifdef PADDLE_WITH_MLU +template +class ConcatFunctor { + public: + void operator()(const platform::MLUDeviceContext& context, + const std::vector& input, int axis, + framework::Tensor* output) { + int dev_id = context.GetPlace().GetDeviceId(); + platform::MLUDeviceGuard guard(dev_id); + + auto ins_size = input.size(); + + const int axis_t = axis; + const int ins_size_t = ins_size; + auto place = context.GetPlace(); + output->mutable_data(place); + + // mlu should do sth + // init ins tensors + std::vector inputs; + std::vector input_descs; + std::vector desc_vector; + for (size_t i = 0; i < ins_size; i++) { + input_descs.emplace_back(MLUCnnlTensorDesc( + input[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(input[i].dtype()))); + desc_vector.push_back(input_descs.back().get()); + inputs.push_back(input[i].data()); + } + // init out tensors + MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(output->dtype())); + + // MLU should do sth + MLUCnnl::Concat(context, ins_size_t, axis_t, desc_vector.data(), + inputs.data(), output_desc.get(), GetBasePtr(output)); + } +}; + +template +class SplitFunctor { + public: + void operator()(const platform::MLUDeviceContext& context, + const framework::Tensor& input, + const std::vector& ref_inputs, + const int axis, std::vector* outputs) { + if (input.numel() == 0) { + return; + } + + int dev_id = context.GetPlace().GetDeviceId(); + platform::MLUDeviceGuard guard(dev_id); + + auto in_dims = input.dims(); + auto out_size = outputs->size(); + + std::vector outs_dims(out_size, in_dims); + for (size_t i = 0; i < out_size; ++i) { + outs_dims[i][axis] = ref_inputs[i]->dims()[axis]; + } + + // init out tensors + std::vector vct_tensor; + std::vector output_descs; + std::vector desc_vector; + for (size_t i = 0; i < out_size; i++) { + (*outputs)[i]->Resize(outs_dims[i]); + (*outputs)[i]->mutable_data(context.GetPlace()); + output_descs.emplace_back( + MLUCnnlTensorDesc(*(*outputs)[i], CNNL_LAYOUT_ARRAY, + ToCnnlDataType((*outputs)[i]->dtype()))); + desc_vector.push_back(output_descs.back().get()); + vct_tensor.push_back(GetBasePtr((*outputs)[i])); + } + // init in tensors + MLUCnnlTensorDesc input_desc(input, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(input.dtype())); + + // MLU should do sth + MLUCnnl::Split(context, out_size, axis, input_desc.get(), input.data(), + desc_vector.data(), vct_tensor.data()); + } +}; +#endif + #define DEFINE_FUNCTOR(type) \ template class ConcatFunctor; \ template class SplitFunctor; @@ -248,6 +335,19 @@ DEFINE_XPU_FUNCTOR(float) FOR_ALL_TYPES(DEFINE_NPU_FUNCTOR) #endif +#ifdef PADDLE_WITH_MLU +#define DEFINE_MLU_FUNCTOR(type) \ + template class ConcatFunctor; \ + template class SplitFunctor; +DEFINE_MLU_FUNCTOR(float) +DEFINE_MLU_FUNCTOR(platform::float16) +DEFINE_MLU_FUNCTOR(int64_t) +DEFINE_MLU_FUNCTOR(bool) +DEFINE_MLU_FUNCTOR(int) +DEFINE_MLU_FUNCTOR(int8_t) +DEFINE_MLU_FUNCTOR(int16_t) +DEFINE_MLU_FUNCTOR(uint8_t) +#endif } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index 9de03582cbb..1fdaa153e3c 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ 
b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -499,6 +499,27 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { output_desc, output)); } +/* static */ void MLUCnnl::Concat(const MLUDeviceContext& dev_ctx, + const int pack_num, const int axis, + const cnnlTensorDescriptor_t inputs_desc[], + const void* const inputs[], + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = dev_ctx.cnnl_handle(); + + size_t workspace_size = 0; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetConcatWorkspaceSize(handle, pack_num, &workspace_size)); + + Tensor workspace(paddle::experimental::DataType::INT8); + workspace.Resize(framework::DDim({static_cast(workspace_size)})); + void* workspace_ptr = workspace.mutable_data(dev_ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlConcat(handle, pack_num, axis, inputs_desc, + inputs, workspace_ptr, workspace_size, + output_desc, output)); +} + /* static */ void MLUCnnl::Div( const ExecutionContext& ctx, cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t in0_desc, const void* in0, @@ -977,6 +998,27 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { output_descs, output_ptrs)); } +/* static */ void MLUCnnl::Split(const MLUDeviceContext& dev_ctx, int split_num, + int axis, + const cnnlTensorDescriptor_t input_desc, + const void* input_ptr, + const cnnlTensorDescriptor_t output_descs[], + void* output_ptrs[]) { + cnnlHandle_t handle = dev_ctx.cnnl_handle(); + + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetSplitWorkspaceSize(handle, split_num, &workspace_size)); + + Tensor workspace(paddle::experimental::DataType::INT8); + workspace.Resize(framework::DDim({static_cast(workspace_size)})); + void* workspace_ptr = workspace.mutable_data(dev_ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSplit(handle, split_num, axis, input_desc, + input_ptr, workspace_ptr, workspace_size, + output_descs, output_ptrs)); +} + /* static */ void MLUCnnl::GatherFunctor( const ExecutionContext& ctx, const int axis, const int batch_dims, const cnnlTensorDescriptor_t params_desc, const void* params, diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 2a54a8392c7..b55b10686e9 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -403,6 +403,11 @@ class MLUCnnl { const void* const inputs[], const cnnlTensorDescriptor_t output_desc, void* output); + static void Concat(const MLUDeviceContext& dev_ctx, const int pack_num, + const int axis, const cnnlTensorDescriptor_t inputs_desc[], + const void* const inputs[], + const cnnlTensorDescriptor_t output_desc, void* output); + static void Cast(const ExecutionContext& ctx, cnnlCastDataType_t cast_type, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output); @@ -566,6 +571,12 @@ class MLUCnnl { const cnnlTensorDescriptor_t output_descs[], void* output_ptrs[]); + static void Split(const MLUDeviceContext& dev_ctx, int split_num, int axis, + const cnnlTensorDescriptor_t input_desc, + const void* input_ptr, + const cnnlTensorDescriptor_t output_descs[], + void* output_ptrs[]); + static void Scale(const ExecutionContext& ctx, const int axis, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t alpha_desc, const void* alpha, diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index c92d468f346..af29aac6b90 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ 
b/paddle/fluid/operators/strided_memcpy.h @@ -109,6 +109,11 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, auto& npu_ctx = reinterpret_cast(ctx); memory::Copy(npu_place, dst + i * dst_after, npu_place, src + i * src_after, sizeof(T) * size, npu_ctx.stream()); +#elif defined(PADDLE_WITH_MLU) + auto& mlu_place = place; + auto& mlu_ctx = reinterpret_cast(ctx); + memory::Copy(mlu_place, dst + i * dst_after, mlu_place, + src + i * src_after, sizeof(T) * size, mlu_ctx.stream()); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "Paddle is not compiled with GPU.")); -- GitLab From 98c427e2a5584191507c1bdce8baa0e9fc1dd88e Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Mon, 7 Mar 2022 11:38:43 +0800 Subject: [PATCH 077/261] [bf16] add bf16 kernel: sigmoid & sqrt & softplus & square (#40004) * add activ * refine unittest * refine unittest * refine unittest * refine unittest * refine code --- paddle/fluid/operators/activation_op.kps | 31 +++-- paddle/fluid/operators/amp/fp16_type_traits.h | 7 ++ paddle/fluid/operators/dropout_impl.cu.h | 3 +- paddle/phi/common/bfloat16.h | 4 + .../tests/unittests/test_activation_op.py | 113 ++++++++++++++++++ 5 files changed, 150 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index e1afb3919f8..3b7ce9eaf2b 100644 --- a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -1509,7 +1509,9 @@ namespace plat = paddle::platform; ops::ActivationCudaKernel>, \ ops::ActivationCudaKernel>); \ + ops::functor>, \ + ops::ActivationCudaKernel>); \ REGISTER_OP_CUDA_KERNEL( \ act_type##_grad, \ ops::ActivationGradCudaKernel>, \ ops::ActivationGradCudaKernel>); + ops::grad_functor>, \ + ops::ActivationGradCudaKernel>); #define REGISTER_ACTIVATION_CUDA_KERNEL_INT(act_type, op_name, functor, \ grad_functor) \ @@ -1531,7 +1535,9 @@ namespace plat = paddle::platform; ops::ActivationCudaKernel>, \ ops::ActivationCudaKernel>); \ + ops::functor>, \ + ops::ActivationCudaKernel>); \ REGISTER_OP_CUDA_KERNEL( \ act_type##_grad, \ ops::ActivationGradCudaKernel>, \ ops::ActivationGradCudaKernel>); + ops::grad_functor>, \ + ops::ActivationGradCudaKernel>); /* ======================== leaky relu register ============================ */ REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, @@ -1650,7 +1658,9 @@ REGISTER_OP_CUDA_KERNEL( ops::SigmoidDoubleGradKernel>, ops::SigmoidDoubleGradKernel>); + ops::SigmoidGradGradFunctor>, + ops::SigmoidDoubleGradKernel>); REGISTER_OP_CUDA_KERNEL( sigmoid_triple_grad, @@ -1659,7 +1669,10 @@ REGISTER_OP_CUDA_KERNEL( ops::SigmoidTripleGradKernel>, ops::SigmoidTripleGradKernel>); + ops::SigmoidTripleGradFunctor>, + ops::SigmoidTripleGradKernel< + plat::CUDADeviceContext, + ops::SigmoidTripleGradFunctor>); /* ========================================================================== */ /* =========================== tanh register ============================ */ @@ -1696,7 +1709,9 @@ REGISTER_OP_CUDA_KERNEL( ops::SqrtDoubleGradKernel>, ops::SqrtDoubleGradKernel>); + ops::SqrtGradGradFunctor>, + ops::SqrtDoubleGradKernel>); /* ========================================================================== */ /* =========================== rsqrt register ============================= @@ -1726,6 +1741,8 @@ REGISTER_OP_CUDA_KERNEL( ops::SquareGradGradFunctor>, ops::SquareDoubleGradKernel>, + ops::SquareDoubleGradKernel>, ops::SquareDoubleGradKernel>, 
ops::SquareDoubleGradKernel { using Type = float; }; +template <> +class MPTypeTrait { + public: + using Type = float; +}; + } // namespace details } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index dcdab033e8f..17665ad67e4 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -266,7 +266,8 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, cudaMemcpyDeviceToDevice, stream)); #endif } else { - T factor = static_cast(1.0f - dropout_prob); + using MT = typename details::MPTypeTrait::Type; + MT factor = static_cast(1.0f - dropout_prob); std::vector ins = {&x}; std::vector outs = {y}; auto functor = phi::funcs::ScaleFunctor(factor); diff --git a/paddle/phi/common/bfloat16.h b/paddle/phi/common/bfloat16.h index cf99bb8f19a..5f30ee4077b 100644 --- a/paddle/phi/common/bfloat16.h +++ b/paddle/phi/common/bfloat16.h @@ -310,6 +310,10 @@ HOSTDEVICE inline bool(isfinite)(const bfloat16& a) { return !((isnan)(a)) && !((isinf)(a)); } +HOSTDEVICE inline bfloat16(abs)(const bfloat16& a) { + return bfloat16(std::abs(static_cast(a))); +} + inline std::ostream& operator<<(std::ostream& os, const bfloat16& a) { os << static_cast(a); return os; diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index b4b5944e27c..5c40b898d23 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -183,6 +183,34 @@ class TestSigmoid(TestActivation): self.check_grad(['X'], 'Out', max_relative_error=0.01) +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSigmoidBF16(OpTest): + def setUp(self): + self.op_type = "sigmoid" + self.init_dtype() + + np.random.seed(1024) + x = np.random.uniform(-1, 1, [11, 17]).astype(np.float32) + out = 1 / (1 + np.exp(-x)) + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(x)) + } + self.outputs = {'Out': convert_float_to_uint16(out)} + + def init_dtype(self): + self.dtype = np.uint16 + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['X'], 'Out') + + class TestSilu(TestActivation): def setUp(self): self.op_type = "silu" @@ -945,6 +973,34 @@ class TestSqrt(TestActivation, TestParameter): self.check_grad(['X'], 'Out') +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSqrtBF16(OpTest): + def setUp(self): + self.op_type = "sqrt" + self.init_dtype() + + np.random.seed(1023) + x = np.random.uniform(0.1, 1, [11, 17]).astype(np.float32) + out = np.sqrt(x) + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(x)) + } + self.outputs = {'Out': convert_float_to_uint16(out)} + + def init_dtype(self): + self.dtype = np.uint16 + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['X'], 'Out') + + class TestRsqrt(TestActivation): def setUp(self): self.op_type = "rsqrt" @@ -2195,6 +2251,34 @@ class TestSquare(TestActivation): self.check_grad(['X'], 'Out', max_relative_error=0.007) +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not 
compiled with CUDA") +class TestSquareBF16(OpTest): + def setUp(self): + self.op_type = "square" + self.init_dtype() + + np.random.seed(1024) + x = np.random.uniform(0.1, 1, [11, 17]).astype(np.float32) + out = np.square(x) + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(x)) + } + self.outputs = {'Out': convert_float_to_uint16(out)} + + def init_dtype(self): + self.dtype = np.uint16 + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['X'], 'Out', numeric_grad_delta=0.5) + + class TestPow(TestActivation): def setUp(self): self.op_type = "pow" @@ -2433,6 +2517,35 @@ class TestSoftplus(TestActivation): self.check_grad(['X'], 'Out') +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSoftplusBF16(OpTest): + def setUp(self): + self.op_type = "softplus" + self.init_dtype() + + beta = 2 + threshold = 15 + + np.random.seed(1024) + x = np.random.uniform(-1, 1, [10, 12]).astype(np.float32) + out = ref_softplus(x, beta, threshold) + self.inputs = {'X': convert_float_to_uint16(x)} + self.attrs = {'beta': beta, "threshold": threshold} + self.outputs = {'Out': convert_float_to_uint16(out)} + + def init_dtype(self): + self.dtype = np.uint16 + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['X'], 'Out', numeric_grad_delta=0.05) + + class TestSoftplusAPI(unittest.TestCase): # test paddle.nn.Softplus, paddle.nn.functional.softplus def setUp(self): -- GitLab From 71cb016cad456c6a314885ee0087dac05db03dbe Mon Sep 17 00:00:00 2001 From: zhaoyingli <86812880+zhaoyinglia@users.noreply.github.com> Date: Mon, 7 Mar 2022 11:43:47 +0800 Subject: [PATCH 078/261] [AutoParallel]engine support pp (#40084) * engine support pp * fix format * avoid multi print * fix convert * bug fix * add pp unittest --- .../distributed/auto_parallel/engine.py | 24 +++- .../paddle/distributed/auto_parallel/utils.py | 9 +- python/paddle/distributed/utils.py | 16 ++- .../unittests/auto_parallel/CMakeLists.txt | 3 +- .../unittests/auto_parallel/engine_api.py | 132 ++++++++++++++++++ .../auto_parallel/test_engine_api.py | 127 +++-------------- 6 files changed, 186 insertions(+), 125 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index 8efb9eb7192..56beb895741 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -99,11 +99,11 @@ class Engine: all_ranks = world_process_group.ranks for rank in all_ranks: self._parallel(rank) - place = _get_device() - if isinstance(place, fluid.CUDAPlace): + self._place = _get_device() + if isinstance(self._place, fluid.CUDAPlace): self._place = fluid.CUDAPlace(ParallelEnv().dev_id) if self._executor is None: - self._executor = fluid.Executor(place) + self._executor = paddle.static.Executor(self._place) def _build(self): serial_main_prog = self._serial_main_progs.get(self.mode, None) @@ -119,12 +119,13 @@ class Engine: labels = [s._create_feed_layer() for s in to_list(labels_spec)] self._input_vars = inputs self._label_vars = labels - feed_list = self._input_vars + self._label_vars + self._feed_vars = self._input_vars + 
self._label_vars outputs = to_list(self.model(*inputs)) if self.mode != "predict" and self.loss: loss = self.loss(*(outputs + labels)) self._loss_var = loss + self._fetch_vars = {"outputs": outputs, "loss": loss} self._serial_main_progs[self.mode] = serial_main_prog self._serial_startup_progs[self.mode] = serial_startup_prog self._dist_contexts[self.mode] = DistributedContext( @@ -278,19 +279,32 @@ class Engine: dist_startup_prog = self._dist_startup_progs[self.mode][self._cur_rank] dist_context = self._dist_contexts[self.mode] dist_main_block = dist_main_prog.global_block() + serial_main_prog = self._serial_main_progs[self.mode] + serial_main_block = serial_main_prog.global_block() op_size = len(dist_main_block.ops) places = paddle.static.cuda_places() with fluid.program_guard(dist_main_prog, dist_startup_prog): dataloader = NonIterableGeneratorLoader( dataset, feed_list, places, batch_size, epochs, steps_per_epoch) new_op_size = len(dist_main_block.ops) - for idx in range(new_op_size - 1, op_size - 1, -1): + for _ in range(new_op_size - 1, op_size - 1, -1): op = dist_main_block.ops[new_op_size - 1] new_op_desc = dist_main_block.desc._prepend_op() new_op_desc.copy_from(op.desc) new_op = Operator( dist_main_block, new_op_desc, type=new_op_desc.type()) dist_main_block.ops.insert(0, new_op) + for in_name in new_op.input_arg_names: + if in_name == "lod_tensor_blocking_queue_0": + continue + if in_name not in dist_main_block.vars: + in_var = serial_main_block._var_recursive(in_name) + dist_main_block._clone_variable(in_var, in_var.persistable) + for out_name in new_op.output_arg_names: + if out_name not in dist_main_block.vars: + out_var = serial_main_block._var_recursive(out_name) + dist_main_block._clone_variable(out_var, + out_var.persistable) dist_op = DistributedOperator(new_op) dist_context.add_dist_op_for_program(dist_op) for _ in range(new_op_size - op_size): diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py index 75e0ae251ef..241eadcbace 100644 --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -22,7 +22,6 @@ import logging from functools import reduce import paddle.fluid.core as core -from paddle.framework.io import _to_LodTensor from paddle.distributed.fleet.meta_optimizers.common import OpRole from paddle.fluid.io import is_parameter, is_belong_to_optimizer from paddle.distributed.auto_parallel.dist_attribute import TensorDistributedAttribute, OperatorDistributedAttribute @@ -739,7 +738,7 @@ def merge_and_slice_parameter(dist_param_dict, pre_dist_attr, cur_dist_attr): rank_id = paddle.distributed.get_rank() index = cur_attr["process_group"].index(rank_id) param = dist_param_dict[var_name][index] - dist_param_dict[var_name] = _to_LodTensor(param) + dist_param_dict[var_name] = param continue pre_param = dist_param_dict[var_name] @@ -751,7 +750,7 @@ def merge_and_slice_parameter(dist_param_dict, pre_dist_attr, cur_dist_attr): dist_param_dict[var_name] = complete_param else: complete_param = pre_param[0] - dist_param_dict[var_name] = _to_LodTensor(complete_param) + dist_param_dict[var_name] = complete_param if len(set(cur_dims_mapping)) > 1 or -1 not in cur_dims_mapping: sliced_param = _slice_parameter_with_dist_attr(complete_param, @@ -798,7 +797,7 @@ def _merge_parameter_with_dist_attr(param_list, dist_attr): assert len(partition_param_list) == 1 or not partition_param_list, \ "Fail to merge parameter" - complete_param = 
_to_LodTensor(partition_param_list[0][0]) + complete_param = partition_param_list[0][0] return complete_param @@ -818,7 +817,7 @@ def _slice_parameter_with_dist_attr(param, dist_attr): rank_id = paddle.distributed.get_rank() sliced_param_index = _get_sliced_param_index( rank_id, param.shape, dims_mapping, process_shape, process_group) - sliced_param = _to_LodTensor(sliced_param_list[sliced_param_index]) + sliced_param = sliced_param_list[sliced_param_index] return sliced_param diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py index 53f4a93f648..ae40a42e9d5 100644 --- a/python/paddle/distributed/utils.py +++ b/python/paddle/distributed/utils.py @@ -546,13 +546,15 @@ class Pod(object): def get_logger(log_level, name="root"): logger = logging.getLogger(name) - logger.setLevel(log_level) - - log_handler = logging.StreamHandler() - log_format = logging.Formatter( - '%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s') - log_handler.setFormatter(log_format) - logger.addHandler(log_handler) + # Avoid printing multiple logs + if not logger.handlers: + logger.setLevel(log_level) + + log_handler = logging.StreamHandler() + log_format = logging.Formatter( + '%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s') + log_handler.setFormatter(log_format) + logger.addHandler(log_handler) return logger diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 0a9eaf34ba5..80bc206ae7b 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -5,7 +5,8 @@ if(WITH_DISTRIBUTE AND WITH_GPU) set_tests_properties(test_auto_parallel_relaunch PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 120) py_test_modules(test_relaunch_with_planner MODULES test_relaunch_with_planner ENVS ${dist_ENVS}) set_tests_properties(test_relaunch_with_planner PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 120) - py_test_modules(test_relaunch_with_gpt_planner MODULES test_relaunch_with_planner ENVS ${dist_ENVS}) + py_test_modules(test_relaunch_with_gpt_planner MODULES test_relaunch_with_gpt_planner ENVS ${dist_ENVS}) set_tests_properties(test_relaunch_with_gpt_planner PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 240) py_test_modules(test_engine_api MODULES test_engine_api ENVS ${dist_ENVS}) + set_tests_properties(test_engine_api PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 80) endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py new file mode 100644 index 00000000000..8c71c792bf0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py @@ -0,0 +1,132 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
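The get_logger() change above makes handler setup idempotent: a StreamHandler is only attached when logger.handlers is empty, so calling the helper repeatedly no longer prints every record multiple times. A minimal standalone sketch of the same pattern, using only the standard logging module (not the Paddle code itself):

import logging

def get_logger(log_level, name="root"):
    # Configure the named logger only on the first call; later calls reuse
    # the already-attached handler instead of stacking duplicates.
    logger = logging.getLogger(name)
    if not logger.handlers:
        logger.setLevel(log_level)
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter(
            '%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s'))
        logger.addHandler(handler)
    return logger

log = get_logger(logging.INFO)
log = get_logger(logging.INFO)   # second call attaches nothing new
log.info("emitted exactly once")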
+ +import unittest +import time +import paddle.fluid as fluid +import copy +import os +import numpy as np +import subprocess +import paddle +import paddle.nn as nn +import paddle.fluid as fluid +import paddle.static as static +import paddle.nn.functional as F +import paddle.utils as utils +from paddle.fluid import layers +from paddle.io import Dataset, IterableDataset, DataLoader +from paddle.static import InputSpec +from paddle.distributed import fleet +import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.engine import Engine + +paddle.enable_static() +global_process_mesh = auto.ProcessMesh(mesh=[0, 1]) +PP_MESH_0 = auto.ProcessMesh([0]) +PP_MESH_1 = auto.ProcessMesh([1]) +batch_size = 1 +batch_num = 10 +hidden_size = 1024 +sequence_len = 512 +image_size = hidden_size +class_num = 10 + +paddle.seed(44) + + +class MyDataset(Dataset): + def __init__(self, num_samples): + super(MyDataset, self).__init__() + self.num_samples = num_samples + + def __getitem__(self, index): + input = np.random.uniform(size=image_size).astype("float32") + label = np.random.randint(0, class_num - 1, dtype="int64") + return input, label + + def __len__(self): + return self.num_samples + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + dropout_ratio=0.1, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range)) + bias_attr = None + + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr, bias_attr=bias_attr) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) + self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=bias_attr) + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train") + + def forward(self, input): + out = auto.shard_op( + self.norm, dist_attr={"process_mesh": PP_MESH_0})(input)[0] + out = self.linear0(input) + out = F.gelu(out, approximate=True) + out = auto.shard_op( + self.linear1, dist_attr={"process_mesh": PP_MESH_1})(out)[0] + out = self.dropout(out) + out = self.linear2(out) + return out + + +def train(): + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + loss = paddle.nn.CrossEntropyLoss() + optimizer = paddle.fluid.optimizer.AdamOptimizer( + learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None) + + dataset = MyDataset(batch_num * batch_size) + data_spec = [ + InputSpec([batch_size, hidden_size], 'float32', 'x'), + InputSpec([batch_size], 'int64', 'label') + ] + + dist_strategy = fleet.DistributedStrategy() + dist_strategy.amp = False + dist_strategy.pipeline = False + dist_strategy.recompute = False + # init parallel optimizer + dist_strategy.semi_auto = True + fleet.init(is_collective=True, strategy=dist_strategy) + + engine = Engine(mlp, data_spec, strategy=dist_strategy) + engine.prepare(optimizer, loss) + engine.fit(dataset, + batch_size=batch_size, + steps_per_epoch=batch_num * batch_size) + + +if __name__ == "__main__": + train() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py index 0fc1ea41033..a7d51a7e176 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py +++ 
b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,122 +13,35 @@ # limitations under the License. import unittest -import time -import paddle.fluid as fluid -import copy import os -import numpy as np +import sys +import shutil import subprocess -import paddle -import paddle.nn as nn -import paddle.fluid as fluid -import paddle.static as static -import paddle.nn.functional as F -import paddle.utils as utils -from paddle.fluid import layers -from paddle.io import Dataset, IterableDataset, DataLoader -from paddle.static import InputSpec -from paddle.distributed import fleet -import paddle.distributed.auto_parallel as auto -from paddle.distributed.auto_parallel.engine import Engine - -paddle.enable_static() -global_process_mesh = auto.ProcessMesh(mesh=[0]) -batch_size = 1 -batch_num = 10 -hidden_size = 1024 -sequence_len = 512 -image_size = hidden_size -class_num = 10 - -paddle.seed(44) - - -class MyDataset(Dataset): - def __init__(self, num_samples): - super(MyDataset, self).__init__() - self.num_samples = num_samples - - def __getitem__(self, index): - input = np.random.uniform(size=image_size).astype("float32") - label = np.random.randint(0, class_num - 1, dtype="int64") - return input, label - - def __len__(self): - return self.num_samples - - -class MLPLayer(nn.Layer): - def __init__(self, - hidden_size=1024, - intermediate_size=4 * 1024, - dropout_ratio=0.1, - initializer_range=0.02): - super(MLPLayer, self).__init__() - d_model = hidden_size - dim_feedforward = intermediate_size - weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( - mean=0.0, std=initializer_range)) - bias_attr = None - - self.linear0 = nn.Linear( - d_model, dim_feedforward, weight_attr, bias_attr=bias_attr) - self.linear1 = nn.Linear( - dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) - self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=bias_attr) - # self.norm = nn.LayerNorm(d_model, epsilon=1e-5) - # self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train") - - def forward(self, input): - auto.shard_tensor( - input, - dist_attr={ - "process_mesh": global_process_mesh, - "dims_mappig": [-1] - }) - # out = self.norm(input) - out = self.linear0(input) - out = F.gelu(out, approximate=True) - out = self.linear1(out) - # out = self.dropout(out) - out = self.linear2(out) - return out +from paddle.distributed.fleet.launch_utils import run_with_coverage class TestEngineAPI(unittest.TestCase): def test_engine_api(self): - mlp = MLPLayer( - hidden_size=hidden_size, - intermediate_size=4 * hidden_size, - dropout_ratio=0.1, - initializer_range=0.02) - loss = paddle.nn.CrossEntropyLoss() - optimizer = paddle.fluid.optimizer.AdamOptimizer( - learning_rate=0.00001, - beta1=0.9, - beta2=0.999, - epsilon=1e-08, - grad_clip=None) + file_dir = os.path.dirname(os.path.abspath(__file__)) + launch_model_path = os.path.join(file_dir, "engine_api.py") + + if os.environ.get("WITH_COVERAGE", "OFF") == "ON": + coverage_args = ["-m", "coverage", "run", "--branch", "-p"] + else: + coverage_args = [] - dataset = MyDataset(batch_num * batch_size) - data_spec = [ - InputSpec([batch_size, hidden_size], 'float32', 'x'), - InputSpec([batch_size], 'int64', 'label') + cmd = [sys.executable, "-u"] + 
coverage_args + [ + "-m", "launch", "--gpus", "0,1", launch_model_path ] - dist_strategy = fleet.DistributedStrategy() - dist_strategy.amp = False - dist_strategy.pipeline = False - dist_strategy.recompute = False - # init parallel optimizer - dist_strategy.semi_auto = True - fleet.init(is_collective=True, strategy=dist_strategy) + process = subprocess.Popen(cmd) + process.wait() + self.assertEqual(process.returncode, 0) - engine = Engine(mlp, data_spec, strategy=dist_strategy) - engine.prepare(optimizer, loss) - engine.fit(dataset, - batch_size=batch_size, - steps_per_epoch=batch_num * batch_size) + # Remove unnecessary files + log_path = os.path.join(file_dir, "log") + if os.path.exists(log_path): + shutil.rmtree(log_path) if __name__ == "__main__": -- GitLab From fd36ede6d89c1d5397e6b351e020ffbbad0ed6a7 Mon Sep 17 00:00:00 2001 From: Liu-xiandong <85323580+Liu-xiandong@users.noreply.github.com> Date: Mon, 7 Mar 2022 12:15:34 +0800 Subject: [PATCH 079/261] [phi] move multi_dot OP (#40038) * [phi] move multi_dot OP * fix the segment bug * fix bug * delete useless comment * fix CI bug --- paddle/fluid/operators/multi_dot_op.cc | 397 --------------- .../phi/kernels/cpu/multi_dot_grad_kernel.cc | 22 + paddle/phi/kernels/cpu/multi_dot_kernel.cc | 22 + .../phi/kernels/gpu/multi_dot_grad_kernel.cu | 30 ++ paddle/phi/kernels/gpu/multi_dot_kernel.cu | 25 + .../phi/kernels/impl/multi_dot_kernel_impl.h | 456 ++++++++++++++++++ paddle/phi/kernels/multi_dot_grad_kernel.h | 27 ++ paddle/phi/kernels/multi_dot_kernel.h | 26 + paddle/phi/ops/compat/multi_dot_sig.cc | 27 ++ 9 files changed, 635 insertions(+), 397 deletions(-) create mode 100644 paddle/phi/kernels/cpu/multi_dot_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/multi_dot_kernel.cc create mode 100644 paddle/phi/kernels/gpu/multi_dot_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/multi_dot_kernel.cu create mode 100644 paddle/phi/kernels/impl/multi_dot_kernel_impl.h create mode 100644 paddle/phi/kernels/multi_dot_grad_kernel.h create mode 100644 paddle/phi/kernels/multi_dot_kernel.h create mode 100644 paddle/phi/ops/compat/multi_dot_sig.cc diff --git a/paddle/fluid/operators/multi_dot_op.cc b/paddle/fluid/operators/multi_dot_op.cc index fe4609b3ad9..b309e1b87ef 100644 --- a/paddle/fluid/operators/multi_dot_op.cc +++ b/paddle/fluid/operators/multi_dot_op.cc @@ -87,135 +87,6 @@ inline framework::DDim ComputeAndCheckShape( return out_dim; } -template -inline framework::Tensor MatMul(const framework::ExecutionContext& ctx, - const framework::Tensor& matrix_a, - const framework::Tensor& matrix_b, - const framework::DDim& a_dim, - const framework::DDim& b_dim) { - auto place = ctx.GetPlace(); - auto blas = phi::funcs::GetBlas(ctx); - - framework::Tensor matrix_c; - framework::DDim c_dim = phi::make_ddim({a_dim[0], b_dim[1]}); - matrix_c.Resize(c_dim); - matrix_c.mutable_data(place); - - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a_dim, 0, false); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b_dim, 0, false); - const T alpha = static_cast(1.0); - blas.MatMul(matrix_a, mat_dim_a, matrix_b, mat_dim_b, alpha, &matrix_c, T(0)); - return matrix_c; -} - -/** - * @brief Recursively calculate matrix multiplication according to the optimal - * order - * Let k = order[i,j], then ins[i...j] = ins[i...k] * ins[k+1 ...j] - * - * @param - * ins: the input tensors - * ins_dims: the shape of ins after reshape - * order: the optimal order - * i: the left of sub chain - * j: the righe of sub chain - * save_result: set true by 
backward - * results: save the intermediate result during backward - */ -template -inline framework::Tensor MatChainMul( - const framework::ExecutionContext& ctx, - const std::vector& ins, - const std::vector& ins_dims, - const std::vector& order, const uint64_t i, const uint64_t j, - const bool save_result, std::vector* results) { - if (i == j) { - return *ins[i]; - } - - const auto A = MatChainMul(ctx, ins, ins_dims, order, i, - order[i * ins.size() + j], - save_result, results); - framework::DDim a_dim = A.dims(); - if (i == order[i * ins.size() + j]) { - a_dim = ins_dims[i]; - } - - const auto B = MatChainMul(ctx, ins, ins_dims, order, - order[i * ins.size() + j] + 1, j, - save_result, results); - framework::DDim b_dim = B.dims(); - if (j == order[i * ins.size() + j] + 1) { - b_dim = ins_dims[j]; - } - - auto result = MatMul(ctx, A, B, a_dim, b_dim); - if (save_result) { - (*results)[i * ins.size() + j] = result; - } - return result; -} - -/** - * @brief get the optimal order - */ -std::vector GetOrder(const std::vector& ins, - const std::vector& ins_dims) { - auto n = ins.size(); - // p: save the ins shape, the ins[i] shape is (p[i], p[i+1]) - std::vector p(n + 1); - for (uint64_t i = 0; i < n; i++) { - p[i] = ins_dims[i][0]; - } - p[n] = ins_dims[n - 1][1]; - - // m[i, j]: save the lowest cost for multiplying ins[i...j] - std::vector m(n * n, 0); - // define ins[i...j] means multiplying matrices from ins[i] to ins[j] - // order[i, j] = k, this means that ins[i...k] and ins[k...j] fist and then - // multiply the resulting matrices is the optimal order for ins[i...j] - std::vector order(n * n); - for (uint64_t l = 1; l < n; l++) { - for (uint64_t i = 0; i < n - l; i++) { - auto j = i + l; - m[i * n + j] = 0xffffffff; - for (uint64_t k = i; k < j; k++) { - uint64_t q = - m[i * n + k] + m[(k + 1) * n + j] + p[i] * p[k + 1] * p[j + 1]; - if (q < m[i * n + j]) { - m[i * n + j] = q; - order[i * n + j] = k; - } - } - } - } - return order; -} - -template -static inline framework::Tensor MultiDotMatChainOrder( - const framework::ExecutionContext& ctx, - const std::vector& ins, - const std::vector& ins_dims, const bool save_result, - std::vector* results) { - auto order = GetOrder(ins, ins_dims); - return MatChainMul(ctx, ins, ins_dims, order, 0, - ins.size() - 1, save_result, results); -} - -inline void GetDims(const std::vector& ins, - std::vector* ins_dims) { - const auto n = ins.size(); - for (size_t i = 0; i < n; i++) { - (*ins_dims)[i] = ins[i]->dims(); - if (i == 0 && (*ins_dims)[i].size() == 1) { - (*ins_dims)[i] = phi::make_ddim({1, (*ins_dims)[i][0]}); - } else if (i == n - 1 && (*ins_dims)[i].size() == 1) { - (*ins_dims)[i] = phi::make_ddim({(*ins_dims)[i][0], 1}); - } - } -} - class MultiDotOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -252,78 +123,6 @@ class MultiDotOp : public framework::OperatorWithKernel { } }; -/** - * 1. there are only 2 matrices: direct matrix multiplication A*B - * 2. there are only 3 matrices: calculate the cost of (A*B)*C and A*(B*C), - * choose the least cost order for calculation - * 3. 
more than 3 matrices: call MultiDotMatChainOrder - */ -template -class MultiDotKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto ins = ctx.MultiInput("X"); - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - out->mutable_data(place); - - auto blas = phi::funcs::GetBlas(ctx); - - auto n = ins.size(); - std::vector ins_dims(n); - GetDims(ins, &ins_dims); - - const T scale = static_cast(1.0); - if (n == 2) { - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false); - blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, out, T(0)); - } else if (n == 3) { - const auto Ma = ins_dims[0][0]; - const auto Ka = ins_dims[0][1]; - const auto Nb = ins_dims[1][1]; - const auto Nc = ins_dims[2][1]; - const uint64_t cost1 = Ma * Nb * (Ka + Nc); - const uint64_t cost2 = Ka * Nc * (Nb + Ma); - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false); - auto mat_dim_c = - phi::funcs::CreateMatrixDescriptor(ins_dims[2], 0, false); - if (cost1 < cost2) { - framework::Tensor tmp_out; - tmp_out.mutable_data(place, Ma * Nb * sizeof(T)); - framework::DDim tmp_dim = phi::make_ddim({Ma, Nb}); - blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, &tmp_out, - T(0)); - auto mat_dim_tmp = - phi::funcs::CreateMatrixDescriptor(tmp_dim, 0, false); - blas.MatMul(tmp_out, mat_dim_tmp, *ins[2], mat_dim_c, scale, out, T(0)); - } else { - framework::Tensor tmp_out; - tmp_out.mutable_data(place, Ka * Nc * sizeof(T)); - framework::DDim tmp_dim = phi::make_ddim({Ka, Nc}); - blas.MatMul(*ins[1], mat_dim_b, *ins[2], mat_dim_c, scale, &tmp_out, - T(0)); - auto mat_dim_tmp = - phi::funcs::CreateMatrixDescriptor(tmp_dim, 0, false); - blas.MatMul(*ins[0], mat_dim_a, tmp_out, mat_dim_tmp, scale, out, T(0)); - } - } else { - std::vector results; - const auto tmp = MultiDotMatChainOrder( - ctx, ins, ins_dims, false, &results); - auto out_dim = out->dims(); - *out = tmp; - out->Resize(out_dim); - } - } -}; - class MultiDotOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -341,180 +140,6 @@ class MultiDotOpGrad : public framework::OperatorWithKernel { } }; -template -class MultiDotGradKernel : public framework::OpKernel { - public: - /** - * @brief calculate dA and dB - * dA = dout * transpose(B) - * dB = transpose(A) * dout - */ - void CalcGrad(const framework::ExecutionContext& ctx, - const framework::Tensor& dout, const framework::Tensor& A, - const framework::Tensor& B, const framework::DDim& dout_dim, - const framework::DDim& a_dim, const framework::DDim& b_dim, - framework::Tensor* dA, framework::Tensor* dB) const { - auto mat_dim_dout = phi::funcs::CreateMatrixDescriptor(dout_dim, 0, false); - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a_dim, 0, true); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b_dim, 0, true); - T alpha = static_cast(1.0); - auto blas = phi::funcs::GetBlas(ctx); - blas.MatMul(A, mat_dim_a, dout, mat_dim_dout, alpha, dB, T(0)); - blas.MatMul(dout, mat_dim_dout, B, mat_dim_b, alpha, dA, T(0)); - } - - /** - * @brief calculate multi matrix multiplication grad by a chain order - * @param - * dout: the grad of multi matrix multiplication out - * dx: the out grad of inputs - * ins: the input tensors - * ins_dims: the shape of ins after 
reshape - * order: the optimal order - * i: the left of sub chain - * j: the righe of sub chain - * results: the intermediate result of farward - */ - void MatChainMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor& dout, - std::vector* dx, - const std::vector& ins, - const framework::DDim& dout_dim, - const std::vector& ins_dims, - const std::vector& order, const uint64_t i, - const uint64_t j, - const std::vector& results) const { - if (i == j) { - *((*dx)[i]) = dout; - return; - } - - const auto n = ins.size(); - const auto right = order[i * n + j]; - const auto left = order[i * n + j] + 1; - // get the multi result of left sub chain - const auto* A = &results[i * n + right]; - framework::DDim a_dim = A->dims(); - if (i == right) { - A = ins[i]; - a_dim = ins_dims[i]; - } - // get the multi result of right sub chain - const auto* B = &results[left * n + j]; - framework::DDim b_dim = B->dims(); - if (left == j) { - B = ins[j]; - b_dim = ins_dims[j]; - } - framework::Tensor dA, dB; - dA.Resize({dout_dim[0], b_dim[0]}); - dB.Resize({a_dim[1], dout_dim[1]}); - dA.mutable_data(ctx.GetPlace()); - dB.mutable_data(ctx.GetPlace()); - - CalcGrad(ctx, dout, *A, *B, dout_dim, a_dim, b_dim, &dA, &dB); - MatChainMulGrad(ctx, dA, dx, ins, dA.dims(), ins_dims, order, i, right, - results); - MatChainMulGrad(ctx, dB, dx, ins, dB.dims(), ins_dims, order, left, j, - results); - } - - void MultiDotGradMatChainOrder( - const framework::ExecutionContext& ctx, const framework::Tensor& dout, - const std::vector& ins, - const framework::DDim& dout_dim, - const std::vector& ins_dims, - std::vector* dx) const { - auto order = GetOrder(ins, ins_dims); - auto n = ins.size(); - std::vector results(n * n); - MatChainMul(ctx, ins, ins_dims, order, 0, n - 1, true, - &results); - MatChainMulGrad(ctx, dout, dx, ins, dout_dim, ins_dims, order, 0, n - 1, - results); - } - - void Compute(const framework::ExecutionContext& ctx) const { - auto ins = ctx.MultiInput("X"); - auto dout = *ctx.Input(framework::GradVarName("Out")); - auto dx = ctx.MultiOutput(framework::GradVarName("X")); - - auto blas = phi::funcs::GetBlas(ctx); - auto place = ctx.GetPlace(); - - const auto n = ins.size(); - for (size_t i = 0; i < n; i++) { - dx[i]->mutable_data(place); - } - - std::vector ins_dims(n); - GetDims(ins, &ins_dims); - - framework::DDim dout_dim = dout.dims(); - if (ins[0]->dims().size() == 1 && ins[n - 1]->dims().size() == 1) { - dout_dim = phi::make_ddim({1, 1}); - } else if (ins[0]->dims().size() == 1) { - if (dout_dim.size() == 1) { - dout_dim = phi::make_ddim({1, dout_dim[0]}); - } - } else if (ins[n - 1]->dims().size() == 1) { - if (dout_dim.size() == 1) { - dout_dim = phi::make_ddim({dout_dim[0], 1}); - } - } - - T alpha = static_cast(1); - auto mat_dim_dout = phi::funcs::CreateMatrixDescriptor(dout_dim, 0, false); - if (n == 2) { - CalcGrad(ctx, dout, *ins[0], *ins[1], dout_dim, ins_dims[0], ins_dims[1], - dx[0], dx[1]); - } else if (n == 3) { - const auto Ma = ins_dims[0][0]; - const auto Ka = ins_dims[0][1]; - const auto Nb = ins_dims[1][1]; - const auto Nc = ins_dims[2][1]; - const uint64_t cost1 = Ma * Nb * (Ka + Nc); - const uint64_t cost2 = Ka * Nc * (Nb + Ma); - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false); - auto mat_dim_c = - phi::funcs::CreateMatrixDescriptor(ins_dims[2], 0, false); - if (cost1 < cost2) { - framework::Tensor tmp_out, tmp_dout; - tmp_out.Resize({Ma, Nb}); - 
tmp_out.mutable_data(place); - tmp_dout.Resize({mat_dim_dout.height_, Nb}); - tmp_dout.mutable_data(place); - blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, alpha, &tmp_out, - T(0)); - CalcGrad(ctx, dout, tmp_out, *ins[2], dout_dim, tmp_out.dims(), - ins_dims[2], &tmp_dout, dx[2]); - CalcGrad(ctx, tmp_dout, *ins[0], *ins[1], tmp_dout.dims(), ins_dims[0], - ins_dims[1], dx[0], dx[1]); - } else { - framework::Tensor tmp_out, tmp_dout; - tmp_out.Resize({Ka, Nc}); - tmp_out.mutable_data(place); - tmp_dout.Resize({Ka, mat_dim_dout.width_}); - tmp_dout.mutable_data(place); - blas.MatMul(*ins[1], mat_dim_b, *ins[2], mat_dim_c, alpha, &tmp_out, - T(0)); - CalcGrad(ctx, dout, *ins[0], tmp_out, dout_dim, ins_dims[0], - tmp_dout.dims(), dx[0], &tmp_dout); - CalcGrad(ctx, tmp_dout, *ins[1], *ins[2], tmp_dout.dims(), ins_dims[1], - ins_dims[2], dx[1], dx[2]); - } - } else { - MultiDotGradMatChainOrder(ctx, dout, ins, dout_dim, ins_dims, &dx); - if (ins[n - 1]->dims().size() == 1) { - dx[n - 1]->Resize({dx[n - 1]->dims()[0]}); - } - } - } -}; - template class MultiDotOpGradMaker : public framework::SingleGradOpMaker { public: @@ -552,25 +177,3 @@ REGISTER_OPERATOR(multi_dot, ops::MultiDotOp, ops::MultiDotOpMaker, REGISTER_OPERATOR(multi_dot_grad, ops::MultiDotOpGrad, ops::MultiDotOpDoubleGradMaker, ops::MultiDotOpDoubleGradMaker); - -REGISTER_OP_CPU_KERNEL( - multi_dot, ops::MultiDotKernel, - ops::MultiDotKernel); -REGISTER_OP_CPU_KERNEL( - multi_dot_grad, - ops::MultiDotGradKernel, - ops::MultiDotGradKernel); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -REGISTER_OP_CUDA_KERNEL( - multi_dot, ops::MultiDotKernel, - ops::MultiDotKernel, - ops::MultiDotKernel); -REGISTER_OP_CUDA_KERNEL( - multi_dot_grad, - ops::MultiDotGradKernel, - ops::MultiDotGradKernel, - ops::MultiDotGradKernel); -#endif diff --git a/paddle/phi/kernels/cpu/multi_dot_grad_kernel.cc b/paddle/phi/kernels/cpu/multi_dot_grad_kernel.cc new file mode 100644 index 00000000000..2cd75404be8 --- /dev/null +++ b/paddle/phi/kernels/cpu/multi_dot_grad_kernel.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/multi_dot_grad_kernel.h" +#include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + multi_dot_grad, CPU, ALL_LAYOUT, phi::MultiDotGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/multi_dot_kernel.cc b/paddle/phi/kernels/cpu/multi_dot_kernel.cc new file mode 100644 index 00000000000..a4249a98e46 --- /dev/null +++ b/paddle/phi/kernels/cpu/multi_dot_kernel.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
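The chain-order code being moved into phi here is the textbook matrix-chain-multiplication dynamic program: m[i][j] stores the cheapest flop count for multiplying ins[i..j] and order[i][j] records the split point k that achieves it, via m[i][j] = min_k m[i][k] + m[k+1][j] + p[i]*p[k+1]*p[j+1]. A small self-contained sketch of the same recurrence (NumPy, illustrative helper names, not the Paddle implementation):

import numpy as np

def matmul_chain_order(shapes):
    # shapes[i] == (rows, cols) of matrix i; returns (cost, order) tables
    # following the same recurrence as GetOrder.
    n = len(shapes)
    p = [shapes[0][0]] + [s[1] for s in shapes]
    m = [[0] * n for _ in range(n)]
    order = [[0] * n for _ in range(n)]
    for l in range(1, n):
        for i in range(n - l):
            j = i + l
            m[i][j] = float("inf")
            for k in range(i, j):
                q = m[i][k] + m[k + 1][j] + p[i] * p[k + 1] * p[j + 1]
                if q < m[i][j]:
                    m[i][j], order[i][j] = q, k
    return m, order

def multiply_by_order(mats, order, i, j):
    # Recursively multiply mats[i..j] at the recorded optimal split points.
    if i == j:
        return mats[i]
    k = order[i][j]
    return multiply_by_order(mats, order, i, k) @ multiply_by_order(mats, order, k + 1, j)

mats = [np.ones((10, 100)), np.ones((100, 5)), np.ones((5, 50))]
m, order = matmul_chain_order([a.shape for a in mats])
out = multiply_by_order(mats, order, 0, len(mats) - 1)
assert out.shape == (10, 50)   # (A @ B) @ C is chosen here: 7500 vs 75000 flops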
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/multi_dot_kernel.h" +#include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + multi_dot, CPU, ALL_LAYOUT, phi::MultiDotKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/multi_dot_grad_kernel.cu b/paddle/phi/kernels/gpu/multi_dot_grad_kernel.cu new file mode 100644 index 00000000000..6761d945e95 --- /dev/null +++ b/paddle/phi/kernels/gpu/multi_dot_grad_kernel.cu @@ -0,0 +1,30 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h" +#include "paddle/phi/kernels/multi_dot_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" + +using float16 = phi::dtype::float16; + +PD_REGISTER_KERNEL(multi_dot_grad, + GPU, + ALL_LAYOUT, + phi::MultiDotGradKernel, + float, + double, + float16) {} diff --git a/paddle/phi/kernels/gpu/multi_dot_kernel.cu b/paddle/phi/kernels/gpu/multi_dot_kernel.cu new file mode 100644 index 00000000000..60b1fce5ddd --- /dev/null +++ b/paddle/phi/kernels/gpu/multi_dot_kernel.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h" +#include "paddle/phi/kernels/multi_dot_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" + +using float16 = phi::dtype::float16; + +PD_REGISTER_KERNEL( + multi_dot, GPU, ALL_LAYOUT, phi::MultiDotKernel, float, double, float16) {} diff --git a/paddle/phi/kernels/impl/multi_dot_kernel_impl.h b/paddle/phi/kernels/impl/multi_dot_kernel_impl.h new file mode 100644 index 00000000000..0833e94fe2c --- /dev/null +++ b/paddle/phi/kernels/impl/multi_dot_kernel_impl.h @@ -0,0 +1,456 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
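Both the fluid kernel removed above and the phi MultiDotKernel added in multi_dot_kernel_impl.h special-case three matrices: instead of running the full DP they compare cost1 = Ma*Nb*(Ka + Nc) for (A*B)*C against cost2 = Ka*Nc*(Nb + Ma) for A*(B*C) and pick the cheaper association. A quick NumPy illustration of that comparison (illustrative only):

import numpy as np

A = np.ones((10, 100))   # Ma x Ka
B = np.ones((100, 5))    # Ka x Nb
C = np.ones((5, 50))     # Nb x Nc
Ma, Ka = A.shape
Nb, Nc = B.shape[1], C.shape[1]

cost1 = Ma * Nb * (Ka + Nc)   # flops for (A @ B) @ C
cost2 = Ka * Nc * (Nb + Ma)   # flops for A @ (B @ C)
print(cost1, cost2)           # 7500 75000 -> (A @ B) @ C wins

# Either association yields the same product; only the flop count differs.
assert np.allclose((A @ B) @ C, A @ (B @ C))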
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" + +namespace phi { + +template +inline DenseTensor MatMul(const Context& ctx, + const DenseTensor& matrix_a, + const DenseTensor& matrix_b, + const phi::DDim& a_dim, + const phi::DDim& b_dim) { + auto blas = phi::funcs::GetBlas(ctx); + + DenseTensor matrix_c; + phi::DDim c_dim = phi::make_ddim({a_dim[0], b_dim[1]}); + matrix_c.Resize(c_dim); + ctx.template Alloc(&matrix_c); + + auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a_dim, 0, false); + auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b_dim, 0, false); + const T alpha = static_cast(1.0); + blas.MatMul(matrix_a.data(), + mat_dim_a, + matrix_b.data(), + mat_dim_b, + alpha, + matrix_c.data(), + T(0)); + return matrix_c; +} + +/** + * @brief Recursively calculate matrix multiplication according to the optimal + * order + * Let k = order[i,j], then ins[i...j] = ins[i...k] * ins[k+1 ...j] + * + * @param + * ins: the input tensors + * ins_dims: the shape of ins after reshape + * order: the optimal order + * i: the left of sub chain + * j: the righe of sub chain + * save_result: set true by backward + * results: save the intermediate result during backward + */ +template +inline DenseTensor MatChainMul(const Context& ctx, + const std::vector& ins, + const std::vector& ins_dims, + const std::vector& order, + const uint64_t i, + const uint64_t j, + const bool save_result, + std::vector* results) { + if (i == j) { + return *ins[i]; + } + + const auto A = MatChainMul(ctx, + ins, + ins_dims, + order, + i, + order[i * ins.size() + j], + save_result, + results); + phi::DDim a_dim = A.dims(); + if (i == order[i * ins.size() + j]) { + a_dim = ins_dims[i]; + } + + const auto B = MatChainMul(ctx, + ins, + ins_dims, + order, + order[i * ins.size() + j] + 1, + j, + save_result, + results); + phi::DDim b_dim = B.dims(); + if (j == order[i * ins.size() + j] + 1) { + b_dim = ins_dims[j]; + } + + auto result = MatMul(ctx, A, B, a_dim, b_dim); + if (save_result) { + (*results)[i * ins.size() + j] = result; + } + return result; +} + +/** + * @brief get the optimal order + */ +template +std::vector GetOrder(const std::vector& ins, + const std::vector& ins_dims) { + auto n = ins.size(); + // p: save the ins shape, the ins[i] shape is (p[i], p[i+1]) + std::vector p(n + 1); + for (uint64_t i = 0; i 
< n; i++) { + p[i] = ins_dims[i][0]; + } + p[n] = ins_dims[n - 1][1]; + + // m[i, j]: save the lowest cost for multiplying ins[i...j] + std::vector m(n * n, 0); + // define ins[i...j] means multiplying matrices from ins[i] to ins[j] + // order[i, j] = k, this means that ins[i...k] and ins[k...j] fist and then + // multiply the resulting matrices is the optimal order for ins[i...j] + std::vector order(n * n); + for (uint64_t l = 1; l < n; l++) { + for (uint64_t i = 0; i < n - l; i++) { + auto j = i + l; + m[i * n + j] = 0xffffffff; + for (uint64_t k = i; k < j; k++) { + uint64_t q = + m[i * n + k] + m[(k + 1) * n + j] + p[i] * p[k + 1] * p[j + 1]; + if (q < m[i * n + j]) { + m[i * n + j] = q; + order[i * n + j] = k; + } + } + } + } + return order; +} + +template +static inline DenseTensor MultiDotMatChainOrder( + const Context& ctx, + const std::vector& ins, + const std::vector& ins_dims, + const bool save_result, + std::vector* results) { + auto order = GetOrder(ins, ins_dims); + return MatChainMul( + ctx, ins, ins_dims, order, 0, ins.size() - 1, save_result, results); +} + +template +inline void GetDims(const std::vector& ins, + std::vector* ins_dims) { + const auto n = ins.size(); + for (size_t i = 0; i < n; i++) { + (*ins_dims)[i] = ins[i]->dims(); + if (i == 0 && (*ins_dims)[i].size() == 1) { + (*ins_dims)[i] = phi::make_ddim({1, (*ins_dims)[i][0]}); + } else if (i == n - 1 && (*ins_dims)[i].size() == 1) { + (*ins_dims)[i] = phi::make_ddim({(*ins_dims)[i][0], 1}); + } + } +} + +template +void MultiDotKernel(const Context& ctx, + const std::vector& x, + DenseTensor* out) { + auto ins = x; + ctx.template Alloc(out); + + auto blas = phi::funcs::GetBlas(ctx); + + auto n = ins.size(); + std::vector ins_dims(n); + GetDims(ins, &ins_dims); + + const T scale = static_cast(1.0); + if (n == 2) { + auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false); + auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false); + blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, out, T(0)); + } else if (n == 3) { + const auto Ma = ins_dims[0][0]; + const auto Ka = ins_dims[0][1]; + const auto Nb = ins_dims[1][1]; + const auto Nc = ins_dims[2][1]; + const uint64_t cost1 = Ma * Nb * (Ka + Nc); + const uint64_t cost2 = Ka * Nc * (Nb + Ma); + auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false); + auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false); + auto mat_dim_c = phi::funcs::CreateMatrixDescriptor(ins_dims[2], 0, false); + if (cost1 < cost2) { + DenseTensor tmp_out; + phi::DDim tmp_dim = phi::make_ddim({Ma, Nb}); + tmp_out.Resize(tmp_dim); + ctx.template Alloc(&tmp_out); + blas.MatMul( + *ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, &tmp_out, T(0)); + auto mat_dim_tmp = phi::funcs::CreateMatrixDescriptor(tmp_dim, 0, false); + blas.MatMul(tmp_out, mat_dim_tmp, *ins[2], mat_dim_c, scale, out, T(0)); + } else { + DenseTensor tmp_out; + phi::DDim tmp_dim = phi::make_ddim({Ka, Nc}); + tmp_out.Resize(tmp_dim); + ctx.template Alloc(&tmp_out); + std::cout << tmp_out << std::endl; + blas.MatMul( + *ins[1], mat_dim_b, *ins[2], mat_dim_c, scale, &tmp_out, T(0)); + auto mat_dim_tmp = phi::funcs::CreateMatrixDescriptor(tmp_dim, 0, false); + blas.MatMul(*ins[0], mat_dim_a, tmp_out, mat_dim_tmp, scale, out, T(0)); + } + } else { + std::vector results; + const auto tmp = + MultiDotMatChainOrder(ctx, ins, ins_dims, false, &results); + auto out_dim = out->dims(); + *out = tmp; + out->Resize(out_dim); + } +} + +/** + * @brief calculate dA 
and dB + * dA = dout * transpose(B) + * dB = transpose(A) * dout + */ +template +void CalcGrad(const Context& ctx, + const DenseTensor& dout, + const DenseTensor& A, + const DenseTensor& B, + const phi::DDim& dout_dim, + const phi::DDim& a_dim, + const phi::DDim& b_dim, + DenseTensor* dA, + DenseTensor* dB) { + auto mat_dim_dout = phi::funcs::CreateMatrixDescriptor(dout_dim, 0, false); + auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a_dim, 0, true); + auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b_dim, 0, true); + T alpha = static_cast(1.0); + auto blas = phi::funcs::GetBlas(ctx); + blas.MatMul(A, mat_dim_a, dout, mat_dim_dout, alpha, dB, T(0)); + blas.MatMul(dout, mat_dim_dout, B, mat_dim_b, alpha, dA, T(0)); +} + +/** + * @brief calculate multi matrix multiplication grad by a chain order + * @param + * dout: the grad of multi matrix multiplication out + * dx: the out grad of inputs + * ins: the input tensors + * ins_dims: the shape of ins after reshape + * order: the optimal order + * i: the left of sub chain + * j: the righe of sub chain + * results: the intermediate result of farward + */ +template +void MatChainMulGrad(const Context& ctx, + const DenseTensor& dout, + std::vector* dx, + const std::vector& ins, + const phi::DDim& dout_dim, + const std::vector& ins_dims, + const std::vector& order, + const uint64_t i, + const uint64_t j, + const std::vector& results) { + if (i == j) { + *((*dx)[i]) = dout; + return; + } + + const auto n = ins.size(); + const auto right = order[i * n + j]; + const auto left = order[i * n + j] + 1; + // get the multi result of left sub chain + const auto* A = &results[i * n + right]; + phi::DDim a_dim = A->dims(); + if (i == right) { + A = ins[i]; + a_dim = ins_dims[i]; + } + // get the multi result of right sub chain + const auto* B = &results[left * n + j]; + phi::DDim b_dim = B->dims(); + if (left == j) { + B = ins[j]; + b_dim = ins_dims[j]; + } + DenseTensor dA, dB; + dA.Resize({dout_dim[0], b_dim[0]}); + dB.Resize({a_dim[1], dout_dim[1]}); + ctx.template Alloc(&dA); + ctx.template Alloc(&dB); + + CalcGrad(ctx, dout, *A, *B, dout_dim, a_dim, b_dim, &dA, &dB); + MatChainMulGrad( + ctx, dA, dx, ins, dA.dims(), ins_dims, order, i, right, results); + MatChainMulGrad( + ctx, dB, dx, ins, dB.dims(), ins_dims, order, left, j, results); +} + +template +void MultiDotGradMatChainOrder(const Context& ctx, + const DenseTensor& dout, + const std::vector& ins, + const phi::DDim& dout_dim, + const std::vector& ins_dims, + std::vector* dx) { + auto order = GetOrder(ins, ins_dims); + auto n = ins.size(); + std::vector results(n * n); + MatChainMul(ctx, ins, ins_dims, order, 0, n - 1, true, &results); + MatChainMulGrad( + ctx, dout, dx, ins, dout_dim, ins_dims, order, 0, n - 1, results); +} + +template +void MultiDotGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const std::vector& x, + std::vector x_grad) { + auto ins = x; + auto dout = out_grad; + auto dx = x_grad; + + auto blas = phi::funcs::GetBlas(ctx); + + const auto n = ins.size(); + for (size_t i = 0; i < n; i++) { + ctx.template Alloc(dx[i]); + } + + std::vector ins_dims(n); + GetDims(ins, &ins_dims); + + phi::DDim dout_dim = dout.dims(); + if (ins[0]->dims().size() == 1 && ins[n - 1]->dims().size() == 1) { + dout_dim = phi::make_ddim({1, 1}); + } else if (ins[0]->dims().size() == 1) { + if (dout_dim.size() == 1) { + dout_dim = phi::make_ddim({1, dout_dim[0]}); + } + } else if (ins[n - 1]->dims().size() == 1) { + if (dout_dim.size() == 1) { + dout_dim = phi::make_ddim({dout_dim[0], 
1}); + } + } + + T alpha = static_cast(1); + auto mat_dim_dout = phi::funcs::CreateMatrixDescriptor(dout_dim, 0, false); + if (n == 2) { + CalcGrad(ctx, + dout, + *ins[0], + *ins[1], + dout_dim, + ins_dims[0], + ins_dims[1], + dx[0], + dx[1]); + } else if (n == 3) { + const auto Ma = ins_dims[0][0]; + const auto Ka = ins_dims[0][1]; + const auto Nb = ins_dims[1][1]; + const auto Nc = ins_dims[2][1]; + const uint64_t cost1 = Ma * Nb * (Ka + Nc); + const uint64_t cost2 = Ka * Nc * (Nb + Ma); + auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false); + auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false); + auto mat_dim_c = phi::funcs::CreateMatrixDescriptor(ins_dims[2], 0, false); + if (cost1 < cost2) { + DenseTensor tmp_out, tmp_dout; + tmp_out.Resize({Ma, Nb}); + ctx.template Alloc(&tmp_out); + tmp_dout.Resize({mat_dim_dout.height_, Nb}); + ctx.template Alloc(&tmp_dout); + blas.MatMul( + *ins[0], mat_dim_a, *ins[1], mat_dim_b, alpha, &tmp_out, T(0)); + CalcGrad(ctx, + dout, + tmp_out, + *ins[2], + dout_dim, + tmp_out.dims(), + ins_dims[2], + &tmp_dout, + dx[2]); + CalcGrad(ctx, + tmp_dout, + *ins[0], + *ins[1], + tmp_dout.dims(), + ins_dims[0], + ins_dims[1], + dx[0], + dx[1]); + } else { + DenseTensor tmp_out, tmp_dout; + tmp_out.Resize({Ka, Nc}); + ctx.template Alloc(&tmp_out); + tmp_dout.Resize({Ka, mat_dim_dout.width_}); + ctx.template Alloc(&tmp_dout); + blas.MatMul( + *ins[1], mat_dim_b, *ins[2], mat_dim_c, alpha, &tmp_out, T(0)); + CalcGrad(ctx, + dout, + *ins[0], + tmp_out, + dout_dim, + ins_dims[0], + tmp_dout.dims(), + dx[0], + &tmp_dout); + CalcGrad(ctx, + tmp_dout, + *ins[1], + *ins[2], + tmp_dout.dims(), + ins_dims[1], + ins_dims[2], + dx[1], + dx[2]); + } + } else { + MultiDotGradMatChainOrder( + ctx, dout, ins, dout_dim, ins_dims, &dx); + if (ins[n - 1]->dims().size() == 1) { + dx[n - 1]->Resize({dx[n - 1]->dims()[0]}); + } + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/multi_dot_grad_kernel.h b/paddle/phi/kernels/multi_dot_grad_kernel.h new file mode 100644 index 00000000000..e6d8ecd744e --- /dev/null +++ b/paddle/phi/kernels/multi_dot_grad_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void MultiDotGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const std::vector& x, + std::vector x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/multi_dot_kernel.h b/paddle/phi/kernels/multi_dot_kernel.h new file mode 100644 index 00000000000..09866e8dde5 --- /dev/null +++ b/paddle/phi/kernels/multi_dot_kernel.h @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
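CalcGrad above is the standard matmul backward for C = A*B: dA = dout * transpose(B) and dB = transpose(A) * dout. A short NumPy sketch that reproduces those formulas and spot-checks one entry against a finite-difference estimate (illustrative, not the phi code):

import numpy as np

rng = np.random.default_rng(0)
A = rng.standard_normal((4, 3))
B = rng.standard_normal((3, 5))
dOut = rng.standard_normal((4, 5))   # upstream gradient of C = A @ B

dA = dOut @ B.T    # dA = dout * transpose(B), as in the CalcGrad comment
dB = A.T @ dOut    # dB = transpose(A) * dout

# Spot-check dA[1, 2] with a finite difference on the scalar sum(C * dOut).
eps = 1e-6
A_pert = A.copy()
A_pert[1, 2] += eps
numeric = ((A_pert @ B - A @ B) * dOut).sum() / eps
assert np.isclose(numeric, dA[1, 2], atol=1e-4)
print(dA.shape, dB.shape)   # (4, 3) (3, 5)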
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void MultiDotKernel(const Context& ctx, + const std::vector& x, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/multi_dot_sig.cc b/paddle/phi/ops/compat/multi_dot_sig.cc new file mode 100644 index 00000000000..598cbd980f3 --- /dev/null +++ b/paddle/phi/ops/compat/multi_dot_sig.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature MultiDotGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "multi_dot_grad", {GradVarName("Out"), "X"}, {}, {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(multi_dot_grad, phi::MultiDotGradOpArgumentMapping); -- GitLab From 6a0d60d27ff44ea425e35afa9b3f4bd884fb6506 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Mon, 7 Mar 2022 13:20:07 +0800 Subject: [PATCH 080/261] [bf16] add bf16 kernel: gaussian_random fill_constant fill_any_like (#40027) * add gaussian random * add full * refine reduce * refine code * refine gaussian_random unittest * add unittest for fill_any_like fill_constant --- paddle/fluid/operators/gaussian_random_op.cu | 3 +- .../phi/kernels/funcs/distribution_helper.h | 9 ++-- paddle/phi/kernels/gpu/full_kernel.cu | 10 ++-- .../phi/kernels/gpu/gaussian_random_kernel.cu | 13 ++++-- .../kernels/primitive/compute_primitives.h | 1 + .../tests/unittests/test_fill_any_like_op.py | 21 ++++++++- .../tests/unittests/test_fill_constant_op.py | 21 +++++++++ .../unittests/test_gaussian_random_op.py | 46 ++++++++++++++++++- 8 files changed, 110 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index 717ec774414..00ce10bfe3b 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -45,7 +45,8 @@ struct GaussianGenerator { thrust::minstd_rand rng; rng.seed(seed_); using MT = typename details::MPTypeTrait::Type; - thrust::normal_distribution dist(mean_, std_); + thrust::normal_distribution dist(static_cast(mean_), + static_cast(std_)); unsigned int new_n = n + offset_; rng.discard(new_n); MT out = dist(rng); diff --git a/paddle/phi/kernels/funcs/distribution_helper.h b/paddle/phi/kernels/funcs/distribution_helper.h index 3ef39dc55d1..acc31d68b78 100644 --- a/paddle/phi/kernels/funcs/distribution_helper.h 
+++ b/paddle/phi/kernels/funcs/distribution_helper.h @@ -23,6 +23,7 @@ limitations under the License. */ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/generator.h" #include "paddle/phi/core/hostdevice.h" @@ -255,11 +256,13 @@ __global__ void DistributionKernel(size_t size, using SType = hiprandStatePhilox4_32_10_t; #endif size_t total_thread = GRID_NUM_X * BLOCK_NUM_X; - T args[kCount]; + using MT = typename phi::dtype::MPTypeTrait::Type; + MT args[kCount]; T result[kCount]; for (size_t i = idx; i < size; i += total_thread * kCount) { - kps::ElementwiseRandom(&args[0], dist, &state); - kps::ElementwiseUnary( + kps::ElementwiseRandom( + &args[0], dist, &state); + kps::ElementwiseUnary( &result[0], &args[0], trans); kps::WriteData( out_data + i, &result[0], size - i, 1, stride, 1); diff --git a/paddle/phi/kernels/gpu/full_kernel.cu b/paddle/phi/kernels/gpu/full_kernel.cu index 1f756bfdbed..a905979f08b 100644 --- a/paddle/phi/kernels/gpu/full_kernel.cu +++ b/paddle/phi/kernels/gpu/full_kernel.cu @@ -63,9 +63,11 @@ void FullLikeKernel(const Context& dev_ctx, auto value = val.to(); using CommonType = typename std::common_type< float, - typename std::conditional::value, - float, - T>::type>::type; + typename std::conditional< + std::is_same::value || + std::is_same::value, + float, + T>::type>::type; auto common_type_value = static_cast(value); @@ -110,6 +112,7 @@ PD_REGISTER_KERNEL(full, int64_t, bool, phi::dtype::float16, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} @@ -123,6 +126,7 @@ PD_REGISTER_KERNEL(full_like, int, int64_t, bool, + phi::dtype::bfloat16, phi::dtype::float16) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } diff --git a/paddle/phi/kernels/gpu/gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/gaussian_random_kernel.cu index da16800ad02..e2fe2190c1c 100644 --- a/paddle/phi/kernels/gpu/gaussian_random_kernel.cu +++ b/paddle/phi/kernels/gpu/gaussian_random_kernel.cu @@ -18,8 +18,8 @@ #include #include #include - #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/distribution_helper.h" @@ -46,8 +46,9 @@ struct GaussianGenerator { __host__ __device__ T operator()(const unsigned int n) const { thrust::minstd_rand rng; rng.seed(seed_); - using MT = typename phi::kps::details::MPTypeTrait::Type; - thrust::normal_distribution dist(mean_, std_); + using MT = typename phi::dtype::MPTypeTrait::Type; + thrust::normal_distribution dist(static_cast(mean_), + static_cast(std_)); unsigned int new_n = n + offset_; rng.discard(new_n); MT out = dist(rng); @@ -83,9 +84,10 @@ void GaussianRandomKernel(const Context& dev_ctx, if (gen_cuda->GetIsInitPy() && seed_flag) { if (FLAGS_use_curand) { - using MT = typename phi::kps::details::MPTypeTrait::Type; + using MT = typename phi::dtype::MPTypeTrait::Type; funcs::normal_distribution dist; - funcs::normal_transform trans(mean, std); + funcs::normal_transform trans(static_cast(mean), + static_cast(std)); funcs::distribution_and_transform(dev_ctx, tensor, dist, trans); } else { auto seed_offset = gen_cuda->IncrementOffset(1); @@ -110,5 +112,6 @@ PD_REGISTER_KERNEL(gaussian_random, ALL_LAYOUT, phi::GaussianRandomKernel, phi::dtype::float16, + phi::dtype::bfloat16, float, double) {} diff 
--git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h index 19427551fb3..632ad00f6d0 100644 --- a/paddle/phi/kernels/primitive/compute_primitives.h +++ b/paddle/phi/kernels/primitive/compute_primitives.h @@ -22,6 +22,7 @@ #endif #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +// #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" namespace phi { diff --git a/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py index 5bc2d1cda18..9be2e57ff0c 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py @@ -21,7 +21,7 @@ from paddle.fluid import Program, program_guard import paddle.compat as cpt import unittest import numpy as np -from op_test import OpTest +from op_test import OpTest, convert_float_to_uint16 class TestFillAnyLikeOp(OpTest): @@ -47,6 +47,25 @@ class TestFillAnyLikeOpFloat32(TestFillAnyLikeOp): self.value = 0.0 +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFillAnyLikeOpBfloat16(OpTest): + def setUp(self): + self.op_type = "fill_any_like" + self.dtype = np.uint16 + self.value = 0.0 + self.inputs = {'X': np.random.random((219, 232)).astype(np.float32)} + self.attrs = {'value': self.value, 'dtype': core.VarDesc.VarType.BF16} + self.outputs = { + 'Out': + convert_float_to_uint16(self.value * np.ones_like(self.inputs["X"])) + } + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + class TestFillAnyLikeOpValue1(TestFillAnyLikeOp): def init(self): self.value = 1.0 diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py index 822c952893e..15071b2b6aa 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py @@ -83,6 +83,27 @@ class TestFillConstantOp4(OpTest): self.check_output() +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFillConstantBF16Op(OpTest): + def setUp(self): + '''Test fill_constant op with specified value + ''' + self.op_type = "fill_constant" + self.dtype = np.uint16 + self.inputs = {} + self.attrs = { + 'shape': [123, 92], + 'value': 3.8, + 'dtype': core.VarDesc.VarType.BF16 + } + self.outputs = {'Out': convert_float_to_uint16(np.full((123, 92), 3.8))} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + class TestFillConstantOpWithSelectedRows(unittest.TestCase): def check_with_place(self, place): scope = core.Scope() diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py index 31caf4bd6be..738441a46d3 100644 --- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py @@ -22,7 +22,7 @@ import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.op import Operator from paddle.fluid.executor import Executor -from paddle.fluid.tests.unittests.op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest, convert_uint16_to_float import paddle @@ -65,6 +65,50 @@ class TestGaussianRandomOp(OpTest): "hist: " + str(hist) + " hist2: " + str(hist2)) +@unittest.skipIf(not 
core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestGaussianRandomBF16Op(OpTest): + def setUp(self): + self.op_type = "gaussian_random" + self.set_attrs() + self.inputs = {} + self.use_mkldnn = False + self.attrs = { + "shape": [123, 92], + "mean": self.mean, + "std": self.std, + "seed": 10, + "dtype": paddle.fluid.core.VarDesc.VarType.BF16, + "use_mkldnn": self.use_mkldnn + } + paddle.seed(10) + + self.outputs = {'Out': np.zeros((123, 92), dtype='float32')} + + def set_attrs(self): + self.mean = 1.0 + self.std = 2. + + def test_check_output(self): + self.check_output_with_place_customized( + self.verify_output, place=core.CUDAPlace(0)) + + def verify_output(self, outs): + outs = convert_uint16_to_float(outs) + self.assertEqual(outs[0].shape, (123, 92)) + hist, _ = np.histogram(outs[0], range=(-3, 5)) + hist = hist.astype("float32") + hist /= float(outs[0].size) + data = np.random.normal(size=(123, 92), loc=1, scale=2) + hist2, _ = np.histogram(data, range=(-3, 5)) + hist2 = hist2.astype("float32") + hist2 /= float(outs[0].size) + self.assertTrue( + np.allclose( + hist, hist2, rtol=0, atol=0.05), + "hist: " + str(hist) + " hist2: " + str(hist2)) + + class TestMeanStdAreInt(TestGaussianRandomOp): def set_attrs(self): self.mean = 1 -- GitLab From b46e49deaff2a98133b7176729874a6f8e9198a6 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 7 Mar 2022 13:23:16 +0800 Subject: [PATCH 081/261] [Phi] Remove storage deps of empty (#40136) * remove storage deps of empty * remove invalid empty method * remove error empty using * fix test_sparse_utils_dev_api * revert some sparse change * add memset for conv grad * resolve conflict * resolve conflict * resolve conflict --- paddle/phi/api/lib/data_transform.cc | 1 + paddle/phi/kernels/CMakeLists.txt | 15 ++++++++-- paddle/phi/kernels/cast_kernel.h | 2 +- paddle/phi/kernels/complex_kernel.h | 6 ++-- paddle/phi/kernels/concat_kernel.h | 2 +- paddle/phi/kernels/dot_kernel.h | 2 +- paddle/phi/kernels/empty_kernel.h | 23 ++++----------- paddle/phi/kernels/flatten_kernel.h | 2 +- paddle/phi/kernels/full_kernel.h | 4 +-- paddle/phi/kernels/funcs/reduce_function.h | 9 +++--- .../kernels/impl/matmul_grad_kernel_impl.h | 16 +++++----- .../impl/triangular_solve_grad_kernel_impl.h | 4 +-- paddle/phi/kernels/math_kernel.h | 12 ++++---- paddle/phi/kernels/matmul_kernel.h | 2 +- paddle/phi/kernels/reshape_kernel.h | 2 +- paddle/phi/kernels/scale_kernel.h | 2 +- paddle/phi/kernels/sign_kernel.h | 2 +- .../kernels/sparse/convolution_grad_kernel.h | 3 ++ .../phi/kernels/sparse/convolution_kernel.h | 13 +++++++++ .../sparse/cpu/convolution_grad_kernel.cc | 1 + .../phi/kernels/sparse/sparse_utils_kernel.h | 29 ++++++++++--------- paddle/phi/kernels/split_kernel.h | 2 +- paddle/phi/kernels/transpose_kernel.h | 2 +- paddle/phi/tests/api/scale_api.h | 1 + .../kernels/test_sparse_utils_dev_api.cc | 29 +++++++++++++++++++ 25 files changed, 116 insertions(+), 70 deletions(-) diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index ae67e2ebb35..79b8ac6d0b8 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/api/lib/kernel_dispatch.h" +#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/kernels/cast_kernel.h" #include "paddle/phi/kernels/transfer_layout_kernel.h" diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index e9108787082..16fae8d879c 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -9,13 +9,22 @@ add_subdirectory(funcs) # phi depends all phi kernel targets set_property(GLOBAL PROPERTY PHI_KERNELS "") +# [ 1. Common kernel compilation dependencies ] set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor softmax) # remove this dep after removing fluid deps on tensor creation set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) -# NOTE: Some kernels depend on some targets that are not commonly used. +# [ 2. Kernels that most kernels depend on ] +# There are a few kernels that are very basic operations, and most of the +# kernels depend on these kernels. +set(COMMON_BAISC_KERNELS empty_kernel full_kernel) +kernel_library(empty_kernel DEPS ${COMMON_KERNEL_DEPS}) +kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel) + +# [ 3. Kernels with special dependencies ] +# Some kernels depend on some targets that are not commonly used. # These targets are not suitable for common dependencies. # In this case, you need to manually generate them here. set(MANUAL_BUILD_KERNELS math_kernel softmax_kernel softmax_grad_kernel triangular_solve_grad_kernel) @@ -24,8 +33,8 @@ kernel_library(softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_reduce) -# auto parse and build kernel targets by cmake -register_kernels(EXCLUDES ${MANUAL_BUILD_KERNELS} DEPS ${COMMON_KERNEL_DEPS}) +# 4. 
auto parse and build kernel targets by cmake +register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS ${COMMON_KERNEL_DEPS} ${COMMON_BAISC_KERNELS} ) # phi sparse kernels add_subdirectory(sparse) diff --git a/paddle/phi/kernels/cast_kernel.h b/paddle/phi/kernels/cast_kernel.h index c760b2842d0..5e07388f5fb 100644 --- a/paddle/phi/kernels/cast_kernel.h +++ b/paddle/phi/kernels/cast_kernel.h @@ -29,7 +29,7 @@ template DenseTensor Cast(const Context& dev_ctx, const DenseTensor& x, DataType out_dtype) { - auto dense_out = phi::Empty(dev_ctx); + DenseTensor dense_out; MetaTensor meta_out(&dense_out); CastInferMeta(x, out_dtype, &meta_out); CastKernel(dev_ctx, x, out_dtype, &dense_out); diff --git a/paddle/phi/kernels/complex_kernel.h b/paddle/phi/kernels/complex_kernel.h index 2c52001ece1..07f93f9b926 100644 --- a/paddle/phi/kernels/complex_kernel.h +++ b/paddle/phi/kernels/complex_kernel.h @@ -38,7 +38,7 @@ template < std::is_same>::value, bool> = true> DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) { - auto dense_out = phi::Empty(dev_ctx); + DenseTensor dense_out; MetaTensor meta_out(&dense_out); UnchangedInferMeta(x, &meta_out); ConjKernel(dev_ctx, x, &dense_out); @@ -64,7 +64,7 @@ template < std::is_same>::value, bool> = true> DenseTensor Real(const Context& dev_ctx, const DenseTensor& x) { - auto dense_out = phi::Empty(dev_ctx); + DenseTensor dense_out; MetaTensor meta_out(&dense_out); RealAndImagInferMeta(x, &meta_out); RealKernel(dev_ctx, x, &dense_out); @@ -90,7 +90,7 @@ template < std::is_same>::value, bool> = true> DenseTensor Imag(const Context& dev_ctx, const DenseTensor& x) { - auto dense_out = phi::Empty(dev_ctx); + DenseTensor dense_out; MetaTensor meta_out(&dense_out); RealAndImagInferMeta(x, &meta_out); ImagKernel(dev_ctx, x, &dense_out); diff --git a/paddle/phi/kernels/concat_kernel.h b/paddle/phi/kernels/concat_kernel.h index ed969e963ec..4e72159aeca 100644 --- a/paddle/phi/kernels/concat_kernel.h +++ b/paddle/phi/kernels/concat_kernel.h @@ -38,7 +38,7 @@ DenseTensor Concat(const Context& dev_ctx, meta_x_ptr.push_back(&meta_x.back()); } - auto dense_out = phi::Empty(dev_ctx); + DenseTensor dense_out; MetaTensor meta_out(&dense_out); ConcatInferMeta(meta_x_ptr, axis.to(), &meta_out, /*is_runtime=*/true); ConcatKernel(dev_ctx, x, axis, &dense_out); diff --git a/paddle/phi/kernels/dot_kernel.h b/paddle/phi/kernels/dot_kernel.h index 9377fba204b..9c7703440d8 100644 --- a/paddle/phi/kernels/dot_kernel.h +++ b/paddle/phi/kernels/dot_kernel.h @@ -29,7 +29,7 @@ template DenseTensor Dot(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y) { - auto dense_out = phi::Empty(dev_ctx); + DenseTensor dense_out; MetaTensor meta_out(&dense_out); DotInferMeta(x, y, &meta_out); DotKernel(dev_ctx, x, y, &dense_out); diff --git a/paddle/phi/kernels/empty_kernel.h b/paddle/phi/kernels/empty_kernel.h index 0b8d95ee94f..f66f4419fd7 100644 --- a/paddle/phi/kernels/empty_kernel.h +++ b/paddle/phi/kernels/empty_kernel.h @@ -14,9 +14,9 @@ #pragma once -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" #include "paddle/phi/infermeta/nullary.h" #include "paddle/phi/infermeta/unary.h" @@ -34,28 +34,17 @@ void EmptyLikeKernel(const Context& dev_ctx, DataType dtype, DenseTensor* out); -// TODO(chenweihang): the tensor creation method need to be replaced later, -// all kernel api call Empty here instead of making tensor self 
template DenseTensor Empty(const Context& dev_ctx, DenseTensorMeta&& meta) { - phi::DenseTensor dense_out( - phi::make_intrusive( - dev_ctx.GetPlace()), - std::move(meta)); + phi::DenseTensor dense_out; + dense_out.set_meta(meta); + dev_ctx.Alloc(&dense_out, dense_out.dtype()); return dense_out; } -template -DenseTensor Empty(const Context& dev_ctx) { - return Empty(dev_ctx, - {paddle::experimental::CppTypeToDataType::Type(), - {-1}, - DataLayout::NCHW}); -} - template DenseTensor Empty(const Context& dev_ctx, const ScalarArray& shape) { - auto dense_out = Empty(dev_ctx); + DenseTensor dense_out; MetaTensor meta_out(&dense_out); DataType dtype = paddle::experimental::CppTypeToDataType::Type(); CreateInferMeta(shape, dtype, &meta_out); @@ -65,7 +54,7 @@ DenseTensor Empty(const Context& dev_ctx, const ScalarArray& shape) { template DenseTensor EmptyLike(const Context& dev_ctx, const DenseTensor& x) { - auto dense_out = Empty(dev_ctx); + DenseTensor dense_out; MetaTensor meta_out(&dense_out); DataType dtype = paddle::experimental::CppTypeToDataType::Type(); CreateLikeInferMeta(x, dtype, &meta_out); diff --git a/paddle/phi/kernels/flatten_kernel.h b/paddle/phi/kernels/flatten_kernel.h index de57dcf2e8d..808af7d9b7b 100644 --- a/paddle/phi/kernels/flatten_kernel.h +++ b/paddle/phi/kernels/flatten_kernel.h @@ -40,7 +40,7 @@ DenseTensor Flatten(const Context& dev_ctx, const DenseTensor& x, int start_axis, int stop_axis) { - auto dense_out = Empty(dev_ctx); + DenseTensor dense_out; MetaTensor meta_out(&dense_out); FlattenInferMeta(x, start_axis, stop_axis, &meta_out); FlattenKernel(dev_ctx, x, start_axis, stop_axis, &dense_out); diff --git a/paddle/phi/kernels/full_kernel.h b/paddle/phi/kernels/full_kernel.h index 05929ba83f3..c44f048051d 100644 --- a/paddle/phi/kernels/full_kernel.h +++ b/paddle/phi/kernels/full_kernel.h @@ -41,7 +41,7 @@ template DenseTensor Full(const Context& dev_ctx, const ScalarArray& shape, const Scalar& val) { - auto dense_out = Empty(dev_ctx); + DenseTensor dense_out; MetaTensor meta_out(&dense_out); DataType dtype = paddle::experimental::CppTypeToDataType::Type(); CreateInferMeta(shape, dtype, &meta_out); @@ -53,7 +53,7 @@ template DenseTensor FullLike(const Context& dev_ctx, const DenseTensor& x, const Scalar& val) { - auto dense_out = Empty(dev_ctx); + DenseTensor dense_out; MetaTensor meta_out(&dense_out); DataType dtype = paddle::experimental::CppTypeToDataType::Type(); CreateLikeInferMeta(x, dtype, &meta_out); diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h index 7df772682ec..ce6bb0d559c 100644 --- a/paddle/phi/kernels/funcs/reduce_function.h +++ b/paddle/phi/kernels/funcs/reduce_function.h @@ -344,9 +344,8 @@ struct ReduceConfig { const phi::GPUContext& dev_ctx, phi::DenseTensor* tmp) { if (should_reduce_again) { - tmp->ResizeAndAllocate(phi::make_ddim( + tmp->Resize(phi::make_ddim( {static_cast(left_num * grid.z * grid.y * sizeof(Ty))})); - output_data = dev_ctx.Alloc(tmp); } else { output_data = y_data; @@ -1053,8 +1052,8 @@ CubTensorReduceImpl(const Tx* x_data, reducer, reducer.initial(), stream); - phi::DenseTensor tmp = - phi::Empty(dev_ctx, {static_cast(temp_storage_bytes)}); + phi::DenseTensor tmp = phi::Empty( + dev_ctx, {static_cast(temp_storage_bytes)}); auto* temp_storage = dev_ctx.Alloc(&tmp); @@ -1106,7 +1105,7 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx, // y_data; phi::DDim tmp_ddim; - phi::DenseTensor tmp = phi::Empty(dev_ctx); + phi::DenseTensor tmp; auto x_data = x.data(); auto 
y_data = y->data(); diff --git a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h index 7c8d10e0565..d06bdc55030 100644 --- a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h @@ -329,8 +329,8 @@ void MatmulGradKernel(const Context& dev_ctx, x_conj = Conj(dev_ctx, x); y_conj = Conj(dev_ctx, y); - DenseTensor dx_help = Empty(dev_ctx); - DenseTensor dy_help = Empty(dev_ctx); + DenseTensor dx_help; + DenseTensor dy_help; if (transpose_x) { if (transpose_y) { @@ -686,8 +686,8 @@ void MatmulDoubleGradKernel(const Context& dev_ctx, y_conj = Conj(dev_ctx, y); } - DenseTensor dx_help = Empty(dev_ctx); - DenseTensor dy_help = Empty(dev_ctx); + DenseTensor dx_help; + DenseTensor dy_help; if (transpose_x) { if (transpose_y) { @@ -1373,10 +1373,10 @@ void MatmulTripleGradKernel(const Context& dev_ctx, VLOG(3) << "It need cost much time to reduce sum for the broadcast and " "wastes the memory. So we should avoid the case in reality"; - DenseTensor out_dx_help = Empty(dev_ctx); - DenseTensor out_dy_help = Empty(dev_ctx); - DenseTensor out_d_ddx_help = Empty(dev_ctx); - DenseTensor out_d_ddy_help = Empty(dev_ctx); + DenseTensor out_dx_help; + DenseTensor out_dy_help; + DenseTensor out_d_ddx_help; + DenseTensor out_d_ddy_help; if (out_d_dout) { ddx_conj = Conj(dev_ctx, ddx); diff --git a/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h index a6868ebe6ca..9b1e4b1d3a6 100644 --- a/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h @@ -49,7 +49,7 @@ void TriangularSolveGradKernel(const Context& dev_ctx, DenseTensor dy_bst = phi::Empty(dev_ctx, y_bst_dims_array); if (dy) { // calculate x's conjugate for complex - DenseTensor x_conj = phi::Empty(dev_ctx); + DenseTensor x_conj; x_conj.Resize(x.dims()); phi::funcs::ForRange x_for_range(dev_ctx, x.numel()); @@ -76,7 +76,7 @@ void TriangularSolveGradKernel(const Context& dev_ctx, DenseTensor dx_bst = phi::Empty(dev_ctx, x_bst_dims_array); if (dx) { // calculate x's conjugate for complex - DenseTensor out_conj = phi::Empty(dev_ctx); + DenseTensor out_conj; out_conj.Resize(out.dims()); phi::funcs::ForRange out_for_range(dev_ctx, out.numel()); diff --git a/paddle/phi/kernels/math_kernel.h b/paddle/phi/kernels/math_kernel.h index 342393d79bd..fe8f3b749cd 100644 --- a/paddle/phi/kernels/math_kernel.h +++ b/paddle/phi/kernels/math_kernel.h @@ -109,7 +109,7 @@ template DenseTensor Add(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y) { - auto dense_out = phi::Empty(dev_ctx); + DenseTensor dense_out; MetaTensor meta_out(&dense_out); ElementwiseInferMeta(x, y, &meta_out); AddKernel(dev_ctx, x, y, &dense_out); @@ -120,7 +120,7 @@ template DenseTensor Subtract(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y) { - auto dense_out = phi::Empty(dev_ctx); + DenseTensor dense_out; MetaTensor meta_out(&dense_out); ElementwiseInferMeta(x, y, &meta_out); SubtractKernel(dev_ctx, x, y, &dense_out); @@ -131,7 +131,7 @@ template DenseTensor Divide(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y) { - auto dense_out = phi::Empty(dev_ctx); + DenseTensor dense_out; MetaTensor meta_out(&dense_out); ElementwiseInferMeta(x, y, &meta_out); DivideKernel(dev_ctx, x, y, &dense_out); @@ -142,7 +142,7 @@ template DenseTensor Multiply(const Context& dev_ctx, const DenseTensor& x, const 
DenseTensor& y) { - auto dense_out = phi::Empty(dev_ctx); + DenseTensor dense_out; MetaTensor meta_out(&dense_out); ElementwiseInferMeta(x, y, &meta_out); MultiplyKernel(dev_ctx, x, y, &dense_out); @@ -154,7 +154,7 @@ DenseTensor Mean(const Context& dev_ctx, const DenseTensor& x, const std::vector& axis, bool keep_dim) { - auto dense_out = phi::Empty(dev_ctx); + DenseTensor dense_out; MetaTensor meta_out(&dense_out); ReduceInferMetaBase(x, axis, keep_dim, false, x.dtype(), &meta_out); MeanKernel(dev_ctx, x, axis, keep_dim, &dense_out); @@ -167,7 +167,7 @@ DenseTensor Sum(const Context& dev_ctx, const std::vector& axis, DataType dtype, bool keep_dim) { - auto dense_out = phi::Empty(dev_ctx); + DenseTensor dense_out; MetaTensor meta_out(&dense_out); SumInferMeta(x, axis, dtype, keep_dim, &meta_out); SumKernel(dev_ctx, x, axis, dtype, keep_dim, &dense_out); diff --git a/paddle/phi/kernels/matmul_kernel.h b/paddle/phi/kernels/matmul_kernel.h index 1f1cb22c271..b524b9e5863 100644 --- a/paddle/phi/kernels/matmul_kernel.h +++ b/paddle/phi/kernels/matmul_kernel.h @@ -35,7 +35,7 @@ DenseTensor Matmul(const Context& dev_ctx, const DenseTensor& y, bool transpose_x = false, bool transpose_y = false) { - auto dense_out = Empty(dev_ctx); + DenseTensor dense_out; MetaTensor meta_out(&dense_out); MatmulInferMeta(x, y, transpose_x, transpose_y, &meta_out); MatmulKernel(dev_ctx, x, y, transpose_x, transpose_y, &dense_out); diff --git a/paddle/phi/kernels/reshape_kernel.h b/paddle/phi/kernels/reshape_kernel.h index 1a3d0db8a8a..848f162a2a8 100644 --- a/paddle/phi/kernels/reshape_kernel.h +++ b/paddle/phi/kernels/reshape_kernel.h @@ -38,7 +38,7 @@ template DenseTensor Reshape(const Context& dev_ctx, const DenseTensor& x, const std::vector& shape) { - auto dense_out = Empty(dev_ctx); + DenseTensor dense_out; MetaTensor meta_out(&dense_out); InferMetaFromVecValue(x, shape, &meta_out); ReshapeKernel(dev_ctx, x, ScalarArray(shape), &dense_out); diff --git a/paddle/phi/kernels/scale_kernel.h b/paddle/phi/kernels/scale_kernel.h index 22e6efb03ac..7537dc1130b 100644 --- a/paddle/phi/kernels/scale_kernel.h +++ b/paddle/phi/kernels/scale_kernel.h @@ -34,7 +34,7 @@ DenseTensor Scale(const Context& dev_ctx, const Scalar& scale, float bias, bool bias_after_scale) { - auto dense_out = phi::Empty(dev_ctx); + DenseTensor dense_out; MetaTensor meta_out(&dense_out); UnchangedInferMeta(x, &meta_out); ScaleKernel( diff --git a/paddle/phi/kernels/sign_kernel.h b/paddle/phi/kernels/sign_kernel.h index 7ee1145012d..4b5900d90f4 100644 --- a/paddle/phi/kernels/sign_kernel.h +++ b/paddle/phi/kernels/sign_kernel.h @@ -25,7 +25,7 @@ void SignKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); template DenseTensor Sign(const Context& dev_ctx, const DenseTensor& x) { - auto dense_out = phi::Empty(dev_ctx); + DenseTensor dense_out; MetaTensor meta_out(&dense_out); UnchangedInferMeta(x, &meta_out); SignKernel(dev_ctx, x, &dense_out); diff --git a/paddle/phi/kernels/sparse/convolution_grad_kernel.h b/paddle/phi/kernels/sparse/convolution_grad_kernel.h index 1a6ac852448..3ada3473355 100644 --- a/paddle/phi/kernels/sparse/convolution_grad_kernel.h +++ b/paddle/phi/kernels/sparse/convolution_grad_kernel.h @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/sparse/convolution_kernel.h" namespace phi { namespace sparse { @@ -45,6 +47,7 @@ std::vector Conv3dGrad(const Context& dev_ctx, const int groups) { DenseTensor x_grad = phi::Empty(dev_ctx); DenseTensor kernel_grad = phi::Empty(dev_ctx); + // TODO(zhangkaihuo): call InferMeta func here Conv3dGradKernel(dev_ctx, x, rulebook, diff --git a/paddle/phi/kernels/sparse/convolution_kernel.h b/paddle/phi/kernels/sparse/convolution_kernel.h index 71160a6365d..1c1e62c8306 100644 --- a/paddle/phi/kernels/sparse/convolution_kernel.h +++ b/paddle/phi/kernels/sparse/convolution_kernel.h @@ -14,11 +14,24 @@ limitations under the License. */ #pragma once +#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" namespace phi { + +template +DenseTensor Empty(const Context& dev_ctx) { + phi::DenseTensor dense_out( + phi::make_intrusive( + dev_ctx.GetPlace()), + {paddle::experimental::CppTypeToDataType::Type(), + {-1}, + DataLayout::NCHW}); + return dense_out; +} + namespace sparse { struct Dims4D { diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc index d4f770ce871..cb6cf435435 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc @@ -74,6 +74,7 @@ void Conv3dGradKernel(const Context& dev_ctx, dev_ctx.Alloc( kernel_grad, kernel_grad->dtype(), kernel_grad->numel() * sizeof(T)); T* d_kernel_ptr = kernel_grad->data(); + memset(d_kernel_ptr, 0, sizeof(T) * kernel_grad->numel()); Gather(x.non_zero_elements().data(), rulebook_ptr + rulebook_len, diff --git a/paddle/phi/kernels/sparse/sparse_utils_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_kernel.h index d96d134a26b..c83b2130ed4 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_kernel.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" @@ -63,8 +64,8 @@ template SparseCooTensor DenseToSparseCoo(const Context& dev_ctx, const DenseTensor& x, const int64_t sparse_dim) { - DenseTensor indices = phi::Empty(dev_ctx); - DenseTensor values = phi::Empty(dev_ctx); + DenseTensor indices; + DenseTensor values; SparseCooTensor coo(indices, values, x.dims()); DenseToSparseCooKernel(dev_ctx, x, sparse_dim, &coo); return coo; @@ -78,8 +79,8 @@ void SparseCsrToCooKernel(const Context& dev_ctx, template SparseCooTensor SparseCsrToCoo(const Context& dev_ctx, const SparseCsrTensor& x) { - DenseTensor indices = phi::Empty(dev_ctx); - DenseTensor values = phi::Empty(dev_ctx); + DenseTensor indices; + DenseTensor values; SparseCooTensor coo(indices, values, x.dims()); SparseCsrToCooKernel(dev_ctx, x, &coo); return coo; @@ -93,9 +94,9 @@ void SparseCooToCsrKernel(const Context& dev_ctx, template SparseCsrTensor SparseCooToCsr(const Context& dev_ctx, const SparseCooTensor& x) { - DenseTensor non_zero_crows = phi::Empty(dev_ctx); - DenseTensor non_zero_cols = phi::Empty(dev_ctx); - DenseTensor non_zero_elements = phi::Empty(dev_ctx); + DenseTensor non_zero_crows; + DenseTensor non_zero_cols; + DenseTensor non_zero_elements; SparseCsrTensor csr( non_zero_crows, non_zero_cols, non_zero_elements, x.dims()); SparseCooToCsrKernel(dev_ctx, x, &csr); @@ -113,8 +114,8 @@ void DenseToSparseCsrKernel(const Context& dev_ctx, phi::errors::InvalidArgument( "SparseCsrTensor only support 2-D or 3-D Tensor.")); const int64_t sparse_dim = x_dims.size() == 2 ? 2 : 3; - DenseTensor indices = phi::Empty(dev_ctx); - DenseTensor values = phi::Empty(dev_ctx); + DenseTensor indices; + DenseTensor values; SparseCooTensor coo(indices, values, x.dims()); DenseToSparseCooKernel(dev_ctx, x, sparse_dim, &coo); SparseCooToCsrKernel(dev_ctx, coo, out); @@ -122,9 +123,9 @@ void DenseToSparseCsrKernel(const Context& dev_ctx, template SparseCsrTensor DenseToSparseCsr(const Context& dev_ctx, const DenseTensor& x) { - DenseTensor non_zero_crows = phi::Empty(dev_ctx); - DenseTensor non_zero_cols = phi::Empty(dev_ctx); - DenseTensor non_zero_elements = phi::Empty(dev_ctx); + DenseTensor non_zero_crows; + DenseTensor non_zero_cols; + DenseTensor non_zero_elements; SparseCsrTensor csr( non_zero_crows, non_zero_cols, non_zero_elements, x.dims()); DenseToSparseCsrKernel(dev_ctx, x, &csr); @@ -148,8 +149,8 @@ template void SparseCsrToDenseKernel(const Context& dev_ctx, const SparseCsrTensor& x, DenseTensor* out) { - DenseTensor indices = phi::Empty(dev_ctx); - DenseTensor values = phi::Empty(dev_ctx); + DenseTensor indices; + DenseTensor values; SparseCooTensor coo(indices, values, x.dims()); SparseCsrToCooKernel(dev_ctx, x, &coo); SparseCooToDenseKernel(dev_ctx, coo, out); diff --git a/paddle/phi/kernels/split_kernel.h b/paddle/phi/kernels/split_kernel.h index 840fe4366ce..e42b25e60c4 100644 --- a/paddle/phi/kernels/split_kernel.h +++ b/paddle/phi/kernels/split_kernel.h @@ -50,7 +50,7 @@ std::vector Split(const Context& dev_ctx, result.reserve(out_number); for (size_t i = 0; i < out_number; ++i) { - result.emplace_back(phi::Empty(dev_ctx)); + result.emplace_back(DenseTensor()); out_meta.emplace_back(&result.back()); out_meta_ptr.push_back(&out_meta.back()); } diff --git a/paddle/phi/kernels/transpose_kernel.h b/paddle/phi/kernels/transpose_kernel.h index 3d89b324bab..b8d7fbaa275 100644 --- 
a/paddle/phi/kernels/transpose_kernel.h +++ b/paddle/phi/kernels/transpose_kernel.h @@ -32,7 +32,7 @@ template DenseTensor Transpose(const Context& dev_ctx, const DenseTensor& x, const std::vector& axis) { - auto dense_out = Empty(dev_ctx); + DenseTensor dense_out; MetaTensor meta_out(&dense_out); TransposeInferMeta(x, axis, &meta_out); TransposeKernel(dev_ctx, x, axis, &dense_out); diff --git a/paddle/phi/tests/api/scale_api.h b/paddle/phi/tests/api/scale_api.h index d93f00129b9..6b9bb7aecef 100644 --- a/paddle/phi/tests/api/scale_api.h +++ b/paddle/phi/tests/api/scale_api.h @@ -20,6 +20,7 @@ #include "paddle/phi/api/lib/api_registry.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc index 3e2ad0495f3..b8f214b79e2 100644 --- a/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc @@ -90,6 +90,10 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x, phi::CPUContext dev_ctx_cpu; dev_ctx_cpu.Init(); + dev_ctx_cpu.SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); // 1. test cpu auto cpu_sparse_out = @@ -300,6 +304,11 @@ void TestSparseCsrToCoo(const DDim& dense_dims, // 1. test cpu phi::CPUContext dev_ctx_cpu; + dev_ctx_cpu.Init(); + dev_ctx_cpu.SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); auto cpu_sparse_out = sparse::SparseCsrToCoo(dev_ctx_cpu, csr); CheckResult(&dev_ctx_cpu, cpu_sparse_out, @@ -473,6 +482,11 @@ void TestCooToCsr(const DDim& dense_dims, // 1. test cpu phi::CPUContext dev_ctx_cpu; + dev_ctx_cpu.Init(); + dev_ctx_cpu.SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); auto cpu_sparse_out = sparse::SparseCooToCsr(dev_ctx_cpu, coo); CheckCsrResult(&dev_ctx_cpu, cpu_sparse_out, @@ -563,6 +577,11 @@ void TestDenseToSparseCsr(const DenseTensor& dense_x, const auto alloc = std::make_shared( paddle::platform::CPUPlace()); phi::CPUContext dev_ctx_cpu; + dev_ctx_cpu.Init(); + dev_ctx_cpu.SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); // 1. test cpu auto cpu_sparse_out = sparse::DenseToSparseCsr(dev_ctx_cpu, dense_x); @@ -667,6 +686,11 @@ void TestSparseCooToDense(const DDim& dense_dims, const int64_t non_zero_num, const int64_t sparse_dim) { phi::CPUContext dev_ctx_cpu; + dev_ctx_cpu.Init(); + dev_ctx_cpu.SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); const auto alloc = std::make_shared( paddle::platform::CPUPlace()); @@ -836,6 +860,11 @@ void TestSparseCsrToDense(const DDim& dense_dims, // 1. 
test cpu phi::CPUContext dev_ctx_cpu; + dev_ctx_cpu.Init(); + dev_ctx_cpu.SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); DenseTensor cpu_sparse_out = sparse::SparseCsrToDense(dev_ctx_cpu, csr); int cmp_cpu = memcmp(cpu_sparse_out.data(), dense_data.data(), -- GitLab From d255bfe0203bb81b8c68b86d77ed14c350beaf52 Mon Sep 17 00:00:00 2001 From: JingZhuangzhuang <75348594+JZZ-NOTE@users.noreply.github.com> Date: Mon, 7 Mar 2022 13:25:57 +0800 Subject: [PATCH 082/261] fix_conv2d_trt_convert_test_case (#39882) * fix_conv2d_trt_convert_test_case * fix_conv2d_trt_convert_test_case * fix_conv2d_trt_convert_test_case * fix_conv2d_trt_convert_test_case --- .../test_trt_convert_conv2d_transpose.py | 44 ++++++------------- .../test_trt_convert_deformable_conv.py | 14 +----- ..._trt_convert_depthwise_conv2d_transpose.py | 44 ++++++------------- 3 files changed, 29 insertions(+), 73 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py index e21d67839eb..65fc35f9c56 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py @@ -37,6 +37,13 @@ class TrtConvertConv2dTransposeTest(TrtLayerAutoScanTest): if inputs['input_data'].shape[1] != weights['conv2d_weight'].shape[0]: return False + if attrs[0]['dilations'][0] != 1 or attrs[0]['dilations'][1] != 1: + return False + + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 7000: + return False + return True def sample_program_configs(self): @@ -175,9 +182,9 @@ class TrtConvertConv2dTransposeTest(TrtLayerAutoScanTest): self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( attrs, False), (1e-5, 1e-3) - self.trt_param.precision = paddle_infer.PrecisionType.Int8 - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), (1e-5, 1e-5) + # self.trt_param.precision = paddle_infer.PrecisionType.Int8 + # yield self.create_inference_config(), generate_trt_nodes_num( + # attrs, False), (1e-5, 1e-5) # for dynamic_shape generate_dynamic_shape(attrs) @@ -187,41 +194,18 @@ class TrtConvertConv2dTransposeTest(TrtLayerAutoScanTest): self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( attrs, True), (1e-5, 1e-3) - self.trt_param.precision = paddle_infer.PrecisionType.Int8 - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), (1e-5, 1e-5) + # self.trt_param.precision = paddle_infer.PrecisionType.Int8 + # yield self.create_inference_config(), generate_trt_nodes_num( + # attrs, True), (1e-5, 1e-5) def add_skip_trt_case(self): def teller1(program_config, predictor_config): - if program_config.ops[0].attrs[ - 'padding_algorithm'] == "SAME" or program_config.ops[ - 0].attrs['padding_algorithm'] == "VALID": - return True - return False - - self.add_skip_case( - teller1, SkipReasons.TRT_NOT_IMPLEMENTED, - "When padding_algorithm is 'SAME' or 'VALID', Trt dose not support. In this case, trt build error is caused by scale op." 
- ) - - def teller2(program_config, predictor_config): - if program_config.ops[0].attrs['dilations'][ - 0] != 1 or program_config.ops[0].attrs['dilations'][1] != 1: - return True - return False - - self.add_skip_case( - teller2, SkipReasons.TRT_NOT_IMPLEMENTED, - "When dilations's element is not equal 1, there are different behaviors between Trt and Paddle." - ) - - def teller3(program_config, predictor_config): if self.trt_param.precision == paddle_infer.PrecisionType.Int8: return True return False self.add_skip_case( - teller3, SkipReasons.TRT_NOT_IMPLEMENTED, + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, "When precisionType is int8 without relu op, output is different between Trt and Paddle." ) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_deformable_conv.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_deformable_conv.py index 9d29034d7fe..c692e92861b 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_deformable_conv.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_deformable_conv.py @@ -147,7 +147,7 @@ class TrtConvertDeformableConvTest(TrtLayerAutoScanTest): if len(attrs[0]['paddings']) == 4: return 1, 2 else: - return 1, 2 + return 1, 4 attrs = [ program_config.ops[i].attrs @@ -160,20 +160,8 @@ class TrtConvertDeformableConvTest(TrtLayerAutoScanTest): yield self.create_inference_config(), generate_trt_nodes_num( attrs, False), 1e-5 - def add_skip_trt_case(self): - def teller1(program_config, predictor_config): - if len(program_config.ops[0].attrs["strides"]) != 2: - return False - - return True - - self.add_skip_case( - teller1, SkipReasons.TRT_NOT_IMPLEMENTED, - "In deformable conv, length of Attr(strides) should be 2.") - def test(self): self.trt_param.workspace_size = 1 << 28 - self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py index 66a007f64b6..5f77e7de0df 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py @@ -40,6 +40,13 @@ class TrtConvertDepthwiseConv2dTransposeTest(TrtLayerAutoScanTest): if inputs['input_data'].shape[1] != attrs[0]['groups']: return False + if attrs[0]['dilations'][0] != 1 or attrs[0]['dilations'][1] != 1: + return False + + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 7000: + return False + return True def sample_program_configs(self): @@ -139,9 +146,9 @@ class TrtConvertDepthwiseConv2dTransposeTest(TrtLayerAutoScanTest): self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( attrs, False), (1e-5, 1e-3) - self.trt_param.precision = paddle_infer.PrecisionType.Int8 - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), (1e-5, 1e-5) + # self.trt_param.precision = paddle_infer.PrecisionType.Int8 + # yield self.create_inference_config(), generate_trt_nodes_num( + # attrs, False), (1e-5, 1e-5) # for dynamic_shape generate_dynamic_shape(attrs) @@ -151,41 +158,18 @@ class TrtConvertDepthwiseConv2dTransposeTest(TrtLayerAutoScanTest): self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( attrs, True), 
(1e-5, 1e-5) - self.trt_param.precision = paddle_infer.PrecisionType.Int8 - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), (1e-5, 1e-5) + # self.trt_param.precision = paddle_infer.PrecisionType.Int8 + # yield self.create_inference_config(), generate_trt_nodes_num( + # attrs, True), (1e-5, 1e-5) def add_skip_trt_case(self): def teller1(program_config, predictor_config): - if program_config.ops[0].attrs[ - 'padding_algorithm'] == "SAME" or program_config.ops[ - 0].attrs['padding_algorithm'] == "VALID": - return True - return False - - self.add_skip_case( - teller1, SkipReasons.TRT_NOT_IMPLEMENTED, - "When padding_algorithm is 'SAME' or 'VALID', Trt dose not support. In this case, trt build error is caused by scale op." - ) - - def teller2(program_config, predictor_config): - if program_config.ops[0].attrs['dilations'][ - 0] != 1 or program_config.ops[0].attrs['dilations'][1] != 1: - return True - return False - - self.add_skip_case( - teller2, SkipReasons.TRT_NOT_IMPLEMENTED, - "When dilations's element is not equal 1, there are different behaviors between Trt and Paddle." - ) - - def teller3(program_config, predictor_config): if self.trt_param.precision == paddle_infer.PrecisionType.Int8: return True return False self.add_skip_case( - teller3, SkipReasons.TRT_NOT_IMPLEMENTED, + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, "When precisionType is int8 without relu op, output is different between Trt and Paddle." ) -- GitLab From 55a3bfbd0c8b07f3eda70ae4206efc6389872622 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Mon, 7 Mar 2022 14:05:12 +0800 Subject: [PATCH 083/261] [Phi] Fix macro name typo (#40204) --- paddle/fluid/operators/optimizers/adadelta_op.cc | 4 ++-- paddle/fluid/operators/optimizers/adamax_op.cc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cc b/paddle/fluid/operators/optimizers/adadelta_op.cc index 3cafbce04d3..315831ddc0f 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.cc +++ b/paddle/fluid/operators/optimizers/adadelta_op.cc @@ -82,8 +82,8 @@ $$ namespace ops = paddle::operators; namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(adadelta, AdadeltaInferMetaFunctor, - PT_INFER_META(phi::AdadeltaInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(adadelta, AdadeltaInferMetaFunctor, + PD_INFER_META(phi::AdadeltaInferMeta)); REGISTER_OPERATOR( adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker, paddle::framework::EmptyGradOpMaker, diff --git a/paddle/fluid/operators/optimizers/adamax_op.cc b/paddle/fluid/operators/optimizers/adamax_op.cc index 29f3d3b09de..036839dd130 100644 --- a/paddle/fluid/operators/optimizers/adamax_op.cc +++ b/paddle/fluid/operators/optimizers/adamax_op.cc @@ -92,8 +92,8 @@ division by 0 error. 
} // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(adamax, AdamaxInferMetaFunctor, - PT_INFER_META(phi::AdamaxInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(adamax, AdamaxInferMetaFunctor, + PD_INFER_META(phi::AdamaxInferMeta)); REGISTER_OPERATOR( adamax, ops::AdamaxOp, ops::AdamaxOpMaker, -- GitLab From 0fb6bca45d3d6c879a55d5cd5c388da713eec780 Mon Sep 17 00:00:00 2001 From: Wei Shengyu Date: Mon, 7 Mar 2022 14:11:28 +0800 Subject: [PATCH 084/261] fix infer shapes of pool_with_index (#40139) * dbg pool infer shapes * dbg * fix format --- paddle/fluid/operators/pool_with_index_op.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index e0c24935b47..d061f9ae056 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -81,8 +81,12 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { output_shape.insert(output_shape.end(), ksize.begin(), ksize.end()); } else { for (size_t i = 0; i < ksize.size(); ++i) { - output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i], - paddings[i], strides[i])); + if ((!ctx->IsRuntime()) && (in_x_dims[i + 2] < 0)) { + output_shape.push_back(in_x_dims[i + 2]); + } else { + output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i], + paddings[i], strides[i])); + } } } ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); -- GitLab From c52a664e86a53c77a3ee33400edb49de36d81f4e Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Mon, 7 Mar 2022 14:33:58 +0800 Subject: [PATCH 085/261] [Phi]Move elementwise_div grad/double grad Kernel to Phi (#40172) * move elementwise_div grad * change mutable_data to alloc * fix compile bugs --- .../new_executor/standalone_executor_test.cc | 2 +- .../elementwise/elementwise_div_op.cc | 36 --- .../elementwise/elementwise_div_op.cu | 96 -------- .../elementwise/elementwise_div_op.h | 211 ------------------ .../elementwise/elementwise_functor.h | 61 ----- .../elementwise/elementwise_op_function.h | 71 +----- .../test_elementwise_div_grad_grad.cc | 2 +- .../kernels/cpu/elementwise_grad_kernel.cc | 37 ++- paddle/phi/kernels/elementwise_grad_kernel.h | 21 ++ paddle/phi/kernels/funcs/broadcast_function.h | 20 ++ .../phi/kernels/funcs/elementwise_functor.h | 68 ++++++ .../phi/kernels/funcs/elementwise_grad_base.h | 27 +++ paddle/phi/kernels/gpu/elementwise_grad.h | 126 +++++++++++ .../kernels/gpu/elementwise_grad_kernel.cu | 62 ++++- .../impl/elementwise_grad_kernel_impl.h | 156 +++++++++++++ paddle/phi/kernels/math_kernel.cc | 1 + paddle/phi/ops/compat/elementwise_sig.cc | 22 ++ 17 files changed, 547 insertions(+), 472 deletions(-) delete mode 100644 paddle/fluid/operators/elementwise/elementwise_div_op.cu diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 2c3359ffa8e..62d87b6917e 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -54,7 +54,7 @@ USE_OP(slice_grad); USE_OP(lookup_table_grad); USE_OP(sqrt); USE_OP(elementwise_max); -USE_OP(elementwise_div); +USE_OP_ITSELF(elementwise_div); USE_OP(sgd); USE_OP(squared_l2_norm); USE_OP(memcpy_h2d); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc index 38cd232e4d1..13fd9b81a87 100644 --- 
a/paddle/fluid/operators/elementwise/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc @@ -102,42 +102,6 @@ REGISTER_OPERATOR( REGISTER_OPERATOR(elementwise_div_grad_grad, ops::ElementwiseDivOpDoubleGrad, ops::ElementwiseDoubleGradOpInplaceInferer); -REGISTER_OP_CPU_KERNEL( - elementwise_div, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel>, - ops::ElementwiseDivKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_div_grad, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel>, - ops::ElementwiseDivGradKernel>); - -REGISTER_OP_CPU_KERNEL( - elementwise_div_grad_grad, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel>, - ops::ElementwiseDivDoubleGradKernel>); - REGISTER_OP_VERSION(elementwise_div) .AddCheckpoint( R"ROC(Register elementwise_div for adding the attribute of Scale_y)ROC", diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu deleted file mode 100644 index 9eb4b0352e5..00000000000 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/elementwise/elementwise_div_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -namespace paddle { -namespace operators { - -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseDivGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - const auto& dev_ctx = ctx.template device_context(); - const auto place = ctx.GetPlace(); - if (dx != nullptr && dy != nullptr) { - std::vector ins = {dout, out, y}; - GetGradXAndYOut( - dev_ctx, place, axis, ins, dout, dx, dy, DivGradXYFunctor()); - } else if (dx != nullptr && dy == nullptr) { - std::vector ins = {dout, y}; - GetGradXOrYOut(dev_ctx, place, axis, ins, dout, - dx, DivGradXFunctor()); - } else if (dy != nullptr && dx == nullptr) { - std::vector ins = {dout, out, y}; - GetGradXOrYOut( - dev_ctx, place, axis, ins, dout, dy, DivGradYFunctor()); - } -} - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL( - elementwise_div, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel>, - ops::ElementwiseDivKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_div_grad, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel>, - ops::ElementwiseDivGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_div_grad_grad, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel>, - ops::ElementwiseDivDoubleGradKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index c58a7f36548..e9adb9abdb5 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -20,142 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template -void default_elementwise_sub(const framework::ExecutionContext& ctx, - const framework::Tensor* x, - const framework::Tensor* y, framework::Tensor* z) { - int axis = ctx.Attr("axis"); - auto x_dims = x->dims(); - auto y_dims = y->dims(); - if (x_dims.size() >= y_dims.size()) { - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - SubFunctor(), z); - } else { - ElementwiseComputeEx, DeviceContext, T>( - ctx, x, y, axis, InverseSubFunctor(), z); - } -} - -template -void default_elementwise_div(const framework::ExecutionContext& ctx, - const framework::Tensor* x, - const framework::Tensor* y, framework::Tensor* z) { - int axis = ctx.Attr("axis"); - auto x_dims = x->dims(); - auto y_dims = y->dims(); - if (x_dims.size() >= y_dims.size()) { - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - DivFunctor(), z); - } else { - ElementwiseComputeEx, DeviceContext, T>( - ctx, x, y, axis, InverseDivFunctor(), z); - } -} - -template -class ElementwiseDivKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - z->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.device_context(); - int axis = ctx.Attr("axis"); - auto pt_x = paddle::experimental::MakePhiDenseTensor(*x); - auto pt_y = paddle::experimental::MakePhiDenseTensor(*y); - auto pt_z = paddle::experimental::MakePhiDenseTensor(*z); - phi::DivideRawKernel( - static_cast::TYPE&>(dev_ctx), - *pt_x.get(), *pt_y.get(), axis, pt_z.get()); - } -}; - -template -struct DivGradDX { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout / y; } -}; - -template -struct DivGradDX> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex y_conj(y.real, -y.imag); - return dout / y_conj; - } -}; - -template -struct DivGradDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return -dout * out / y; - } -}; - -template -struct DivGradDY> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex out_div_y_conj((out / y).real, - -(out / y).imag); - return -dout * out_div_y_conj; - } -}; - -template -struct DivDoubleDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return y * out * dout - x * dout; - } -}; - -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseDivGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - - ElemwiseGradCompute, DivGradDY>( - ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX(), DivGradDY()); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseDivGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy); -#endif - -template -class ElementwiseDivGradKernel : public ElemwiseGradKernel { - public: - void 
Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - ElementwiseDivGrad(ctx, x, y, out, dout, dx, dy); - } -}; - class ElementwiseDivOpDoubleGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -206,80 +70,5 @@ class ElementwiseDivOpDoubleGrad : public framework::OperatorWithKernel { } }; -template -class ElementwiseDivDoubleGradKernel : public framework::OpKernel { - using Tensor = framework::Tensor; - - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* Y = ctx.Input("Y"); - auto* Out = ctx.Input("Out"); - auto* ddX = ctx.Input("DDX"); - auto* ddY = ctx.Input("DDY"); - auto* dX = ctx.Input("DX"); - - auto* dY = ctx.Output(framework::GradVarName("Y")); - auto* dOut = ctx.Output("DOut"); - auto* ddOut = ctx.Output("DDOut"); - - int axis = ctx.Attr("axis"); - - if (dY) dY->mutable_data(Y->dims(), ctx.GetPlace()); - if (dOut) dOut->mutable_data(Out->dims(), ctx.GetPlace()); - if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); - - // ddX_safe == null ? 0 : ddX - // ddY_safe == null ? 0 : ddY - Tensor ddX_safe, ddY_safe; - GetDoubleGradSafeTensor(ctx, dX, ddX, &ddX_safe); - GetDoubleGradSafeTensor(ctx, Y, ddY, &ddY_safe); - - // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y - // dY = Out * dX * ddY / Y - dX * ddX / Y - // dOut = - dX * ddY - // To save memory, (1) dout can be used as 'tmp' tensor, (2) ddout can - // inplace ddx - Tensor tmp; - if (dOut) { - tmp = *dOut; - } else { - auto& dev_ctx = ctx.template device_context(); - tmp = ctx.AllocateTmpTensor(Out->dims(), dev_ctx); - } - if (dY) { - // dX_div_Y = dX / Y; - Tensor dX_div_Y = tmp; - default_elementwise_div(ctx, dX, Y, &dX_div_Y); - - // NOTE(dengkaipeng): in the following ElemwiseGradCompute, for the - // first output tensor is nullptr, the branch to calculate first - // output tensor will not be activated, DivGradDx function will not - // be called and can be ignored, the first branch has little effect - // on running speed. 
- - // dY = Out * dX * ddY / Y - dX * ddX / Y - ElemwiseGradCompute, DivDoubleDY>( - ctx, ddX_safe, ddY_safe, *Out, dX_div_Y, axis, nullptr, dY, - DivGradDX(), DivDoubleDY()); - } - - if (ddOut) { - // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y - default_elementwise_mul(ctx, Out, &ddY_safe, &tmp); - default_elementwise_sub(ctx, &ddX_safe, &tmp, &tmp); - default_elementwise_div(ctx, &tmp, Y, ddOut); - } - - if (dOut) { - // dOut = - dX * ddY - default_elementwise_mul(ctx, dX, &ddY_safe, dOut); - auto& place = - *ctx.template device_context().eigen_device(); - auto dout = framework::EigenVector::Flatten(*dOut); - dout.device(place) = static_cast(-1) * dout; - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h index 86f5be3071c..8e0bf78e9b7 100644 --- a/paddle/fluid/operators/elementwise/elementwise_functor.h +++ b/paddle/fluid/operators/elementwise/elementwise_functor.h @@ -90,67 +90,6 @@ struct MinFunctor { template using Complex = paddle::platform::complex; -template -struct DivGradXYFunctor { - inline HOSTDEVICE phi::Array operator()(const InT a, const InT b, - const InT c) { - // dx = dout / y - // dy = - dout * out / y - phi::Array outs; - outs[0] = a / c; - outs[1] = -a * b / c; - return outs; - } -}; - -template -struct DivGradXYFunctor, Complex> { - inline HOSTDEVICE phi::Array, 2> operator()( - const Complex a, const Complex b, const Complex c) { - phi::Array, 2> outs; - Complex c_conj(c.real, -c.imag); - Complex out_div_c_conj((b / c).real, -(b / c).imag); - outs[0] = a / c_conj; - outs[1] = -a * out_div_c_conj; - return outs; - } -}; - -// Float div grad -template -struct DivGradXFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; } -}; - -// Complex div grad -template -struct DivGradXFunctor> { - inline HOSTDEVICE Complex operator()(const Complex a, - const Complex b) const { - Complex b_conj(b.real, -b.imag); - return a / b_conj; - } -}; - -// Float mul and div -template -struct DivGradYFunctor { - inline HOSTDEVICE T operator()(const T a, const T b, const T c) const { - return -a * b / c; - } -}; - -// Complex mul and div -template -struct DivGradYFunctor> { - inline HOSTDEVICE Complex operator()(const Complex a, - const Complex b, - const Complex c) const { - Complex out_div_c_conj((b / c).real, -(b / c).imag); - return -a * out_div_c_conj; - } -}; - // Fmax template struct FMaxFunctor { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 61862aa9f87..80b07721f0b 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -45,6 +45,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" #endif @@ -145,17 +146,9 @@ void ElemwiseGradCompute(const framework::ExecutionContext &ctx, const framework::Tensor &dout, int axis, framework::Tensor *dx, framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { - const framework::DDim &x_dim = x.dims(); - const framework::DDim &y_dim = y.dims(); const auto &dev_ctx = ctx.template device_context(); - if (x.dims() == y.dims()) { - phi::funcs::ElemwiseGradComputeNoBroadcast( - dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); - } else { - phi::funcs::ElemwiseGradComputeWithBroadcast( - dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); - } + phi::funcs::ElemwiseGradCompute( + dev_ctx, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } // It is a common implementation to compute binary calculation with the support @@ -1174,14 +1167,6 @@ static inline std::vector GetReduceDim(const framework::DDim &in, } #if defined(__NVCC__) || defined(__HIPCC__) -template -void ReduceWrapper(const platform::CUDADeviceContext &dev_ctx, int axis, - framework::Tensor *src, framework::Tensor *dst) { - std::vector reduce_dims = GetReduceDim(dst->dims(), src->dims(), axis); - TensorReduceImpl>( - dev_ctx, *src, dst, kps::IdentityFunctor(), reduce_dims, - dev_ctx.stream()); -} template void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx, @@ -1189,36 +1174,8 @@ void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx, std::vector ins, const framework::Tensor *dout, framework::Tensor *dx, framework::Tensor *dy, Functor func) { - framework::Tensor tmp_dx; - framework::Tensor tmp_dy; - dx->mutable_data(place); - dy->mutable_data(place); - std::vector outs; - if (dx->dims() == dout->dims() && dy->dims() == dout->dims()) { - outs = {dx, dy}; - } else if (dx->dims() != dout->dims() && dy->dims() == dout->dims()) { - tmp_dx.mutable_data(dout->dims(), place); - outs = {&tmp_dx, dy}; - } else if (dx->dims() == dout->dims() && dy->dims() != dout->dims()) { - tmp_dy.mutable_data(dout->dims(), place); - outs = {dx, &tmp_dy}; - } else if (dx->dims() != dout->dims() && dy->dims() != dout->dims()) { - tmp_dy.mutable_data(dout->dims(), place); - tmp_dx.mutable_data(dout->dims(), place); - outs = {&tmp_dx, &tmp_dy}; - } - - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, axis, func); - - if (dx->dims() != dout->dims() && dy->dims() == dout->dims()) { - ReduceWrapper(dev_ctx, axis, &tmp_dx, dx); - } else if (dx->dims() == dout->dims() && dy->dims() != dout->dims()) { - ReduceWrapper(dev_ctx, axis, &tmp_dy, dy); - } else if (dx->dims() != dout->dims() && dy->dims() != dout->dims()) { - ReduceWrapper(dev_ctx, axis, &tmp_dx, dx); - ReduceWrapper(dev_ctx, axis, &tmp_dy, dy); - } + phi::GetGradXAndYOut(dev_ctx, place, axis, ins, *dout, dx, dy, + func); } template @@ -1227,22 +1184,8 @@ void GetGradXOrYOut(const platform::CUDADeviceContext &dev_ctx, std::vector ins, const framework::Tensor *dout, framework::Tensor *dxy, Functor func) { - framework::Tensor tmp_dxy; - dxy->mutable_data(place); - - std::vector outs; - if (dxy->dims() != dout->dims()) { - tmp_dxy.mutable_data(dout->dims(), place); - outs = {&tmp_dxy}; - } else { - outs = {dxy}; - } - - paddle::operators::LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, - axis, func); - if (dxy->dims() != dout->dims()) { - 
ReduceWrapper(dev_ctx, axis, &tmp_dxy, dxy); - } + phi::GetGradXOrYOut(dev_ctx, place, axis, ins, *dout, dxy, + func); } #endif diff --git a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc index 9aa206efed8..7890d634e99 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc @@ -28,7 +28,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -USE_OP(elementwise_div); +USE_OP_ITSELF(elementwise_div); namespace paddle { namespace operators { diff --git a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc index e48ee805959..c9177f1c46e 100644 --- a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc @@ -18,7 +18,6 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/cpu/elementwise_grad.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" @@ -108,6 +107,20 @@ void SubtractDoubleGradKernel(const Context& dev_ctx, phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); } +template +void DivideGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + funcs::ElementwiseGradPreProcess(dout, dx); + phi::funcs::ElemwiseGradCompute, DivGradDY>( + dev_ctx, x, y, out, dout, axis, dx, dy, DivGradDX(), DivGradDY()); +} + } // namespace phi PD_REGISTER_KERNEL(add_grad, @@ -171,3 +184,25 @@ PD_REGISTER_KERNEL(subtract_double_grad, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(divide_grad, + CPU, + ALL_LAYOUT, + phi::DivideGradKernel, + float, + double, + int, + int64_t, + paddle::platform::complex, + paddle::platform::complex) {} + +PD_REGISTER_KERNEL(divide_double_grad, + CPU, + ALL_LAYOUT, + phi::DivideDoubleGradKernel, + float, + double, + int, + int64_t, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/phi/kernels/elementwise_grad_kernel.h b/paddle/phi/kernels/elementwise_grad_kernel.h index a1b296e326f..bcd5a98f07e 100644 --- a/paddle/phi/kernels/elementwise_grad_kernel.h +++ b/paddle/phi/kernels/elementwise_grad_kernel.h @@ -64,4 +64,25 @@ void SubtractDoubleGradKernel(const Context& dev_ctx, int axis, DenseTensor* ddout); +template +void DivideGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy); + +template +void DivideDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dx, + paddle::optional ddx, + paddle::optional ddy, + int axis, + DenseTensor* dy, + DenseTensor* dout, + DenseTensor* ddout); } // namespace phi diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index aab31cfbd55..7634c246273 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -592,5 +592,25 @@ void ElementwiseCompute(const GPUContext &dev_ctx, #endif +template +void 
DefaultElementwiseOperator(const DeviceContext &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + DenseTensor *z, + int axis = -1) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + dev_ctx.template Alloc(z); + if (x_dims.size() >= y_dims.size()) { + funcs::ElementwiseCompute(dev_ctx, x, y, axis, Functor(), z); + } else { + funcs::ElementwiseCompute( + dev_ctx, x, y, axis, InverseFunctor(), z); + } +} + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h index c0a3985cd17..5615a450b5c 100644 --- a/paddle/phi/kernels/funcs/elementwise_functor.h +++ b/paddle/phi/kernels/funcs/elementwise_functor.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include "paddle/phi/common/complex.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/hostdevice.h" @@ -92,5 +93,72 @@ struct InverseDivideFunctor { inline HOSTDEVICE T operator()(const T a, const T b) const { return b / a; } }; +template +using ComplexType = phi::dtype::complex; + +template +struct DivGradXYFunctor { + inline HOSTDEVICE phi::Array operator()(const InT a, + const InT b, + const InT c) { + // dx = dout / y + // dy = - dout * out / y + phi::Array outs; + outs[0] = a / c; + outs[1] = -a * b / c; + return outs; + } +}; + +template +struct DivGradXYFunctor, ComplexType> { + inline HOSTDEVICE phi::Array, 2> operator()( + const ComplexType a, + const ComplexType b, + const ComplexType c) { + phi::Array, 2> outs; + ComplexType c_conj(c.real, -c.imag); + ComplexType out_div_c_conj((b / c).real, -(b / c).imag); + outs[0] = a / c_conj; + outs[1] = -a * out_div_c_conj; + return outs; + } +}; + +// Float div grad +template +struct DivGradXFunctor { + inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; } +}; + +// ComplexType div grad +template +struct DivGradXFunctor> { + inline HOSTDEVICE ComplexType operator()(const ComplexType a, + const ComplexType b) const { + ComplexType b_conj(b.real, -b.imag); + return a / b_conj; + } +}; + +// Float mul and div +template +struct DivGradYFunctor { + inline HOSTDEVICE T operator()(const T a, const T b, const T c) const { + return -a * b / c; + } +}; + +// ComplexType mul and div +template +struct DivGradYFunctor> { + inline HOSTDEVICE ComplexType operator()(const ComplexType a, + const ComplexType b, + const ComplexType c) const { + ComplexType out_div_c_conj((b / c).real, -(b / c).imag); + return -a * out_div_c_conj; + } +}; + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/elementwise_grad_base.h b/paddle/phi/kernels/funcs/elementwise_grad_base.h index dff0cfe5b8b..17bf8735873 100644 --- a/paddle/phi/kernels/funcs/elementwise_grad_base.h +++ b/paddle/phi/kernels/funcs/elementwise_grad_base.h @@ -24,6 +24,7 @@ limitations under the License. */ // See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/kernels/primitive/kernel_primitives.h" #endif @@ -1758,5 +1759,31 @@ void ElemwiseGradComputeWithBroadcast(const GPUContext &ctx, #endif +template +void ElemwiseGradCompute(const DeviceContext &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + int axis, + DenseTensor *dx, + DenseTensor *dy, + DX_OP dx_op, + DY_OP dy_op) { + const DDim &x_dim = x.dims(); + const DDim &y_dim = y.dims(); + if (x.dims() == y.dims()) { + ElemwiseGradComputeNoBroadcast( + dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + } else { + ElemwiseGradComputeWithBroadcast( + dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + } +} + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h index 20799f4e37b..b356f19555f 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad.h +++ b/paddle/phi/kernels/gpu/elementwise_grad.h @@ -14,12 +14,101 @@ limitations under the License. */ #pragma once +#include "paddle/phi/common/place.h" #include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_grad_base.h" #include "paddle/phi/kernels/funcs/reduce_function.h" namespace phi { +template +void ReduceWrapper(const GPUContext &dev_ctx, + int axis, + DenseTensor *src, + DenseTensor *dst) { + std::vector reduce_dims = + funcs::GetReduceDim(dst->dims(), src->dims(), axis); + funcs::TensorReduceImpl>( + dev_ctx, + *src, + dst, + kps::IdentityFunctor(), + reduce_dims, + dev_ctx.stream()); +} + +template +void GetGradXAndYOut(const GPUContext &dev_ctx, + const Place &place, + int axis, + std::vector ins, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy, + Functor func) { + DenseTensor tmp_dx; + DenseTensor tmp_dy; + dev_ctx.Alloc(dx); + dev_ctx.Alloc(dy); + std::vector outs; + if (dx->dims() == dout.dims() && dy->dims() == dout.dims()) { + outs = {dx, dy}; + } else if (dx->dims() != dout.dims() && dy->dims() == dout.dims()) { + tmp_dx.Resize(dout.dims()); + dev_ctx.Alloc(&tmp_dx); + outs = {&tmp_dx, dy}; + } else if (dx->dims() == dout.dims() && dy->dims() != dout.dims()) { + tmp_dy.Resize(dout.dims()); + dev_ctx.Alloc(&tmp_dy); + outs = {dx, &tmp_dy}; + } else if (dx->dims() != dout.dims() && dy->dims() != dout.dims()) { + tmp_dy.Resize(dout.dims()); + dev_ctx.Alloc(&tmp_dy); + tmp_dx.Resize(dout.dims()); + dev_ctx.Alloc(&tmp_dx); + outs = {&tmp_dx, &tmp_dy}; + } + + funcs::BroadcastKernel( + dev_ctx, ins, &outs, axis, func); + + if (dx->dims() != dout.dims() && dy->dims() == dout.dims()) { + ReduceWrapper(dev_ctx, axis, &tmp_dx, dx); + } else if (dx->dims() == dout.dims() && dy->dims() != dout.dims()) { + ReduceWrapper(dev_ctx, axis, &tmp_dy, dy); + } else if (dx->dims() != dout.dims() && dy->dims() != dout.dims()) { + ReduceWrapper(dev_ctx, axis, &tmp_dx, dx); + ReduceWrapper(dev_ctx, axis, &tmp_dy, dy); + } +} + +template +void GetGradXOrYOut(const GPUContext &dev_ctx, + const Place &place, + int axis, + std::vector ins, + const DenseTensor &dout, + DenseTensor *dxy, + Functor func) { + DenseTensor tmp_dxy; + dev_ctx.Alloc(dxy); + + std::vector outs; + if (dxy->dims() != dout.dims()) { + tmp_dxy.Resize(dout.dims()); + dev_ctx.Alloc(&tmp_dxy); + outs = {&tmp_dxy}; + } else { + outs = {dxy}; + } + + 
funcs::BroadcastKernel(dev_ctx, ins, &outs, axis, func); + if (dxy->dims() != dout.dims()) { + ReduceWrapper(dev_ctx, axis, &tmp_dxy, dxy); + } +} + /* ****************************** Add Grad @@ -243,4 +332,41 @@ void elementwise_sub_grad(const GPUContext &ctx, dx->mutable_data(ctx.GetPlace()), dy->mutable_data(ctx.GetPlace())); } +/* +****************************** + Div Grad +****************************** +*/ +template +void ElementwiseDivGrad(const GPUContext &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy, + int axis = -1) { + const auto place = dev_ctx.GetPlace(); + if (dx != nullptr && dy != nullptr) { + std::vector ins = {&dout, &out, &y}; + GetGradXAndYOut( + dev_ctx, + place, + axis, + ins, + dout, + dx, + dy, + funcs::DivGradXYFunctor()); + } else if (dx != nullptr && dy == nullptr) { + std::vector ins = {&dout, &y}; + GetGradXOrYOut( + dev_ctx, place, axis, ins, dout, dx, funcs::DivGradXFunctor()); + } else if (dy != nullptr && dx == nullptr) { + std::vector ins = {&dout, &out, &y}; + GetGradXOrYOut( + dev_ctx, place, axis, ins, dout, dy, funcs::DivGradYFunctor()); + } +} + } // namespace phi diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index d00888aee67..45c8b9a2163 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ -15,9 +15,11 @@ #include "paddle/phi/kernels/elementwise_grad_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/gpu/elementwise_grad.h" #include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" @@ -102,6 +104,38 @@ void SubtractDoubleGradKernel(const Context& dev_ctx, phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); } +template +void DivideGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + const auto place = dev_ctx.GetPlace(); + if (dx != nullptr && dy != nullptr) { + std::vector ins = {&dout, &out, &y}; + GetGradXAndYOut( + dev_ctx, + place, + axis, + ins, + dout, + dx, + dy, + funcs::DivGradXYFunctor()); + } else if (dx != nullptr && dy == nullptr) { + std::vector ins = {&dout, &y}; + GetGradXOrYOut( + dev_ctx, place, axis, ins, dout, dx, funcs::DivGradXFunctor()); + } else if (dy != nullptr && dx == nullptr) { + std::vector ins = {&dout, &out, &y}; + GetGradXOrYOut( + dev_ctx, place, axis, ins, dout, dy, funcs::DivGradYFunctor()); + } +} + } // namespace phi PD_REGISTER_KERNEL(add_grad, @@ -168,3 +202,29 @@ PD_REGISTER_KERNEL(subtract_double_grad, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(divide_grad, + GPU, + ALL_LAYOUT, + phi::DivideGradKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(divide_double_grad, + GPU, + ALL_LAYOUT, + phi::DivideDoubleGradKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16, + double, + int, + int64_t, + 
phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index ac7d6fd1a0e..e8831f90213 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -14,8 +14,11 @@ limitations under the License. */ #pragma once +#include "paddle/phi/common/complex.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" namespace phi { @@ -103,4 +106,157 @@ void SubtractDoubleGradImpl(const Context& dev_ctx, } } +/* +****************************** + Divide Grad +****************************** +*/ + +template +struct DivGradDX { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout / y; } +}; + +template +struct DivGradDX> { + HOSTDEVICE phi::dtype::complex operator()( + phi::dtype::complex x, + phi::dtype::complex y, + phi::dtype::complex out, + phi::dtype::complex dout) const { + phi::dtype::complex y_conj(y.real, -y.imag); + return dout / y_conj; + } +}; + +template +struct DivGradDY { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return -dout * out / y; + } +}; + +template +struct DivGradDY> { + HOSTDEVICE phi::dtype::complex operator()( + phi::dtype::complex x, + phi::dtype::complex y, + phi::dtype::complex out, + phi::dtype::complex dout) const { + phi::dtype::complex out_div_y_conj((out / y).real, -(out / y).imag); + return -dout * out_div_y_conj; + } +}; + +template +struct DivDoubleDY { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return y * out * dout - x * dout; + } +}; + +template +void DivideDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dx, + paddle::optional ddx, + paddle::optional ddy, + int axis, + DenseTensor* dy, + DenseTensor* dout, + DenseTensor* ddout) { + if (dy) { + dy->Resize(y.dims()); + dev_ctx.template Alloc(dy); + } + if (dout) { + dout->Resize(out.dims()); + dev_ctx.template Alloc(dout); + } + if (ddout) { + ddout->Resize(out.dims()); + dev_ctx.template Alloc(ddout); + } + // ddX_safe == null ? 0 : ddX + // ddY_safe == null ? 0 : ddY + DenseTensor ddX_safe, ddY_safe; + phi::funcs::GetDoubleGradSafeTensor( + dev_ctx, dx, ddx.get_ptr(), &ddX_safe); + phi::funcs::GetDoubleGradSafeTensor( + dev_ctx, y, ddy.get_ptr(), &ddY_safe); + + // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y + // dY = Out * dX * ddY / Y - dX * ddX / Y + // dOut = - dX * ddY + // To save memory, (1) dout can be used as 'tmp' tensor, (2) ddout can + // inplace ddx + DenseTensor tmp; + if (dout) { + tmp = *dout; + } else { + tmp.Resize(out.dims()); + dev_ctx.template Alloc(&tmp); + } + if (dy) { + // dX_div_Y = dX / Y; + DenseTensor dX_div_Y = tmp; + funcs::DefaultElementwiseOperator, + funcs::InverseDivideFunctor>( + dev_ctx, dx, y, &dX_div_Y, axis); + + // NOTE(dengkaipeng): in the following ElemwiseGradCompute, for the + // first output tensor is nullptr, the branch to calculate first + // output tensor will not be activated, DivGradDx function will not + // be called and can be ignored, the first branch has little effect + // on running speed. 
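For reference, the gradient identities that the in-code comments of DivideDoubleGradKernel state (restated here for readability, not new behavior) are, with Out = X / Y and the incoming first-order gradient dX = dOut / Y:

\[
\mathrm{ddOut} = \frac{\mathrm{ddX} - \mathrm{Out}\cdot\mathrm{ddY}}{Y}, \qquad
\mathrm{dY} = \frac{\mathrm{Out}\cdot\mathrm{dX}\cdot\mathrm{ddY} - \mathrm{dX}\cdot\mathrm{ddX}}{Y}, \qquad
\mathrm{dOut} = -\,\mathrm{dX}\cdot\mathrm{ddY}.
\]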
+ + // dY = Out * dX * ddY / Y - dX * ddX / Y + phi::funcs::ElemwiseGradCompute, DivDoubleDY>( + dev_ctx, + ddX_safe, + ddY_safe, + out, + dX_div_Y, + axis, + nullptr, + dy, + DivGradDX(), + DivDoubleDY()); + } + + if (ddout) { + // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y + funcs::DefaultElementwiseOperator, + funcs::InverseMultiplyFunctor>( + dev_ctx, out, ddY_safe, &tmp, axis); + funcs::DefaultElementwiseOperator, + funcs::InverseSubtractFunctor>( + dev_ctx, ddX_safe, tmp, &tmp, axis); + funcs::DefaultElementwiseOperator, + funcs::InverseDivideFunctor>( + dev_ctx, tmp, y, ddout, axis); + } + + if (dout) { + // dOut = - dX * ddY + funcs::DefaultElementwiseOperator, + funcs::InverseMultiplyFunctor>( + dev_ctx, dx, ddY_safe, dout, axis); + auto& place = *dev_ctx.eigen_device(); + auto dout_result = phi::EigenVector::Flatten(*dout); + dout_result.device(place) = static_cast(-1) * dout_result; + } +} + } // namespace phi diff --git a/paddle/phi/kernels/math_kernel.cc b/paddle/phi/kernels/math_kernel.cc index 8b17d8bd250..a5d3f51e544 100644 --- a/paddle/phi/kernels/math_kernel.cc +++ b/paddle/phi/kernels/math_kernel.cc @@ -208,6 +208,7 @@ PD_REGISTER_KERNEL(divide, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, complex64, complex128) {} PD_REGISTER_KERNEL(multiply, diff --git a/paddle/phi/ops/compat/elementwise_sig.cc b/paddle/phi/ops/compat/elementwise_sig.cc index 89846ea0563..d4a25866907 100644 --- a/paddle/phi/ops/compat/elementwise_sig.cc +++ b/paddle/phi/ops/compat/elementwise_sig.cc @@ -106,6 +106,22 @@ KernelSignature ElementwiseSubDoubleGradOpArgumentMapping( "subtract_double_grad", {"Y", "DDX", "DDY", "DOut"}, {"axis"}, {"DDOut"}); } +KernelSignature ElementwiseDivGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("divide_grad", + {"X", "Y", "Out", GradVarName("Out")}, + {"axis"}, + {GradVarName("X"), GradVarName("Y")}); +} + +KernelSignature ElementwiseDivDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("divide_double_grad", + {"Y", "Out", "DX", "DDX", "DDY"}, + {"axis"}, + {GradVarName("Y"), "DOut", "DDOut"}); +} + } // namespace phi PD_REGISTER_BASE_KERNEL_NAME(elementwise_add, add); @@ -117,6 +133,8 @@ PD_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad_grad, add_double_grad); PD_REGISTER_BASE_KERNEL_NAME(elementwise_add_triple_grad, add_triple_grad); PD_REGISTER_BASE_KERNEL_NAME(elementwise_sub_grad, subtract_grad); PD_REGISTER_BASE_KERNEL_NAME(elementwise_sub_grad_grad, subtract_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_div_grad, divide_grad); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_div_grad_grad, divide_double_grad); PD_REGISTER_ARG_MAPPING_FN(elementwise_add, phi::ElementwiseAddOpArgumentMapping); @@ -136,3 +154,7 @@ PD_REGISTER_ARG_MAPPING_FN(elementwise_sub_grad, phi::ElementwiseSubGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elementwise_sub_grad_grad, phi::ElementwiseSubDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elementwise_div_grad, + phi::ElementwiseDivGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elementwise_div_grad_grad, + phi::ElementwiseDivDoubleGradOpArgumentMapping); -- GitLab From 6fd96a0400e5e618795ad20f8e85a2e975ea4194 Mon Sep 17 00:00:00 2001 From: Wilber Date: Mon, 7 Mar 2022 15:41:27 +0800 Subject: [PATCH 086/261] Add mlir trt engine type. 
(#40197) * infrt add trt engine * update engine name --- .../backends/tensorrt/test_trt_engine.cc | 8 ++--- paddle/infrt/backends/tensorrt/trt_engine.cc | 26 ++++++++--------- paddle/infrt/backends/tensorrt/trt_engine.h | 11 +++++-- paddle/infrt/backends/tensorrt/trt_utils.h | 9 +++--- .../dialect/tensorrt/trt_dilaect_types.h | 29 +++++++++++++++++++ paddle/infrt/dialect/tensorrt/trt_op_base.td | 3 ++ paddle/infrt/dialect/tensorrt/trt_ops.cc | 25 ++++++++++++++++ paddle/infrt/dialect/tensorrt/trt_ops.h | 5 +++- 8 files changed, 91 insertions(+), 25 deletions(-) create mode 100644 paddle/infrt/dialect/tensorrt/trt_dilaect_types.h diff --git a/paddle/infrt/backends/tensorrt/test_trt_engine.cc b/paddle/infrt/backends/tensorrt/test_trt_engine.cc index 54b7bc3e8af..12cf14060e2 100644 --- a/paddle/infrt/backends/tensorrt/test_trt_engine.cc +++ b/paddle/infrt/backends/tensorrt/test_trt_engine.cc @@ -17,8 +17,8 @@ #include #include #include -#include "glog/logging.h" -#include "gtest/gtest.h" +#include +#include #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" @@ -86,7 +86,7 @@ TrtUniquePtr ConstructNetwork( inline float sigmoid(float x) { return 1.f / (1.f + exp(-1 * x)); } TEST(trt, run_static) { - TRTEngine static_trt_engine(0); + TrtEngine static_trt_engine(0); auto net = ConstructNetwork( static_trt_engine.GetTrtBuilder(), nvinfer1::Dims3{3, 28, 28}, true); BuildOptions static_build_options; @@ -164,7 +164,7 @@ TEST(trt, run_static) { } TEST(trt, run_dynamic) { - TRTEngine engine(0); + TrtEngine engine(0); auto net = ConstructNetwork( engine.GetTrtBuilder(), nvinfer1::Dims4{-1, 3, -1, -1}, false); BuildOptions build_options; diff --git a/paddle/infrt/backends/tensorrt/trt_engine.cc b/paddle/infrt/backends/tensorrt/trt_engine.cc index a204fe42b45..232653e8c41 100644 --- a/paddle/infrt/backends/tensorrt/trt_engine.cc +++ b/paddle/infrt/backends/tensorrt/trt_engine.cc @@ -17,7 +17,7 @@ #include #include -#include "glog/logging.h" +#include #include "paddle/phi/backends/dynload/tensorrt.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/core/ddim.h" @@ -40,26 +40,26 @@ static nvinfer1::IRuntime* createInferRuntime( phi::dynload::createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION)); } -TRTEngine::TRTEngine(int device_id) : device_id_(device_id) { +TrtEngine::TrtEngine(int device_id) : device_id_(device_id) { FreshDeviceId(); logger_.reset(new TrtLogger()); builder_.reset(createInferBuilder(logger_->GetTrtLogger())); phi::dynload::initLibNvInferPlugins(&logger_->GetTrtLogger(), ""); } -nvinfer1::IBuilder* TRTEngine::GetTrtBuilder() { +nvinfer1::IBuilder* TrtEngine::GetTrtBuilder() { CHECK_NOTNULL(builder_); return builder_.get(); } -void TRTEngine::Build(TrtUniquePtr network, +void TrtEngine::Build(TrtUniquePtr network, const BuildOptions& build_options) { FreshDeviceId(); ModelToBuildEnv(std::move(network), build_options); CHECK_NOTNULL(engine_); } -bool TRTEngine::ModelToBuildEnv( +bool TrtEngine::ModelToBuildEnv( TrtUniquePtr network, const BuildOptions& build) { CHECK_NOTNULL(builder_); @@ -70,7 +70,7 @@ bool TRTEngine::ModelToBuildEnv( return true; } -bool TRTEngine::NetworkToEngine(const BuildOptions& build) { +bool TrtEngine::NetworkToEngine(const BuildOptions& build) { TrtUniquePtr config{builder_->createBuilderConfig()}; CHECK_NOTNULL(config); CHECK(SetupNetworkAndConfig(build, *network_, *config)); @@ -91,7 +91,7 @@ bool 
TRTEngine::NetworkToEngine(const BuildOptions& build) { return true; } -bool TRTEngine::SetupNetworkAndConfig(const BuildOptions& build, +bool TrtEngine::SetupNetworkAndConfig(const BuildOptions& build, INetworkDefinition& network, IBuilderConfig& config) { builder_->setMaxBatchSize(build.max_batch); @@ -235,7 +235,7 @@ bool TRTEngine::SetupNetworkAndConfig(const BuildOptions& build, return true; } -bool TRTEngine::SetUpInference( +bool TrtEngine::SetUpInference( const InferenceOptions& inference, const std::unordered_map& inputs, std::unordered_map* outputs) { @@ -261,7 +261,7 @@ bool TRTEngine::SetUpInference( return true; } -void TRTEngine::Run(const phi::GPUContext& ctx) { +void TrtEngine::Run(const phi::GPUContext& ctx) { if (is_dynamic_shape_) { DynamicRun(ctx); } else { @@ -269,7 +269,7 @@ void TRTEngine::Run(const phi::GPUContext& ctx) { } } -void TRTEngine::StaticRun(const phi::GPUContext& ctx) { +void TrtEngine::StaticRun(const phi::GPUContext& ctx) { const int num_bindings = engine_->getNbBindings(); std::vector buffers(num_bindings, nullptr); @@ -303,7 +303,7 @@ void TRTEngine::StaticRun(const phi::GPUContext& ctx) { runtime_batch, buffers.data(), ctx.stream(), nullptr); } -void TRTEngine::DynamicRun(const phi::GPUContext& ctx) { +void TrtEngine::DynamicRun(const phi::GPUContext& ctx) { const int num_bindings = engine_->getNbBindings(); std::vector buffers(num_bindings, nullptr); @@ -339,14 +339,14 @@ void TRTEngine::DynamicRun(const phi::GPUContext& ctx) { contexts_.front()->enqueueV2(buffers.data(), ctx.stream(), nullptr); } -void TRTEngine::FreshDeviceId() { +void TrtEngine::FreshDeviceId() { int count; cudaGetDeviceCount(&count); CHECK_LT(device_id_, count); phi::backends::gpu::SetDeviceId(device_id_); } -void TRTEngine::GetEngineInfo() { +void TrtEngine::GetEngineInfo() { #if IS_TRT_VERSION_GE(8200) LOG(INFO) << "====== engine info ======"; std::unique_ptr infer_inspector( diff --git a/paddle/infrt/backends/tensorrt/trt_engine.h b/paddle/infrt/backends/tensorrt/trt_engine.h index f72bdaf3ac0..3c8243e3c38 100644 --- a/paddle/infrt/backends/tensorrt/trt_engine.h +++ b/paddle/infrt/backends/tensorrt/trt_engine.h @@ -56,13 +56,18 @@ using namespace nvinfer1; // NOLINT // // We have encapsulated this logic, please use the following programming model. // -// TRTEngine trt_engine; +// TrtEngine trt_engine; // trt_engine.Build(...); // trt_engine.SetUpInference(...); // trt_engine.Run(...); -class TRTEngine { +class TrtEngine { public: - explicit TRTEngine(int device_id); + explicit TrtEngine(int device_id = 0); + + TrtEngine(const TrtEngine&) = delete; + TrtEngine& operator=(const TrtEngine&) = delete; + TrtEngine(TrtEngine&&) = default; + TrtEngine& operator=(TrtEngine&&) = default; nvinfer1::IBuilder* GetTrtBuilder(); diff --git a/paddle/infrt/backends/tensorrt/trt_utils.h b/paddle/infrt/backends/tensorrt/trt_utils.h index 4b129af1d53..c66a850ffb1 100644 --- a/paddle/infrt/backends/tensorrt/trt_utils.h +++ b/paddle/infrt/backends/tensorrt/trt_utils.h @@ -15,16 +15,17 @@ #pragma once +#include +#include +#include +#include + #include #include #include #include #include -#include -#include -#include -#include "glog/logging.h" #include "paddle/phi/core/dense_tensor.h" namespace infrt { diff --git a/paddle/infrt/dialect/tensorrt/trt_dilaect_types.h b/paddle/infrt/dialect/tensorrt/trt_dilaect_types.h new file mode 100644 index 00000000000..efcf7dd5be1 --- /dev/null +++ b/paddle/infrt/dialect/tensorrt/trt_dilaect_types.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. 
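To make the Build / SetUpInference / Run programming model from trt_engine.h concrete, here is a minimal sketch. BuildMyNetwork and RunWithTrtEngine are hypothetical names, the namespace qualifiers and the tensor-map value type are assumptions (the diff elides template arguments), so treat this as an outline rather than the exact API.

#include <string>
#include <unordered_map>
// Using-declarations for the infrt TensorRT backend namespace are omitted and
// assumed to bring TrtEngine, BuildOptions and InferenceOptions into scope.
#include "paddle/infrt/backends/tensorrt/trt_engine.h"

void RunWithTrtEngine(const phi::GPUContext& ctx,
                      const std::unordered_map<std::string, phi::DenseTensor*>& inputs) {
  TrtEngine engine(/*device_id=*/0);
  // BuildMyNetwork is a hypothetical helper that fills an INetworkDefinition
  // through the builder, analogous to ConstructNetwork in test_trt_engine.cc.
  auto network = BuildMyNetwork(engine.GetTrtBuilder());
  BuildOptions build_options;
  engine.Build(std::move(network), build_options);

  InferenceOptions inference_options;  // batch/shape settings chosen by the caller
  std::unordered_map<std::string, phi::DenseTensor*> outputs;  // value type assumed
  engine.SetUpInference(inference_options, inputs, &outputs);
  engine.Run(ctx);  // dispatches to StaticRun or DynamicRun depending on shape mode
}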
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "mlir/IR/Types.h" + +namespace infrt { +namespace trt { + +class EngineType + : public mlir::Type::TypeBase { + public: + using Base::Base; +}; + +} // namespace trt +} // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/trt_op_base.td b/paddle/infrt/dialect/tensorrt/trt_op_base.td index 5722f17d597..128960ee03e 100755 --- a/paddle/infrt/dialect/tensorrt/trt_op_base.td +++ b/paddle/infrt/dialect/tensorrt/trt_op_base.td @@ -27,6 +27,9 @@ class TRT_PaddleAttr : Attr()">, "PaddlePaddle " # description # " attribute">; +def TRT_EngineType : + Type()">, "!trt.engine">, + BuildableType<"getType<::infrt::trt::EngineType>()">; //===----------------------------------------------------------------------===// // PaddlePaddle type definitions diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.cc b/paddle/infrt/dialect/tensorrt/trt_ops.cc index 35b7967892c..f179939e232 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.cc +++ b/paddle/infrt/dialect/tensorrt/trt_ops.cc @@ -13,23 +13,48 @@ // limitations under the License. #include "paddle/infrt/dialect/tensorrt/trt_ops.h" +#include #include #include #include #include #include +#include "paddle/infrt/dialect/tensorrt/trt_dilaect_types.h" namespace infrt { namespace trt { TensorRTDialect::TensorRTDialect(mlir::MLIRContext *context) : mlir::Dialect("trt", context, mlir::TypeID::get()) { + addTypes(); addOperations< #define GET_OP_LIST #include "paddle/infrt/dialect/tensorrt/trt_ops.cpp.inc" // NOLINT >(); } +mlir::Type TensorRTDialect::parseType(mlir::DialectAsmParser &parser) const { + llvm::StringRef keyword; + if (parser.parseKeyword(&keyword)) return mlir::Type(); + // parse trt dilaect types, for example: !trt.engine + if (keyword == "engine") { + return infrt::trt::EngineType::get(getContext()); + } + parser.emitError(parser.getCurrentLocation(), "unknown infrt::trt type: ") + << keyword; + return mlir::Type(); +} + +void TensorRTDialect::printType(mlir::Type type, + mlir::DialectAsmPrinter &printer) const { + // print trt dilaect types, for example: !trt.engien + if (type.isa()) { + printer << "engine"; + return; + } + llvm_unreachable("unknown infrt::trt type."); +} + } // namespace trt } // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.h b/paddle/infrt/dialect/tensorrt/trt_ops.h index 95b2ed41fdf..978b9906e5f 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.h +++ b/paddle/infrt/dialect/tensorrt/trt_ops.h @@ -35,8 +35,11 @@ namespace trt { class TensorRTDialect : public mlir::Dialect { public: - explicit TensorRTDialect(mlir::MLIRContext* context); + explicit TensorRTDialect(mlir::MLIRContext *context); static llvm::StringRef getDialectNamespace() { return "trt"; } + mlir::Type parseType(mlir::DialectAsmParser &parser) const; // NOLINT + void printType(mlir::Type type, + mlir::DialectAsmPrinter &printer) const; // NOLINT }; } // namespace trt -- GitLab From 7296433504eae988cadf198bd9e4aaccde73d8aa Mon Sep 17 
00:00:00 2001 From: WJJ1995 Date: Mon, 7 Mar 2022 16:33:59 +0800 Subject: [PATCH 087/261] [phi] move is_empty to phi (#39919) * Add is_empty * fixed for CI * fixed code style * resolve conflict * deal with comments * replace pt by pd --- paddle/fluid/operators/is_empty_op.cc | 20 ++++----- paddle/fluid/operators/is_empty_op.cu.cc | 23 ---------- paddle/phi/infermeta/unary.cc | 6 +++ paddle/phi/infermeta/unary.h | 2 + paddle/phi/kernels/is_empty_kernel.cc | 53 ++++++++++++++++++++++++ paddle/phi/kernels/is_empty_kernel.h | 24 +++++++++++ 6 files changed, 92 insertions(+), 36 deletions(-) delete mode 100644 paddle/fluid/operators/is_empty_op.cu.cc create mode 100644 paddle/phi/kernels/is_empty_kernel.cc create mode 100644 paddle/phi/kernels/is_empty_kernel.h diff --git a/paddle/fluid/operators/is_empty_op.cc b/paddle/fluid/operators/is_empty_op.cc index 2750367dc77..c835bb3cf60 100644 --- a/paddle/fluid/operators/is_empty_op.cc +++ b/paddle/fluid/operators/is_empty_op.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/is_empty_op.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -24,12 +26,6 @@ class IsEmptyOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "IsEmpty"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "IsEmpty"); - ctx->SetOutputDim("Out", {1}); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto *x = ctx.Input("X"); @@ -56,12 +52,10 @@ It will just return product(tensor.ddims()) > 0; } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(is_empty, IsEmptyInferShapeFunctor, + PD_INFER_META(phi::IsEmptyInferMeta)); REGISTER_OPERATOR( is_empty, ops::IsEmptyOp, ops::IsEmptyOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - is_empty, ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel); + paddle::framework::EmptyGradOpMaker, + IsEmptyInferShapeFunctor); diff --git a/paddle/fluid/operators/is_empty_op.cu.cc b/paddle/fluid/operators/is_empty_op.cu.cc deleted file mode 100644 index 3c256503baf..00000000000 --- a/paddle/fluid/operators/is_empty_op.cu.cc +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/is_empty_op.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - is_empty, ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel); diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 85db1547f16..b9eb5196b1e 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/enforce.h" @@ -307,6 +308,11 @@ void InferMetaFromVecValue(const MetaTensor& x, } } +void IsEmptyInferMeta(const MetaTensor& x, MetaTensor* out) { + out->set_dims(phi::make_ddim({1})); + out->set_dtype(DataType::BOOL); +} + void MultinomialInferMeta(const MetaTensor& x, int num_samples, bool replacement, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index d4e21fbd824..37b17f6e3d1 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -69,6 +69,8 @@ void InferMetaFromVecValue(const MetaTensor& x, const std::vector& shape, MetaTensor* out); +void IsEmptyInferMeta(const MetaTensor& x, MetaTensor* out); + void MultinomialInferMeta(const MetaTensor& x, int num_samples, bool replacement, diff --git a/paddle/phi/kernels/is_empty_kernel.cc b/paddle/phi/kernels/is_empty_kernel.cc new file mode 100644 index 00000000000..26c2f978005 --- /dev/null +++ b/paddle/phi/kernels/is_empty_kernel.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/is_empty_kernel.h" + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void IsEmptyKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + // Note: is_empty is always executed on CPU and the output data should + // always be allocated for CPUPlace. We reigister CUDA kernel for this op to + // avoid the unnecessary data transform. + bool* out_data = dev_ctx.template HostAlloc(out); + out_data[0] = phi::product(x.dims()) == 0; +} + +} // namespace phi + +PD_REGISTER_KERNEL(is_empty, + CPU, + ALL_LAYOUT, + phi::IsEmptyKernel, + float, + double, + int, + int64_t) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL(is_empty, + GPU, + ALL_LAYOUT, + phi::IsEmptyKernel, + float, + double, + int, + int64_t) {} +#endif diff --git a/paddle/phi/kernels/is_empty_kernel.h b/paddle/phi/kernels/is_empty_kernel.h new file mode 100644 index 00000000000..3bcf6f9054e --- /dev/null +++ b/paddle/phi/kernels/is_empty_kernel.h @@ -0,0 +1,24 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
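A small usage sketch of the new phi kernel (illustrative only; the template parameter order <T, Context> follows the usual phi convention and is assumed here, since the flattened diff drops template argument lists, and TensorIsEmpty is just a wrapper name for the example):

#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/is_empty_kernel.h"

// Returns true iff x has no elements, i.e. product(x.dims()) == 0.
bool TensorIsEmpty(const phi::CPUContext& dev_ctx, const phi::DenseTensor& x) {
  phi::DenseTensor out;
  out.Resize(phi::make_ddim({1}));  // matches what IsEmptyInferMeta sets up
  phi::IsEmptyKernel<float, phi::CPUContext>(dev_ctx, x, &out);
  return out.data<bool>()[0];       // the kernel always writes the flag on host
}

In normal use the kernel is reached through the is_empty operator together with the registered IsEmptyInferMeta rather than being called directly.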
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void IsEmptyKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out); + +} // namespace phi -- GitLab From 2a3d9eca64b0312a6bf49ffe6f470a084886bbe4 Mon Sep 17 00:00:00 2001 From: Ming-Xu Huang Date: Mon, 7 Mar 2022 16:38:21 +0800 Subject: [PATCH 088/261] cuBlasLt Epilogue To Fuse Linear + ReLU|GeLU (#39437) * Added cuBlasLtHandle_t to device context. * Added fused_gemm_epilogue op. 1. Added fused_gemm_epilogue op to leverage cuBlastLt Epilogue. 2. Support fusion Act(X*Y + bias), X'dims >=2 and Y'dims shoule be 2. 2. Act currently only be supported ReLU. (Will add GeLU in the future). * Added UT to fused_gemm_epilogue op. * Added LinearAct Pattern 1. Added LinearAct into graph_pattern_detector.* to define (2.)'s pattern. 2. LinearAct is used to detect act(element_add(matmul_v2(x, w), bias)). 3. act currently only support ReLU (Will support GeLU in the future). * Added FuseGemmEpiloguePass 1, Added FuseGemmEpiloguePass to handle nn.Linear + Act{ReLU} fusion (GeLU will be supported in the future). 2. Only support matmul_v2 from nn.Linear. * Added pybind to BuildStrageter.fuse_gemm_epilogue_. * Added UT for fuse_gemm_epilogue_pass. * GeLU support and EpilogueSingleton 1. Added GeLU support to fused_gemm_epilogue op. 2. Added EpilogueSingleton to cache auxiliary pointer. 3. Added related UTs. * Rename cublaslt_epilogue_opto gemm_epilogue_op.*. * Added both train and infer pattern to LinearAct. 1. Added support of fwd graph with grap_ops linking to LinearAct. 2. Added related changes to fuse_gemm_epilogue_pass for above modification. * Changed CUDA requirement from 11.4 to 11.6 for fuse_gemm_epilogue_pass. * Added identity activation support to gemm_epilogue_op. * Added Linear Fusion (matmul_v2 + ele_add) 1. Added matmul_v2 + ele_add pattern to LinearActPattern. 2. Added matmul_v2 + ele_add support to fuse_gemm_epilogue_pass. * Rename gemm_epilogue_op.* to fused_gemm_epilogue_op.* * Add fused_gemm_epilogue_grad op. 1. Added fused_gemm_epilogue_grad to support backward epilogue fusion. * Add UTs to fused_gemm_epilogue_grad_op. * Change attribute name in fused_gemm_epilogue_grad_op for clearing. * Allow DX and DBias be dispensable to fused_gemm_epilogue_grad op. * Added ElementwiseAdd+Matmul+Act graph pattern detection. * Fuse backward of Linear( Act(x)) 1. Added backward fusion pass to Linear( Act(x)). 2. Added backward fusion pass to Linear(x). * Added UTs to backward fusion of Linear(Act(x)). * Complete document of arguments to fused_gemm_epilogue_op. * Made arguments of some functions pass by reference. * Modify code with review comments. 1. Made arguments of some function pass by reference. 2. Removed redundant code. 3. Followed Google code style to change code. * Made 'const' code style be consistent * Fixed random seed of python UTs. * Set Compiling constrains to cuBlasLt 1. Require CUDA 11.6+ 2. Remove fuse_gemm_epilogue related tests when CUDA < 11.6. * Code Reivew from Paddle 1. Changed arguments name is_first_gemm to without_x_gradient for clearing. 2. 
Applied PADDLE_THROW in fused_gemm_epilogue_op. * Remove EpilogueSingleton 1. Applied ReserveSpace to replace Epilogue for passing auxiliary pointers between FWD and BWD. * Fix a logical error and enhance UTs. 1. Added act op count checking in UTs. 2. Fix issue to fuse backward or ReLU(Linear(X)). 3. TODO: solve GELU fusion issues. * Fix Linear and GeLU fusion issues. 1. Modified graph_detech_pattern to fit with both linear wiht gelu or relu. 2. Modified data range in Uts to allow negative values. * Removed fused_gemm_epilogue_op.h. * Rename namespace pten to phi. * Rename name of arguments in fused_gemm_epilogue_op 1. bias -> Bias. 2. out -> Out. 3. reserve_space -> ReserveSpace. * Change EpiloguePassActivationCache as local variable. 1. Removed singleton in EpiloguePassActivationCache. 2. Made EpiloguePassActivationCache as an argument to each pass functions. --- cmake/operators.cmake | 10 +- paddle/fluid/framework/details/CMakeLists.txt | 2 +- .../fluid/framework/details/build_strategy.cc | 9 + .../fluid/framework/details/build_strategy.h | 3 + paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../framework/ir/fuse_gemm_epilogue_pass.cc | 471 ++++++++++++++++++ .../framework/ir/fuse_gemm_epilogue_pass.h | 100 ++++ .../framework/ir/graph_pattern_detector.cc | 178 ++++++- .../framework/ir/graph_pattern_detector.h | 59 +++ paddle/fluid/operators/fused/CMakeLists.txt | 7 +- .../operators/fused/fused_gemm_epilogue_op.cc | 353 +++++++++++++ .../operators/fused/fused_gemm_epilogue_op.cu | 376 ++++++++++++++ .../platform/device/gpu/cuda/cuda_helper.h | 24 + paddle/fluid/platform/device/gpu/gpu_types.h | 6 + paddle/fluid/platform/device_context.cc | 22 + paddle/fluid/platform/device_context.h | 31 ++ paddle/fluid/pybind/pybind.cc | 26 + paddle/phi/backends/gpu/forwards.h | 4 + paddle/phi/backends/gpu/gpu_context.cc | 32 ++ paddle/phi/backends/gpu/gpu_context.h | 6 + paddle/phi/backends/gpu/gpu_decls.h | 5 + .../fluid/tests/unittests/CMakeLists.txt | 11 + .../unittests/test_fuse_gemm_epilogue_pass.py | 392 +++++++++++++++ .../test_fused_gemm_epilogue_grad_op.py | 239 +++++++++ .../unittests/test_fused_gemm_epilogue_op.py | 450 +++++++++++++++++ tools/static_mode_white_list.py | 3 + 26 files changed, 2788 insertions(+), 32 deletions(-) create mode 100644 paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc create mode 100644 paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h create mode 100644 paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc create mode 100644 paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu create mode 100644 python/paddle/fluid/tests/unittests/test_fuse_gemm_epilogue_pass.py create mode 100644 python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_grad_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_op.py diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 7affd59de16..9e8c81c2985 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -293,11 +293,11 @@ function(op_library TARGET) # Define operators that don't need pybind here. foreach(manual_pybind_op "compare_all_op" "compare_op" "logical_op" "bitwise_op" "nccl_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op") - - if ("${TARGET}" STREQUAL "${manual_pybind_op}") - set(pybind_flag 1) - endif() - endforeach() + + if ("${TARGET}" STREQUAL "${manual_pybind_op}") + set(pybind_flag 1) + endif() + endforeach() # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h. 
# Note that it's enough to just adding one operator to pybind in a *_op.cc file. diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 66dfb81755f..948eaab40b4 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -139,7 +139,7 @@ set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass sync_batch_norm_pass runtime_context_cache_pass graph_to_program_pass - fix_op_run_order_pass) + fix_op_run_order_pass fuse_gemm_epilogue_pass) if (WITH_CINN) set(IR_PASS_DEPS ${IR_PASS_DEPS} build_cinn_pass) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index c99200ec98a..fdf74d2f769 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -1,4 +1,5 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -175,6 +176,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { !defined(_WIN32) && !defined(__APPLE__) AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass"); #endif + +#if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060) + AppendPassWithCheck(strategy_.fuse_gemm_epilogue_, + "fuse_gemm_epilogue_pass"); +#endif AppendPassWithCheck(strategy_.fuse_elewise_add_act_ops_, "fuse_elewise_add_act_pass"); // for single card training, fuse_all_reduce_ops is unnecessary. @@ -507,3 +513,6 @@ USE_PASS(mkldnn_placement_pass); !defined(_WIN32) && !defined(__APPLE__) USE_PASS(fusion_group_pass); #endif +#if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060) +USE_PASS(fuse_gemm_epilogue_pass); +#endif diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 70a083dd70b..5eb584aaefa 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -1,4 +1,5 @@ // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -124,6 +125,8 @@ struct BuildStrategy { paddle::optional fuse_broadcast_ops_{paddle::none}; // replace batch_norm with sync_batch_norm. bool sync_batch_norm_{false}; + // Fuse GEMM+Epilogue via cublasLt epilogue. + bool fuse_gemm_epilogue_{false}; // mkldnn_enabled_op_types specify the operator type list to // use MKLDNN acceleration. 
It is null in default, means diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 0d53a54ff82..a1f2d6edca6 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -157,6 +157,7 @@ endif() cc_library(fuse_bn_act_pass SRCS fuse_bn_act_pass.cc DEPS pass graph_pattern_detector ) cc_library(fuse_bn_add_act_pass SRCS fuse_bn_add_act_pass.cc DEPS pass graph_pattern_detector ) cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector ) +cc_library(fuse_gemm_epilogue_pass SRCS fuse_gemm_epilogue_pass.cc DEPS pass graph_pattern_detector ) cc_library(fuse_relu_depthwise_conv_pass SRCS fuse_relu_depthwise_conv_pass.cc DEPS pass graph_pattern_detector ) set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library") diff --git a/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc new file mode 100644 index 00000000000..f48224cbdc2 --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc @@ -0,0 +1,471 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
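Before the pass implementation itself, a short sketch of how the new fusion is switched on (illustrative; the BuildStrategy namespace is assumed from its header path, and the Python-side flag exposed through pybind is build_strategy.fuse_gemm_epilogue per the commit message). Conceptually, the forward rewrite replaces the matmul_v2 -> elementwise_add (-> relu/gelu) chain emitted by paddle.nn.Linear with a single fused_gemm_epilogue op that computes Out = Act(X * Y + Bias) through cuBlasLt, and it is only available in CUDA 11.6+ builds.

#include "paddle/fluid/framework/details/build_strategy.h"

// Sketch: requesting the fusion when configuring the executor build strategy.
paddle::framework::details::BuildStrategy MakeStrategyWithGemmEpilogue() {
  paddle::framework::details::BuildStrategy strategy;
  // Causes AppendPassWithCheck to add fuse_gemm_epilogue_pass; the pass is
  // compiled and registered only when CUDA_VERSION >= 11060.
  strategy.fuse_gemm_epilogue_ = true;
  return strategy;
}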
+ +#include "paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h" +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +void FuseGemmEpiloguePass::ApplyImpl(ir::Graph *graph) const { + EpiloguePassActivationCache cache; + + graph = FuseLinearActFwd(graph, {"relu", "gelu"}, false, false, &cache); + graph = FuseLinearActFwd(graph, {"relu"}, true, true, &cache); + graph = FuseLinearActFwd(graph, {"gelu"}, true, false, &cache); + graph = FuseLinearFwd(graph, false); + graph = FuseLinearFwd(graph, true); + graph = FuseLinearActBwd(graph, {"relu_grad"}, true, &cache); + graph = FuseLinearActBwd(graph, {"gelu_grad"}, false, &cache); + graph = FuseLinearBwd(graph, false); + graph = FuseLinearBwd(graph, true); +} + +ir::Graph *FuseGemmEpiloguePass::FuseLinearFwd(ir::Graph *graph, + bool is_training) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + const std::string scope_name("gemm_epilogue"); + FusePassBase::Init(scope_name, graph); + + GraphPatternDetector gpd; + auto *x = gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(scope_name, "x")) + ->AsInput() + ->assert_is_op_input("matmul_v2", "X"); + patterns::LinearAct linear_act_pattern(gpd.mutable_pattern(), "linear_act"); + + linear_act_pattern(x, {}, is_training, false); + + int found_linear_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle LinearAct fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_w, matmul_w, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_add_op, ele_add, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_bias, ele_bias, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out, linear_act_pattern); + + std::vector matmul_x_shape = subgraph.at(x)->Var()->GetShape(); + std::vector matmul_w_shape = matmul_w->Var()->GetShape(); + + // Note (Ming Huang): We only support matmul_v2 from paddle.nn.Linear + // currently. 
The conditions below are used to verify wether matmul_v2 + // is created by paddle.nn.Linear + auto matmul_op_desc = matmul_op->Op(); + if (!IsGemmFromLinear_(matmul_x_shape, matmul_w_shape, matmul_op_desc)) + return; + + OpDesc fused_gemm_epilogue_op_desc(matmul_op->Op()->Block()); + std::string activation = "none"; + fused_gemm_epilogue_op_desc.SetType("fused_gemm_epilogue"); + fused_gemm_epilogue_op_desc.SetInput("X", {subgraph.at(x)->Name()}); + fused_gemm_epilogue_op_desc.SetInput("Y", {matmul_w->Name()}); + fused_gemm_epilogue_op_desc.SetInput("Bias", {ele_bias->Name()}); + fused_gemm_epilogue_op_desc.SetOutput("Out", {ele_out->Name()}); + fused_gemm_epilogue_op_desc.SetAttr("activation", activation); + fused_gemm_epilogue_op_desc.SetAttr("op_role", + matmul_op_desc->GetAttr("op_role")); + auto gemm_epilogue_node = g->CreateOpNode(&fused_gemm_epilogue_op_desc); + + IR_NODE_LINK_TO(subgraph.at(x), gemm_epilogue_node); + IR_NODE_LINK_TO(matmul_w, gemm_epilogue_node); + IR_NODE_LINK_TO(ele_bias, gemm_epilogue_node); + IR_NODE_LINK_TO(gemm_epilogue_node, ele_out); + + GraphSafeRemoveNodes(g, {matmul_op, matmul_out, ele_add_op}); + + VLOG(4) << "\n\t " << subgraph.at(x)->Name() << " and " << matmul_w->Name() + << " -> " << matmul_op->Name() << " -> " << matmul_out->Name() + << "\n\t " << matmul_out->Name() << " and " << ele_bias->Name() + << " -> " << ele_add_op->Name() << " -> " << ele_out->Name() + << "\n\t " << ele_out->Name(); + found_linear_count++; + }; + + gpd(graph, handler); + + AddStatis(found_linear_count); + return graph; +} + +ir::Graph *FuseGemmEpiloguePass::FuseLinearActFwd( + ir::Graph *graph, const std::unordered_set &act_types, + bool is_training, bool is_act_grad_x_from_act, + EpiloguePassActivationCache *cache) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + + const std::string scope_name("gemm_epilogue"); + FusePassBase::Init(scope_name, graph); + + GraphPatternDetector gpd; + auto *x = gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(scope_name, "x")) + ->AsInput() + ->assert_is_op_input("matmul_v2", "X"); + patterns::LinearAct linear_act_pattern(gpd.mutable_pattern(), "linear_act"); + + linear_act_pattern(x, act_types, is_training, is_act_grad_x_from_act); + + int found_linear_act_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle LinearAct fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_w, matmul_w, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_add_op, ele_add, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_bias, ele_bias, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_op, act, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, linear_act_pattern); + + std::vector matmul_x_shape = subgraph.at(x)->Var()->GetShape(); + std::vector matmul_w_shape = matmul_w->Var()->GetShape(); + + // Note (Ming Huang): We only support matmul_v2 from paddle.nn.Linear + // currently. 
The conditions below are used to verify wether matmul_v2 + // is created by paddle.nn.Linear + auto matmul_op_desc = matmul_op->Op(); + if (!IsGemmFromLinear_(matmul_x_shape, matmul_w_shape, matmul_op_desc)) + return; + + auto activation = act_op->Op()->Type(); + + OpDesc fused_gemm_epilogue_op_desc(matmul_op->Op()->Block()); + fused_gemm_epilogue_op_desc.SetType("fused_gemm_epilogue"); + fused_gemm_epilogue_op_desc.SetInput("X", {subgraph.at(x)->Name()}); + fused_gemm_epilogue_op_desc.SetInput("Y", {matmul_w->Name()}); + fused_gemm_epilogue_op_desc.SetInput("Bias", {ele_bias->Name()}); + fused_gemm_epilogue_op_desc.SetOutput("Out", {act_out->Name()}); + fused_gemm_epilogue_op_desc.SetAttr("activation", activation); + fused_gemm_epilogue_op_desc.SetAttr("op_role", + matmul_op_desc->GetAttr("op_role")); + + auto gemm_epilogue_node = g->CreateOpNode(&fused_gemm_epilogue_op_desc); + + IR_NODE_LINK_TO(subgraph.at(x), gemm_epilogue_node); + IR_NODE_LINK_TO(matmul_w, gemm_epilogue_node); + IR_NODE_LINK_TO(ele_bias, gemm_epilogue_node); + IR_NODE_LINK_TO(gemm_epilogue_node, act_out); + + // Only need to check weight.shape[1] for auxiliary pointer + // and mark it the act op is fused for backward epilogue fusion. + // That because cuBlasLt epilogue's restriction. + if (is_training) { + int divisor_of_n = activation == "relu" ? 128 : 8; + if (matmul_w_shape[1] % divisor_of_n) return; + + VarDesc reserve_space(patterns::PDNodeName(scope_name, "ReserveSpace")); + auto *reserve_space_node = g->CreateVarNode(&reserve_space); + + cache->InsertFusedActivation( + GetReserveSpaceCacheKey(act_out->Var()->Name(), g->GetBlockId()), + reserve_space_node); + + gemm_epilogue_node->Op()->SetOutput("ReserveSpace", + {reserve_space_node->Name()}); + + if (!is_act_grad_x_from_act) { + GET_IR_NODE_FROM_SUBGRAPH(act_grad_op, act_grad, linear_act_pattern); + act_grad_op->Op()->RenameInput(ele_out->Name(), + reserve_space_node->Name()); + IR_NODE_LINK_TO(reserve_space_node, act_grad_op); + } + IR_NODE_LINK_TO(gemm_epilogue_node, reserve_space_node); + } + + GraphSafeRemoveNodes(g, + {matmul_op, matmul_out, ele_add_op, ele_out, act_op}); + + VLOG(4) << "\n\t " << subgraph.at(x)->Name() << " and " << matmul_w->Name() + << " -> " << matmul_op->Name() << " -> " << matmul_out->Name() + << "\n\t " << matmul_out->Name() << " and " << ele_bias->Name() + << " -> " << ele_add_op->Name() << " -> " << ele_out->Name() + << "\n\t " << ele_out->Name() << " -> " << act_op->Name() << " -> " + << act_out->Name(); + found_linear_act_count++; + }; + + gpd(graph, handler); + + AddStatis(found_linear_act_count); + return graph; +} + +ir::Graph *FuseGemmEpiloguePass::FuseLinearBwd(ir::Graph *graph, + bool without_x_gradient) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + const std::string scope_name("gemm_epilogue"); + FusePassBase::Init(scope_name, graph); + + GraphPatternDetector gpd; + auto *dout = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(scope_name, "dout")) + ->AsInput() + ->assert_is_op_input("elementwise_add_grad", GradVarName("Out")); + + patterns::ElewiseAddMatmulAct ele_add_matmul_act_pattern( + gpd.mutable_pattern(), "ele_add_matmul_act"); + ele_add_matmul_act_pattern(dout, {}, without_x_gradient, false); + + int found_ele_add_matmul_act_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle ElewiseAddMatmulAct fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(ele_add_grad_op, ele_add_grad, + 
ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_bias, ele_grad_bias, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dx, ele_grad_dx, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dbias, ele_grad_dbias, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_op, matmul_grad, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_x, matmul_grad_x, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_w, matmul_grad_w, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dw, matmul_grad_dw, + ele_add_matmul_act_pattern); + + Node *matmul_grad_dx = nullptr; + if (!without_x_gradient) { + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dx_ptr, matmul_grad_dx, + ele_add_matmul_act_pattern); + matmul_grad_dx = matmul_grad_dx_ptr; + } + + std::vector matmul_grad_x_shape = matmul_grad_x->Var()->GetShape(); + std::vector matmul_grad_w_shape = matmul_grad_w->Var()->GetShape(); + + // Note (Ming Huang): We only support matmul_v2_grad from paddle.nn.Linear + // currently. The conditions below are used to verify wether matmul_v2 + // is created by paddle.nn.Linear + auto matmul_grad_op_desc = matmul_grad_op->Op(); + if (!IsGemmFromLinear_(matmul_grad_x_shape, matmul_grad_w_shape, + matmul_grad_op_desc)) + return; + + OpDesc fused_gemm_epilogue_grad_op_desc(ele_add_grad_op->Op()->Block()); + std::string activation_grad = "none"; + fused_gemm_epilogue_grad_op_desc.SetType("fused_gemm_epilogue_grad"); + fused_gemm_epilogue_grad_op_desc.SetInput("DOut", + {subgraph.at(dout)->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("X", {matmul_grad_x->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("Y", {matmul_grad_w->Name()}); + if (matmul_grad_dx) { + fused_gemm_epilogue_grad_op_desc.SetOutput("DX", + {matmul_grad_dx->Name()}); + } + fused_gemm_epilogue_grad_op_desc.SetOutput("DY", {matmul_grad_dw->Name()}); + fused_gemm_epilogue_grad_op_desc.SetOutput("DBias", + {ele_grad_dbias->Name()}); + fused_gemm_epilogue_grad_op_desc.SetAttr("activation_grad", + activation_grad); + fused_gemm_epilogue_grad_op_desc.SetAttr( + "op_role", matmul_grad_op_desc->GetAttr("op_role")); + + auto gemm_epilogue_grad_node = + g->CreateOpNode(&fused_gemm_epilogue_grad_op_desc); + + IR_NODE_LINK_TO(subgraph.at(dout), gemm_epilogue_grad_node); + IR_NODE_LINK_TO(matmul_grad_x, gemm_epilogue_grad_node); + IR_NODE_LINK_TO(matmul_grad_w, gemm_epilogue_grad_node); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, matmul_grad_dw); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, ele_grad_dbias); + if (matmul_grad_dx) { + IR_NODE_LINK_TO(gemm_epilogue_grad_node, matmul_grad_dx); + } + + GraphSafeRemoveNodes(g, {ele_add_grad_op, ele_grad_dx, matmul_grad_op}); + + std::string matmul_grad_dx_name = + matmul_grad_dx != nullptr ? 
matmul_grad_dx->Name() : " "; + VLOG(4) << "\n\t " << subgraph.at(dout)->Name() << " and " + << ele_grad_bias->Name() << " -> " << ele_add_grad_op->Name() + << " -> " << ele_grad_dx->Name() << " and " + << ele_grad_dbias->Name() << "\n\t " << ele_grad_dx->Name() << ", " + << matmul_grad_x->Name() << " and " << matmul_grad_w->Name() + << " -> " << matmul_grad_op->Name() << " -> " + << matmul_grad_w->Name() << " and " << matmul_grad_dx_name; + found_ele_add_matmul_act_count++; + }; + + gpd(graph, handler); + + AddStatis(found_ele_add_matmul_act_count); + return graph; +} + +ir::Graph *FuseGemmEpiloguePass::FuseLinearActBwd( + ir::Graph *graph, const std::unordered_set &act_grad_types, + bool is_act_grad_x_from_act, EpiloguePassActivationCache *cache) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + const std::string scope_name("gemm_epilogue"); + FusePassBase::Init(scope_name, graph); + + GraphPatternDetector gpd; + auto *dout = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(scope_name, "dout")) + ->AsInput() + ->assert_is_op_input("elementwise_add_grad", GradVarName("Out")); + + patterns::ElewiseAddMatmulAct ele_add_matmul_act_pattern( + gpd.mutable_pattern(), "ele_add_matmul_act"); + ele_add_matmul_act_pattern(dout, act_grad_types, false, + is_act_grad_x_from_act); + + int found_ele_add_matmul_act_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle ElewiseAddMatmulAct fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(ele_add_grad_op, ele_add_grad, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_bias, ele_grad_bias, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dx, ele_grad_dx, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dbias, ele_grad_dbias, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_op, matmul_grad, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_x, matmul_grad_x, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_w, matmul_grad_w, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dx, matmul_grad_dx, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dw, matmul_grad_dw, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_grad_op, act_grad, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_grad_dx, act_grad_dx, + ele_add_matmul_act_pattern); + + auto key = + GetReserveSpaceCacheKey(matmul_grad_x->Var()->Name(), g->GetBlockId()); + if (!cache->HasFusedActivation(key)) { + return; + } + auto *reserve_space_node = cache->GetFusedActivationSpace(key); + + std::vector matmul_grad_x_shape = matmul_grad_x->Var()->GetShape(); + std::vector matmul_grad_w_shape = matmul_grad_w->Var()->GetShape(); + + // Note (Ming Huang): We only support matmul_v2_grad from paddle.nn.Linear + // currently. 
The conditions below are used to verify wether matmul_v2 + // is created by paddle.nn.Linear + auto matmul_grad_op_desc = matmul_grad_op->Op(); + if (!IsGemmFromLinear_(matmul_grad_x_shape, matmul_grad_w_shape, + matmul_grad_op_desc)) + return; + + auto activation_grad = act_grad_op->Op()->Type(); + + OpDesc fused_gemm_epilogue_grad_op_desc(ele_add_grad_op->Op()->Block()); + fused_gemm_epilogue_grad_op_desc.SetType("fused_gemm_epilogue_grad"); + fused_gemm_epilogue_grad_op_desc.SetInput("DOut", + {subgraph.at(dout)->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("X", {matmul_grad_x->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("Y", {matmul_grad_w->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("ReserveSpace", + {reserve_space_node->Name()}); + fused_gemm_epilogue_grad_op_desc.SetOutput("DX", {act_grad_dx->Name()}); + fused_gemm_epilogue_grad_op_desc.SetOutput("DY", {matmul_grad_dw->Name()}); + fused_gemm_epilogue_grad_op_desc.SetOutput("DBias", + {ele_grad_dbias->Name()}); + fused_gemm_epilogue_grad_op_desc.SetAttr("activation_grad", + activation_grad); + fused_gemm_epilogue_grad_op_desc.SetAttr( + "op_role", matmul_grad_op_desc->GetAttr("op_role")); + + auto gemm_epilogue_grad_node = + g->CreateOpNode(&fused_gemm_epilogue_grad_op_desc); + + IR_NODE_LINK_TO(subgraph.at(dout), gemm_epilogue_grad_node); + IR_NODE_LINK_TO(matmul_grad_x, gemm_epilogue_grad_node); + IR_NODE_LINK_TO(matmul_grad_w, gemm_epilogue_grad_node); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, act_grad_dx); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, matmul_grad_dw); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, ele_grad_dbias); + IR_NODE_LINK_TO(reserve_space_node, gemm_epilogue_grad_node); + + GraphSafeRemoveNodes(g, {ele_add_grad_op, ele_grad_dx, matmul_grad_op, + matmul_grad_dx, act_grad_op}); + + VLOG(4) << "\n\t " << subgraph.at(dout)->Name() << " and " + << ele_grad_bias->Name() << " -> " << ele_add_grad_op->Name() + << " -> " << ele_grad_dx->Name() << " and " + << ele_grad_dbias->Name() << "\n\t " << ele_grad_dx->Name() << ", " + << matmul_grad_x->Name() << " and " << matmul_grad_w->Name() + << " -> " << matmul_grad_op->Name() << " -> " + << matmul_grad_dx->Name() << " and " << matmul_grad_w->Name() + << "\n\t " << matmul_grad_dx->Name() << " -> " + << act_grad_op->Name() << " -> " << act_grad_dx->Name(); + found_ele_add_matmul_act_count++; + }; + + gpd(graph, handler); + + AddStatis(found_ele_add_matmul_act_count); + return graph; +} + +bool FuseGemmEpiloguePass::IsGemmFromLinear_( + const std::vector &x_shape, const std::vector &w_shape, + OpDesc *matmul_v2_op) const { + if (w_shape.size() != 2 || x_shape.size() < 2) return false; + for (auto attr_name : + {"fused_reshape_Out", "fused_reshape_X", "fused_reshape_Y", + "fused_transpose_Out", "fused_transpose_X", "fused_transpose_Y"}) { + if (matmul_v2_op->HasAttr(attr_name)) { + std::vector tmp_vec = + BOOST_GET_CONST(std::vector, matmul_v2_op->GetAttr(attr_name)); + if (tmp_vec.size() > 0) return false; + } + } + if (BOOST_GET_CONST(bool, matmul_v2_op->GetAttr("trans_x")) || + BOOST_GET_CONST(bool, matmul_v2_op->GetAttr("trans_y"))) + return false; + + return true; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fuse_gemm_epilogue_pass, + paddle::framework::ir::FuseGemmEpiloguePass); diff --git a/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h new file mode 100644 index 00000000000..575ffee73d6 --- /dev/null +++ 
b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h @@ -0,0 +1,100 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the ElewiseAdd and activation + */ +class Graph; +class Node; + +class EpiloguePassActivationCache { + public: + EpiloguePassActivationCache() {} + + EpiloguePassActivationCache(const EpiloguePassActivationCache &) = delete; + void operator=(const EpiloguePassActivationCache &) = delete; + + bool HasFusedActivation(const std::string &key) const { + return fused_activation_space_map_.count(key); + } + + ir::Node *GetFusedActivationSpace(const std::string &key) { + if (HasFusedActivation(key)) { + return fused_activation_space_map_.find(key)->second; + } + PADDLE_THROW(platform::errors::InvalidArgument( + "The key (%d) of EpiloguePassActivationCache does not exist.", key)); + } + + void InsertFusedActivation(const std::string &key, ir::Node *const value) { + if (!HasFusedActivation(key)) { + mtx.lock(); + fused_activation_space_map_.insert({key, value}); + mtx.unlock(); + } else { + PADDLE_THROW(platform::errors::AlreadyExists( + "The key (%d) of EpiloguePassActivationCache already exist.", key)); + } + } + + private: + std::unordered_map fused_activation_space_map_; + std::mutex mtx; +}; + +class FuseGemmEpiloguePass : public FusePassBase { + public: + virtual ~FuseGemmEpiloguePass() {} + + protected: + void ApplyImpl(ir::Graph *graph) const override; + + ir::Graph *FuseLinearFwd(ir::Graph *graph, bool is_training) const; + ir::Graph *FuseLinearActFwd(ir::Graph *graph, + const std::unordered_set &act_types, + bool is_training, bool is_act_grad_x_from_act, + EpiloguePassActivationCache *cache) const; + ir::Graph *FuseLinearBwd(ir::Graph *graph, bool without_x_gradient) const; + ir::Graph *FuseLinearActBwd( + ir::Graph *graph, const std::unordered_set &act_grad_types, + bool is_act_grad_x_from_act, EpiloguePassActivationCache *cache) const; + + private: + bool IsGemmFromLinear_(const std::vector &x_shape, + const std::vector &w_shape, + OpDesc *matmul_v2_op) const; + const std::string GetReserveSpaceCacheKey(const std::string var_name, + int block_id) const { + return std::to_string(block_id) + var_name; + } +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index e4c9dc72128..d7d866fa98b 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1461,31 +1461,6 @@ PDNode 
*patterns::BatchNormAddActGrad::operator()( return bn_grad; } -PDNode *patterns::ElewiseAddAct::operator()( - paddle::framework::ir::PDNode *ele_x_var, - std::unordered_set act_types) { - auto *ele_y_var = pattern->NewNode(ele_y_repr()) - ->assert_is_op_input("elementwise_add", "Y"); - - auto *ele_add = - pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add"); - - auto *ele_out_var = pattern->NewNode(elewise_add_out_repr()) - ->assert_is_op_output("elementwise_add", "Out"); - - ele_out_var->AsIntermediate()->assert_is_ops_input(act_types); - - auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types); - - auto *act_out_var = - pattern->NewNode(act_out_repr())->assert_is_ops_output(act_types, "Out"); - - ele_add->LinksFrom({ele_x_var, ele_y_var}).LinksTo({ele_out_var}); - act->LinksFrom({ele_out_var}).LinksTo({act_out_var}); - - return act_out_var; -} - PDNode *patterns::ElewiseAddActInplaceGrad::operator()( paddle::framework::ir::PDNode *d_act_out_var, std::unordered_set act_types) { @@ -1526,6 +1501,159 @@ PDNode *patterns::ElewiseAddActInplaceGrad::operator()( return ele_add_grad; } +PDNode *patterns::ElewiseAddAct::operator()( + paddle::framework::ir::PDNode *ele_x_var, + std::unordered_set act_types) { + auto *ele_y_var = pattern->NewNode(ele_y_repr()) + ->assert_is_op_input("elementwise_add", "Y"); + + auto *ele_add = + pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add"); + + auto *ele_out_var = pattern->NewNode(elewise_add_out_repr()) + ->assert_is_op_output("elementwise_add", "Out"); + + ele_out_var->AsIntermediate()->assert_is_ops_input(act_types); + + auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types); + + auto *act_out_var = + pattern->NewNode(act_out_repr())->assert_is_ops_output(act_types, "Out"); + + ele_add->LinksFrom({ele_x_var, ele_y_var}).LinksTo({ele_out_var}); + act->LinksFrom({ele_out_var}).LinksTo({act_out_var}); + + return act_out_var; +} + +PDNode *patterns::LinearAct::operator()( + paddle::framework::ir::PDNode *linear_x_var, + const std::unordered_set &act_types, bool with_grad_link, + bool is_act_grad_x_from_act) { + auto *matmul_w_var = + pattern->NewNode(matmul_w_repr())->assert_is_op_input("matmul_v2", "Y"); + + auto *matmul = pattern->NewNode(matmul_repr())->assert_is_op("matmul_v2"); + + auto *matmul_out_var = pattern->NewNode(matmul_out_repr()) + ->assert_is_op_output("matmul_v2", "Out"); + + matmul_out_var->AsIntermediate()->assert_is_op_input("elementwise_add", "X"); + + auto *ele_bias_var = pattern->NewNode(ele_bias_repr()) + ->assert_is_op_input("elementwise_add", "Y"); + + auto *ele_add = + pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add"); + + auto *ele_out_var = pattern->NewNode(elewise_add_out_repr()) + ->assert_is_op_output("elementwise_add", "Out"); + + matmul->LinksFrom({linear_x_var, matmul_w_var}).LinksTo({matmul_out_var}); + ele_add->LinksFrom({matmul_out_var, ele_bias_var}).LinksTo({ele_out_var}); + + if (with_grad_link) { + matmul_out_var->assert_is_op_input("elementwise_add_grad", "X"); + auto *elementwise_add_grad_op = pattern->NewNode("elementwise_add_grad") + ->assert_is_op("elementwise_add_grad"); + elementwise_add_grad_op->LinksFrom({matmul_out_var}); + } + + if (act_types.size() > 0) { + ele_out_var->AsIntermediate()->assert_is_ops_input(act_types); + + auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types); + auto *act_out_var = pattern->NewNode(act_out_repr()) + ->assert_is_ops_output(act_types, "Out"); + + 
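+  // Only when activation types are given: chain the act op after the
+  // elementwise_add output, so the detector matches the complete
+  // matmul_v2 -> elementwise_add -> act subgraph and nothing shorter.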
act->LinksFrom({ele_out_var}).LinksTo({act_out_var}); + + if (with_grad_link && !is_act_grad_x_from_act) { + std::unordered_set act_grad_types; + for (const auto &act : act_types) { + std::string act_grad(act); + act_grad.append("_grad"); + act_grad_types.insert(act_grad); + } + + ele_out_var->assert_is_ops_input(act_grad_types, "X"); + auto *act_grad_op = + pattern->NewNode(act_grad_repr())->assert_is_ops(act_grad_types); + act_grad_op->LinksFrom({ele_out_var}); + } + + return act_out_var; + } + + return ele_out_var; +} + +PDNode *patterns::ElewiseAddMatmulAct::operator()( + paddle::framework::ir::PDNode *dout_var, + const std::unordered_set &act_grad_types, + bool without_x_gradient, bool is_act_grad_x_from_act) { + auto *ele_grad_bias_var = + pattern->NewNode(ele_grad_bias_repr()) + ->assert_is_op_input("elementwise_add_grad", "Y"); + auto *ele_add_grad = pattern->NewNode(ele_add_grad_repr()) + ->assert_is_op("elementwise_add_grad"); + auto *ele_grad_dx_var = + pattern->NewNode(ele_grad_dx_repr()) + ->assert_is_op_output("elementwise_add_grad", GradVarName("X")); + auto *ele_grad_dbias_var = + pattern->NewNode(ele_grad_dbias_repr()) + ->assert_is_op_output("elementwise_add_grad", GradVarName("Y")); + ele_add_grad->LinksFrom({dout_var, ele_grad_bias_var}) + .LinksTo({ele_grad_dx_var, ele_grad_dbias_var}); + + ele_grad_dx_var->AsIntermediate()->assert_is_op_input("matmul_v2_grad", + GradVarName("Out")); + + auto *matmul_grad_x_var = pattern->NewNode(matmul_grad_x_repr()) + ->assert_is_op_input("matmul_v2_grad", "X"); + auto *matmul_grad_w_var = pattern->NewNode(matmul_grad_w_repr()) + ->assert_is_op_input("matmul_v2_grad", "Y"); + auto *matmul_grad = + pattern->NewNode(matmul_grad_repr())->assert_is_op("matmul_v2_grad"); + auto *matmul_grad_dx_var = + pattern->NewNode(matmul_grad_dx_repr()) + ->assert_is_op_output("matmul_v2_grad", GradVarName("X")); + auto *matmul_grad_dw_var = + pattern->NewNode(matmul_grad_dw_repr()) + ->assert_is_op_output("matmul_v2_grad", GradVarName("Y")); + matmul_grad->LinksFrom( + {ele_grad_dx_var, matmul_grad_x_var, matmul_grad_w_var}); + if (without_x_gradient) { + matmul_grad->LinksTo({matmul_grad_dw_var}); + } else { + matmul_grad->LinksTo({matmul_grad_dx_var, matmul_grad_dw_var}); + } + + if (!without_x_gradient && act_grad_types.size() > 0) { + matmul_grad_dx_var->AsIntermediate()->assert_is_ops_input( + act_grad_types, GradVarName("Out")); + + auto *act_grad = + pattern->NewNode(act_grad_repr())->assert_is_ops(act_grad_types); + auto *act_grad_dx_var = + pattern->NewNode(act_grad_dx_repr()) + ->assert_is_ops_output(act_grad_types, GradVarName("X")); + + auto *act_grad_x_var = matmul_grad_x_var; + if (!is_act_grad_x_from_act) { + auto *ele_out_var = pattern->NewNode(ele_out_repr()) + ->assert_is_ops_input(act_grad_types, "X"); + act_grad_x_var = ele_out_var; + } + + act_grad->LinksFrom({matmul_grad_dx_var, act_grad_x_var}) + .LinksTo({act_grad_dx_var}); + return act_grad; + } + + return matmul_grad; +} + // conv_type: conv2d, conv3d, conv2d_transpose PDNode *patterns::ConvBias::operator()( paddle::framework::ir::PDNode *conv_input, std::string conv_type) { diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index d6400ed6945..0f21906d08d 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -863,6 +863,65 @@ struct ElewiseAddActInplaceGrad : public PatternBase { PATTERN_DECL_NODE(ele_y); }; +// The following patterns are 
used to fuse linear and act (ReLu or GeLU) +// formula: act(F.linear(x)) +// op: matmul_v2 + elementwise_add + act +// named nodes: matmul, elementwise_add, act +// matmul_w, matmul_out +// ele_bias, elewise_add_out, act_out +struct LinearAct : public PatternBase { + LinearAct(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "linear_act") {} + + PDNode* operator()(PDNode* x, + const std::unordered_set& act_types, + bool with_grad_link, bool is_act_grad_x_from_act); + + // declare operator node's name + PATTERN_DECL_NODE(matmul); + PATTERN_DECL_NODE(ele_add); + PATTERN_DECL_NODE(act); + PATTERN_DECL_NODE(act_grad); + // declare variable node's name + PATTERN_DECL_NODE(matmul_w); + PATTERN_DECL_NODE(matmul_out); + PATTERN_DECL_NODE(elewise_add_out); + PATTERN_DECL_NODE(ele_bias); + PATTERN_DECL_NODE(act_out); +}; + +// The following patterns are used to fuse linear_grad and act_grad (ReLu or +// GeLU) +// formula: the backward of F.linear( act(x) ) +// op: elementwise_add_grad + matmul_v2_grad + act_grad +// named nodes: ele_add_grad, matmul_grad, act_grad +// ele_grad_bias, ele_grad_dx, ele_grad_dbias +// matmul_grad_x, matmul_grad_dx, matmul_grad_dx +// matmul_grad_dw, act_grad_dx +struct ElewiseAddMatmulAct : public PatternBase { + ElewiseAddMatmulAct(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "elewiseadd_matmul_act") {} + + PDNode* operator()(PDNode* x, + const std::unordered_set& act_grad_types, + bool without_x_gradient, bool is_act_grad_x_from_act); + + // declare operator node's name + PATTERN_DECL_NODE(ele_add_grad); + PATTERN_DECL_NODE(matmul_grad); + PATTERN_DECL_NODE(act_grad); + // declare variable node's name + PATTERN_DECL_NODE(ele_out); + PATTERN_DECL_NODE(ele_grad_bias); + PATTERN_DECL_NODE(ele_grad_dx); + PATTERN_DECL_NODE(ele_grad_dbias); + PATTERN_DECL_NODE(matmul_grad_x); + PATTERN_DECL_NODE(matmul_grad_w); + PATTERN_DECL_NODE(matmul_grad_dx); + PATTERN_DECL_NODE(matmul_grad_dw); + PATTERN_DECL_NODE(act_grad_dx); +}; + // Conv with Elementwise_add as bias // op: conv + elementwise_add // named nodes: diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 67287afa6ae..80e7f5c001d 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -19,7 +19,8 @@ register_operators(EXCLUDES fused_attention_op fused_transformer_op fused_feedforward_op - resnet_unit_op) + resnet_unit_op + fused_gemm_epilogue_op) # fusion_gru_op does not have CUDA kernel op_library(fusion_gru_op) @@ -79,4 +80,8 @@ if (WITH_GPU OR WITH_ROCM) cc_test(test_cudnn_norm_conv SRCS cudnn_norm_conv_test.cc DEPS conv_op blas im2col vol2col depthwise_conv eigen_function tensor op_registry device_context generator memory) cc_test(test_cudnn_bn_add_relu SRCS cudnn_bn_add_relu_test.cc DEPS batch_norm_op fused_bn_add_activation_op tensor op_registry device_context generator memory) endif() + + if (CUDA_VERSION GREATER_EQUAL 11.6) + op_library(fused_gemm_epilogue_op) + endif() endif() diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc new file mode 100644 index 00000000000..4c4e3661e6d --- /dev/null +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc @@ -0,0 +1,353 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. 
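For orientation before the new operator files: the op added below computes Out = Act((X * Y) + Bias) with Act in {none, relu, gelu}, and its backward (see the grad kernel later in this patch) amounts to dX = dOut * Y^T routed through the activation gradient, dY = X^T * dOut, and dBias = the sum of dOut over its rows. The snippet here is an illustrative CPU-side reference of the forward contract only, not code from the patch; the function name is made up and trans_x/trans_y are assumed false.

#include <cmath>
#include <string>
#include <vector>

// Reference semantics of fused_gemm_epilogue (forward, row-major):
// Out[m, n] = Act(sum_k X[m, k] * Y[k, n] + Bias[n]).
std::vector<float> FusedGemmEpilogueRef(const std::vector<float>& x,
                                        const std::vector<float>& y,
                                        const std::vector<float>& bias,
                                        int M, int K, int N,
                                        const std::string& act) {
  std::vector<float> out(static_cast<size_t>(M) * N, 0.f);
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float acc = bias[n];
      for (int k = 0; k < K; ++k) acc += x[m * K + k] * y[k * N + n];
      if (act == "relu") {
        acc = acc > 0.f ? acc : 0.f;
      } else if (act == "gelu") {
        // erf-based GeLU; cublasLt may use an approximation internally.
        acc = 0.5f * acc * (1.f + std::erf(acc / std::sqrt(2.f)));
      }
      out[m * N + n] = acc;
    }
  }
  return out;
}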
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +class FusedGemmEpilogueOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedGemmEpilogueOp"); + OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "FusedGemmEpilogueOp"); + OP_INOUT_CHECK(ctx->HasInput("Bias"), "Output", "Bias", + "FusedGemmEpilogueOp"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", + "FusedGemmEpilogueOp"); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto bias_dims = ctx->GetInputDim("Bias"); + + auto trans_x = ctx->Attrs().Get("trans_x"); + auto trans_y = ctx->Attrs().Get("trans_y"); + + PADDLE_ENFORCE_EQ( + y_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor Y's dimension of FusedGemmEpilogueOp " + " should be 2, but got %d.", + y_dims.size())); + + PADDLE_ENFORCE_GE( + x_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor X's dimension of FusedGemmEpilogueOp " + " should be >= 2, but got %d.", + x_dims.size())); + + PADDLE_ENFORCE_EQ( + bias_dims.size(), 1, + platform::errors::InvalidArgument( + "The Input tensor bias's dimension of FusedGemmEpilogueOp " + " should be == 1, but got %d.", + bias_dims.size())); + + PADDLE_ENFORCE_EQ(bias_dims[0], trans_y ? y_dims[0] : y_dims[1], + platform::errors::InvalidArgument( + "The Input tensor bias's dimension 0" + " should be == Y[-1], but got bias's shape = [%s] " + "and Y's shape = [%s]", + bias_dims, y_dims)); + + auto x_mat_dims = + phi::flatten_to_2d(x_dims, trans_x ? 1 : x_dims.size() - 1); + + int K_from_x = trans_x ? x_mat_dims[0] : x_mat_dims[1]; + int K_from_y = trans_y ? y_dims[1] : y_dims[0]; + + PADDLE_ENFORCE_EQ( + K_from_x, K_from_y, + platform::errors::InvalidArgument( + "The last dimension of X should be equal with Y's first dimension." + "But received X[-1] = [%d], Y[0] = [%d].", + K_from_x, K_from_y)); + + auto activation = ctx->Attrs().Get("activation"); + + if ((activation != "relu") && (activation != "gelu") && + (activation != "none")) { + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::InvalidArgument( + "The activation attribute of fused_gemm_epilogue op should be" + " one of {\"none\", \"relu\", \"gelu\"}. But received %s." + "But received activation=%s.", + activation)); + } + + if (activation == "none" && ctx->HasOutput("ReserveSpace")) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The ReserveSpace would not be used when activation = \"none\"")); + } + + // cublasLt's restriction for auxiliary. + if (ctx->HasOutput("ReserveSpace") && activation != "none") { + int min_size_of_n = activation == "relu" ? 128 : 8; + int N_size = trans_y ? 
y_dims[0] : y_dims[1]; + PADDLE_ENFORCE_EQ(N_size % min_size_of_n, 0, + platform::errors::InvalidArgument( + "The output dimension N (X(MxK) * Y(KxN) = C(MxN)) " + "should be a multiple of %d when ReserveSpace is given " + "and activation=%s, but got N = %d.", + min_size_of_n, activation, N_size)); + } + + std::vector<int64_t> out_dims; + out_dims.reserve(static_cast<size_t>(x_dims.size())); + if (trans_x) { + for (int i = 1; i < x_dims.size(); ++i) out_dims.push_back(x_dims[i]); + } else { + for (int i = 0; i < x_dims.size() - 1; ++i) out_dims.push_back(x_dims[i]); + } + + if (trans_y) { + out_dims.push_back(y_dims[0]); + } else { + out_dims.push_back(y_dims[1]); + } + + ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); + // Note (Ming Huang): Reserve space of relu is a bit-mask, + // which cannot pass nan_and_inf checking if shape is set. + if (activation == "gelu" && ctx->HasOutput("ReserveSpace")) { + ctx->SetOutputDim("ReserveSpace", phi::make_ddim(out_dims)); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library); + } +}; + +class FusedGemmEpilogueOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input tensor X of Out = Act((X * Y) + Bias)."); + AddInput("Y", "The input tensor Y of Out = Act((X * Y) + Bias)."); + AddInput("Bias", "The input tensor bias of Out = Act((X * Y) + Bias)."); + + AddOutput("Out", "The output tensor Out of Out = Act((X * Y) + Bias)."); + AddOutput("ReserveSpace", + R"DOC(Reserve GPU space to place + auxiliary data pointer. It is used to pass the auxiliary data pointer + for fused_gemm_epilogue op. If not given (empty string), the + auxiliary mode would not be enabled.)DOC") + .AsDispensable() + .AsExtra(); + + AddAttr<bool>( + "trans_x", + R"DOC((bool, default false), Whether to transpose input tensor X + or not. The input tensor X could have more than two dimensions. When + set trans_x=true, it would fully reverse X. For instance: X with shape + [d0, d1, d2, d3] -> [d3, d2, d1, d0].)DOC") + .SetDefault(false); + AddAttr<bool>( + "trans_y", + R"DOC((bool, default false), Whether to transpose input tensor Y + or not. The input tensor Y should be two-dimensional. When + set trans_y=true, it would transpose Y. For instance: Y with shape + [d0, d1] -> [d1, d0].)DOC") + .SetDefault(false); + + AddAttr<std::string>( + "activation", + R"DOC((string, default none), The activation function. It could be + one of {none, relu, gelu}. When none is given, Act would be a null + operation.)DOC") + .SetDefault("none"); + + AddComment(R"DOC( +FusedGemmEpilogue Operator +This operator is used to perform Activation(Elementwise_add(Matmul(X, Y), bias)). +It is equal to paddle.nn.Linear + Activation (None, ReLU or GeLU). + +Note: +X could have more than two dimensions and would be flattened to 2D for computing.
+X with shape [d0, d1, d2, d3] -> X_2D with shape [d0*d1*d2, d3] +)DOC"); + } +}; + +class FusedGemmEpilogueGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("DOut"), "Input", "DOut", + "FusedGemmEpilogueGradOp"); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedGemmEpilogueGradOp"); + OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "FusedGemmEpilogueGradOp"); + OP_INOUT_CHECK(ctx->HasOutput("DY"), "Output", "DY", "FusedGemmEpilogueOp"); + + auto dout_dims = ctx->GetInputDim("DOut"); + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + PADDLE_ENFORCE_GE( + dout_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor DOut's dimension of FusedGemmEpilogueGradOp " + " should be >= 2, but got %d.", + dout_dims.size())); + + PADDLE_ENFORCE_EQ( + y_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor Y's dimension of FusedGemmEpilogueGradOp " + " should be 2, but got %d.", + y_dims.size())); + + PADDLE_ENFORCE_GE( + x_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor X's dimension of FusedGemmEpilogueGradOp " + " should be >= 2, but got %d.", + x_dims.size())); + + PADDLE_ENFORCE_EQ( + dout_dims.size(), x_dims.size(), + platform::errors::InvalidArgument( + "The Input tensor DOut's and X's dimension of " + "FusedGemmEpilogueGradOp " + " should be the same, but got DOut's dim = %d and X's = %d.", + dout_dims.size(), x_dims.size())); + + auto dout_mat_dims = phi::flatten_to_2d(dout_dims, dout_dims.size() - 1); + + auto x_mat_dims = phi::flatten_to_2d(x_dims, x_dims.size() - 1); + + PADDLE_ENFORCE_EQ( + dout_mat_dims[1], y_dims[1], + platform::errors::InvalidArgument( + "The last dimension of DOut should be equal with Y's last" + "dimension. But received DOut[-1] = [%d], Y[1] = [%d].", + dout_mat_dims[1], y_dims[1])); + + PADDLE_ENFORCE_EQ( + dout_mat_dims[0], x_mat_dims[0], + platform::errors::InvalidArgument( + "The first dimension of DOut should be equal with X's first" + "dimension. But received DOut[0] = [%d], Y[0] = [%d].", + dout_mat_dims[0], x_mat_dims[0])); + + auto activation_grad = ctx->Attrs().Get("activation_grad"); + if ((activation_grad != "relu_grad") && (activation_grad != "gelu_grad") && + (activation_grad != "none")) { + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::InvalidArgument( + "The activation attribute of fused_gemm_epilogue op should be" + " one of {\"none\", \"relu\", \"gelu\"}. But received %s." + "But received activation=%s.", + activation_grad)); + } + + if (activation_grad != "none" && !ctx->HasInput("ReserveSpace")) { + PADDLE_ENFORCE_EQ(true, false, + platform::errors::InvalidArgument( + "The ReserveSpace should not be empty. 
" + "when activation_grad == {relu_grad, gelu_grad}.")); + } + + if (ctx->HasOutput("DX")) { + std::vector dx_dims; + dx_dims.reserve(static_cast(x_dims.size())); + for (int i = 0; i < x_dims.size(); ++i) { + dx_dims.push_back(x_dims[i]); + } + ctx->SetOutputDim("DX", phi::make_ddim(dx_dims)); + } + + std::vector dy_dims(y_dims.Get(), y_dims.Get() + y_dims.size()); + ctx->SetOutputDim("DY", phi::make_ddim(dy_dims)); + + if (ctx->HasOutput("DBias")) { + std::vector dbias_dims; + dbias_dims.push_back(y_dims[1]); + ctx->SetOutputDim("DBias", phi::make_ddim(dbias_dims)); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "DOut"); + return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library); + } +}; + +class FusedGemmEpilogueGradOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("DOut", + "The input grad tensor to Out of Out = (Act(X) * Y) + bias"); + AddInput("X", "The input tensor X of Out = (Act(X) * Y) + bias"); + AddInput("Y", "The input tensor Y of Out = (Act(X) * Y) + bias"); + AddInput("ReserveSpace", + R"DOC(A GPU space to fetch + auxiliary data pointer. It is used to pass auxiliary data pointer + for fused_gemm_epilogue_grad op. If not given (empty string), the + auxiliary mode would not be enable.)DOC") + .AsDispensable(); + + AddOutput("DX", "The output grad tensor to X of Out = (Act(X) * Y) + bias.") + .AsDispensable(); + AddOutput("DY", + "The output grad tensor to Y of Out = (Act(X) * Y) + bias."); + AddOutput("DBias", + "The output grad tensor to bias of Out = (Act(X) * Y) + bias.") + .AsDispensable(); + + AddAttr( + "activation_grad", + R"DOC((string, default none), The backward activation function. It could be + one of {none, relu_grad, gelu_grad}. When none is given, The backward Act would + be null operations)DOC") + .SetDefault("none"); + + AddComment(R"DOC( +FusedGemmEpilogueGrad Operator +This operator is used to perform backward of Elementwise_add(Matmul(Activeation(X), Y), bias). +It is equal to Activation (None, ReLU or GeLU) + paddle.nn.Linear. + +Note: +X could be more than two dimension and would be flatten to 2D for computing. +X with shape [d0, d1, d2, d3] -> X_2D with shape [d0*d1*d2, d3] +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fused_gemm_epilogue, ops::FusedGemmEpilogueOp, + ops::FusedGemmEpilogueOpMaker) +REGISTER_OPERATOR(fused_gemm_epilogue_grad, ops::FusedGemmEpilogueGradOp, + ops::FusedGemmEpilogueGradOpMaker) diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu new file mode 100644 index 00000000000..e16c9e8f483 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu @@ -0,0 +1,376 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/dynload/cublasLt.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class FusedGemmEpilogueKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + + const Tensor* x = ctx.Input("X"); + const Tensor* y = ctx.Input("Y"); + const Tensor* bias = ctx.Input("Bias"); + + Tensor* out = ctx.Output("Out"); + Tensor* reserve_space = ctx.Output("ReserveSpace"); + + bool trans_x = ctx.Attr("trans_x"); + bool trans_y = ctx.Attr("trans_y"); + + std::string activation = ctx.Attr("activation"); + bool enable_auxiliary = reserve_space == nullptr ? false : true; + + out->mutable_data(ctx.GetPlace()); + auto* out_data = out->data(); + + auto x_mat_dims = + phi::flatten_to_2d(x->dims(), trans_x ? 1 : x->dims().size() - 1); + int64_t M = trans_x ? x_mat_dims[1] : x_mat_dims[0]; + int64_t K = trans_y ? y->dims()[1] : y->dims()[0]; + int64_t N = trans_y ? y->dims()[0] : y->dims()[1]; + + cudaDataType_t mat_type = CUDA_R_32F; + cudaDataType_t scale_type = CUDA_R_32F; + cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; + if (std::is_same::value) { + mat_type = CUDA_R_16F; + scale_type = CUDA_R_16F; + } + if (std::is_same::value) { + mat_type = CUDA_R_64F; + scale_type = CUDA_R_64F; + compute_type = CUBLAS_COMPUTE_64F; + } + + cublasLtMatmulDesc_t operation_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate( + &operation_desc, compute_type, scale_type)); + cublasOperation_t transx = trans_x ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t transy = trans_y ? CUBLAS_OP_T : CUBLAS_OP_N; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &transx, + sizeof(transx))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transy, + sizeof(transy))); + + cublasLtEpilogue_t epiloque_func = + get_epilogue_type_(activation, enable_auxiliary); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epiloque_func, + sizeof(epiloque_func))); + const T* bias_data = bias->data(); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias_data, + sizeof(bias_data))); + + if (enable_auxiliary && activation != "none") { + size_t reserve_space_size = 0; + if (activation == "relu") { + // Count in bits. 
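+        // The relu epilogue's auxiliary buffer is a bit-mask (one bit per
+        // output element), hence numel / 8 bytes here; the gelu epilogue
+        // keeps the full pre-activation values, i.e. numel * sizeof(T) bytes
+        // in the branch below.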
+ reserve_space_size = phi::product(out->dims()) / 8; + } else { + reserve_space_size = phi::product(out->dims()) * sizeof(T); + } + reserve_space->mutable_data(ctx.GetPlace(), out->type(), + reserve_space_size); + void* aux_data = reinterpret_cast(reserve_space->data()); + + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, + &aux_data, sizeof(aux_data))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &N, + sizeof(N))); + } + + cublasLtMatrixLayout_t x_desc = NULL, y_desc = NULL, out_desc = NULL; + if (trans_x) + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &x_desc, mat_type, M, K, M)); + else + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &x_desc, mat_type, K, M, K)); + if (trans_y) + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &y_desc, mat_type, K, N, K)); + else + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &y_desc, mat_type, N, K, N)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &out_desc, mat_type, N, M, N)); + + cublasLtHandle_t lt_handle = dev_ctx.cublaslt_handle(); + size_t workspace_size = 4 * 1024 * 1024; + const cublasLtMatmulAlgo_t* algo = nullptr; + cudaStream_t stream = dev_ctx.stream(); + memory::allocation::AllocationPtr workspace = + memory::Alloc(dev_ctx, workspace_size); + + double alpha64 = 1.0, beta64 = 0.0; + float alpha32 = 1.0f, beta32 = 0.0f; + void *alpha = nullptr, *beta = nullptr; + if (std::is_same::value) { + alpha = &alpha64; + beta = &beta64; + } else { + alpha = &alpha32; + beta = &beta32; + } + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul( + lt_handle, operation_desc, alpha, y->data(), y_desc, x->data(), + x_desc, beta, out_data, out_desc, out_data, out_desc, algo, + workspace->ptr(), workspace_size, stream)); + } + + private: + static cublasLtEpilogue_t get_epilogue_type_(const std::string& activation, + bool enable_auxiliary) { + if (activation == "relu") { + return enable_auxiliary ? CUBLASLT_EPILOGUE_RELU_AUX_BIAS + : CUBLASLT_EPILOGUE_RELU_BIAS; + } else if (activation == "gelu") { + return enable_auxiliary ? CUBLASLT_EPILOGUE_GELU_AUX_BIAS + : CUBLASLT_EPILOGUE_GELU_BIAS; + } else if (activation == "none") { + return CUBLASLT_EPILOGUE_BIAS; + } else { + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::InvalidArgument( + "The activation attribute of fused_gemm_epilogue op should be" + " one of {\"none\", \"relu\", \"gelu\"}. But received %s." 
+ "But received activation=%s.", + activation)); + } + } +}; + +template +class FusedGemmEpilogueGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + + const Tensor* dout = ctx.Input("DOut"); + const Tensor* x = ctx.Input("X"); + const Tensor* y = ctx.Input("Y"); + const Tensor* reserve_space = ctx.Input("ReserveSpace"); + + Tensor* dx = ctx.Output("DX"); + Tensor* dy = ctx.Output("DY"); + Tensor* dbias = ctx.Output("DBias"); + + std::string activation_grad = ctx.Attr("activation_grad"); + + auto dout_mat_dims = + phi::flatten_to_2d(dout->dims(), dout->dims().size() - 1); + auto x_mat_dims = phi::flatten_to_2d(x->dims(), x->dims().size() - 1); + + int64_t M = x_mat_dims[0]; + int64_t K = y->dims()[0]; + int64_t N = y->dims()[1]; + + cudaDataType_t mat_type = CUDA_R_32F; + cudaDataType_t scale_type = CUDA_R_32F; + cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; + if (std::is_same::value) { + mat_type = CUDA_R_16F; + scale_type = CUDA_R_16F; + } + if (std::is_same::value) { + mat_type = CUDA_R_64F; + scale_type = CUDA_R_64F; + compute_type = CUBLAS_COMPUTE_64F; + } + + cublasLtHandle_t lt_handle = dev_ctx.cublaslt_handle(); + size_t workspace_size = 4 * 1024 * 1024; + const cublasLtMatmulAlgo_t* algo = nullptr; + cudaStream_t stream = dev_ctx.stream(); + + double alpha64 = 1.0, beta64 = 0.0; + float alpha32 = 1.0f, beta32 = 0.0f; + void *alpha = nullptr, *beta = nullptr; + if (std::is_same::value) { + alpha = &alpha64; + beta = &beta64; + } else { + alpha = &alpha32; + beta = &beta32; + } + + cublasOperation_t trans_dout = CUBLAS_OP_N; + cublasLtMatrixLayout_t dout_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &dout_desc, mat_type, N, M, N)); + + if (dx) { + cublasLtMatmulDesc_t dx_operation_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate( + &dx_operation_desc, compute_type, scale_type)); + cublasOperation_t trans_y = CUBLAS_OP_T; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_dout, + sizeof(trans_dout))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_y, + sizeof(trans_y))); + cublasLtEpilogue_t epiloque_func_for_dx = + get_epilogue_type_(activation_grad); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE, + &epiloque_func_for_dx, sizeof(epiloque_func_for_dx))); + + if (activation_grad != "none") { + auto* aux_data = reserve_space->data(); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, + &aux_data, sizeof(aux_data))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &N, + sizeof(N))); + } + + cublasLtMatrixLayout_t y_desc = NULL, dx_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &y_desc, mat_type, N, K, N)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &dx_desc, mat_type, K, M, K)); + + memory::allocation::AllocationPtr dx_workspace = + memory::Alloc(dev_ctx, workspace_size); + + dx->mutable_data(ctx.GetPlace()); + auto* dx_data = dx->data(); + 
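+      // The matmul below computes dX = dOut * Y^T (an M x K matrix). The
+      // operands are passed as (Y, dOut) with TRANSA = CUBLAS_OP_T because
+      // cublasLt works on column-major layouts; when activation_grad is
+      // relu_grad or gelu_grad, the epilogue also applies the activation
+      // gradient using the cached ReserveSpace pointer set above.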
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul( + lt_handle, dx_operation_desc, alpha, y->data(), y_desc, + dout->data(), dout_desc, beta, dx_data, dx_desc, dx_data, dx_desc, + algo, dx_workspace->ptr(), workspace_size, stream)); + } + + if (dy) { + cublasLtMatmulDesc_t dy_operation_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate( + &dy_operation_desc, compute_type, scale_type)); + cublasOperation_t trans_x = CUBLAS_OP_T; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_dout, + sizeof(trans_dout))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_x, + sizeof(trans_x))); + cublasLtEpilogue_t epiloque_func_for_dy = dbias == nullptr + ? CUBLASLT_EPILOGUE_DEFAULT + : CUBLASLT_EPILOGUE_BGRADA; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dy_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE, + &epiloque_func_for_dy, sizeof(epiloque_func_for_dy))); + + if (dbias) { + dbias->mutable_data(ctx.GetPlace()); + auto* dbias_data = dbias->data(); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dy_operation_desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, + &dbias_data, sizeof(dbias_data))); + } + + cublasLtMatrixLayout_t x_desc = NULL, dy_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &x_desc, mat_type, K, M, K)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &dy_desc, mat_type, N, K, N)); + + memory::allocation::AllocationPtr dy_workspace = + memory::Alloc(dev_ctx, workspace_size); + + dy->mutable_data(ctx.GetPlace()); + auto* dy_data = dy->data(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul( + lt_handle, dy_operation_desc, alpha, dout->data(), dout_desc, + x->data(), x_desc, beta, dy_data, dy_desc, dy_data, dy_desc, algo, + dy_workspace->ptr(), workspace_size, stream)); + } + } + + private: + static cublasLtEpilogue_t get_epilogue_type_( + const std::string& activation_grad) { + if (activation_grad == "relu_grad") { + return CUBLASLT_EPILOGUE_DRELU; + } else if (activation_grad == "gelu_grad") { + return CUBLASLT_EPILOGUE_DGELU; + } else if (activation_grad == "none") { + return CUBLASLT_EPILOGUE_DEFAULT; + } else { + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::InvalidArgument( + "The activation_grad attribute of fused_gemm_epilogue op should " + "be" + " one of {\"none\", \"relu\", \"gelu\"}. But received %s." 
+ "But received activation_grad=%s.", + activation_grad)); + } + } +}; + +} // namespace operators +} // namespace paddle + +#if CUDA_VERSION >= 11060 +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + fused_gemm_epilogue, + ops::FusedGemmEpilogueKernel, + ops::FusedGemmEpilogueKernel, + ops::FusedGemmEpilogueKernel); + +REGISTER_OP_CUDA_KERNEL( + fused_gemm_epilogue_grad, + ops::FusedGemmEpilogueGradKernel, + ops::FusedGemmEpilogueGradKernel, + ops::FusedGemmEpilogueGradKernel); +#endif diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h index ab7d474c1ac..a32db3a9921 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h @@ -19,6 +19,7 @@ #include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/fluid/platform/dynload/cublas.h" +#include "paddle/fluid/platform/dynload/cublasLt.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" @@ -110,5 +111,28 @@ class CublasHandleHolder { mutable std::mutex mtx_; }; +class CublasLtHandleHolder { + public: + CublasLtHandleHolder() { + PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasLtCreate(&handle_)); + } + const cublasLtHandle_t& GetCublasLtHandle() const { return handle_; } + + ~CublasLtHandleHolder() PADDLE_MAY_THROW { + PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasLtDestroy(handle_)); + } + + inline void Call(const std::function& callback) const { + std::lock_guard guard(mtx_); + callback(handle_); + } + + private: + DISABLE_COPY_AND_ASSIGN(CublasLtHandleHolder); + + cublasLtHandle_t handle_; + mutable std::mutex mtx_; +}; + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/gpu_types.h b/paddle/fluid/platform/device/gpu/gpu_types.h index d7362fe9cbd..d0b48eca502 100644 --- a/paddle/fluid/platform/device/gpu/gpu_types.h +++ b/paddle/fluid/platform/device/gpu/gpu_types.h @@ -1,4 +1,5 @@ // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 NVIDIA Corporation. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -24,6 +25,7 @@ #else #include #include "paddle/fluid/platform/dynload/cublas.h" +#include "paddle/fluid/platform/dynload/cublasLt.h" #include "paddle/fluid/platform/dynload/cudnn.h" #endif @@ -70,6 +72,10 @@ DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t); DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle); +// TODO(Ming Huang): Since there is no blasLt handler, +// use rocblas_handle for workround. +DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); + using CUDAGraphID = unsigned long long; // NOLINT #undef DECLARE_TYPE_FOR_GPU diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index f60cbc48694..18ac979b48e 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -1,4 +1,6 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Corporation. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at @@ -465,6 +467,9 @@ CUDAContext::CUDAContext(const CUDAPlace& place, InitCuBlasContext(); InitCuDNNContext(); #ifndef PADDLE_WITH_HIP +#if CUDA_VERSION >= 11060 + InitCuBlasLtContext(); +#endif InitCuSparseContext(); InitCuSolverContext(); #endif @@ -476,6 +481,9 @@ void CUDAContext::SetStream(gpuStream_t stream) { DestoryCuDNNContext(); DestoryCuBlasContext(); #ifndef PADDLE_WITH_HIP +#if CUDA_VERSION >= 11060 + DestoryCuBlasLtContext(); +#endif DestoryCuSolverContext(); #endif @@ -485,6 +493,9 @@ void CUDAContext::SetStream(gpuStream_t stream) { InitCuBlasContext(); InitCuDNNContext(); #ifndef PADDLE_WITH_HIP +#if CUDA_VERSION >= 11060 + InitCuBlasLtContext(); +#endif InitCuSolverContext(); #endif } @@ -495,6 +506,9 @@ CUDAContext::~CUDAContext() { DestoryCuDNNContext(); DestoryCuBlasContext(); #ifndef PADDLE_WITH_HIP +#if CUDA_VERSION >= 11060 + DestoryCuBlasLtContext(); +#endif DestoryCuSparseContext(); DestoryCuSolverContext(); #endif @@ -551,6 +565,14 @@ cublasHandle_t CUDADeviceContext::cublas_handle() const { } return phi::GPUContext::cublas_handle(); } +#if CUDA_VERSION >= 11060 +cublasLtHandle_t CUDADeviceContext::cublaslt_handle() const { + if (thread_ctx_.count(this)) { + return context()->CublasLtHandle()->GetCublasLtHandle(); + } + return phi::GPUContext::cublaslt_handle(); +} +#endif cusparseHandle_t CUDADeviceContext::cusparse_handle() const { if (thread_ctx_.count(this)) { return context()->CusparseHandle()->GetCusparseHandle(); diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 29b6477b683..e104170ca24 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -1,4 +1,6 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Corporation. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at @@ -29,6 +31,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/device/gpu/gpu_helper.h" #include "paddle/fluid/platform/dynload/cublas.h" +#include "paddle/fluid/platform/dynload/cublasLt.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/cusolver.h" #include "paddle/fluid/platform/dynload/cusparse.h" @@ -332,6 +335,12 @@ class CUDAContext { } #ifndef PADDLE_WITH_HIP +#if CUDA_VERSION >= 11060 + const std::unique_ptr& CublasLtHandle() const { + return cublaslt_handle_; + } +#endif + const std::unique_ptr& CusparseHandle() const { return cusparse_handle_; } @@ -348,6 +357,14 @@ class CUDAContext { } #ifndef PADDLE_WITH_HIP +#if CUDA_VERSION >= 11060 + /*! \brief Call cublasLt function safely. */ + inline void CublasLtCall( + const std::function& callback) const { + cublaslt_handle_->Call(callback); + } +#endif + /*! \brief Call cusparse function safely.
*/ inline void CusparseCall( const std::function& callback) const { @@ -394,6 +411,12 @@ class CUDAContext { #endif #ifndef PADDLE_WITH_HIP +#if CUDA_VERSION >= 11060 + void InitCuBlasLtContext() { + cublaslt_handle_.reset(new CublasLtHandleHolder()); + } +#endif + void InitCuSparseContext() { cusparse_handle_.reset(new CusparseHandleHolder(RawStream())); } @@ -472,6 +495,10 @@ class CUDAContext { } #ifndef PADDLE_WITH_HIP +#if CUDA_VERSION >= 11060 + void DestoryCuBlasLtContext() { cublaslt_handle_.reset(); } +#endif + void DestoryCuSparseContext() { cusparse_handle_.reset(); } #endif @@ -497,6 +524,9 @@ class CUDAContext { std::unique_ptr cublas_tensor_core_handle_; std::unique_ptr cublas_tf32_tensor_core_handle_; #ifndef PADDLE_WITH_HIP +#if CUDA_VERSION >= 11060 + std::unique_ptr cublaslt_handle_; +#endif cusolverDnHandle_t cusolver_dn_handle_; std::unique_ptr cusparse_handle_; #endif @@ -559,6 +589,7 @@ class CUDADeviceContext : public phi::GPUContext { rocblas_handle cublas_handle() const; #else cublasHandle_t cublas_handle() const; + cublasLtHandle_t cublaslt_handle() const; cusparseHandle_t cusparse_handle() const; #endif diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index c016321ef80..0a1cf604d2e 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1,4 +1,5 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -3440,6 +3441,31 @@ All parameter, weight, gradient are variables in Paddle. build_strategy = static.BuildStrategy() build_strategy.fuse_elewise_add_act_ops = True )DOC") + .def_property( + "fuse_gemm_epilogue", + [](const BuildStrategy &self) { return self.fuse_gemm_epilogue_; }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.fuse_gemm_epilogue_ = b; + }, + R"DOC((bool, optional): fuse_gemm_epilogue indicate whether + to fuse matmul_op, elemenewist_add_op and activation_op, + it may make the execution faster. Default is False. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.fuse_gemm_epilogue = True + )DOC") .def_property( "fuse_bn_act_ops", [](const BuildStrategy &self) { return self.fuse_bn_act_ops_; }, diff --git a/paddle/phi/backends/gpu/forwards.h b/paddle/phi/backends/gpu/forwards.h index d0787159e1e..33daa2bba6b 100644 --- a/paddle/phi/backends/gpu/forwards.h +++ b/paddle/phi/backends/gpu/forwards.h @@ -1,4 +1,5 @@ /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Corporation. All rights reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -56,6 +57,9 @@ using cudnnFusedOpsPlan_t = struct cudnnFusedOpsPlanStruct *; // Forward declaration of cuBLAS types. using cublasHandle_t = struct cublasContext *; +// Forward declaration of cuBLASLt types. +using cublasLtHandle_t = struct cublasLtContext *; + // Forward declaration of cuSOLVER types. 
using cusolverDnHandle_t = struct cusolverDnContext *; diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index dbcc1660c64..09deb575f24 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -1,4 +1,5 @@ /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Corporation. All rights reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -171,6 +172,7 @@ struct GPUContext::Impl { InitStream(); InitEigenDevice(); InitBlasHandle(); + InitBlasLtHandle(); InitDNNHandle(); InitSolverHandle(); InitSparseHandle(); @@ -183,6 +185,7 @@ struct GPUContext::Impl { InitGpuProperties(); InitStream(); InitBlasHandle(); + InitBlasLtHandle(); InitDNNHandle(); InitSolverHandle(); InitSparseHandle(); @@ -212,6 +215,7 @@ struct GPUContext::Impl { } #endif DestroyInternalBlasHandle(); + DestroyInternalBlasLtHandle(); DestoryInternalStream(); } @@ -418,6 +422,25 @@ struct GPUContext::Impl { void SetBlasHandle(blasHandle_t blas) { blas_handle_ = blas; } + void InitBlasLtHandle() { +#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 + phi::dynload::cublasLtCreate(&blaslt_handle_); +#endif + } + + void DestroyInternalBlasLtHandle() { +#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 + phi::dynload::cublasLtDestroy(blaslt_handle_); +#endif + } + + void SetBlasLtHandle(blasLtHandle_t blaslt) { blaslt_handle_ = blaslt; } + + blasLtHandle_t GetBlasLtHandle() const { + PD_CHECK(blaslt_handle_ != nullptr, "the gpu blasLt handle is nullptr."); + return blaslt_handle_; + } + void InitDNNHandle() { if (phi::dynload::HasCUDNN()) { #ifdef PADDLE_WITH_HIP @@ -679,6 +702,7 @@ struct GPUContext::Impl { blasHandle_t blas_handle_{nullptr}; blasHandle_t blas_tensor_core_handle_{nullptr}; blasHandle_t blas_tf32_tensor_core_handle_{nullptr}; + blasLtHandle_t blaslt_handle_{nullptr}; dnnHandle_t dnn_handle_{nullptr}; solverHandle_t solver_handle_{nullptr}; sparseHandle_t sparse_handle_{nullptr}; @@ -725,6 +749,10 @@ blasHandle_t GPUContext::cublas_handle() const { return impl_->GetBlasHandle(); } +blasLtHandle_t GPUContext::cublaslt_handle() const { + return impl_->GetBlasLtHandle(); +} + solverHandle_t GPUContext::cusolver_dn_handle() const { return impl_->GetSolverHandle(); } @@ -815,6 +843,10 @@ void GPUContext::SetBlasHandle(blasHandle_t blas) { impl_->SetBlasHandle(blas); } +void GPUContext::SetBlasLtHandle(blasLtHandle_t blaslt) { + impl_->SetBlasLtHandle(blaslt); +} + void GPUContext::SetDnnHandle(dnnHandle_t handle) { impl_->SetDnnHandle(handle); } diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index b9d843982dc..3eb4360ad35 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -1,4 +1,5 @@ /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Corporation. All rights reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -93,6 +94,9 @@ class GPUContext : public DeviceContext { /*! \brief Return cublas handle in the device context. */ blasHandle_t cublas_handle() const; + /*! \brief Return cublasLt handle in the device context. */ + blasLtHandle_t cublaslt_handle() const; + /*! \brief Return cusolver handle in the device context. 
*/ solverHandle_t cusolver_dn_handle() const; @@ -193,6 +197,8 @@ class GPUContext : public DeviceContext { void SetBlasHandle(blasHandle_t); + void SetBlasLtHandle(blasLtHandle_t); + void SetDnnHandle(dnnHandle_t); void SetSolverHandle(solverHandle_t); diff --git a/paddle/phi/backends/gpu/gpu_decls.h b/paddle/phi/backends/gpu/gpu_decls.h index 0be24392e1b..4a6b9d2fd87 100644 --- a/paddle/phi/backends/gpu/gpu_decls.h +++ b/paddle/phi/backends/gpu/gpu_decls.h @@ -1,4 +1,5 @@ // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 NVIDIA Corporation. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -59,6 +60,10 @@ DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t); DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle); +// TODO(Ming Huang): Since there is no blasLt handler, +// use rocblas_handle for workround. +DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); + DECLARE_TYPE_FOR_GPU(solverHandle_t, cusolverDnHandle_t, rocsolver_handle); DECLARE_TYPE_FOR_GPU(sparseHandle_t, cusparseHandle_t, rocsparse_handle); diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 1443eebf293..f8102ec4080 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -125,6 +125,17 @@ if(NOT WITH_GPU) LIST(REMOVE_ITEM TEST_OPS test_fused_attention_op) LIST(REMOVE_ITEM TEST_OPS test_fused_attention_op_api) LIST(REMOVE_ITEM TEST_OPS test_fused_transformer_encoder_layer) + LIST(REMOVE_ITEM TEST_OPS test_fused_gemm_epilogue_op) + LIST(REMOVE_ITEM TEST_OPS test_fused_gemm_epilogue_grad_op) + LIST(REMOVE_ITEM TEST_OPS test_fuse_gemm_epilogue_pass) +endif() + +if (WITH_GPU) + if (CUDA_VERSION LESS 11.6) + LIST(REMOVE_ITEM TEST_OPS test_fused_gemm_epilogue_op) + LIST(REMOVE_ITEM TEST_OPS test_fused_gemm_epilogue_grad_op) + LIST(REMOVE_ITEM TEST_OPS test_fuse_gemm_epilogue_pass) + endif() endif() if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) diff --git a/python/paddle/fluid/tests/unittests/test_fuse_gemm_epilogue_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_gemm_epilogue_pass.py new file mode 100644 index 00000000000..7f3180e21d8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fuse_gemm_epilogue_pass.py @@ -0,0 +1,392 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
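+# NOTE: these pass tests only run on a CUDA build with CUDA >= 11.6 (see the
+# CMakeLists.txt gate above). They compile the program with
+#     build_strategy = paddle.static.BuildStrategy()
+#     build_strategy.fuse_gemm_epilogue = True
+# and then count the fused_gemm_epilogue / fused_gemm_epilogue_grad nodes left
+# in the compiled graph to verify that the fusion actually fired.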
+"""Test cases for role makers.""" + +from __future__ import print_function +import paddle +import os +import unittest +import numpy as np +import paddle.fluid.core as core + + +def compare(ref, res, atol, rtol): + + ref = np.array(ref).flatten() + res = np.array(res).flatten() + + tmp_ref = ref.astype(np.float) + tol = atol + rtol * abs(tmp_ref) + + diff = abs(res - ref) + + indices = np.transpose(np.where(diff > tol)) + if len(indices) == 0: + return True + return False + + +def verify_node_count(graph, node_name, target_count): + count = 0 + for node in graph.nodes(): + if node.name() == node_name: + count += 1 + return count == target_count + + +class MultiFCLayer(paddle.nn.Layer): + def __init__(self, hidden, Activation): + super(MultiFCLayer, self).__init__() + self.linear1 = paddle.nn.Linear(hidden, hidden) + self.linear2 = paddle.nn.Linear(hidden, hidden) + self.linear3 = paddle.nn.Linear(hidden, hidden) + + self.relu1 = Activation() + self.relu2 = Activation() + self.relu3 = Activation() + + def forward(self, x, matmul_y, ele_y): + output = self.linear1(x) + output = self.relu1(output) + output = self.linear2(output) + + output1 = paddle.matmul(output, matmul_y) + output = self.linear3(output) + output = self.relu2(output) + + output = paddle.matmul(output, matmul_y) + output = paddle.add(output, ele_y) + output = self.relu3(output) + output = paddle.add(output, output1) + return output + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueFWDBase(unittest.TestCase): + def setUp(self): + self.batch = 64 + self.seqlen = 128 + self.hidden = 768 + + paddle.enable_static() + + self.main_prog = paddle.static.Program() + self.startup_prog = paddle.static.Program() + + with paddle.static.program_guard(self.main_prog, self.startup_prog): + data = paddle.static.data( + name="_data", + shape=[-1, self.seqlen, self.hidden], + dtype='float32') + matmul_y = paddle.static.data( + name="_matmul_y", + shape=[1, self.hidden, self.hidden], + dtype='float32') + ele_y = paddle.static.data( + name="_ele_y", shape=[self.hidden, ], dtype='float32') + + multi_layer = MultiFCLayer(self.hidden, self._get_act_type()[0]) + with paddle.static.amp.fp16_guard(): + out = multi_layer(data, matmul_y, ele_y) + self.loss = paddle.mean(out) + + self.data_arr = np.random.random( + (self.batch, self.seqlen, self.hidden)).astype("float32") - 0.5 + self.matmul_y_arr = np.random.random( + (1, self.hidden, self.hidden)).astype("float32") - 0.5 + self.ele_y_arr = np.random.random( + (self.hidden, )).astype("float32") - 0.5 + + self.place = paddle.CUDAPlace(0) + self.exe = paddle.static.Executor(self.place) + self.exe.run(self.startup_prog) + + self._pre_test_hooks() + + self.feed = { + "_data": self.data_arr, + "_matmul_y": self.matmul_y_arr, + "_ele_y": self.ele_y_arr + } + self.reference = self.exe.run(self.main_prog, + feed=self.feed, + fetch_list=[self.loss.name]) + + @unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") + def _test_output(self): + build_strategy = paddle.static.BuildStrategy() + build_strategy.fuse_gemm_epilogue = True + program = paddle.static.CompiledProgram(self.main_prog) + program = program.with_data_parallel( + loss_name=self.loss.name, + build_strategy=build_strategy, + places=paddle.static.cuda_places()) + + result = self.exe.run(program, + feed=self.feed, + fetch_list=[self.loss.name]) + self.assertTrue( + compare(self.reference, result, self.atol, self.rtol), + "[{}] outputs are 
miss-matched.".format(type(self).__name__)) + self.assertTrue( + verify_node_count(program._graph, "fused_gemm_epilogue", 3), + "[{}] The number of fused_gemm_epilogue is miss-matched in the computing graph.". + format(type(self).__name__)) + act_fwd_name = self._get_act_type()[1] + self.assertTrue( + verify_node_count(program._graph, act_fwd_name, 1), + "[{}] The number of {} is miss-matched in the computing graph.". + format(type(self).__name__, act_fwd_name)) + + def _pre_test_hooks(self): + self.atol = 1e-4 + self.rtol = 1e-3 + + def _get_act_type(self): + return paddle.nn.ReLU, "relu" + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueReluFWDFP32(TestFuseGemmEpilogueFWDBase): + def _pre_test_hooks(self): + self.atol = 1e-3 + self.rtol = 1e-2 + + def _get_act_type(self): + return paddle.nn.ReLU, "relu" + + def test_output(self): + self._test_output() + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueReluFWDFP16(TestFuseGemmEpilogueReluFWDFP32): + def _pre_test_hooks(self): + self.atol = 1e-3 + self.rtol = 1e-2 + + fp16_var_list = paddle.static.amp.cast_model_to_fp16(self.main_prog) + paddle.static.amp.cast_parameters_to_fp16( + self.place, self.main_prog, to_fp16_var_names=fp16_var_list) + + self.data_arr = self.data_arr.astype("float16") + self.matmul_y_arr = self.matmul_y_arr.astype("float16") + self.ele_y_arr = self.ele_y_arr.astype("float16") + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGeluFWDFP32(TestFuseGemmEpilogueFWDBase): + def _pre_test_hooks(self): + self.atol = 1e-4 + self.rtol = 1e-3 + + def _get_act_type(self): + return paddle.nn.GELU, "gelu" + + def test_output(self): + self._test_output() + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGeluFWDFP16(TestFuseGemmEpilogueGeluFWDFP32): + def _pre_test_hooks(self): + self.atol = 1e-3 + self.rtol = 1e-2 + + fp16_var_list = paddle.static.amp.cast_model_to_fp16(self.main_prog) + paddle.static.amp.cast_parameters_to_fp16( + self.place, self.main_prog, to_fp16_var_names=fp16_var_list) + + self.data_arr = self.data_arr.astype("float16") + self.matmul_y_arr = self.matmul_y_arr.astype("float16") + self.ele_y_arr = self.ele_y_arr.astype("float16") + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueBWDBase(unittest.TestCase): + def setUp(self): + self.batch = 64 + self.seqlen = 128 + self.hidden = 768 + + paddle.enable_static() + + self.main_prog = paddle.static.Program() + self.startup_prog = paddle.static.Program() + + with paddle.static.program_guard(self.main_prog, self.startup_prog): + data = paddle.static.data( + name="_data", + shape=[-1, self.seqlen, self.hidden], + dtype='float32') + matmul_y = paddle.static.data( + name="_matmul_y", + shape=[1, self.hidden, self.hidden], + dtype='float32') + ele_y = paddle.static.data( + name="_ele_y", shape=[self.hidden, ], dtype='float32') + + multi_layer = MultiFCLayer(self.hidden, self._get_act_type()[0]) + with paddle.static.amp.fp16_guard(): + out = multi_layer(data, matmul_y, ele_y) + self.loss = paddle.mean(out) + paddle.static.append_backward(loss=self.loss) + + self.data_arr = np.random.random( + (self.batch, self.seqlen, self.hidden)).astype("float32") - 0.5 + self.matmul_y_arr = np.random.random( + (1, self.hidden, 
self.hidden)).astype("float32") - 0.5 + self.ele_y_arr = np.random.random( + (self.hidden, )).astype("float32") - 0.5 + + self.place = paddle.CUDAPlace(0) + self.exe = paddle.static.Executor(self.place) + self.exe.run(self.startup_prog) + + self._pre_test_hooks() + + self.feed = { + "_data": self.data_arr, + "_matmul_y": self.matmul_y_arr, + "_ele_y": self.ele_y_arr + } + + self.fetch = [ + self.loss.name, + '{}.w_0@GRAD'.format(multi_layer.linear1.full_name()), + '{}.b_0@GRAD'.format(multi_layer.linear1.full_name()), + '{}.w_0@GRAD'.format(multi_layer.linear2.full_name()), + '{}.b_0@GRAD'.format(multi_layer.linear2.full_name()), + '{}.w_0@GRAD'.format(multi_layer.linear3.full_name()), + '{}.b_0@GRAD'.format(multi_layer.linear3.full_name()) + ] + self.outs_ref = self.exe.run(self.main_prog, + feed=self.feed, + fetch_list=self.fetch) + + @unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") + def _test_output(self): + build_strategy = paddle.static.BuildStrategy() + build_strategy.fuse_gemm_epilogue = True + program = paddle.static.CompiledProgram(self.main_prog) + program = program.with_data_parallel( + loss_name=self.loss.name, + build_strategy=build_strategy, + places=paddle.static.cuda_places()) + + outs_res = self.exe.run(program, feed=self.feed, fetch_list=self.fetch) + + for ref, res in zip(self.outs_ref, outs_res): + self.assertTrue( + compare(ref, res, self.atol, self.rtol), + "[{}] output is miss-matched.".format(type(self).__name__)) + + self.assertTrue( + verify_node_count(program._graph, "fused_gemm_epilogue", 3), + "[{}] The number of fused_gemm_epilogue is miss-matched in the computing graph.". + format(type(self).__name__)) + self.assertTrue( + verify_node_count(program._graph, "fused_gemm_epilogue_grad", 3), + "[{}] The number of fused_gemm_epilogue_grad is miss-matched in the computing graph.". + format(type(self).__name__)) + _, act_fwd_name, act_bwd_name = self._get_act_type() + self.assertTrue( + verify_node_count(program._graph, act_fwd_name, 1), + "[{}] The number of {} is miss-matched in the computing graph.". + format(type(self).__name__, act_fwd_name)) + self.assertTrue( + verify_node_count(program._graph, act_bwd_name, 2), + "[{}] The number of {} is miss-matched in the computing graph.". 
+ format(type(self).__name__, act_bwd_name)) + + def _pre_test_hooks(self): + self.atol = 1e-4 + self.rtol = 1e-3 + + def _get_act_type(self): + return paddle.nn.ReLU, "relu", "relu_grad" + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueReLUBWDFP32(TestFuseGemmEpilogueBWDBase): + def _pre_test_hooks(self): + self.atol = 1e-4 + self.rtol = 1e-3 + + def _get_act_type(self): + return paddle.nn.ReLU, "relu", "relu_grad" + + def test_output(self): + self._test_output() + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueReLUBWDFP16(TestFuseGemmEpilogueReLUBWDFP32): + def _pre_test_hooks(self): + self.atol = 1e-3 + self.rtol = 1e-2 + + fp16_var_list = paddle.static.amp.cast_model_to_fp16(self.main_prog) + paddle.static.amp.cast_parameters_to_fp16( + self.place, self.main_prog, to_fp16_var_names=fp16_var_list) + + self.data_arr = self.data_arr.astype("float16") + self.matmul_y_arr = self.matmul_y_arr.astype("float16") + self.ele_y_arr = self.ele_y_arr.astype("float16") + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGeLUBWDFP32(TestFuseGemmEpilogueBWDBase): + def _pre_test_hooks(self): + self.atol = 5e-4 + self.rtol = 1e-3 + + def _get_act_type(self): + return paddle.nn.GELU, "gelu", "gelu_grad" + + def test_output(self): + self._test_output() + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGeLUBWDFP16(TestFuseGemmEpilogueGeLUBWDFP32): + def _pre_test_hooks(self): + self.atol = 1e-3 + self.rtol = 1e-2 + + fp16_var_list = paddle.static.amp.cast_model_to_fp16(self.main_prog) + paddle.static.amp.cast_parameters_to_fp16( + self.place, self.main_prog, to_fp16_var_names=fp16_var_list) + + self.data_arr = self.data_arr.astype("float16") + self.matmul_y_arr = self.matmul_y_arr.astype("float16") + self.ele_y_arr = self.ele_y_arr.astype("float16") + + +if __name__ == "__main__": + np.random.seed(0) + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_grad_op.py b/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_grad_op.py new file mode 100644 index 00000000000..2ea1bf2e9cb --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_grad_op.py @@ -0,0 +1,239 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
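+# Reference gradients checked in this file (activation 'none', Out = X.Y + Bias):
+#   DX = DOut . Y^T,  DY = X^T . DOut,  DBias = sum(DOut, axis=0)
+# get_outputs() below encodes exactly these formulas in NumPy.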
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core +from op_test import OpTest, skip_check_grad_ci + + +def get_outputs(DOut, X, Y): + DX = np.dot(DOut, Y.T) + DY = np.dot(X.T, DOut) + DBias = np.sum(DOut, axis=0) + + return DX, DY, DBias + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGradOpDXYBiasFP16(OpTest): + def setUp(self): + self.op_type = "fused_gemm_epilogue_grad" + self.place = core.CUDAPlace(0) + self.init_dtype_type() + + self.inputs = { + 'DOut': np.random.random((8, 128)).astype(self.dtype) - 0.5, + 'X': np.random.random((8, 4)).astype(self.dtype) - 0.5, + 'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5 + } + + self.attrs = {"activation": 'none'} + + DX, DY, DBias = get_outputs(self.inputs['DOut'], self.inputs['X'], + self.inputs['Y']) + self.outputs = {'DX': DX, 'DY': DY, 'DBias': DBias} + + def init_dtype_type(self): + self.dtype = np.float16 + self.atol = 1e-3 + + def test_check_output(self): + if self.dtype == np.float16 and not core.is_float16_supported( + self.place): + return + self.check_output_with_place(self.place, atol=self.atol) + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGradOpDXYBiasFP32( + TestFuseGemmEpilogueGradOpDXYBiasFP16): + def init_dtype_type(self): + self.dtype = np.single + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGradOpDXYBiasFP64( + TestFuseGemmEpilogueGradOpDXYBiasFP16): + def init_dtype_type(self): + self.dtype = np.double + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGradOpDYBiasFP16(OpTest): + def setUp(self): + self.op_type = "fused_gemm_epilogue_grad" + self.place = core.CUDAPlace(0) + self.init_dtype_type() + + self.inputs = { + 'DOut': np.random.random((8, 128)).astype(self.dtype) - 0.5, + 'X': np.random.random((8, 4)).astype(self.dtype) - 0.5, + 'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5 + } + + self.attrs = {"activation": 'none'} + + _, DY, DBias = get_outputs(self.inputs['DOut'], self.inputs['X'], + self.inputs['Y']) + self.outputs = {'DY': DY, 'DBias': DBias} + + def init_dtype_type(self): + self.dtype = np.float16 + self.atol = 1e-3 + + def test_check_output(self): + if self.dtype == np.float16 and not core.is_float16_supported( + self.place): + return + self.check_output_with_place(self.place, atol=self.atol) + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGradOpDYBiasFP32( + TestFuseGemmEpilogueGradOpDYBiasFP16): + def init_dtype_type(self): + self.dtype = np.single + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGradOpDYBiasFP64( + TestFuseGemmEpilogueGradOpDYBiasFP16): + def init_dtype_type(self): + self.dtype = np.double + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class 
TestFuseGemmEpilogueGradOpDYFP16(OpTest): + def setUp(self): + self.op_type = "fused_gemm_epilogue_grad" + self.place = core.CUDAPlace(0) + self.init_dtype_type() + + self.inputs = { + 'DOut': np.random.random((8, 128)).astype(self.dtype) - 0.5, + 'X': np.random.random((8, 4)).astype(self.dtype) - 0.5, + 'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5 + } + + self.attrs = {"activation": 'none'} + + _, DY, _ = get_outputs(self.inputs['DOut'], self.inputs['X'], + self.inputs['Y']) + self.outputs = {'DY': DY} + + def init_dtype_type(self): + self.dtype = np.float16 + self.atol = 1e-3 + + def test_check_output(self): + if self.dtype == np.float16 and not core.is_float16_supported( + self.place): + return + self.check_output_with_place(self.place, atol=self.atol) + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGradOpDYFP32(TestFuseGemmEpilogueGradOpDYFP16): + def init_dtype_type(self): + self.dtype = np.single + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGradOpDYFP64(TestFuseGemmEpilogueGradOpDYFP16): + def init_dtype_type(self): + self.dtype = np.double + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGradOpDXYFP16(OpTest): + def setUp(self): + self.op_type = "fused_gemm_epilogue_grad" + self.place = core.CUDAPlace(0) + self.init_dtype_type() + + self.inputs = { + 'DOut': np.random.random((8, 128)).astype(self.dtype) - 0.5, + 'X': np.random.random((8, 4)).astype(self.dtype) - 0.5, + 'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5 + } + + self.attrs = {"activation": 'none'} + + DX, DY, _ = get_outputs(self.inputs['DOut'], self.inputs['X'], + self.inputs['Y']) + self.outputs = {'DX': DX, 'DY': DY} + + def init_dtype_type(self): + self.dtype = np.float16 + self.atol = 1e-3 + + def test_check_output(self): + if self.dtype == np.float16 and not core.is_float16_supported( + self.place): + return + self.check_output_with_place(self.place, atol=self.atol) + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGradOpDXYFP32(TestFuseGemmEpilogueGradOpDXYFP16): + def init_dtype_type(self): + self.dtype = np.single + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGradOpDXYFP64(TestFuseGemmEpilogueGradOpDXYFP16): + def init_dtype_type(self): + self.dtype = np.double + self.atol = 1e-6 + + +if __name__ == "__main__": + np.random.seed(0) + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_op.py b/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_op.py new file mode 100644 index 00000000000..f826898f9e5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_op.py @@ -0,0 +1,450 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core +from op_test import OpTest, skip_check_grad_ci + + +def gelu(x): + y_ref = 0.5 * x * ( + 1.0 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3)))) + return y_ref.astype(x.dtype) + + +def relu(x): + mask = x > 0 + return x * mask + + +def get_output(X, Y, bias, act): + out = np.dot(X, Y) + bias + if act == 'relu': + return relu(out) + elif act == 'gelu': + return gelu(out) + else: + return out + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMMFP16(OpTest): + def setUp(self): + self.op_type = "fused_gemm_epilogue" + self.place = core.CUDAPlace(0) + self.init_dtype_type() + + self.inputs = { + 'X': np.random.random((8, 4)).astype(self.dtype) - 0.5, + 'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5, + 'Bias': np.random.random((128, )).astype(self.dtype) - 0.5 + } + self.outputs = { + 'Out': get_output(self.inputs['X'], self.inputs['Y'], + self.inputs['Bias'], 'relu') + } + self.attrs = {"activation": 'relu'} + + def init_dtype_type(self): + self.dtype = np.float16 + self.atol = 1e-3 + + def test_check_output(self): + if self.dtype == np.float16 and not core.is_float16_supported( + self.place): + return + self.check_output_with_place(self.place, atol=self.atol) + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMMFP32(TestFuseGemmEpilogueOpReluMMFP16): + def init_dtype_type(self): + self.dtype = np.single + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMMFP64(TestFuseGemmEpilogueOpReluMMFP16): + def init_dtype_type(self): + self.dtype = np.double + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMTMFP16(OpTest): + def setUp(self): + self.op_type = "fused_gemm_epilogue" + self.place = core.CUDAPlace(0) + self.init_dtype_type() + + self.inputs = { + 'X': np.random.random((4, 8)).astype(self.dtype) - 0.5, + 'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5, + 'Bias': np.random.random((128, )).astype(self.dtype) - 0.5 + } + self.outputs = { + 'Out': get_output(self.inputs['X'].T, self.inputs['Y'], + self.inputs['Bias'], 'relu') + } + self.attrs = {'trans_x': True, "activation": 'relu'} + + def init_dtype_type(self): + self.dtype = np.float16 + self.atol = 1e-3 + + def test_check_output(self): + if self.dtype == np.float16 and not core.is_float16_supported( + self.place): + return + self.check_output_with_place(self.place, atol=self.atol) + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class 
TestFuseGemmEpilogueOpReluMTMFP32(TestFuseGemmEpilogueOpReluMTMFP16): + def init_dtype_type(self): + self.dtype = np.single + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMTMFP64(TestFuseGemmEpilogueOpReluMTMFP16): + def init_dtype_type(self): + self.dtype = np.double + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMMTFP16(OpTest): + def setUp(self): + self.op_type = "fused_gemm_epilogue" + self.place = core.CUDAPlace(0) + self.init_dtype_type() + + self.inputs = { + 'X': np.random.random((8, 4)).astype(self.dtype) - 0.5, + 'Y': np.random.random((128, 4)).astype(self.dtype) - 0.5, + 'Bias': np.random.random((128, )).astype(self.dtype) - 0.5 + } + self.outputs = { + 'Out': get_output(self.inputs['X'], self.inputs['Y'].T, + self.inputs['Bias'], 'relu') + } + self.attrs = {'trans_y': True, "activation": 'relu'} + + def init_dtype_type(self): + self.dtype = np.float16 + self.atol = 1e-3 + + def test_check_output(self): + if self.dtype == np.float16 and not core.is_float16_supported( + self.place): + return + self.check_output_with_place(self.place, atol=self.atol) + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMMTFP32(TestFuseGemmEpilogueOpReluMMTFP16): + def init_dtype_type(self): + self.dtype = np.single + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMMTFP64(TestFuseGemmEpilogueOpReluMMTFP16): + def init_dtype_type(self): + self.dtype = np.double + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMTMTFP16(OpTest): + def setUp(self): + self.op_type = "fused_gemm_epilogue" + self.place = core.CUDAPlace(0) + self.init_dtype_type() + + self.inputs = { + 'X': np.random.random((4, 8)).astype(self.dtype) - 0.5, + 'Y': np.random.random((128, 4)).astype(self.dtype) - 0.5, + 'Bias': np.random.random((128, )).astype(self.dtype) - 0.5 + } + self.outputs = { + 'Out': get_output(self.inputs['X'].T, self.inputs['Y'].T, + self.inputs['Bias'], 'relu') + } + self.attrs = {'trans_x': True, 'trans_y': True, "activation": 'relu'} + + def init_dtype_type(self): + self.dtype = np.float16 + self.atol = 1e-3 + + def test_check_output(self): + if self.dtype == np.float16 and not core.is_float16_supported( + self.place): + return + self.check_output_with_place(self.place, atol=self.atol) + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMTMTFP32(TestFuseGemmEpilogueOpReluMTMTFP16): + def init_dtype_type(self): + self.dtype = np.single + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMTMTFP64(TestFuseGemmEpilogueOpReluMTMTFP16): + def init_dtype_type(self): + self.dtype = np.double + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not 
compiled with CUDA") +class TestFuseGemmEpilogueOpReluMMFP16MultiDimX(OpTest): + def setUp(self): + self.op_type = "fused_gemm_epilogue" + self.place = core.CUDAPlace(0) + self.init_dtype_type() + + self.inputs = { + 'X': np.random.random((2, 2, 8, 4)).astype(self.dtype) - 0.5, + 'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5, + 'Bias': np.random.random((128, )).astype(self.dtype) - 0.5 + } + self.outputs = { + 'Out': get_output(self.inputs['X'].reshape( + (-1, 4)), self.inputs['Y'], self.inputs['Bias'], + 'relu').reshape((2, 2, 8, 128)) + } + self.attrs = {"activation": 'relu'} + + def init_dtype_type(self): + self.dtype = np.float16 + self.atol = 1e-3 + + def test_check_output(self): + if self.dtype == np.float16 and not core.is_float16_supported( + self.place): + return + self.check_output_with_place(self.place, atol=self.atol) + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMMFP32MultiDimX( + TestFuseGemmEpilogueOpReluMMFP16MultiDimX): + def init_dtype_type(self): + self.dtype = np.single + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMMFP64MultiDimX( + TestFuseGemmEpilogueOpReluMMFP16MultiDimX): + def init_dtype_type(self): + self.dtype = np.double + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMTMFP16MultiDimX(OpTest): + def setUp(self): + self.op_type = "fused_gemm_epilogue" + self.place = core.CUDAPlace(0) + self.init_dtype_type() + + self.inputs = { + 'X': np.random.random((4, 2, 2, 8)).astype(self.dtype) - 0.5, + 'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5, + 'Bias': np.random.random((128, )).astype(self.dtype) - 0.5 + } + self.outputs = { + 'Out': get_output(self.inputs['X'].reshape( + (4, -1)).T, self.inputs['Y'], self.inputs['Bias'], + 'relu').reshape((2, 2, 8, 128)) + } + self.attrs = {'trans_x': True, "activation": 'relu'} + + def init_dtype_type(self): + self.dtype = np.float16 + self.atol = 1e-3 + + def test_check_output(self): + if self.dtype == np.float16 and not core.is_float16_supported( + self.place): + return + self.check_output_with_place(self.place, atol=self.atol) + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMTMFP32MultiDimX( + TestFuseGemmEpilogueOpReluMTMFP16MultiDimX): + def init_dtype_type(self): + self.dtype = np.single + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMTMFP64MultiDimX( + TestFuseGemmEpilogueOpReluMTMFP16MultiDimX): + def init_dtype_type(self): + self.dtype = np.double + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpGeluMMFP16(OpTest): + def setUp(self): + self.op_type = "fused_gemm_epilogue" + self.place = core.CUDAPlace(0) + self.init_dtype_type() + + self.inputs = { + 'X': np.random.random((8, 4)).astype(self.dtype) - 0.5, + 'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5, + 'Bias': np.random.random((128, )).astype(self.dtype) - 0.5 + 
} + + self.attrs = {"activation": 'gelu'} + + self.outputs = { + 'Out': get_output(self.inputs['X'], self.inputs['Y'], + self.inputs['Bias'], 'gelu') + } + + def init_dtype_type(self): + self.dtype = np.float16 + self.atol = 1e-3 + + def test_check_output(self): + if self.dtype == np.float16 and not core.is_float16_supported( + self.place): + return + self.check_output_with_place(self.place, atol=self.atol) + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpGeluMMFP32(TestFuseGemmEpilogueOpGeluMMFP16): + def init_dtype_type(self): + self.dtype = np.single + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpGeluMMFP64(TestFuseGemmEpilogueOpGeluMMFP16): + def init_dtype_type(self): + self.dtype = np.double + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpNoneMMFP16(OpTest): + def setUp(self): + self.op_type = "fused_gemm_epilogue" + self.place = core.CUDAPlace(0) + self.init_dtype_type() + + self.inputs = { + 'X': np.random.random((8, 4)).astype(self.dtype) - 0.5, + 'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5, + 'Bias': np.random.random((128, )).astype(self.dtype) - 0.5 + } + + self.attrs = {"activation": 'none'} + + self.outputs = { + 'Out': get_output(self.inputs['X'], self.inputs['Y'], + self.inputs['Bias'], 'none') + } + + def init_dtype_type(self): + self.dtype = np.float16 + self.atol = 1e-3 + + def test_check_output(self): + if self.dtype == np.float16 and not core.is_float16_supported( + self.place): + return + self.check_output_with_place(self.place, atol=self.atol) + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpNoneMMFP32(TestFuseGemmEpilogueOpNoneMMFP16): + def init_dtype_type(self): + self.dtype = np.single + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpNoneMMFP64(TestFuseGemmEpilogueOpNoneMMFP16): + def init_dtype_type(self): + self.dtype = np.double + self.atol = 1e-6 + + +if __name__ == "__main__": + np.random.seed(0) + unittest.main() diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 7356f0c8db0..365047f7e83 100755 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -1,4 +1,5 @@ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -729,4 +730,6 @@ STATIC_MODE_TESTING_LIST = [ 'test_lu_op', 'test_margin_cross_entropy_op', 'test_pull_gpups_sparse_op', + 'test_fused_gemm_epilogue_op', + 'test_fused_gemm_epilogue_grad_op', ] -- GitLab From 1c29196e8de08edc18dbfc6c77ebcd22e595e1fd Mon Sep 17 00:00:00 2001 From: 0x45f <23097963+0x45f@users.noreply.github.com> Date: Mon, 7 Mar 2022 16:43:29 +0800 Subject: [PATCH 089/261] [Phi]Move bincount OP to phi (#39947) * move bincount OP to phi * fix dtype * set_dtype by weights or x * fix conflicts --- paddle/fluid/operators/bincount_op.cc | 62 ++------ paddle/fluid/operators/bincount_op.cu | 162 --------------------- paddle/fluid/operators/bincount_op.h | 109 -------------- paddle/phi/infermeta/binary.cc | 50 +++++++ paddle/phi/infermeta/binary.h | 4 + paddle/phi/kernels/bincount_kernel.h | 28 ++++ paddle/phi/kernels/cpu/bincount_kernel.cc | 106 ++++++++++++++ paddle/phi/kernels/gpu/bincount_kernel.cu | 164 ++++++++++++++++++++++ paddle/phi/ops/compat/bincount_sig.cc | 25 ++++ 9 files changed, 386 insertions(+), 324 deletions(-) delete mode 100644 paddle/fluid/operators/bincount_op.cu delete mode 100644 paddle/fluid/operators/bincount_op.h create mode 100644 paddle/phi/kernels/bincount_kernel.h create mode 100644 paddle/phi/kernels/cpu/bincount_kernel.cc create mode 100644 paddle/phi/kernels/gpu/bincount_kernel.cu create mode 100644 paddle/phi/ops/compat/bincount_sig.cc diff --git a/paddle/fluid/operators/bincount_op.cc b/paddle/fluid/operators/bincount_op.cc index b37334a14ba..062e7d510d5 100644 --- a/paddle/fluid/operators/bincount_op.cc +++ b/paddle/fluid/operators/bincount_op.cc @@ -12,12 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/bincount_op.h" - #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -28,51 +31,6 @@ class BincountOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of BincountOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of BincountOp should not be null.")); - - auto input_dim = ctx->GetInputDim("X"); - auto minlength = ctx->Attrs().Get("minlength"); - - PADDLE_ENFORCE_GE(minlength, 0, - platform::errors::InvalidArgument( - "The minlength should be greater than or equal to 0." - "But received minlength is %d", - minlength)); - - PADDLE_ENFORCE_EQ(input_dim.size(), 1, - platform::errors::InvalidArgument( - "The 'shape' of Input(X) must be 1-D tensor." - "But the dimension of Input(X) is [%d]", - input_dim.size())); - - if (ctx->HasInput("Weights")) { - auto weights_dim = ctx->GetInputDim("Weights"); - PADDLE_ENFORCE_EQ(weights_dim.size(), 1, - platform::errors::InvalidArgument( - "The 'shape' of Input(Weights) must be 1-D tensor." - "But the dimension of Input(Weights) is [%d]", - weights_dim.size())); - - PADDLE_ENFORCE_EQ( - weights_dim[0], input_dim[0], - platform::errors::InvalidArgument( - "The 'shape' of Input(Weights) must be equal to the 'shape' of " - "Input(X)." 
- "But received: the 'shape' of Input(Weights) is [%s]," - "the 'shape' of Input(X) is [%s]", - weights_dim, input_dim)); - } - - ctx->SetOutputDim("Out", phi::make_ddim({-1})); - ctx->ShareLoD("X", /*->*/ "Out"); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const { auto data_type = @@ -105,12 +63,10 @@ class BincountOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(bincount, BincountInferShapeFunctor, + PD_INFER_META(phi::BincountInferMeta)); REGISTER_OPERATOR( bincount, ops::BincountOp, ops::BincountOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - bincount, ops::BincountKernel, - ops::BincountKernel, - ops::BincountKernel, - ops::BincountKernel); + paddle::framework::EmptyGradOpMaker, + BincountInferShapeFunctor); diff --git a/paddle/fluid/operators/bincount_op.cu b/paddle/fluid/operators/bincount_op.cu deleted file mode 100644 index cc576d0af92..00000000000 --- a/paddle/fluid/operators/bincount_op.cu +++ /dev/null @@ -1,162 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/bincount_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using platform::PADDLE_CUDA_NUM_THREADS; - -inline int GET_BLOCKS(const int N) { - return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; -} - -template -__global__ void KernelBincount(const InputT* input, const int total_elements, - const bool has_weights, const T* weights, - OutT* output) { - if (!has_weights) { - for (int i = threadIdx.x; i < total_elements; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&output[input[i]], 1L); - } - } else { - for (int i = threadIdx.x; i < total_elements; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&output[input[i]], - static_cast(weights[i])); - } - } -} - -template -void BincountCUDAInner(const framework::ExecutionContext& context) { - const Tensor* input = context.Input("X"); - const Tensor* weights = context.Input("Weights"); - Tensor* output = context.Output("Out"); - auto& minlength = context.Attr("minlength"); - - const InputT* input_data = input->data(); - - const int input_numel = input->numel(); - - if (input_data == nullptr) { - framework::DDim out_dim{0}; - output->Resize(out_dim); - output->mutable_data(context.GetPlace()); - return; - } - auto input_x = framework::EigenVector::Flatten(*input); - - framework::Tensor input_min_t, input_max_t; - auto* input_max_data = - input_max_t.mutable_data({1}, context.GetPlace()); - auto* input_min_data = - input_min_t.mutable_data({1}, context.GetPlace()); - - auto input_max_scala = 
framework::EigenScalar::From(input_max_t); - auto input_min_scala = framework::EigenScalar::From(input_min_t); - - auto* place = context.template device_context().eigen_device(); - input_max_scala.device(*place) = input_x.maximum(); - input_min_scala.device(*place) = input_x.minimum(); - - Tensor input_min_cpu, input_max_cpu; - paddle::framework::TensorCopySync(input_max_t, platform::CPUPlace(), - &input_max_cpu); - paddle::framework::TensorCopySync(input_min_t, platform::CPUPlace(), - &input_min_cpu); - - InputT input_min = input_min_cpu.data()[0]; - - PADDLE_ENFORCE_GE( - input_min, static_cast(0), - platform::errors::InvalidArgument( - "The elements in input tensor must be non-negative ints")); - - int64_t output_size = - static_cast(input_max_cpu.data()[0]) + 1L; - - output_size = std::max(output_size, static_cast(minlength)); - framework::DDim out_dim{output_size}; - output->Resize(out_dim); - - bool has_weights = (weights != nullptr); - - const T* weights_data = has_weights ? weights->data() : nullptr; - - auto stream = - context.template device_context().stream(); - - if (!has_weights) { - int64_t* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, 0L); - - KernelBincount<<>>( - input_data, input_numel, has_weights, weights_data, output_data); - } else { - const auto& weights_type = framework::TransToProtoVarType(weights->dtype()); - - if (weights_type == framework::proto::VarType::FP32) { - float* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - - KernelBincount<<>>( - input_data, input_numel, has_weights, weights_data, output_data); - } else { - double* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - - KernelBincount<<>>( - input_data, input_numel, has_weights, weights_data, output_data); - } - } -} - -template -class BincountCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("X"); - const auto& input_type = framework::TransToProtoVarType(input->dtype()); - - if (input_type == framework::proto::VarType::INT32) { - BincountCUDAInner(context); - } else if (input_type == framework::proto::VarType::INT64) { - BincountCUDAInner(context); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - bincount, ops::BincountCUDAKernel, - ops::BincountCUDAKernel, - ops::BincountCUDAKernel, - ops::BincountCUDAKernel); diff --git a/paddle/fluid/operators/bincount_op.h b/paddle/fluid/operators/bincount_op.h deleted file mode 100644 index 84256bf78e4..00000000000 --- a/paddle/fluid/operators/bincount_op.h +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -void BincountInner(const framework::ExecutionContext& context) { - const Tensor* input = context.Input("X"); - const Tensor* weights = context.Input("Weights"); - Tensor* output = context.Output("Out"); - auto& minlength = context.Attr("minlength"); - - const InputT* input_data = input->data(); - - auto input_numel = input->numel(); - - if (input_data == nullptr) { - framework::DDim out_dim{0}; - output->Resize(out_dim); - output->mutable_data(context.GetPlace()); - return; - } - - PADDLE_ENFORCE_GE( - *std::min_element(input_data, input_data + input_numel), - static_cast(0), - platform::errors::InvalidArgument( - "The elements in input tensor must be non-negative ints")); - - int64_t output_size = static_cast(*std::max_element( - input_data, input_data + input_numel)) + - 1L; - output_size = std::max(output_size, static_cast(minlength)); - - framework::DDim out_dim{output_size}; - output->Resize(out_dim); - - bool has_weights = (weights != nullptr); - - if (has_weights) { - const T* weights_data = weights->data(); - const auto& weights_type = framework::TransToProtoVarType(weights->dtype()); - if (weights_type == framework::proto::VarType::FP32) { - float* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - for (int64_t i = 0; i < input_numel; i++) { - output_data[input_data[i]] += static_cast(weights_data[i]); - } - } else { - double* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - for (int64_t i = 0; i < input_numel; i++) { - output_data[input_data[i]] += static_cast(weights_data[i]); - } - } - - } else { - int64_t* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, 0L); - for (int64_t i = 0; i < input_numel; i++) { - output_data[input_data[i]] += 1L; - } - } -} - -template -class BincountKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("X"); - const auto& input_type = framework::TransToProtoVarType(input->dtype()); - - if (input_type == framework::proto::VarType::INT32) { - BincountInner(context); - } else if (input_type == framework::proto::VarType::INT64) { - BincountInner(context); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 94b489906c6..55230aa8d05 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -456,6 +456,56 @@ void BCELossInferMeta(const MetaTensor& input, out->share_lod(input); } +void BincountInferMeta(const MetaTensor& x, + const paddle::optional weights, + int minlength, + MetaTensor* out) { + auto input_dim = x.dims(); + + PADDLE_ENFORCE_GE(minlength, + 0, + phi::errors::InvalidArgument( + "The minlength should be greater than or equal to 0." + "But received minlength is %d", + minlength)); + + PADDLE_ENFORCE_EQ( + input_dim.size(), + 1, + phi::errors::InvalidArgument("The 'shape' of Input(X) must be 1-D tensor." 
+ "But the dimension of Input(X) is [%d]", + input_dim.size())); + + if (weights.is_initialized()) { + auto weights_dim = weights->dims(); + PADDLE_ENFORCE_EQ(weights_dim.size(), + 1, + phi::errors::InvalidArgument( + "The 'shape' of Input(Weights) must be 1-D tensor." + "But the dimension of Input(Weights) is [%d]", + weights_dim.size())); + + PADDLE_ENFORCE_EQ( + weights_dim[0], + input_dim[0], + phi::errors::InvalidArgument( + "The 'shape' of Input(Weights) must be equal to the 'shape' of " + "Input(X)." + "But received: the 'shape' of Input(Weights) is [%s]," + "the 'shape' of Input(X) is [%s]", + weights_dim, + input_dim)); + } + out->set_dims(phi::make_ddim({-1})); + if (weights.is_initialized()) { + out->set_dtype(weights->dtype()); + } else { + out->set_dtype(x.dtype()); + } + + out->share_lod(x); +} + void DistInferMeta(const MetaTensor& x, const MetaTensor& y, float p, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index caf9185c900..106c22f7548 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -85,6 +85,10 @@ void BCELossInferMeta(const MetaTensor& input, MetaTensor* out, MetaConfig config = MetaConfig()); +void BincountInferMeta(const MetaTensor& x, + const paddle::optional weights, + int minlength, + MetaTensor* out); void DistInferMeta(const MetaTensor& x, const MetaTensor& y, float p, diff --git a/paddle/phi/kernels/bincount_kernel.h b/paddle/phi/kernels/bincount_kernel.h new file mode 100644 index 00000000000..3ba69d36548 --- /dev/null +++ b/paddle/phi/kernels/bincount_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BincountKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional weights, + int minlength, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/bincount_kernel.cc b/paddle/phi/kernels/cpu/bincount_kernel.cc new file mode 100644 index 00000000000..c9dc44c1e04 --- /dev/null +++ b/paddle/phi/kernels/cpu/bincount_kernel.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/bincount_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void BincountInner(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional weights, + int minlength, + DenseTensor* out) { + const DenseTensor* input = &x; + DenseTensor* output = out; + const InputT* input_data = input->data(); + + auto input_numel = input->numel(); + + if (input_data == nullptr) { + phi::DDim out_dim{0}; + output->Resize(out_dim); + dev_ctx.template Alloc(output); + return; + } + + PADDLE_ENFORCE_GE( + *std::min_element(input_data, input_data + input_numel), + static_cast(0), + phi::errors::InvalidArgument( + "The elements in input tensor must be non-negative ints")); + + int64_t output_size = static_cast(*std::max_element( + input_data, input_data + input_numel)) + + 1L; + output_size = std::max(output_size, static_cast(minlength)); + + phi::DDim out_dim{output_size}; + output->Resize(out_dim); + + bool has_weights = weights.is_initialized(); + + if (has_weights) { + const T* weights_data = weights->data(); + if (weights->dtype() == DataType::FLOAT32) { + float* output_data = dev_ctx.template Alloc(output); + phi::funcs::SetConstant()( + dev_ctx, output, static_cast(0)); + for (int64_t i = 0; i < input_numel; i++) { + output_data[input_data[i]] += static_cast(weights_data[i]); + } + } else { + double* output_data = dev_ctx.template Alloc(output); + phi::funcs::SetConstant()( + dev_ctx, output, static_cast(0)); + for (int64_t i = 0; i < input_numel; i++) { + output_data[input_data[i]] += static_cast(weights_data[i]); + } + } + + } else { + int64_t* output_data = dev_ctx.template Alloc(output); + phi::funcs::SetConstant()(dev_ctx, output, 0L); + for (int64_t i = 0; i < input_numel; i++) { + output_data[input_data[i]] += 1L; + } + } +} + +template +void BincountKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional weights, + int minlength, + DenseTensor* out) { + if (x.dtype() == DataType::INT32) { + BincountInner(dev_ctx, x, weights, minlength, out); + } else if (x.dtype() == DataType::INT64) { + BincountInner(dev_ctx, x, weights, minlength, out); + } +} +} // namespace phi + +PD_REGISTER_KERNEL(bincount, + CPU, + ALL_LAYOUT, + phi::BincountKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/bincount_kernel.cu b/paddle/phi/kernels/gpu/bincount_kernel.cu new file mode 100644 index 00000000000..a4ec894790c --- /dev/null +++ b/paddle/phi/kernels/gpu/bincount_kernel.cu @@ -0,0 +1,164 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/bincount_kernel.h" + +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +inline int GET_BLOCKS(const int N) { + return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; +} + +template +__global__ void KernelBincount(const InputT* input, + const int total_elements, + const bool has_weights, + const T* weights, + OutT* output) { + if (!has_weights) { + for (int i = threadIdx.x; i < total_elements; i += blockDim.x) { + paddle::platform::CudaAtomicAdd(&output[input[i]], 1L); + } + } else { + for (int i = threadIdx.x; i < total_elements; i += blockDim.x) { + paddle::platform::CudaAtomicAdd(&output[input[i]], + static_cast(weights[i])); + } + } +} + +template +void BincountCUDAInner(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional weights, + int minlength, + DenseTensor* out) { + const DenseTensor* input = &x; + DenseTensor* output = out; + const InputT* input_data = input->data(); + + const int input_numel = input->numel(); + + if (input_data == nullptr) { + phi::DDim out_dim{0}; + output->Resize(out_dim); + dev_ctx.template Alloc(output); + return; + } + auto input_x = EigenVector::Flatten(*input); + DenseTensor input_min_t, input_max_t; + input_max_t.Resize({1}); + auto* input_max_data = dev_ctx.template Alloc(&input_max_t); + input_min_t.Resize({1}); + auto* input_min_data = dev_ctx.template Alloc(&input_min_t); + + auto input_max_scala = EigenScalar::From(input_max_t); + auto input_min_scala = EigenScalar::From(input_min_t); + + auto* place = dev_ctx.eigen_device(); + input_max_scala.device(*place) = input_x.maximum(); + input_min_scala.device(*place) = input_x.minimum(); + + DenseTensor input_min_cpu, input_max_cpu; + paddle::framework::TensorCopySync( + input_max_t, phi::CPUPlace(), &input_max_cpu); + paddle::framework::TensorCopySync( + input_min_t, phi::CPUPlace(), &input_min_cpu); + + InputT input_min = input_min_cpu.data()[0]; + + PADDLE_ENFORCE_GE( + input_min, + static_cast(0), + phi::errors::InvalidArgument( + "The elements in input tensor must be non-negative ints")); + + int64_t output_size = + static_cast(input_max_cpu.data()[0]) + 1L; + + output_size = std::max(output_size, static_cast(minlength)); + phi::DDim out_dim{output_size}; + output->Resize(out_dim); + + bool has_weights = weights.is_initialized(); + + const T* weights_data = has_weights ? 
weights->data() : nullptr; + auto stream = dev_ctx.stream(); + + if (!has_weights) { + int64_t* output_data = dev_ctx.template Alloc(output); + phi::funcs::SetConstant()(dev_ctx, output, 0L); + + KernelBincount<<>>( + input_data, input_numel, has_weights, weights_data, output_data); + } else { + const auto& weights_type = + paddle::framework::TransToProtoVarType(weights->dtype()); + + if (weights->dtype() == DataType::FLOAT32) { + float* output_data = dev_ctx.template Alloc(output); + phi::funcs::SetConstant()( + dev_ctx, output, static_cast(0)); + + KernelBincount<<>>( + input_data, input_numel, has_weights, weights_data, output_data); + } else { + double* output_data = dev_ctx.template Alloc(output); + phi::funcs::SetConstant()( + dev_ctx, output, static_cast(0)); + KernelBincount<<>>( + input_data, input_numel, has_weights, weights_data, output_data); + } + } +} + +template +void BincountKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional weights, + int minlength, + DenseTensor* out) { + if (x.dtype() == DataType::INT32) { + BincountCUDAInner(dev_ctx, x, weights, minlength, out); + } else if (x.dtype() == DataType::INT64) { + BincountCUDAInner(dev_ctx, x, weights, minlength, out); + } +} +} // namespace phi + +PD_REGISTER_KERNEL(bincount, + GPU, + ALL_LAYOUT, + phi::BincountKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/ops/compat/bincount_sig.cc b/paddle/phi/ops/compat/bincount_sig.cc new file mode 100644 index 00000000000..35067c256ed --- /dev/null +++ b/paddle/phi/ops/compat/bincount_sig.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature BincountOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("bincount", {"X", "Weights"}, {"minlength"}, {"Out"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(bincount, phi::BincountOpArgumentMapping); -- GitLab From c09adab84fdd1fb13ac751871787d3337ba3ca77 Mon Sep 17 00:00:00 2001 From: houj04 <35131887+houj04@users.noreply.github.com> Date: Mon, 7 Mar 2022 16:57:56 +0800 Subject: [PATCH 090/261] refactor unittest for nearest_interp_v2_op_xpu. test=kunlun (#39804) * refactor unittest for nearest_interp_v2_op_xpu. test=kunlun * fix code style. test=kunlun * fix code style. test=kunlun --- .../xpu/test_nearest_interp_v2_op_xpu.py | 731 +++++++++--------- 1 file changed, 349 insertions(+), 382 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_v2_op_xpu.py index 8c1ce68e9d0..7a3b4a5a217 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_v2_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_v2_op_xpu.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,13 +16,14 @@ from __future__ import print_function import unittest import numpy as np -import paddle -import paddle.fluid.core as core import sys sys.path.append("..") + +import paddle + +from op_test import OpTest from op_test_xpu import XPUOpTest -import paddle.fluid as fluid -from paddle.fluid import Program, program_guard +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() @@ -158,390 +159,356 @@ def nearest_neighbor_interp3d_np(X, return out.astype(X.dtype) -class TestNearestInterpOp(XPUOpTest): - def setUp(self): - self.use_xpu = True - self.out_size = None - self.actual_shape = None - self.data_layout = 'NCHW' - self.init_test_case() - self.op_type = "nearest_interp_v2" - input_np = np.random.random(self.input_shape).astype("float32") - - if self.data_layout == "NCHW" and len(self.input_shape) == 4: - in_d = 1 - in_h = self.input_shape[2] - in_w = self.input_shape[3] - else: - in_d = 1 - in_h = self.input_shape[1] - in_w = self.input_shape[2] - - if self.data_layout == "NCDHW" and len(self.input_shape) == 5: - in_d = self.input_shape[2] - in_h = self.input_shape[3] - in_w = self.input_shape[4] - else: - in_d = self.input_shape[1] - in_h = self.input_shape[2] - in_w = self.input_shape[3] - scale_d = 0 - scale_h = 0 - scale_w = 0 - if self.scale: - if isinstance(self.scale, float) or isinstance(self.scale, int): - if self.scale > 0: - scale_d = scale_h = scale_w = float(self.scale) - if isinstance(self.scale, list) and len(self.scale) == 1: - scale_d = scale_w = scale_h = self.scale[0] - elif isinstance(self.scale, list) and len(self.scale) > 1: - if len(self.scale) == 5: - scale_w = self.scale[2] - scale_h = self.scale[1] - scale_d = self.scale[0] - else: - scale_w = self.scale[1] - scale_h = self.scale[0] +class XPUNearestInterpOpWrapper(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'nearest_interp_v2' + self.use_dynamic_create_class = False - out_h = int(in_h * scale_h) - out_w = int(in_w * scale_w) - out_d = int(in_d * scale_d) - else: - if len(self.input_shape) == 5: - out_d = self.out_d - out_h = self.out_h - out_w = self.out_w + class TestNearestInterpOp(XPUOpTest): + def setUp(self): + self.place = paddle.XPUPlace(0) + self.init_dtype() - if len(self.input_shape) == 4: - output_np = nearest_neighbor_interp_np( - input_np, out_h, out_w, scale_h, scale_w, self.out_size, - self.actual_shape, self.align_corners, self.data_layout) - elif len(self.input_shape) == 5: - output_np = nearest_neighbor_interp3d_np( - input_np, out_d, out_h, out_w, scale_d, scale_h, scale_w, - self.out_size, self.actual_shape, self.align_corners, - self.data_layout) - self.inputs = {'X': input_np} - if self.out_size is not None: - self.inputs['OutSize'] = self.out_size - if self.actual_shape is not None: - self.inputs['OutSize'] = self.actual_shape - if len(self.input_shape) == 5: - self.attrs = { - 'out_d': self.out_d, - 'out_h': self.out_h, - 'out_w': self.out_w, - 'interp_method': self.interp_method, - 'align_corners': self.align_corners, - 'data_layout': self.data_layout - } - else: + self.out_size = None + self.actual_shape = None + self.data_layout = 'NCHW' + + self.interp_method = 'nearest' + self.scale = 0. 
+ self.align_corners = True + + self.init_test_case() + self.op_type = "nearest_interp_v2" + input_np = np.random.random(self.input_shape).astype(self.dtype) + + # in + if self.data_layout == "NCHW" and len(self.input_shape) == 4: + in_d = 1 + in_h = self.input_shape[2] + in_w = self.input_shape[3] + else: + in_d = 1 + in_h = self.input_shape[1] + in_w = self.input_shape[2] + + if self.data_layout == "NCDHW" and len(self.input_shape) == 5: + in_d = self.input_shape[2] + in_h = self.input_shape[3] + in_w = self.input_shape[4] + else: + in_d = self.input_shape[1] + in_h = self.input_shape[2] + in_w = self.input_shape[3] + + # scale + scale_d = 0 + scale_h = 0 + scale_w = 0 + if self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0: + scale_d = scale_h = scale_w = float(self.scale) + self.scale = [self.scale] + if isinstance(self.scale, list) and len(self.scale) == 1: + scale_d = scale_w = scale_h = self.scale[0] + self.scale = [self.scale[0], self.scale[0]] + elif isinstance(self.scale, list) and len(self.scale) > 1: + if len(self.scale) == 5: + scale_w = self.scale[2] + scale_h = self.scale[1] + scale_d = self.scale[0] + else: + scale_w = self.scale[1] + scale_h = self.scale[0] + + out_h = int(in_h * scale_h) + out_w = int(in_w * scale_w) + out_d = int(in_d * scale_d) + else: + if len(self.input_shape) == 5: + out_d = self.out_d + out_h = self.out_h + out_w = self.out_w + + # output_np + if len(self.input_shape) == 4: + output_np = nearest_neighbor_interp_np( + input_np, out_h, out_w, scale_h, scale_w, self.out_size, + self.actual_shape, self.align_corners, self.data_layout) + elif len(self.input_shape) == 5: + output_np = nearest_neighbor_interp3d_np( + input_np, out_d, out_h, out_w, scale_d, scale_h, scale_w, + self.out_size, self.actual_shape, self.align_corners, + self.data_layout) + self.outputs = {'Out': output_np} + + self.inputs = {'X': input_np} + if self.out_size is not None: + self.inputs['OutSize'] = self.out_size + if self.actual_shape is not None: + self.inputs['OutSize'] = self.actual_shape + + if len(self.input_shape) == 5: + self.attrs = { + 'out_d': self.out_d, + 'out_h': self.out_h, + 'out_w': self.out_w, + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + 'data_layout': self.data_layout + } + else: + self.attrs = { + 'out_h': self.out_h, + 'out_w': self.out_w, + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + 'data_layout': self.data_layout + } + + if self.scale: + self.attrs['scale'] = self.scale + + def init_dtype(self): + self.dtype = self.in_type + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out', in_place=True) + + def init_test_case(self): + self.input_shape = [2, 3, 4, 5] + self.out_h = 2 + self.out_w = 2 + self.out_size = np.array([3, 3]).astype("int32") + + """ + # case copied form gpu but disabled in xpu: not support 5-dim input_shape + class TestNearestNeighborInterpCase1(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [4, 1, 1, 7, 8] + self.out_d = 1 + self.out_h = 1 + self.out_w = 1 + self.scale = 0. 
+ self.align_corners = True + """ + + class TestNearestNeighborInterpCase2(TestNearestInterpOp): + def init_test_case(self): + self.input_shape = [3, 3, 9, 6] + self.out_h = 12 + self.out_w = 12 + + class TestNearestNeighborInterpCase3(TestNearestInterpOp): + def init_test_case(self): + self.input_shape = [1, 1, 32, 64] + self.out_h = 64 + self.out_w = 32 + + class TestNearestNeighborInterpCase4(TestNearestInterpOp): + def init_test_case(self): + self.input_shape = [4, 1, 7, 8] + self.out_h = 1 + self.out_w = 1 + self.out_size = np.array([2, 2]).astype("int32") + + class TestNearestNeighborInterpCase5(TestNearestInterpOp): + def init_test_case(self): + self.input_shape = [3, 3, 9, 6] + self.out_h = 12 + self.out_w = 12 + self.out_size = np.array([11, 11]).astype("int32") + + class TestNearestNeighborInterpCase6(TestNearestInterpOp): + def init_test_case(self): + self.input_shape = [1, 1, 32, 64] + self.out_h = 64 + self.out_w = 32 + self.out_size = np.array([65, 129]).astype("int32") + + class TestNearestNeighborInterpSame(TestNearestInterpOp): + def init_test_case(self): + self.input_shape = [2, 3, 32, 64] + self.out_h = 32 + self.out_w = 64 + + class TestNearestNeighborInterpActualShape(TestNearestInterpOp): + def init_test_case(self): + self.input_shape = [3, 2, 32, 16] + self.out_h = 64 + self.out_w = 32 + self.out_size = np.array([66, 40]).astype("int32") + + """ + # case copied form gpu but disabled in xpu: not support NHWC data_layout + class TestNearestNeighborInterpDataLayout(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [2, 4, 4, 5] + self.out_h = 2 + self.out_w = 2 + self.scale = 0. + self.out_size = np.array([3, 8]).astype("int32") + self.align_corners = True + self.data_layout = "NHWC" + """ + + class TestNearestInterpWithoutCorners(TestNearestInterpOp): + def set_align_corners(self): + self.align_corners = False + + class TestNearestNeighborInterpScale1(TestNearestInterpOp): + def init_test_case(self): + self.input_shape = [3, 2, 7, 5] + self.out_h = 64 + self.out_w = 32 + self.scale = 2. + self.out_size = np.array([66, 40]).astype("int32") + + class TestNearestNeighborInterpScale2(TestNearestInterpOp): + def init_test_case(self): + self.input_shape = [3, 2, 5, 7] + self.out_h = 64 + self.out_w = 32 + self.scale = 1.5 + self.out_size = np.array([66, 40]).astype("int32") + + class TestNearestNeighborInterpScale3(TestNearestInterpOp): + def init_test_case(self): + self.input_shape = [3, 2, 7, 5] + self.out_h = 64 + self.out_w = 32 + self.scale = [2.0, 3.0] + self.out_size = np.array([66, 40]).astype("int32") + + """ + # case copied form gpu but disabled in xpu: not support 5-dim input_shape + class TestNearestNeighbor3DInterp(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 2, 4, 7, 5] + self.out_d = 8 + self.out_h = 64 + self.out_w = 32 + self.scale = [4.0, 2.0, 3.0] + self.out_size = np.array([8, 66, 40]).astype("int32") + self.align_corners = True + """ + + class TestNearestInterpOp_attr_tensor(XPUOpTest): + def setUp(self): + self.place = paddle.XPUPlace(0) + self.init_dtype() + + self.out_size = None + self.actual_shape = None + + self.interp_method = 'nearest' + self.scale = 0. 
+ self.align_corners = True + + self.init_test_case() + self.op_type = "nearest_interp_v2" + self.shape_by_1Dtensor = False + self.scale_by_1Dtensor = False self.attrs = { - 'out_h': self.out_h, - 'out_w': self.out_w, 'interp_method': self.interp_method, 'align_corners': self.align_corners, - 'data_layout': self.data_layout } - if self.scale: - if isinstance(self.scale, float) or isinstance(self.scale, int): - if self.scale > 0: - self.scale = [self.scale] - if isinstance(self.scale, list) and len(self.scale) == 1: - self.scale = [self.scale[0], self.scale[0]] - self.attrs['scale'] = self.scale - self.outputs = {'Out': output_np} - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['X'], 'Out', in_place=True) - - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [2, 3, 4, 5] - self.out_h = 2 - self.out_w = 2 - self.scale = 0. - self.out_size = np.array([3, 3]).astype("int32") - self.align_corners = True - - -""" -# case copied form gpu but disabled in xpu: not support 5-dim input_shape -class TestNearestNeighborInterpCase1(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [4, 1, 1, 7, 8] - self.out_d = 1 - self.out_h = 1 - self.out_w = 1 - self.scale = 0. - self.align_corners = True -""" - - -class TestNearestNeighborInterpCase2(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 3, 9, 6] - self.out_h = 12 - self.out_w = 12 - self.scale = 0. - self.align_corners = True - - -class TestNearestNeighborInterpCase3(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [1, 1, 32, 64] - self.out_h = 64 - self.out_w = 32 - self.scale = 0. - self.align_corners = True - - -class TestNearestNeighborInterpCase4(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [4, 1, 7, 8] - self.out_h = 1 - self.out_w = 1 - self.scale = 0. - self.out_size = np.array([2, 2]).astype("int32") - self.align_corners = True - - -class TestNearestNeighborInterpCase5(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 3, 9, 6] - self.out_h = 12 - self.out_w = 12 - self.scale = 0. - self.out_size = np.array([11, 11]).astype("int32") - self.align_corners = True - - -class TestNearestNeighborInterpCase6(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [1, 1, 32, 64] - self.out_h = 64 - self.out_w = 32 - self.scale = 0. - self.out_size = np.array([65, 129]).astype("int32") - self.align_corners = True - - -class TestNearestNeighborInterpSame(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [2, 3, 32, 64] - self.out_h = 32 - self.out_w = 64 - self.scale = 0. - self.align_corners = True - - -class TestNearestNeighborInterpActualShape(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 2, 32, 16] - self.out_h = 64 - self.out_w = 32 - self.scale = 0. 
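Both the legacy test classes being removed in this hunk and the new wrapper-based classes above compare against the same NumPy reference, nearest_neighbor_interp_np. For orientation, with 4-D NCHW input and align_corners=True that reference reduces to roughly the following (a simplified editorial sketch; the real helper also handles scale factors, an explicit out_size, NHWC layout and 5-D NCDHW input):

import numpy as np

def nearest_interp_nchw_ref(x, out_h, out_w):
    # Simplified align_corners=True path: map each output index back to the
    # nearest input index and copy the value.
    n, c, in_h, in_w = x.shape
    ratio_h = (in_h - 1.0) / (out_h - 1.0) if out_h > 1 else 0.0
    ratio_w = (in_w - 1.0) / (out_w - 1.0) if out_w > 1 else 0.0
    out = np.zeros((n, c, out_h, out_w), dtype=x.dtype)
    for i in range(out_h):
        for j in range(out_w):
            out[:, :, i, j] = x[:, :, int(ratio_h * i + 0.5), int(ratio_w * j + 0.5)]
    return out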
- self.out_size = np.array([66, 40]).astype("int32") - self.align_corners = True - - -""" -# case copied form gpu but disabled in xpu: not support NHWC data_layout -class TestNearestNeighborInterpDataLayout(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [2, 4, 4, 5] - self.out_h = 2 - self.out_w = 2 - self.scale = 0. - self.out_size = np.array([3, 8]).astype("int32") - self.align_corners = True - self.data_layout = "NHWC" -""" - - -class TestNearestInterpWithoutCorners(TestNearestInterpOp): - def set_align_corners(self): - self.align_corners = False - - -class TestNearestNeighborInterpScale1(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 2, 7, 5] - self.out_h = 64 - self.out_w = 32 - self.scale = 2. - self.out_size = np.array([66, 40]).astype("int32") - self.align_corners = True - - -class TestNearestNeighborInterpScale2(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 2, 5, 7] - self.out_h = 64 - self.out_w = 32 - self.scale = 1.5 - self.out_size = np.array([66, 40]).astype("int32") - self.align_corners = True - - -class TestNearestNeighborInterpScale3(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 2, 7, 5] - self.out_h = 64 - self.out_w = 32 - self.scale = [2.0, 3.0] - self.out_size = np.array([66, 40]).astype("int32") - self.align_corners = True - - -""" -# case copied form gpu but disabled in xpu: not support 5-dim input_shape -class TestNearestNeighbor3DInterp(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 2, 4, 7, 5] - self.out_d = 8 - self.out_h = 64 - self.out_w = 32 - self.scale = [4.0, 2.0, 3.0] - self.out_size = np.array([8, 66, 40]).astype("int32") - self.align_corners = True -""" - - -class TestNearestInterpOp_attr_tensor(XPUOpTest): - def setUp(self): - self.use_xpu = True - self.out_size = None - self.actual_shape = None - self.init_test_case() - self.op_type = "nearest_interp_v2" - self.shape_by_1Dtensor = False - self.scale_by_1Dtensor = False - self.attrs = { - 'interp_method': self.interp_method, - 'align_corners': self.align_corners, - } - - input_np = np.random.random(self.input_shape).astype("float32") - self.inputs = {'X': input_np} - - if self.scale_by_1Dtensor: - self.inputs['Scale'] = np.array([self.scale]).astype("float32") - elif self.scale: - if isinstance(self.scale, float) or isinstance(self.scale, int): - if self.scale > 0: - scale_h = scale_w = float(self.scale) - if isinstance(self.scale, list) and len(self.scale) == 1: - scale_w = scale_h = self.scale[0] - elif isinstance(self.scale, list) and len(self.scale) > 1: - scale_w = self.scale[1] - scale_h = self.scale[0] - out_h = int(self.input_shape[2] * scale_h) - out_w = int(self.input_shape[3] * scale_w) - else: - out_h = self.out_h - out_w = self.out_w - - if self.shape_by_1Dtensor: - self.inputs['OutSize'] = self.out_size - elif self.out_size is not None: - size_tensor = [] - for index, ele in enumerate(self.out_size): - size_tensor.append(("x" + str(index), np.ones( - (1)).astype('int32') * ele)) - self.inputs['SizeTensor'] = size_tensor - - self.attrs['out_h'] = self.out_h - self.attrs['out_w'] = self.out_w - if self.scale: - if isinstance(self.scale, float) or isinstance(self.scale, int): - if self.scale > 0: - self.scale = [self.scale] - if isinstance(self.scale, list) and len(self.scale) == 1: - self.scale 
= [self.scale[0], self.scale[0]] - self.attrs['scale'] = self.scale - output_np = nearest_neighbor_interp_np(input_np, out_h, out_w, 0, 0, - self.out_size, self.actual_shape, - self.align_corners) - self.outputs = {'Out': output_np} - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['X'], 'Out', in_place=True) - - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [2, 5, 4, 4] - self.out_h = 3 - self.out_w = 3 - self.scale = 0. - self.out_size = [3, 3] - self.align_corners = True - - -# out_size is a tensor list -class TestNearestInterp_attr_tensor_Case1(TestNearestInterpOp_attr_tensor): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 3, 9, 6] - self.out_h = 12 - self.out_w = 12 - self.scale = 0. - self.out_size = [8, 12] - self.align_corners = True - - -# out_size is a 1-D tensor -class TestNearestInterp_attr_tensor_Case2(TestNearestInterpOp_attr_tensor): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 2, 32, 16] - self.out_h = 64 - self.out_w = 32 - self.scale = 0. - self.out_size = np.array([66, 40]).astype("int32") - self.align_corners = True - self.shape_by_1Dtensor = True - - -# scale is a 1-D tensor -class TestNearestInterp_attr_tensor_Case3(TestNearestInterpOp_attr_tensor): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 2, 32, 16] - self.out_h = 64 - self.out_w = 32 - self.scale = 2.0 - self.out_size = None - self.align_corners = True - self.scale_by_1Dtensor = True + input_np = np.random.random(self.input_shape).astype(self.dtype) + self.inputs = {'X': input_np} + + if self.scale_by_1Dtensor: + self.inputs['Scale'] = np.array([self.scale]).astype("float32") + elif self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0: + scale_h = scale_w = float(self.scale) + if isinstance(self.scale, list) and len(self.scale) == 1: + scale_w = scale_h = self.scale[0] + elif isinstance(self.scale, list) and len(self.scale) > 1: + scale_w = self.scale[1] + scale_h = self.scale[0] + out_h = int(self.input_shape[2] * scale_h) + out_w = int(self.input_shape[3] * scale_w) + else: + out_h = self.out_h + out_w = self.out_w + + if self.shape_by_1Dtensor: + self.inputs['OutSize'] = self.out_size + elif self.out_size is not None: + size_tensor = [] + for index, ele in enumerate(self.out_size): + size_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + self.inputs['SizeTensor'] = size_tensor + + self.attrs['out_h'] = self.out_h + self.attrs['out_w'] = self.out_w + if self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0: + self.scale = [self.scale] + if isinstance(self.scale, list) and len(self.scale) == 1: + self.scale = [self.scale[0], self.scale[0]] + self.attrs['scale'] = self.scale + output_np = nearest_neighbor_interp_np( + input_np, out_h, out_w, 0, 0, self.out_size, self.actual_shape, + self.align_corners) + self.outputs = {'Out': output_np} + + def init_dtype(self): + self.dtype = self.in_type + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out', in_place=True) + + def init_test_case(self): + self.input_shape = [2, 5, 4, 4] + 
self.out_h = 3 + self.out_w = 3 + self.out_size = [3, 3] + + # out_size is a tensor list + class TestNearestInterp_attr_tensor_Case1(TestNearestInterpOp_attr_tensor): + def init_test_case(self): + self.input_shape = [3, 3, 9, 6] + self.out_h = 12 + self.out_w = 12 + self.out_size = [8, 12] + + # out_size is a 1-D tensor + class TestNearestInterp_attr_tensor_Case2(TestNearestInterpOp_attr_tensor): + def init_test_case(self): + self.input_shape = [3, 2, 32, 16] + self.out_h = 64 + self.out_w = 32 + self.out_size = np.array([66, 40]).astype("int32") + self.shape_by_1Dtensor = True + + # scale is a 1-D tensor + class TestNearestInterp_attr_tensor_Case3(TestNearestInterpOp_attr_tensor): + def init_test_case(self): + self.input_shape = [3, 2, 32, 16] + self.out_h = 64 + self.out_w = 32 + self.scale = 2.0 + self.out_size = None + self.scale_by_1Dtensor = True + + +support_types = get_xpu_op_support_types('nearest_interp_v2') +for stype in support_types: + create_test_class(globals(), XPUNearestInterpOpWrapper, stype) if __name__ == "__main__": unittest.main() -- GitLab From 79a32715b9aca4a6e522ffcf91bac82e7a6cd380 Mon Sep 17 00:00:00 2001 From: xiongkun Date: Mon, 7 Mar 2022 17:24:16 +0800 Subject: [PATCH 091/261] [OpTest] Support to test paddle API end-to-end for check_eager (#40169) * add python api test in TestOp * test_python_api if self.python_api is set * fix code by CR --- paddle/fluid/imperative/tracer.cc | 33 +++++++ paddle/fluid/imperative/tracer.h | 5 + paddle/fluid/pybind/imperative.cc | 21 +++++ .../paddle/fluid/tests/unittests/op_test.py | 94 +++++++++++++++++++ .../fluid/tests/unittests/test_selu_op.py | 1 + 5 files changed, 154 insertions(+) diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 4336a5c77c1..01c9d2847e0 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -18,12 +18,14 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/imperative/amp_auto_cast.h" +#include "paddle/fluid/imperative/execution_context.h" #include "paddle/fluid/imperative/op_base.h" #include "paddle/fluid/platform/denormal.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/phi/common/place.h" DECLARE_bool(use_mkldnn); DECLARE_string(tracer_mkldnn_ops_on); @@ -382,5 +384,36 @@ bool Tracer::ComputeRequiredGrad(const NameTensorMap& ins, return false; } +phi::KernelSignature Tracer::GetExpectedKernelSignature( + const std::string& type, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, framework::AttributeMap attrs) const { + auto op = framework::OpRegistry::CreateOp(type, {}, {}, {}, false); + framework::RuntimeContext ctx({}, {}); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(phi::CPUPlace()); + const auto& op_info = op->Info(); + auto* attr_checker = op_info.Checker(); + if (attr_checker) { + attr_checker->Check(&attrs, true, /*only_check_exist_value=*/true); + } + static paddle::framework::AttributeMap empty_attrs_map = {}; + const paddle::framework::AttributeMap& default_attrs = + attr_checker == nullptr ? 
empty_attrs_map + : attr_checker->GetDefaultAttrMap(); + auto dygraph_exe_ctx = + imperative::DygraphExecutionContext( + *op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs, + default_attrs); + auto* opbase_with_kernel = + dynamic_cast(op.get()); + PADDLE_ENFORCE_NE(opbase_with_kernel, nullptr, + platform::errors::InvalidArgument( + "This op type:`%s` is not a OperatorWithKernel, only " + "OperatorWithKernel can get KernelSignature", + type)); + return phi::KernelSignature( + std::move(opbase_with_kernel->GetExpectedPhiKernelArgs(dygraph_exe_ctx))); +} + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 73ecbbe6143..fd13fce6a6e 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -28,6 +28,7 @@ #include "paddle/fluid/imperative/jit/program_desc_tracer.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/phi/core/compat/arg_map_context.h" namespace paddle { namespace imperative { @@ -154,6 +155,10 @@ class Tracer { } } + phi::KernelSignature GetExpectedKernelSignature( + const std::string& type, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, framework::AttributeMap attrs) const; + paddle::framework::GarbageCollector* MutableGarbageCollectorIfNotExists( const platform::Place& place); diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 3da17b95a66..9b373a58181 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -56,6 +56,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/pybind_boost_headers.h" #include "paddle/fluid/pybind/slice_utils.h" #include "paddle/fluid/pybind/tensor_py.h" +#include "paddle/phi/core/compat/arg_map_context.h" namespace paddle { namespace pybind { @@ -2073,6 +2074,26 @@ void BindImperative(py::module *m_ptr) { *(imperative::AmpOperators::Instance().GetMutableAllowOps()), *(imperative::AmpOperators::Instance().GetMutableBlockOps())); }) + .def("_get_kernel_signature", + [](imperative::Tracer &self, const std::string &type, + const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, + framework::AttributeMap attrs) { + // TODO(xiongkun): move this function outside of tracer. 
+ auto ins_map = ConvertToNameVarBaseMap(ins); + auto outs_map = ConvertToNameVarBaseMap(outs); + { + auto to_vector = [](paddle::SmallVector &vec) { + return std::vector(vec.begin(), vec.end()); + }; + auto ret = self.GetExpectedKernelSignature(type, ins_map, + outs_map, attrs); + auto kernelsig_ins = to_vector(std::get<0>(ret.args)); + auto kernelsig_attrs = to_vector(std::get<1>(ret.args)); + auto kernelsig_outs = to_vector(std::get<2>(ret.args)); + return std::make_tuple(kernelsig_ins, kernelsig_attrs, + kernelsig_outs); + } + }) .def("trace", [](imperative::Tracer &self, const std::string &type, const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 628791afef5..0c7f269a087 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -29,6 +29,7 @@ from copy import copy import paddle import paddle.fluid as fluid +from paddle.fluid.framework import _dygraph_tracer import paddle.fluid.core as core from paddle.fluid.framework import _in_eager_mode from paddle.fluid.framework import _test_eager_guard @@ -395,6 +396,7 @@ class OpTest(unittest.TestCase): hasattr(self, "attrs") and "use_xpu" in self.attrs and self.attrs["use_xpu"] == True) + # set the self.output_dtype . def infer_dtype_from_inputs_outputs(self, inputs, outputs): def is_np_data(input): return isinstance(input, (np.ndarray, np.generic)) @@ -679,6 +681,91 @@ class OpTest(unittest.TestCase): else: return var_dict + def _check_api_outs_by_dygraph_outs(self, api_outs, dygraph_outs, place): + """ for quick verify, here we take a simplest strategy: + 1. we only check variable in api_outs. + 2. we simply check the numpy (tensor) . + 3. we set atol and rtol as 1e-5, because they are unrelated to dtype. + """ + for name in api_outs: + np_api = np.array(api_outs[name]) + np_dyg = np.array(dygraph_outs[name]) + self.assertTrue( + np.allclose( + np_api, np_dyg, equal_nan=False), + "Output (" + name + ") has diff at " + str(place) + "\nExpect " + + str(np_dyg) + "\n" + "But Got" + str(np_api) + " in class " + + self.__class__.__name__) + + def _calc_python_api_output(self, place): + def prepare_python_api_arguments(op_proto_ins, op_proto_attrs, + kernel_sig): + """ map from `op proto inputs and attrs` to `api input list and api attrs dict` + """ + # NOTE(xiongkun): why don't use input arguments dicts ? + # Because we don't know the python api name of each arguments. 
+ inputs_sig, attrs_sig, outputs_sig = kernel_sig + input_arguments = [op_proto_ins[name] for name in inputs_sig] + attr_arguments = { + name: op_proto_attrs[name] + for name in attrs_sig if name in op_proto_attrs + } + return input_arguments, attr_arguments + + def construct_output_dict_by_kernel_sig(ret_tuple, output_sig): + if not isinstance(ret_tuple, (tuple, list)): + ret_tuple = [ret_tuple] + assert len(output_sig) == len( + ret_tuple), "expect %d outputs, but get %d outputs" % ( + len(output_sig), len(ret_tuple)) + return {a: b for a, b in zip(output_sig, ret_tuple)} + + def assumption_assert_and_transform(args, argvs): + """ + currently only support "X" is [Tensor], don't support multi-tensor in "X" + """ + for inp in args: + assert isinstance(inp, list) and len( + inp + ) == 1, "currently only support `X` is [Tensor], don't support multi-tensor in `X`" + args = [inp[0] for inp in args] + return args, argvs + + def cal_python_api(python_api, args, argvs, kernel_sig): + args, argvs = assumption_assert_and_transform(args, argvs) + inputs_sig, attrs_sig, outputs_sig = kernel_sig + ret_tuple = python_api(*args, **argvs) + return construct_output_dict_by_kernel_sig(ret_tuple, outputs_sig) + + with fluid.dygraph.base.guard(place=place): + block = fluid.default_main_program().global_block() + op_proto = OpProtoHolder.instance().get_op_proto(self.op_type) + # prepare input variable + inputs = self.append_input_output_for_dygraph(op_proto, self.inputs, + True, False, block) + # prepare output variable + outputs = self.append_input_output_for_dygraph( + op_proto, self.outputs, False, False, block) + + # prepare attrbutes + attrs_outputs = {} + if hasattr(self, "attrs"): + for attrs_name in self.attrs: + if self.attrs[attrs_name] is not None: + attrs_outputs[attrs_name] = self.attrs[attrs_name] + + kernel_sig = _dygraph_tracer()._get_kernel_signature( + self.op_type, inputs, outputs, attrs_outputs) + + assert hasattr( + self, "python_api" + ), "Please set the `self.python_api` if you want to compare python api output." + arg, argv = prepare_python_api_arguments(inputs, attrs_outputs, + kernel_sig) + """ we directly return the cal_python_api value because the value is already tensor. 
+ """ + return cal_python_api(self.python_api, arg, argv, kernel_sig) + def _calc_dygraph_output(self, place, parallel=False, no_check_set=None): self.__class__.op_type = self.op_type # for ci check, please not delete it for now with fluid.dygraph.base.guard(place=place): @@ -699,6 +786,7 @@ class OpTest(unittest.TestCase): for attrs_name in self.attrs: if self.attrs[attrs_name] is not None: attrs_outputs[attrs_name] = self.attrs[attrs_name] + block.append_op( type=self.op_type, inputs=inputs, @@ -1150,6 +1238,12 @@ class OpTest(unittest.TestCase): if check_dygraph: dygraph_outs = self._calc_dygraph_output( place, no_check_set=no_check_set) + + if hasattr(self, "python_api"): + api_outs = self._calc_python_api_output(place) + self._check_api_outs_by_dygraph_outs(api_outs, dygraph_outs, + place) + if check_eager: with _test_eager_guard(): eager_dygraph_outs = self._calc_dygraph_output( diff --git a/python/paddle/fluid/tests/unittests/test_selu_op.py b/python/paddle/fluid/tests/unittests/test_selu_op.py index e71adae8d9b..f1619881794 100644 --- a/python/paddle/fluid/tests/unittests/test_selu_op.py +++ b/python/paddle/fluid/tests/unittests/test_selu_op.py @@ -42,6 +42,7 @@ def ref_selu(x, class SeluTest(OpTest): def setUp(self): self.op_type = "selu" + self.python_api = paddle.nn.functional.selu self.x_shape = [3, 5, 5, 10] self.dtype = np.float64 self.init_x_shape() -- GitLab From b798fb071e8f2861f6c59b073f3389ea1d897fde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Mon, 7 Mar 2022 21:38:16 +0800 Subject: [PATCH 092/261] [infrt] fold the infrt.cvtTensorOp. test=develop (#40214) --- cmake/external/llvm.cmake | 4 +- paddle/infrt/CMakeLists.txt | 2 - paddle/infrt/dialect/infrt/CMakeLists.txt | 2 + .../infrt/dialect/infrt/pass/CMakeLists.txt | 7 +++ .../infrt/dialect/infrt/pass/infrt_op_fuse.td | 23 ++++++++ .../dialect/infrt/pass/infrt_op_fuse_pass.cc | 52 +++++++++++++++++++ .../dialect/infrt/pass/infrt_op_fuse_pass.h | 24 +++++++++ paddle/infrt/dialect/pd_op_base.td | 2 +- paddle/infrt/dialect/phi/phi_ir_exec.cc | 2 + .../dialect/{pten => phi}/dense_tensor.mlir | 0 .../pten_pass.mlir => phi/phi_pass.mlir} | 0 tools/infrt/custom_pdop.td | 2 +- 12 files changed, 114 insertions(+), 6 deletions(-) create mode 100644 paddle/infrt/dialect/infrt/pass/CMakeLists.txt create mode 100644 paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td create mode 100644 paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc create mode 100644 paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h rename paddle/infrt/tests/dialect/{pten => phi}/dense_tensor.mlir (100%) rename paddle/infrt/tests/dialect/{pten/pten_pass.mlir => phi/phi_pass.mlir} (100%) diff --git a/cmake/external/llvm.cmake b/cmake/external/llvm.cmake index a7a9e85ffd7..9f6fd32ad98 100644 --- a/cmake/external/llvm.cmake +++ b/cmake/external/llvm.cmake @@ -100,8 +100,8 @@ endfunction() function(mlir_add_rewriter td_base) set(LLVM_TARGET_DEFINITIONS ${td_base}.td) mlir_tablegen(${td_base}.cpp.inc -gen-rewriters "-I${CMAKE_SOURCE_DIR}/infrt/dialect/pass") - add_public_tablegen_target(${td_base}_IncGen) - add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen) + add_public_tablegen_target(MLIR${td_base}IncGen) + add_dependencies(mlir-headers MLIR${td_base}IncGen) endfunction() # Execute the mlir script with infrt-exec program. 
diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index f2768f3dfa8..ed29b5b44c7 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -95,9 +95,7 @@ set(infrt_mlir_incs dense_tensor_inc pd_ops_inc pd_extra_ops_inc - rewrite_inc trt_ops_inc - pd_lower_to_trt_inc ) if (INFRT_WITH_PHI) diff --git a/paddle/infrt/dialect/infrt/CMakeLists.txt b/paddle/infrt/dialect/infrt/CMakeLists.txt index daf710e0baf..08ce2d4707b 100644 --- a/paddle/infrt/dialect/infrt/CMakeLists.txt +++ b/paddle/infrt/dialect/infrt/CMakeLists.txt @@ -13,3 +13,5 @@ mlir_tablegen(infrt_opsAttributes.h.inc -gen-attrdef-decls -dialect=infrt) mlir_tablegen(infrt_opsAttributes.cpp.inc -gen-attrdef-defs -dialect=infrt) add_public_tablegen_target(MLIRinfrt_opsAttributesIncGen) add_dependencies(mlir-headers MLIRinfrt_opsAttributesIncGen) + +add_subdirectory(pass) diff --git a/paddle/infrt/dialect/infrt/pass/CMakeLists.txt b/paddle/infrt/dialect/infrt/pass/CMakeLists.txt new file mode 100644 index 00000000000..19c12251a2e --- /dev/null +++ b/paddle/infrt/dialect/infrt/pass/CMakeLists.txt @@ -0,0 +1,7 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + infrt_op_fuse_pass.cc + ) + +mlir_add_rewriter(infrt_op_fuse) diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td new file mode 100644 index 00000000000..ef702650b6f --- /dev/null +++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td @@ -0,0 +1,23 @@ +#ifndef INFRT_OP_FUSE +#define INFRT_OP_FUSE + +include "mlir/Interfaces/SideEffectInterfaces.td" +include "paddle/infrt/dialect/infrt/infrt_ops.td" +include "paddle/infrt/dialect/pd_ops.td" + +def FuseCvtTensorPattern : Pat< + (Infrt_CvtTensorOp (Infrt_CvtTensorOp $arg)), + (Infrt_CvtTensorOp $arg)>; + +def FuseFeedCvtTensorPattern : Pat< + (Infrt_CvtTensorOp (PD_FeedOp $name)), + (PD_FeedOp $name)>; + +def TypesAreIdentical : Constraint>; +def RedundantCvtTensorOptPattern : Pat< + (Infrt_CvtTensorOp:$res $arg), (replaceWithValue $arg), + [(TypesAreIdentical $res, $arg)]>; + + + +#endif // INFRT_OP_FUSE diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc new file mode 100644 index 00000000000..cb16e054418 --- /dev/null +++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" + +#include +#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/pd_ops.h" +namespace { +#include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse.cpp.inc" // NOLINT + +/* + * infrtOpFusePass. + */ +struct InfrtOpFusePass + : public mlir::PassWrapper { + public: + ::llvm::StringRef getName() const override { return "infrtOpFusePass"; } + void runOnFunction() override; +}; +// Implementation of the InfrtOpFusePass. 
+void InfrtOpFusePass::runOnFunction() { + ::mlir::RewritePatternSet patterns(&getContext()); + populateWithGenerated(patterns); + (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)); + // Fuse pd.return Operation + auto terminator_op = getFunction().front().getTerminator(); + if (nullptr == terminator_op) return; + for (auto operand : terminator_op->getOperands()) { + auto *op1 = operand.getDefiningOp(); + auto cvt_op = ::llvm::dyn_cast<::infrt::CvtTensorOp>(op1); + if (!cvt_op) continue; + mlir::Value value = cvt_op.input(); + operand.replaceAllUsesWith(value); + cvt_op.erase(); + } +} +} // namespace +std::unique_ptr infrt::createInfrtOpFusePass() { + return std::make_unique(); +} diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h new file mode 100644 index 00000000000..ef349a7bbc4 --- /dev/null +++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h @@ -0,0 +1,24 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +namespace infrt { +/* + * infrtOpFusePass. + */ +std::unique_ptr createInfrtOpFusePass(); + +} // namespace infrt diff --git a/paddle/infrt/dialect/pd_op_base.td b/paddle/infrt/dialect/pd_op_base.td index 266bdf60de7..26425e3945c 100644 --- a/paddle/infrt/dialect/pd_op_base.td +++ b/paddle/infrt/dialect/pd_op_base.td @@ -75,7 +75,7 @@ def PD_ElementType : Type; def PD_Tensor1 : TensorOf<[PD_ElementType]>; -def PD_Tensor : AnyTypeOf<[PD_Tensor1, LoDTensor],"pd.ttype">; +def PD_Tensor : AnyTypeOf<[PD_Tensor1, LoDTensor, DenseTensor],"pd.ttype">; def PD_Tensor_Array : VectorOf<[PD_Tensor]>; diff --git a/paddle/infrt/dialect/phi/phi_ir_exec.cc b/paddle/infrt/dialect/phi/phi_ir_exec.cc index 1df929895b1..559fb90a64a 100644 --- a/paddle/infrt/dialect/phi/phi_ir_exec.cc +++ b/paddle/infrt/dialect/phi/phi_ir_exec.cc @@ -16,6 +16,7 @@ #include #include #include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" #include "paddle/infrt/dialect/mlir_loader.h" #include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h" @@ -38,6 +39,7 @@ int main(int argc, char** argv) { infrt::PrecisionType::FLOAT32, infrt::LayoutType::NCHW}}; phi_pass_manager.addPass(std::make_unique(valid_places)); + phi_pass_manager.addPass(infrt::createInfrtOpFusePass()); if (mlir::failed(pm.run(*module))) { std::cout << "\npass failed!\n" << std::endl; return 4; diff --git a/paddle/infrt/tests/dialect/pten/dense_tensor.mlir b/paddle/infrt/tests/dialect/phi/dense_tensor.mlir similarity index 100% rename from paddle/infrt/tests/dialect/pten/dense_tensor.mlir rename to paddle/infrt/tests/dialect/phi/dense_tensor.mlir diff --git a/paddle/infrt/tests/dialect/pten/pten_pass.mlir b/paddle/infrt/tests/dialect/phi/phi_pass.mlir similarity index 100% rename from paddle/infrt/tests/dialect/pten/pten_pass.mlir rename to paddle/infrt/tests/dialect/phi/phi_pass.mlir diff --git 
a/tools/infrt/custom_pdop.td b/tools/infrt/custom_pdop.td index 83e29578312..2139fbd8155 100644 --- a/tools/infrt/custom_pdop.td +++ b/tools/infrt/custom_pdop.td @@ -1,4 +1,4 @@ -def PD_FeedOp : PD_Op<"feed"> { +def PD_FeedOp : PD_Op<"feed", [NoSideEffect]> { let summary = "Feed Op"; let description = [{ -- GitLab From 10325a82e1032c3397b6f6611f558eb18ede0b07 Mon Sep 17 00:00:00 2001 From: chenjian Date: Tue, 8 Mar 2022 09:55:10 +0800 Subject: [PATCH 093/261] add python profiler package (#40065) * add python profiler package * update according to review * fix bug * fix bug * fix bug * add unit test * Revert "add unit test" This reverts commit 4e69ff71b0645e069afe5dd8fea0d07717852c48. * reduce for pr * add unit test * modify for pr * fix unittest * update for ci coverage * modify according to review * fix bug * improve coverage --- paddle/fluid/platform/profiler.cc | 4 + paddle/fluid/platform/profiler.h | 1 + paddle/fluid/pybind/CMakeLists.txt | 2 +- paddle/fluid/pybind/pybind.cc | 85 ++++ python/paddle/fluid/core.py | 2 + .../fluid/tests/unittests/test_newprofiler.py | 129 +++++ python/paddle/profiler/__init__.py | 26 + python/paddle/profiler/profiler.py | 469 ++++++++++++++++++ python/paddle/profiler/profiler_statistic.py | 31 ++ python/paddle/profiler/utils.py | 90 ++++ python/setup.py.in | 1 + 11 files changed, 839 insertions(+), 1 deletion(-) create mode 100755 python/paddle/fluid/tests/unittests/test_newprofiler.py create mode 100644 python/paddle/profiler/__init__.py create mode 100644 python/paddle/profiler/profiler.py create mode 100644 python/paddle/profiler/profiler_statistic.py create mode 100644 python/paddle/profiler/utils.py diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 866bf3c66aa..feb72bce72b 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -489,6 +489,10 @@ void NvprofDisableRecordEvent() { g_enable_nvprof_hook = false; } void EnableHostEventRecorder() { FLAGS_enable_host_event_recorder_hook = true; } +void DisableHostEventRecorder() { + FLAGS_enable_host_event_recorder_hook = false; +} + std::string PrintHostEvents() { std::ostringstream oss; auto host_evt_sec = HostEventRecorder::GetInstance().GatherEvents(); diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 122e19b7c28..78275341cbb 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -216,6 +216,7 @@ void NvprofEnableRecordEvent(); void NvprofDisableRecordEvent(); void EnableHostEventRecorder(); +void DisableHostEventRecorder(); // Defined for UT std::string PrintHostEvents(); diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 5e61133510d..7ff501ef43d 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -2,7 +2,7 @@ set(PYBIND_DEPS init pybind python proto_desc memory executor fleet_wrapper box_ feed_fetch_method pass generate_pass pass_builder parallel_executor profiler layer tracer engine scope_pool analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator - cost_model cuda_graph_with_memory_pool fleet_executor global_utils phi_utils tcp_store) + cost_model cuda_graph_with_memory_pool fleet_executor global_utils phi_utils tcp_store new_profiler) if (WITH_PSCORE) set(PYBIND_DEPS ${PYBIND_DEPS} ps_service) diff --git a/paddle/fluid/pybind/pybind.cc 
b/paddle/fluid/pybind/pybind.cc index 0a1cf604d2e..fcfc3e6a379 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -78,6 +78,9 @@ limitations under the License. */ #include "paddle/fluid/platform/monitor.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_python.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler/profiler.h" #include "paddle/fluid/pybind/cuda_streams_py.h" #include "paddle/fluid/pybind/distributed_py.h" #include "paddle/fluid/pybind/eager.h" @@ -2913,6 +2916,88 @@ All parameter, weight, gradient are variables in Paddle. }); m.def("size_of_dtype", framework::SizeOfType); + py::class_(m, "_ProfilerResult") + .def(py::init<>()) + .def("get_data", &paddle::platform::ProfilerResult::GetData, + py::return_value_policy::automatic_reference) + .def("save", &paddle::platform::ProfilerResult::Save) + .def("get_extra_info", &paddle::platform::ProfilerResult::GetExtraInfo); + + py::class_(m, "DevicePythonNode") + .def(py::init<>()) + .def_readwrite("name", &paddle::platform::DevicePythonNode::name) + .def_readwrite("type", &paddle::platform::DevicePythonNode::type) + .def_readwrite("start_ns", &paddle::platform::DevicePythonNode::start_ns) + .def_readwrite("end_ns", &paddle::platform::DevicePythonNode::end_ns) + .def_readwrite("device_id", + &paddle::platform::DevicePythonNode::device_id) + .def_readwrite("context_id", + &paddle::platform::DevicePythonNode::context_id) + .def_readwrite("stream_id", + &paddle::platform::DevicePythonNode::stream_id); + + py::class_(m, "HostPythonNode") + .def(py::init<>()) + .def_readwrite("name", &paddle::platform::HostPythonNode::name) + .def_readwrite("type", &paddle::platform::HostPythonNode::type) + .def_readwrite("start_ns", &paddle::platform::HostPythonNode::start_ns) + .def_readwrite("end_ns", &paddle::platform::HostPythonNode::end_ns) + .def_readwrite("process_id", + &paddle::platform::HostPythonNode::process_id) + .def_readwrite("thread_id", &paddle::platform::HostPythonNode::thread_id) + .def_readwrite("children_node", + &paddle::platform::HostPythonNode::children_node_ptrs) + .def_readwrite("runtime_node", + &paddle::platform::HostPythonNode::runtime_node_ptrs) + .def_readwrite("device_node", + &paddle::platform::HostPythonNode::device_node_ptrs); + + py::class_(m, "_Profiler") + .def("create", &paddle::platform::Profiler::Create, + py::return_value_policy::take_ownership) + .def("prepare", + [](paddle::platform::Profiler *profiler) { + platform::EnableHostEventRecorder(); + profiler->Prepare(); + }) + .def("start", &paddle::platform::Profiler::Start) + .def("stop", + [](paddle::platform::Profiler *profiler) { + platform::DisableHostEventRecorder(); + return profiler->Stop(); + }, + py::return_value_policy::automatic_reference); + + py::class_(m, "ProfilerOptions") + .def(py::init<>()) + .def_readwrite("trace_switch", + &paddle::platform::ProfilerOptions::trace_switch); + + py::class_(m, "_RecordEvent") + .def(py::init([](std::string name, platform::TracerEventType type) { + return std::make_unique( + name, type, 1, paddle::platform::EventRole::kOrdinary); + })) + .def("end", [](platform::RecordEvent *event) { event->End(); }); + + py::enum_(m, "TracerEventType") + .value("Operator", paddle::platform::TracerEventType::Operator) + .value("Dataloader", paddle::platform::TracerEventType::Dataloader) + .value("ProfileStep", paddle::platform::TracerEventType::ProfileStep) + 
.value("CudaRuntime", paddle::platform::TracerEventType::CudaRuntime) + .value("Kernel", paddle::platform::TracerEventType::Kernel) + .value("Memcpy", paddle::platform::TracerEventType::Memcpy) + .value("Memset", paddle::platform::TracerEventType::Memset) + .value("UserDefined", paddle::platform::TracerEventType::UserDefined) + .value("OperatorInner", paddle::platform::TracerEventType::OperatorInner) + .value("Forward", paddle::platform::TracerEventType::Forward) + .value("Backward", paddle::platform::TracerEventType::Backward) + .value("Optimization", paddle::platform::TracerEventType::Optimization) + .value("Communication", paddle::platform::TracerEventType::Communication) + .value("PythonOp", paddle::platform::TracerEventType::PythonOp) + .value("PythonUserDefined", + paddle::platform::TracerEventType::PythonUserDefined); + m.def("load_profiler_result", &paddle::platform::LoadProfilerResult); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("set_cublas_switch", platform::SetAllowTF32Cublas); diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index 5e023e9248c..617ab630528 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -283,6 +283,7 @@ if avx_supported(): from .core_avx import _set_cached_executor_build_strategy from .core_avx import _device_synchronize from .core_avx import _get_current_stream + from .core_avx import _Profiler, _ProfilerResult, _RecordEvent from .core_avx import _set_current_stream if sys.platform != 'win32': from .core_avx import _set_process_pids @@ -344,6 +345,7 @@ if load_noavx: from .core_noavx import _device_synchronize from .core_noavx import _get_current_stream from .core_noavx import _set_current_stream + from .core_noavx import _Profiler, _ProfilerResult, _RecordEvent if sys.platform != 'win32': from .core_noavx import _set_process_pids from .core_noavx import _erase_process_pids diff --git a/python/paddle/fluid/tests/unittests/test_newprofiler.py b/python/paddle/fluid/tests/unittests/test_newprofiler.py new file mode 100755 index 00000000000..12fb0fa61b0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_newprofiler.py @@ -0,0 +1,129 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np + +import paddle +import paddle.profiler as profiler + + +class TestProfiler(unittest.TestCase): + def test_profiler(self): + def my_trace_back(prof): + profiler.export_chrome_tracing('./test_profiler_chrometracing/')( + prof) + profiler.export_protobuf('./test_profiler_pb/')(prof) + + x_value = np.random.randn(2, 3, 3) + x = paddle.to_tensor( + x_value, stop_gradient=False, place=paddle.CPUPlace()) + y = x / 2.0 + ones_like_y = paddle.ones_like(y) + with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU], ) as prof: + y = x / 2.0 + prof = None + with profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU], + scheduler=(1, 2)) as prof: + with profiler.RecordEvent(name='test'): + y = x / 2.0 + prof = None + with profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU], + scheduler=profiler.make_scheduler( + closed=0, ready=1, record=1, repeat=1), + on_trace_ready=my_trace_back) as prof: + y = x / 2.0 + prof = None + with profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU], + scheduler=profiler.make_scheduler( + closed=0, ready=0, record=2, repeat=1), + on_trace_ready=my_trace_back) as prof: + for i in range(3): + y = x / 2.0 + prof.step() + prof = None + with profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU], + scheduler=lambda x: profiler.ProfilerState.RECORD_AND_RETURN, + on_trace_ready=my_trace_back) as prof: + for i in range(2): + y = x / 2.0 + prof.step() + + def my_sheduler(num_step): + if num_step % 5 < 2: + return profiler.ProfilerState.RECORD_AND_RETURN + elif num_step % 5 < 3: + return profiler.ProfilerState.READY + elif num_step % 5 < 4: + return profiler.ProfilerState.RECORD + else: + return profiler.ProfilerState.CLOSED + + def my_sheduler1(num_step): + if num_step % 5 < 2: + return profiler.ProfilerState.RECORD + elif num_step % 5 < 3: + return profiler.ProfilerState.READY + elif num_step % 5 < 4: + return profiler.ProfilerState.RECORD + else: + return profiler.ProfilerState.CLOSED + + prof = None + with profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU], + scheduler=lambda x: profiler.ProfilerState.RECORD_AND_RETURN, + on_trace_ready=my_trace_back) as prof: + for i in range(2): + y = x / 2.0 + prof.step() + prof = None + with profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU], + scheduler=my_sheduler, + on_trace_ready=my_trace_back) as prof: + for i in range(5): + y = x / 2.0 + prof.step() + prof = None + with profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU], + scheduler=my_sheduler1) as prof: + for i in range(5): + y = x / 2.0 + prof.step() + prof = None + with profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU], + scheduler=profiler.make_scheduler( + closed=1, ready=1, record=2, repeat=1, skip_first=1), + on_trace_ready=my_trace_back) as prof: + for i in range(5): + y = x / 2.0 + paddle.grad(outputs=y, inputs=[x], grad_outputs=ones_like_y) + prof.step() + + prof.export(path='./test_profiler_pb.pb', format='pb') + prof.summary() + result = profiler.utils.load_profiler_result('./test_profiler_pb.pb') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/profiler/__init__.py b/python/paddle/profiler/__init__.py new file mode 100644 index 00000000000..4999e703f2a --- /dev/null +++ b/python/paddle/profiler/__init__.py @@ -0,0 +1,26 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .profiler import ProfilerState, ProfilerTarget +from .profiler import make_scheduler, export_chrome_tracing, export_protobuf +from .profiler import Profiler +from .profiler import TracerEventType +from .utils import RecordEvent, load_profiler_result +from .profiler_statistic import SortedKeys + +__all__ = [ + 'ProfilerState', 'ProfilerTarget', 'TracerEventType', 'make_scheduler', + 'export_chrome_tracing', 'export_protobuf', 'Profiler', 'RecordEvent', + 'load_profiler_result', 'SortedKeys' +] diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py new file mode 100644 index 00000000000..dc637bf9830 --- /dev/null +++ b/python/paddle/profiler/profiler.py @@ -0,0 +1,469 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import socket +import datetime +from enum import Enum +from typing import Any, Callable, Iterable, Optional, Union +from warnings import warn + +import paddle +from paddle.fluid.core import (_Profiler, _ProfilerResult, ProfilerOptions, + TracerEventType) + +from .utils import RecordEvent, wrap_optimizers +from .profiler_statistic import SortedKeys + + +class ProfilerState(Enum): + r""" + Profiler state that can be specified to control profiler action. + + CLOSED: The profilers are closed. + READY: The profilers are open, but the data will not be recorded. + This state is used for reducing overhead influence when profilers start. + RECORD: The profilers are open, and the data will be recorded. + RECORD_AND_RETURN: The profilers are open, and at the last batch of current profiler period, + the collected data will be returned. + """ + CLOSED = 0 + READY = 1 + RECORD = 2 + RECORD_AND_RETURN = 3 # the last step of RECORD + + +class ProfilerTarget(Enum): + r""" + Target device for profiling. + """ + CPU = 0 + GPU = 1 + + +def make_scheduler(*, + closed: int, + ready: int, + record: int, + repeat: int=0, + skip_first: int=0) -> Callable: + r""" + Return a scheduler function, which scheduler the state according to the setting. + The state transform confirms to: + + (CLOSED) (CLOSED) (CLOSED) (READY) (RECORD,last RETURN) (CLOSED) + START -> skip_first -> closed -> ready -> record -> END + | | + | | (if has_repeated < repeat) + - - - - - - - - - - - - + Note that repeat <= 0 means the cycle will continue until the profiler exits. + + Parameters: + closed(int): The number of steps in state ProfilerState.CLOSED. 
+ ready(int): The number of steps in state ProfilerState.READY. + record(int): The number of steps in state ProfilerState.RECORD. + repeat(int): The number of cycles to repeat above state transform. + skip_first(int): The number of first steps to drop, not participate in the state transform. + + Returns: + A scheduler function, conforms to above state transform setting. + + Examples: + 1. profiling range [2, 5] + batch 0: closed, batch 1: ready, batch [2, 5] record + .. code-block:: python + make_scheduler(closed=1, ready=1, record=4, repeat=1) + 2. profiling range [3,6], [9,12], [15,18]... + batch 0: skiped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat + .. code-block:: python + make_scheduler(closed=1, ready=1, record=4, skip_first=1) + """ + + def getScheduleState(step: int) -> ProfilerState: + assert step >= 0 + if step < skip_first: # within skip_first, just skip + return ProfilerState.CLOSED + step = step - skip_first + period_steps = closed + ready + record + has_repeated = step // period_steps + if repeat > 0 and has_repeated >= repeat: # the period has repeated repeat times, return CLOSED state + return ProfilerState.CLOSED + mod_step = step % period_steps + if mod_step < closed: + return ProfilerState.CLOSED + elif mod_step >= closed and mod_step < closed + ready: + return ProfilerState.READY + else: + if mod_step < period_steps - 1: + return ProfilerState.RECORD + else: + return ProfilerState.RECORD_AND_RETURN + assert closed >= 0 and ready >= 0 and record > 0 and \ + repeat >= 0 and skip_first >= 0, "Invalid profiler scheduler arguments" + if ready == 0: + warn("Profiler will record data after enabling profiler immediately, \ + some data collected at the beginning of profiling may be 'noisy' because of overhead." + ) + return getScheduleState + + +def _default_state_scheduler(step: int): + r""" + A default state scheduler, keep recording from the begining of the profiler until ending. + """ + return ProfilerState.RECORD + + +def export_chrome_tracing(dir_name: str, + worker_name: Optional[str]=None) -> Callable: + r""" + Return a callable, used for outputing tracing data to chrome tracing format file. + The output file will be saved in directory 'dir_name', and file name will be set as worker_name. + if worker_name is not set, the default name is [hostname]_[pid]. + + Parameters: + dir_name(str): Directory to save profiling data. + worker_name(Optional[str]): Prefix of the file name saved, default is [hostname]_[pid]. + + Examples: + .. code-block:: python + import paddle.profiler as profiler + with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU, + profiler.ProfilerTarget.GPU], + scheduler = (3, 10), + on_trace_ready = profiler.export_chrome_tracing('./log') + ) as p: + for iter in range(N): + train() + p.step() + """ + if not os.path.exists(dir_name): + try: + os.makedirs(dir_name, exist_ok=True) + except Exception: + raise RuntimeError( + "Can not create directory '{}' for saving profiling results.". + format(dir_name)) + + def handle_fn(prof): + nonlocal worker_name + if not worker_name: + worker_name = "host_{}pid_{}".format(socket.gethostname(), + str(os.getpid())) + now = datetime.datetime.now() + filename = '{}_time_{}.paddle_trace.json'.format( + worker_name, now.strftime('%Y_%m_%d_%H_%M_%S_%f')) + prof.export(os.path.join(dir_name, filename), "json") + + return handle_fn + + +def export_protobuf(dir_name: str, worker_name: Optional[str]=None) -> Callable: + r""" + Return a callable, used for outputing tracing data to protobuf file. 
+ The output file will be saved in directory 'dir_name', and file name will be set as worker_name. + if worker_name is not set, the default name is [hostname]_[pid]. + + Parameters: + dir_name(str): Directory to save profiling data. + worker_name(Optional[str]): Prefix of the file name saved, default is [hostname]_[pid]. + + Examples: + .. code-block:: python + import paddle.profiler as profiler + with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU, + profiler.ProfilerTarget.GPU], + scheduler = (3, 10), + on_trace_ready = profiler.export_protobuf('./log') + ) as p: + for iter in range(N): + train() + p.step() + """ + if not os.path.exists(dir_name): + try: + os.makedirs(dir_name, exist_ok=True) + except Exception: + raise RuntimeError( + "Can not create directory '{}' for saving profiling results.". + format(dir_name)) + + def handle_fn(prof): + nonlocal worker_name + if not worker_name: + worker_name = "host_{}pid_{}".format(socket.gethostname(), + str(os.getpid())) + now = datetime.datetime.now() + filename = '{}_time_{}.paddle_trace.pb'.format( + worker_name, now.strftime('%Y_%m_%d_%H_%M_%S_%f')) + prof.export(os.path.join(dir_name, filename), "pb") + + return handle_fn + + +def _get_supported_targets() -> Iterable[ProfilerTarget]: + r""" + Get the current supported profiler target in the system. + """ + if paddle.device.is_compiled_with_cuda(): + return [ProfilerTarget.CPU, ProfilerTarget.GPU] + return [ProfilerTarget.CPU] + + +class Profiler: + r""" + Profiler context manager, user interface to manage profile process. + + Parameters: + targets (iterable): list of tracing targets, currently supported values: + ``paddle.profiler.ProfilerTarget.CPU``, + ``paddle.profiler.ProfilerTarget.GPU``. + scheduler (callable or tuple): If it is a callable object, it takes a step number as parameter and return the corresponding ``ProfilerState``. + If not provided, the default sheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch, + which means profiling range [start_batch, end_batch). + on_trace_ready (callable): callable object, takes the Profiler object as parameter, which provides a way for users to do post-processing. + This callable object will be called when ``sheduler`` returns ``ProfilerState.RECORD_AND_RETURN``. + + Examples: + 1. profiling range [2, 5) + .. code-block:: python + import paddle.profiler as profiler + with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU, + profiler.ProfilerTarget.GPU], + scheduler = (2, 5), + on_trace_ready = profiler.export_chrome_tracing('./log') + ) as p: + for iter in range(N): + train() + p.step() + 2. profiling range [2,4], [7, 9], [11,13] + .. code-block:: python + import paddle.profiler as profiler + with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU, + profiler.ProfilerTarget.GPU], + scheduler = profiler.make_scheduler(closed=1, ready=1, record=3, repeat=3), + on_trace_ready = profiler.export_chrome_tracing('./log') + ) as p: + for iter in range(N): + train() + p.step() + 3. Use profiler without context manager, and use default parameters + .. 
code-block:: python + import paddle.profiler as profiler + p = profiler.Profiler() + p.start() + for iter in range(N): + train() + p.step() + p.stop() + p.summary() + """ + + def __init__( + self, + *, + targets: Optional[Iterable[ProfilerTarget]]=None, + scheduler: Union[Callable[[int], ProfilerState], tuple, None]=None, + on_trace_ready: Optional[Callable[..., Any]]=None): + supported_targets = _get_supported_targets() + if targets: + self.targets = set(targets) + for target in targets: + if target not in supported_targets: + self.targets.remove(target) + warn("Profiling {} is not supported in current context.". + format(target)) + else: + self.targets = supported_targets + profileoption = ProfilerOptions() + if ProfilerTarget.CPU in self.targets: + profileoption.trace_switch |= 1 + if ProfilerTarget.GPU in self.targets: + profileoption.trace_switch |= (1 << 1) + wrap_optimizers() + self.profiler = _Profiler.create(profileoption) + if callable(scheduler): + self.scheduler = scheduler + elif isinstance(scheduler, (tuple, list)): + assert len(scheduler) == 2 and scheduler[1] > scheduler[0] + start_batch, end_batch = scheduler + start_batch = max(start_batch, 0) + if start_batch >= 1: + self.scheduler = make_scheduler( + closed=max(start_batch - 1, 0), + ready=1, + record=(end_batch - start_batch), + repeat=1) + else: + self.scheduler = make_scheduler( + closed=0, + ready=0, + record=(end_batch - start_batch), + repeat=1) + else: + self.scheduler = _default_state_scheduler + + if on_trace_ready == None: + self.on_trace_ready = export_chrome_tracing('./profiler_log/') + else: + self.on_trace_ready = on_trace_ready + self.step_num = 0 + self.previous_state = ProfilerState.CLOSED + self.current_state = self.scheduler(self.step_num) + self.record_event = None + self.profiler_result = None + + def __enter__(self): + self.start() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.stop() + + def start(self): + r''' + Start profiler and enter the first profiler step(0). + State transformed from CLOSED to self.current_state and trigger corresponding action. + ''' + # CLOSED -> self.current_state + if self.current_state == ProfilerState.READY: + self.profiler.prepare() + elif self.current_state == ProfilerState.RECORD: + self.profiler.prepare() + self.profiler.start() + elif self.current_state == ProfilerState.RECORD_AND_RETURN: + self.profiler.prepare() + self.profiler.start() + self.record_event = RecordEvent( + name="ProfileStep#{}".format(self.step_num), + event_type=TracerEventType.ProfileStep) + self.record_event.begin() + + def stop(self): + r''' + Stop profiler and State transformed from self.current_state to CLOSED. + Trigger corresponding action and post-process profiler result using self.on_trace_ready if result exists. + ''' + # self.current_state -> CLOSED + # In this situation, RECORD state is regarded as RECORD_AND_RETURN + if self.record_event: + self.record_event.end() + self.record_event = None + if self.current_state == ProfilerState.READY: + warn( + "Inproper Profiler state transform: READY->CLOSED, profiler will start and stop without saving data" + ) + self.profiler.start() + self.profiler.stop() + if self.current_state == ProfilerState.RECORD or self.current_state == ProfilerState.RECORD_AND_RETURN: + self.profiler_result = self.profiler.stop() + if self.on_trace_ready: + self.on_trace_ready(self) + + def step(self): + r""" + Signals the profiler that the next profiling step has started. + Get the new ProfilerState and trigger corresponding action. 
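+
+        Examples:
+            .. code-block:: python
+                # same usage as in the class-level examples above
+                for iter in range(N):
+                    train()
+                    p.step()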
+ """ + if self.record_event: + self.record_event.end() + self.record_event = None + self.previous_state = self.current_state + self.step_num += 1 + self.current_state = self.scheduler(self.step_num) + self._trigger_action() + self.record_event = RecordEvent( + name="ProfileStep#{}".format(self.step_num), + event_type=TracerEventType.ProfileStep) + self.record_event.begin() + + def _trigger_action(self): + if self.previous_state == ProfilerState.CLOSED: + if self.current_state == ProfilerState.READY: # CLOSED -> READY + self.profiler.prepare() + if self.current_state == ProfilerState.RECORD: # CLOSED -> RECORD + self.profiler.prepare() + self.profiler.start() + if self.current_state == ProfilerState.RECORD_AND_RETURN: # CLOSED -> RECORD_AND_RETURN + self.profiler.prepare() + self.profiler.start() + + elif self.previous_state == ProfilerState.READY: + if self.current_state == ProfilerState.CLOSED: # READY -> CLOSED + warn( + "Improper schedule: READY->CLOSED, profiler will start and stop without saving data" + ) + self.profiler.start() + self.profiler.stop() + if self.current_state == ProfilerState.RECORD: # READY -> RECORD + self.profiler.start() + if self.current_state == ProfilerState.RECORD_AND_RETURN: # READY -> RECORD_AND_RETURN + self.profiler.start() + + elif self.previous_state == ProfilerState.RECORD: + if self.current_state == ProfilerState.CLOSED: # RECORD -> CLOSED + warn( + "Improper schedule: RECORD->CLOSED, profiler will not saving data" + ) + self.profiler.stop() + + if self.current_state == ProfilerState.READY: # RECORD -> READY + warn( + "Improper schedule: RECORD->READY, profiler will stop and re-prepare" + ) + self.profiler.stop() + self.profiler.prepare() + if self.current_state == ProfilerState.RECORD_AND_RETURN: # RECORD -> RECORD_AND_RETURN + pass + + else: + assert self.previous_state == ProfilerState.RECORD_AND_RETURN + if self.current_state == ProfilerState.CLOSED: # RECORD_AND_RETURN -> CLOSED + self.profiler_result = self.profiler.stop() + if self.current_state == ProfilerState.READY: # RECORD_AND_RETURN -> READY + self.profiler_result = self.profiler.stop() + self.profiler.prepare() + if self.current_state == ProfilerState.RECORD: # RECORD_AND_RETURN -> RECORD + self.profiler_result = self.profiler.stop() + self.profiler.prepare() + self.profiler.start() + if self.current_state == ProfilerState.RECORD_AND_RETURN: # RECORD_AND_RETURN -> RECORD_AND_RETURN + self.profiler_result = self.profiler.stop() + self.profiler.prepare() + self.profiler.start() + if self.on_trace_ready: + self.on_trace_ready(self) + + def export(self, path="", format="json"): + r""" + Exports the tracing data in Chrome tracing data format. + """ + if self.profiler_result: + self.profiler_result.save(path, format) + + def summary(self, + sorted_by=SortedKeys.CPUTotal, + op_detail=True, + thread_sep=False, + time_unit='ms'): + r""" + Print the Summary table. + + Parameters: + sorted_by: how to rank the op table items. + detail: expand each operator detail information. + thread_sep: print op table each thread. + time_unit: can be chosen form ['s', 'ms', 'us', 'ns'] + """ + pass diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py new file mode 100644 index 00000000000..29d586268a0 --- /dev/null +++ b/python/paddle/profiler/profiler_statistic.py @@ -0,0 +1,31 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import collections +from enum import Enum + +from paddle.fluid.core import TracerEventType + + +class SortedKeys(Enum): + r""" + Sorted keys for printing summary table. + """ + CPUTotal = 0 + CPUAvg = 1 + CPUMax = 2 + CPUMin = 3 + GPUTotal = 4 + GPUAvg = 5 + GPUMax = 6 + GPUMin = 7 diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py new file mode 100644 index 00000000000..642001dfbfc --- /dev/null +++ b/python/paddle/profiler/utils.py @@ -0,0 +1,90 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid.core import (_RecordEvent, TracerEventType, + load_profiler_result) +from typing import Any +from warnings import warn +import functools +from contextlib import ContextDecorator + +_AllowedEventTypeList = [ + TracerEventType.Dataloader, TracerEventType.ProfileStep, + TracerEventType.UserDefined, TracerEventType.Forward, + TracerEventType.Backward, TracerEventType.Optimization, + TracerEventType.PythonOp, TracerEventType.PythonUserDefined +] + + +class RecordEvent(ContextDecorator): + r""" + Interface for recording a time range. + + Parameters: + name(str): Name of the record event + event_type(TracerEventType): Type of the record event, can be used for statistics. + + Examples: + .. 
code-block:: python + import paddle.profiler as profiler + with profiler.RecordEvent(name='op1', event_type=TracerEventType=TracerEventType.UserDefined): + op1() + """ + + def __init__(self, + name: str, + event_type: TracerEventType=TracerEventType.UserDefined): + self.name = name + self.event_type = event_type + self.event = None + + def __enter__(self): + self.begin() + return self + + def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any): + self.end() + + def begin(self): + if self.event_type not in _AllowedEventTypeList: + warn("Only TracerEvent Type in [{}, {}, {}, {}, {}, {},{}]\ + can be recorded.".format(*_AllowedEventTypeList)) + self.event = None + else: + if self.event_type == TracerEventType.UserDefined: + self.event_type == TracerEventType.PythonUserDefined + self.event = _RecordEvent(self.name, self.event_type) + + def end(self): + if self.event: + self.event.end() + + +def wrap_optimizers(): + def optimizer_warpper(func): + @functools.wraps(func) + def warpper(*args, **kwargs): + with RecordEvent( + 'Optimization Step', + event_type=TracerEventType.Optimization): + return func(*args, **kwargs) + + return warpper + + import paddle.optimizer as optimizer + for classname in optimizer.__all__: + if classname != 'Optimizer': + classobject = getattr(optimizer, classname) + if getattr(classobject, 'step', None) != None: + classobject.step = optimizer_warpper(classobject.step) diff --git a/python/setup.py.in b/python/setup.py.in index 0bc32cfbc00..118f617361f 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -372,6 +372,7 @@ packages=['paddle', 'paddle.device', 'paddle.device.cuda', 'paddle.version', + 'paddle.profiler' ] with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f: -- GitLab From 1f857cb966191e3e43de7950918595a6a4ca2db2 Mon Sep 17 00:00:00 2001 From: chenjian Date: Tue, 8 Mar 2022 09:55:54 +0800 Subject: [PATCH 094/261] add profiler statistic helper (#40111) * add profiler helper * fix unittest * improve test coverage rate --- .../unittests/test_newprofiler_helper.py | 137 +++++++++++ python/paddle/profiler/statistic_helper.py | 225 ++++++++++++++++++ 2 files changed, 362 insertions(+) create mode 100755 python/paddle/fluid/tests/unittests/test_newprofiler_helper.py create mode 100644 python/paddle/profiler/statistic_helper.py diff --git a/python/paddle/fluid/tests/unittests/test_newprofiler_helper.py b/python/paddle/fluid/tests/unittests/test_newprofiler_helper.py new file mode 100755 index 00000000000..05e79200354 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_newprofiler_helper.py @@ -0,0 +1,137 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
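+
+# These tests pin down the expected behaviour of the range helpers in
+# python/paddle/profiler/statistic_helper.py: summing, merging,
+# intersecting and subtracting lists of (start, end) pairs.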
+ +from __future__ import print_function + +import unittest + +import paddle.profiler.statistic_helper as statistic_helper + + +class TestStatisticHelper(unittest.TestCase): + def test_sum_ranges_case1(self): + src = [(1, 3), (4, 10), (11, 15)] + self.assertEqual(statistic_helper.sum_ranges(src), 12) + + def test_sum_ranges_case2(self): + src = [(3, 3), (5, 5), (7, 7)] + self.assertEqual(statistic_helper.sum_ranges(src), 0) + + def test_merge_self_ranges_case1(self): + src = [(1, 5), (2, 7), (4, 9), (14, 19)] + dst = statistic_helper.merge_self_ranges(src) + self.assertEqual(dst, [(1, 9), (14, 19)]) + src = [(4, 9), (14, 19), (1, 5), (2, 7)] + dst = statistic_helper.merge_self_ranges(src) + self.assertEqual(dst, [(1, 9), (14, 19)]) + + def test_merge_self_ranges_case2(self): + src = [(1, 1), (2, 3), (4, 7), (5, 12)] + dst = statistic_helper.merge_self_ranges(src) + self.assertEqual(dst, [(1, 1), (2, 3), (4, 12)]) + src = [(5, 12), (1, 1), (2, 3), (4, 7)] + dst = statistic_helper.merge_self_ranges(src) + self.assertEqual(dst, [(1, 1), (2, 3), (4, 12)]) + + def test_merge_ranges_case1(self): + src1 = [(1, 2), (5, 7), (9, 14)] + src2 = [(1, 2), (4, 9), (13, 15)] + dst = statistic_helper.merge_ranges(src1, src2) + self.assertEqual(dst, [(1, 2), (4, 15)]) + dst = statistic_helper.merge_ranges(src1, src2, True) + self.assertEqual(dst, [(1, 2), (4, 15)]) + src1 = [] + src2 = [] + dst = statistic_helper.merge_ranges(src1, src2, True) + self.assertEqual(dst, []) + src1 = [(1, 2), (3, 5)] + src2 = [] + dst = statistic_helper.merge_ranges(src1, src2, True) + self.assertEqual(dst, src1) + src1 = [] + src2 = [(1, 2), (3, 5)] + dst = statistic_helper.merge_ranges(src1, src2, True) + self.assertEqual(dst, src2) + src1 = [(3, 4), (1, 2), (17, 19)] + src2 = [(6, 9), (13, 15)] + dst = statistic_helper.merge_ranges(src1, src2) + self.assertEqual(dst, [(1, 2), (3, 4), (6, 9), (13, 15), (17, 19)]) + dst = statistic_helper.merge_ranges(src2, src1) + self.assertEqual(dst, [(1, 2), (3, 4), (6, 9), (13, 15), (17, 19)]) + src1 = [(1, 2), (5, 9), (12, 13)] + src2 = [(6, 8), (9, 15)] + dst = statistic_helper.merge_ranges(src1, src2) + self.assertEqual(dst, [(1, 2), (5, 15)]) + dst = statistic_helper.merge_ranges(src2, src1) + self.assertEqual(dst, [(1, 2), (5, 15)]) + + def test_merge_ranges_case2(self): + src1 = [(3, 4), (1, 2), (9, 14)] + src2 = [(6, 9), (13, 15)] + dst = statistic_helper.merge_ranges(src1, src2) + self.assertEqual(dst, [(1, 2), (3, 4), (6, 15)]) + src2 = [(9, 14), (1, 2), (5, 7)] + src1 = [(4, 9), (1, 2), (13, 15)] + dst = statistic_helper.merge_ranges(src1, src2) + self.assertEqual(dst, [(1, 2), (4, 15)]) + + def test_intersection_ranges_case1(self): + src1 = [(1, 7), (9, 12), (14, 18)] + src2 = [(3, 8), (10, 13), (15, 19)] + dst = statistic_helper.intersection_ranges(src1, src2) + self.assertEqual(dst, [(3, 7), (10, 12), (15, 18)]) + dst = statistic_helper.intersection_ranges(src1, src2, True) + self.assertEqual(dst, [(3, 7), (10, 12), (15, 18)]) + src1 = [] + src2 = [] + dst = statistic_helper.intersection_ranges(src1, src2, True) + self.assertEqual(dst, []) + src1 = [(3, 7), (10, 12)] + src2 = [(2, 9), (11, 13), (15, 19)] + dst = statistic_helper.intersection_ranges(src1, src2) + self.assertEqual(dst, [(3, 7), (11, 12)]) + dst = statistic_helper.intersection_ranges(src2, src1) + self.assertEqual(dst, [(3, 7), (11, 12)]) + + def test_intersection_ranges_case2(self): + src2 = [(9, 12), (1, 7), (14, 18)] + src1 = [(10, 13), (3, 8), (15, 19), (20, 22)] + dst = 
statistic_helper.intersection_ranges(src1, src2) + self.assertEqual(dst, [(3, 7), (10, 12), (15, 18)]) + src2 = [(1, 7), (14, 18), (21, 23)] + src1 = [(6, 9), (10, 13)] + dst = statistic_helper.intersection_ranges(src1, src2, True) + self.assertEqual(dst, [(6, 7)]) + + def test_subtract_ranges_case1(self): + src1 = [(1, 10), (12, 15)] + src2 = [(3, 7), (9, 11)] + dst = statistic_helper.subtract_ranges(src1, src2, True) + self.assertEqual(dst, [(1, 3), (7, 9), (12, 15)]) + src1 = [(1, 10), (12, 15)] + src2 = [] + dst = statistic_helper.subtract_ranges(src1, src2, True) + self.assertEqual(dst, src1) + dst = statistic_helper.subtract_ranges(src2, src1, True) + self.assertEqual(dst, src2) + + def test_subtract_ranges_case2(self): + src2 = [(12, 15), (1, 10)] + src1 = [(9, 11), (3, 7)] + dst = statistic_helper.subtract_ranges(src1, src2) + self.assertEqual(dst, [(10, 11)]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/profiler/statistic_helper.py b/python/paddle/profiler/statistic_helper.py new file mode 100644 index 00000000000..1f11649928a --- /dev/null +++ b/python/paddle/profiler/statistic_helper.py @@ -0,0 +1,225 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
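+
+# Range arithmetic helpers used by the profiler statistics. A range list is
+# a list of (start, end) timestamp pairs; the functions below compute the
+# total covered length (sum_ranges), the union (merge_self_ranges and
+# merge_ranges), the intersection (intersection_ranges) and the difference
+# (subtract_ranges) of such lists. Expected results are pinned down by
+# test_newprofiler_helper.py, e.g. sum_ranges([(1, 3), (4, 10), (11, 15)])
+# == 12 and intersection_ranges([(1, 7), (9, 12), (14, 18)],
+# [(3, 8), (10, 13), (15, 19)]) == [(3, 7), (10, 12), (15, 18)].
+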
+import collections + + +def sum_ranges(ranges): + result = 0 + for time_range in ranges: + result += (time_range[1] - time_range[0]) + return result + + +def merge_self_ranges(src_ranges, is_sorted=False): + merged_ranges = [] + if len(src_ranges) > 0: + if not is_sorted: + src_ranges.sort(key=lambda x: x[0]) + cur_indx = 0 + merged_ranges.append((src_ranges[cur_indx][0], src_ranges[cur_indx][1])) + for cur_indx in range(1, len(src_ranges)): + if src_ranges[cur_indx][1] > merged_ranges[-1][1]: + if src_ranges[cur_indx][0] <= merged_ranges[-1][1]: + merged_ranges[-1] = (merged_ranges[-1][0], + src_ranges[cur_indx][1]) + else: + merged_ranges.append( + (src_ranges[cur_indx][0], src_ranges[cur_indx][1])) + return merged_ranges + + +def merge_ranges(range_list1, range_list2, is_sorted=False): + merged_ranges = [] + if not is_sorted: + range_list1 = merge_self_ranges(range_list1) + range_list2 = merge_self_ranges(range_list2) + len1 = len(range_list1) + len2 = len(range_list2) + if len1 == 0 and len2 == 0: + return merged_ranges + elif len1 == 0: + return range_list2 + elif len2 == 0: + return range_list1 + else: + indx1 = 0 + indx2 = 0 + range1 = range_list1[indx1] + range2 = range_list2[indx2] + if range1[0] < range2[0]: + merged_ranges.append(range1) + indx1 += 1 + else: + merged_ranges.append(range2) + indx2 += 1 + while indx1 < len1 and indx2 < len2: + range1 = range_list1[indx1] + range2 = range_list2[indx2] + if range1[0] < range2[0]: + if range1[1] > merged_ranges[-1][1]: + if range1[0] <= merged_ranges[-1][1]: + merged_ranges[-1] = (merged_ranges[-1][0], range1[1]) + else: + merged_ranges.append((range1[0], range1[1])) + indx1 += 1 + else: + indx1 += 1 + else: + if range2[1] > merged_ranges[-1][1]: + if range2[0] <= merged_ranges[-1][1]: + merged_ranges[-1] = (merged_ranges[-1][0], range2[1]) + else: + merged_ranges.append((range2[0], range2[1])) + indx2 += 1 + else: + indx2 += 1 + + while indx1 < len1: + range1 = range_list1[indx1] + if range1[1] > merged_ranges[-1][1]: + if range1[0] <= merged_ranges[-1][1]: + merged_ranges[-1] = (merged_ranges[-1][0], range1[1]) + else: + merged_ranges.append((range1[0], range1[1])) + indx1 += 1 + else: + indx1 += 1 + while indx2 < len2: + if range2[1] > merged_ranges[-1][1]: + if range2[0] <= merged_ranges[-1][1]: + merged_ranges[-1] = (merged_ranges[-1][0], range2[1]) + else: + merged_ranges.append((range2[0], range2[1])) + indx2 += 1 + else: + indx2 += 1 + return merged_ranges + + +def intersection_ranges(range_list1, range_list2, is_sorted=False): + result_range = [] + if len(range_list1) == 0 or len(range_list2) == 0: + return result_range + if not is_sorted: + range_list1 = merge_self_ranges(range_list1) + range_list2 = merge_self_ranges(range_list2) + + len1 = len(range_list1) + len2 = len(range_list2) + indx1 = 0 + indx2 = 0 + range1 = range_list1[indx1] + range2 = range_list2[indx2] + while indx1 < len1 and indx2 < len2: + if range2[1] <= range1[0]: + indx2 += 1 + if indx2 == len2: + break + range2 = range_list2[indx2] + + elif range2[0] <= range1[0] and range2[1] < range1[1]: + assert (range2[1] > range1[0]) + result_range.append((range1[0], range2[1])) + range1 = (range2[1], range1[1]) + indx2 += 1 + if indx2 == len2: + break + range2 = range_list2[indx2] + + elif range2[0] <= range1[0]: + assert (range2[1] >= range1[1]) + result_range.append(range1) + range2 = (range1[1], range2[1]) + indx1 += 1 + if indx1 == len1: + break + range1 = range_list1[indx1] + + elif range2[1] < range1[1]: + assert (range2[0] > range1[0]) + 
result_range.append(range2) + range1 = (range2[1], range1[1]) + indx2 += 1 + if indx2 == len2: + break + range2 = range_list2[indx2] + + elif range2[0] < range1[1]: + assert (range2[1] >= range1[1]) + result_range.append((range2[0], range1[1])) + range2 = (range1[1], range2[1]) + indx1 += 1 + if indx1 == len1: + break + range1 = range_list1[indx1] + + else: + assert (range2[0] >= range1[1]) + indx1 += 1 + if indx1 == len1: + break + range1 = range_list1[indx1] + return result_range + + +def subtract_ranges(range_list1, range_list2, is_sorted=False): + result_range = [] + if not is_sorted: + range_list1 = merge_self_ranges(range_list1) + range_list2 = merge_self_ranges(range_list2) + if len(range_list1) == 0: + return result_range + if len(range_list2) == 0: + return range_list1 + + len1 = len(range_list1) + len2 = len(range_list2) + indx1 = 0 + indx2 = 0 + range1 = range_list1[indx1] + range2 = range_list2[indx2] + + while indx1 < len(range_list1): + if indx2 == len(range_list2): + result_range.append(range1) + indx1 += 1 + if indx1 == len1: + break + range1 = range_list1[indx1] + elif range2[1] <= range1[0]: + indx2 += 1 + if indx2 != len2: + range2 = range_list2[indx2] + elif range2[0] <= range1[0] and range2[1] < range1[1]: + range1 = (range2[1], range1[1]) + indx2 += 1 + if indx2 != len2: + range2 = range_list2[indx2] + elif range2[0] <= range1[0]: + assert (range2[1] >= range1[1]) + range2 = (range1[1], range2[1]) + indx1 += 1 + if indx1 != len1: + range1 = range_list1[indx1] + elif range2[0] < range1[1]: + assert (range2[0] > range1[0]) + result_range.append((range1[0], range2[0])) + range1 = (range2[0], range1[1]) + else: + assert (range2[0] >= range1[1]) + result_range.append(range1) + indx1 += 1 + if indx1 != len1: + range1 = range_list1[indx1] + return result_range -- GitLab From fe1cc8bd43472f6b9eb413a6ae88144517b9bf8a Mon Sep 17 00:00:00 2001 From: Linjie Chen <40840292+linjieccc@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:01:54 +0800 Subject: [PATCH 095/261] [phi] move sigmoid_cross_entopy_with_logits log_loss cumsum auc infershape to phi (#40200) * move infershapes to phi * update code format * update code format --- paddle/fluid/operators/cumsum_op.cc | 20 ++--- paddle/fluid/operators/log_loss_op.cc | 45 ++-------- paddle/fluid/operators/metrics/auc_op.cc | 72 ++-------------- .../sigmoid_cross_entropy_with_logits_op.cc | 50 ++--------- paddle/phi/infermeta/binary.cc | 83 +++++++++++++++++++ paddle/phi/infermeta/binary.h | 14 ++++ paddle/phi/infermeta/multiary.cc | 80 ++++++++++++++++++ paddle/phi/infermeta/multiary.h | 12 +++ paddle/phi/infermeta/unary.cc | 18 ++++ paddle/phi/infermeta/unary.h | 7 ++ 10 files changed, 244 insertions(+), 157 deletions(-) diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc index 7c80917a713..11633fb0b87 100644 --- a/paddle/fluid/operators/cumsum_op.cc +++ b/paddle/fluid/operators/cumsum_op.cc @@ -12,8 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -21,17 +24,6 @@ namespace operators { class CumOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - if (ctx->Attrs().Get("flatten")) { - ctx->SetOutputDim("Out", - phi::make_ddim({phi::product(ctx->GetInputDim("X"))})); - } else { - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - } - - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class CumsumOpMaker : public framework::OpProtoAndCheckerMaker { @@ -87,10 +79,12 @@ class CumsumGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; - +DECLARE_INFER_SHAPE_FUNCTOR(cumsum, CumsumInferShapeFunctor, + PD_INFER_META(phi::CumsumInferMeta)); REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker, ops::CumsumGradMaker, - ops::CumsumGradMaker); + ops::CumsumGradMaker, + CumsumInferShapeFunctor); REGISTER_OP_VERSION(cumsum) .AddCheckpoint( diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc index 2e596ff3e62..883e3597d8a 100644 --- a/paddle/fluid/operators/log_loss_op.cc +++ b/paddle/fluid/operators/log_loss_op.cc @@ -13,7 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -21,43 +24,6 @@ namespace operators { class LogLossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Predicted"), "Input", "Predicted", "LogLoss"); - OP_INOUT_CHECK(ctx->HasInput("Labels"), "Input", "Labels", "LogLoss"); - - auto pred_dims = ctx->GetInputDim("Predicted"); - auto label_dims = ctx->GetInputDim("Labels"); - - if (ctx->IsRuntime() || - (phi::product(pred_dims) > 0 && phi::product(label_dims) > 0)) { - PADDLE_ENFORCE_EQ( - pred_dims, label_dims, - platform::errors::InvalidArgument( - "The dimensions of Input(Predicted) must be equal to the" - "dimensions of Input(Labels), but received dimensions of " - "Input(Predicted)" - "is [%s], received dimensions of Input(Labels) is [%s].", - pred_dims, label_dims)); - } - PADDLE_ENFORCE_EQ(pred_dims.size(), 2, - platform::errors::InvalidArgument( - "The dimensions of Input(Predicted) must be 2," - "But received dimensions of Input(Predicted)" - "is [%d]", - pred_dims.size())); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - pred_dims[1], 1, - platform::errors::InvalidArgument( - "Each row of Input(Predicted) contains a real value, " - "so the 2nd dimension of Input(X) must be 1," - "But got [%d]", - pred_dims[1])); - } - ctx->SetOutputDim("Loss", {pred_dims[0], 1}); - ctx->ShareLoD("Predicted", "Loss"); - } }; template @@ -145,7 +111,10 @@ class LogLossGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(log_loss, LogLossInferShapeFunctor, + 
PD_INFER_META(phi::LogLossInferMeta)); REGISTER_OPERATOR(log_loss, ops::LogLossOp, ops::LogLossOpMaker, ops::LogLossGradMaker, - ops::LogLossGradMaker); + ops::LogLossGradMaker, + LogLossInferShapeFunctor); REGISTER_OPERATOR(log_loss_grad, ops::LogLossGradOp); diff --git a/paddle/fluid/operators/metrics/auc_op.cc b/paddle/fluid/operators/metrics/auc_op.cc index 54ecba08a82..f3ed98c3f4d 100644 --- a/paddle/fluid/operators/metrics/auc_op.cc +++ b/paddle/fluid/operators/metrics/auc_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -21,70 +24,6 @@ class AucOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - protected: - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Predict"), "Input", "Predict", "Auc"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "Auc"); - auto predict_dims = ctx->GetInputDim("Predict"); - auto label_dims = ctx->GetInputDim("Label"); - PADDLE_ENFORCE_GE( - predict_dims.size(), 2, - platform::errors::InvalidArgument( - "The Input(Predict) has not been initialized properly. The " - "shape of Input(Predict) = [%s], the shape size must be " - "greater_equal 2.", - predict_dims)); - auto predict_width = predict_dims[1]; - PADDLE_ENFORCE_NE( - phi::product(predict_dims), 0, - platform::errors::InvalidArgument( - "The Input(Predict) has not been initialized properly. The " - "shape of Input(Predict) = [%s], the shape can not involes 0.", - predict_dims)); - PADDLE_ENFORCE_NE( - phi::product(label_dims), 0, - platform::errors::InvalidArgument( - "The Input(Label) has not been initialized properly. 
The " - "shape of Input(Label) = [%s], the shape can not involes 0.", - label_dims)); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_LE(predict_width, 2, - platform::errors::InvalidArgument( - "Only support binary classification," - "prediction dims[1] should be 1 or 2")); - } - auto predict_height = ctx->GetInputDim("Predict")[0]; - auto label_height = ctx->GetInputDim("Label")[0]; - - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(predict_height, label_height, - platform::errors::InvalidArgument( - "Out and Label should have same height.")); - } - - int num_pred_buckets = ctx->Attrs().Get("num_thresholds") + 1; - int slide_steps = ctx->Attrs().Get("slide_steps"); - - PADDLE_ENFORCE_GE( - num_pred_buckets, 1, - platform::errors::InvalidArgument("num_thresholds must larger than 1")); - PADDLE_ENFORCE_GE(slide_steps, 0, - platform::errors::InvalidArgument( - "slide_steps must be natural number")); - - ctx->SetOutputDim("AUC", {1}); - - if (slide_steps) { - ctx->SetOutputDim("StatPosOut", - {(1 + slide_steps) * num_pred_buckets + 1}); - ctx->SetOutputDim("StatNegOut", - {(1 + slide_steps) * num_pred_buckets + 1}); - } else { - ctx->SetOutputDim("StatPosOut", {1, num_pred_buckets}); - ctx->SetOutputDim("StatNegOut", {1, num_pred_buckets}); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -145,4 +84,7 @@ There are two types of possible curves: } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(auc, ops::AucOp, ops::AucOpMaker); +DECLARE_INFER_SHAPE_FUNCTOR(auc, AucInferShapeFunctor, + PD_INFER_META(phi::AucInferMeta)); +REGISTER_OP_WITHOUT_GRADIENT(auc, ops::AucOp, ops::AucOpMaker, + AucInferShapeFunctor); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index 8e502fc04db..016ff54645b 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -15,7 +15,10 @@ limitations under the License. */ #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -26,46 +29,6 @@ const int kIgnoreIndex = -100; class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", - "SigmoidCrossEntropyWithLogitsOp"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", - "SigmoidCrossEntropyWithLogitsOp"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", - "SigmoidCrossEntropyWithLogitsOp"); - - auto x_dims = ctx->GetInputDim("X"); - auto labels_dims = ctx->GetInputDim("Label"); - - int rank = x_dims.size(); - PADDLE_ENFORCE_EQ(rank, labels_dims.size(), - platform::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same rank." 
- "But received: the rank of Input(X) is [%d], " - "the rank of Input(Label) is [%d].", - rank, labels_dims.size())); - - bool check = true; - if ((!ctx->IsRuntime()) && - (phi::product(x_dims) <= 0 || phi::product(labels_dims) <= 0)) { - check = false; - } - - if (check) { - PADDLE_ENFORCE_EQ( - phi::slice_ddim(x_dims, 0, rank), - phi::slice_ddim(labels_dims, 0, rank), - platform::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same shape " - "except the last dimension. But received: the shape of " - "Input(X) is [%s], the shape of Input(Label) is [%s].", - x_dims, labels_dims)); - } - - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class SigmoidCrossEntropyWithLogitsGradOp @@ -201,12 +164,17 @@ DECLARE_INPLACE_OP_INFERER(SigmoidCrossEntropyWithLogitsGradInplaceInferer, } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR( + sigmoid_cross_entropy_with_logits, + SigmoidCrossEntropyWithLogitsInferShapeFunctor, + PD_INFER_META(phi::SigmoidCrossEntropyWithLogitsInferMeta)); REGISTER_OPERATOR( sigmoid_cross_entropy_with_logits, ops::SigmoidCrossEntropyWithLogitsOp, ops::SigmoidCrossEntropyWithLogitsOpMaker, ops::SigmoidCrossEntropyWithLogitsGradOpMaker, ops::SigmoidCrossEntropyWithLogitsGradOpMaker, - ops::SigmoidCrossEntropyWithLogitsInplaceInferer); + ops::SigmoidCrossEntropyWithLogitsInplaceInferer, + SigmoidCrossEntropyWithLogitsInferShapeFunctor); REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits_grad, ops::SigmoidCrossEntropyWithLogitsGradOp, ops::SigmoidCrossEntropyWithLogitsGradInplaceInferer); diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 55230aa8d05..b17405990fb 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -575,6 +575,48 @@ void GatherTreeMeta(const MetaTensor& ids, out->set_dims(ids_dims); } +void LogLossInferMeta(const MetaTensor& input, + const MetaTensor& label, + float epsilon, + MetaTensor* out, + MetaConfig config) { + auto pred_dims = input.dims(); + auto label_dims = label.dims(); + + if (config.is_runtime || + (phi::product(pred_dims) > 0 && phi::product(label_dims) > 0)) { + PADDLE_ENFORCE_EQ( + pred_dims, + label_dims, + phi::errors::InvalidArgument( + "The dimensions of Input(Predicted) must be equal to the" + "dimensions of Input(Labels), but received dimensions of " + "Input(Predicted)" + "is [%s], received dimensions of Input(Labels) is [%s].", + pred_dims, + label_dims)); + } + PADDLE_ENFORCE_EQ(pred_dims.size(), + 2, + phi::errors::InvalidArgument( + "The dimensions of Input(Predicted) must be 2," + "But received dimensions of Input(Predicted)" + "is [%d]", + pred_dims.size())); + if (config.is_runtime) { + PADDLE_ENFORCE_EQ(pred_dims[1], + 1, + phi::errors::InvalidArgument( + "Each row of Input(Predicted) contains a real value, " + "so the 2nd dimension of Input(X) must be 1," + "But got [%d]", + pred_dims[1])); + } + out->set_dims({pred_dims[0], 1}); + out->set_dtype(input.dtype()); + out->share_lod(input); +} + void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out) { auto dim_x = x.dims(); auto dim_vec = vec.dims(); @@ -605,4 +647,45 @@ void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out) { out->share_lod(x); } +void SigmoidCrossEntropyWithLogitsInferMeta(const MetaTensor& x, + const MetaTensor& label, + bool normalize, + int ignore_index, + MetaTensor* out, + MetaConfig config) { + auto x_dims = x.dims(); + auto labels_dims = label.dims(); + int rank = 
x_dims.size(); + PADDLE_ENFORCE_EQ(rank, + labels_dims.size(), + phi::errors::InvalidArgument( + "Input(X) and Input(Label) shall have the same rank." + "But received: the rank of Input(X) is [%d], " + "the rank of Input(Label) is [%d].", + rank, + labels_dims.size())); + + bool check = true; + if ((!config.is_runtime) && + (phi::product(x_dims) <= 0 || phi::product(labels_dims) <= 0)) { + check = false; + } + + if (check) { + PADDLE_ENFORCE_EQ( + phi::slice_ddim(x_dims, 0, rank), + phi::slice_ddim(labels_dims, 0, rank), + phi::errors::InvalidArgument( + "Input(X) and Input(Label) shall have the same shape " + "except the last dimension. But received: the shape of " + "Input(X) is [%s], the shape of Input(Label) is [%s].", + x_dims, + labels_dims)); + } + + out->set_dims(x_dims); + out->set_dtype(x.dtype()); + out->share_lod(x); +} + } // namespace phi diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 106c22f7548..934ed688bf2 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -89,6 +89,7 @@ void BincountInferMeta(const MetaTensor& x, const paddle::optional weights, int minlength, MetaTensor* out); + void DistInferMeta(const MetaTensor& x, const MetaTensor& y, float p, @@ -102,6 +103,19 @@ void GatherTreeMeta(const MetaTensor& ids, const MetaTensor& parents, MetaTensor* out); +void LogLossInferMeta(const MetaTensor& input, + const MetaTensor& label, + float epsilon, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out); +void SigmoidCrossEntropyWithLogitsInferMeta(const MetaTensor& x, + const MetaTensor& label, + bool normalize, + int ignore_index, + MetaTensor* out, + MetaConfig config = MetaConfig()); + } // namespace phi diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index a21f077c09f..acce40713b8 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -28,6 +28,86 @@ std::vector GetMetaTensorsDim(const std::vector& tensors) { return dims; } +void AucInferMeta(const MetaTensor& input, + const MetaTensor& label, + const MetaTensor& stat_pos, + const MetaTensor& stat_neg, + const std::string& curve, + int num_thresholds, + int slide_steps, + MetaTensor* auc, + MetaTensor* stat_pos_out, + MetaTensor* stat_neg_out, + MetaConfig config) { + auto predict_dims = input.dims(); + auto label_dims = label.dims(); + PADDLE_ENFORCE_GE( + predict_dims.size(), + 2, + phi::errors::InvalidArgument( + "The Input(Predict) has not been initialized properly. The " + "shape of Input(Predict) = [%s], the shape size must be " + "greater_equal 2.", + predict_dims)); + auto predict_width = predict_dims[1]; + PADDLE_ENFORCE_NE( + phi::product(predict_dims), + 0, + phi::errors::InvalidArgument( + "The Input(Predict) has not been initialized properly. The " + "shape of Input(Predict) = [%s], the shape can not involes 0.", + predict_dims)); + PADDLE_ENFORCE_NE( + phi::product(label_dims), + 0, + phi::errors::InvalidArgument( + "The Input(Label) has not been initialized properly. 
The " + "shape of Input(Label) = [%s], the shape can not involes 0.", + label_dims)); + if (config.is_runtime) { + PADDLE_ENFORCE_LE( + predict_width, + 2, + phi::errors::InvalidArgument("Only support binary classification," + "prediction dims[1] should be 1 or 2")); + } + auto predict_height = input.dims()[0]; + auto label_height = label.dims()[0]; + + if (config.is_runtime) { + PADDLE_ENFORCE_EQ( + predict_height, + label_height, + phi::errors::InvalidArgument("Out and Label should have same height.")); + } + + int num_pred_buckets = num_thresholds + 1; + + PADDLE_ENFORCE_GE( + num_pred_buckets, + 1, + phi::errors::InvalidArgument("num_thresholds must larger than 1")); + PADDLE_ENFORCE_GE( + slide_steps, + 0, + phi::errors::InvalidArgument("slide_steps must be natural number")); + + auc->set_dims({1}); + auc->set_dtype(DataType::INT64); + + if (slide_steps) { + stat_pos_out->set_dims({(1 + slide_steps) * num_pred_buckets + 1}); + stat_pos_out->set_dtype(DataType::INT64); + stat_neg_out->set_dims({(1 + slide_steps) * num_pred_buckets + 1}); + stat_neg_out->set_dtype(DataType::INT64); + } else { + stat_pos_out->set_dims({1, num_pred_buckets}); + stat_pos_out->set_dtype(DataType::INT64); + stat_neg_out->set_dims({1, num_pred_buckets}); + stat_neg_out->set_dtype(DataType::INT64); + } +} + void AdamaxInferMeta(const MetaTensor& param, const MetaTensor& grad, const MetaTensor& learning_rate, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 8cb6f70481d..26bdc62302f 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -20,6 +20,18 @@ namespace phi { std::vector GetMetaTensorsDim(const std::vector& tensors); +void AucInferMeta(const MetaTensor& input, + const MetaTensor& label, + const MetaTensor& stat_pos, + const MetaTensor& stat_neg, + const std::string& curve, + int num_thresholds, + int slide_steps, + MetaTensor* auc, + MetaTensor* stat_pos_out, + MetaTensor* stat_neg_out, + MetaConfig config = MetaConfig()); + void BilinearTensorProductInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index b9eb5196b1e..4053cfbc362 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -156,6 +156,24 @@ void CreateLikeInferMeta(const MetaTensor& x, DataType dtype, MetaTensor* out) { out->set_layout(x.layout()); } +void CumsumInferMeta(const MetaTensor& x, + int axis, + bool flatten, + bool exclusive, + bool reverse, + MetaTensor* out) { + auto x_dims = x.dims(); + if (flatten) { + out->set_dims(phi::make_ddim({phi::product(x_dims)})); + out->set_dtype(x.dtype()); + } else { + out->set_dims(x_dims); + out->set_dtype(x.dtype()); + } + + out->share_lod(x); +} + void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out) { PADDLE_ENFORCE_EQ( product(x.dims()), diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 37b17f6e3d1..a679ef8c11a 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -63,6 +63,13 @@ void CopyToInferMeta(const MetaTensor& x, void CreateLikeInferMeta(const MetaTensor& x, DataType dtype, MetaTensor* out); +void CumsumInferMeta(const MetaTensor& x, + int axis, + bool flatten, + bool exclusive, + bool reverse, + MetaTensor* out); + void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out); void InferMetaFromVecValue(const MetaTensor& x, -- GitLab From 0c33c47ee752befb54b6a16f6608cb3c411506d9 Mon Sep 17 00:00:00 2001 
From: ronnywang <524019753@qq.com> Date: Tue, 8 Mar 2022 10:21:48 +0800 Subject: [PATCH 096/261] fix paddle.median torch diff (#40118) --- python/paddle/tensor/stat.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index 468aa460486..dd0da03e4fd 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -321,6 +321,9 @@ def median(x, axis=None, keepdim=False, name=None): paddle.slice( tensor_topk, axes=[axis], starts=[kth], ends=[kth + 1]), dtype=dtype) + out_tensor = out_tensor + paddle.sum( + paddle.cast( + paddle.isnan(x), dtype=dtype) * x, axis=axis, keepdim=True) if not keepdim or is_flatten: if not is_flatten: newshape = x.shape[:axis] + x.shape[axis + 1:] -- GitLab From 81d4142b97e3758f7a526066dd0414ec8b306098 Mon Sep 17 00:00:00 2001 From: furnace <34057289+windstamp@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:23:01 +0800 Subject: [PATCH 097/261] [Phi] move InferShape for truncated_gaussian_random and gaussian_random (#40191) * [Phi] move InferShape for truncated_gaussian_random and gaussian_random * [Phi] delete useless codes --- paddle/fluid/operators/gaussian_random_op.cc | 47 +++++-------------- .../operators/truncated_gaussian_random_op.cc | 36 +++++--------- paddle/phi/infermeta/nullary.cc | 25 ++++++++++ paddle/phi/infermeta/nullary.h | 14 ++++++ .../cpu/truncated_gaussian_random_kernel.cc | 2 +- .../gpu/truncated_gaussian_random_kernel.cu | 3 +- .../truncated_gaussian_random_kernel.h | 5 +- 7 files changed, 70 insertions(+), 62 deletions(-) diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index 6b559885c56..66eecc13d04 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -15,12 +15,14 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/fill_constant_op.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/phi/infermeta/nullary.h" namespace paddle { namespace operators { @@ -54,38 +56,6 @@ class GaussianRandomOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GaussianRandom"); - - auto shape = ctx->Attrs().Get>("shape"); - std::vector temp; - temp.reserve(shape.size()); - for (auto dim : shape) { - temp.push_back(static_cast(dim)); - } - if (shape.empty() && ctx->HasInput("ShapeTensor")) { - auto shape_dims = ctx->GetInputDim("ShapeTensor"); - int num_ele = 1; - for (int i = 0; i < shape_dims.size(); ++i) { - num_ele *= shape_dims[i]; - } - auto vec_dims = std::vector(num_ele, -1); - ctx->SetOutputDim("Out", phi::make_ddim(vec_dims)); - - return; - } - if (!ctx->HasInput("ShapeTensor") && !ctx->HasInputs("ShapeTensorList")) { - PADDLE_ENFORCE_GT( - shape.size(), 0UL, - platform::errors::InvalidArgument( - "Attribute(shape) of GaussianRandomOp must be set " - "and shape.size() > 0, but reveived shape.size() is %d", - shape.size())); - } - - ctx->SetOutputDim("Out", phi::make_ddim(temp)); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -171,11 +141,20 @@ Used to initialize tensors with gaussian random generator. } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp, - ops::GaussianRandomOpMaker); + +DECLARE_INFER_SHAPE_FUNCTOR(gaussian_random, GaussianRandomInferShapeFunctor, + PD_INFER_META(phi::GaussianRandomInferMeta)); + +REGISTER_OPERATOR( + gaussian_random, ops::GaussianRandomOp, ops::GaussianRandomOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + GaussianRandomInferShapeFunctor); + REGISTER_OP_CPU_KERNEL(gaussian_random_batch_size_like, ops::CPUGaussianRandomBatchSizeLikeKernel, ops::CPUGaussianRandomBatchSizeLikeKernel); + REGISTER_OP_VERSION(gaussian_random) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cc b/paddle/fluid/operators/truncated_gaussian_random_op.cc index 6eb7f922dfd..dc5a66dce16 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc @@ -17,8 +17,10 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/truncated_gaussian_random_op.h" +#include "paddle/phi/infermeta/nullary.h" namespace paddle { namespace operators { @@ -27,26 +29,6 @@ class TruncatedGaussianRandomOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), true, - platform::errors::NotFound( - "Output(Out) of TruncatedGaussianRandomOp should not be null.")); - auto shape = ctx->Attrs().Get>("shape"); - std::vector out_dim; - out_dim.reserve(shape.size()); - for (auto dim : shape) { - out_dim.push_back(static_cast(dim)); - } - PADDLE_ENFORCE_GT( - shape.size(), 0UL, - platform::errors::InvalidArgument( - "the input shape of TruncatedGaussianRandomOp must be set, " - "But the rank of shape we received is %d", - shape.size())); - ctx->SetOutputDim("Out", phi::make_ddim(out_dim)); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -99,6 +81,14 @@ Used to initialize tensors with truncated gaussian random generator. } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(truncated_gaussian_random, - ops::TruncatedGaussianRandomOp, - ops::TruncatedGaussianRandomOpMaker); + +DECLARE_INFER_SHAPE_FUNCTOR( + truncated_gaussian_random, TruncatedGaussianRandomInferShapeFunctor, + PD_INFER_META(phi::TruncatedGaussianRandomInferMeta)); + +REGISTER_OPERATOR( + truncated_gaussian_random, ops::TruncatedGaussianRandomOp, + ops::TruncatedGaussianRandomOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + TruncatedGaussianRandomInferShapeFunctor); diff --git a/paddle/phi/infermeta/nullary.cc b/paddle/phi/infermeta/nullary.cc index 0c48c9d0c7e..506d3fd14ea 100644 --- a/paddle/phi/infermeta/nullary.cc +++ b/paddle/phi/infermeta/nullary.cc @@ -40,4 +40,29 @@ void EyeInferMeta(int64_t num_rows, out->set_dims({num_rows, num_columns}); out->set_dtype(dtype); } + +void TruncatedGaussianRandomInferMeta(const std::vector& shape, + float mean, + float std, + int seed, + DataType dtype, + MetaTensor* out) { + auto out_dims = phi::make_ddim(shape); + out->set_dims(out_dims); + out->set_dtype(dtype); + out->set_layout(DataLayout::NCHW); +} + +void GaussianRandomInferMeta(const ScalarArray& shape, + float mean, + float std, + int seed, + DataType dtype, + MetaTensor* out) { + auto out_dims = phi::make_ddim(shape.GetData()); + out->set_dims(out_dims); + out->set_dtype(dtype); + out->set_layout(DataLayout::NCHW); +} + } // namespace phi diff --git a/paddle/phi/infermeta/nullary.h b/paddle/phi/infermeta/nullary.h index 40d6ea595c0..bd0567486e4 100644 --- a/paddle/phi/infermeta/nullary.h +++ b/paddle/phi/infermeta/nullary.h @@ -40,4 +40,18 @@ void EyeInferMeta(int64_t num_rows, DataType dtype, MetaTensor* out); +void TruncatedGaussianRandomInferMeta(const std::vector& shape, + float mean, + float std, + int seed, + DataType dtype, + MetaTensor* out); + +void GaussianRandomInferMeta(const ScalarArray& shape, + float mean, + float std, + int seed, + DataType dtype, + MetaTensor* out); + } // namespace phi diff --git a/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc b/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc index ebc032ef545..4247e597ace 100644 
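The two meta functions added to nullary.cc above consume only attributes, so they can be exercised without any input tensors. A rough usage sketch, assumed rather than taken from the patch (the MetaTensor-over-DenseTensor construction is the usual phi test idiom):

#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/phi/infermeta/nullary.h"

void SketchTruncatedGaussianMeta() {
  // Build an output MetaTensor over an empty DenseTensor and let the
  // meta function fill in dims / dtype / layout from the attributes.
  phi::DenseTensor out_tensor;
  phi::MetaTensor meta_out(&out_tensor);
  phi::TruncatedGaussianRandomInferMeta({2, 3}, /*mean=*/0.0f, /*std=*/1.0f,
                                        /*seed=*/0, phi::DataType::FLOAT32,
                                        &meta_out);
  // meta_out now reports dims {2, 3}, FLOAT32 and NCHW layout; the
  // deleted fluid InferShape only set the dims and left dtype/layout
  // implicit.
}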
--- a/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc +++ b/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc @@ -27,7 +27,7 @@ namespace phi { template void TruncatedGaussianRandomKernel(const Context& dev_ctx, - const ScalarArray& shape, + const std::vector& shape, float mean, float std, int seed, diff --git a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu index 12c1bf791e1..f27b32ca7b8 100644 --- a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu +++ b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu @@ -25,7 +25,6 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/fluid/framework/generator.h" -// #include "paddle/phi/core/generator.h" namespace phi { @@ -87,7 +86,7 @@ struct TruncatedNormalOffset { template void TruncatedGaussianRandomKernel(const Context& dev_ctx, - const ScalarArray& shape, + const std::vector& shape, float mean, float std, int seed, diff --git a/paddle/phi/kernels/truncated_gaussian_random_kernel.h b/paddle/phi/kernels/truncated_gaussian_random_kernel.h index 0370cc431fe..f8547ced419 100644 --- a/paddle/phi/kernels/truncated_gaussian_random_kernel.h +++ b/paddle/phi/kernels/truncated_gaussian_random_kernel.h @@ -20,6 +20,7 @@ #include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/device_context.h" +#include "paddle/phi/infermeta/nullary.h" namespace phi { @@ -157,8 +158,8 @@ struct TruncatedNormal { }; template -void TruncatedGaussianRandomKernel(const Context& ctx, - const ScalarArray& shape, +void TruncatedGaussianRandomKernel(const Context& dev_ctx, + const std::vector& shape, float mean, float std, int seed, -- GitLab From 413a743e7f5e0436db60b7d1718cc0353488062a Mon Sep 17 00:00:00 2001 From: tanzhipeng <51696454+tiancaitzp@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:29:32 +0800 Subject: [PATCH 098/261] remove unnecessary constant fill in sequence conv test=kunlun. 
(#40126) --- .../fluid/operators/sequence_ops/sequence_conv_op_xpu.cc | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc index 6c33ff52044..23c6a0133e1 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc @@ -184,9 +184,6 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { col_data, paddle::platform::errors::Fatal("XPU memory is not enough")); if (in_g || filter_g) { - int r = xpu::constant(xpu_context, col_data, col_numel, T(0)); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); - bool trans_a = false; bool trans_b = true; int m = out_g->dims()[0]; @@ -208,7 +205,7 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { const T* data_b = filter->data(); T* data_c = col_data; - r = xpu::fc_fusion( + int r = xpu::fc_fusion( xpu_context, data_a, data_b, data_c, m, n, k, trans_a, trans_b, nullptr, nullptr, nullptr, lda, ldb, ldc, alpha, beta, nullptr, xpu::Activation_t::LINEAR); @@ -222,7 +219,6 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { in_g->mutable_data(context.GetPlace()); in_g->set_lod(in->lod()); - xpu::constant(xpu_context, in_g->data(), in_g->numel(), T(0)); int r = xpu::sequence_context_projection_grad( xpu_context, in_g->data(), col_data, nullptr, lodx, sequence_width, @@ -232,8 +228,6 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { if (filter_g) { filter_g->mutable_data(context.GetPlace()); - xpu::constant(xpu_context, filter_g->data(), filter_g->numel(), - T(0)); int r = xpu::sequence_context_projection( xpu_context, in->data(), col_data, nullptr, lodx, sequence_width, -- GitLab From 6bd2d2b1cb5fa2e350adb4c9b291b48054257be5 Mon Sep 17 00:00:00 2001 From: wawltor Date: Tue, 8 Mar 2022 10:29:59 +0800 Subject: [PATCH 099/261] [Phi] move the graph_send_recv op to the phi (#40092) * [Phi] transfer old kernel to pten kernel for the graph_send_recv op * update the code for the define of graph_send_recv * fix the gradient problem for graph_send_recv * fix the compile problem * update the enfore message for the windows * update the code for the compiler * update compiler problem for the windows * udpate the code for windows * fix some format problem --- paddle/fluid/operators/graph_send_recv_op.cc | 12 +- paddle/fluid/operators/graph_send_recv_op.cu | 419 ------------------ paddle/fluid/operators/graph_send_recv_op.h | 291 ------------ .../phi/kernels/cpu/graph_send_recv_funcs.h | 80 ++++ .../cpu/graph_send_recv_grad_kernel.cc | 172 +++++++ .../phi/kernels/cpu/graph_send_recv_kernel.cc | 153 +++++++ .../phi/kernels/gpu/graph_send_recv_funcs.h | 171 +++++++ .../gpu/graph_send_recv_grad_kernel.cu | 148 +++++++ .../phi/kernels/gpu/graph_send_recv_kernel.cu | 179 ++++++++ .../phi/kernels/graph_send_recv_grad_kernel.h | 33 ++ paddle/phi/kernels/graph_send_recv_kernel.h | 31 ++ paddle/phi/ops/compat/graph_send_recv_sig.cc | 31 ++ 12 files changed, 999 insertions(+), 721 deletions(-) delete mode 100644 paddle/fluid/operators/graph_send_recv_op.cu delete mode 100644 paddle/fluid/operators/graph_send_recv_op.h create mode 100644 paddle/phi/kernels/cpu/graph_send_recv_funcs.h create mode 100644 paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/graph_send_recv_kernel.cc create mode 100644 paddle/phi/kernels/gpu/graph_send_recv_funcs.h create mode 100644 
paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/graph_send_recv_kernel.cu create mode 100644 paddle/phi/kernels/graph_send_recv_grad_kernel.h create mode 100644 paddle/phi/kernels/graph_send_recv_kernel.h create mode 100644 paddle/phi/ops/compat/graph_send_recv_sig.cc diff --git a/paddle/fluid/operators/graph_send_recv_op.cc b/paddle/fluid/operators/graph_send_recv_op.cc index 6af8388d9eb..b759345eda5 100644 --- a/paddle/fluid/operators/graph_send_recv_op.cc +++ b/paddle/fluid/operators/graph_send_recv_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/graph_send_recv_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -171,13 +171,3 @@ REGISTER_OPERATOR(graph_send_recv, ops::GraphSendRecvOP, ops::GraphSendRecvGradOpMaker, ops::GraphSendRecvGradOpMaker); REGISTER_OPERATOR(graph_send_recv_grad, ops::GraphSendRecvGradOp); -REGISTER_OP_CPU_KERNEL(graph_send_recv, ops::GraphSendRecvOpKernel, - ops::GraphSendRecvOpKernel, - ops::GraphSendRecvOpKernel, - ops::GraphSendRecvOpKernel); - -REGISTER_OP_CPU_KERNEL(graph_send_recv_grad, - ops::GraphSendRecvGradOpKernel, - ops::GraphSendRecvGradOpKernel, - ops::GraphSendRecvGradOpKernel, - ops::GraphSendRecvGradOpKernel); diff --git a/paddle/fluid/operators/graph_send_recv_op.cu b/paddle/fluid/operators/graph_send_recv_op.cu deleted file mode 100644 index f43d31814ac..00000000000 --- a/paddle/fluid/operators/graph_send_recv_op.cu +++ /dev/null @@ -1,419 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/graph_send_recv_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -struct GraphSendRecvSumCUDAFunctor { - DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, - const IndexT& out_i) { - paddle::platform::CudaAtomicAdd(output + out_i, *(params + in_i)); - } -}; - -template -struct GraphSendRecvMaxCUDAFunctor { - DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, - const IndexT& out_i) { - paddle::platform::CudaAtomicMax(output + out_i, *(params + in_i)); - } -}; - -template -struct GraphSendRecvMinCUDAFunctor { - DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, - const IndexT& out_i) { - paddle::platform::CudaAtomicMin(output + out_i, *(params + in_i)); - } -}; - -template -__global__ void GraphSendRecvCUDAKernel(const T* params, - const IndexT* src_indices, - const IndexT* dst_indices, T* output, - size_t index_size, size_t slice_size, - Functor functor) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; - IndexT src_i = src_indices[indices_i]; - IndexT dst_i = dst_indices[indices_i]; - int64_t in_i = src_i * slice_size + slice_i; - int64_t out_i = dst_i * slice_size + slice_i; - functor(params, output, in_i, out_i); - } -} - -// For max -template -__global__ void InputResetMaxCUDAKernel(T* output, size_t input_size, - size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { - if (*(output + i) == std::numeric_limits::min()) { - *(output + i) = 0; - } - } -} - -// For min -template -__global__ void InputResetMinCUDAKernel(T* output, size_t input_size, - size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { - if (*(output + i) == std::numeric_limits::max()) { - *(output + i) = 0; - } - } -} - -// Get dst_count -template -__global__ void ComputeCountCUDAKernel(int* count, const IndexT* dst_indices, - size_t index_size) { - CUDA_KERNEL_LOOP_TYPE(i, index_size, int64_t) { - IndexT dst_i = dst_indices[i]; - paddle::platform::CudaAtomicAdd(count + dst_i, 1); - } -} - -// For forward mean -template -__global__ void ManipulateMeanCUDAKernel(T* output, int* count, - size_t input_size, size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { - int64_t c_index = i / slice_size; - if (*(count + c_index) > 1) { - *(output + i) = *(output + i) / *(count + c_index); - } - } -} - -// For backward mean -template -__global__ void ManipulateMeanGradCUDAKernel( - const T* params, const IndexT* src_indices, const IndexT* dst_indices, - T* output, size_t index_size, size_t slice_size, const int* dst_count) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; - IndexT src_i = src_indices[indices_i]; - IndexT dst_i = dst_indices[indices_i]; - int64_t in_i = src_i * slice_size + slice_i; - int64_t out_i = dst_i * slice_size + slice_i; - paddle::platform::CudaAtomicAdd(output + out_i, - *(params + in_i) / dst_count[src_i]); - } -} - -// For backward min and max -template -__global__ void 
ManipulateMinMaxGradCUDAKernel( - const T* params, const IndexT* src_indices, const IndexT* dst_indices, - T* output, size_t index_size, size_t slice_size, const T* ptr_input, - const T* ptr_output) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; - IndexT src_i = src_indices[indices_i]; - IndexT dst_i = dst_indices[indices_i]; - int64_t in_i = src_i * slice_size + slice_i; - int64_t out_i = dst_i * slice_size + slice_i; - paddle::platform::CudaAtomicAdd( - output + out_i, - *(params + in_i) * (*(ptr_input + out_i) == *(ptr_output + in_i))); - } -} - -template -void GraphSendRecvOpCUDAKernelLaunchHelper( - const framework::ExecutionContext& ctx, const Tensor& src_index, - const Tensor& dst_index) { - auto* X = ctx.Input("X"); - auto* Y = ctx.Output("Out"); - std::string pool_type = ctx.Attr("pool_type"); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) { - memset_size *= src_dims[i]; - } - const size_t& memset_bytes = memset_size * sizeof(T); - if (pool_type == "SUM" || pool_type == "MEAN") { -#ifdef PADDLE_WITH_HIP - hipMemset(p_output, 0, memset_bytes); -#else - cudaMemset(p_output, 0, memset_bytes); -#endif - } else if (pool_type == "MAX") { - thrust::device_ptr p_output_ptr(p_output); - thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, - std::numeric_limits::min()); - } else if (pool_type == "MIN") { - thrust::device_ptr p_output_ptr(p_output); - thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, - std::numeric_limits::max()); - } - - if (index_size == 0) return; - - int64_t slice_size = 1; - for (int i = 1; i < src_dims.size(); ++i) { - slice_size *= src_dims[i]; - } - const T* p_src = X->data(); - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index.data(); - -#ifdef PADDLE_WITH_HIP - int block = 256; -#else - int block = 1024; -#endif - int64_t n = slice_size * index_size; - const auto& dev_ctx = ctx.cuda_device_context(); - int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; - int64_t grid_tmp = (n + block - 1) / block; - int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; - int64_t input_size = src_dims[0]; - if (pool_type == "SUM") { - GraphSendRecvSumCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - } else if (pool_type == "MAX") { - GraphSendRecvMaxCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - int64_t grid_max_tmp = (input_size * slice_size + block - 1) / block; - int64_t grid_max = - grid_max_tmp < max_grid_dimx ? grid_max_tmp : max_grid_dimx; - InputResetMaxCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, input_size, slice_size); - } else if (pool_type == "MIN") { - GraphSendRecvMinCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - int64_t grid_min_tmp = (input_size * slice_size + block - 1) / block; - int64_t grid_min = - grid_min_tmp < max_grid_dimx ? 
grid_min_tmp : max_grid_dimx; - InputResetMinCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, input_size, slice_size); - } else if (pool_type == "MEAN") { - GraphSendRecvSumCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - auto* dst_count = ctx.Output("Dst_count"); - int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); - -#ifdef PADDLE_WITH_HIP - hipMemset(p_dst_count, 0, input_size * sizeof(int)); -#else - cudaMemset(p_dst_count, 0, input_size * sizeof(int)); -#endif - - int64_t grid_count = (index_size + block - 1) / block; - ComputeCountCUDAKernel< - T, IndexT><<( - ctx.device_context()) - .stream()>>>(p_dst_count, d_index, index_size); - - int64_t grid_mean_tmp = (input_size * slice_size + block - 1) / block; - int64_t grid_mean = - grid_mean_tmp < max_grid_dimx ? grid_mean_tmp : max_grid_dimx; - ManipulateMeanCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, p_dst_count, input_size, slice_size); - } -} - -template -void GraphSendRecvGradOpCUDAKernelLaunchHelper( - const framework::ExecutionContext& ctx, const Tensor& src_index, - const Tensor& dst_index) { - auto* X = ctx.Input(framework::GradVarName("Out")); - auto* Y = ctx.Output(framework::GradVarName("X")); - std::string pool_type = ctx.Attr("pool_type"); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) { - memset_size *= src_dims[i]; - } - const size_t& memset_bytes = memset_size * sizeof(T); - -#ifdef PADDLE_WITH_HIP - hipMemset(p_output, 0, memset_bytes); -#else - cudaMemset(p_output, 0, memset_bytes); -#endif - - if (index_size == 0) return; - - int64_t slice_size = 1; - for (int i = 1; i < src_dims.size(); ++i) { - slice_size *= src_dims[i]; - } - const T* p_src = X->data(); - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index.data(); - -#ifdef PADDLE_WITH_HIP - int block = 256; -#else - int block = 1024; -#endif - int64_t n = slice_size * index_size; - const auto& dev_ctx = ctx.cuda_device_context(); - int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; - int64_t grid_tmp = (n + block - 1) / block; - int64_t grid = grid_tmp < max_grid_dimx ? 
grid_tmp : max_grid_dimx; - int64_t input_size = src_dims[0]; - if (pool_type == "SUM") { - GraphSendRecvSumCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Input("Dst_count"); - const int* s_count = dst_count->data(); - ManipulateMeanGradCUDAKernel<<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, s_count); - } else if (pool_type == "MAX" || pool_type == "MIN") { - auto* input = ctx.Input("X"); - auto* output = ctx.Input("Out"); - const T* ptr_input = input->data(); - const T* ptr_output = output->data(); - ManipulateMinMaxGradCUDAKernel<<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, ptr_input, - ptr_output); - } -} - -template -class GraphSendRecvOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Src_index"); - auto* dst_index = ctx.Input("Dst_index"); - auto index_type = framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index dtype, expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -template -class GraphSendRecvGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Dst_index"); - auto* dst_index = ctx.Input("Src_index"); - auto index_type = framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvGradOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvGradOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index dtype, expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -} // namespace operators -} // namespace paddle - -using CUDA = paddle::platform::CUDADeviceContext; -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL(graph_send_recv, - ops::GraphSendRecvOpCUDAKernel, - ops::GraphSendRecvOpCUDAKernel, - ops::GraphSendRecvOpCUDAKernel, - ops::GraphSendRecvOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL(graph_send_recv_grad, - ops::GraphSendRecvGradOpCUDAKernel, - ops::GraphSendRecvGradOpCUDAKernel, - ops::GraphSendRecvGradOpCUDAKernel, - ops::GraphSendRecvGradOpCUDAKernel); diff --git a/paddle/fluid/operators/graph_send_recv_op.h b/paddle/fluid/operators/graph_send_recv_op.h deleted file mode 100644 index 8d8111e0ee8..00000000000 --- a/paddle/fluid/operators/graph_send_recv_op.h +++ /dev/null @@ -1,291 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -struct GraphSendRecvSumFunctor { - void operator()(const bool& first_flag, const Tensor& src_slice, - Tensor* dst_slice) { - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); - eigen_dst += eigen_src; - } -}; - -template -struct GraphSendRecvMinFunctor { - void operator()(const bool& first_flag, const Tensor& src_slice, - Tensor* dst_slice) { - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); - if (first_flag) { - eigen_dst += eigen_src; - } else { - eigen_dst = eigen_dst.cwiseMin(eigen_src); - } - } -}; - -template -struct GraphSendRecvMaxFunctor { - void operator()(const int& first_flag, const Tensor& src_slice, - Tensor* dst_slice) { - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); - if (first_flag) { - eigen_dst += eigen_src; - } else { - eigen_dst = eigen_dst.cwiseMax(eigen_src); - } - } -}; - -template -void elementwise_inner_operation(const Tensor& src, Tensor* dst, - const IndexT& src_index, - const IndexT& dst_index, - const bool& first_flag, Functor functor) { - auto src_slice = src.Slice(src_index, src_index + 1); - auto dst_slice = dst->Slice(dst_index, dst_index + 1); - - functor(first_flag, src_slice, &dst_slice); -} - -template -void graph_send_recv_cpu_for_loop(const int& input_size, const int& index_size, - const IndexT* s_index, const IndexT* d_index, - const Tensor& src, Tensor* dst, - const std::string& pool_type, - int* dst_count = nullptr) { - Functor functor; - if (pool_type == "SUM") { - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - elementwise_inner_operation(src, dst, src_idx, - dst_idx, false, functor); - } - } else if (pool_type == "MEAN") { - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - elementwise_inner_operation(src, dst, src_idx, - dst_idx, false, functor); - } - for (int i = 0; i < index_size; ++i) { - IndexT dst_idx = d_index[i]; - *(dst_count + dst_idx) += 1; - } - for (int i = 0; i < input_size; ++i) { - if (*(dst_count + i) == 0) continue; - auto dst_slice = dst->Slice(i, i + 1); - auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst = eigen_dst / static_cast(*(dst_count + i)); - } - } else if (pool_type == "MIN" || pool_type == "MAX") { - std::set existed_dst; - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - bool in_set = existed_dst.find(dst_idx) != existed_dst.end(); - if (!in_set) { - elementwise_inner_operation(src, dst, src_idx, - dst_idx, true, functor); - existed_dst.emplace(dst_idx); - } else { - 
elementwise_inner_operation( - src, dst, src_idx, dst_idx, false, functor); - } - } - } -} - -template -void graph_send_recv_cpu_for_loop_grad( - const int& input_size, const int& index_size, const IndexT* s_index, - const IndexT* d_index, const Tensor& src, Tensor* dst, - const std::string& pool_type, const int* dst_count = nullptr, - const Tensor* input = nullptr, const Tensor* output = nullptr) { - if (pool_type == "SUM") { - Functor functor; - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - elementwise_inner_operation(src, dst, src_idx, - dst_idx, false, functor); - } - } else if (pool_type == "MEAN") { - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - auto src_slice = src.Slice(src_idx, src_idx + 1); - auto dst_slice = dst->Slice(dst_idx, dst_idx + 1); - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst += (eigen_src / static_cast(dst_count[src_idx])); - } - } else if (pool_type == "MIN" || pool_type == "MAX") { - for (int i = 0; i < index_size; ++i) { - const IndexT& forward_src_idx = d_index[i]; - const IndexT& forward_dst_idx = s_index[i]; - auto input_slice = input->Slice(forward_src_idx, forward_src_idx + 1); - auto output_slice = output->Slice(forward_dst_idx, forward_dst_idx + 1); - auto eigen_input = framework::EigenVector::Flatten(input_slice); - auto eigen_output = framework::EigenVector::Flatten(output_slice); - - auto src_slice = src.Slice(forward_dst_idx, forward_dst_idx + 1); - auto dst_slice = dst->Slice(forward_src_idx, forward_src_idx + 1); - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst += eigen_src * (eigen_output == eigen_input); - } - } -} - -template -void GraphSendRecvOpKernelLaunchHelper(const framework::ExecutionContext& ctx, - const Tensor& src_index) { - auto* X = ctx.Input("X"); - auto* dst_index = ctx.Input("Dst_index"); - auto* Y = ctx.Output("Out"); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; - const size_t& memset_bytes = memset_size * sizeof(T); - memset(p_output, 0, memset_bytes); - - if (index_size == 0) return; - - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index->data(); - const std::string& pool_type = ctx.Attr("pool_type"); - if (pool_type == "SUM") { - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MIN") { - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MAX") { - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Output("Dst_count"); - int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); - memset(p_dst_count, 0, src_dims[0] * sizeof(int)); - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, - p_dst_count); - } -} - -template -void GraphSendRecvGradOpKernelLaunchHelper( - const framework::ExecutionContext& ctx, const Tensor& src_index) { - auto* X = ctx.Input(framework::GradVarName("Out")); - auto* dst_index = 
ctx.Input("Src_index"); - auto* Y = ctx.Output(framework::GradVarName("X")); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; - const size_t& memset_bytes = memset_size * sizeof(T); - memset(p_output, 0, memset_bytes); - - if (index_size == 0) return; - - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index->data(); - - const std::string& pool_type = ctx.Attr("pool_type"); - if (pool_type == "SUM") { - graph_send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Input("Dst_count"); - const int* s_count = dst_count->data(); - // Functor not used here. - graph_send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, s_count); - } else if (pool_type == "MIN" || pool_type == "MAX") { - const auto* input = ctx.Input("X"); - const auto* output = ctx.Input("Out"); - // Functor not used here. - graph_send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, nullptr, - input, output); - } -} - -template -class GraphSendRecvOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Src_index"); - auto index_type = framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvOpKernelLaunchHelper(ctx, *src_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvOpKernelLaunchHelper(ctx, - *src_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index type, Expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -template -class GraphSendRecvGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Dst_index"); - auto index_type = framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvGradOpKernelLaunchHelper(ctx, - *src_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvGradOpKernelLaunchHelper( - ctx, *src_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index type, Expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/kernels/cpu/graph_send_recv_funcs.h b/paddle/phi/kernels/cpu/graph_send_recv_funcs.h new file mode 100644 index 00000000000..df6d9c87be0 --- /dev/null +++ b/paddle/phi/kernels/cpu/graph_send_recv_funcs.h @@ -0,0 +1,80 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +struct GraphSendRecvSumFunctor { + void operator()(const bool& first_flag, + const DenseTensor& src_slice, + DenseTensor* dst_slice) { + auto eigen_src = phi::EigenVector::Flatten(src_slice); + auto eigen_dst = phi::EigenVector::Flatten(*dst_slice); + eigen_dst += eigen_src; + } +}; + +template +struct GraphSendRecvMinFunctor { + void operator()(const bool& first_flag, + const DenseTensor& src_slice, + DenseTensor* dst_slice) { + auto eigen_src = phi::EigenVector::Flatten(src_slice); + auto eigen_dst = phi::EigenVector::Flatten(*dst_slice); + if (first_flag) { + eigen_dst += eigen_src; + } else { + eigen_dst = eigen_dst.cwiseMin(eigen_src); + } + } +}; + +template +struct GraphSendRecvMaxFunctor { + void operator()(const int& first_flag, + const DenseTensor& src_slice, + DenseTensor* dst_slice) { + auto eigen_src = phi::EigenVector::Flatten(src_slice); + auto eigen_dst = phi::EigenVector::Flatten(*dst_slice); + if (first_flag) { + eigen_dst += eigen_src; + } else { + eigen_dst = eigen_dst.cwiseMax(eigen_src); + } + } +}; + +template +void ElementwiseInnerOperation(const DenseTensor& src, + DenseTensor* dst, + const IndexT& src_index, + const IndexT& dst_index, + const bool& first_flag, + Functor functor) { + auto src_slice = src.Slice(src_index, src_index + 1); + auto dst_slice = dst->Slice(dst_index, dst_index + 1); + + functor(first_flag, src_slice, &dst_slice); +} + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc b/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc new file mode 100644 index 00000000000..8538461b1b8 --- /dev/null +++ b/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc @@ -0,0 +1,172 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/graph_send_recv_grad_kernel.h" +#include "paddle/phi/kernels/cpu/graph_send_recv_funcs.h" + +#include +#include + +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void GraphSendRecvCpuGradLoop(const int& input_size, + const int& index_size, + const IndexT* s_index, + const IndexT* d_index, + const DenseTensor& src, + DenseTensor* dst, + const std::string& pool_type, + const int* dst_count = nullptr, + const DenseTensor* input = nullptr, + const DenseTensor* output = nullptr) { + if (pool_type == "SUM") { + Functor functor; + for (int i = 0; i < index_size; ++i) { + const IndexT& src_idx = s_index[i]; + const IndexT& dst_idx = d_index[i]; + ElementwiseInnerOperation( + src, dst, src_idx, dst_idx, false, functor); + } + } else if (pool_type == "MEAN") { + for (int i = 0; i < index_size; ++i) { + const IndexT& src_idx = s_index[i]; + const IndexT& dst_idx = d_index[i]; + auto src_slice = src.Slice(src_idx, src_idx + 1); + auto dst_slice = dst->Slice(dst_idx, dst_idx + 1); + auto eigen_src = phi::EigenVector::Flatten(src_slice); + auto eigen_dst = phi::EigenVector::Flatten(dst_slice); + eigen_dst += (eigen_src / static_cast(dst_count[src_idx])); + } + } else if (pool_type == "MIN" || pool_type == "MAX") { + for (int i = 0; i < index_size; ++i) { + const IndexT& forward_src_idx = d_index[i]; + const IndexT& forward_dst_idx = s_index[i]; + auto input_slice = input->Slice(forward_src_idx, forward_src_idx + 1); + auto output_slice = output->Slice(forward_dst_idx, forward_dst_idx + 1); + auto eigen_input = phi::EigenVector::Flatten(input_slice); + auto eigen_output = phi::EigenVector::Flatten(output_slice); + + auto src_slice = src.Slice(forward_dst_idx, forward_dst_idx + 1); + auto dst_slice = dst->Slice(forward_src_idx, forward_src_idx + 1); + auto eigen_src = phi::EigenVector::Flatten(src_slice); + auto eigen_dst = phi::EigenVector::Flatten(dst_slice); + eigen_dst += eigen_src * (eigen_output == eigen_input); + } + } +} + +template +void GraphSendRecvGradOpKernelLaunchHelper( + const Context& ctx, + const DenseTensor& out_grad, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& pool_type, + DenseTensor* x_grad, + const DenseTensor* dst_count = nullptr, + const DenseTensor* x = nullptr, + const DenseTensor* out = nullptr) { + const int& index_size = dst_index.dims()[0]; + + ctx.template Alloc(x_grad); + T* p_output = x_grad->data(); + const auto& src_dims = out_grad.dims(); + int64_t memset_size = 1; + for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; + const size_t& memset_bytes = memset_size * sizeof(T); + memset(p_output, 0, memset_bytes); + + if (index_size == 0) return; + + const IndexT* s_index = src_index.data(); + const IndexT* d_index = dst_index.data(); + + if (pool_type == "SUM") { + GraphSendRecvCpuGradLoop>( + src_dims[0], index_size, d_index, s_index, out_grad, x_grad, pool_type); + } else if (pool_type == "MEAN") { + const int* s_count = dst_count->data(); + // Functor not used here. + GraphSendRecvCpuGradLoop>(src_dims[0], + index_size, + d_index, + s_index, + out_grad, + x_grad, + pool_type, + s_count); + } else if (pool_type == "MIN" || pool_type == "MAX") { + // Functor not used here. 
+ GraphSendRecvCpuGradLoop>(src_dims[0], + index_size, + d_index, + s_index, + out_grad, + x_grad, + pool_type, + nullptr, + x, + out); + } +} + +template +void GraphSendRecvGradKernel(const Context& ctx, + const DenseTensor& out_grad, + paddle::optional x, + paddle::optional out, + const DenseTensor& src_index, + const DenseTensor& dst_index, + paddle::optional dst_count, + const std::string& pool_type, + DenseTensor* x_grad) { + auto index_type = src_index.dtype(); + if (index_type == phi::DataType::INT32) { + GraphSendRecvGradOpKernelLaunchHelper( + ctx, + out_grad, + src_index, + dst_index, + pool_type, + x_grad, + dst_count.get_ptr(), + x.get_ptr(), + out.get_ptr()); + } else if (index_type == phi::DataType::INT64) { + GraphSendRecvGradOpKernelLaunchHelper( + ctx, + out_grad, + src_index, + dst_index, + pool_type, + x_grad, + dst_count.get_ptr(), + x.get_ptr(), + out.get_ptr()); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(graph_send_recv_grad, + CPU, + ALL_LAYOUT, + phi::GraphSendRecvGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc b/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc new file mode 100644 index 00000000000..fecbd4b1d7a --- /dev/null +++ b/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc @@ -0,0 +1,153 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/graph_send_recv_kernel.h" +#include "paddle/phi/kernels/cpu/graph_send_recv_funcs.h" + +#include +#include +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void GraphSendRecvCpuLoop(const int& input_size, + const int& index_size, + const IndexT* s_index, + const IndexT* d_index, + const DenseTensor& src, + DenseTensor* dst, + const std::string& pool_type, + int* dst_count = nullptr) { + Functor functor; + if (pool_type == "SUM") { + for (int i = 0; i < index_size; ++i) { + const IndexT& src_idx = s_index[i]; + const IndexT& dst_idx = d_index[i]; + ElementwiseInnerOperation( + src, dst, src_idx, dst_idx, false, functor); + } + } else if (pool_type == "MEAN") { + for (int i = 0; i < index_size; ++i) { + const IndexT& src_idx = s_index[i]; + const IndexT& dst_idx = d_index[i]; + ElementwiseInnerOperation( + src, dst, src_idx, dst_idx, false, functor); + } + for (int i = 0; i < index_size; ++i) { + IndexT dst_idx = d_index[i]; + *(dst_count + dst_idx) += 1; + } + for (int i = 0; i < input_size; ++i) { + if (*(dst_count + i) == 0) continue; + auto dst_slice = dst->Slice(i, i + 1); + auto eigen_dst = phi::EigenVector::Flatten(dst_slice); + eigen_dst = eigen_dst / static_cast(*(dst_count + i)); + } + } else if (pool_type == "MIN" || pool_type == "MAX") { + std::set existed_dst; + for (int i = 0; i < index_size; ++i) { + const IndexT& src_idx = s_index[i]; + const IndexT& dst_idx = d_index[i]; + bool in_set = existed_dst.find(dst_idx) != existed_dst.end(); + if (!in_set) { + ElementwiseInnerOperation( + src, dst, src_idx, dst_idx, true, functor); + existed_dst.emplace(dst_idx); + } else { + ElementwiseInnerOperation( + src, dst, src_idx, dst_idx, false, functor); + } + } + } +} + +template +void GraphSendRecvOpKernelLaunchHelper(const Context& ctx, + const DenseTensor& x, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& pool_type, + DenseTensor* out, + DenseTensor* dst_count = nullptr) { + const int& index_size = src_index.dims()[0]; + + ctx.template Alloc(out); + T* p_output = out->data(); + const auto& src_dims = x.dims(); + int64_t memset_size = 1; + for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; + const size_t& memset_bytes = memset_size * sizeof(T); + memset(p_output, 0, memset_bytes); + + if (index_size == 0) return; + + const IndexT* s_index = src_index.data(); + const IndexT* d_index = dst_index.data(); + if (pool_type == "SUM") { + GraphSendRecvCpuLoop>( + src_dims[0], index_size, s_index, d_index, x, out, pool_type); + } else if (pool_type == "MIN") { + GraphSendRecvCpuLoop>( + src_dims[0], index_size, s_index, d_index, x, out, pool_type); + } else if (pool_type == "MAX") { + GraphSendRecvCpuLoop>( + src_dims[0], index_size, s_index, d_index, x, out, pool_type); + } else if (pool_type == "MEAN") { + ctx.template Alloc(dst_count); + int* p_dst_count = dst_count->data(); + memset(p_dst_count, 0, src_dims[0] * sizeof(int)); + GraphSendRecvCpuLoop>(src_dims[0], + index_size, + s_index, + d_index, + x, + out, + pool_type, + p_dst_count); + } +} + +template +void GraphSendRecvKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& pool_type, + DenseTensor* out, + DenseTensor* dst_count) { + auto index_type = src_index.dtype(); + if (index_type == phi::DataType::INT32) { + 
GraphSendRecvOpKernelLaunchHelper( + ctx, x, src_index, dst_index, pool_type, out, dst_count); + } else if (index_type == phi::DataType::INT64) { + GraphSendRecvOpKernelLaunchHelper( + ctx, x, src_index, dst_index, pool_type, out, dst_count); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(graph_send_recv, + CPU, + ALL_LAYOUT, + phi::GraphSendRecvKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/graph_send_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_recv_funcs.h new file mode 100644 index 00000000000..1eab521170b --- /dev/null +++ b/paddle/phi/kernels/gpu/graph_send_recv_funcs.h @@ -0,0 +1,171 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/phi/kernels/graph_send_recv_kernel.h" + +#include +#include +#include +#include + +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/hostdevice.h" + +namespace phi { + +template +struct GraphSendRecvSumCUDAFunctor { + DEVICE inline void operator()(const T* params, + T* output, + const IndexT& in_i, + const IndexT& out_i) { + paddle::platform::CudaAtomicAdd(output + out_i, *(params + in_i)); + } +}; + +template +struct GraphSendRecvMaxCUDAFunctor { + DEVICE inline void operator()(const T* params, + T* output, + const IndexT& in_i, + const IndexT& out_i) { + paddle::platform::CudaAtomicMax(output + out_i, *(params + in_i)); + } +}; + +template +struct GraphSendRecvMinCUDAFunctor { + DEVICE inline void operator()(const T* params, + T* output, + const IndexT& in_i, + const IndexT& out_i) { + paddle::platform::CudaAtomicMin(output + out_i, *(params + in_i)); + } +}; + +template +__global__ void GraphSendRecvCUDAKernel(const T* params, + const IndexT* src_indices, + const IndexT* dst_indices, + T* output, + size_t index_size, + size_t slice_size, + Functor functor) { + CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { + int64_t indices_i = i / slice_size; + int64_t slice_i = i - indices_i * slice_size; + IndexT src_i = src_indices[indices_i]; + IndexT dst_i = dst_indices[indices_i]; + int64_t in_i = src_i * slice_size + slice_i; + int64_t out_i = dst_i * slice_size + slice_i; + functor(params, output, in_i, out_i); + } +} + +// For max +template +__global__ void InputResetMaxCUDAKernel(T* output, + size_t input_size, + size_t slice_size) { + CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { + if (*(output + i) == std::numeric_limits::min()) { + *(output + i) = 0; + } + } +} + +// For min +template +__global__ void InputResetMinCUDAKernel(T* output, + size_t input_size, + size_t slice_size) { + CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { + if (*(output + i) == std::numeric_limits::max()) { + *(output + i) = 0; + } + } +} + +// Get dst_count +template +__global__ void ComputeCountCUDAKernel(int32_t* count, + const IndexT* dst_indices, + size_t index_size) { + CUDA_KERNEL_LOOP_TYPE(i, 
index_size, int64_t) { + IndexT dst_i = dst_indices[i]; + paddle::platform::CudaAtomicAdd(count + dst_i, 1); + } +} + +// For forward mean +template +__global__ void ManipulateMeanCUDAKernel(T* output, + int32_t* count, + size_t input_size, + size_t slice_size) { + CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { + int64_t c_index = i / slice_size; + if (*(count + c_index) > 1) { + *(output + i) = *(output + i) / *(count + c_index); + } + } +} + +// For backward mean +template +__global__ void ManipulateMeanGradCUDAKernel(const T* params, + const IndexT* src_indices, + const IndexT* dst_indices, + T* output, + size_t index_size, + size_t slice_size, + const int32_t* dst_count) { + CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { + int64_t indices_i = i / slice_size; + int64_t slice_i = i - indices_i * slice_size; + IndexT src_i = src_indices[indices_i]; + IndexT dst_i = dst_indices[indices_i]; + int64_t in_i = src_i * slice_size + slice_i; + int64_t out_i = dst_i * slice_size + slice_i; + paddle::platform::CudaAtomicAdd(output + out_i, + *(params + in_i) / dst_count[src_i]); + } +} + +// For backward min and max +template +__global__ void ManipulateMinMaxGradCUDAKernel(const T* params, + const IndexT* src_indices, + const IndexT* dst_indices, + T* output, + size_t index_size, + size_t slice_size, + const T* ptr_input, + const T* ptr_output) { + CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { + int64_t indices_i = i / slice_size; + int64_t slice_i = i - indices_i * slice_size; + IndexT src_i = src_indices[indices_i]; + IndexT dst_i = dst_indices[indices_i]; + int64_t in_i = src_i * slice_size + slice_i; + int64_t out_i = dst_i * slice_size + slice_i; + paddle::platform::CudaAtomicAdd( + output + out_i, + *(params + in_i) * (*(ptr_input + out_i) == *(ptr_output + in_i))); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu new file mode 100644 index 00000000000..75692966b46 --- /dev/null +++ b/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu @@ -0,0 +1,148 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
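The ManipulateMeanGradCUDAKernel and ManipulateMinMaxGradCUDAKernel defined above encode the backward rules that the gradient kernel in the file below launches: for MEAN, each upstream gradient is divided by the receive count of its destination row before being scattered back to the source row; for MIN and MAX, the gradient flows back only to the source elements that actually produced the forward result. The standalone sketch below is illustrative only (not part of the patch, hypothetical names, single-element rows for brevity):

#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative sketch only: backward rules for MEAN and MIN/MAX pooling,
// written for single-element rows to keep the indexing obvious.
int main() {
  // Forward setup: message i sends x[src[i]] to out[dst[i]].
  std::vector<float> x = {1.f, 5.f, 3.f};
  std::vector<int64_t> src = {0, 1, 2};
  std::vector<int64_t> dst = {1, 1, 1};
  std::vector<int> dst_count = {0, 3, 0};         // row 1 received three messages
  std::vector<float> out_max = {0.f, 5.f, 0.f};   // forward MAX result

  std::vector<float> out_grad = {0.f, 1.f, 0.f};  // upstream gradient
  std::vector<float> x_grad_mean(x.size(), 0.f);
  std::vector<float> x_grad_max(x.size(), 0.f);

  for (size_t i = 0; i < src.size(); ++i) {
    const int64_t s = src[i], d = dst[i];
    // MEAN backward: split the gradient evenly among the contributors.
    x_grad_mean[s] += out_grad[d] / dst_count[d];
    // MAX (and MIN) backward: only the winning contributor gets the gradient.
    x_grad_max[s] += out_grad[d] * (x[s] == out_max[d] ? 1.f : 0.f);
  }

  std::cout << x_grad_mean[0] << " " << x_grad_max[1] << std::endl;  // 0.333333 1
  return 0;
}

SUM backward is not shown because it is just the forward scatter-add applied to out_grad with the index roles exchanged, which is why the launcher below reuses GraphSendRecvSumCUDAFunctor with d_index and s_index swapped.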
+ +#include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h" +#include "paddle/phi/kernels/graph_send_recv_grad_kernel.h" + +#include +#include + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void GraphSendRecvGradOpCUDAKernelLaunchHelper( + const Context& ctx, + const DenseTensor& out_grad, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& pool_type, + DenseTensor* x_grad, + const DenseTensor* dst_count = nullptr, + const DenseTensor* x = nullptr, + const DenseTensor* out = nullptr) { + const int& index_size = dst_index.dims()[0]; + + ctx.template Alloc(x_grad); + T* p_output = x_grad->data(); + + const auto& src_dims = out_grad.dims(); + int64_t memset_size = 1; + for (int i = 0; i < src_dims.size(); ++i) { + memset_size *= src_dims[i]; + } + const size_t& memset_bytes = memset_size * sizeof(T); + +#ifdef PADDLE_WITH_HIP + hipMemset(p_output, 0, memset_bytes); +#else + cudaMemset(p_output, 0, memset_bytes); +#endif + + if (index_size == 0) return; + + int64_t slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) { + slice_size *= src_dims[i]; + } + const T* p_src = out_grad.data(); + const IndexT* s_index = src_index.data(); + const IndexT* d_index = dst_index.data(); + +#ifdef PADDLE_WITH_HIP + int block = 256; +#else + int block = 1024; +#endif + int64_t n = slice_size * index_size; + int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; + int64_t grid_tmp = (n + block - 1) / block; + int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; + int64_t input_size = src_dims[0]; + if (pool_type == "SUM") { + GraphSendRecvSumCUDAFunctor functor; + GraphSendRecvCUDAKernel< + T, + IndexT, + GraphSendRecvSumCUDAFunctor><<>>( + p_src, d_index, s_index, p_output, index_size, slice_size, functor); + } else if (pool_type == "MEAN") { + const int32_t* s_count = dst_count->data(); + ManipulateMeanGradCUDAKernel<<>>( + p_src, d_index, s_index, p_output, index_size, slice_size, s_count); + } else if (pool_type == "MAX" || pool_type == "MIN") { + const T* ptr_input = x->data(); + const T* ptr_output = out->data(); + ManipulateMinMaxGradCUDAKernel<<>>( + p_src, + d_index, + s_index, + p_output, + index_size, + slice_size, + ptr_input, + ptr_output); + } +} + +template +void GraphSendRecvGradKernel(const Context& ctx, + const DenseTensor& out_grad, + paddle::optional x, + paddle::optional out, + const DenseTensor& src_index, + const DenseTensor& dst_index, + paddle::optional dst_count, + const std::string& pool_type, + DenseTensor* x_grad) { + auto index_type = src_index.dtype(); + if (index_type == phi::DataType::INT32) { + GraphSendRecvGradOpCUDAKernelLaunchHelper( + ctx, + out_grad, + src_index, + dst_index, + pool_type, + x_grad, + dst_count.get_ptr(), + x.get_ptr(), + out.get_ptr()); + } else if (index_type == phi::DataType::INT64) { + GraphSendRecvGradOpCUDAKernelLaunchHelper( + ctx, + out_grad, + src_index, + dst_index, + pool_type, + x_grad, + dst_count.get_ptr(), + x.get_ptr(), + out.get_ptr()); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(graph_send_recv_grad, + GPU, + ALL_LAYOUT, + phi::GraphSendRecvGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu new file mode 100644 index 00000000000..fab306f831a --- /dev/null +++ b/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu @@ -0,0 +1,179 @@ +// Copyright 
(c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h" +#include "paddle/phi/kernels/graph_send_recv_kernel.h" + +#include +#include +#include +#include + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx, + const DenseTensor& x, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& pool_type, + DenseTensor* out, + DenseTensor* dst_count = nullptr) { + const int& index_size = src_index.dims()[0]; + ctx.template Alloc(out); + T* p_output = out->data(); + const auto& src_dims = x.dims(); + int64_t memset_size = 1; + for (int i = 0; i < src_dims.size(); ++i) { + memset_size *= src_dims[i]; + } + const size_t& memset_bytes = memset_size * sizeof(T); + if (pool_type == "SUM" || pool_type == "MEAN") { +#ifdef PADDLE_WITH_HIP + hipMemset(p_output, 0, memset_bytes); +#else + cudaMemset(p_output, 0, memset_bytes); +#endif + } else if (pool_type == "MAX") { + thrust::device_ptr p_output_ptr(p_output); + thrust::fill(thrust::device, + p_output_ptr, + p_output_ptr + memset_size, + std::numeric_limits::min()); + } else if (pool_type == "MIN") { + thrust::device_ptr p_output_ptr(p_output); + thrust::fill(thrust::device, + p_output_ptr, + p_output_ptr + memset_size, + std::numeric_limits::max()); + } + + if (index_size == 0) return; + + int64_t slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) { + slice_size *= src_dims[i]; + } + const T* p_src = x.data(); + const IndexT* s_index = src_index.data(); + const IndexT* d_index = dst_index.data(); + +#ifdef PADDLE_WITH_HIP + int block = 256; +#else + int block = 1024; +#endif + int64_t n = slice_size * index_size; + int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; + int64_t grid_tmp = (n + block - 1) / block; + int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; + int64_t input_size = src_dims[0]; + if (pool_type == "SUM") { + GraphSendRecvSumCUDAFunctor functor; + GraphSendRecvCUDAKernel< + T, + IndexT, + GraphSendRecvSumCUDAFunctor><<>>( + p_src, s_index, d_index, p_output, index_size, slice_size, functor); + } else if (pool_type == "MAX") { + GraphSendRecvMaxCUDAFunctor functor; + GraphSendRecvCUDAKernel< + T, + IndexT, + GraphSendRecvMaxCUDAFunctor><<>>( + p_src, s_index, d_index, p_output, index_size, slice_size, functor); + + int64_t grid_max_tmp = (input_size * slice_size + block - 1) / block; + int64_t grid_max = + grid_max_tmp < max_grid_dimx ? 
grid_max_tmp : max_grid_dimx; + InputResetMaxCUDAKernel<<>>( + p_output, input_size, slice_size); + } else if (pool_type == "MIN") { + GraphSendRecvMinCUDAFunctor functor; + GraphSendRecvCUDAKernel< + T, + IndexT, + GraphSendRecvMinCUDAFunctor><<>>( + p_src, s_index, d_index, p_output, index_size, slice_size, functor); + + int64_t grid_min_tmp = (input_size * slice_size + block - 1) / block; + int64_t grid_min = + grid_min_tmp < max_grid_dimx ? grid_min_tmp : max_grid_dimx; + InputResetMinCUDAKernel<<>>( + p_output, input_size, slice_size); + } else if (pool_type == "MEAN") { + GraphSendRecvSumCUDAFunctor functor; + GraphSendRecvCUDAKernel< + T, + IndexT, + GraphSendRecvSumCUDAFunctor><<>>( + p_src, s_index, d_index, p_output, index_size, slice_size, functor); + + ctx.template Alloc(dst_count); + int32_t* p_dst_count = dst_count->data(); + +#ifdef PADDLE_WITH_HIP + hipMemset(p_dst_count, 0, input_size * sizeof(int)); +#else + cudaMemset(p_dst_count, 0, input_size * sizeof(int)); +#endif + + int64_t grid_count = (index_size + block - 1) / block; + ComputeCountCUDAKernel<<>>( + p_dst_count, d_index, index_size); + + int64_t grid_mean_tmp = (input_size * slice_size + block - 1) / block; + int64_t grid_mean = + grid_mean_tmp < max_grid_dimx ? grid_mean_tmp : max_grid_dimx; + ManipulateMeanCUDAKernel<<>>( + p_output, p_dst_count, input_size, slice_size); + } +} + +template +void GraphSendRecvKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& pool_type, + DenseTensor* out, + DenseTensor* dst_count) { + auto index_type = src_index.dtype(); + if (index_type == phi::DataType::INT32) { + GraphSendRecvOpCUDAKernelLaunchHelper( + ctx, x, src_index, dst_index, pool_type, out, dst_count); + } else if (index_type == phi::DataType::INT64) { + GraphSendRecvOpCUDAKernelLaunchHelper( + ctx, x, src_index, dst_index, pool_type, out, dst_count); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(graph_send_recv, + GPU, + ALL_LAYOUT, + phi::GraphSendRecvKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/graph_send_recv_grad_kernel.h b/paddle/phi/kernels/graph_send_recv_grad_kernel.h new file mode 100644 index 00000000000..d163e6e278a --- /dev/null +++ b/paddle/phi/kernels/graph_send_recv_grad_kernel.h @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
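One detail of the GPU forward kernel above: MAX and MIN reductions are done with atomics, so the output is pre-filled with std::numeric_limits<T>::min() (or ::max() for MIN) as a sentinel, and the InputReset kernels afterwards zero any row that still holds the sentinel, i.e. received no message. The standalone sketch below mirrors that scheme for the MAX case; it is illustrative only and not part of the patch:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <limits>
#include <vector>

// Illustrative sketch only: MAX pooling via a sentinel fill followed by a
// reset of untouched rows, mirroring the fill + InputResetMaxCUDAKernel pair.
int main() {
  std::vector<float> x = {1.f, 5.f, 3.f};
  std::vector<int64_t> src = {0, 1, 2};
  std::vector<int64_t> dst = {1, 1, 1};  // rows 0 and 2 of the output get no message

  const float sentinel = std::numeric_limits<float>::min();
  std::vector<float> out(x.size(), sentinel);

  // On the GPU this is an atomicMax per message; serially it is a plain max.
  for (size_t i = 0; i < src.size(); ++i)
    out[dst[i]] = std::max(out[dst[i]], x[src[i]]);

  // Reset rows that never received a message, as the patch does.
  for (float& v : out)
    if (v == sentinel) v = 0.f;

  std::cout << out[0] << " " << out[1] << " " << out[2] << std::endl;  // 0 5 0
  return 0;
}

Note that for floating-point types std::numeric_limits<T>::min() is the smallest positive normal value rather than the most negative one, so the reset comparison has to use exactly the same sentinel as the fill, which both the patch and this sketch do.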
+ +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void GraphSendRecvGradKernel(const Context& ctx, + const DenseTensor& out_grad, + paddle::optional x, + paddle::optional out, + const DenseTensor& src_index, + const DenseTensor& dst_index, + paddle::optional dst_count, + const std::string& pool_type, + DenseTensor* x_grad); +} // namespace phi diff --git a/paddle/phi/kernels/graph_send_recv_kernel.h b/paddle/phi/kernels/graph_send_recv_kernel.h new file mode 100644 index 00000000000..95dbdc4443a --- /dev/null +++ b/paddle/phi/kernels/graph_send_recv_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void GraphSendRecvKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& pool_type, + DenseTensor* out, + DenseTensor* dst_count); + +} // namespace phi diff --git a/paddle/phi/ops/compat/graph_send_recv_sig.cc b/paddle/phi/ops/compat/graph_send_recv_sig.cc new file mode 100644 index 00000000000..dacb8b25a89 --- /dev/null +++ b/paddle/phi/ops/compat/graph_send_recv_sig.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature GraphSendRecvGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "graph_send_recv_grad", + {GradVarName("Out"), "X", "Out", "Src_index", "Dst_index", "Dst_count"}, + {"pool_type"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(graph_send_recv_grad, + phi::GraphSendRecvGradOpArgumentMapping); -- GitLab From c722ee690dd75389bf000cd5435f5f4519c4b7a2 Mon Sep 17 00:00:00 2001 From: maxhuiy <1508399706@qq.com> Date: Tue, 8 Mar 2022 10:35:27 +0800 Subject: [PATCH 100/261] [MLU] add fleet init api and collective api pytest for mlu (#40010) * [MLU] add fleet init api and collective api pytest for mlu * fix no value for argument 'data_type' in method call --- python/paddle/distributed/collective.py | 4 + python/paddle/distributed/parallel.py | 14 +- python/paddle/fluid/dygraph/parallel.py | 3 + .../fluid/tests/unittests/mlu/CMakeLists.txt | 12 +- .../tests/unittests/mlu/c_comm_init_op_mlu.py | 71 ++++++ .../unittests/mlu/collective_allreduce_api.py | 54 +++++ .../unittests/mlu/collective_broadcast_api.py | 54 +++++ .../unittests/mlu/test_c_comm_init_op_mlu.sh | 21 ++ .../mlu/test_collective_allreduce_api_mlu.py | 43 ++++ .../mlu/test_collective_api_base_mlu.py | 223 ++++++++++++++++++ .../mlu/test_collective_broadcast_api_mlu.py | 43 ++++ 11 files changed, 535 insertions(+), 7 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/mlu/c_comm_init_op_mlu.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/collective_allreduce_api.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/collective_broadcast_api.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_c_comm_init_op_mlu.sh create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_api_mlu.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_collective_api_base_mlu.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_collective_broadcast_api_mlu.py diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 8042aced6bb..bf6556d21e9 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -267,6 +267,10 @@ def new_group(ranks=None, backend=None): place = core.NPUPlace(genv.device_id) core.HCCLParallelContext(strategy, place).init_with_ring_id(ring_id) + elif core.is_compiled_with_mlu(): + place = core.MLUPlace(genv.device_id) + core.CNCLParallelContext(strategy, + place).init_with_ring_id(ring_id) else: assert False, ("no cuda device found") else: diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 177e19194a5..16ed528b64f 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -58,9 +58,9 @@ def _start_kv_server(port, http_server_d, size): def _is_cpuonly(backend): check_backend(backend) - if backend in ['auto', 'nccl', 'bkcl', 'hccl', 'heter'] and ( + if backend in ['auto', 'nccl', 'bkcl', 'hccl', 'heter', 'cncl'] and ( core.is_compiled_with_cuda() or core.is_compiled_with_xpu() or - core.is_compiled_with_npu()): + core.is_compiled_with_npu() or core.is_compiled_with_mlu()): # passes 'auto' and can use cuda or xpu, use the default logics. so return False return False @@ -152,7 +152,8 @@ def init_parallel_env(): is_cpu_only = _is_cpuonly(backend) # 1. 
gpu xpu check, must be gpu or xpu, if not (is_cpu_only or core.is_compiled_with_cuda() or - core.is_compiled_with_xpu() or core.is_compiled_with_npu()): + core.is_compiled_with_xpu() or core.is_compiled_with_npu() or + core.is_compiled_with_mlu()): raise NotImplementedError( "If you want to use CPU-only version, please use 'gloo' as backend") @@ -162,6 +163,8 @@ def init_parallel_env(): _check_var_exists('FLAGS_selected_xpus') elif not is_cpu_only and core.is_compiled_with_npu(): _check_var_exists('FLAGS_selected_npus') + elif not is_cpu_only and core.is_compiled_with_mlu(): + _check_var_exists('FLAGS_selected_mlus') _check_var_exists("PADDLE_TRAINER_ID") _check_var_exists("PADDLE_CURRENT_ENDPOINT") @@ -213,6 +216,8 @@ def init_parallel_env(): place = core.XPUPlace(parallel_env.device_id) elif core.is_compiled_with_npu(): place = core.NPUPlace(parallel_env.device_id) + elif core.is_compiled_with_mlu(): + place = core.MLUPlace(parallel_env.device_id) _set_expected_place(place) # init nccl or hccl or bkcl or heter context @@ -231,6 +236,9 @@ def init_parallel_env(): elif core.is_compiled_with_npu(): parallel_helper._set_parallel_ctx( core.HCCLParallelContext(strategy, place)) + elif core.is_compiled_with_mlu(): + parallel_helper._set_parallel_ctx( + core.CNCLParallelContext(strategy, place)) if backend != "heter": other_endpoints = strategy.trainer_endpoints[:] diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 0049f387b70..652916491ee 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -128,6 +128,9 @@ class ParallelEnv(object): elif core.is_compiled_with_npu(): selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",") self._device_id = int(selected_npus[0]) + elif core.is_compiled_with_mlu(): + selected_mlus = os.getenv("FLAGS_selected_mlus", "0").split(",") + self._device_id = int(selected_mlus[0]) self._trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", "").split(",") diff --git a/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt index c17790bd320..17f5509bdb9 100644 --- a/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt @@ -13,13 +13,17 @@ if (WITH_MLU) endforeach(TEST_OP) if(WITH_CNCL) - foreach(TEST_OP ${TEST_DIST_OPS}) + foreach(TEST_OP ${TEST_DIST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) bash_test_modules(test_launch_async_mlu START_BASH test_launch_async_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) - bash_test_modules(test_launch_cloud_mlu START_BASH test_launch_cloud_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) - bash_test_modules(test_launch_nproc_mlu START_BASH test_launch_nproc_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + bash_test_modules(test_launch_cloud_mlu START_BASH test_launch_cloud_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + bash_test_modules(test_launch_nproc_mlu START_BASH test_launch_nproc_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + bash_test_modules(test_c_comm_init_op_mlu START_BASH test_c_comm_init_op_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) set_tests_properties(test_collective_broadcast PROPERTIES TIMEOUT 120) - set_tests_properties(test_collective_allreduce PROPERTIES TIMEOUT 120) + set_tests_properties(test_collective_allreduce PROPERTIES TIMEOUT 120) + set_tests_properties(test_collective_broadcast_api_mlu PROPERTIES TIMEOUT 120) + 
set_tests_properties(test_collective_allreduce_api_mlu PROPERTIES TIMEOUT 120) + set_tests_properties(test_c_comm_init_op_mlu PROPERTIES TIMEOUT 120) endif(WITH_CNCL) endif() diff --git a/python/paddle/fluid/tests/unittests/mlu/c_comm_init_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/c_comm_init_op_mlu.py new file mode 100644 index 00000000000..e91f28e3b1d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/c_comm_init_op_mlu.py @@ -0,0 +1,71 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import os +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.distributed.fleet.base.private_helper_function import wait_server_ready +import paddle + +paddle.enable_static() + + +class TestCCommInitOp(unittest.TestCase): + def setUp(self): + self.endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS").split(',') + self.current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") + self.nranks = len(self.endpoints) + self.rank = self.endpoints.index(self.current_endpoint) + self.mlu_id = int(os.getenv("FLAGS_selected_mlus")) + self.place = fluid.MLUPlace(self.mlu_id) + self.exe = fluid.Executor(self.place) + self.endpoints.remove(self.current_endpoint) + self.other_endpoints = self.endpoints + if self.rank == 0: + wait_server_ready(self.other_endpoints) + + def test_specifying_devices(self): + program = fluid.Program() + block = program.global_block() + cncl_id_var = block.create_var( + name=fluid.unique_name.generate('cncl_id'), + persistable=True, + type=fluid.core.VarDesc.VarType.RAW) + block.append_op( + type='c_gen_cncl_id', + inputs={}, + outputs={'Out': cncl_id_var}, + attrs={ + 'rank': self.rank, + 'endpoint': self.current_endpoint, + 'other_endpoints': self.other_endpoints + }) + block.append_op( + type='c_comm_init', + inputs={'X': cncl_id_var}, + outputs={}, + attrs={ + 'nranks': self.nranks, + 'rank': self.rank, + 'ring_id': 0, + 'device_id': self.mlu_id + }) + self.exe.run(program) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/mlu/collective_allreduce_api.py new file mode 100644 index 00000000000..ebe4e71d22f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/collective_allreduce_api.py @@ -0,0 +1,54 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import argparse +import os +import sys +import signal +import time +import socket +from contextlib import closing +from six import string_types +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +import paddle.fluid.unique_name as nameGen +from paddle.fluid import core +import unittest +from multiprocessing import Process +import paddle.fluid.layers as layers +from functools import reduce +from test_collective_api_base_mlu import TestCollectiveAPIRunnerBase, runtime_main + +paddle.enable_static() + + +class TestCollectiveAllreduceAPI(TestCollectiveAPIRunnerBase): + def __init__(self): + self.global_ring_id = 0 + + def get_model(self, main_prog, startup_program, rank): + with fluid.program_guard(main_prog, startup_program): + tindata = layers.data( + name="tindata", shape=[10, 1000], dtype='float32') + paddle.distributed.all_reduce(tindata) + return [tindata] + + +if __name__ == "__main__": + runtime_main(TestCollectiveAllreduceAPI, "allreduce") diff --git a/python/paddle/fluid/tests/unittests/mlu/collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/mlu/collective_broadcast_api.py new file mode 100644 index 00000000000..2002909ea2e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/collective_broadcast_api.py @@ -0,0 +1,54 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import argparse +import os +import sys +import signal +import time +import socket +from contextlib import closing +from six import string_types +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +import paddle.fluid.unique_name as nameGen +from paddle.fluid import core +import unittest +from multiprocessing import Process +import paddle.fluid.layers as layers +from functools import reduce +from test_collective_api_base_mlu import TestCollectiveAPIRunnerBase, runtime_main + +paddle.enable_static() + + +class TestCollectiveBroadcastAPI(TestCollectiveAPIRunnerBase): + def __init__(self): + self.global_ring_id = 0 + + def get_model(self, main_prog, startup_program, rank): + with fluid.program_guard(main_prog, startup_program): + tindata = layers.data( + name="tindata", shape=[10, 1000], dtype="float32") + paddle.distributed.broadcast(tindata, src=1) + return [tindata] + + +if __name__ == "__main__": + runtime_main(TestCollectiveBroadcastAPI, "broadcast") diff --git a/python/paddle/fluid/tests/unittests/mlu/test_c_comm_init_op_mlu.sh b/python/paddle/fluid/tests/unittests/mlu/test_c_comm_init_op_mlu.sh new file mode 100644 index 00000000000..97f21798c11 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_c_comm_init_op_mlu.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e +# use default values +# FIXME: random fails on Unknown command lines -c (or -m). +launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch.py +MLU_VISIBLE_DEVICES=0,1 python ${launch_py} c_comm_init_op_mlu.py diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_api_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_api_mlu.py new file mode 100644 index 00000000000..447498b9022 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_api_mlu.py @@ -0,0 +1,43 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +import numpy as np +import paddle + +from test_collective_api_base_mlu import TestDistBase + +paddle.enable_static() + + +class TestCollectiveAllreduceAPI(TestDistBase): + def _setup_config(self): + pass + + def test_allreduce_cncl_fp16(self): + self.check_with_place("collective_allreduce_api.py", "allreduce", + "float16") + + def test_allreduce_cncl_fp32(self): + self.check_with_place("collective_allreduce_api.py", "allreduce", + "float32") + + def test_allreduce_cncl_int32(self): + self.check_with_place("collective_allreduce_api.py", "allreduce", + "int32") + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_api_base_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_api_base_mlu.py new file mode 100644 index 00000000000..556fc6fcbb7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_api_base_mlu.py @@ -0,0 +1,223 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import numpy as np +import unittest +import os +import sys +import subprocess +import pickle +from contextlib import closing +import paddle +import paddle.fluid as fluid +from paddle.fluid import core + + +def DataTypeCast(date_type): + np_data_type = None + + if date_type == "float16": + np_data_type = np.float16 + elif date_type == "float32": + np_data_type = np.float32 + elif date_type == "int32": + np_data_type = np.int32 + else: + raise ValueError("This data type is not support!") + + return np_data_type + + +class TestCollectiveAPIRunnerBase(object): + def get_model(self, train_prog, startup_prog, rank, indata=None): + raise NotImplementedError( + "get model should be implemented by child class.") + + def run_trainer(self, args): + train_prog = fluid.Program() + startup_prog = fluid.Program() + endpoints = args["endpoints"].split(",") + rank = args["trainerid"] + current_endpoint = args["currentendpoint"] + nranks = 2 + paddle.distributed.init_parallel_env() + device_id = int(os.getenv("FLAGS_selected_mlus", "0")) + place = fluid.MLUPlace(device_id) + np.random.seed(os.getpid()) + np_data_type = DataTypeCast(args["data_type"]) + indata = np.random.random((10, 1000)).astype(np_data_type) + if args['static_mode']: + result = self.get_model(train_prog, startup_prog, rank) + exe = fluid.Executor(place) + exe.run(startup_prog) + fetch_list = [] + for elem in result: + fetch_list.append(elem.name) + out = exe.run(train_prog, + feed={'tindata': indata}, + fetch_list=fetch_list) + else: + out = self.get_model(train_prog, startup_prog, rank, indata) + #print(out, sys.stderr) + sys.stdout.buffer.write(pickle.dumps(out)) + + +def runtime_main(test_class, col_type): + args = {} + model = test_class() + args["trainerid"] = int(os.getenv("PADDLE_TRAINER_ID")) + args["trainernum"] = int(os.getenv("PADDLE_TRAINERS_NUM")) + args["endpoints"] = os.getenv('PADDLE_TRAINER_ENDPOINTS') + args["currentendpoint"] = os.getenv("PADDLE_CURRENT_ENDPOINT") + args["col_type"] = col_type + args["backend"] = os.getenv("BACKEND") + args["path_id"] = int(os.getenv("PATH_ID")) + args["static_mode"] = int(os.getenv("STATIC_MODE")) + args["data_type"] = os.getenv("DATA_TYPE") + model.run_trainer(args) + + +import paddle.compat as cpt +import socket +from contextlib import closing + + +class TestDistBase(unittest.TestCase): + def setUp(self): + self._port_set = set() + self._trainers = 2 + self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % ( + self._find_free_port(), self._find_free_port()) + self._python_interp = sys.executable + + def _find_free_port(self): + def __free_port(): + with closing(socket.socket(socket.AF_INET, + socket.SOCK_STREAM)) as s: + s.bind(('', 0)) + return s.getsockname()[1] + + while True: + port = __free_port() + if port not in self._port_set: + self._port_set.add(port) + return port + + def _run_cluster(self, model_file, envs): + worker_endpoints = self._ps_endpoints.split(",") + w0_ep, w1_ep = worker_endpoints + #print("w0_ep:",w0_ep," w1_ep:",w1_ep) + env0 = { + "FLAGS_selected_mlus": "0", + "PADDLE_TRAINER_ID": "0", + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints, + "PADDLE_CURRENT_ENDPOINT": w0_ep + } + + env1 = { + "FLAGS_selected_mlus": "1", + "PADDLE_TRAINER_ID": "1", + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints, + "PADDLE_CURRENT_ENDPOINT": w1_ep + } + #update environment + env0.update(envs) + env1.update(envs) + if os.getenv('WITH_COVERAGE', 'OFF') == 'ON': + tr_cmd = "%s -m 
coverage run --branch -p %s" + else: + tr_cmd = "%s %s" + tr0_cmd = tr_cmd % (self._python_interp, model_file) + tr1_cmd = tr_cmd % (self._python_interp, model_file) + tr0_pipe = open("/tmp/tr0_err_%d.log" % os.getpid(), "w") + tr1_pipe = open("/tmp/tr1_err_%d.log" % os.getpid(), "w") + #print(tr0_cmd) + tr0_proc = subprocess.Popen( + tr0_cmd.strip().split(), + stdout=subprocess.PIPE, + stderr=tr0_pipe, + env=env0) + + tr1_proc = subprocess.Popen( + tr0_cmd.strip().split(), + stdout=subprocess.PIPE, + stderr=tr1_pipe, + env=env1) + + tr0_out, tr0_err = tr0_proc.communicate() + tr1_out, tr1_err = tr1_proc.communicate() + sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err) + sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err) + # close trainer file + tr0_pipe.close() + tr1_pipe.close() + with open("/tmp/tr0_err_%d.log" % os.getpid(), "r") as f: + sys.stderr.write('trainer 0 stderr file: %s\n' % f.read()) + with open("/tmp/tr1_err_%d.log" % os.getpid(), "r") as f: + sys.stderr.write('trainer 1 stderr file: %s\n' % f.read()) + return pickle.loads(tr0_out), pickle.loads( + tr1_out), tr0_proc.pid, tr1_proc.pid + + def check_with_place(self, + model_file, + col_type, + data_type, + path_id="0", + static_mode="1", + check_error_log=False, + need_envs={}): + required_envs = { + "FLAGS_fraction_of_gpu_memory_to_use": "0.15", + "FLAGS_eager_delete_tensor_gb": "0.0", + "PATH": os.getenv("PATH"), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "LD_PRELOAD": os.getenv("LD_PRELOAD", ""), + "FLAGS_call_stack_level": "2", + "GLOG_v": "3", + "STATIC_MODE": static_mode, + "PADDLE_WITH_GLOO": '0', + "BACKEND": "cncl", + "PATH_ID": path_id, + "DATA_TYPE": data_type + } + required_envs.update(need_envs) + if check_error_log: + required_envs["GLOG_v"] = "3" + required_envs["GLOG_logtostderr"] = "1" + required_envs["GLOO_LOG_LEVEL"] = "TRACE" + tr0_out, tr1_out, pid0, pid1 = self._run_cluster(model_file, + required_envs) + np_data_type = DataTypeCast(data_type) + np.random.seed(pid0) + input1 = np.random.random((10, 1000)).astype(np_data_type) + np.random.seed(pid1) + input2 = np.random.random((10, 1000)).astype(np_data_type) + if col_type == "broadcast": + need_result = input2 + self.assertTrue(np.allclose(tr0_out, need_result)) + self.assertTrue(np.allclose(tr1_out, need_result)) + elif col_type == "allreduce": + need_result = input1 + input2 + self.assertTrue( + np.allclose( + tr0_out, need_result, rtol=1e-05, atol=1e-05)) + self.assertTrue( + np.allclose( + tr1_out, need_result, rtol=1e-05, atol=1e-05)) + else: + pass diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_broadcast_api_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_broadcast_api_mlu.py new file mode 100644 index 00000000000..95919f33328 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_broadcast_api_mlu.py @@ -0,0 +1,43 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import unittest +import numpy as np +import paddle + +from test_collective_api_base_mlu import TestDistBase + +paddle.enable_static() + + +class TestCollectiveBroadcastAPI(TestDistBase): + def _setup_config(self): + pass + + def test_broadcast_cncl_fp16(self): + self.check_with_place("collective_broadcast_api.py", "broadcast", + "float16") + + def test_broadcast_cncl_fp32(self): + self.check_with_place("collective_broadcast_api.py", "broadcast", + "float32") + + def test_broadcast_cncl_int32(self): + self.check_with_place("collective_broadcast_api.py", "broadcast", + "int32") + + +if __name__ == '__main__': + unittest.main() -- GitLab From c39aa18e0d3fe4eddd72ff1d07839655a8af8dbb Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Tue, 8 Mar 2022 10:48:33 +0800 Subject: [PATCH 101/261] [custom kernel]Upgrade support for multiple libs (#40223) * [custom kernel]Upgade support for multi libs * upgrade phi_custom_kernel deps --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/inference/api/CMakeLists.txt | 2 +- paddle/fluid/platform/CMakeLists.txt | 2 +- paddle/fluid/platform/init.cc | 2 +- paddle/phi/core/CMakeLists.txt | 2 +- paddle/phi/core/custom_kernel.cc | 71 ++++++++------------- paddle/phi/core/custom_kernel.h | 14 ++-- paddle/phi/core/kernel_registry.h | 3 +- paddle/phi/kernels/CMakeLists.txt | 2 +- paddle/phi/kernels/sparse/CMakeLists.txt | 2 +- paddle/phi/tests/core/CMakeLists.txt | 2 +- paddle/phi/tests/core/test_custom_kernel.cc | 4 +- paddle/testing/CMakeLists.txt | 2 +- 13 files changed, 45 insertions(+), 65 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index e486799495c..aa92a3b2226 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -443,7 +443,7 @@ cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framewo #cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ) #cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) -set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator phi_custom_kernel) +set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator) cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES}) diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 6eeb5d64253..1f83e606c3f 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -31,7 +31,7 @@ cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tens cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) set(paddle_inference_api_deps lod_tensor scope reset_tensor_array - analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator phi_custom_kernel) + analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator) if(WITH_CRYPTO) list(APPEND paddle_inference_api_deps paddle_crypto) diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 5a47443fd0b..04c8a329e5e 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -117,7 +117,7 @@ endif() cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost) # seperate init from 
device_context to avoid cycle dependencies -cc_library(init SRCS init.cc DEPS device_context phi_custom_kernel) +cc_library(init SRCS init.cc DEPS device_context custom_kernel) # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index cf85dede8e8..293a71dbd96 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -154,8 +154,8 @@ void LoadCustomDevice(const std::string &library_dir) { "Fail to open library: %s with error: %s", lib_path, dlerror())); phi::LoadCustomRuntimeLib(lib_path, dso_handle); - phi::LoadCustomKernelLib(lib_path, dso_handle); } + phi::CustomKernelMap::Instance().RegisterCustomKernels(); LOG(INFO) << "Finished in LoadCustomDevice with libs_path: [" << library_dir << "]"; } diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index 424c4ce2ebc..b4a6b54d0fe 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -25,7 +25,7 @@ cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor) cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor phi_enforce ddim memcpy) cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows) -cc_library(phi_custom_kernel SRCS custom_kernel.cc DEPS kernel_factory convert_utils op_registry phi_tensor_raw) +cc_library(custom_kernel SRCS custom_kernel.cc DEPS kernel_factory) # Will remove once we implemented MKLDNN_Tensor if(WITH_MKLDNN) diff --git a/paddle/phi/core/custom_kernel.cc b/paddle/phi/core/custom_kernel.cc index a333874d03e..bc317da8d98 100644 --- a/paddle/phi/core/custom_kernel.cc +++ b/paddle/phi/core/custom_kernel.cc @@ -12,21 +12,29 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined _WIN32 || defined __APPLE__ -#else -#define _LINUX -#endif - #include "paddle/phi/core/custom_kernel.h" namespace phi { -void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map) { - auto& kernel_info_map = custom_kernel_map.GetMap(); - VLOG(3) << "Size of custom_kernel_map: " << kernel_info_map.size(); +void CustomKernelMap::RegisterCustomKernel(const std::string& name, + const KernelKey& key, + const Kernel& kernel) { + PADDLE_ENFORCE_EQ(kernels_[name].find(key), + kernels_[name].end(), + phi::errors::AlreadyExists( + "The custom kernel [%s:%s] has been already existed in " + "CustomKernelMap, please check if any duplicate kernel " + "info in your lib(s) before load again.", + name, + key)); + kernels_[name][key] = kernel; +} + +void CustomKernelMap::RegisterCustomKernels() { + VLOG(3) << "Size of custom_kernel_map: " << kernels_.size(); auto& kernels = KernelFactory::Instance().kernels(); - for (auto& pair : kernel_info_map) { + for (auto& pair : kernels_) { PADDLE_ENFORCE_NE( kernels.find(pair.first), kernels.end(), @@ -38,8 +46,8 @@ void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map) { PADDLE_ENFORCE_EQ( kernels[pair.first].find(info_pair.first), kernels[pair.first].end(), - phi::errors::InvalidArgument( - "The operator <%s>'s kernel: %s has been already existed " + phi::errors::AlreadyExists( + "The kernel [%s:%s] has been already existed " "in Paddle, please contribute PR if it is necessary " "to optimize the kernel code. 
Custom kernel does NOT support " "to replace existing kernel in Paddle.", @@ -48,43 +56,14 @@ void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map) { kernels[pair.first][info_pair.first] = info_pair.second; - VLOG(3) << "Successed in registering operator <" << pair.first - << ">'s kernel: " << info_pair.first - << " to Paddle. It will be used like native ones."; + VLOG(3) << "Successed in registering kernel [" << pair.first << ":" + << info_pair.first + << "] to Paddle. It will be used like native ones."; } + kernels_[pair.first].clear(); } + LOG(INFO) << "Successed in loading custom kernels."; + kernels_.clear(); } -void LoadCustomKernelLib(const std::string& dso_lib_path, void* dso_handle) { -#ifdef _LINUX - typedef phi::CustomKernelMap& get_custom_kernel_map_t(); - auto* func = reinterpret_cast( - dlsym(dso_handle, "PD_GetCustomKernelMap")); - - if (func == nullptr) { - LOG(WARNING) << "Skipped lib [" << dso_lib_path << "]: fail to find " - << "PD_GetCustomKernelMap symbol in this lib."; - return; - } - auto& custom_kernel_map = func(); - phi::RegisterCustomKernels(custom_kernel_map); - LOG(INFO) << "Successed in loading custom kernels in lib: " << dso_lib_path; -#else - VLOG(3) << "Unsupported: Custom kernel is only implemented on Linux."; -#endif - return; -} } // namespace phi - -#ifdef __cplusplus -extern "C" { -#endif - -// C-API to get global CustomKernelMap. -phi::CustomKernelMap& PD_GetCustomKernelMap() { - return phi::CustomKernelMap::Instance(); -} - -#ifdef __cplusplus -} // end extern "C" -#endif diff --git a/paddle/phi/core/custom_kernel.h b/paddle/phi/core/custom_kernel.h index ffd12b9dd03..5ba14de6a61 100644 --- a/paddle/phi/core/custom_kernel.h +++ b/paddle/phi/core/custom_kernel.h @@ -29,6 +29,12 @@ class CustomKernelMap { return g_custom_kernel_info_map; } + void RegisterCustomKernel(const std::string& kernel_name, + const KernelKey& kernel_key, + const Kernel& kernel); + + void RegisterCustomKernels(); + KernelNameMap& Kernels() { return kernels_; } const KernelNameMap& GetMap() const { return kernels_; } @@ -40,12 +46,4 @@ class CustomKernelMap { KernelNameMap kernels_; }; -/** - * Note: - * Used to register custom kernels to KernelFactory. - */ -void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map); - -// Load custom kernel lib and register -void LoadCustomKernelLib(const std::string& dso_lib_path, void* dso_handle); } // namespace phi diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index 6a0c7bbc9b7..d9ed68593cd 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -210,7 +210,8 @@ struct KernelRegistrar { if (reg_type == RegType::INNER) { KernelFactory::Instance().kernels()[kernel_name][kernel_key] = kernel; } else { - CustomKernelMap::Instance().Kernels()[kernel_name][kernel_key] = kernel; + CustomKernelMap::Instance().RegisterCustomKernel( + kernel_name, kernel_key, kernel); } } }; diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 16fae8d879c..58ea231beef 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -10,7 +10,7 @@ add_subdirectory(funcs) set_property(GLOBAL PROPERTY PHI_KERNELS "") # [ 1. 
Common kernel compilation dependencies ] -set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils) +set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils custom_kernel) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor softmax) # remove this dep after removing fluid deps on tensor creation set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils) diff --git a/paddle/phi/kernels/sparse/CMakeLists.txt b/paddle/phi/kernels/sparse/CMakeLists.txt index a319e9a13c3..eaea6d95216 100644 --- a/paddle/phi/kernels/sparse/CMakeLists.txt +++ b/paddle/phi/kernels/sparse/CMakeLists.txt @@ -1,3 +1,3 @@ -set(SPARSE_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils math_function) +set(SPARSE_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils math_function custom_kernel) register_kernels(DEPS ${SPARSE_KERNEL_DEPS} SUB_DIR "sparse_kernel") diff --git a/paddle/phi/tests/core/CMakeLists.txt b/paddle/phi/tests/core/CMakeLists.txt index 5356bac9fbd..de9bd7a4d47 100644 --- a/paddle/phi/tests/core/CMakeLists.txt +++ b/paddle/phi/tests/core/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_test(test_custom_kernel SRCS test_custom_kernel.cc DEPS phi_custom_kernel) +cc_test(test_custom_kernel SRCS test_custom_kernel.cc DEPS custom_kernel) cc_test(test_dense_tensor SRCS test_dense_tensor.cc DEPS dense_tensor) cc_test(test_intrusive_ptr SRCS test_intrusive_ptr.cc) cc_test(test_type_info SRCS test_type_info.cc) diff --git a/paddle/phi/tests/core/test_custom_kernel.cc b/paddle/phi/tests/core/test_custom_kernel.cc index a4e89231e14..6fe34a6891a 100644 --- a/paddle/phi/tests/core/test_custom_kernel.cc +++ b/paddle/phi/tests/core/test_custom_kernel.cc @@ -172,7 +172,9 @@ TEST(CustomKernel, custom_kernel_dot) { fake_dot_kernels.end()); // register - phi::RegisterCustomKernels(phi::CustomKernelMap::Instance()); + phi::CustomKernelMap::Instance().RegisterCustomKernels(); + + EXPECT_EQ(0, static_cast(custom_fake_dot_kernels.size())); EXPECT_TRUE(fake_dot_kernels.find( phi::KernelKey(backend, layout, phi::DataType::FLOAT32)) != diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt index eace7c41f4a..0cc68bf3161 100644 --- a/paddle/testing/CMakeLists.txt +++ b/paddle/testing/CMakeLists.txt @@ -1,5 +1,5 @@ # for paddle test case if(WITH_TESTING) - cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init device_context memory gtest gflags) + cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init device_context memory gtest gflags proto_desc) endif() -- GitLab From d4b007af8bfa82df134220690115fcd58122de26 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Tue, 8 Mar 2022 10:53:28 +0800 Subject: [PATCH 102/261] add share dims (#40238) --- paddle/fluid/framework/infershape_utils.cc | 20 +++++++------ paddle/phi/core/meta_tensor.cc | 35 +++++++++++++++++----- paddle/phi/core/meta_tensor.h | 3 +- 3 files changed, 41 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index 7232a707916..91ef59575c3 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -232,16 +232,8 @@ class 
CompatMetaTensor : public phi::MetaTensor { } } - void share_meta(const MetaTensor& meta_tensor) override { + void share_dims(const MetaTensor& meta_tensor) override { set_dims(meta_tensor.dims()); - set_dtype(meta_tensor.dtype()); - // VarDesc doesn't contains layout, so we cannot share layout - // set_layout(meta_tensor.layout()); - - // special case 1: share lod of LoDTensor - share_lod(meta_tensor); - - // special case 2: share height and rows of SelectedRows in runtime if (is_runtime_) { auto* var = BOOST_GET(Variable*, var_); if (var->IsType()) { @@ -254,6 +246,16 @@ class CompatMetaTensor : public phi::MetaTensor { } } + void share_meta(const MetaTensor& meta_tensor) override { + set_dtype(meta_tensor.dtype()); + // VarDesc doesn't contains layout, so we cannot share layout + // set_layout(meta_tensor.layout()); + + // special case 1: share lod of LoDTensor + share_lod(meta_tensor); + share_dims(meta_tensor); + } + private: const LoD& GetRuntimeLoD() const { auto* var = BOOST_GET_CONST(Variable*, var_); diff --git a/paddle/phi/core/meta_tensor.cc b/paddle/phi/core/meta_tensor.cc index 2aadce4feda..eb114304f53 100644 --- a/paddle/phi/core/meta_tensor.cc +++ b/paddle/phi/core/meta_tensor.cc @@ -98,13 +98,9 @@ const LoD& MetaTensor::lod() const { } void MetaTensor::share_meta(const MetaTensor& meta_tensor) { - if (phi::DenseTensor::classof(tensor_)) { - set_dims(meta_tensor.dims()); - set_dtype(meta_tensor.dtype()); - set_layout(meta_tensor.layout()); - share_lod(meta_tensor); - } else if (phi::SelectedRows::classof(tensor_)) { - set_dims(meta_tensor.dims()); + if (phi::DenseTensor::classof(tensor_) || + phi::SelectedRows::classof(tensor_)) { + share_dims(meta_tensor); set_dtype(meta_tensor.dtype()); set_layout(meta_tensor.layout()); share_lod(meta_tensor); @@ -114,4 +110,29 @@ void MetaTensor::share_meta(const MetaTensor& meta_tensor) { } } +TensorBase* MetaTensor::get_tensor() const { return tensor_; } + +void MetaTensor::share_dims(const MetaTensor& meta_tensor) { + bool is_dense_tensor = phi::DenseTensor::classof(tensor_); + bool is_selected_rows = phi::SelectedRows::classof(tensor_); + if (is_dense_tensor || is_selected_rows) { + set_dims(meta_tensor.dims()); + if (is_selected_rows) { + const auto in_tensor_base = meta_tensor.get_tensor(); + PADDLE_ENFORCE_EQ( + phi::SelectedRows::classof(in_tensor_base), + true, + errors::InvalidArgument("The input MetaTensor is SelectedRows, but " + "the output MetaTensor is not this type.")); + auto* selected_rows_out = static_cast(tensor_); + auto* selected_rows_in = static_cast(in_tensor_base); + selected_rows_out->set_rows(selected_rows_in->rows()); + selected_rows_out->set_height(selected_rows_in->height()); + } + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Unsupported sharing dims for `%s`.", tensor_->type_info().name())); + } +} + } // namespace phi diff --git a/paddle/phi/core/meta_tensor.h b/paddle/phi/core/meta_tensor.h index 1a32019a190..3971a9f7e99 100644 --- a/paddle/phi/core/meta_tensor.h +++ b/paddle/phi/core/meta_tensor.h @@ -60,12 +60,13 @@ class MetaTensor { virtual void share_lod(const MetaTensor& meta_tensor); virtual void share_meta(const MetaTensor& meta_tensor); + virtual void share_dims(const MetaTensor& meta_tensor); private: // Because the lod in compiletime and runtime is different, // so `LoD` cannot in public methods const LoD& lod() const; - + TensorBase* get_tensor() const; TensorBase* tensor_; }; -- GitLab From f876320a9836a6a12ab6e8b3ddb079fc2ae6e746 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Tue, 8 
Mar 2022 11:15:39 +0800 Subject: [PATCH 103/261] support code auto-gene for sparse backward api (#40196) --- .gitignore | 2 + paddle/phi/api/lib/CMakeLists.txt | 26 ++- .../paddle/utils/code_gen/backward_api_gen.py | 1 + .../paddle/utils/code_gen/sparse_api_gen.py | 9 +- .../paddle/utils/code_gen/sparse_bw_api.yaml | 6 + .../utils/code_gen/sparse_bw_api_gen.py | 200 ++++++++++++++++++ 6 files changed, 235 insertions(+), 9 deletions(-) create mode 100644 python/paddle/utils/code_gen/sparse_bw_api.yaml create mode 100644 python/paddle/utils/code_gen/sparse_bw_api_gen.py diff --git a/.gitignore b/.gitignore index a2009a1ed30..21222678f04 100644 --- a/.gitignore +++ b/.gitignore @@ -6,12 +6,14 @@ paddle/fluid/eager/api/generated/* paddle/fluid/op_use_default_grad_maker_DEV.spec paddle/fluid/op_use_default_grad_maker_PR.spec paddle/phi/api/backward/backward_api.h +paddle/phi/api/backward/sparse_bw_api.h paddle/phi/api/include/api.h paddle/phi/api/include/sparse_api.h paddle/phi/api/lib/api.cc paddle/phi/api/lib/dygraph_api.* paddle/phi/api/lib/backward_api.cc paddle/phi/api/lib/sparse_api.cc +paddle/phi/api/lib/sparse_bw_api.cc paddle/phi/extension.h paddle/phi/include/* paddle/phi/infermeta/generated.* diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 4f449c578ba..926ddf8ba49 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -40,6 +40,14 @@ set(sparse_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/sparse_api.cc) set(sparse_api_header_file_tmp ${api_header_file}.tmp) set(sparse_api_source_file_tmp ${api_source_file}.tmp) +# sparse bw api file +set(sparse_bw_api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api_gen.py) +set(sparse_bw_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml) +set(sparse_bw_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/backward/sparse_bw_api.h) +set(sparse_bw_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/sparse_bw_api.cc) +set(sparse_bw_api_header_file_tmp ${sparse_bw_api_header_file}.tmp) +set(sparse_bw_api_source_file_tmp ${sparse_bw_api_source_file}.tmp) + # wrapped infermeta file set(wrapped_infermeta_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py) set(api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml) @@ -91,7 +99,20 @@ add_custom_command( COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_header_file_tmp} ${sparse_api_header_file} COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_source_file_tmp} ${sparse_api_source_file} COMMENT "copy_if_different ${sparse_api_header_file} ${sparse_sparse_api_source_file}" - DEPENDS ${sparse_api_yaml_file} ${sparse_api_gen_file} ${api_gen_base} + DEPENDS ${sparse_api_yaml_file} ${sparse_api_gen_file} ${api_gen_base} ${api_gen_file} + VERBATIM) + +# generate backward sparse api +add_custom_command( + OUTPUT ${sparse_bw_api_header_file} ${sparse_bw_api_source_file} + COMMAND ${PYTHON_EXECUTABLE} ${sparse_bw_api_gen_file} + --api_yaml_path ${sparse_bw_api_yaml_file} + --api_header_path ${sparse_bw_api_header_file_tmp} + --api_source_path ${sparse_bw_api_source_file_tmp} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_bw_api_header_file_tmp} ${sparse_bw_api_header_file} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_bw_api_source_file_tmp} ${sparse_bw_api_source_file} + COMMENT "copy_if_different ${sparse_bw_api_header_file} ${sparse_bw_sparse_api_source_file}" + DEPENDS 
${sparse_bw_api_yaml_file} ${sparse_bw_api_gen_file} ${api_gen_base} ${api_gen_file} ${sparse_api_gen_file} ${bw_api_gen_file} VERBATIM) # generate wrapped infermeta @@ -113,9 +134,10 @@ cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfe cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform) cc_library(sparse_api_custom_impl SRCS sparse_api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform) -cc_library(sparse_api SRCS sparse_api.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api_custom_impl) cc_library(phi_function_api SRCS ${api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform api_custom_impl) cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform) cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta phi_data_transform phi_function_api api_custom_impl) +cc_library(sparse_api SRCS ${sparse_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api_custom_impl) +cc_library(sparse_bw_api SRCS ${sparse_bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api sparse_api_custom_impl) cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api) diff --git a/python/paddle/utils/code_gen/backward_api_gen.py b/python/paddle/utils/code_gen/backward_api_gen.py index b9f991f9b0f..7bd488cc114 100644 --- a/python/paddle/utils/code_gen/backward_api_gen.py +++ b/python/paddle/utils/code_gen/backward_api_gen.py @@ -35,6 +35,7 @@ class BackwardAPI(BaseAPI): forward_config) api = result.group('api') _, outputs, _ = self.parse_output(self.api, result.group('outputs')) + outputs = [item.split('@')[0] for item in outputs] fw_inputs, fw_attrs, _, = self.parse_input_and_attr( api, result.group('args')) diff --git a/python/paddle/utils/code_gen/sparse_api_gen.py b/python/paddle/utils/code_gen/sparse_api_gen.py index 99c5a4f49f8..d845653f488 100644 --- a/python/paddle/utils/code_gen/sparse_api_gen.py +++ b/python/paddle/utils/code_gen/sparse_api_gen.py @@ -17,10 +17,10 @@ import yaml import argparse import re -from api_base import BaseAPI +from api_gen import ForwardAPI -class SparseAPI(BaseAPI): +class SparseAPI(ForwardAPI): def __init__(self, api_item_yaml): super(SparseAPI, self).__init__(api_item_yaml) @@ -30,11 +30,6 @@ class SparseAPI(BaseAPI): def get_api_func_name(self): return self.api - def get_return_type(self, out_type_list): - return out_type_list[0] if len( - out_type_list) == 1 else "std::tuple<" + ",".join( - out_type_list) + ">" - def gene_api_declaration(self): return f""" // {", ".join(self.outputs['names'])} diff --git a/python/paddle/utils/code_gen/sparse_bw_api.yaml b/python/paddle/utils/code_gen/sparse_bw_api.yaml new file mode 100644 index 00000000000..c71dce50299 --- /dev/null +++ b/python/paddle/utils/code_gen/sparse_bw_api.yaml @@ -0,0 +1,6 @@ +- sparse_bw_api : conv3d_grad + forward : conv3d (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) + args : (Tensor x, Tensor kernel, Tensor rulebook, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups) + output : Tensor(x_grad@DenseTensor), Tensor(kernel_grad@DenseTensor) + kernel : + func : sparse_conv_grad diff 
--git a/python/paddle/utils/code_gen/sparse_bw_api_gen.py b/python/paddle/utils/code_gen/sparse_bw_api_gen.py new file mode 100644 index 00000000000..6ef294caa14 --- /dev/null +++ b/python/paddle/utils/code_gen/sparse_bw_api_gen.py @@ -0,0 +1,200 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import yaml +import argparse +import re + +from sparse_api_gen import SparseAPI +from backward_api_gen import BackwardAPI + + +class SparseBackwardAPI(SparseAPI, BackwardAPI): + def __init__(self, bw_api_item_yaml): + BackwardAPI.__init__(self, bw_api_item_yaml) + + def get_api_name(self, api_item_yaml): + return api_item_yaml['sparse_bw_api'] + + def get_api_func_name(self): + return self.api + + def get_return_type(self, out_type_list): + return BackwardAPI.get_return_type(self, out_type_list) + + def gene_api_declaration(self): + return SparseAPI.gene_api_declaration(self) + + def gene_output(self, + output_type_list, + set_out_func, + code_indent, + inplace_flag=False): + kernel_output = "" + output_names = [] + output_create = "" + + if len(output_type_list) == 1: + kernel_output = 'kernel_out' + output_names.append('kernel_out') + inplace_assign = " = " + self.inplace_map[self.outputs['names'][ + 0]] if inplace_flag and self.inplace_map is not None and self.outputs[ + 'names'][0] in self.inplace_map else "" + output_create = f""" + {self.outputs['return_type']} out{inplace_assign}; + auto kernel_out = {set_out_func}(&out, {self.get_kernel_tensor_out_type(self.outputs['names'][0])});""" + + elif len(output_type_list) > 1: + output_create = f""" + {self.outputs['return_type']} out({len(output_type_list)});""" + + for i, out_type_item in enumerate(output_type_list): + kernel_output = kernel_output + f'kernel_out_{i}, ' + output_names.append(f'kernel_out_{i}') + if out_type_item == 'Tensor': + get_out_code = f'&out[{i}][0]' + if inplace_flag and self.inplace_map is not None and self.outputs[ + 'names'][i] in self.inplace_map: + output_create = output_create + f""" + out[{i}].emplace_back({self.inplace_map[self.outputs['names'][i]]});""" + + else: + output_create = output_create + f""" + out[{i}].emplace_back();""" + + else: + get_out_code = f'&out[{i}]' + if inplace_flag and self.inplace_map is not None and self.outputs[ + 'names'][i] in self.inplace_map: + output_create = output_create + f""" + out[{i}] = {self.inplace_map[self.outputs['names'][i]]};""" + + output_create = output_create + f""" + auto kernel_out_{i} = {set_out_func}({get_out_code}, {self.get_kernel_tensor_out_type(self.outputs['names'][i])});""" + + kernel_output = kernel_output[:-2] + else: + raise ValueError( + "{} : Output error: the output should not be empty.".format( + self.api)) + + return kernel_output, output_names, output_create + + +def header_include(): + return """ +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/common/scalar_array.h" +#include "paddle/utils/optional.h" +""" + + +def 
source_include(header_file_path): + return f""" +#include "{header_file_path}" +#include + +#include "glog/logging.h" + +#include "paddle/phi/api/lib/api_registry.h" +#include "paddle/phi/api/lib/api_gen_utils.h" +#include "paddle/phi/api/lib/kernel_dispatch.h" +#include "paddle/phi/api/lib/sparse_api_custom_impl.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/declarations.h" +""" + + +def api_register(): + return """ +PD_REGISTER_API(Test); +""" + + +def api_namespace(): + return (""" +namespace paddle { +namespace experimental { +namespace sparse { + +""", """ + +} // namespace sparse +} // namespace experimental +} // namespace paddle +""") + + +def generate_api(api_yaml_path, header_file_path, source_file_path): + + with open(api_yaml_path, 'r') as f: + apis = yaml.load(f, Loader=yaml.FullLoader) + header_file = open(header_file_path, 'w') + source_file = open(source_file_path, 'w') + + namespace = api_namespace() + + header_file.write("#pragma once\n") + header_file.write(header_include()) + header_file.write(namespace[0]) + + include_header_file = "paddle/phi/api/backward/sparse_bw_api.h" + source_file.write(source_include(include_header_file)) + source_file.write(namespace[0]) + + for api in apis: + sparse_bw_api = SparseBackwardAPI(api) + header_file.write(sparse_bw_api.gene_api_declaration()) + source_file.write(sparse_bw_api.gene_api_code()) + + header_file.write(namespace[1]) + source_file.write(namespace[1]) + + source_file.write(api_register()) + + header_file.close() + source_file.close() + + +def main(): + parser = argparse.ArgumentParser( + description='Generate PaddlePaddle C++ Sparse API files') + parser.add_argument( + '--api_yaml_path', + help='path to sparse api yaml file', + default='python/paddle/utils/code_gen/sparse_bw_api.yaml') + + parser.add_argument( + '--api_header_path', + help='output of generated api header code file', + default='paddle/phi/api/backward/sparse_bw_api.h') + + parser.add_argument( + '--api_source_path', + help='output of generated api source code file', + default='paddle/phi/api/lib/sparse_bw_api.cc') + + options = parser.parse_args() + + api_yaml_path = options.api_yaml_path + header_file_path = options.api_header_path + source_file_path = options.api_source_path + + generate_api(api_yaml_path, header_file_path, source_file_path) + + +if __name__ == '__main__': + main() -- GitLab From 3c536f2e65c65cc986e8aaff86214426498d1f7a Mon Sep 17 00:00:00 2001 From: WJJ1995 Date: Tue, 8 Mar 2022 13:11:12 +0800 Subject: [PATCH 104/261] =?UTF-8?q?[phi]=20move=20isnan=5Fv2=E3=80=81isfin?= =?UTF-8?q?ite=5Fv2=E3=80=81isinf=5Fv2=20to=20phi=20(#40076)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * support isfinite for phi * mark v2 * fixed bugs * fixed include bugs * deal with comments * decoupling selected_rows * rm bfloat16 * fixed infermeta * fixed code style * rm useless code * replace pt by pd --- paddle/fluid/operators/isfinite_v2_op.cc | 64 ++++--------- paddle/fluid/operators/isfinite_v2_op.cu | 55 ----------- paddle/phi/core/compat/op_utils.h | 3 + paddle/phi/infermeta/unary.cc | 5 + paddle/phi/infermeta/unary.h | 2 + paddle/phi/kernels/cpu/isfinite_kernel.cc | 62 ++++++++++++ .../kernels/funcs/isfinite_functor.h} | 33 +++---- paddle/phi/kernels/gpu/isfinite_kernel.cu | 61 ++++++++++++ .../phi/kernels/impl/isfinite_kernel_impl.h | 39 ++++++++ paddle/phi/kernels/isfinite_kernel.h | 31 ++++++ .../kernels/selected_rows/isfinite_kernel.cc | 96 +++++++++++++++++++ 
.../kernels/selected_rows/isfinite_kernel.h | 31 ++++++ .../selected_rows/isfinite_kernel_impl.h | 39 ++++++++ paddle/phi/ops/compat/isfinite_sig.cc | 19 ++++ 14 files changed, 419 insertions(+), 121 deletions(-) delete mode 100644 paddle/fluid/operators/isfinite_v2_op.cu create mode 100644 paddle/phi/kernels/cpu/isfinite_kernel.cc rename paddle/{fluid/operators/isfinite_v2_op.h => phi/kernels/funcs/isfinite_functor.h} (52%) create mode 100644 paddle/phi/kernels/gpu/isfinite_kernel.cu create mode 100644 paddle/phi/kernels/impl/isfinite_kernel_impl.h create mode 100644 paddle/phi/kernels/isfinite_kernel.h create mode 100644 paddle/phi/kernels/selected_rows/isfinite_kernel.cc create mode 100644 paddle/phi/kernels/selected_rows/isfinite_kernel.h create mode 100644 paddle/phi/kernels/selected_rows/isfinite_kernel_impl.h create mode 100644 paddle/phi/ops/compat/isfinite_sig.cc diff --git a/paddle/fluid/operators/isfinite_v2_op.cc b/paddle/fluid/operators/isfinite_v2_op.cc index 735fffa7203..cfa370ff9cb 100644 --- a/paddle/fluid/operators/isfinite_v2_op.cc +++ b/paddle/fluid/operators/isfinite_v2_op.cc @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/isfinite_v2_op.h" - #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace framework { @@ -49,11 +51,6 @@ class OverflowV2Op : public framework::OperatorWithKernel { const framework::VariableNameMap &outputs, const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "isfinitev2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "isfinitev2"); - UnaryOpUnchangedInferShape(ctx); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -104,6 +101,14 @@ element of X as a tensor. 
} // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(isinf_v2, IsinfInferShapeFunctor, + PD_INFER_META(phi::IsfiniteInferMeta)); + +DECLARE_INFER_SHAPE_FUNCTOR(isnan_v2, IsnanInferShapeFunctor, + PD_INFER_META(phi::IsfiniteInferMeta)); + +DECLARE_INFER_SHAPE_FUNCTOR(isfinite_v2, IsfiniteInferShapeFunctor, + PD_INFER_META(phi::IsfiniteInferMeta)); #define REGISTER_V2OP_MAKER(op_type, comment) \ namespace paddle { \ @@ -124,50 +129,17 @@ REGISTER_V2OP_MAKER(isfinite_v2, "isfinitev2(X)"); REGISTER_OPERATOR( isinf_v2, ops::OverflowV2Op, ops::_isinf_v2OverflowV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + IsinfInferShapeFunctor); REGISTER_OPERATOR( isnan_v2, ops::OverflowV2Op, ops::_isnan_v2OverflowV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + IsnanInferShapeFunctor); REGISTER_OPERATOR( isfinite_v2, ops::OverflowV2Op, ops::_isfinite_v2OverflowV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL(isnan_v2, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CPU_KERNEL( - isinf_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CPU_KERNEL( - isfinite_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); + paddle::framework::EmptyGradOpMaker, + IsfiniteInferShapeFunctor); diff --git a/paddle/fluid/operators/isfinite_v2_op.cu b/paddle/fluid/operators/isfinite_v2_op.cu deleted file mode 100644 index 1b9f19d36df..00000000000 --- a/paddle/fluid/operators/isfinite_v2_op.cu +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/isfinite_v2_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(isnan_v2, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CUDA_KERNEL( - isinf_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CUDA_KERNEL( - isfinite_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index 8f64a7145ed..9947e00ecb5 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -40,6 +40,9 @@ const std::unordered_set standard_kernel_suffixs({ const std::unordered_set deprecated_op_names({"diag", "flatten", "flatten_grad", + "isinf", + "isnan", + "isfinite", "matmul", "matmul_grad", "matmul_grad_grad", diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 4053cfbc362..17edc846187 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -1007,6 +1007,11 @@ void SizeInferMeta(const MetaTensor& input, MetaTensor* out) { out->set_dims({1}); } +void IsfiniteInferMeta(const MetaTensor& x, MetaTensor* out) { + out->set_dims(x.dims()); + out->set_dtype(DataType::BOOL); +} + void PixelShuffleInferMeta(const MetaTensor& x, int upscale_factor, const std::string& data_format, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index a679ef8c11a..dac7c19cf9b 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -157,6 +157,8 @@ void PixelShuffleInferMeta(const MetaTensor& x, const std::string& data_format, MetaTensor* out); +void IsfiniteInferMeta(const MetaTensor& input, MetaTensor* out); + void TransposeInferMeta(const MetaTensor& x, const std::vector& axis, MetaTensor* out); diff --git a/paddle/phi/kernels/cpu/isfinite_kernel.cc b/paddle/phi/kernels/cpu/isfinite_kernel.cc new file mode 100644 index 00000000000..33a7429a22a --- /dev/null +++ b/paddle/phi/kernels/cpu/isfinite_kernel.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/isfinite_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/isfinite_kernel_impl.h" + +namespace phi { + +template +inline void IsfiniteKernelImpl(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + dev_ctx.template Alloc(out); + Functor functor; + functor(x, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(isinf, + CPU, + ALL_LAYOUT, + phi::IsinfKernel, + float, + double, + phi::dtype::float16, + int, + int64_t) {} + +PD_REGISTER_KERNEL(isnan, + CPU, + ALL_LAYOUT, + phi::IsnanKernel, + float, + double, + phi::dtype::float16, + int, + int64_t) {} + +PD_REGISTER_KERNEL(isfinite, + CPU, + ALL_LAYOUT, + phi::IsfiniteKernel, + float, + double, + phi::dtype::float16, + int, + int64_t) {} diff --git a/paddle/fluid/operators/isfinite_v2_op.h b/paddle/phi/kernels/funcs/isfinite_functor.h similarity index 52% rename from paddle/fluid/operators/isfinite_v2_op.h rename to paddle/phi/kernels/funcs/isfinite_functor.h index b646e460ec7..c804bee8d4c 100644 --- a/paddle/fluid/operators/isfinite_v2_op.h +++ b/paddle/phi/kernels/funcs/isfinite_functor.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,39 +14,32 @@ #pragma once -#include - -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/isfinite_op.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/transform.h" +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/unary.h" namespace phi { -class DenseTensor; -} // namespace phi - -namespace paddle { -namespace operators { +namespace funcs { struct InfinityV2Functor { - void operator()(const framework::Tensor& tensor, framework::Tensor* out) { - framework::TensorContainsInfV2(tensor, out); + void operator()(const DenseTensor& tensor, DenseTensor* out) { + paddle::framework::TensorContainsInfV2(tensor, out); } }; struct NANV2Functor { - void operator()(const framework::Tensor& tensor, framework::Tensor* out) { - framework::TensorContainsNANV2(tensor, out); + void operator()(const DenseTensor& tensor, DenseTensor* out) { + paddle::framework::TensorContainsNANV2(tensor, out); } }; struct IsfiniteV2Functor { - void operator()(const framework::Tensor& tensor, framework::Tensor* out) { - framework::TensorIsfiniteV2(tensor, out); + void operator()(const DenseTensor& tensor, DenseTensor* out) { + paddle::framework::TensorIsfiniteV2(tensor, out); } }; -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/isfinite_kernel.cu b/paddle/phi/kernels/gpu/isfinite_kernel.cu new file mode 100644 index 00000000000..4b41ed1e55d --- /dev/null +++ b/paddle/phi/kernels/gpu/isfinite_kernel.cu @@ -0,0 +1,61 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/isfinite_kernel_impl.h" +#include "paddle/phi/kernels/isfinite_kernel.h" + +namespace phi { + +template +inline void IsfiniteKernelImpl(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + dev_ctx.template Alloc(out); + Functor functor; + functor(x, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(isinf, + GPU, + ALL_LAYOUT, + phi::IsinfKernel, + float, + double, + phi::dtype::float16, + int, + int64_t) {} + +PD_REGISTER_KERNEL(isnan, + GPU, + ALL_LAYOUT, + phi::IsnanKernel, + float, + double, + phi::dtype::float16, + int, + int64_t) {} + +PD_REGISTER_KERNEL(isfinite, + GPU, + ALL_LAYOUT, + phi::IsfiniteKernel, + float, + double, + phi::dtype::float16, + int, + int64_t) {} diff --git a/paddle/phi/kernels/impl/isfinite_kernel_impl.h b/paddle/phi/kernels/impl/isfinite_kernel_impl.h new file mode 100644 index 00000000000..affa85f8a2d --- /dev/null +++ b/paddle/phi/kernels/impl/isfinite_kernel_impl.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/funcs/isfinite_functor.h" +#include "paddle/phi/kernels/isfinite_kernel.h" + +namespace phi { + +template +inline void IsfiniteKernelImpl(const Context& ctx, + const DenseTensor& x, + DenseTensor* out); + +#define DEFINE_ISFINITE_KERNEL(isfinite_kernel, functor) \ + template \ + void isfinite_kernel( \ + const Context& ctx, const DenseTensor& x, DenseTensor* out) { \ + IsfiniteKernelImpl(ctx, x, out); \ + } + +DEFINE_ISFINITE_KERNEL(IsinfKernel, funcs::InfinityV2Functor) +DEFINE_ISFINITE_KERNEL(IsnanKernel, funcs::NANV2Functor) +DEFINE_ISFINITE_KERNEL(IsfiniteKernel, funcs::IsfiniteV2Functor) +#undef DEFINE_ISFINITE_KERNEL + +} // namespace phi diff --git a/paddle/phi/kernels/isfinite_kernel.h b/paddle/phi/kernels/isfinite_kernel.h new file mode 100644 index 00000000000..e695a8e0742 --- /dev/null +++ b/paddle/phi/kernels/isfinite_kernel.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +#define DEFINE_ISFINITE_KERNEL(isfinite_kernel) \ + template \ + void isfinite_kernel( \ + const Context& ctx, const DenseTensor& x, DenseTensor* out); + +DEFINE_ISFINITE_KERNEL(IsinfKernel) +DEFINE_ISFINITE_KERNEL(IsnanKernel) +DEFINE_ISFINITE_KERNEL(IsfiniteKernel) +#undef DEFINE_ISFINITE_KERNEL + +} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/isfinite_kernel.cc b/paddle/phi/kernels/selected_rows/isfinite_kernel.cc new file mode 100644 index 00000000000..a507cdd0d86 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/isfinite_kernel.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/selected_rows/isfinite_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include "paddle/phi/backends/gpu/gpu_context.h" +#endif +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/selected_rows/isfinite_kernel_impl.h" + +namespace phi { + +template +inline void IsfiniteSRImpl(const Context& dev_ctx, + const SelectedRows& x, + SelectedRows* out) { + dev_ctx.template Alloc(out); + Functor functor; + functor(x.value(), out->mutable_value()); +} +} // namespace phi + +PD_REGISTER_KERNEL(isinf_sr, + CPU, + ALL_LAYOUT, + phi::IsinfSR, + float, + double, + phi::dtype::float16, + int, + int64_t) {} + +PD_REGISTER_KERNEL(isnan_sr, + CPU, + ALL_LAYOUT, + phi::IsnanSR, + float, + double, + phi::dtype::float16, + int, + int64_t) {} + +PD_REGISTER_KERNEL(isfinite_sr, + CPU, + ALL_LAYOUT, + phi::IsfiniteSR, + float, + double, + phi::dtype::float16, + int, + int64_t) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL(isinf_sr, + GPU, + ALL_LAYOUT, + phi::IsinfSR, + float, + double, + phi::dtype::float16, + int, + int64_t) {} + +PD_REGISTER_KERNEL(isnan_sr, + GPU, + ALL_LAYOUT, + phi::IsnanSR, + float, + double, + phi::dtype::float16, + int, + int64_t) {} + +PD_REGISTER_KERNEL(isfinite_sr, + GPU, + ALL_LAYOUT, + phi::IsfiniteSR, + float, + double, + phi::dtype::float16, + int, + int64_t) {} +#endif diff --git a/paddle/phi/kernels/selected_rows/isfinite_kernel.h b/paddle/phi/kernels/selected_rows/isfinite_kernel.h new file mode 100644 index 00000000000..948d8c89477 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/isfinite_kernel.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/selected_rows.h" + +namespace phi { + +#define DEFINE_ISFINITE_SR(isfinite_sr) \ + template \ + void isfinite_sr( \ + const Context& ctx, const SelectedRows& x, SelectedRows* out); + +DEFINE_ISFINITE_SR(IsinfSR) +DEFINE_ISFINITE_SR(IsnanSR) +DEFINE_ISFINITE_SR(IsfiniteSR) +#undef DEFINE_ISFINITE_SR + +} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/isfinite_kernel_impl.h b/paddle/phi/kernels/selected_rows/isfinite_kernel_impl.h new file mode 100644 index 00000000000..c53abdf996c --- /dev/null +++ b/paddle/phi/kernels/selected_rows/isfinite_kernel_impl.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/funcs/isfinite_functor.h" +#include "paddle/phi/kernels/selected_rows/isfinite_kernel.h" + +namespace phi { + +template +inline void IsfiniteSRImpl(const Context& ctx, + const SelectedRows& x, + SelectedRows* out); + +#define DEFINE_ISFINITE_SR(isfinite_sr, functor) \ + template \ + void isfinite_sr( \ + const Context& ctx, const SelectedRows& x, SelectedRows* out) { \ + IsfiniteSRImpl(ctx, x, out); \ + } + +DEFINE_ISFINITE_SR(IsinfSR, funcs::InfinityV2Functor) +DEFINE_ISFINITE_SR(IsnanSR, funcs::NANV2Functor) +DEFINE_ISFINITE_SR(IsfiniteSR, funcs::IsfiniteV2Functor) +#undef DEFINE_ISFINITE_SR + +} // namespace phi diff --git a/paddle/phi/ops/compat/isfinite_sig.cc b/paddle/phi/ops/compat/isfinite_sig.cc new file mode 100644 index 00000000000..218b4c2f962 --- /dev/null +++ b/paddle/phi/ops/compat/isfinite_sig.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +PD_REGISTER_BASE_KERNEL_NAME(isinf_v2, isinf); +PD_REGISTER_BASE_KERNEL_NAME(isnan_v2, isnan); +PD_REGISTER_BASE_KERNEL_NAME(isfinite_v2, isfinite); -- GitLab From 13f2b1e381d6ab112dd431bdb415e7ea04fbb7b7 Mon Sep 17 00:00:00 2001 From: xiongkun Date: Tue, 8 Mar 2022 13:29:26 +0800 Subject: [PATCH 105/261] [phi] transfer accuracy op and pass the unittests (#39982) * transfer accuracy op and pass the ci * remove header file * fix code * fix code * fix * fix --- paddle/fluid/operators/metrics/accuracy_op.cc | 9 +- paddle/fluid/operators/metrics/accuracy_op.cu | 110 ---------------- paddle/fluid/operators/metrics/accuracy_op.h | 74 ----------- .../operators/metrics/accuracy_op_mlu.cc | 3 +- .../operators/metrics/accuracy_op_npu.cc | 2 +- .../operators/metrics/accuracy_op_xpu.cc | 4 +- paddle/phi/kernels/accuracy_kernel.h | 30 +++++ paddle/phi/kernels/cpu/accuracy_kernel.cc | 72 +++++++++++ paddle/phi/kernels/gpu/accuracy_kernel.cu | 117 ++++++++++++++++++ 9 files changed, 228 insertions(+), 193 deletions(-) delete mode 100644 paddle/fluid/operators/metrics/accuracy_op.cu delete mode 100644 paddle/fluid/operators/metrics/accuracy_op.h create mode 100644 paddle/phi/kernels/accuracy_kernel.h create mode 100644 paddle/phi/kernels/cpu/accuracy_kernel.cc create mode 100644 paddle/phi/kernels/gpu/accuracy_kernel.cu diff --git a/paddle/fluid/operators/metrics/accuracy_op.cc b/paddle/fluid/operators/metrics/accuracy_op.cc index 3692ace8bb5..056620db5b9 100644 --- a/paddle/fluid/operators/metrics/accuracy_op.cc +++ b/paddle/fluid/operators/metrics/accuracy_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -123,13 +123,10 @@ with the input Out(Inference). } // namespace operators } // namespace paddle +// FIXME(typhoonzero): types of T is for infernece data. +// label data is always int. namespace ops = paddle::operators; REGISTER_OPERATOR( accuracy, ops::AccuracyOp, ops::AccuracyOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -// FIXME(typhoonzero): types of T is for infernece data. -// label data is always int. -REGISTER_OP_CPU_KERNEL(accuracy, - ops::AccuracyKernel, - ops::AccuracyKernel); diff --git a/paddle/fluid/operators/metrics/accuracy_op.cu b/paddle/fluid/operators/metrics/accuracy_op.cu deleted file mode 100644 index 6f19100fa9d..00000000000 --- a/paddle/fluid/operators/metrics/accuracy_op.cu +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include "paddle/fluid/operators/metrics/accuracy_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void AccuracyCudaKernel(const int N, const int D, - const int64_t* Xdata, - const int64_t* labeldata, int* correct_data, - float* accuracy, int* total_data) { - int count = 0; - __shared__ int total[BlockSize]; - - // support only 1 block - for (int i = threadIdx.x; i < (N); i += BlockSize) { - for (int j = 0; j < D; ++j) { - if (Xdata[i * D + j] == labeldata[i]) { - ++count; - break; - } - } - } - total[threadIdx.x] = count; - __syncthreads(); - -// reduce the count with init value 0, and output accuracy. -#ifdef PADDLE_WITH_CUDA - int result = thrust::reduce(thrust::device, total, total + BlockSize, 0); -#else - // HIP thrust::reduce not support __device__ - for (int s = BlockSize / 2; s > 0; s >>= 1) { - if (threadIdx.x < s) { - total[threadIdx.x] += total[threadIdx.x + s]; - } - __syncthreads(); - } - int result = total[0]; -#endif - if (threadIdx.x == 0) { - *correct_data = result; - *accuracy = static_cast(result) / static_cast(N); - *total_data = N; - } -} - -template -class AccuracyOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* inference = ctx.Input("Out"); - auto* indices = ctx.Input("Indices"); - auto* label = ctx.Input("Label"); - - auto* accuracy = ctx.Output("Accuracy"); - auto* correct = ctx.Output("Correct"); - auto* total = ctx.Output("Total"); - // FIXME(typhoonzero): only support indices currently - // if add support for output values, how to detect the data type? - const int64_t* indices_data = indices->data(); - const int64_t* label_data = label->data(); - - int* correct_data = correct->mutable_data(ctx.GetPlace()); - int* total_data = total->mutable_data(ctx.GetPlace()); - float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); - - int num_samples = static_cast(inference->dims()[0]); - size_t infer_width = inference->dims()[1]; - auto stream = ctx.cuda_device_context().stream(); - platform::GpuMemsetAsync(accuracy_data, 0, sizeof(float), stream); - - if (num_samples == 0) { - return; - } - - AccuracyCudaKernel< - PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - num_samples, infer_width, indices_data, label_data, correct_data, - accuracy_data, total_data); - } -}; - -} // namespace operators -} // namespace paddle - -// FIXME(typhoonzero): types of T is for inference data. -// label data is always int64 -REGISTER_OP_CUDA_KERNEL( - accuracy, paddle::operators::AccuracyOpCUDAKernel, - paddle::operators::AccuracyOpCUDAKernel, - paddle::operators::AccuracyOpCUDAKernel); diff --git a/paddle/fluid/operators/metrics/accuracy_op.h b/paddle/fluid/operators/metrics/accuracy_op.h deleted file mode 100644 index 94e5bf8257e..00000000000 --- a/paddle/fluid/operators/metrics/accuracy_op.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class AccuracyKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* inference = ctx.Input("Out"); - auto* indices = ctx.Input("Indices"); - auto* label = ctx.Input("Label"); - auto* accuracy = ctx.Output("Accuracy"); - auto* correct = ctx.Output("Correct"); - auto* total = ctx.Output("Total"); - - int* correct_data = correct->mutable_data(ctx.GetPlace()); - int* total_data = total->mutable_data(ctx.GetPlace()); - float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); - - const int64_t* indices_data = indices->data(); - const int64_t* label_data = label->data(); - - size_t num_samples = inference->dims()[0]; - size_t class_dim = inference->dims()[1]; - *accuracy_data = 0.0f; - - if (num_samples == 0) { - return; - } - - int num_correct = 0; - // assume inference is already the topk of the output - for (size_t i = 0; i < num_samples; ++i) { - PADDLE_ENFORCE_GE( - label_data[i], 0, - platform::errors::InvalidArgument( - "label of AccuracyOp must >= 0, But received label[%d] is %d", i, - label_data[i])); - for (size_t j = 0; j < class_dim; ++j) { - if (indices_data[i * class_dim + j] == label_data[i]) { - ++num_correct; - break; - } - } - } - - *correct_data = num_correct; - *total_data = num_samples; - *accuracy_data = - static_cast(num_correct) / static_cast(num_samples); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc index 2598d3b0277..1ce02ff4525 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc @@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { diff --git a/paddle/fluid/operators/metrics/accuracy_op_npu.cc b/paddle/fluid/operators/metrics/accuracy_op_npu.cc index e83278f88b8..9f2ca4165f3 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_npu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_npu.cc @@ -13,7 +13,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc index de71312d78d..3cc1be4de8a 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc @@ -14,12 +14,14 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" namespace paddle { namespace operators { +using Tensor = paddle::framework::Tensor; template class AccuracyXPUKernel : public framework::OpKernel { public: diff --git a/paddle/phi/kernels/accuracy_kernel.h b/paddle/phi/kernels/accuracy_kernel.h new file mode 100644 index 00000000000..8f2dbb96f86 --- /dev/null +++ b/paddle/phi/kernels/accuracy_kernel.h @@ -0,0 +1,30 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void AccuracyRawKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& indices, + const DenseTensor& label, + DenseTensor* accuracy, + DenseTensor* correct, + DenseTensor* total); +} // namespace phi diff --git a/paddle/phi/kernels/cpu/accuracy_kernel.cc b/paddle/phi/kernels/cpu/accuracy_kernel.cc new file mode 100644 index 00000000000..c57ec69b73a --- /dev/null +++ b/paddle/phi/kernels/cpu/accuracy_kernel.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/accuracy_kernel.h" + +#include +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +template +void AccuracyRawKernel(const Context& dev_ctx, + const DenseTensor& inference, + const DenseTensor& indices, + const DenseTensor& label, + DenseTensor* accuracy, + DenseTensor* correct, + DenseTensor* total) { + int* correct_data = dev_ctx.template Alloc(correct); + int* total_data = dev_ctx.template Alloc(total); + float* accuracy_data = dev_ctx.template Alloc(accuracy); + + const int64_t* indices_data = indices.data(); + const int64_t* label_data = label.data(); + + size_t num_samples = inference.dims()[0]; + size_t class_dim = inference.dims()[1]; + *accuracy_data = 0.0f; + + if (num_samples == 0) { + return; + } + + int num_correct = 0; + // assume inference is already the topk of the output + for (size_t i = 0; i < num_samples; ++i) { + PADDLE_ENFORCE_GE( + label_data[i], + 0, + phi::errors::InvalidArgument( + "label of AccuracyOp must >= 0, But received label[%d] is %d", + i, + label_data[i])); + for (size_t j = 0; j < class_dim; ++j) { + if (indices_data[i * class_dim + j] == label_data[i]) { + ++num_correct; + break; + } + } + } + + *correct_data = num_correct; + *total_data = num_samples; + *accuracy_data = + static_cast(num_correct) / static_cast(num_samples); +} +} // namespace phi + +// TODO(add supported dtype.) +PD_REGISTER_KERNEL( + accuracy, CPU, ALL_LAYOUT, phi::AccuracyRawKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/accuracy_kernel.cu b/paddle/phi/kernels/gpu/accuracy_kernel.cu new file mode 100644 index 00000000000..f08fb74e54d --- /dev/null +++ b/paddle/phi/kernels/gpu/accuracy_kernel.cu @@ -0,0 +1,117 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/accuracy_kernel.h" + +#include +#include +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void AccuracyCudaKernel(const int N, + const int D, + const int64_t* Xdata, + const int64_t* labeldata, + int* correct_data, + float* accuracy, + int* total_data) { + int count = 0; + __shared__ int total[BlockSize]; + + // support only 1 block + for (int i = threadIdx.x; i < (N); i += BlockSize) { + for (int j = 0; j < D; ++j) { + if (Xdata[i * D + j] == labeldata[i]) { + ++count; + break; + } + } + } + total[threadIdx.x] = count; + __syncthreads(); + +// reduce the count with init value 0, and output accuracy. 
+#ifdef PADDLE_WITH_CUDA + int result = thrust::reduce(thrust::device, total, total + BlockSize, 0); +#else + // HIP thrust::reduce not support __device__ + for (int s = BlockSize / 2; s > 0; s >>= 1) { + if (threadIdx.x < s) { + total[threadIdx.x] += total[threadIdx.x + s]; + } + __syncthreads(); + } + int result = total[0]; +#endif + if (threadIdx.x == 0) { + *correct_data = result; + *accuracy = static_cast(result) / static_cast(N); + *total_data = N; + } +} + +template +void AccuracyRawKernel(const Context& dev_ctx, + const DenseTensor& inference, + const DenseTensor& indices, + const DenseTensor& label, + DenseTensor* accuracy, + DenseTensor* correct, + DenseTensor* total) { + // FIXME(typhoonzero): only support indices currently + // if add support for output values, how to detect the data type? + const int64_t* indices_data = indices.data(); + const int64_t* label_data = label.data(); + + int* correct_data = dev_ctx.template Alloc(correct); + int* total_data = dev_ctx.template Alloc(total); + float* accuracy_data = dev_ctx.template Alloc(accuracy); + + int num_samples = static_cast(inference.dims()[0]); + size_t infer_width = inference.dims()[1]; + auto stream = dev_ctx.stream(); + phi::backends::gpu::GpuMemsetAsync(accuracy_data, 0, sizeof(float), stream); + + if (num_samples == 0) { + return; + } + + AccuracyCudaKernel< + PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + num_samples, + infer_width, + indices_data, + label_data, + correct_data, + accuracy_data, + total_data); +} +} // namespace phi + +// FIXME(typhoonzero): types of T is for inference data. +// label data is always int64 +PD_REGISTER_KERNEL(accuracy, + GPU, + ALL_LAYOUT, + phi::AccuracyRawKernel, + phi::dtype::float16, + float, + double) {} -- GitLab From a279a4f8576667bf86258f1e9e59b6a05b6ce00e Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Tue, 8 Mar 2022 13:57:42 +0800 Subject: [PATCH 106/261] [IPU] update ipu unittests p2 (#40069) * update ipu UTs part2 * clean git * rename ut * rename ut 1 * sync api changes * update uts for new api * update uts for new api * fix re-define --- .../tests/unittests/ipu/test_ipu_pipeline.py | 71 ------ .../tests/unittests/ipu/test_ipu_place.py | 51 ----- .../tests/unittests/ipu/test_ipu_shard.py | 70 ------ .../unittests/ipu/test_ipu_shard_api_ipu.py | 112 ++++++++++ .../tests/unittests/ipu/test_ipu_strategy.py | 56 ----- .../unittests/ipu/test_ipu_strategy_ipu.py | 72 ++++++ .../unittests/ipu/test_layernorm_op_ipu.py | 134 +++++++---- .../unittests/ipu/test_log_softmax_op_ipu.py | 87 ++++---- .../unittests/ipu/test_logical_not_op_ipu.py | 97 ++++++++ .../unittests/ipu/test_lookuptable_op_ipu.py | 102 ++++----- .../ipu/test_lookuptable_v2_op_ipu.py | 141 ++++++++++++ ...lr_sheduelr.py => test_lr_sheduler_ipu.py} | 6 +- .../tests/unittests/ipu/test_matmul_op_ipu.py | 208 ++++++++++-------- 13 files changed, 723 insertions(+), 484 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/ipu/test_ipu_pipeline.py delete mode 100644 python/paddle/fluid/tests/unittests/ipu/test_ipu_place.py delete mode 100644 python/paddle/fluid/tests/unittests/ipu/test_ipu_shard.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py delete mode 100644 python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py create mode 100644 
python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py rename python/paddle/fluid/tests/unittests/ipu/{test_lr_sheduelr.py => test_lr_sheduler_ipu.py} (95%) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_pipeline.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_pipeline.py deleted file mode 100644 index beab68553d7..00000000000 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_pipeline.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import numpy as np -import unittest -import sys -import paddle -import paddle.fluid as fluid - -paddle.enable_static() - - -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") -class TestIpuShard(unittest.TestCase): - def _test(self): - # build graph - a = paddle.static.data(name='data', shape=[None, 1], dtype='int32') - b = a + 2 # scale : scale * x + bias, ipu_stage : no - - with paddle.fluid.ipu_shard(ipu_stage=1): - c = b + 1 # scale, ipu_stage : 1 - with paddle.fluid.ipu_shard(ipu_stage=2): - d = c * 2 # scale, ipu_stage : 2 - with paddle.fluid.ipu_shard(ipu_stage=3): - e = d + 3 # scale, ipu_stage : 3 - with paddle.fluid.ipu_shard(ipu_stage=1): - e = e + 3 # scale, ipu_stage : 1 - with paddle.fluid.ipu_shard(ipu_stage=2): - e = e + 3 # scale, ipu_stage : 2 - - with paddle.fluid.ipu_shard(ipu_stage=1): - f = paddle.tensor.pow(e, 2.0) # pow, ipu_stage : 1 - - with paddle.fluid.ipu_shard(ipu_stage=2): - g = f - 1 # scale, ipu_stage : 2 - - h = g + 1 # scale, ipu_stage : no - - ipu_index_list = [] - main_prog = paddle.static.default_main_program() - for op in main_prog.global_block().ops: - if op.desc.has_attr("ipu_stage"): - ipu_index_list.append(op.desc.attr("ipu_stage")) - - return ipu_index_list - - def test_ipu_shard(self): - ipu_index_list = self._test() - expected_ipu_index_list = [1, 2, 3, 1, 2, 1, 2] - - self.assertTrue( - np.allclose( - ipu_index_list, expected_ipu_index_list, atol=0)) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_place.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_place.py deleted file mode 100644 index 48ab046deb3..00000000000 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_place.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import numpy as np -import unittest -import sys -sys.path.append("..") -import paddle -import paddle.fluid as fluid - -paddle.enable_static() - - -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") -class TestIpuPlace(unittest.TestCase): - def test_ipu_place(self): - num_devices = fluid.core.get_ipu_device_count() - self.assertGreater(num_devices, 0) - - for i in range(num_devices): - place = paddle.IPUPlace() - p = fluid.core.Place() - p.set_place(place) - self.assertTrue(p.is_ipu_place()) - - def test_ipu_set_device(self): - num_devices = fluid.core.get_ipu_device_count() - self.assertGreater(num_devices, 0) - - for i in range(num_devices): - paddle.set_device('ipu') - device = paddle.get_device() - self.assertTrue(device == "ipus:{{0-{}}}".format(num_devices - 1)) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard.py deleted file mode 100644 index 368556d8b2f..00000000000 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import numpy as np -import unittest -import sys -import paddle -import paddle.fluid as fluid - -paddle.enable_static() - - -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") -class TestIpuShard(unittest.TestCase): - def _test(self): - # build graph - a = paddle.static.data(name='data', shape=[None, 1], dtype='int32') - b = a + 2 # scale : scale * x + bias, ipu_index : no - - with paddle.fluid.ipu_shard(ipu_index=1): - c = b + 1 # scale, ipu_index : 1 - with paddle.fluid.ipu_shard(ipu_index=2): - d = c * 2 # scale, ipu_index : 2 - with paddle.fluid.ipu_shard(ipu_index=3): - e = d + 3 # scale, ipu_index : 3 - with paddle.fluid.ipu_shard(ipu_index=1): - e = e + 3 # scale, ipu_index : 1 - with paddle.fluid.ipu_shard(ipu_index=2): - e = e + 3 # scale, ipu_index : 2 - - with paddle.fluid.ipu_shard(ipu_index=1): - f = paddle.tensor.pow(e, 2.0) # pow, ipu_index : 1 - - with paddle.fluid.ipu_shard(ipu_index=2): - g = f - 1 # scale, ipu_index : 2 - - h = g + 1 # scale, ipu_index : no - - ipu_index_list = [] - main_prog = paddle.static.default_main_program() - for op in main_prog.global_block().ops: - if op.desc.has_attr("ipu_index"): - ipu_index_list.append(op.desc.attr("ipu_index")) - - return ipu_index_list - - def test_ipu_shard(self): - ipu_index_list = self._test() - expected_ipu_index_list = [1, 2, 3, 1, 2, 1, 2] - self.assertTrue( - np.allclose( - ipu_index_list, expected_ipu_index_list, atol=0)) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py new file mode 100644 index 00000000000..026b19eccf1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py @@ -0,0 +1,112 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import paddle + +paddle.enable_static() + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestIpuShard(unittest.TestCase): + def _test(self): + # build graph + a = paddle.static.data(name='data', shape=[None, 1], dtype='int32') + b = a + 2 # scale : scale * x + bias, ipu_index : no + + with paddle.static.ipu_shard_guard(index=1): + c = b + 1 # scale, ipu_index : 1 + with paddle.static.ipu_shard_guard(index=2): + d = c * 2 # scale, ipu_index : 2 + with paddle.static.ipu_shard_guard(index=3): + e = d + 3 # scale, ipu_index : 3 + with paddle.static.ipu_shard_guard(index=1): + e = e + 3 # scale, ipu_index : 1 + with paddle.static.ipu_shard_guard(index=2): + e = e + 3 # scale, ipu_index : 2 + + with paddle.static.ipu_shard_guard(index=1): + f = paddle.tensor.pow(e, 2.0) # pow, ipu_index : 1 + + with paddle.static.ipu_shard_guard(index=2): + g = f - 1 # scale, ipu_index : 2 + + h = g + 1 # scale, ipu_index : no + + ipu_index_list = [] + main_prog = paddle.static.default_main_program() + for op in main_prog.global_block().ops: + if op.desc.has_attr("ipu_index"): + ipu_index_list.append(op.desc.attr("ipu_index")) + + return ipu_index_list + + def test_ipu_shard(self): + ipu_index_list = self._test() + expected_ipu_index_list = [1, 2, 3, 1, 2, 1, 2] + self.assertTrue( + np.allclose( + ipu_index_list, expected_ipu_index_list, atol=0)) + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestIpuPipeline(unittest.TestCase): + def _test(self): + # build graph + a = paddle.static.data(name='data', shape=[None, 1], dtype='int32') + b = a + 2 # scale : scale * x + bias, ipu_stage : no + + with paddle.static.ipu_shard_guard(stage=1): + c = b + 1 # scale, ipu_stage : 1 + with paddle.static.ipu_shard_guard(stage=2): + d = c * 2 # scale, ipu_stage : 2 + with paddle.static.ipu_shard_guard(stage=3): + e = d + 3 # scale, ipu_stage : 3 + with paddle.static.ipu_shard_guard(stage=1): + e = e + 3 # scale, ipu_stage : 1 + with paddle.static.ipu_shard_guard(stage=2): + e = e + 3 # scale, ipu_stage : 2 + + with paddle.static.ipu_shard_guard(stage=1): + f = paddle.tensor.pow(e, 2.0) # pow, ipu_stage : 1 + + with paddle.static.ipu_shard_guard(stage=2): + g = f - 1 # scale, ipu_stage : 2 + + h = g + 1 # scale, ipu_stage : no + + ipu_index_list = [] + main_prog = paddle.static.default_main_program() + for op in main_prog.global_block().ops: + if op.desc.has_attr("ipu_stage"): + ipu_index_list.append(op.desc.attr("ipu_stage")) + + return ipu_index_list + + def test_ipu_shard(self): + ipu_index_list = self._test() + expected_ipu_index_list = [1, 2, 3, 1, 2, 1, 2] + + self.assertTrue( + np.allclose( + ipu_index_list, expected_ipu_index_list, atol=0)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy.py deleted file mode 100644 index afeec9ee1b6..00000000000 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import numpy as np -import unittest -import sys -import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler - -paddle.enable_static() -SEED = 2021 - - -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") -class TestConvNet(unittest.TestCase): - def test_training(self): - ipu_strategy = paddle.static.IpuStrategy() - - assert ipu_strategy.num_ipus == 1, "Default num_ipus must be 1" - assert ipu_strategy.is_training == True, "Default is_training is True" - assert ipu_strategy.enable_pipelining == False, \ - "Default enable_pipelining is False" - assert ipu_strategy.enable_manual_shard == False, \ - "Default enable_manual_shard is False" - - ipu_strategy.SetGraphConfig( - num_ipus=2, is_training=False, enable_manual_shard=True) - ipu_strategy.SetPipeliningConfig(enable_pipelining=True) - assert ipu_strategy.num_ipus == 2, "Set num_ipus Failed" - - assert ipu_strategy.is_training == False, "Set is_training Failed" - - assert ipu_strategy.enable_pipelining == True, \ - "Set enable_pipelining Failed" - - assert ipu_strategy.enable_manual_shard == True, \ - "Set enable_manual_shard Failed" - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py new file mode 100644 index 00000000000..f120f559491 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py @@ -0,0 +1,72 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import paddle +import paddle.static + +paddle.enable_static() + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestIpuStrategy(unittest.TestCase): + def test_set_options(self): + ipu_strategy = paddle.static.IpuStrategy() + all_option_names = ipu_strategy._ipu_strategy.get_all_option_names() + for option_name in all_option_names: + option = ipu_strategy._ipu_strategy.get_option(option_name) + option_type = option['type'] + option_value = option['value'] + if option_type in ['double']: + set_value = option_value + 0.5 + elif option_type == 'uint64': + set_value = option_value + 1 + elif option_type == 'bool': + set_value = not option_value + else: + continue + ipu_strategy.set_options({option_name: set_value}) + new_value = ipu_strategy.get_option(option_name) + assert new_value == set_value, f"set {option_name} to {set_value} failed" + + def test_set_string_options(self): + ipu_strategy = paddle.static.IpuStrategy() + options = { + 'cache_path': 'paddle_cache', + 'log_dir': 'paddle_log', + 'partials_type_matmuls': 'half', + 'partials_type_matmuls': 'float', + } + ipu_strategy.set_options(options) + for k, v in options.items(): + assert v == ipu_strategy.get_option(k), f"set {k} to {v} failed " + + def test_set_other_options(self): + ipu_strategy = paddle.static.IpuStrategy() + options = {} + options['dot_checks'] = ['0', '1', '2', '3'] + options['engine_options'] = { + 'debug.allowOutOfMemory': 'true', + 'autoReport.directory': 'path', + 'autoReport.all': 'true' + } + for k, v in options.items(): + ipu_strategy.set_options({k: v}) + assert v == ipu_strategy.get_option(k), f"set {k} to {v} failed " + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py index 196f94b68f9..a52946bba15 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py @@ -16,14 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,44 +26,52 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), - } + @property + def fp16_enabled(self): + return True + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-5 + self.atol_fp16 = 1e-2 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + x = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {"x": x.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def 
set_attrs(self): + def set_op_attrs(self): self.attrs = { "scale": True, "shift": True, "begin_norm_axis": 1, "epsilon": 1e-05, } + self.optimizer = None - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') if self.is_training: ch = self.feed_shape[0][1] @@ -80,33 +82,38 @@ class TestBase(IPUOpTest): out = paddle.fluid.layers.nn.layer_norm( conv1, param_attr=scale, bias_attr=bias, **self.attrs) else: - # scale = True - # bias = True scale = self.attrs['scale'] bias = self.attrs['shift'] out = paddle.fluid.layers.nn.layer_norm( x, param_attr=scale, bias_attr=bias, **self.attrs) + loss = paddle.mean(out) - if self.is_training: - loss = paddle.mean(out) - adam = paddle.optimizer.Adam(learning_rate=1e-2) - adam.minimize(loss) - fetch_list = [loss.name] - else: - fetch_list = [out.name] + fetch_list = [loss.name] - if run_ipu: + if self.is_training: + optimizer = None + if self.optimizer == 'sgd': + optimizer = paddle.optimizer.SGD(learning_rate=1e-2) + elif self.optimizer == 'adam': + optimizer = paddle.optimizer.Adam(learning_rate=1e-2) + elif self.optimizer == 'lamb': + optimizer = paddle.optimizer.Lamb( + learning_rate=1e-2, lamb_weight_decay=0.0) + if optimizer is not None: + optimizer.minimize(loss) + + if exec_mode: place = paddle.IPUPlace() else: place = paddle.CPUPlace() exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: @@ -116,12 +123,14 @@ class TestBase(IPUOpTest): result = [] for _ in range(self.epoch): loss_res = exe.run(program, - feed=self.feed, + feed=self.feed_fp32, fetch_list=fetch_list) result.append(loss_res[0]) return np.array(result) else: - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + result = exe.run(program, + feed=self.feed_fp32, + fetch_list=fetch_list) return result[0] def test_base(self): @@ -137,7 +146,7 @@ class TestBase(IPUOpTest): @unittest.skip('raise error') class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "scale": False, "shift": True, @@ -148,7 +157,7 @@ class TestCase1(TestBase): @unittest.skip('raise error') class TestCase2(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "scale": True, "shift": False, @@ -158,18 +167,28 @@ class TestCase2(TestBase): class TestCase3(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "scale": True, "shift": True, "begin_norm_axis": 2, "epsilon": 1e-05, } + self.optimizer = None class TestTrainCase1(TestBase): + def set_op_attrs(self): + self.attrs = { + "scale": True, + "shift": True, + "begin_norm_axis": 1, + "epsilon": 1e-05 + } + self.optimizer = 'sgd' + def 
set_atol(self): - self.atol = 1e-3 + self.atol = 1e-6 def set_training(self): self.is_training = True @@ -178,15 +197,34 @@ class TestTrainCase1(TestBase): class TestTrainCase2(TestBase): def set_atol(self): - self.atol = 1e-3 + self.atol = 5e-4 - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "scale": True, "shift": True, "begin_norm_axis": 2, - "epsilon": 1e-05, + "epsilon": 1e-05 + } + self.optimizer = 'adam' + + def set_training(self): + self.is_training = True + self.epoch = 10 + + +class TestTrainCase3(TestBase): + def set_atol(self): + self.atol = 5e-3 + + def set_op_attrs(self): + self.attrs = { + "scale": True, + "shift": True, + "begin_norm_axis": 2, + "epsilon": 1e-05 } + self.optimizer = 'lamb' def set_training(self): self.is_training = True diff --git a/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py index dc3cab6ac5e..fad7516e442 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py @@ -16,15 +16,9 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer -import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) import paddle.nn.functional as F - -paddle.enable_static() +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -33,72 +27,81 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32') - } + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + self.feed_list = list(self.feed_fp32.keys()) def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axis": -1} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') + out = F.log_softmax(x, **self.attrs) fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) 
exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 - def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py new file mode 100644 index 00000000000..3f8472890d0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py @@ -0,0 +1,97 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data = np.random.uniform(size=[2, 20, 30528]) + self.feed = {"in_0": data.astype('bool')} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed.values()] + self.feed_list = list(self.feed.keys()) + self.feed_dtype = [x.dtype for x in self.feed.values()] + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype="bool") + + out = paddle.fluid.layers.logical_not(x) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + return result[0] + + def test_base(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).astype(np.int32) + + self.check(output_dict, check_shape=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py index 31b0c99603c..4a877ddce4e 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py @@ -16,14 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,16 +26,25 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_attrs() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data = np.array([[[1], [3]], [[2], [4]], [[4], [127]]]) + self.feed_cpu = {"x": data.astype(np.int64)} + self.feed_ipu = {"x": data.astype(np.int32)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = 
list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_cpu.values()] + self.feed_list = list(self.feed_cpu.keys()) + self.feed_dtype = [x.dtype for x in self.feed_cpu.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "size": [128, 16], "is_sparse": False, @@ -50,33 +53,20 @@ class TestBase(IPUOpTest): "dtype": 'float32' } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED - - if run_ipu: - self.feed = { - "x": np.array( - [[[1], [3]], [[2], [4]], [[4], [127]]]).astype(np.int32) - } - else: - self.feed = { - "x": np.array( - [[[1], [3]], [[2], [4]], [[4], [127]]]).astype(np.int64) - } + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - self.set_feed_attr() - - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='int64') + out = paddle.fluid.layers.embedding(x, **self.attrs) if self.is_training: @@ -87,47 +77,61 @@ class TestBase(IPUOpTest): else: fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog + feed = self.feed_cpu + if exec_mode > ExecutionMode.CPU_FP32: + feed = self.feed_ipu + if self.is_training: result = [] for _ in range(self.epoch): loss_res = exe.run(program, - feed=self.feed, + feed=feed, fetch_list=fetch_list) result.append(loss_res[0]) return np.array(result) else: - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] - def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and (not self.fp16_enabled or + self.is_training): + break - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestTrainCase1(TestBase): + def set_atol(self): + self.atol = 1e-7 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + def set_training(self): self.is_training = True self.epoch = 10 diff --git a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py new file mode 100644 index 00000000000..da8048fb320 --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py @@ -0,0 +1,141 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + x = np.array([[[1], [3]], [[2], [4]], [[4], [127]]]) + self.feed_cpu = {"x": x.astype(np.int64)} + self.feed_ipu = {"x": x.astype(np.int32)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_cpu.values()] + self.feed_list = list(self.feed_cpu.keys()) + self.feed_dtype = [x.dtype for x in self.feed_cpu.values()] + + def set_op_attrs(self): + self.attrs = { + "num_embeddings": 128, + "embedding_dim": 16, + "sparse": False, + "padding_idx": -1, + "weight_attr": None + } + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='int64') + + embedding = paddle.nn.Embedding(**self.attrs) + out = embedding(x) + + if self.is_training: + loss = paddle.mean(out) + adam = paddle.optimizer.Adam(learning_rate=1e-2) + adam.minimize(loss) + fetch_list = [loss.name] + else: + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_cpu + if exec_mode > ExecutionMode.CPU_FP32: + feed = self.feed_ipu + + if self.is_training: + result = [] + for _ in range(self.epoch): + loss_res = exe.run(program, + feed=feed, + fetch_list=fetch_list) + result.append(loss_res[0]) + return np.array(result) + else: + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] + + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and (not self.fp16_enabled or + self.is_training): + break + output_dict[mode] = 
self._test_base(mode).flatten() + + self.check(output_dict) + + +class TestTrainCase1(TestBase): + def set_atol(self): + self.atol = 1e-7 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_training(self): + self.is_training = True + self.epoch = 10 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduelr.py b/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py similarity index 95% rename from python/paddle/fluid/tests/unittests/ipu/test_lr_sheduelr.py rename to python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py index 38b91785aee..58f018e2ae6 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduelr.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py @@ -19,7 +19,7 @@ import unittest import sys import paddle import paddle.fluid as fluid -import paddle.fluid.compiler as compiler +import paddle.static from paddle.optimizer.lr import LRScheduler paddle.enable_static() @@ -71,8 +71,8 @@ class TestConvNet(unittest.TestCase): feed_list = [image.name] fetch_list = [loss.name] ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=True) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: diff --git a/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py index c6702b92ab9..6929ded6ebf 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py @@ -16,14 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,85 +26,93 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[2, 3]).astype('float32'), - "y": np.random.uniform(size=[3, 2]).astype('float32'), - } + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + x = np.random.uniform(size=[20, 30]) + y = np.random.uniform(size=[30, 20]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "transpose_x": False, "transpose_y": False, "alpha": 1.0, } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = 
paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') y = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) + dtype='float32') + out = paddle.fluid.layers.matmul(x, y, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "transpose_x": True, "transpose_y": True, @@ -119,55 +121,64 @@ class TestCase1(TestBase): class TestCase2(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "transpose_x": True, "transpose_y": True, "alpha": 3.14, } + def set_atol(self): + self.atol = 1e-10 + self.rtol = 1e-6 + self.atol_fp16 = 1e-2 + self.rtol_fp16 = 1e-3 + class TestCase3(TestBase): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[5, 4, 2, 3]).astype('float32'), - "y": np.random.uniform(size=[5, 4, 3, 2]).astype('float32'), - } + def set_data_feed(self): + x = np.random.uniform(size=[5, 4, 3, 2]) + y = np.random.uniform(size=[5, 4, 2, 3]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} class TestCase4(TestBase): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[4, 2, 3]).astype('float32'), - "y": np.random.uniform(size=[4, 3, 2]).astype('float32'), - } + def set_data_feed(self): + x = np.random.uniform(size=[4, 3, 2]) + y = np.random.uniform(size=[4, 2, 3]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} class TestCase5(TestBase): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[4, 2, 3]).astype('float32'), - "y": 
np.random.uniform(size=[3, 2]).astype('float32'), - } + def set_data_feed(self): + x = np.random.uniform(size=[4, 2, 3]) + y = np.random.uniform(size=[3, 2]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} class TestCase6(TestBase): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[3]).astype('float32'), - "y": np.random.uniform(size=[3]).astype('float32'), - } + def set_data_feed(self): + x = np.random.uniform(size=[3]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": x.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": x.astype(np.float16)} @unittest.skip("not supported") class TestCase6_2(TestCase6): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[3]).astype('float32'), - "y": np.random.uniform(size=[3]).astype('float32'), - } + def set_data_feed(self): + x = np.random.uniform(size=[3]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": x.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": x.astype(np.float16)} - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "transpose_x": True, "transpose_y": True, @@ -176,27 +187,36 @@ class TestCase6_2(TestCase6): class TestCase7(TestBase): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[3, 1]).astype('float32'), - "y": np.random.uniform(size=[1, 2]).astype('float32'), - } + def set_data_feed(self): + x = np.random.uniform(size=[1, 12, 128, 64]) + y = np.random.uniform(size=[1, 12, 128, 64]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + def set_op_attrs(self): + self.attrs = {"transpose_x": False, "transpose_y": True, "alpha": 0.125} + + +class TestCase8(TestBase): + def set_data_feed(self): + x = np.random.uniform(size=[3, 1]) + y = np.random.uniform(size=[1, 2]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} @unittest.skip("not supported") -class TestCase7_2(TestBase): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[3]).astype('float32'), - "y": np.random.uniform(size=[2]).astype('float32'), - } - # equal to - # self.feed = { - # "x": np.random.uniform(size=[3, 1]).astype('float32'), - # "y": np.random.uniform(size=[1, 2]).astype('float32'), - # } +class TestCase8_2(TestBase): + def set_data_feed(self): + x = np.random.uniform(size=[3]) + y = np.random.uniform(size=[2]) - def set_attrs(self): + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + def set_op_attrs(self): self.attrs = { "transpose_x": True, "transpose_y": True, @@ -205,12 +225,12 @@ class TestCase7_2(TestBase): @unittest.skip("dim > 4 is not supported") -class TestCase8(TestBase): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[6, 5, 4, 2, 3]).astype('float32'), - "y": np.random.uniform(size=[6, 5, 4, 3, 2]).astype('float32'), - } +class TestCase9(TestBase): + def set_data_feed(self): + x = np.random.uniform(size=[6, 5, 4, 2, 3]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": x.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": x.astype(np.float16)} if __name__ == "__main__": -- GitLab From 061044a0cc199f03645a9cbc836f46da3930329d Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Tue, 8 Mar 
2022 13:59:03 +0800 Subject: [PATCH 107/261] [IPU] update ipu unittests p4 (#40073) * update ipu UTs part4 * rename uts * sync api changes * update uts for new api --- .../unittests/ipu/test_set_batch_size_ipu.py | 96 +++++----- .../tests/unittests/ipu/test_sgd_optimizer.py | 88 --------- .../tests/unittests/ipu/test_slice_op_ipu.py | 122 +++++++------ .../unittests/ipu/test_softmax_op_ipu.py | 87 ++++----- .../tests/unittests/ipu/test_split_op_ipu.py | 113 ++++++++++++ .../unittests/ipu/test_squeeze_op_ipu.py | 91 +++++----- .../tests/unittests/ipu/test_stack_op_ipu.py | 102 ++++++----- .../tests/unittests/ipu/test_sum_op_ipu.py | 143 ++++++++------- .../tests/unittests/ipu/test_topk_op_ipu.py | 171 +++++++++--------- .../unittests/ipu/test_transpose_op_ipu.py | 98 +++++----- .../unittests/ipu/test_unsqueeze_op_ipu.py | 86 ++++----- ...inplace.py => test_varname_inplace_ipu.py} | 37 ++-- .../unittests/ipu/test_weight_sharing_ipu.py | 126 +++++++++++++ 13 files changed, 782 insertions(+), 578 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/ipu/test_sgd_optimizer.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py rename python/paddle/fluid/tests/unittests/ipu/{test_varname_inplace.py => test_varname_inplace_ipu.py} (79%) create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py diff --git a/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py index 93945b98ef0..9a18922f353 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py @@ -16,13 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,36 +26,46 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() - self.set_attrs() - - def set_feed(self): - self.feed_shape = [] - self.feed_shape.append([-1, 3, 128, 128]) - - self.feed = {} - self.feed["in_0"] = np.random.uniform( - size=[2, 3, 128, 128]).astype(np.float32) - - self.feed_list = list(self.feed.keys()) - - def set_attrs(self): + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_atol(self): + self.atol = 3e-6 + self.rtol = 1e-5 + self.atol_fp16 = 1e-2 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + data = np.random.uniform(size=[2, 3, 128, 128]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): self.attrs = {} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with 
paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + conv1 = paddle.static.nn.conv2d( x, num_filters=3, filter_size=3, bias_attr=False) conv2 = paddle.static.nn.conv2d( @@ -70,36 +75,45 @@ class TestBase(IPUOpTest): conv4 = paddle.static.nn.conv2d( conv3, num_filters=3, filter_size=3, bias_attr=False) - fetch_list = [conv4.name] + fetch_list = [conv4.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig( - batch_size=2, is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + # set batch size + ipu_strategy.micro_batch_size = 2 + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] - def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + self.check(output_dict) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_sgd_optimizer.py b/python/paddle/fluid/tests/unittests/ipu/test_sgd_optimizer.py deleted file mode 100644 index df0e2a040bd..00000000000 --- a/python/paddle/fluid/tests/unittests/ipu/test_sgd_optimizer.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import numpy as np -import unittest -import sys -import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler - -paddle.enable_static() -SEED = 2021 - - -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") -class TestSGD(unittest.TestCase): - def _test_sgd(self, run_ipu=True): - scope = fluid.core.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = SEED - startup_prog.random_seed = SEED - np.random.seed(SEED) - - np_image = np.random.rand(1, 3, 10, 10).astype(np.float32) - - with fluid.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - image = paddle.static.data( - name='image', shape=[1, 3, 10, 10], dtype='float32') - conv1 = paddle.static.nn.conv2d( - image, num_filters=3, filter_size=3, bias_attr=False) - loss = paddle.mean(conv1) - - sgd = paddle.optimizer.SGD(learning_rate=1e-1) - sgd.minimize(loss) - - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if run_ipu: - feed_list = [image.name] - fetch_list = [loss.name] - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=True) - program = compiler.IPUCompiledProgram( - main_prog, ipu_strategy=ipu_strategy).compile(feed_list, - fetch_list) - else: - program = main_prog - - result = [] - for epoch in range(100): - loss_res = exe.run(program, - feed={"image": np_image}, - fetch_list=[loss]) - result.append(loss_res) - - return np.array(result) - - def test_sgd(self): - # cpu and ipu dimenstion mismatch, cpu:(100, 1, 1), ipu:(100, 1) - ipu_loss = self._test_sgd(True).flatten() - cpu_loss = self._test_sgd(False).flatten() - - self.assertTrue(np.allclose(ipu_loss, cpu_loss, atol=1e-4)) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py index 3bdfeabce65..8881f018de3 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py @@ -16,14 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,78 +26,88 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True - def set_feed(self): - self.feed = {"x": np.random.uniform(size=[4, 5, 6]).astype('float32'), } + def set_data_feed(self): + data = np.random.uniform(size=[4, 5, 6]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = 
[x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "axes": [0, 1, 2], "starts": [-3, 0, 2], "ends": [3, 2, 4], } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') + out = paddle.fluid.layers.slice(x, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode) - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict, check_shape=True) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "axes": [0, 1], "starts": [0, 0], @@ -113,38 +117,45 @@ class TestCase1(TestBase): @unittest.skip('dynamic graph is not support on IPU') class TestCase2(TestBase): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[4, 5, 6]).astype('float32'), - "starts": np.array([0, 0, 2]).astype('int32'), - "ends": np.array([3, 2, 4]).astype('int32'), + def set_data_feed(self): + x = np.random.uniform(size=[4, 5, 6]) + s = np.array([0, 0, 2]) + e = np.array([3, 2, 4]) + self.feed_fp32 = { + "x": x.astype(np.float32), + "starts": s.astype(np.int32), + "ends": e.astype(np.int32) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "starts": s.astype(np.int32), + "ends": e.astype(np.int32) } - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axes": [0, 1, 2]} def _test_base(self, run_ipu=True): scope = fluid.core.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED with fluid.scope_guard(scope): with paddle.static.program_guard(main_prog, 
startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') starts = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) + dtype='int32') ends = paddle.static.data( name=self.feed_list[2], shape=self.feed_shape[2], - dtype=self.feed_dtype[2]) + dtype='int32') out = paddle.fluid.layers.slice( x, starts=starts, ends=ends, **self.attrs) @@ -160,8 +171,8 @@ class TestCase2(TestBase): if run_ipu: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: @@ -170,6 +181,9 @@ class TestCase2(TestBase): result = exe.run(program, feed=self.feed, fetch_list=fetch_list) return result[0] + def test_base(self): + pass + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py index a4a4b83baf3..25201959cec 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py @@ -13,16 +13,11 @@ # limitations under the License. import unittest + import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,76 +26,84 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 2, 2]).astype('float32'), - } + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 2, 20]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axis": -1} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') + out = paddle.fluid.layers.softmax(x, **self.attrs) - 
fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axis": 2} diff --git a/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py new file mode 100644 index 00000000000..59af3a3d6ac --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py @@ -0,0 +1,113 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data1 = np.random.uniform(size=[1, 3, 10, 10]) + + self.feed_fp32 = {'x': data1.astype(np.float32)} + self.feed_fp16 = {'x': data1.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = {"num_or_sections": [1, 1, 1], "axis": 1} + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + out = paddle.split(x, **self.attrs) + + fetch_list = [fetch.name for fetch in out] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + + return result[0] + + def test_base(self): + output_dict = {} + for mode in ExecutionMode: + if (mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled + ) or mode == ExecutionMode.IPU_POPART_FP16: + break + output_dict[mode] = self._test_base(mode).flatten() + + self.check(output_dict) + + +class TestCase1(TestBase): + def set_op_attrs(self): + self.attrs = {"num_or_sections": [2, 8], "axis": 2} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py index ccd27965908..bdc8fb32c84 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py @@ -13,16 +13,11 @@ # limitations under the License. 
import unittest + import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,81 +26,89 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 1, 5]).astype('float32'), - } + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 1, 5]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axes": [0]} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') + out = paddle.fluid.layers.squeeze(x, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, - iipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode) - 
self.assertTrue(res0.shape == res1.shape) + self.check(output_dict, check_shape=True) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axes": []} class TestCase2(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axes": [-2]} diff --git a/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py index 3d5de11b5e2..c807ab9aab6 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py @@ -16,14 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,86 +26,102 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() - - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 2]).astype('float32'), - "y": np.random.uniform(size=[1, 2]).astype('float32'), - "z": np.random.uniform(size=[1, 2]).astype('float32'), + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + x = np.random.uniform(size=[1, 2]) + y = np.random.uniform(size=[1, 2]) + z = np.random.uniform(size=[1, 2]) + self.feed_fp32 = { + "x": x.astype(np.float32), + "y": y.astype(np.float32), + "z": z.astype(np.float32) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "y": y.astype(np.float16), + "z": z.astype(np.float16) } def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axis": 0} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') y = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) + dtype='float32') z = paddle.static.data( name=self.feed_list[2], shape=self.feed_shape[2], - dtype=self.feed_dtype[2]) + dtype='float32') + out = paddle.fluid.layers.stack([x, y, z], **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != 
ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode) - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict, check_shape=True) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axis": -2} diff --git a/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py index 003350cd7a0..12351cb63d6 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py @@ -16,14 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,131 +26,154 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 2, 2]).astype('float32'), - "y": np.random.uniform(size=[1, 3, 2, 2]).astype('float32'), - } + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + x = np.random.uniform(size=[1, 3, 2, 2]) + y = np.random.uniform(size=[1, 3, 2, 2]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with 
paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') y = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) + dtype='float32') + out = paddle.fluid.layers.sum([x, y], **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode) - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict, check_shape=True) @unittest.skip('') class TestCase1(TestBase): def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 2, 2]).astype('float32'), - "y": np.random.uniform(size=[1, 3, 2, 2]).astype('float32'), - "z": np.random.uniform(size=[1, 3, 2, 2]).astype('float32'), + x = np.random.uniform(size=[1, 3, 2, 2]) + y = np.random.uniform(size=[1, 3, 2, 2]) + z = np.random.uniform(size=[1, 3, 2, 2]) + self.feed_fp32 = { + "x": x.astype(np.float32), + "y": y.astype(np.float32), + "z": y.astype(np.float32) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "y": y.astype(np.float16), + "z": y.astype(np.float16) } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') y = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) + dtype='float32') z = paddle.static.data( name=self.feed_list[2], shape=self.feed_shape[2], - dtype=self.feed_dtype[2]) + dtype='float32') + out = paddle.fluid.layers.sum([x, y, z], **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = 
paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, iipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + result = exe.run(program, feed=self.feed, fetch_list=fetch_list) return result[0] diff --git a/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py index 9915a7a1fd8..ef75aee7804 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py @@ -16,130 +16,125 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), "core is not compiled with IPU") class TestTopKOp(IPUOpTest): def setUp(self): - self.set_ops() self.set_atol() self.set_training() - self.k = 3 - self.use_K_as_const_variable = False - - self.set_feed() - self.set_attrs() - - def set_ops(self): - self.ops = [ - paddle.fluid.layers.topk, - paddle.topk # use top_k_v2 implementation - ] - - def set_feed(self): - self.feed_shape = [] - self.feed_shape.append([3, 5]) - - self.feed = {} - self.feed_list = [] - self.feed["in_0"] = np.random.uniform( - size=self.feed_shape[0]).astype(np.float32) - self.feed_list.append("in_0") - if self.use_K_as_const_variable: - # self.feed["in_1"] = np.array([self.k]).astype("int32") - # self.feed_list.append("in_1") - pass - print("[TestTopKop] feed data:\n%s" % self.feed["in_0"]) - - def set_attrs(self): - self.attrs = { - # "axis": -1, - # "sorted": True - } - if not self.use_K_as_const_variable: - self.attrs["k"] = self.k - - def _test_base(self, run_ipu=True, op=None, data_feed=None): - assert (op is not None) - assert (data_feed is not None) - scope = fluid.core.Scope() + self.set_data_feed() + self.set_feed_attr() + self.set_test_op() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_test_op(self): + self.op = paddle.fluid.layers.topk + + def set_data_feed(self): + data = np.random.uniform(size=[3, 5]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.use_k_as_const_variable = False + self.attrs = {} + if not self.use_k_as_const_variable: + self.attrs["k"] = 3 + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + 
startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') - if not self.use_K_as_const_variable: - topk_values, topk_indices = op(x, **self.attrs) + + if not self.use_k_as_const_variable: + topk_values, topk_indices = self.op(x, **self.attrs) else: # !important, popart cannot accept non const tensor - # K_t = paddle.static.data(name="in_1", shape=[1], dtype='int32') - K_t = fluid.layers.fill_constant( + K_t = paddle.fluid.layers.fill_constant( shape=[1], dtype='int32', value=self.k, name="in_2") - topk_values, topk_indices = op(x, K_t, **self.attrs) + topk_values, topk_indices = self.op(x, K_t, **self.attrs) + fetch_list = [topk_values.name, topk_indices.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - print("Running inference ...") - result = exe.run(program, feed=data_feed, fetch_list=fetch_list) - print("Complete running infrence.") + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result def test_base(self): - for op in self.ops: - res0_topk_values, res0_topk_indices = self._test_base( - True, op=op, data_feed=self.feed) - res1_topk_values, res1_topk_indices = self._test_base( - False, op=paddle.fluid.layers.topk, data_feed=self.feed) - - print("[TestTopKop] IPU res0 values:\n%s\n" % res0_topk_values) - print("[TestTopKop] CPU res1 values:\n%s\n" % res1_topk_values) - view_type = np.uint32 - print("[TestTopKop] IPU res0 indices:\n%s\n" % - res0_topk_indices.astype(view_type)) - print("[TestTopKop] CPU res1 indices:\n%s\n" % res1_topk_indices) - - self.assertTrue( - np.allclose( - res0_topk_values.flatten(), - res1_topk_values.flatten(), - atol=self.atol)) - - self.assertTrue( - np.allclose( - res0_topk_indices.astype(view_type).flatten(), - res1_topk_indices.flatten(), - atol=self.atol)) + value_dict = {} + index_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + value, index = self._test_base(mode) + value_dict[mode] = value + index_dict[mode] = index + + self.check(value_dict) + self.check(index_dict) + + +class TestCase2(TestTopKOp): + def set_test_op(self): + self.op = paddle.topk + + +@unittest.skip("Trying to get data as int64 but it is of type int32") +class TestCase3(TestTopKOp): + def set_op_attrs(self): + self.use_k_as_const_variable = True + self.attrs = {} + self.k = 2 + + +@unittest.skip("Trying to get data as int64 but it is of type int32") +class TestCase4(TestCase3): + def set_test_op(self): + self.op = paddle.topk if __name__ == "__main__": diff --git 
a/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py index 77d2f413101..1747bde20b6 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py @@ -16,14 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,86 +26,94 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), - } + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {"x": data.astype(np.float32)} + self.feed_fp16 = {"x": data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"perm": [0, 2, 3, 1]} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') + out = paddle.fluid.layers.transpose(x, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 - def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) + result = exe.run(program, feed=feed, 
fetch_list=fetch_list) + return result[0] - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict, check_shape=True) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"perm": [0, 1, 2, 3]} class TestCase2(TestBase): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 2, 3, 4, 5]).astype('float32'), - } + def set_data_feed(self): + data = np.random.uniform(size=[1, 2, 3, 4, 5]) + self.feed_fp32 = {"x": data.astype(np.float32)} + self.feed_fp16 = {"x": data.astype(np.float16)} - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"perm": [4, 0, 2, 3, 1]} diff --git a/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py index 75ed5a07315..e068c2e3b59 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py @@ -16,14 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,79 +26,89 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True - def set_feed(self): - self.feed = {"x": np.random.uniform(size=[1, 2, 3]).astype('float32')} + def set_data_feed(self): + data = np.random.uniform(size=[1, 2, 3]) + self.feed_fp32 = {"x": data.astype(np.float32)} + self.feed_fp16 = {"x": data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axes": 0} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') + out = paddle.fluid.layers.unsqueeze(x, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = 
paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict, check_shape=True) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axes": -1} class TestCase2(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axes": [1, 2]} diff --git a/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace.py b/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace_ipu.py similarity index 79% rename from python/paddle/fluid/tests/unittests/ipu/test_varname_inplace.py rename to python/paddle/fluid/tests/unittests/ipu/test_varname_inplace_ipu.py index fabad936dec..5cc62432dc6 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace_ipu.py @@ -16,15 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -from paddle.fluid.executor import global_scope -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -33,11 +26,11 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() - def set_feed(self): + def set_data_feed(self): self.feed = { "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), } @@ -45,25 +38,22 @@ class TestBase(IPUOpTest): def set_feed_attr(self): self.feed_shape = [x.shape for x in self.feed.values()] self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_dtype = [x.dtype for x in self.feed.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "shape": [30, 10], "inplace": True, } def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + 
with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], @@ -76,12 +66,13 @@ class TestBase(IPUOpTest): scale2 = paddle.fluid.layers.scale(scale1, scale=1.3, bias=0.5) scale3 = paddle.fluid.layers.scale(scale2, scale=2, bias=0.7) - fetch_list = [scale3.name] + fetch_list = [scale3.name] if run_ipu: place = paddle.IPUPlace() else: place = paddle.CPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) scale1_out = main_prog.global_block().ops[4].output("Out")[0] @@ -92,8 +83,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: diff --git a/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py new file mode 100644 index 00000000000..ecf1c61f52e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py @@ -0,0 +1,126 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestWeightSharing(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-5 + self.atol_fp16 = 1e-2 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + x = np.random.randint(0, 768, size=(128, 1)).astype(np.int32) + self.feed_cpu = {"x": x.astype(np.int64)} + self.feed_ipu = { + "x": np.tile(x.astype(np.int64)[np.newaxis, :], [3, 1, 1]) + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_cpu.values()] + self.feed_list = list(self.feed_cpu.keys()) + + def set_op_attrs(self): + self.attrs = {} + + def _test_base(self, run_ipu=True): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='int64') + + with paddle.static.ipu_shard_guard(index=0, stage=0): + y = paddle.fluid.layers.embedding( + input=x, + size=[768, 768], + dtype='float32', + param_attr=paddle.fluid.ParamAttr( + name='word_embedding'), + is_sparse=False) + + with paddle.static.ipu_shard_guard(index=1, stage=1): + z = paddle.fluid.layers.fc( + input=y, + size=768, + param_attr=paddle.fluid.ParamAttr(name="fc")) + + with paddle.static.ipu_shard_guard(index=0, stage=2): + out = paddle.fluid.layers.matmul( + x=z, + y=main_prog.global_block().var('word_embedding'), + transpose_y=True) + + fetch_list = [out.name] + + if run_ipu: + place = paddle.IPUPlace() + else: + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if run_ipu: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config( + num_ipus=2, + is_training=self.is_training, + enable_manual_shard=True) + ipu_strategy.set_pipelining_config( + enable_pipelining=True, batches_per_step=3) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_ipu if run_ipu else self.feed_cpu + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] + + def test_base(self): + res0 = self._test_base(False) + res1 = self._test_base(True) + + self.assertTrue( + np.allclose( + res0.flatten(), res1[0].flatten(), atol=self.atol)) + + +if __name__ == "__main__": + unittest.main() -- GitLab From 47d1d5af242c49e36520d2fd04abcac2715fe6f4 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Tue, 8 Mar 2022 14:31:27 +0800 Subject: [PATCH 108/261] [PHI] Support string type attr in yaml (#40218) * support str attr in yaml * fix bug --- .../final_state_generator/eager_gen.py | 4 ++-- python/paddle/utils/code_gen/api.yaml | 4 ++-- python/paddle/utils/code_gen/api_base.py | 14 ++++++++------ python/paddle/utils/code_gen/sparse_api.yaml | 2 +- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 
d1e20854153..81d0c9b7bed 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -25,10 +25,10 @@ core_ops_args_type_info = {} yaml_types_mapping = { - 'int' : 'int', 'int32_t' : 'int32_t', 'int64_t' : 'int64_t', 'size_t' : 'size_t', \ + 'int' : 'int', 'int32' : 'int32_t', 'int64' : 'int64_t', 'size_t' : 'size_t', \ 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ 'Backend' : 'paddle::experimental::Backend', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \ - 'int64_t[]' : 'std::vector', 'int[]' : 'std::vector', + 'int64[]' : 'std::vector', 'int[]' : 'std::vector', 'Tensor' : 'Tensor', 'Tensor[]' : 'std::vector', 'Tensor[Tensor[]]' : 'std::vector>', diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 699e42f2373..8c68ca4d7e0 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -121,7 +121,7 @@ backward : matmul_grad - api : mean - args : (Tensor x, int64_t[] axis={}, bool keep_dim=false) + args : (Tensor x, int64[] axis={}, bool keep_dim=false) output : Tensor infer_meta : func : MeanInferMeta @@ -181,7 +181,7 @@ func : subtract - api : sum - args : (Tensor x, int64_t[] axis={}, DataType dtype=DataType::UNDEFINED, bool keep_dim=false) + args : (Tensor x, int64[] axis={}, DataType dtype=DataType::UNDEFINED, bool keep_dim=false) output : Tensor infer_meta : func : SumInferMeta diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index 601248a4176..68127fb522c 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -89,18 +89,20 @@ class BaseAPI(object): attr_types_map = { 'ScalarArray': 'const ScalarArray&', 'Scalar': 'const Scalar&', + 'uint8': 'uint8_t', 'int': 'int', - 'int32_t': 'int32_t', - 'int64_t': 'int64_t', + 'int32': 'int32_t', + 'int64': 'int64_t', 'long': 'long', 'size_t': 'size_t', 'float': 'float', 'double': 'double', 'bool': 'bool', + 'str': 'const std::string&', 'Backend': 'Backend', 'DataLayout': 'DataLayout', 'DataType': 'DataType', - 'int64_t[]': 'const std::vector&', + 'int64[]': 'const std::vector&', 'int[]': 'const std::vector&', 'long[]': 'const std::vector&' } @@ -110,8 +112,8 @@ class BaseAPI(object): 'ScalarArray': 'const paddle::optional&', 'Scalar': 'const paddle::optional&', 'int': 'paddle::optional', - 'int32_t': 'paddle::optional', - 'int64_t': 'paddle::optional', + 'int32': 'paddle::optional', + 'int64': 'paddle::optional', 'size_t': 'paddle::optional', 'float': 'paddle::optional', 'double': 'paddle::optional', @@ -119,7 +121,7 @@ class BaseAPI(object): 'Backend': 'paddle::optional', 'DataLayout': 'paddle::optional', 'DataType': 'paddle::optional', - 'int64_t[]': 'paddle::optional>', + 'int64[]': 'paddle::optional>', 'int[]': 'paddle::optional>' } diff --git a/python/paddle/utils/code_gen/sparse_api.yaml b/python/paddle/utils/code_gen/sparse_api.yaml index 135989121cc..b531c2ed9ce 100644 --- a/python/paddle/utils/code_gen/sparse_api.yaml +++ b/python/paddle/utils/code_gen/sparse_api.yaml @@ -11,7 +11,7 @@ invoke : to_dense_impl(x, backend) - sparse_api : to_sparse_coo - args : (Tensor x, Backend backend, int64_t sparse_dim) + args : (Tensor x, Backend backend, int64 sparse_dim) output : Tensor(out@SparseCooTensor) invoke : to_sparse_coo_impl(x, backend, sparse_dim) -- GitLab From 
f1fe2ad45d2b4cd013ce83194192b1fb7bc72957 Mon Sep 17 00:00:00 2001 From: xiongkun Date: Tue, 8 Mar 2022 14:33:28 +0800 Subject: [PATCH 109/261] add support for concat and variadic tensor list (#40229) --- .../paddle/fluid/tests/unittests/op_test.py | 23 +++++++++++-------- .../fluid/tests/unittests/test_concat_op.py | 1 + 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 0c7f269a087..6455da92475 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -722,13 +722,17 @@ class OpTest(unittest.TestCase): def assumption_assert_and_transform(args, argvs): """ - currently only support "X" is [Tensor], don't support multi-tensor in "X" + transform by the following rules: + 1. [Tensor] -> Tensor + 2. [Tensor, Tensor, ...] -> list of Tensors + + only support "X" is list of Tensor, currently don't support other structure like dict. """ for inp in args: - assert isinstance(inp, list) and len( - inp - ) == 1, "currently only support `X` is [Tensor], don't support multi-tensor in `X`" - args = [inp[0] for inp in args] + assert isinstance( + inp, list + ), "currently only support `X` is [Tensor], don't support other structure." + args = [inp[0] if len(inp) == 1 else inp for inp in args] return args, argvs def cal_python_api(python_api, args, argvs, kernel_sig): @@ -1239,15 +1243,16 @@ class OpTest(unittest.TestCase): dygraph_outs = self._calc_dygraph_output( place, no_check_set=no_check_set) + if check_eager: + with _test_eager_guard(): + eager_dygraph_outs = self._calc_dygraph_output( + place, no_check_set=no_check_set) + # we only check end2end api when check_eager=True if hasattr(self, "python_api"): api_outs = self._calc_python_api_output(place) self._check_api_outs_by_dygraph_outs(api_outs, dygraph_outs, place) - if check_eager: - with _test_eager_guard(): - eager_dygraph_outs = self._calc_dygraph_output( - place, no_check_set=no_check_set) outs, fetch_list = self._calc_output(place, no_check_set=no_check_set) for out_name, out_dup in Operator.get_op_outputs(self.op_type): diff --git a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py index 10b7e13dcc3..4feca1b9250 100644 --- a/python/paddle/fluid/tests/unittests/test_concat_op.py +++ b/python/paddle/fluid/tests/unittests/test_concat_op.py @@ -25,6 +25,7 @@ import paddle class TestConcatOp(OpTest): def setUp(self): self.op_type = "concat" + self.python_api = paddle.concat self.dtype = self.get_dtype() self.init_test_data() self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]} -- GitLab From 975f99ab012310e97cbdee44bb25a05ad7bad012 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Tue, 8 Mar 2022 14:54:30 +0800 Subject: [PATCH 110/261] [Phi]Move Relu/Cos/Sin/Tan/Acos/Asin/Atan/Sinh/Cosh/Asinh/Acosh/Atanh kernels in Activation to Phi (#40175) * move activation op * adjust code format * fix compile bugs * fix ci bugs * code format adjust * code format adjust2 * activate ci status * modify according to comment --- cmake/operators.cmake | 2 +- .../ir/mkldnn/mkldnn_inplace_pass_tester.cc | 2 +- .../paddle2cinn/build_cinn_pass_test.cc | 4 +- .../paddle2cinn/cinn_compiler_test.cc | 2 +- .../fluid/imperative/tests/test_prepare_op.cc | 2 +- .../tensorrt/convert/test_activation_op.cc | 2 +- .../fluid/operators/activation_cudnn_op.cu.cc | 19 +- paddle/fluid/operators/activation_op.cc | 43 +- 
paddle/fluid/operators/activation_op.h | 590 +++---------- paddle/fluid/operators/activation_op.kps | 454 ++-------- .../operators/mkldnn/test_mkldnn_caching.cc | 2 +- .../mkldnn/test_mkldnn_op_inplace.cc | 2 +- .../operators/mkldnn/test_mkldnn_op_nhwc.cc | 2 +- .../operators/mlu/activation_op_mlu_test.cc | 2 +- .../test_common_infer_shape_functions.cc | 2 +- paddle/phi/kernels/activation_grad_kernel.h | 55 ++ paddle/phi/kernels/activation_kernel.h | 40 + .../phi/kernels/cpu/activation_grad_kernel.cc | 91 ++ paddle/phi/kernels/cpu/activation_kernel.cc | 55 ++ paddle/phi/kernels/funcs/activation_functor.h | 830 ++++++++++++++++++ .../phi/kernels/gpu/activation_grad_kernel.cu | 221 +++++ paddle/phi/kernels/gpu/activation_kernel.cu | 143 +++ .../phi/kernels/impl/activation_grad_impl.h | 133 +++ paddle/phi/kernels/impl/activation_impl.h | 50 ++ paddle/phi/ops/compat/activation_sig.cc | 67 ++ 25 files changed, 1908 insertions(+), 907 deletions(-) create mode 100644 paddle/phi/kernels/activation_grad_kernel.h create mode 100644 paddle/phi/kernels/activation_kernel.h create mode 100644 paddle/phi/kernels/cpu/activation_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/activation_kernel.cc create mode 100644 paddle/phi/kernels/funcs/activation_functor.h create mode 100644 paddle/phi/kernels/gpu/activation_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/activation_kernel.cu create mode 100644 paddle/phi/kernels/impl/activation_grad_impl.h create mode 100644 paddle/phi/kernels/impl/activation_impl.h create mode 100644 paddle/phi/ops/compat/activation_sig.cc diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 9e8c81c2985..1291e60cfe4 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -478,7 +478,7 @@ function(op_library TARGET) if (${pybind_flag} EQUAL 0) # NOTE(*): activation use macro to regist the kernels, set use_op manually. 
if(${TARGET} STREQUAL "activation") - file(APPEND ${pybind_file} "USE_OP(relu);\n") + file(APPEND ${pybind_file} "USE_OP_ITSELF(relu);\n") elseif(${TARGET} STREQUAL "fake_dequantize") file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n") elseif(${TARGET} STREQUAL "fake_quantize") diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index 0a95444f852..796aa4039c9 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -27,7 +27,7 @@ USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP(leaky_relu); USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN); USE_OP(gelu); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP(tanh); USE_OP_DEVICE_KERNEL(tanh, MKLDNN); diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc index bf9d1baaf39..47dffd47b7c 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc @@ -675,7 +675,7 @@ TEST(BuildCinnPassTest, NoNeedBufferInput) { USE_PASS(build_cinn_pass); USE_OP(mul); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_ITSELF(elementwise_add); -USE_OP(relu_grad); +USE_OP_ITSELF(relu_grad); USE_OP_ITSELF(elementwise_add_grad); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc index e8badab27b9..cdccc4c5546 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc @@ -301,5 +301,5 @@ TEST(CinnCompilerTest, Compile) { USE_PASS(build_cinn_pass); USE_PASS(graph_viz_pass); USE_OP(mul); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc index f5ca13cb99a..17cbe067482 100644 --- a/paddle/fluid/imperative/tests/test_prepare_op.cc +++ b/paddle/fluid/imperative/tests/test_prepare_op.cc @@ -226,7 +226,7 @@ TEST(test_prepare_op, test_prepare_data_cpu_mkldnn) { } // namespace paddle USE_OP_ITSELF(split); -USE_OP(relu); +USE_OP_ITSELF(relu); #ifdef PADDLE_WITH_MKLDNN USE_OP_DEVICE_KERNEL(relu, MKLDNN); #endif diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index f2dc5ba1c7c..7f7313fbcb5 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -52,7 +52,7 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); } } // namespace inference } // namespace paddle -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP(sigmoid); USE_OP(tanh); USE_OP(relu6); diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc index 0ac29e6d3ad..b4a97e24cf2 100644 --- a/paddle/fluid/operators/activation_cudnn_op.cu.cc +++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc @@ -132,7 +132,9 @@ struct CudnnReluGradFunctor : public CudnnActivationGradFunctor { explicit CudnnReluGradFunctor(const CUDADeviceContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_RELU) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -146,7 +148,9 @@ struct 
CudnnRelu6GradFunctor : public CudnnActivationGradFunctor { : CudnnActivationGradFunctor(ctx, 6.0, GPUDNN_ACTIVATION_CLIPPED_RELU) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -159,7 +163,9 @@ struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor { explicit CudnnSigmoidGradFunctor(const CUDADeviceContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_SIGMOID) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -172,7 +178,9 @@ struct CudnnTanhGradFunctor : public CudnnActivationGradFunctor { explicit CudnnTanhGradFunctor(const CUDADeviceContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_TANH) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -197,7 +205,8 @@ class CudnnActivationGradKernel public: using T = typename Functor::ELEMENT_TYPE; void Compute(const framework::ExecutionContext& context) const override { - static_assert(Functor::FwdDeps() == kDepOut, "Forward deps must be Out."); + static_assert(Functor::FwdDeps() == ActBwdOpFwdDeps::kDepOut, + "Forward deps must be Out."); const framework::Tensor *X, *Out, *dOut; X = Out = dOut = nullptr; diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 73d65b7c6e7..66f1bcc8b68 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -34,7 +34,8 @@ using paddle::framework::Tensor; template static constexpr bool CanInplaceAct() { - return GradFunctor::FwdDeps() == kDepOut || GradFunctor::FwdDeps() == kNoDeps; + return GradFunctor::FwdDeps() == ActBwdOpFwdDeps::kDepOut || + GradFunctor::FwdDeps() == ActBwdOpFwdDeps::kNoDeps; } #define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \ @@ -921,7 +922,8 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepX)) { if (ctx->HasOutput("DX")) { ctx->ShareDim("X", "DX"); ctx->ShareLoD("X", "DX"); @@ -931,7 +933,8 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel { ctx->ShareLoD("X", "DDOut"); } } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { if (ctx->HasOutput("DOut")) { ctx->ShareDim("Out", "DOut"); ctx->ShareLoD("Out", "DOut"); @@ -960,13 +963,15 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepX)) { if (ctx->HasOutput("DDOut")) { ctx->ShareDim("X", "DDOut"); ctx->ShareLoD("X", "DDOut"); } } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { if (ctx->HasOutput("DDOut")) { ctx->ShareDim("Out", "DDOut"); ctx->ShareLoD("Out", "DDOut"); @@ -987,7 +992,8 @@ class ActivationOpTripleGrad : public 
framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepX)) { if (ctx->HasOutput("DX")) { ctx->ShareDim("X", "DX"); ctx->ShareLoD("X", "DX"); @@ -997,7 +1003,8 @@ class ActivationOpTripleGrad : public framework::OperatorWithKernel { ctx->ShareLoD("X", "DDOut"); } } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { if (ctx->HasOutput("D_DOut")) { ctx->ShareDim("Out", "D_DOut"); ctx->ShareLoD("Out", "D_DOut"); @@ -1464,6 +1471,18 @@ namespace plat = paddle::platform; FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL); +REGISTER_ACTIVATION_OP(cos, Cos, CosFunctor, CosGradFunctor) +REGISTER_ACTIVATION_OP(tan, Tan, TanFunctor, TanGradFunctor); +REGISTER_ACTIVATION_OP(acos, Acos, AcosFunctor, AcosGradFunctor); +REGISTER_ACTIVATION_OP(sin, Sin, SinFunctor, SinGradFunctor); +REGISTER_ACTIVATION_OP(asin, Asin, AsinFunctor, AsinGradFunctor); +REGISTER_ACTIVATION_OP(atan, Atan, AtanFunctor, AtanGradFunctor); +REGISTER_ACTIVATION_OP(sinh, Sinh, SinhFunctor, SinhGradFunctor); +REGISTER_ACTIVATION_OP(cosh, Cosh, CoshFunctor, CoshGradFunctor); +REGISTER_ACTIVATION_OP(asinh, Asinh, AsinhFunctor, AsinhGradFunctor); +REGISTER_ACTIVATION_OP(acosh, Acosh, AcoshFunctor, AcoshGradFunctor); +REGISTER_ACTIVATION_OP(atanh, Atanh, AtanhFunctor, AtanhGradFunctor); + /* ========================== sigmoid register ============================= */ // 1. Register Sigmoid Operator @@ -1584,16 +1603,6 @@ REGISTER_OPERATOR( ops::ActivationOpDoubleGrad2::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -REGISTER_ACTIVATION_CPU_KERNEL(relu, Relu, ReluCPUFunctor, ReluGradFunctor); - -REGISTER_OP_CPU_KERNEL( - relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>); /* ========================================================================== */ /* ======================== leaky relu register ============================ */ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index ff41da86f7b..4b79397b6cd 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -35,16 +35,14 @@ limitations under the License. */ #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/phi/kernels/funcs/activation_functor.h" + namespace paddle { namespace operators { using framework::To32BitIndex; -enum ActBwdOpFwdDeps { - kNoDeps = 0x00, // Do not need any forward input/output - kDepX = 0x01, // Only need forward input X - kDepOut = 0x02, // Only need forward output Out -}; +using ActBwdOpFwdDeps = phi::funcs::ActBwdOpFwdDeps; /* The following operator can be used to process SelectedRows, because the * output of those operator for zero is zero too. 
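The hunks above drop the operator-local ActBwdOpFwdDeps enum in favor of an alias to phi::funcs::ActBwdOpFwdDeps, and the fully qualified enumerators plus the explicit static_casts in every dependency check suggest the phi definition is a scoped enum used as a bit mask. A minimal standalone sketch of that pattern follows; the enumerator values mirror the removed fluid enum, while the helper function is invented purely for illustration (the real definition lives in paddle/phi/kernels/funcs/activation_functor.h).

    // Illustrative mirror of the dependency flags; values follow the removed
    // fluid enum (kNoDeps = 0x00, kDepX = 0x01, kDepOut = 0x02).
    enum class ActBwdOpFwdDeps {
      kNoDeps = 0x00,  // backward needs neither forward input nor output
      kDepX = 0x01,    // backward needs the forward input X
      kDepOut = 0x02,  // backward needs the forward output Out
    };

    // Scoped enumerators do not convert to int implicitly, so a bit test has
    // to cast both operands, exactly as the updated InferShape checks do.
    inline bool NeedsForwardOut(ActBwdOpFwdDeps deps) {
      return (static_cast<int>(deps) &
              static_cast<int>(ActBwdOpFwdDeps::kDepOut)) != 0;
    }

The casts are the price of the stronger typing: combining flags such as kDepX and kDepOut also needs an explicit cast, but accidental comparisons against unrelated integers no longer compile.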
@@ -89,7 +87,8 @@ inline void ExtractActivationGradTensor( auto x_grad_var = context.OutputVar(framework::GradVarName("X")); const framework::Variable* out_var = nullptr; - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { out_var = context.InputVar("Out"); PADDLE_ENFORCE_NOT_NULL( out_var, platform::errors::NotFound( @@ -139,7 +138,7 @@ inline void ExtractActivationGradTensor( "Output(Out), variable name = %s", context.OutputName(framework::GradVarName("X")))); - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & static_cast(ActBwdOpFwdDeps::kDepX)) { auto x_var = context.InputVar("X"); PADDLE_ENFORCE_NOT_NULL(x_var, platform::errors::NotFound( "Cannot get the tensor from the " @@ -248,6 +247,24 @@ struct SigmoidFunctor : public BaseActivationFunctor { } }; +#define USE_PHI_FUNCTOR(name) \ + template \ + using name##Functor = phi::funcs::name##Functor; \ + template \ + using name##GradFunctor = phi::funcs::name##GradFunctor; + +USE_PHI_FUNCTOR(Cos) +USE_PHI_FUNCTOR(Tan) +USE_PHI_FUNCTOR(Acos) +USE_PHI_FUNCTOR(Sin) +USE_PHI_FUNCTOR(Asin) +USE_PHI_FUNCTOR(Atan) +USE_PHI_FUNCTOR(Sinh) +USE_PHI_FUNCTOR(Cosh) +USE_PHI_FUNCTOR(Asinh) +USE_PHI_FUNCTOR(Acosh) +USE_PHI_FUNCTOR(Atanh) + template struct SigmoidGradFunctor : public BaseActivationFunctor { template { dx.device(d) = dout * out * (static_cast(1) - out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; /* @@ -293,7 +312,9 @@ struct SigmoidGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = (static_cast(1) - out) * out * ddx; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; /* @@ -351,7 +372,9 @@ struct SigmoidTripleGradFunctor : public BaseActivationFunctor { (static_cast(1) - static_cast(2) * out) * dout * d_dOutNew; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // silu(x) = x / (1 + exp(-x)) @@ -376,7 +399,7 @@ struct SiluGradFunctor : public BaseActivationFunctor { (static_cast(1) + (temp2 / temp1))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // Originally: logsigmoid(x) = -log (1 + exp(-x)) @@ -414,7 +437,7 @@ struct LogSigmoidGradFunctor : public BaseActivationFunctor { dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp())); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // exp(x) = e^x @@ -434,7 +457,9 @@ struct ExpGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // expm1(x) = e^x - 1 @@ -454,38 +479,23 @@ struct Expm1GradFunctor : public BaseActivationFunctor { dx.device(d) = dout * out + dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // relu(x) = max(x, 0) -template -struct ReluCPUFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - 
out.device(d) = x.unaryExpr([] HOSTDEVICE(T v) { - return v > static_cast(0) ? v : static_cast(0); - }); - } -}; template -struct ReluCUDAFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.cwiseMax(static_cast(0)); - } -}; +using ReluCPUFunctor = phi::funcs::ReluCPUFunctor; +template +using ReluGradFunctor = phi::funcs::ReluGradFunctor; template -struct ReluGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * (out > static_cast(0)).template cast(); - } +using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor; - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; +template +using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor; // tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) template @@ -504,7 +514,9 @@ struct TanhGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) - out * out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -534,7 +546,9 @@ struct TanhGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = (static_cast(1) - out * out) * ddx; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; /* Out @@ -589,7 +603,9 @@ struct TanhTripleGradFunctor : public BaseActivationFunctor { static_cast(2) * out * dout * d_dOutNew; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // tanhshrink(x) = x - tanh(x) @@ -610,7 +626,7 @@ struct TanhShrinkGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (x.tanh() * x.tanh()); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // tanhshrink(x) = x - tanh(x) @@ -646,7 +662,7 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (temp1 || temp2).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0 @@ -682,7 +698,7 @@ struct SoftShrinkGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (temp1 + temp2).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // sqrt(x) = x^(1/2) @@ -702,7 +718,9 @@ struct SqrtGradFunctor : public BaseActivationFunctor { dx.device(d) = static_cast(0.5) * dout / out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // rsqrt(x) = x^(-1/2) @@ -722,7 +740,9 @@ struct RsqrtGradFunctor : public BaseActivationFunctor { dx.device(d) = static_cast(-0.5) * dout * out * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // ceil(x) = ceiling(x) @@ -742,7 +762,9 @@ struct ZeroGradFunctor : public BaseActivationFunctor { dx.device(d) = static_cast(0) * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; 
} + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kNoDeps; + } }; // floor(x) = flooring(x) @@ -754,373 +776,6 @@ struct FloorFunctor : public BaseActivationFunctor { } }; -template -struct Sine { - HOSTDEVICE T operator()(const T& val) const { return sin(val); } -}; - -template <> -struct Sine { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(sin(static_cast(val))); - } -}; - -template -struct Cosine { - HOSTDEVICE T operator()(const T& val) const { return cos(val); } -}; - -template <> -struct Cosine { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(cos(static_cast(val))); - } -}; - -// cosine'(x) = -sin(x) -template -struct CosGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = -dout * x.unaryExpr(Sine()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// cosine(x) = cos(x) -template -struct CosFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Cosine()); - } -}; - -// sine'(x) = cos(x) -template -struct SinGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * x.unaryExpr(Cosine()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// sine(x) = sin(x) -template -struct SinFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Sine()); - } -}; - -template -struct Tangent { - HOSTDEVICE T operator()(const T& val) const { return tan(val); } -}; - -template <> -struct Tangent { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(tan(static_cast(val))); - } -}; - -// Tangent'(x) = -Tangent(x) -template -struct TanGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout / x.unaryExpr(Cosine()).square(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// Tangent(x) = tan(x) -template -struct TanFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Tangent()); - } -}; - -template -struct Sinh { - HOSTDEVICE T operator()(const T& val) const { return sinh(val); } -}; - -template <> -struct Sinh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(sinhf(static_cast(val))); - } -}; - -template -struct Cosh { - HOSTDEVICE T operator()(const T& val) const { return cosh(val); } -}; - -template <> -struct Cosh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(coshf(static_cast(val))); - } -}; - -// sinh(x) = sinh(x) -template -struct SinhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Sinh()); - } -}; - -// cosh(x) = cosh(x) -template -struct CoshFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Cosh()); - } -}; - -// sinh'(x) = cosh(x) -template -struct SinhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out 
out, dOut dout, dX dx) const { - dx.device(d) = dout * x.unaryExpr(Cosh()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// cosh'(x) = sinh(x) -template -struct CoshGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * x.unaryExpr(Sinh()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Acos { - HOSTDEVICE T operator()(const T& val) const { return acos(val); } -}; - -template <> -struct Acos { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(acos(static_cast(val))); - } -}; - -// Acos(x) = acos(x) -template -struct AcosFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Acos()); - } -}; - -// acos'(x) = -1/sqrt(1-x^2) -template -struct AcosGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - -dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Asin { - HOSTDEVICE T operator()(const T& val) const { return asin(val); } -}; - -template <> -struct Asin { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(asin(static_cast(val))); - } -}; - -// Asin(x) = asin(x) -template -struct AsinFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Asin()); - } -}; - -// asin'(x) = 1/sqrt(1-x^2) -template -struct AsinGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Atan { - HOSTDEVICE T operator()(const T& val) const { return atan(val); } -}; - -template <> -struct Atan { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(atan(static_cast(val))); - } -}; - -// Atan(x) = atan(x) -template -struct AtanFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Atan()); - } -}; - -// atan'(x) = 1 / (1 + x^2) -template -struct AtanGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * static_cast(1) / (static_cast(1) + x.square()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Acosh { - HOSTDEVICE T operator()(const T& val) const { return acosh(val); } -}; - -template <> -struct Acosh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(acosh(static_cast(val))); - } -}; - -// Acosh(x) = acosh(x) -template -struct AcoshFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Acosh()); - } -}; - -// acosh'(x) = 1/sqrt(x^2 - 1) -template -struct AcoshGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - dout * static_cast(1) / (x * x - static_cast(1)).sqrt(); - } - - 
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Asinh { - HOSTDEVICE T operator()(const T& val) const { return asinh(val); } -}; - -template <> -struct Asinh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(asinh(static_cast(val))); - } -}; - -// Asinh(x) = asinh(x) -template -struct AsinhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Asinh()); - } -}; - -// asinh'(x) = 1/sqrt(x^2 + 1) -template -struct AsinhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - dout * static_cast(1) / (x.square() + static_cast(1)).sqrt(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Atanh { - HOSTDEVICE T operator()(const T& val) const { return atanh(val); } -}; - -template <> -struct Atanh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(atanh(static_cast(val))); - } -}; - -// Atanh(x) = atanh(x) -template -struct AtanhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Atanh()); - } -}; - -// atanh'(x) = 1/(1 - x^2) -template -struct AtanhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * static_cast(1) / (static_cast(1) - x.square()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - // round(x) = [x] template struct RoundFunctor : public BaseActivationFunctor { @@ -1147,7 +802,9 @@ struct ReciprocalGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(-1) * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // log(x) = natural logarithm of x @@ -1167,7 +824,7 @@ struct LogGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) / x); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // log2(x) = logarithm to the base 2 of the elements of x @@ -1188,7 +845,7 @@ struct Log2GradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(1) / (x * static_cast(log(2))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // log10(x) = logarithm to the base 10 of the elements of x @@ -1209,7 +866,7 @@ struct Log10GradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(1) / (x * static_cast(log(10))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // log1p(x) = natural logarithm of x+1 @@ -1229,7 +886,7 @@ struct Log1pGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) / (x + static_cast(1))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // square(x) = x^2 @@ -1249,7 +906,7 @@ struct SquareGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(2) * x; } - static constexpr ActBwdOpFwdDeps 
FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1285,7 +942,7 @@ struct BReluGradFunctor : public BaseActivationFunctor { .template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // relu6(x) = min(max(0, x), 6) @@ -1319,7 +976,9 @@ struct Relu6GradFunctor : public BaseActivationFunctor { .template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // HardSwish = min(max(0, x+3), 6) * x / 6 @@ -1364,7 +1023,7 @@ struct HardSwishGradFunctor : public BaseActivationFunctor { static_cast(1) * (static_cast(1) - tmp)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // For numerical stability, using the following formula instead of softplus(x) = @@ -1409,7 +1068,7 @@ struct SoftplusGradFunctor : public BaseActivationFunctor { .select(dout, dout / (static_cast(1) + (-x_beta).exp())); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // mish(x) = x * tanh(softplus(x)) @@ -1449,7 +1108,7 @@ struct MishGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (tsp + x * (static_cast(1) - tsp * tsp) * gsp); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // softsign(x) = x / (1 + |x|) @@ -1472,7 +1131,7 @@ struct SoftsignGradFunctor : public BaseActivationFunctor { dout * (static_cast(1) / (static_cast(1) + x.abs()).square()); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1504,7 +1163,9 @@ struct SoftReluGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) - (-out).exp()) * temp; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1539,7 +1200,7 @@ struct LeakyReluGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (temp1 + temp2).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1573,7 +1234,7 @@ struct ELUGradFunctor : public BaseActivationFunctor { .select(dout, dout * (out + static_cast(alpha))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1592,7 +1253,7 @@ struct ELUGradNegativeAlphaFunctor : public BaseActivationFunctor { .select(dout, dout * static_cast(alpha) * x.exp()); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1672,7 +1333,7 @@ struct CELUGradFunctor : public BaseActivationFunctor { dout * (x / static_cast(alpha)).exp() * temp_a_neg * temp_x_neg; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5198 @@ -1701,7 
+1362,7 @@ struct PowGradFunctor : public BaseActivationFunctor { x.pow(static_cast(factor) - static_cast(1)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1766,7 +1427,7 @@ struct STanhGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * a * b * (static_cast(1) - temp); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1797,7 +1458,7 @@ struct ThresholdedReluGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (x > th).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1832,7 +1493,9 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor { static_cast(slope); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1865,7 +1528,7 @@ struct SwishGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * ((static_cast(beta) * out) + temp2); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; /* @@ -1902,7 +1565,7 @@ inline void ExtractActivationDoubleGradTensor( "Cannot get the tensor from the Variable Output, variable name = %s", ctx.OutputName("DDX"))); - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & static_cast(ActBwdOpFwdDeps::kDepX)) { auto x_var = ctx.InputVar("X"); PADDLE_ENFORCE_NOT_NULL( x_var, platform::errors::NotFound( @@ -1925,7 +1588,8 @@ inline void ExtractActivationDoubleGradTensor( VLOG(10) << "Inplace activation of Op: " << ctx.Type(); *X = *ddX; } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { auto out_var = ctx.InputVar("Out"); PADDLE_ENFORCE_NOT_NULL( out_var, @@ -2000,28 +1664,7 @@ struct AbsGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * x.sign(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct ReluGradGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* X, - const framework::Tensor* Out, const framework::Tensor* ddX, - framework::Tensor* ddOut, framework::Tensor* dOut, - framework::Tensor* dX) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "ReluGradGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Output", "Out", "ReluGradGrad")); - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ReluGradGrad")); - ddout.device(*d) = ddx * (out > static_cast(0)).template cast(); - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2050,7 +1693,7 @@ struct LeakyReluGradGradFunctor : public BaseActivationFunctor { .template cast(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2088,7 +1731,7 @@ struct ELUGradGradFunctor : public BaseActivationFunctor { 
.template cast(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2127,7 +1770,7 @@ struct CELUGradGradFunctor : public BaseActivationFunctor { .template cast(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2156,7 +1799,9 @@ struct SqrtGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * static_cast(0.5) / out; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -2185,7 +1830,9 @@ struct RsqrtGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * static_cast(-0.5) * out * out * out; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -2214,7 +1861,7 @@ struct SquareGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * static_cast(2) * x; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // TODO(dengkaipeng): double gradient calculation for Square/Sqrt need @@ -2840,7 +2487,7 @@ struct LogGradGradFunctor : public BaseActivationFunctor { } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; } // namespace operators @@ -2849,20 +2496,9 @@ struct LogGradGradFunctor : public BaseActivationFunctor { #define FOR_EACH_ACTIVATION_OP(__macro) \ __macro(silu, Silu, SiluFunctor, SiluGradFunctor); \ __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ - __macro(atan, Atan, AtanFunctor, AtanGradFunctor); \ __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ __macro(floor, Floor, FloorFunctor, ZeroGradFunctor); \ - __macro(cos, Cos, CosFunctor, CosGradFunctor); \ - __macro(tan, Tan, TanFunctor, TanGradFunctor); \ - __macro(acos, Acos, AcosFunctor, AcosGradFunctor); \ - __macro(sin, Sin, SinFunctor, SinGradFunctor); \ - __macro(asin, Asin, AsinFunctor, AsinGradFunctor); \ - __macro(sinh, Sinh, SinhFunctor, SinhGradFunctor); \ - __macro(cosh, Cosh, CoshFunctor, CoshGradFunctor); \ - __macro(asinh, Asinh, AsinhFunctor, AsinhGradFunctor); \ - __macro(acosh, Acosh, AcoshFunctor, AcoshGradFunctor); \ - __macro(atanh, Atanh, AtanhFunctor, AtanhGradFunctor); \ __macro(round, Round, RoundFunctor, ZeroGradFunctor); \ __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \ diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index 3b7ce9eaf2b..208abd0949a 100644 --- a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -18,28 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -template -struct CudaReluFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - - // relu(x) = max(x, 0) - __device__ __forceinline__ T operator()(const T x) const { - return x > zero ? 
x : zero; - } -}; - -template -struct CudaReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - - // dx = dout * (out > 0) - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return out > zero ? dout : zero; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; - template struct CudaLeakyReluFunctor : public BaseActivationFunctor { T zero = static_cast(0.0f); @@ -69,7 +47,7 @@ struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { return x > zero ? dout : static_cast(alpha) * dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -93,7 +71,9 @@ struct CudaSigmoidGradFunctor : public BaseActivationFunctor { return dout * out * (one - out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -122,7 +102,7 @@ struct CudaSiluGradFunctor : public BaseActivationFunctor { return static_cast(dout * (temp * (one + x * (one - temp)))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -159,30 +139,7 @@ struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { return static_cast(dout * (temp2 / (exp(-temp1) + temp2))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAtanFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // atan(x) = atan(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(atan(x)); - } -}; - -template -struct CudaAtanGradFunctor : public BaseActivationFunctor { - T one = static_cast(1.0f); - - // dx = dout / (1 + x^2) - __device__ __forceinline__ T operator()(const T dout, const T x) const { - return dout / (one + x * x); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -219,7 +176,7 @@ struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor { return (x >= -l && x <= l) ? 
zero : dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -262,191 +219,9 @@ struct CudaZeroGradFunctor : public BaseActivationFunctor { return static_cast(0.0f); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; } -}; - -template -struct CudaCosFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // cos(x) = cos(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(cos(x)); - } -}; - -template -struct CudaCosGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * (-sin(x)) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(-dout * sin(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaSinFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // sin(x) = sin(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(sin(x)); - } -}; - -template -struct CudaSinGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * cos(x) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * cos(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaTanFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // tan(x) = tan(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(tan(x)); - } -}; - -template -struct CudaTanGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout / cos(x)^2 - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout / (cos(x) * cos(x))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAsinFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // asin(x) = asin(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(asin(x)); - } -}; - -template -struct CudaAsinGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = dout / sqrt(1 - x^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout / sqrt(one - x * x)); + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kNoDeps; } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAcosFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // acos(x) = acos(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - 
return static_cast(acos(x)); - } -}; - -template -struct CudaAcosGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = -dout / sqrt(1 - x^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(-dout / sqrt(one - x * x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaCoshFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // cosh(x) = cosh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(cosh(x)); - } -}; - -template -struct CudaCoshGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * sinh(x) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * sinh(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaSinhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // sinh(x) = sinh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(sinh(x)); - } -}; - -template -struct CudaSinhGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * cosh(x) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * cosh(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template @@ -469,88 +244,11 @@ struct CudaTanhGradFunctor : public BaseActivationFunctor { return dout * (one - out * out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; - -template -struct CudaAcoshFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // Acosh(x) = acosh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(acosh(x)); + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; } }; -template -struct CudaAcoshGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - // dx = dout * 1 / sqrt(x^2 - 1) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * one / sqrt(x * x - one)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAsinhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // Asinh(x) = asinh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(asinh(x)); - } -}; - -template -struct CudaAsinhGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = dout * 1/sqrt(x^2 + 1) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = 
static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * one / sqrt(x * x + one)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAtanhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // Atanh(x) = atanh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(atanh(x)); - } -}; - -template -struct CudaAtanhGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - // dx = dout * 1/(1- x^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * one / (one - x * x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - template struct CudaReciprocalFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); @@ -566,7 +264,9 @@ struct CudaReciprocalGradFunctor : public BaseActivationFunctor { return -dout * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -587,7 +287,9 @@ struct CudaExpGradFunctor : public BaseActivationFunctor { return dout * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -608,7 +310,9 @@ struct CudaExpm1GradFunctor : public BaseActivationFunctor { return dout * out + dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -629,7 +333,7 @@ struct CudaLogGradFunctor : public BaseActivationFunctor { return dout / x; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -647,7 +351,7 @@ struct CudaSquareGradFunctor : public BaseActivationFunctor { return dout * two * x; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -670,7 +374,9 @@ struct CudaSqrtGradFunctor : public BaseActivationFunctor { return one_half * dout / out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -693,7 +399,9 @@ struct CudaRsqrtGradFunctor : public BaseActivationFunctor { return minus_one_half * dout * out * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -717,7 +425,7 @@ struct CudaLog1pGradFunctor : public BaseActivationFunctor { return dout / (one + x); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -741,7 +449,7 @@ struct CudaLog2GradFunctor : public BaseActivationFunctor { return dout / (x * log_two); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -765,7 +473,7 @@ struct CudaLog10GradFunctor : public BaseActivationFunctor { return 
dout / (x * log_ten); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -804,7 +512,7 @@ struct CudaBReluGradFunctor : public BaseActivationFunctor { return (x > t_min_cast && x < t_max_cast) ? dout : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -849,7 +557,9 @@ struct CudaSoftReluGradFunctor : public BaseActivationFunctor { : static_cast(0.0f); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -893,7 +603,7 @@ struct CudaSTanhGradFunctor : public BaseActivationFunctor { return static_cast(dout * a * b * (one - temp * temp)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -939,7 +649,7 @@ struct CudaSoftplusGradFunctor : public BaseActivationFunctor { return x_beta > t ? arg_dout : static_cast(dout / (one + exp(-x_beta))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -962,7 +672,7 @@ struct CudaSoftsignGradFunctor : public BaseActivationFunctor { return dout / (temp * temp); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -996,7 +706,9 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor { return (out > zero && out < t) ? dout : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1022,7 +734,7 @@ struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor { return static_cast(dout * tanh(x) * tanh(x)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1056,7 +768,7 @@ struct CudaHardShrinkGradFunctor : public BaseActivationFunctor { return (x > -t && x < t) ? zero : dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1097,7 +809,9 @@ struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor { return (out > zero && out < one) ? dout * static_cast(slope) : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1141,7 +855,7 @@ struct CudaSwishGradFunctor : public BaseActivationFunctor { return static_cast(dout * (temp2 + temp3)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1190,7 +904,7 @@ struct CudaMishGradFunctor : public BaseActivationFunctor { return static_cast(dout * (tsp + x * (one - tsp * tsp) * gsp)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1222,7 +936,7 @@ struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor { return x > static_cast(threshold) ? 
dout : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1274,7 +988,7 @@ struct CudaHardSwishGradFunctor : public BaseActivationFunctor { return dout * (temp1 * temp2 * (two * x + o) / s + one - temp2); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1320,7 +1034,9 @@ struct CudaELUGradFunctor : public BaseActivationFunctor { return static_cast(dout * (out_pos + out_neg * (out + a))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1347,7 +1063,7 @@ struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor { return static_cast(dout * (x_pos + x_neg * (out + a))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1429,7 +1145,7 @@ struct CudaCELUGradFunctor : public BaseActivationFunctor { temp_a_neg * temp_x_pos + exp(x / a) * temp_a_neg * temp_x_neg)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1477,13 +1193,14 @@ class ActivationGradCudaKernel std::vector ins = {d_out}; std::vector outs = {d_x}; - if (static_cast(Functor::FwdDeps()) == static_cast(kDepOut)) { + if (static_cast(Functor::FwdDeps()) == + static_cast(ActBwdOpFwdDeps::kDepOut)) { // Only need forward output Out ins.push_back(out); paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, &outs, functor); } else if (static_cast(Functor::FwdDeps()) == - static_cast(kDepX)) { + static_cast(ActBwdOpFwdDeps::kDepX)) { // Only need forward input X ins.push_back(x); paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, @@ -1602,50 +1319,6 @@ REGISTER_OP_CUDA_KERNEL( ops::CELUGradGradFunctor>); /* ========================================================================== */ -/* =========================== relu register ============================ */ -#ifdef PADDLE_WITH_HIP -REGISTER_ACTIVATION_CUDA_KERNEL(relu, Relu, CudaReluFunctor, - CudaReluGradFunctor); -REGISTER_OP_CUDA_KERNEL( - relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>); -#else -REGISTER_OP_CUDA_KERNEL( - relu, ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>); -REGISTER_OP_CUDA_KERNEL( - relu_grad, ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>); -REGISTER_OP_CUDA_KERNEL( - relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>); -#endif -/* ========================================================================== */ - /* =========================== sigmoid register ============================ */ REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, @@ -1838,21 +1511,10 @@ REGISTER_OP_CUDA_KERNEL( __macro(silu, Silu, CudaSiluFunctor, CudaSiluGradFunctor); \ __macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, \ CudaLogSigmoidGradFunctor); \ - __macro(atan, Atan, CudaAtanFunctor, CudaAtanGradFunctor); \ 
__macro(softshrink, SoftShrink, CudaSoftShrinkFunctor, \ CudaSoftShrinkGradFunctor); \ __macro(ceil, Ceil, CudaCeilFunctor, CudaZeroGradFunctor); \ __macro(floor, Floor, CudaFloorFunctor, CudaZeroGradFunctor); \ - __macro(cos, Cos, CudaCosFunctor, CudaCosGradFunctor); \ - __macro(tan, Tan, CudaTanFunctor, CudaTanGradFunctor); \ - __macro(acos, Acos, CudaAcosFunctor, CudaAcosGradFunctor); \ - __macro(sin, Sin, CudaSinFunctor, CudaSinGradFunctor); \ - __macro(asin, Asin, CudaAsinFunctor, CudaAsinGradFunctor); \ - __macro(sinh, Sinh, CudaSinhFunctor, CudaSinhGradFunctor); \ - __macro(cosh, Cosh, CudaCoshFunctor, CudaCoshGradFunctor); \ - __macro(asinh, Asinh, CudaAsinhFunctor, CudaAsinhGradFunctor); \ - __macro(acosh, Acosh, CudaAcoshFunctor, CudaAcoshGradFunctor); \ - __macro(atanh, Atanh, CudaAtanhFunctor, CudaAtanhGradFunctor); \ __macro(round, Round, CudaRoundFunctor, CudaZeroGradFunctor); \ __macro(reciprocal, Reciprocal, CudaReciprocalFunctor, \ CudaReciprocalGradFunctor); \ diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index 05cd264cf3e..23428dd403e 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -29,7 +29,7 @@ USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP(elementwise_mul); USE_OP_DEVICE_KERNEL(elementwise_mul, MKLDNN); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc index c776cf2a7c7..e9dadd5ec93 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc @@ -27,7 +27,7 @@ USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index 3791fed23a8..916f02179b3 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -27,7 +27,7 @@ USE_OP(pool2d); USE_OP_DEVICE_KERNEL(pool2d, MKLDNN); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(transpose); USE_OP_DEVICE_KERNEL(transpose, MKLDNN); diff --git a/paddle/fluid/operators/mlu/activation_op_mlu_test.cc b/paddle/fluid/operators/mlu/activation_op_mlu_test.cc index 88452130175..6e3bd5e43c9 100644 --- a/paddle/fluid/operators/mlu/activation_op_mlu_test.cc +++ b/paddle/fluid/operators/mlu/activation_op_mlu_test.cc @@ -22,7 +22,7 @@ limitations under the License. */ namespace fw = paddle::framework; namespace plat = paddle::platform; -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MLU); // relu diff --git a/paddle/fluid/operators/test_common_infer_shape_functions.cc b/paddle/fluid/operators/test_common_infer_shape_functions.cc index a7c7e33f58a..1de1b590a13 100644 --- a/paddle/fluid/operators/test_common_infer_shape_functions.cc +++ b/paddle/fluid/operators/test_common_infer_shape_functions.cc @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/common_infer_shape_functions.h" #include "paddle/phi/core/ddim.h" -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(softmax); diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h new file mode 100644 index 00000000000..f34e5710ab7 --- /dev/null +++ b/paddle/phi/kernels/activation_grad_kernel.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/unary.h" + +namespace phi { + +#define DECLARE_ACTIVATION_GRAD_KERNEL_DepX(name) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + DenseTensor* dx); + +#define DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(name) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + DenseTensor* dx); + +template +void ReluDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + DenseTensor* ddout); + +DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cos); +DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Tan); +DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acos); +DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Sin); +DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asin); +DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atan); +DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Sinh); +DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cosh); +DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asinh); +DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acosh); +DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atanh); +DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Relu); + +} // namespace phi diff --git a/paddle/phi/kernels/activation_kernel.h b/paddle/phi/kernels/activation_kernel.h new file mode 100644 index 00000000000..bdf8f436359 --- /dev/null +++ b/paddle/phi/kernels/activation_kernel.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/unary.h" + +namespace phi { + +#define DECLARE_ACTIVATION_KERNEL(name) \ + template \ + void name##Kernel( \ + const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); + +DECLARE_ACTIVATION_KERNEL(Cos) +DECLARE_ACTIVATION_KERNEL(Tan) +DECLARE_ACTIVATION_KERNEL(Acos) +DECLARE_ACTIVATION_KERNEL(Sin) +DECLARE_ACTIVATION_KERNEL(Asin) +DECLARE_ACTIVATION_KERNEL(Atan) +DECLARE_ACTIVATION_KERNEL(Sinh) +DECLARE_ACTIVATION_KERNEL(Cosh) +DECLARE_ACTIVATION_KERNEL(Asinh) +DECLARE_ACTIVATION_KERNEL(Acosh) +DECLARE_ACTIVATION_KERNEL(Atanh) +DECLARE_ACTIVATION_KERNEL(Relu) + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc new file mode 100644 index 00000000000..fe43ebb8160 --- /dev/null +++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/activation_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/activation_grad_impl.h" + +namespace phi { + +#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(name, functor_class) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + DenseTensor* dx) { \ + functor_class functor; \ + ActivationGradImpl( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ + } + +#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + DenseTensor* dx) { \ + functor_class functor; \ + ActivationGradImpl( \ + dev_ctx, nullptr, &out, &dout, dx, functor); \ + } + +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, funcs::CosGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, funcs::TanGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, funcs::AcosGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, funcs::SinGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, funcs::AsinGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, funcs::AtanGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, funcs::SinhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, funcs::CoshGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, funcs::AsinhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, funcs::AcoshGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, funcs::AtanhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, funcs::ReluGradFunctor); + +} // namespace phi + +PD_REGISTER_KERNEL( + cos_grad, CPU, ALL_LAYOUT, phi::CosGradKernel, float, double) {} +PD_REGISTER_KERNEL( + tan_grad, CPU, ALL_LAYOUT, phi::TanGradKernel, float, double) {} +PD_REGISTER_KERNEL( + acos_grad, CPU, 
ALL_LAYOUT, phi::AcosGradKernel, float, double) {} +PD_REGISTER_KERNEL( + sin_grad, CPU, ALL_LAYOUT, phi::SinGradKernel, float, double) {} +PD_REGISTER_KERNEL( + asin_grad, CPU, ALL_LAYOUT, phi::AsinGradKernel, float, double) {} +PD_REGISTER_KERNEL( + atan_grad, CPU, ALL_LAYOUT, phi::AtanGradKernel, float, double) {} +PD_REGISTER_KERNEL( + sinh_grad, CPU, ALL_LAYOUT, phi::SinhGradKernel, float, double) {} +PD_REGISTER_KERNEL( + cosh_grad, CPU, ALL_LAYOUT, phi::CoshGradKernel, float, double) {} +PD_REGISTER_KERNEL( + asinh_grad, CPU, ALL_LAYOUT, phi::AsinhGradKernel, float, double) {} +PD_REGISTER_KERNEL( + acosh_grad, CPU, ALL_LAYOUT, phi::AcoshGradKernel, float, double) {} +PD_REGISTER_KERNEL( + atanh_grad, CPU, ALL_LAYOUT, phi::AtanhGradKernel, float, double) {} +PD_REGISTER_KERNEL( + relu_grad, CPU, ALL_LAYOUT, phi::ReluGradKernel, float, double) {} +PD_REGISTER_KERNEL(relu_double_grad, + CPU, + ALL_LAYOUT, + phi::ReluDoubleGradKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc new file mode 100644 index 00000000000..51883f25183 --- /dev/null +++ b/paddle/phi/kernels/cpu/activation_kernel.cc @@ -0,0 +1,55 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/activation_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/activation_impl.h" + +namespace phi { + +#define DEFINE_CPU_ACTIVATION_KERNEL(name, functor_class) \ + template \ + void name##Kernel( \ + const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \ + functor_class functor; \ + ActivationImpl(dev_ctx, x, out, functor); \ + } + +DEFINE_CPU_ACTIVATION_KERNEL(Sin, funcs::SinFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Cos, funcs::CosFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Tan, funcs::TanFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Asin, funcs::AsinFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Atan, funcs::AtanFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Acos, funcs::AcosFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Sinh, funcs::SinhFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Cosh, funcs::CoshFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Asinh, funcs::AsinhFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Acosh, funcs::AcoshFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Atanh, funcs::AtanhFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Relu, funcs::ReluCPUFunctor) + +} // namespace phi +PD_REGISTER_KERNEL(sin, CPU, ALL_LAYOUT, phi::SinKernel, float, double) {} +PD_REGISTER_KERNEL(cos, CPU, ALL_LAYOUT, phi::CosKernel, float, double) {} +PD_REGISTER_KERNEL(tan, CPU, ALL_LAYOUT, phi::TanKernel, float, double) {} +PD_REGISTER_KERNEL(acos, CPU, ALL_LAYOUT, phi::AcosKernel, float, double) {} +PD_REGISTER_KERNEL(asin, CPU, ALL_LAYOUT, phi::AsinKernel, float, double) {} +PD_REGISTER_KERNEL(atan, CPU, ALL_LAYOUT, phi::AtanKernel, float, double) {} +PD_REGISTER_KERNEL(sinh, CPU, ALL_LAYOUT, phi::SinhKernel, float, double) {} +PD_REGISTER_KERNEL(cosh, CPU, ALL_LAYOUT, phi::CoshKernel, float, double) {} +PD_REGISTER_KERNEL(asinh, CPU, ALL_LAYOUT, phi::AsinhKernel, float, double) {} +PD_REGISTER_KERNEL(acosh, CPU, ALL_LAYOUT, phi::AcoshKernel, float, double) {} +PD_REGISTER_KERNEL(atanh, CPU, ALL_LAYOUT, phi::AtanhKernel, float, double) {} +PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {} diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h new file mode 100644 index 00000000000..1a36e4e132f --- /dev/null +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -0,0 +1,830 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
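The CPU kernels registered just above all follow one pattern: a single generic ActivationImpl drives the elementwise loop, each op contributes only a small functor with the math, and a macro plus PD_REGISTER_KERNEL stamps out the kernel and its registration. The sketch below illustrates that structure in plain, self-contained C++; every name in it (DenseTensorLike, ActivationImplSketch, the toy KernelRegistry) is hypothetical and merely stands in for the real phi types and registration machinery, so treat it as a minimal model of the pattern rather than Paddle's actual API.

// Minimal, self-contained sketch of the functor + macro pattern.
// All names here are hypothetical stand-ins, not phi's API.
#include <cmath>
#include <functional>
#include <iostream>
#include <map>
#include <string>
#include <vector>

using DenseTensorLike = std::vector<float>;  // stand-in for a 1-D float tensor

// Generic elementwise driver: applies the functor to every element of x.
template <typename Functor>
void ActivationImplSketch(const DenseTensorLike& x, DenseTensorLike* out,
                          const Functor& functor) {
  out->resize(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    (*out)[i] = functor(x[i]);
  }
}

// Per-op functors carry only the math, like SinFunctor / ReluCPUFunctor above.
struct SinFunctorSketch {
  float operator()(float v) const { return std::sin(v); }
};
struct ReluFunctorSketch {
  float operator()(float v) const { return v > 0.0f ? v : 0.0f; }
};

// One macro stamps out a kernel per op, mirroring DEFINE_CPU_ACTIVATION_KERNEL.
#define DEFINE_ACTIVATION_KERNEL_SKETCH(name, functor)                \
  void name##Kernel(const DenseTensorLike& x, DenseTensorLike* out) { \
    ActivationImplSketch(x, out, functor{});                          \
  }

DEFINE_ACTIVATION_KERNEL_SKETCH(Sin, SinFunctorSketch)
DEFINE_ACTIVATION_KERNEL_SKETCH(Relu, ReluFunctorSketch)

// Toy registry standing in for PD_REGISTER_KERNEL: op name -> kernel function.
using KernelFn = std::function<void(const DenseTensorLike&, DenseTensorLike*)>;
std::map<std::string, KernelFn>& KernelRegistry() {
  static std::map<std::string, KernelFn> registry;
  return registry;
}

int main() {
  KernelRegistry()["sin"] = SinKernel;
  KernelRegistry()["relu"] = ReluKernel;

  DenseTensorLike x = {-1.0f, 0.0f, 2.0f};
  DenseTensorLike out;
  KernelRegistry()["relu"](x, &out);
  for (float v : out) std::cout << v << " ";  // prints: 0 0 2
  std::cout << std::endl;
  return 0;
}

Keeping the math in a per-op functor is what lets the CPU path above and the CUDA path later in this patch each share a single driver while adding only a few lines per activation.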
+ +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES +#endif + +#include + +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { +namespace funcs { +enum ActBwdOpFwdDeps { + kNoDeps = 0x00, // Do not need any forward input/output + kDepX = 0x01, // Only need forward input X + kDepOut = 0x02, // Only need forward output Out +}; + +template +struct BaseActivationFunctor { + using ELEMENT_TYPE = T; + + using AttrPair = std::vector>; + + AttrPair GetAttrs() { return AttrPair(); } +}; + +template +struct Sine { + HOSTDEVICE T operator()(const T& val) const { return sin(val); } +}; + +template <> +struct Sine { + HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { + return dtype::float16(sin(static_cast(val))); + } +}; + +template +struct Cosine { + HOSTDEVICE T operator()(const T& val) const { return cos(val); } +}; + +template <> +struct Cosine { + HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { + return dtype::float16(cos(static_cast(val))); + } +}; + +// sine'(x) = cos(x) +template +struct SinGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * x.unaryExpr(Cosine()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +// sine(x) = sin(x) +template +struct SinFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Sine()); + } +}; + +// cosine'(x) = -sin(x) +template +struct CosGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = -dout * x.unaryExpr(Sine()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +// cosine(x) = cos(x) +template +struct CosFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Cosine()); + } +}; + +template +struct Tangent { + HOSTDEVICE T operator()(const T& val) const { return tan(val); } +}; + +template <> +struct Tangent { + HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { + return dtype::float16(tan(static_cast(val))); + } +}; + +// Tangent'(x) = -Tangent(x) +template +struct TanGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout / x.unaryExpr(Cosine()).square(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +// Tangent(x) = tan(x) +template +struct TanFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Tangent()); + } +}; + +template +struct Sinh { + HOSTDEVICE T operator()(const T& val) const { return sinh(val); } +}; + +template <> +struct Sinh { + HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { + return dtype::float16(sinhf(static_cast(val))); + } +}; + +template +struct Cosh { + HOSTDEVICE T operator()(const T& val) const { return cosh(val); } +}; + +template <> +struct Cosh { + HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { + return 
dtype::float16(coshf(static_cast(val))); + } +}; + +// sinh(x) = sinh(x) +template +struct SinhFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Sinh()); + } +}; + +// cosh(x) = cosh(x) +template +struct CoshFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Cosh()); + } +}; + +// sinh'(x) = cosh(x) +template +struct SinhGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * x.unaryExpr(Cosh()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +// cosh'(x) = sinh(x) +template +struct CoshGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * x.unaryExpr(Sinh()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct Acos { + HOSTDEVICE T operator()(const T& val) const { return acos(val); } +}; + +template <> +struct Acos { + HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { + return dtype::float16(acos(static_cast(val))); + } +}; + +// Acos(x) = acos(x) +template +struct AcosFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Acos()); + } +}; + +// acos'(x) = -1/sqrt(1-x^2) +template +struct AcosGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = + -dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct Asin { + HOSTDEVICE T operator()(const T& val) const { return asin(val); } +}; + +template <> +struct Asin { + HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { + return dtype::float16(asin(static_cast(val))); + } +}; + +// Asin(x) = asin(x) +template +struct AsinFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Asin()); + } +}; + +// asin'(x) = 1/sqrt(1-x^2) +template +struct AsinGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = + dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct Atan { + HOSTDEVICE T operator()(const T& val) const { return atan(val); } +}; + +template <> +struct Atan { + HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { + return dtype::float16(atan(static_cast(val))); + } +}; + +// Atan(x) = atan(x) +template +struct AtanFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Atan()); + } +}; + +// atan'(x) = 1 / (1 + x^2) +template +struct AtanGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * static_cast(1) / (static_cast(1) + x.square()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct Acosh { + HOSTDEVICE T operator()(const T& val) const { return acosh(val); } +}; + +template <> +struct Acosh { + HOSTDEVICE dtype::float16 operator()(const 
dtype::float16& val) const { + return dtype::float16(acosh(static_cast(val))); + } +}; + +// Acosh(x) = acosh(x) +template +struct AcoshFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Acosh()); + } +}; + +// acosh'(x) = 1/sqrt(x^2 - 1) +template +struct AcoshGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = + dout * static_cast(1) / (x * x - static_cast(1)).sqrt(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct Asinh { + HOSTDEVICE T operator()(const T& val) const { return asinh(val); } +}; + +template <> +struct Asinh { + HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { + return dtype::float16(asinh(static_cast(val))); + } +}; + +// Asinh(x) = asinh(x) +template +struct AsinhFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Asinh()); + } +}; + +// asinh'(x) = 1/sqrt(x^2 + 1) +template +struct AsinhGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = + dout * static_cast(1) / (x.square() + static_cast(1)).sqrt(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct Atanh { + HOSTDEVICE T operator()(const T& val) const { return atanh(val); } +}; + +template <> +struct Atanh { + HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { + return dtype::float16(atanh(static_cast(val))); + } +}; + +// Atanh(x) = atanh(x) +template +struct AtanhFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Atanh()); + } +}; + +// atanh'(x) = 1/(1 - x^2) +template +struct AtanhGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * static_cast(1) / (static_cast(1) - x.square()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +// relu(x) = max(x, 0) +template +struct ReluCPUFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr([] HOSTDEVICE(T v) { + return v > static_cast(0) ? 
v : static_cast(0); + }); + } +}; + +template +struct ReluCUDAFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.cwiseMax(static_cast(0)); + } +}; + +template +struct ReluGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (out > static_cast(0)).template cast(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct ReluGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, + const DenseTensor* X, + const DenseTensor* Out, + const DenseTensor* ddX, + DenseTensor* ddOut, + DenseTensor* dOut, + DenseTensor* dX) const { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "ReluGradGrad")); + auto out = EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Output", "Out", "ReluGradGrad")); + if (ddOut) { + auto ddout = EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ReluGradGrad")); + ddout.device(*d) = ddx * (out > static_cast(0)).template cast(); + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +#if defined(__NVCC__) || defined(__HIPCC__) +template +struct CudaReluFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + + // relu(x) = max(x, 0) + __device__ __forceinline__ T operator()(const T x) const { + return x > zero ? x : zero; + } +}; + +template +struct CudaReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + + // dx = dout * (out > 0) + __device__ __forceinline__ T operator()(const T dout, const T out) const { + return out > zero ? 
dout : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct CudaCosFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // cos(x) = cos(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(cos(x)); + } +}; + +template +struct CudaCosGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // dx = dout * (-sin(x)) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(-dout * sin(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaSinFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // sin(x) = sin(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(sin(x)); + } +}; + +template +struct CudaSinGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // dx = dout * cos(x) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(dout * cos(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaTanFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // tan(x) = tan(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(tan(x)); + } +}; + +template +struct CudaTanGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // dx = dout / cos(x)^2 + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(dout / (cos(x) * cos(x))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaAsinFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // asin(x) = asin(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(asin(x)); + } +}; + +template +struct CudaAsinGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // dx = dout / sqrt(1 - x^2) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(dout / sqrt(one - x * x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaAcosFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // acos(x) = acos(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(acos(x)); + } +}; + +template +struct CudaAcosGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // dx = -dout / sqrt(1 
- x^2) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(-dout / sqrt(one - x * x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaCoshFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // cosh(x) = cosh(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(cosh(x)); + } +}; + +template +struct CudaCoshGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // dx = dout * sinh(x) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(dout * sinh(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaSinhFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // sinh(x) = sinh(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(sinh(x)); + } +}; + +template +struct CudaSinhGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // dx = dout * cosh(x) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(dout * cosh(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaAcoshFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // Acosh(x) = acosh(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(acosh(x)); + } +}; + +template +struct CudaAcoshGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + // dx = dout * 1 / sqrt(x^2 - 1) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(dout * one / sqrt(x * x - one)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaAsinhFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // Asinh(x) = asinh(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(asinh(x)); + } +}; + +template +struct CudaAsinhGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // dx = dout * 1/sqrt(x^2 + 1) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(dout * one / sqrt(x * x + one)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaAtanhFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // Atanh(x) = atanh(x) + __device__ __forceinline__ T operator()(const T 
arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(atanh(x)); + } +}; + +template +struct CudaAtanhGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + // dx = dout * 1/(1- x^2) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(dout * one / (one - x * x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaAtanFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // atan(x) = atan(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(atan(x)); + } +}; + +template +struct CudaAtanGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout / (1 + x^2) + __device__ __forceinline__ T operator()(const T dout, const T x) const { + return dout / (one + x * x); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +#endif + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu new file mode 100644 index 00000000000..c2995c79a7e --- /dev/null +++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu @@ -0,0 +1,221 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/activation_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/impl/activation_grad_impl.h" + +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" + +namespace phi { + +template +void ActivationGradGPUImpl(const Context& dev_ctx, + const DenseTensor* x, + const DenseTensor* out, + const DenseTensor* d_out, + DenseTensor* d_x, + const Functor& functor) { + if (static_cast(Functor::FwdDeps()) & + static_cast(funcs::ActBwdOpFwdDeps::kDepOut)) { + PADDLE_ENFORCE_NOT_NULL( + out, errors::NotFound("The input DenseTensor Out can not be nullptr")); + } + PADDLE_ENFORCE_NOT_NULL( + d_out, errors::NotFound("The input DenseTensor dOut can not be nullptr")); + PADDLE_ENFORCE_NOT_NULL( + d_x, errors::NotFound("The output DenseTensor dX can not be nullptr")); + if (!out) { + out = d_out; // fake out + } + if (static_cast(Functor::FwdDeps()) & + static_cast(funcs::ActBwdOpFwdDeps::kDepX)) { + PADDLE_ENFORCE_NOT_NULL( + x, errors::NotFound("The input DenseTensor X can not be nullptr")); + } else { + VLOG(10) << "Inplace activation of Op Functor: " << typeid(Functor).name(); + x = d_x; + } + + dev_ctx.template Alloc(d_x); + + std::vector ins = {d_out}; + std::vector outs = {d_x}; + + if (static_cast(Functor::FwdDeps()) == + static_cast(funcs::ActBwdOpFwdDeps::kDepOut)) { + // Only need forward output Out + ins.push_back(out); + funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); + } else if (static_cast(Functor::FwdDeps()) == + static_cast(funcs::ActBwdOpFwdDeps::kDepX)) { + // Only need forward input X + ins.push_back(x); + funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); + } else { + funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); + } +} + +#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(name, functor_class) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + DenseTensor* dx) { \ + functor_class functor; \ + ActivationGradGPUImpl( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ + } + +#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + DenseTensor* dx) { \ + functor_class functor; \ + ActivationGradGPUImpl( \ + dev_ctx, nullptr, &out, &dout, dx, functor); \ + } + +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, funcs::CudaReluGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, funcs::CudaCosGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, funcs::CudaTanGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, funcs::CudaAcosGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, funcs::CudaSinGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, funcs::CudaAsinGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, funcs::CudaAtanGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, funcs::CudaSinhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, funcs::CudaCoshGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, funcs::CudaAsinhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, funcs::CudaAcoshGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, funcs::CudaAtanhGradFunctor); + +} // namespace phi +PD_REGISTER_KERNEL(cos_grad, + GPU, 
+ ALL_LAYOUT, + phi::CosGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(tan_grad, + GPU, + ALL_LAYOUT, + phi::TanGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(acos_grad, + GPU, + ALL_LAYOUT, + phi::AcosGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(sin_grad, + GPU, + ALL_LAYOUT, + phi::SinGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(asin_grad, + GPU, + ALL_LAYOUT, + phi::AsinGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(atan_grad, + GPU, + ALL_LAYOUT, + phi::AtanGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(sinh_grad, + GPU, + ALL_LAYOUT, + phi::SinhGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(cosh_grad, + GPU, + ALL_LAYOUT, + phi::CoshGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(asinh_grad, + GPU, + ALL_LAYOUT, + phi::AsinhGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(acosh_grad, + GPU, + ALL_LAYOUT, + phi::AcoshGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(atanh_grad, + GPU, + ALL_LAYOUT, + phi::AtanhGradKernel, + float, + double, + phi::dtype::float16) {} +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(relu_grad, + GPU, + ALL_LAYOUT, + phi::ReluGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(relu_double_grad, + GPU, + ALL_LAYOUT, + phi::ReluDoubleGradKernel, + float, + double, + phi::dtype::float16) {} +#else +PD_REGISTER_KERNEL(relu_grad, + GPU, + ALL_LAYOUT, + phi::ReluGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +PD_REGISTER_KERNEL(relu_double_grad, + GPU, + ALL_LAYOUT, + phi::ReluDoubleGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#endif diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu new file mode 100644 index 00000000000..26752b89e7c --- /dev/null +++ b/paddle/phi/kernels/gpu/activation_kernel.cu @@ -0,0 +1,143 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/activation_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/impl/activation_grad_impl.h" + +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" + +namespace phi { + +template +void ActivationGPUImpl(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out, + const Functor& functor) { + PADDLE_ENFORCE_NOT_NULL(out, + errors::NotFound("Output Out should not be nullptr")); + dev_ctx.template Alloc(out); + std::vector ins = {&x}; + std::vector outs = {out}; + funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); +} + +#define DEFINE_GPU_ACTIVATION_KERNEL(name, functor_class) \ + template \ + void name##Kernel( \ + const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \ + functor_class functor; \ + ActivationGPUImpl(dev_ctx, x, out, functor); \ + } + +DEFINE_GPU_ACTIVATION_KERNEL(Cos, funcs::CudaCosFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Tan, funcs::CudaTanFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Acos, funcs::CudaAcosFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Sin, funcs::CudaSinFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Asin, funcs::CudaAsinFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Atan, funcs::CudaAtanFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Sinh, funcs::CudaSinhFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Cosh, funcs::CudaCoshFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Asinh, funcs::CudaAsinhFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Acosh, funcs::CudaAcoshFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Atanh, funcs::CudaAtanhFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Relu, funcs::CudaReluFunctor) + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(relu, + GPU, + ALL_LAYOUT, + phi::ReluKernel, + float, + double, + phi::dtype::float16) {} +#else +PD_REGISTER_KERNEL(relu, + GPU, + ALL_LAYOUT, + phi::ReluKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#endif +PD_REGISTER_KERNEL( + sin, GPU, ALL_LAYOUT, phi::SinKernel, float, double, phi::dtype::float16) {} +PD_REGISTER_KERNEL( + cos, GPU, ALL_LAYOUT, phi::CosKernel, float, double, phi::dtype::float16) {} +PD_REGISTER_KERNEL( + tan, GPU, ALL_LAYOUT, phi::TanKernel, float, double, phi::dtype::float16) {} +PD_REGISTER_KERNEL(acos, + GPU, + ALL_LAYOUT, + phi::AcosKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(asin, + GPU, + ALL_LAYOUT, + phi::AsinKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(atan, + GPU, + ALL_LAYOUT, + phi::AtanKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(sinh, + GPU, + ALL_LAYOUT, + phi::SinhKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(cosh, + GPU, + ALL_LAYOUT, + phi::CoshKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(asinh, + GPU, + ALL_LAYOUT, + phi::AsinhKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(acosh, + GPU, + ALL_LAYOUT, + phi::AcoshKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(atanh, + GPU, + ALL_LAYOUT, + phi::AtanhKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/impl/activation_grad_impl.h b/paddle/phi/kernels/impl/activation_grad_impl.h new file mode 100644 index 00000000000..80e23d2b8e2 --- /dev/null +++ b/paddle/phi/kernels/impl/activation_grad_impl.h @@ -0,0 +1,133 @@ +// Copyright (c) 2022 
PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/activation_functor.h" + +#include "paddle/fluid/platform/device_context.h" + +namespace phi { + +template +void ActivationGradImpl(const Context& dev_ctx, + const DenseTensor* X, + const DenseTensor* Out, + const DenseTensor* dOut, + DenseTensor* dX, + const Functor& functor) { + if (static_cast(Functor::FwdDeps()) & + static_cast(funcs::ActBwdOpFwdDeps::kDepOut)) { + PADDLE_ENFORCE_NOT_NULL( + Out, errors::NotFound("The input DenseTensor Out can not be nullptr")); + } + PADDLE_ENFORCE_NOT_NULL( + dOut, errors::NotFound("The input DenseTensor dOut can not be nullptr")); + PADDLE_ENFORCE_NOT_NULL( + dX, errors::NotFound("The output DenseTensor dX can not be nullptr")); + if (!Out) { + Out = dOut; // fake out + } + if (static_cast(Functor::FwdDeps()) & + static_cast(funcs::ActBwdOpFwdDeps::kDepX)) { + PADDLE_ENFORCE_NOT_NULL( + X, errors::NotFound("The input DenseTensor X can not be nullptr")); + } else { + VLOG(10) << "Inplace activation of Op Functor: " << typeid(Functor).name(); + X = dX; + } + + dev_ctx.template Alloc(dX); + auto dout = phi::EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "Out@GRAD", "ActivationGrad")); + auto out = phi::EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "ActivationGrad")); + auto dx = phi::EigenVector::Flatten( + GET_DATA_SAFELY(dX, "Input", "X@GRAD", "ActivationGrad")); + auto x = phi::EigenVector::Flatten( + GET_DATA_SAFELY(X, "Input", "X", "ActivationGrad")); + auto* place = dev_ctx.eigen_device(); + // use 32bit index to speed up computation + bool use_32bit_index = out.size() < Eigen::NumTraits::highest(); + bool is_gpu_place = paddle::platform::is_gpu_place(dev_ctx.GetPlace()); + if (use_32bit_index && is_gpu_place) { + functor(*place, + To32BitIndex(x), + To32BitIndex(out), + To32BitIndex(dout), + To32BitIndex(dx)); + } else { + functor(*place, x, out, dout, dx); + } +} + +template +void ActivationDoubleGradImpl(const Context& dev_ctx, + const DenseTensor* X, + const DenseTensor* Out, + const DenseTensor* ddX, + DenseTensor* dX, + DenseTensor* dOut, + DenseTensor* ddOut, + const Functor& functor) { + if (static_cast(Functor::FwdDeps()) & + static_cast(funcs::ActBwdOpFwdDeps::kDepX)) { + PADDLE_ENFORCE_NOT_NULL( + X, errors::NotFound("The input DenseTensor X can not be nullptr")); + } else { + VLOG(10) << "Inplace activation of Op Functor: " << typeid(Functor).name(); + X = ddX; + } + if (static_cast(Functor::FwdDeps()) & + static_cast(funcs::ActBwdOpFwdDeps::kDepOut)) { + PADDLE_ENFORCE_NOT_NULL( + Out, errors::NotFound("The input DenseTensor Out can not be nullptr")); + } else { + VLOG(10) << "Inplace activation of Op Functor: " << typeid(Functor).name(); + Out = ddX; + } + + if (ddOut) { + dev_ctx.template Alloc(ddOut); + } + if (dOut) { + dev_ctx.template Alloc(dOut); + } + if (dX) { + dX->Resize(Out->dims()); + dev_ctx.template 
Alloc(dX); + } + + functor(dev_ctx, X, Out, ddX, ddOut, dOut, dX); +} + +template +void ReluDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + DenseTensor* ddout) { + funcs::ReluGradGradFunctor relu_double_grad_functor; + ActivationDoubleGradImpl>( + dev_ctx, + nullptr, + &out, + &ddx, + nullptr, + nullptr, + ddout, + relu_double_grad_functor); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/activation_impl.h b/paddle/phi/kernels/impl/activation_impl.h new file mode 100644 index 00000000000..ca3debd394a --- /dev/null +++ b/paddle/phi/kernels/impl/activation_impl.h @@ -0,0 +1,50 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/activation_functor.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" + +#include "paddle/fluid/platform/device_context.h" + +namespace phi { + +#define ToString(x) #x + +template +void ActivationImpl(const Context& dev_ctx, + const DenseTensor& X, + DenseTensor* Out, + const Functor& functor) { + PADDLE_ENFORCE_NOT_NULL(Out, + errors::NotFound("Output Out should not be nullptr")); + dev_ctx.template Alloc(Out); + auto x = phi::EigenVector::Flatten( + GET_DATA_SAFELY(&X, "Input", "X", "Activation")); + auto out = phi::EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Output", "Out", "Activation")); + auto* place = dev_ctx.eigen_device(); + // use 32bit index to speed up computation + bool use_32bit_index = out.size() < Eigen::NumTraits::highest(); + bool is_gpu_place = paddle::platform::is_gpu_place(dev_ctx.GetPlace()); + if (use_32bit_index && is_gpu_place) { + functor(*place, To32BitIndex(x), To32BitIndex(out)); + } else { + functor(*place, x, out); + } +} + +} // namespace phi diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc new file mode 100644 index 00000000000..396830ca207 --- /dev/null +++ b/paddle/phi/ops/compat/activation_sig.cc @@ -0,0 +1,67 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +#define DefineActGradDepXOpArgMap(func_name, op_name) \ + KernelSignature func_name##GradOpArgumentMapping( \ + const ArgumentMappingContext& ctx) { \ + return KernelSignature( \ + op_name "_grad", {"X", GradVarName("Out")}, {}, {GradVarName("X")}); \ + } + +#define DefineActGradDepOutOpArgMap(func_name, op_name) \ + KernelSignature func_name##GradOpArgumentMapping( \ + const ArgumentMappingContext& ctx) { \ + return KernelSignature( \ + op_name "_grad", {"Out", GradVarName("Out")}, {}, {GradVarName("X")}); \ + } + +KernelSignature ReluDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("relu_double_grad", {"Out", "DDX"}, {}, {"DDOut"}); +} + +DefineActGradDepXOpArgMap(Cos, "cos"); +DefineActGradDepXOpArgMap(Tan, "tan"); +DefineActGradDepXOpArgMap(Acos, "acos"); +DefineActGradDepXOpArgMap(Sin, "sin"); +DefineActGradDepXOpArgMap(Asin, "asin"); +DefineActGradDepXOpArgMap(Atan, "atan"); +DefineActGradDepXOpArgMap(Sinh, "sinh"); +DefineActGradDepXOpArgMap(Cosh, "cosh"); +DefineActGradDepXOpArgMap(Asinh, "asinh"); +DefineActGradDepXOpArgMap(Acosh, "acosh"); +DefineActGradDepXOpArgMap(Atanh, "atanh"); +DefineActGradDepOutOpArgMap(Relu, "relu"); +} // namespace phi + +PD_REGISTER_BASE_KERNEL_NAME(relu_grad_grad, relu_double_grad); + +PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(acos_grad, phi::AcosGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sin_grad, phi::SinGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(asin_grad, phi::AsinGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(atan_grad, phi::AtanGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sinh_grad, phi::SinhGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(cosh_grad, phi::CoshGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(asinh_grad, phi::AsinhGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(acosh_grad, phi::AcoshGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(atanh_grad, phi::AtanhGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(relu_grad, phi::ReluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(relu_grad_grad, + phi::ReluDoubleGradOpArgumentMapping); -- GitLab From 7024ade70597962aad8e7f7cf77b174fa821ee13 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 8 Mar 2022 15:54:32 +0800 Subject: [PATCH 111/261] [Phi] Move matrix inverse into phi (#40237) * move matrix inverse into phi * change license year --- paddle/fluid/operators/determinant_op.h | 6 +- paddle/fluid/operators/inverse_op.h | 4 +- paddle/fluid/operators/math/CMakeLists.txt | 1 - paddle/fluid/operators/math/matrix_inverse.cc | 38 ----- .../fluid/operators/math/matrix_inverse.cu.cc | 124 --------------- paddle/fluid/operators/matrix_power_op.h | 6 +- paddle/phi/kernels/funcs/CMakeLists.txt | 1 + paddle/phi/kernels/funcs/matrix_inverse.cc | 37 +++++ paddle/phi/kernels/funcs/matrix_inverse.cu.cc | 141 ++++++++++++++++++ .../kernels/funcs}/matrix_inverse.h | 41 ++--- 10 files changed, 208 insertions(+), 191 deletions(-) delete mode 100644 paddle/fluid/operators/math/matrix_inverse.cc delete mode 100644 paddle/fluid/operators/math/matrix_inverse.cu.cc create mode 100644 paddle/phi/kernels/funcs/matrix_inverse.cc create mode 100644 paddle/phi/kernels/funcs/matrix_inverse.cu.cc rename paddle/{fluid/operators/math => phi/kernels/funcs}/matrix_inverse.h (61%) diff --git a/paddle/fluid/operators/determinant_op.h 
b/paddle/fluid/operators/determinant_op.h index 375ef4344f4..463a707ccf1 100644 --- a/paddle/fluid/operators/determinant_op.h +++ b/paddle/fluid/operators/determinant_op.h @@ -19,11 +19,11 @@ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/matrix_inverse.h" #include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/matrix_inverse.h" namespace paddle { namespace operators { @@ -226,7 +226,7 @@ class DeterminantGradKernel : public framework::OpKernel { inverse_A.Resize(input->dims()); inverse_A.mutable_data(context.GetPlace()); - math::MatrixInverseFunctor mat_inv; + phi::funcs::MatrixInverseFunctor mat_inv; mat_inv(dev_ctx, *input, &inverse_A); VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); @@ -381,7 +381,7 @@ class SlogDeterminantGradKernel : public framework::OpKernel { inverse_A.Resize(input->dims()); inverse_A.mutable_data(context.GetPlace()); - math::MatrixInverseFunctor mat_inv; + phi::funcs::MatrixInverseFunctor mat_inv; mat_inv(dev_ctx, *input, &inverse_A); VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); diff --git a/paddle/fluid/operators/inverse_op.h b/paddle/fluid/operators/inverse_op.h index 1e061d8b50a..31c22915ec5 100644 --- a/paddle/fluid/operators/inverse_op.h +++ b/paddle/fluid/operators/inverse_op.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/matrix_inverse.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/matrix_inverse.h" namespace paddle { namespace operators { @@ -30,7 +30,7 @@ class InverseKernel : public framework::OpKernel { output->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); - math::MatrixInverseFunctor mat_inv; + phi::funcs::MatrixInverseFunctor mat_inv; mat_inv(dev_ctx, *input, output); } }; diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index bce927c32dd..d5a86d62b41 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -46,7 +46,6 @@ math_library(vol2col) math_library(prelu) math_library(bert_encoder_functor) math_library(tree2col DEPS math_function) -math_library(matrix_inverse) math_library(segment_pooling) math_library(matrix_solve) diff --git a/paddle/fluid/operators/math/matrix_inverse.cc b/paddle/fluid/operators/math/matrix_inverse.cc deleted file mode 100644 index 1b36e615c68..00000000000 --- a/paddle/fluid/operators/math/matrix_inverse.cc +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/math/matrix_inverse.h" -#include "Eigen/Core" -#include "Eigen/LU" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { -namespace math { - -template -class MatrixInverseFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& a, framework::Tensor* a_inv) { - compute_inverse_eigen(context, a, a_inv); - } -}; - -template class MatrixInverseFunctor; -template class MatrixInverseFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_inverse.cu.cc b/paddle/fluid/operators/math/matrix_inverse.cu.cc deleted file mode 100644 index 41335a69417..00000000000 --- a/paddle/fluid/operators/math/matrix_inverse.cu.cc +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/math/matrix_inverse.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace platform { -class CUDADeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace math { - -template -class MatrixInverseFunctor; - -template -class MatrixInverseFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& a, framework::Tensor* a_inv) { -#ifndef PADDLE_WITH_HIP - const auto& mat_dims = a.dims(); - const int rank = mat_dims.size(); - int n = mat_dims[rank - 1]; - int batch_size = rank > 2 ? a.numel() / (n * n) : 1; - - memory::allocation::AllocationPtr tmp_gpu_mat_data; - const T* gpu_mat = a.data(); - if (n >= 32) { - // Copy all elements of input matrix A to a temporary memory space to - // avoid being overriden by getrf. - tmp_gpu_mat_data = memory::Alloc(context, a.numel() * sizeof(T)); - memory::Copy(context.GetPlace(), tmp_gpu_mat_data->ptr(), - context.GetPlace(), a.data(), a.numel() * sizeof(T), - context.stream()); - gpu_mat = reinterpret_cast(tmp_gpu_mat_data->ptr()); - } - - std::vector cpu_ptrs(batch_size * 2); - for (int i = 0; i < batch_size; ++i) { - cpu_ptrs[i] = gpu_mat + i * n * n; - cpu_ptrs[i + batch_size] = a_inv->data() + i * n * n; - } - - // Copy the addresses of A and A_inv from host to device. - memory::allocation::AllocationPtr tmp_gpu_ptrs_data = - memory::Alloc(context, cpu_ptrs.size() * sizeof(T*)); - memory::Copy(context.GetPlace(), tmp_gpu_ptrs_data->ptr(), - platform::CPUPlace(), static_cast(cpu_ptrs.data()), - cpu_ptrs.size() * sizeof(T*), context.stream()); - T** gpu_inv_ptrs = - reinterpret_cast(tmp_gpu_ptrs_data->ptr()) + batch_size; - - // Allocate device memory for info and pivots. - int num_ints = n < 32 ? 
batch_size : batch_size * (n + 1); - memory::allocation::AllocationPtr tmp_gpu_info_data = - memory::Alloc(context, num_ints * sizeof(int)); - int* gpu_info_ptr = reinterpret_cast(tmp_gpu_info_data->ptr()); - - auto blas = phi::funcs::GetBlas(context); - - std::vector info; // only for singular checking - info.resize(batch_size); - // This functions in cuBLAS is intended to be used for matrices of small - // sizes where the launch overhead is a significant factor. - // TODO(Xreki): call function in cusolver for large matrices. - if (n < 32) { - // cublasmatinvBatched is a short cut of cublasgetrfBatched - // plus cublasgetriBatched. - // However it only works if N is less than 32. If not, we need to - // go through cublasgetrfBatched and cublasgetriBatched. - blas.BatchedMatInv(n, - reinterpret_cast(tmp_gpu_ptrs_data->ptr()), - gpu_inv_ptrs, gpu_info_ptr, batch_size); - } else { - // This function performs the LU factorization of each matrix A by the - // equation P * A = L * U. L and U are written back to original matrix A, - // and diagonal elements of L are discarded. - int* gpu_pivot_ptr = - reinterpret_cast(tmp_gpu_info_data->ptr()) + batch_size; - blas.BatchedGETRF(n, reinterpret_cast(tmp_gpu_ptrs_data->ptr()), - gpu_pivot_ptr, gpu_info_ptr, batch_size); - - blas.BatchedGETRI(n, - reinterpret_cast(tmp_gpu_ptrs_data->ptr()), - gpu_pivot_ptr, gpu_inv_ptrs, gpu_info_ptr, batch_size); - } - memory::Copy(platform::CPUPlace(), info.data(), context.GetPlace(), - gpu_info_ptr, sizeof(int) * batch_size, context.stream()); - for (int i = 0; i < batch_size; ++i) { - PADDLE_ENFORCE_EQ(info[i], 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: U(%d, %d) is zero, singular U. " - "Please check the matrix value and change it to a " - "non-singular matrix", - i, info[i], info[i])); - } -#else - compute_inverse_eigen(context, a, a_inv); -#endif - } -}; - -template class MatrixInverseFunctor; -template class MatrixInverseFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/matrix_power_op.h b/paddle/fluid/operators/matrix_power_op.h index d2c67d80b4f..8eb9c58513d 100644 --- a/paddle/fluid/operators/matrix_power_op.h +++ b/paddle/fluid/operators/matrix_power_op.h @@ -18,9 +18,9 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/matrix_inverse.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/matrix_inverse.h" namespace paddle { namespace operators { @@ -67,7 +67,7 @@ void MatrixPowerFunction(const Tensor* X, const int n, Tensor* Out, framework::TensorCopy(*X, ctx.GetPlace(), dev_ctx, &new_x); } else { // newX = X^{-1}, n = -n - math::MatrixInverseFunctor mat_inv; + phi::funcs::MatrixInverseFunctor mat_inv; mat_inv(dev_ctx, *X, &new_x); new_n = -n; } @@ -200,7 +200,7 @@ void MatrixPowerGradFunction(const Tensor* X, const Tensor* Out, framework::TensorCopy(*X, ctx.GetPlace(), dev_ctx, &new_x); } else { // newX = X^{-1}, n = -n - math::MatrixInverseFunctor mat_inv; + phi::funcs::MatrixInverseFunctor mat_inv; mat_inv(dev_ctx, *X, &new_x); new_n = -n; } diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index 02cba6009c4..f0fbb7bf084 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -9,3 +9,4 @@ math_library(gru_compute DEPS activation_functions math_function) math_library(lstm_compute DEPS activation_functions) math_library(concat_and_split_functor DEPS dense_tensor) math_library(matrix_reduce DEPS dense_tensor) +math_library(matrix_inverse DEPS dense_tensor eigen3 blas) diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cc b/paddle/phi/kernels/funcs/matrix_inverse.cc new file mode 100644 index 00000000000..c95e97f8ea8 --- /dev/null +++ b/paddle/phi/kernels/funcs/matrix_inverse.cc @@ -0,0 +1,37 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/funcs/matrix_inverse.h" + +#include "paddle/phi/kernels/funcs/blas/blas.h" + +namespace phi { +namespace funcs { + +template +void MatrixInverseFunctor::operator()(const Context& dev_ctx, + const DenseTensor& a, + DenseTensor* a_inv) { + ComputeInverseEigen(dev_ctx, a, a_inv); +} + +template class MatrixInverseFunctor; +template class MatrixInverseFunctor; + +// TODO(chenweihang): remove these instantiations later +template class MatrixInverseFunctor; +template class MatrixInverseFunctor; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu.cc b/paddle/phi/kernels/funcs/matrix_inverse.cu.cc new file mode 100644 index 00000000000..686b8405bf7 --- /dev/null +++ b/paddle/phi/kernels/funcs/matrix_inverse.cu.cc @@ -0,0 +1,141 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/funcs/matrix_inverse.h" + +#include "paddle/phi/kernels/funcs/blas/blas.h" + +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/memory/memcpy.h" + +namespace phi { +namespace funcs { + +template +void MatrixInverseFunctor::operator()(const Context& dev_ctx, + const DenseTensor& a, + DenseTensor* a_inv) { +#ifndef PADDLE_WITH_HIP + const auto& mat_dims = a.dims(); + const int rank = mat_dims.size(); + int n = mat_dims[rank - 1]; + int batch_size = rank > 2 ? a.numel() / (n * n) : 1; + + paddle::memory::allocation::AllocationPtr tmp_gpu_mat_data; + const T* gpu_mat = a.data(); + if (n >= 32) { + // Copy all elements of input matrix A to a temporary memory space to + // avoid being overriden by getrf. + tmp_gpu_mat_data = paddle::memory::Alloc(dev_ctx, a.numel() * sizeof(T)); + paddle::memory::Copy(dev_ctx.GetPlace(), + tmp_gpu_mat_data->ptr(), + dev_ctx.GetPlace(), + a.data(), + a.numel() * sizeof(T), + dev_ctx.stream()); + gpu_mat = reinterpret_cast(tmp_gpu_mat_data->ptr()); + } + + std::vector cpu_ptrs(batch_size * 2); + for (int i = 0; i < batch_size; ++i) { + cpu_ptrs[i] = gpu_mat + i * n * n; + cpu_ptrs[i + batch_size] = a_inv->data() + i * n * n; + } + + // Copy the addresses of A and A_inv from host to device. + paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data = + paddle::memory::Alloc(dev_ctx, cpu_ptrs.size() * sizeof(T*)); + paddle::memory::Copy(dev_ctx.GetPlace(), + tmp_gpu_ptrs_data->ptr(), + phi::CPUPlace(), + static_cast(cpu_ptrs.data()), + cpu_ptrs.size() * sizeof(T*), + dev_ctx.stream()); + T** gpu_inv_ptrs = + reinterpret_cast(tmp_gpu_ptrs_data->ptr()) + batch_size; + + // Allocate device memory for info and pivots. + int num_ints = n < 32 ? batch_size : batch_size * (n + 1); + paddle::memory::allocation::AllocationPtr tmp_gpu_info_data = + paddle::memory::Alloc(dev_ctx, num_ints * sizeof(int)); + int* gpu_info_ptr = reinterpret_cast(tmp_gpu_info_data->ptr()); + + auto blas = phi::funcs::GetBlas(dev_ctx); + + std::vector info; // only for singular checking + info.resize(batch_size); + // This functions in cuBLAS is intended to be used for matrices of small + // sizes where the launch overhead is a significant factor. + // TODO(Xreki): call function in cusolver for large matrices. + if (n < 32) { + // cublasmatinvBatched is a short cut of cublasgetrfBatched + // plus cublasgetriBatched. + // However it only works if N is less than 32. If not, we need to + // go through cublasgetrfBatched and cublasgetriBatched. + blas.BatchedMatInv(n, + reinterpret_cast(tmp_gpu_ptrs_data->ptr()), + gpu_inv_ptrs, + gpu_info_ptr, + batch_size); + } else { + // This function performs the LU factorization of each matrix A by the + // equation P * A = L * U. L and U are written back to original matrix A, + // and diagonal elements of L are discarded. 
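// Editor's note (descriptive comment, not part of the patch): on the n >= 32 path the
// single integer buffer allocated above is laid out as
//   [ info : batch_size ints | pivots : n * batch_size ints ]
// which is why num_ints is batch_size * (n + 1) and why the pivot pointer computed just
// below starts batch_size ints past gpu_info_ptr. The info entries are copied back to
// the host further down and checked so that a singular factorization raises an error.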
+ int* gpu_pivot_ptr = + reinterpret_cast(tmp_gpu_info_data->ptr()) + batch_size; + blas.BatchedGETRF(n, + reinterpret_cast(tmp_gpu_ptrs_data->ptr()), + gpu_pivot_ptr, + gpu_info_ptr, + batch_size); + + blas.BatchedGETRI(n, + reinterpret_cast(tmp_gpu_ptrs_data->ptr()), + gpu_pivot_ptr, + gpu_inv_ptrs, + gpu_info_ptr, + batch_size); + } + paddle::memory::Copy(phi::CPUPlace(), + info.data(), + dev_ctx.GetPlace(), + gpu_info_ptr, + sizeof(int) * batch_size, + dev_ctx.stream()); + for (int i = 0; i < batch_size; ++i) { + PADDLE_ENFORCE_EQ(info[i], + 0, + phi::errors::PreconditionNotMet( + "For batch [%d]: U(%d, %d) is zero, singular U. " + "Please check the matrix value and change it to a " + "non-singular matrix", + i, + info[i], + info[i])); + } +#else + ComputeInverseEigen(dev_ctx, a, a_inv); +#endif +} + +template class MatrixInverseFunctor; +template class MatrixInverseFunctor; + +// TODO(chenweihang): remove these instantiations later +template class MatrixInverseFunctor; +template class MatrixInverseFunctor; + +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/matrix_inverse.h b/paddle/phi/kernels/funcs/matrix_inverse.h similarity index 61% rename from paddle/fluid/operators/math/matrix_inverse.h rename to paddle/phi/kernels/funcs/matrix_inverse.h index fb58b483666..c5b04a81065 100644 --- a/paddle/fluid/operators/math/matrix_inverse.h +++ b/paddle/phi/kernels/funcs/matrix_inverse.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,17 +17,18 @@ limitations under the License. */ #include #include "Eigen/Core" #include "Eigen/LU" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/device_context.h" -namespace paddle { -namespace operators { -namespace math { +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" -template -void compute_inverse_eigen(const DeviceContext& context, - const framework::Tensor& a, - framework::Tensor* a_inv) { +namespace phi { +namespace funcs { + +template +void ComputeInverseEigen(const Context& dev_ctx, + const DenseTensor& a, + DenseTensor* a_inv) { using Matrix = Eigen::Matrix; using EigenMatrixMap = Eigen::Map; @@ -38,7 +39,7 @@ void compute_inverse_eigen(const DeviceContext& context, int batch_size = rank > 2 ? 
a.numel() / (n * n) : 1; const T* a_ptr = a.data(); - T* a_inv_ptr = a_inv->mutable_data(context.GetPlace()); + T* a_inv_ptr = a_inv->mutable_data(dev_ctx.GetPlace()); for (int i = 0; i < batch_size; ++i) { ConstEigenMatrixMap mat(a_ptr + i * n * n, n, n); @@ -47,20 +48,20 @@ void compute_inverse_eigen(const DeviceContext& context, lu.compute(mat); const T min_abs_pivot = lu.matrixLU().diagonal().cwiseAbs().minCoeff(); - PADDLE_ENFORCE_GT( - min_abs_pivot, static_cast(0), - platform::errors::InvalidArgument("Input is not invertible.")); + PADDLE_ENFORCE_GT(min_abs_pivot, + static_cast(0), + errors::InvalidArgument("Input is not invertible.")); mat_inv.noalias() = lu.inverse(); } } -template +template class MatrixInverseFunctor { public: - void operator()(const DeviceContext& context, const framework::Tensor& a, - framework::Tensor* a_inv); + void operator()(const Context& dev_ctx, + const DenseTensor& a, + DenseTensor* a_inv); }; -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi -- GitLab From 73583f862b7ac88328b201e5ac8d22bc4c122078 Mon Sep 17 00:00:00 2001 From: lilong12 Date: Tue, 8 Mar 2022 16:04:05 +0800 Subject: [PATCH 112/261] add the implementation of process group for hccl (#40228) * add pg_hccl --- .../distributed/collective/CMakeLists.txt | 3 + .../fluid/distributed/collective/HCCLTools.h | 174 +++++++++ .../collective/ProcessGroupHCCL.cc | 356 ++++++++++++++++++ .../distributed/collective/ProcessGroupHCCL.h | 152 ++++++++ .../fluid/platform/device/npu/hccl_helper.h | 17 + paddle/fluid/pybind/CMakeLists.txt | 3 + paddle/fluid/pybind/distributed_py.cc | 12 + .../tests/unittests/npu/process_group_hccl.py | 249 ++++++++++++ .../npu/test_collective_process_group_hccl.py | 29 ++ 9 files changed, 995 insertions(+) create mode 100644 paddle/fluid/distributed/collective/HCCLTools.h create mode 100644 paddle/fluid/distributed/collective/ProcessGroupHCCL.cc create mode 100644 paddle/fluid/distributed/collective/ProcessGroupHCCL.h create mode 100644 python/paddle/fluid/tests/unittests/npu/process_group_hccl.py create mode 100644 python/paddle/fluid/tests/unittests/npu/test_collective_process_group_hccl.py diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index 96bc4a710f8..f88c993d85e 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -7,3 +7,6 @@ cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup) if(WITH_NCCL) cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) endif() +if(WITH_ASCEND_CL) + cc_library(processgroup_hccl SRCS ProcessGroupHCCL.cc DEPS place npu_stream enforce collective_helper device_context phi phi_api eager_api) +endif() diff --git a/paddle/fluid/distributed/collective/HCCLTools.h b/paddle/fluid/distributed/collective/HCCLTools.h new file mode 100644 index 00000000000..09789bd4d37 --- /dev/null +++ b/paddle/fluid/distributed/collective/HCCLTools.h @@ -0,0 +1,174 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
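// Editor's note (annotation, not part of the patch): HCCLTools.h, whose contents follow,
// bundles two small RAII helpers used by the new process group. NPUEventManager lazily
// creates an aclrtEvent on first Record() and can Query() it or Block() an NPU stream on
// it; HCCLCommManager owns an HcclComm created from a shared HcclRootInfo and calls
// HcclCommDestroy when it goes out of scope.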
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "boost/variant.hpp" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/npu/enforce_npu.h" +#include "paddle/fluid/platform/device/npu/npu_info.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +class NPUEventManager { + public: + NPUEventManager() = default; + + ~NPUEventManager() { + if (is_created_) { + platform::NPUDeviceGuard guard(device_index_); + platform::NPUEventDestroy(event_); + } + } + + NPUEventManager(const NPUEventManager&) = delete; + NPUEventManager& operator=(const NPUEventManager&) = delete; + + NPUEventManager(NPUEventManager&& other) { + std::swap(is_created_, other.is_created_); + std::swap(device_index_, other.device_index_); + std::swap(event_, other.event_); + } + + NPUEventManager& operator=(NPUEventManager&& other) { + std::swap(is_created_, other.is_created_); + std::swap(device_index_, other.device_index_); + std::swap(event_, other.event_); + return *this; + } + + bool IsCreated() const { return is_created_; } + bool DeviceId() const { return device_index_; } + aclrtEvent GetRawNPUEvent() const { return event_; } + + void Record(const paddle::platform::NPUDeviceContext& ctx) { + auto device_index = ctx.GetPlace().device; + if (!is_created_) { + CreateEvent(device_index); + } + PADDLE_ENFORCE_EQ(device_index, device_index_, + platform::errors::PreconditionNotMet( + "NPUDeviceContext's device %d does not match" + "Event's device %d", + device_index, device_index_)); + + platform::NPUDeviceGuard guard(device_index_); + platform::NPUEventRecord(event_, ctx.stream()); + } + + bool Query() const { + aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE; + platform::NPUEventQuery(event_, &status); + if (status == ACL_EVENT_STATUS_COMPLETE) { + return true; + } + return false; + } + + void Block(const paddle::platform::NPUDeviceContext& ctx) const { + if (is_created_) { + auto device_index = ctx.GetPlace().device; + PADDLE_ENFORCE_EQ(device_index, device_index_, + platform::errors::PreconditionNotMet( + "CUDADeviceContext's device %d does not match" + "Event's device %d", + device_index, device_index_)); + platform::NPUDeviceGuard guard(device_index_); + platform::NPUStreamWaitEvent(ctx.stream(), event_); + } + } + + private: + bool is_created_{false}; + aclrtEvent event_{}; + int8_t device_index_{0}; + + private: + void CreateEvent(int device_index) { + device_index_ = device_index; + platform::NPUDeviceGuard guard(device_index); + platform::NPUEventCreate(&event_); + is_created_ = true; + } +}; + +class HCCLCommManager { + public: + explicit HCCLCommManager(HcclComm hcclComm) : hccl_comm_(hcclComm) {} + + HCCLCommManager() : HCCLCommManager(nullptr) {} + + ~HCCLCommManager() noexcept { + std::unique_lock lock(mutex_); + if (hccl_comm_) { + platform::dynload::HcclCommDestroy(hccl_comm_); + } + } + + static std::shared_ptr Create(int num_ranks, int rank, + HcclRootInfo* 
comm_id, + HcclComm hccl_comm) { + auto hccl_manager = std::make_shared(); + auto ret = platform::dynload::HcclCommInitRootInfo(num_ranks, comm_id, rank, + &hccl_comm); + using __NPU_STATUS_TYPE__ = decltype(ret); + constexpr auto __success_type__ = + platform::details::NPUStatusType<__NPU_STATUS_TYPE__>::kSuccess; + if (UNLIKELY(ret != __success_type__)) { + VLOG(0) << "Error: create hccl_id error."; + exit(-1); + } + + hccl_manager->hccl_id_ = comm_id; + hccl_manager->rank_ = rank; + hccl_manager->hccl_comm_ = hccl_comm; + return hccl_manager; + } + + HcclRootInfo* GetHcclId() const { + std::unique_lock lock(mutex_); + return hccl_id_; + } + + HcclComm GetHcclComm() const { + std::unique_lock lock(mutex_); + return hccl_comm_; + } + + HCCLCommManager(const HCCLCommManager&) = delete; + HCCLCommManager& operator=(const HCCLCommManager&) = delete; + HCCLCommManager& operator=(HCCLCommManager&& other) = delete; + + HCCLCommManager(HCCLCommManager&& other) { + std::unique_lock lock(other.mutex_); + std::swap(hccl_comm_, other.hccl_comm_); + } + + protected: + HcclComm hccl_comm_; + HcclRootInfo* hccl_id_; + int rank_; + mutable std::mutex mutex_; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc new file mode 100644 index 00000000000..84f5ca48d25 --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc @@ -0,0 +1,356 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/platform/device/npu/hccl_helper.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/api/include/api.h" +#include "paddle/phi/common/place.h" + +DECLARE_bool(hccl_blocking_wait); +// DECLARE_bool(use_stream_safe_npu_allocator); + +constexpr int64_t kWaitBlockTImeout = 10; + +namespace paddle { +namespace distributed { + +static HcclReduceOp ToHCCLRedType(ReduceOp reduction) { + static const std::map red_type = { + {ReduceOp::MIN, HCCL_REDUCE_MIN}, + {ReduceOp::MAX, HCCL_REDUCE_MAX}, + {ReduceOp::SUM, HCCL_REDUCE_SUM}, + {ReduceOp::PRODUCT, HCCL_REDUCE_PROD}, + }; + auto it = red_type.find(reduction); + PADDLE_ENFORCE_EQ( + it != red_type.end(), true, + platform::errors::InvalidArgument("Invalid hccl reduction. 
" + "Must be Min | Max | Prod | Sum")); + return it->second; +} + +std::string SerializeHCCLUniqueId(const HcclRootInfo& hcclID) { + const uint8_t* bytes = reinterpret_cast(&hcclID); + std::ostringstream oss; + for (size_t i = 0; i < sizeof(hcclID); ++i) { + oss << std::hex << static_cast(bytes[i]); + } + return oss.str(); +} + +// Get the list of devices from list of tensors +std::vector GetPlaceList(const std::vector& tensors) { + std::vector places; + places.reserve(tensors.size()); + for (auto& tensor : tensors) { + places.push_back(tensor.inner_place()); + } + return places; +} + +// Get the deviceList String from the list of devices +std::string GetKeyFromPlaces(const std::vector& places) { + std::string placeList; + for (auto& place : places) { + std::stringstream tmp; + tmp << place; + if (placeList.empty()) { + placeList += tmp.str(); + } else { + placeList += "," + tmp.str(); + } + } + return placeList; +} + +// bool CheckTensorsInNPUPlace(const std::vector& tensors) { +// return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) { +// return t.place() == platform::DeviceType::NPU; +// }); +// } + +void SyncDefaultStream( + const std::vector& places, + std::vector& hcclEvents, // NOLINT + std::vector>& dev_ctx) { // NOLINT + for (size_t i = 0; i < places.size(); ++i) { + auto* default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(places[i])); + hcclEvents[i].Record(*dev_ctx[i]); + hcclEvents[i].Block(*default_ctx); + } +} + +std::shared_ptr ProcessGroupHCCL::CreateTask( + std::vector places, int rank, CommType comm_type, + const std::vector& inputs) { + return std::make_shared(places, rank, comm_type, + inputs); +} + +ProcessGroupHCCL::HCCLTask::HCCLTask(const std::vector& places, int rank, + CommType CommType, + const std::vector& inputs) + : Task(rank, inputs, CommType), places_(places) { + control_events_.resize(places.size()); + hcclComms_.resize(places.size()); +} + +ProcessGroupHCCL::HCCLTask::~HCCLTask() {} + +void ProcessGroupHCCL::HCCLTask::SetOutputs( + std::vector& outputs) { // NOLINT + outputs_ = std::make_shared>(outputs); +} + +void ProcessGroupHCCL::HCCLTask::SynchronizeStreams() { + for (size_t i = 0; i < places_.size(); ++i) { + auto* default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(places_[i])); + platform::NPUStreamWaitEvent(default_ctx->stream(), + control_events_[i].GetRawNPUEvent()); + } +} + +bool ProcessGroupHCCL::HCCLTask::IsCompleted() { + for (size_t i = 0; i < places_.size(); ++i) { + if (!control_events_[i].Query()) { + return false; + } + } + + return true; +} + +// TODO(sandyhouse): Add timeout for wait, now timeout unused +bool ProcessGroupHCCL::HCCLTask::Wait(std::chrono::milliseconds timeout) { + SynchronizeStreams(); + if (FLAGS_hccl_blocking_wait) { + // NOTE(sandyhouse): It will block host for sync + while (!IsCompleted()) { + std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout)); + } + } + return true; +} + +// Same as Wait +void ProcessGroupHCCL::HCCLTask::Synchronize() { Wait(kWaitTimeout); } + +ProcessGroupHCCL::ProcessGroupHCCL(const std::shared_ptr& store, + int rank, int size) + : ProcessGroup(rank, size), store_(store) {} + +void ProcessGroupHCCL::BroadcastUniqueHCCLID( + std::vector& hccl_ids) { // NOLINT + if (rank_ == 0) { + for (size_t i = 0; i < hccl_ids.size(); i++) { + auto key = "ProcessGroupHCCL/hccl_ids/" + std::to_string(i); + auto hccl_id = std::vector( + reinterpret_cast(&hccl_ids[i]), + reinterpret_cast(&hccl_ids[i]) + sizeof(HcclRootInfo)); + 
store_->set(key, hccl_id); + } + } else { + for (size_t i = 0; i < hccl_ids.size(); i++) { + auto key = "ProcessGroupHCCL/hccl_ids/" + std::to_string(i); + auto ret = store_->get(key); + std::memcpy(&hccl_ids[i], ret.data(), ret.size()); + } + } +} + +// create HCCLManager cache for places_key +void ProcessGroupHCCL::CreateHCCLManagerCache( + const std::string& places_key, const std::vector& places) { + PADDLE_ENFORCE_EQ(places_key.empty(), false, + platform::errors::PreconditionNotMet( + "Not able to create/get the HCCL Communicator since " + "the NPU place are not known")); + + std::vector> hccl_comms; + hccl_comms.resize(places.size()); + + // using vector just for broadcast + std::vector hccl_ids; + hccl_ids.resize(1); + auto& hccl_id = hccl_ids.front(); + + if (rank_ == 0) { + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(&hccl_id)); + } + BroadcastUniqueHCCLID(hccl_ids); + + VLOG(3) << "init hccl rank: " << rank_ << ", nranks: " << size_ + << ", place: " << places_key + << ", hccl uniqueid: " << SerializeHCCLUniqueId(hccl_id); + + std::vector> dev_ctx; + dev_ctx.resize(places.size()); + + std::unique_ptr comms(new HcclComm[places.size()]); + for (size_t i = 0; i < places.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + hccl_comms[i] = HCCLCommManager::Create(GetSize(), GetRank(), &hccl_id, + comms.get() + i); + dev_ctx[i].reset(new NPUDeviceContext(places[i])); + } + + std::vector events; + events.resize(places.size()); + + // These caches will be useful to process sync/wait/communicate + places_to_events_.emplace(places_key, std::move(events)); + places_to_hcclcomm_.emplace(places_key, std::move(hccl_comms)); + places_to_ctx_.emplace(places_key, std::move(dev_ctx)); +} + +template +std::shared_ptr ProcessGroupHCCL::Collective( + std::vector& inputs, std::vector& outputs, Fn fn, + CommType op_type) { + const auto places = GetPlaceList(inputs); + const auto key = GetKeyFromPlaces(places); + + { + std::lock_guard lock(mutex_); + if (places_to_hcclcomm_.find(key) == places_to_hcclcomm_.end()) { + CreateHCCLManagerCache(key, places); + } + } + + auto& hccl_comms = places_to_hcclcomm_[key]; + + SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + + auto task = CreateTask(places, rank_, op_type, inputs); + task->SetOutputs(outputs); + + // if (FLAGS_use_stream_safe_npu_allocator) { + // for (size_t i = 0; i < inputs.size(); ++i) { + // platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + // auto dense_tensor = + // std::dynamic_pointer_cast(inputs[i].impl()); + // memory::RecordStream(dense_tensor->Holder(), + // places_to_ctx_[key][i]->stream()); + // } + // } + + for (size_t i = 0; i < inputs.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + const auto& hccl_stream = places_to_ctx_[key][i]->stream(); + fn(inputs[i], outputs[i], hccl_comms[i]->GetHcclComm(), hccl_stream); + } + + for (size_t i = 0; i < inputs.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + task->control_events_[i].Record(*places_to_ctx_[key][i]); + } + return task; +} + +template +std::shared_ptr ProcessGroupHCCL::PointToPoint( + std::vector& tensors, Fn fn, int dst_rank, CommType op_type) { + const auto places = GetPlaceList(tensors); + const auto key = GetKeyFromPlaces(places); + + { + std::lock_guard lock(mutex_); + if (places_to_hcclcomm_.find(key) == places_to_hcclcomm_.end()) { + CreateHCCLManagerCache(key, places); + } + } + + auto& hccl_comms = places_to_hcclcomm_[key]; + + 
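// Editor's note (descriptive comment, not part of the patch): a short recap of the
// machinery above, which this point-to-point path reuses. BroadcastUniqueHCCLID lets
// rank 0 publish the raw HcclRootInfo bytes to the shared store under a
// per-communicator key while every other rank reads the same key back, so all ranks
// initialize HCCL from identical root info. CreateHCCLManagerCache then builds, per
// device in the place list, one HCCLCommManager, one dedicated NPUDeviceContext and one
// NPUEventManager, cached under the place-key string; both Collective and this
// PointToPoint template look those caches up instead of re-creating communicators.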
SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + + auto task = CreateTask(places, rank_, op_type, tensors); + + // construct uninitialize guard for device + + // if (FLAGS_use_stream_safe_npu_allocator) { + // for (size_t i = 0; i < tensors.size(); ++i) { + // platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + // auto dense_tensor = + // std::dynamic_pointer_cast(tensors[i].impl()); + // memory::RecordStream(dense_tensor->Holder(), + // places_to_ctx_[key][i]->stream()); + // } + // } + + for (size_t i = 0; i < tensors.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + const auto& hccl_stream = places_to_ctx_[key][i]->stream(); + fn(tensors[i], hccl_comms[i]->GetHcclComm(), hccl_stream, dst_rank); + } + + for (size_t i = 0; i < tensors.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + task->control_events_[i].Record(*places_to_ctx_[key][i]); + } + return task; +} + +std::shared_ptr ProcessGroupHCCL::AllReduce( + std::vector& tensors, const AllreduceOptions& opts) { + // PADDLE_ENFORCE_EQ( + // CheckTensorsInNPUPlace(tensors), true, + // platform::errors::InvalidArgument("All inputs should be in + // NPUPlace.")); + return Collective( + tensors, tensors, + [&](const Tensor& input, Tensor& output, HcclComm comm, + const aclrtStream& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::HcclAllReduce( + input_tensor->data(), output_tensor->data(), input_tensor->numel(), + platform::ToHCCLDataType(input.type()), + ToHCCLRedType(opts.reduce_op), comm, stream); + }, + CommType::ALLREDUCE); +} + +std::shared_ptr ProcessGroupHCCL::Broadcast( + std::vector& tensors, const BroadcastOptions& opts) { + // PADDLE_ENFORCE_EQ( + // CheckTensorsInNPUPlace(tensors), true, + // platform::errors::InvalidArgument("All inputs should be in + // CudaPlace.")); + + return Collective( + tensors, tensors, + [&](Tensor& input, Tensor& output, HcclComm comm, + const aclrtStream& stream) { + const auto root = opts.source_rank * tensors.size() + opts.source_root; + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::HcclBroadcast( + input_tensor->data(), input_tensor->numel(), + platform::ToHCCLDataType(input.type()), root, comm, stream); + }, + CommType::BROADCAST); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h new file mode 100644 index 00000000000..f2376b4eed7 --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h @@ -0,0 +1,152 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
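// Editor's note (annotation, not part of the patch): a minimal usage sketch for the
// backend implemented above, mirroring what the Python test added later in this patch
// exercises through pybind. Everything here is illustrative: the store is assumed to be
// a TCPStore shared by all ranks, the tensor type is assumed to be
// paddle::experimental::Tensor placed on this rank's NPU, the template arguments
// stripped by this dump (e.g. std::shared_ptr<Store>) are restored from context, and the
// includes of ProcessGroupHCCL.h are assumed to be in scope.
void AllReduceOnce(const std::shared_ptr<paddle::distributed::Store>& store,
                   int rank, int nranks,
                   paddle::experimental::Tensor npu_tensor) {
  auto pg = std::make_shared<paddle::distributed::ProcessGroupHCCL>(store, rank, nranks);
  std::vector<paddle::experimental::Tensor> tensors = {npu_tensor};
  auto task = pg->AllReduce(tensors);  // default AllreduceOptions; the test expects a SUM
  task->Wait();                        // or task->Synchronize(), which just calls Wait()
}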
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/platform/device/npu/npu_stream.h" +#include "paddle/fluid/platform/device_context.h" + +#include "paddle/fluid/distributed/collective/HCCLTools.h" +#include "paddle/fluid/distributed/store/store.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" +#include "paddle/fluid/platform/place.h" + +constexpr const char* HCCL_BACKEND_NAME = "HCCL"; + +namespace paddle { +namespace distributed { + +using Place = paddle::platform::Place; +using NPUStream = platform::stream::NPUStream; +using NPUDeviceContext = paddle::platform::NPUDeviceContext; + +class ProcessGroupHCCL : public ProcessGroup { + public: + class HCCLTask : public ProcessGroup::Task, + public std::enable_shared_from_this { + public: + HCCLTask(const std::vector& places, int rank, CommType CommType, + const std::vector& inputs); + + bool IsCompleted(); + + void SynchronizeStreams(); + + bool Wait(std::chrono::milliseconds timeout = kWaitTimeout); + + void Synchronize(); + + void SetOutputs(std::vector& outputs); // NOLINT + + virtual ~HCCLTask(); + + std::vector control_events_; + + protected: + std::vector places_; + std::vector> hcclComms_; + std::shared_ptr> outputs_; + + private: + }; + + ProcessGroupHCCL(const std::shared_ptr& store, int rank, int size); + + const std::string GetBackendName() const override { + return std::string(HCCL_BACKEND_NAME); + } + + std::shared_ptr AllReduce( + std::vector& tensors, + const AllreduceOptions& = AllreduceOptions()) override; + + std::shared_ptr Broadcast( + std::vector& tensors, + const BroadcastOptions& = BroadcastOptions()) override; + + std::shared_ptr Barrier( + const BarrierOptions& = BarrierOptions()) override; + + std::shared_ptr Send(std::vector& tensors, + int dst_rank) override; + + std::shared_ptr Recv(std::vector& tensors, + int src_rank) override; + + std::shared_ptr AllGather( + std::vector& in_tensors, + std::vector& out_tensors) override; + + std::shared_ptr AllToAll( + std::vector& in, std::vector& out) override; + + std::shared_ptr Reduce( + std::vector& tensors, const ReduceOptions& opts) override; + + std::shared_ptr Scatter(std::vector& in_tensors, + std::vector& out_tensors, + const ScatterOptions&) override; + + protected: + virtual std::shared_ptr CreateTask( + std::vector places, int rank, CommType opType, + const std::vector& inputs); + + std::shared_ptr store_; + std::shared_ptr hccl_comm_; + std::mutex mutex_; + std::unordered_map>> + places_to_hcclcomm_; + + std::unordered_map> + places_to_events_; + + std::unordered_map>> + places_to_ctx_; + + std::set used_place_ids_; + + private: + void BcastHCCLId(std::vector& hccl_ids, int root, // NOLINT + int server_fd); + + void BroadcastUniqueHCCLID(std::vector& hccl_ids); // NOLINT + + template + std::shared_ptr Collective( + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT + Fn fn, CommType op_type); + + template + std::shared_ptr PointToPoint( + std::vector& tensors, // NOLINT + Fn fn, int dst_rank, CommType op_type); + + void CreateHCCLManagerCache(const std::string& places_key, + const std::vector& places); +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/platform/device/npu/hccl_helper.h b/paddle/fluid/platform/device/npu/hccl_helper.h index efbc56bee72..134ec04030d 100644 --- a/paddle/fluid/platform/device/npu/hccl_helper.h +++ 
b/paddle/fluid/platform/device/npu/hccl_helper.h @@ -53,6 +53,23 @@ inline HcclDataType ToHCCLDataType(framework::proto::VarType::Type type) { } } +inline HcclDataType ToHCCLDataType(experimental::DataType type) { + if (type == experimental::DataType::FLOAT32) { + return HCCL_DATA_TYPE_FP32; + } else if (type == experimental::DataType::FLOAT16) { + return HCCL_DATA_TYPE_FP16; + } else if (type == experimental::DataType::INT64) { + return HCCL_DATA_TYPE_INT64; + } else if (type == experimental::DataType::INT32) { + return HCCL_DATA_TYPE_INT32; + } else if (type == experimental::DataType::INT8) { + return HCCL_DATA_TYPE_INT8; + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "This datatype in hccl is not supported.")); + } +} + // NOTE(minqiyang): according to the ncclGroupEnd documentations: // https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html, // ncclGroupEnd will wait for all communicators to be initialized, which will diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 7ff501ef43d..f40cd51a7b2 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -88,6 +88,9 @@ if(NOT ON_INFER) if (WITH_GLOO) set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_gloo) endif() + if(WITH_ASCEND) + set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_hccl) + endif() set(PYBIND_SRCS ${PYBIND_SRCS} distributed_py.cc) endif() diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 9870eab8da9..0b179670381 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -35,6 +35,10 @@ limitations under the License. */ #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" #endif +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h" +#endif + #if defined(PADDLE_WITH_GLOO) #include "paddle/fluid/distributed/collective/ProcessGroupGloo.h" #include "paddle/fluid/distributed/store/tcp_store.h" @@ -201,6 +205,14 @@ void BindDistributed(py::module *m) { py::call_guard()); #endif +#if defined(PADDLE_WITH_ASCEND_CL) + py::class_>( + *m, "ProcessGroupHCCL", ProcessGroup) + .def(py::init &, int, int>(), + py::call_guard()); +#endif + py::class_>(*m, "task") .def("is_completed", &distributed::ProcessGroup::Task::IsCompleted) diff --git a/python/paddle/fluid/tests/unittests/npu/process_group_hccl.py b/python/paddle/fluid/tests/unittests/npu/process_group_hccl.py new file mode 100644 index 00000000000..37a24885be1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/process_group_hccl.py @@ -0,0 +1,249 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
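# Editor's note (annotation, not part of the patch): the script added here,
# process_group_hccl.py, is the per-process worker rather than a test meant to be run
# standalone. The companion test_collective_process_group_hccl.py added at the end of
# this patch launches it across two devices through the existing
# TestMultipleGpus.run_mnist_2gpu helper.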
+ +from __future__ import print_function + +import unittest +import random +import numpy as np +import os +import shutil + +import paddle +from paddle.fluid import core +from datetime import timedelta +import paddle.fluid.core as core +from paddle.fluid.framework import _test_eager_guard +from paddle.fluid.dygraph.parallel import ParallelEnv + + +def init_process_group(strategy=None): + nranks = ParallelEnv().nranks + rank = ParallelEnv().local_rank + is_master = True if rank == 0 else False + store = paddle.fluid.core.TCPStore("127.0.0.1", 6173, is_master, nranks) + pg_group = core.ProcessGroupHCCL(store, rank, nranks) + + return pg_group + + +class TestProcessGroupFp32(unittest.TestCase): + def setUp(self): + paddle.seed(2022) + random.seed(2022) + np.random.seed(2022) + self.config() + + def config(self): + self.dtype = "float32" + self.shape = (2, 10, 5) + + def test_create_process_group_nccl(self): + with _test_eager_guard(): + paddle.set_device('npu:%d' % + paddle.distributed.ParallelEnv().dev_id) + + pg = init_process_group() + + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + sum_result = tensor_x + tensor_y + if pg.rank() == 0: + task = pg.allreduce(tensor_x) + task.wait() + assert np.array_equal(tensor_x, sum_result) + else: + task = pg.allreduce(tensor_y) + task.wait() + assert np.array_equal(tensor_y, sum_result) + + print("test allreduce sum api ok") + + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + max_result = paddle.maximum(tensor_x, tensor_y) + + if pg.rank() == 0: + task = pg.allreduce(tensor_x, core.ReduceOp.MAX) + task.wait() + assert np.array_equal(tensor_x, max_result) + else: + task = pg.allreduce(tensor_y, core.ReduceOp.MAX) + task.wait() + assert np.array_equal(tensor_y, max_result) + + print("test allreduce max api ok") + + # test broadcast + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + broadcast_result = paddle.assign(tensor_x) + if pg.rank() == 0: + task = pg.broadcast(tensor_x, 0) + task.synchronize() + paddle.device.cuda.synchronize() + assert task.is_completed() + assert np.array_equal(broadcast_result, tensor_x) + else: + task = pg.broadcast(tensor_y, 0) + task.synchronize() + paddle.device.cuda.synchronize() + assert task.is_completed() + assert np.array_equal(broadcast_result, tensor_y) + + print("test broadcast api ok") + + # test barrier + # rank 0 + if pg.rank() == 0: + task = pg.barrier() + task.wait() + # rank 1 + else: + task = pg.barrier() + task.wait() + + print("test barrier api ok\n") + exit(0) + + # test allgather + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + out_shape = list(self.shape) + out_shape[0] *= 2 + out = np.random.random(out_shape).astype(self.dtype) + tensor_out = paddle.to_tensor(out) + if pg.rank() == 0: + task = pg.all_gather(tensor_x, tensor_out) + task.wait() + paddle.device.cuda.synchronize() + # rank 1 + else: + task = pg.all_gather(tensor_y, tensor_out) + task.wait() + paddle.device.cuda.synchronize() + out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2]) + out_2 = 
paddle.slice(tensor_out, [0], [out_shape[0] // 2], + [out_shape[0]]) + assert np.array_equal(tensor_x, out_1) + assert np.array_equal(tensor_y, out_2) + print("test allgather api ok\n") + + # test alltoall + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + out1 = np.random.random(self.shape).astype(self.dtype) + out2 = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + tensor_out1 = paddle.to_tensor(out1) + tensor_out2 = paddle.to_tensor(out2) + raw_tensor_x_2 = paddle.slice(tensor_x, [0], [self.shape[0] // 2], + [self.shape[0]]) + raw_tensor_y_1 = paddle.slice(tensor_y, [0], [0], + [self.shape[0] // 2]) + if pg.rank() == 0: + task = pg.alltoall(tensor_x, tensor_out1) + task.wait() + paddle.device.cuda.synchronize() + # rank 1 + else: + task = pg.alltoall(tensor_y, tensor_out2) + task.wait() + paddle.device.cuda.synchronize() + out1_2 = paddle.slice(tensor_out1, [0], [self.shape[0] // 2], + [self.shape[0]]) + out2_1 = paddle.slice(tensor_out2, [0], [0], [self.shape[0] // 2]) + if pg.rank() == 0: + assert np.array_equal(out1_2.numpy(), raw_tensor_y_1.numpy()) + else: + assert np.array_equal(out2_1, raw_tensor_x_2) + print("test alltoall api ok\n") + + # test Reduce + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + sum_result = tensor_x + tensor_y + if pg.rank() == 0: + task = pg.reduce(tensor_x, 0) + task.wait() + paddle.device.cuda.synchronize() + # rank 1 + else: + task = pg.reduce(tensor_y, 0) + task.wait() + paddle.device.cuda.synchronize() + if pg.rank() == 0: + assert np.array_equal(tensor_x, sum_result) + print("test reduce sum api ok\n") + + # test Scatter + # rank 0 + in_shape = list(self.shape) + in_shape[0] *= 2 + x = np.random.random(in_shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + if pg.rank() == 0: + task = pg.scatter(tensor_x, tensor_y, 0) + task.wait() + paddle.device.cuda.synchronize() + # rank 1 + else: + task = pg.scatter(tensor_x, tensor_y, 0) + task.wait() + paddle.device.cuda.synchronize() + out1 = paddle.slice(tensor_x, [0], [0], [self.shape[0]]) + out2 = paddle.slice(tensor_x, [0], [self.shape[0]], + [self.shape[0] * 2]) + if pg.rank() == 0: + assert np.array_equal(tensor_y, out1) + else: + assert np.array_equal(tensor_y, out2) + print("test scatter api ok\n") + + +class TestProcessGroupFp16(TestProcessGroupFp32): + def setUp(self): + paddle.seed(2022) + random.seed(2022) + np.random.seed(2022) + self.config() + + def config(self): + self.dtype = "float16" + self.shape = (4, 20, 20) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_collective_process_group_hccl.py b/python/paddle/fluid/tests/unittests/npu/test_collective_process_group_hccl.py new file mode 100644 index 00000000000..9b2c6fae15e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_collective_process_group_hccl.py @@ -0,0 +1,29 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import sys +sys.path.append("..") +from test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestProcessGroup(TestMultipleGpus): + def test_process_group_nccl(self): + self.run_mnist_2gpu('process_group_hccl.py') + + +if __name__ == "__main__": + unittest.main() -- GitLab From 00566eade8749566763af7e782224f3fed68bbdf Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Tue, 8 Mar 2022 16:47:20 +0800 Subject: [PATCH 113/261] Add exception throw for norm_conv when platform is not supported (#40166) * Add throw for norm_conv when platform is not supported * fix format --- .../operators/fused/cudnn_norm_conv_test.cc | 42 ++++++++++++++++--- 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index b3792a176fa..a80f590aa49 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -405,8 +405,18 @@ TEST(CudnnNormConvFp16, K1S1) { CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.CheckForward(1e-3, true); - test.CheckBackward(1e-3, true); + platform::CUDADeviceContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); + + if (ctx->GetComputeCapability() <= 70) { + ASSERT_THROW(test.CheckForward(1e-3, true), + paddle::platform::EnforceNotMet); + ASSERT_THROW(test.CheckBackward(1e-3, true), + paddle::platform::EnforceNotMet); + } else { + ASSERT_NO_THROW(test.CheckForward(1e-3, true)); + ASSERT_NO_THROW(test.CheckBackward(1e-3, true)); + } } // test for fp16, kernel = 3, output_channels = input_channels @@ -421,8 +431,18 @@ TEST(CudnnNormConvFp16, K3S1) { CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.CheckForward(1e-3, true); - test.CheckBackward(1e-3, true); + platform::CUDADeviceContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); + + if (ctx->GetComputeCapability() <= 70) { + ASSERT_THROW(test.CheckForward(1e-3, true), + paddle::platform::EnforceNotMet); + ASSERT_THROW(test.CheckBackward(1e-3, true), + paddle::platform::EnforceNotMet); + } else { + ASSERT_NO_THROW(test.CheckForward(1e-3, true)); + ASSERT_NO_THROW(test.CheckBackward(1e-3, true)); + } } // test for fp16, kernel = 1, output_channels = input_channels * 4 @@ -437,8 +457,18 @@ TEST(CudnnNormConvFp16, K1S1O4) { CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.CheckForward(1e-3, true); - test.CheckBackward(1e-3, true); + platform::CUDADeviceContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); + + if (ctx->GetComputeCapability() <= 70) { + ASSERT_THROW(test.CheckForward(1e-3, true), + paddle::platform::EnforceNotMet); + ASSERT_THROW(test.CheckBackward(1e-3, true), + 
paddle::platform::EnforceNotMet); + } else { + ASSERT_NO_THROW(test.CheckForward(1e-3, true)); + ASSERT_NO_THROW(test.CheckBackward(1e-3, true)); + } } // test for fp16, kernel = 1, stride = 2, output_channels = input_channels * 4 -- GitLab From 48b4366c707ab570d7012e213d3eccef73ac40a4 Mon Sep 17 00:00:00 2001 From: Yang <3349368+m3ngyang@users.noreply.github.com> Date: Tue, 8 Mar 2022 16:51:44 +0800 Subject: [PATCH 114/261] [Phi] move ops: maxout/take_along_axis/put_along_axis (#39959) * [Phi] move put_along_axis/take_along_axis/maxout * use phi::Copy --- paddle/fluid/operators/math/maxouting.cc | 151 +++++++++--------- paddle/fluid/operators/math/maxouting.cu | 107 +++++++------ paddle/fluid/operators/math/maxouting.h | 2 +- paddle/fluid/operators/maxout_op.cc | 13 +- paddle/fluid/operators/maxout_op.cu.cc | 24 --- paddle/fluid/operators/maxout_op.h | 72 --------- paddle/fluid/operators/put_along_axis_op.cc | 16 +- paddle/fluid/operators/put_along_axis_op.cu | 134 ---------------- paddle/fluid/operators/put_along_axis_op.h | 124 -------------- paddle/fluid/operators/take_along_axis_op.cc | 16 +- paddle/fluid/operators/take_along_axis_op.cu | 97 ----------- paddle/fluid/operators/take_along_axis_op.h | 92 ----------- paddle/phi/kernels/CMakeLists.txt | 8 +- paddle/phi/kernels/cpu/maxout_grad_kernel.cc | 20 +++ paddle/phi/kernels/cpu/maxout_kernel.cc | 19 +++ .../kernels/cpu/put_along_axis_grad_kernel.cc | 83 ++++++++++ .../phi/kernels/cpu/put_along_axis_kernel.cc | 87 ++++++++++ .../cpu/take_along_axis_grad_kernel.cc | 71 ++++++++ .../phi/kernels/cpu/take_along_axis_kernel.cc | 60 +++++++ paddle/phi/kernels/gpu/maxout_grad_kernel.cu | 20 +++ paddle/phi/kernels/gpu/maxout_kernel.cu | 19 +++ .../kernels/gpu/put_along_axis_grad_kernel.cu | 79 +++++++++ .../phi/kernels/gpu/put_along_axis_kernel.cu | 86 ++++++++++ .../gpu/take_along_axis_grad_kernel.cu | 72 +++++++++ .../phi/kernels/gpu/take_along_axis_kernel.cu | 59 +++++++ .../kernels/impl/maxout_grad_kernel_impl.h | 45 ++++++ paddle/phi/kernels/impl/maxout_kernel_impl.h | 37 +++++ paddle/phi/kernels/maxout_grad_kernel.h | 30 ++++ paddle/phi/kernels/maxout_kernel.h | 28 ++++ .../phi/kernels/put_along_axis_grad_kernel.h | 33 ++++ paddle/phi/kernels/put_along_axis_kernel.h | 32 ++++ .../phi/kernels/take_along_axis_grad_kernel.h | 29 ++++ paddle/phi/kernels/take_along_axis_kernel.h | 28 ++++ paddle/phi/ops/compat/maxout_sig.cc | 33 ++++ paddle/phi/ops/compat/put_along_axis_sig.cc | 38 +++++ paddle/phi/ops/compat/take_along_axis_sig.cc | 37 +++++ 36 files changed, 1191 insertions(+), 710 deletions(-) delete mode 100644 paddle/fluid/operators/maxout_op.cu.cc delete mode 100644 paddle/fluid/operators/maxout_op.h delete mode 100644 paddle/fluid/operators/put_along_axis_op.cu delete mode 100644 paddle/fluid/operators/put_along_axis_op.h delete mode 100644 paddle/fluid/operators/take_along_axis_op.cu delete mode 100644 paddle/fluid/operators/take_along_axis_op.h create mode 100644 paddle/phi/kernels/cpu/maxout_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/maxout_kernel.cc create mode 100644 paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/put_along_axis_kernel.cc create mode 100644 paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/take_along_axis_kernel.cc create mode 100644 paddle/phi/kernels/gpu/maxout_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/maxout_kernel.cu create mode 100644 paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu 
create mode 100644 paddle/phi/kernels/gpu/put_along_axis_kernel.cu create mode 100644 paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/take_along_axis_kernel.cu create mode 100644 paddle/phi/kernels/impl/maxout_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/maxout_kernel_impl.h create mode 100644 paddle/phi/kernels/maxout_grad_kernel.h create mode 100644 paddle/phi/kernels/maxout_kernel.h create mode 100644 paddle/phi/kernels/put_along_axis_grad_kernel.h create mode 100644 paddle/phi/kernels/put_along_axis_kernel.h create mode 100644 paddle/phi/kernels/take_along_axis_grad_kernel.h create mode 100644 paddle/phi/kernels/take_along_axis_kernel.h create mode 100644 paddle/phi/ops/compat/maxout_sig.cc create mode 100644 paddle/phi/ops/compat/put_along_axis_sig.cc create mode 100644 paddle/phi/ops/compat/take_along_axis_sig.cc diff --git a/paddle/fluid/operators/math/maxouting.cc b/paddle/fluid/operators/math/maxouting.cc index 45556e97d1d..28ec3a87102 100644 --- a/paddle/fluid/operators/math/maxouting.cc +++ b/paddle/fluid/operators/math/maxouting.cc @@ -14,106 +14,107 @@ limitations under the License. */ #include "paddle/fluid/operators/math/maxouting.h" +#include "paddle/phi/backends/cpu/cpu_context.h" + namespace paddle { namespace operators { namespace math { // All tensors are in NCHW or NHWC format, and the groups must be greater than 1 -template -class MaxOutFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* output, - const int groups, const int axis) { - const int batch_size = input.dims()[0]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); - const int output_channels = output->dims()[axis]; - int fea_size = input_height * input_width; - // c_size means the output size of each sample - int c_size = fea_size * output_channels; - const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); - for (int i = 0; i < batch_size; ++i) { - int new_bindex = c_size * i; - for (int c = 0; c < output_channels; ++c) { - int new_cindex = fea_size * c; - for (int f = 0; f < fea_size; ++f) { - T ele = static_cast(-FLT_MAX); - int input_idx, output_idx; - for (int ph = 0; ph < groups; ++ph) { - if (axis == 1) { - input_idx = - (new_bindex + new_cindex) * groups + ph * fea_size + f; - } else { - input_idx = (new_bindex + f * output_channels + c) * groups + ph; - } - T x = input_data[input_idx]; - ele = ele > x ? ele : x; - } +template +void MaxOutFunctor::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* output, + const int groups, + const int axis) { + const int batch_size = input.dims()[0]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? 
input.dims()[3] : input.dims()[2]); + const int output_channels = output->dims()[axis]; + int fea_size = input_height * input_width; + // c_size means the output size of each sample + int c_size = fea_size * output_channels; + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + for (int i = 0; i < batch_size; ++i) { + int new_bindex = c_size * i; + for (int c = 0; c < output_channels; ++c) { + int new_cindex = fea_size * c; + for (int f = 0; f < fea_size; ++f) { + T ele = static_cast(-FLT_MAX); + int input_idx, output_idx; + for (int ph = 0; ph < groups; ++ph) { if (axis == 1) { - output_idx = new_bindex + new_cindex + f; + input_idx = (new_bindex + new_cindex) * groups + ph * fea_size + f; } else { - output_idx = new_bindex + f * output_channels + c; + input_idx = (new_bindex + f * output_channels + c) * groups + ph; } - output_data[output_idx] = ele; + T x = input_data[input_idx]; + ele = ele > x ? ele : x; } + if (axis == 1) { + output_idx = new_bindex + new_cindex + f; + } else { + output_idx = new_bindex + f * output_channels + c; + } + output_data[output_idx] = ele; } } } -}; +} -template -class MaxOutGradFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* input_grad, - const framework::Tensor& output, - const framework::Tensor& output_grad, const int groups, - const int axis) { - const int batch_size = input.dims()[0]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); - const int output_channels = output.dims()[axis]; - int fea_size = input_height * input_width; - const T* input_data = input.data(); - const T* output_data = output.data(); - const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); +template +void MaxOutGradFunctor::operator()( + const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* input_grad, const framework::Tensor& output, + const framework::Tensor& output_grad, const int groups, const int axis) { + const int batch_size = input.dims()[0]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); + const int output_channels = output.dims()[axis]; + int fea_size = input_height * input_width; + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - for (int i = 0; i < batch_size; ++i) { - int blen = fea_size * output_channels * i; - for (int c = 0; c < output_channels; ++c) { - int clen = fea_size * c; - for (int f = 0; f < fea_size; ++f) { - int input_idx0, output_idx; - bool continue_match = true; - if (axis == 1) { - input_idx0 = (blen + clen) * groups + f; - output_idx = blen + clen + f; - } else { - input_idx0 = (blen + f * output_channels + c) * groups; - output_idx = blen + f * output_channels + c; - } - for (int g = 0; g < groups && continue_match; ++g) { - int idx_offset = (axis == 1 ? 
fea_size * g : g); - int input_idx = input_idx0 + idx_offset; - if (input_data[input_idx] == output_data[output_idx]) { - input_grad_data[input_idx] += output_grad_data[output_idx]; - continue_match = false; - } + for (int i = 0; i < batch_size; ++i) { + int blen = fea_size * output_channels * i; + for (int c = 0; c < output_channels; ++c) { + int clen = fea_size * c; + for (int f = 0; f < fea_size; ++f) { + int input_idx0, output_idx; + bool continue_match = true; + if (axis == 1) { + input_idx0 = (blen + clen) * groups + f; + output_idx = blen + clen + f; + } else { + input_idx0 = (blen + f * output_channels + c) * groups; + output_idx = blen + f * output_channels + c; + } + for (int g = 0; g < groups && continue_match; ++g) { + int idx_offset = (axis == 1 ? fea_size * g : g); + int input_idx = input_idx0 + idx_offset; + if (input_data[input_idx] == output_data[output_idx]) { + input_grad_data[input_idx] += output_grad_data[output_idx]; + continue_match = false; } } } } } -}; +} template class MaxOutGradFunctor; template class MaxOutGradFunctor; template class MaxOutFunctor; template class MaxOutFunctor; +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; +template class MaxOutFunctor; +template class MaxOutFunctor; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.cu b/paddle/fluid/operators/math/maxouting.cu index 1856fb4eb48..1d0478db5ef 100644 --- a/paddle/fluid/operators/math/maxouting.cu +++ b/paddle/fluid/operators/math/maxouting.cu @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/maxouting.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" namespace paddle { namespace operators { @@ -95,61 +96,57 @@ __global__ void KernelMaxoutGrad(const int nthreads, const T* input_data, /* * All tensors are in NCHW or NHWC format. */ -template -class MaxOutFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, framework::Tensor* output, - const int groups, const int axis) { - const int batch_size = input.dims()[0]; - const int input_channels = input.dims()[axis]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); - const int output_channels = output->dims()[axis]; - - const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); - int nthreads = output->numel(); - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - - KernelMaxOut<<>>( - nthreads, input_data, input_channels, input_height, input_width, groups, - axis, output_data); - } -}; +template +void MaxOutFunctor::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* output, + const int groups, + const int axis) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[axis]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? 
input.dims()[3] : input.dims()[2]); + const int output_channels = output->dims()[axis]; + + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + int nthreads = output->numel(); + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxOut<<>>( + nthreads, input_data, input_channels, input_height, input_width, groups, + axis, output_data); +} + /* * All tensors are in NCHW or NHWC format. */ -template -class MaxOutGradFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, framework::Tensor* input_grad, - const framework::Tensor& output, - const framework::Tensor& output_grad, const int groups, - const int axis) { - const int batch_size = input.dims()[0]; - const int input_channels = input.dims()[axis]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); - const int output_channels = output.dims()[axis]; - - const T* input_data = input.data(); - const T* output_data = output.data(); - const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - int nthreads = output.numel(); - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - - KernelMaxoutGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_grad_data, - input_channels, input_height, input_width, groups, axis); - } -}; +template +void MaxOutGradFunctor::operator()( + const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* input_grad, const framework::Tensor& output, + const framework::Tensor& output_grad, const int groups, const int axis) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[axis]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? 
input.dims()[3] : input.dims()[2]); + const int output_channels = output.dims()[axis]; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + int nthreads = output.numel(); + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxoutGrad<<>>( + nthreads, input_data, output_data, output_grad_data, input_grad_data, + input_channels, input_height, input_width, groups, axis); +} template class MaxOutGradFunctor; template class MaxOutGradFunctor; @@ -157,6 +154,12 @@ template class MaxOutGradFunctor; template class MaxOutFunctor; template class MaxOutFunctor; +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; + +template class MaxOutFunctor; +template class MaxOutFunctor; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.h b/paddle/fluid/operators/math/maxouting.h index 0d8372df8a2..1f4964f7715 100644 --- a/paddle/fluid/operators/math/maxouting.h +++ b/paddle/fluid/operators/math/maxouting.h @@ -30,7 +30,7 @@ class MaxOutFunctor { const int axis = 1); }; -template +template class MaxOutGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, diff --git a/paddle/fluid/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc index bd9ebd29777..e55369e0691 100644 --- a/paddle/fluid/operators/maxout_op.cc +++ b/paddle/fluid/operators/maxout_op.cc @@ -12,14 +12,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "paddle/fluid/operators/maxout_op.h" #include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" + namespace paddle { namespace operators { -using framework::Tensor; - class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -130,10 +130,3 @@ REGISTER_OPERATOR( paddle::framework::DefaultGradOpMaker, paddle::framework::DefaultGradOpMaker); REGISTER_OPERATOR(maxout_grad, ops::MaxOutOpGrad); -REGISTER_OP_CPU_KERNEL( - maxout, ops::MaxOutKernel, - ops::MaxOutKernel); -REGISTER_OP_CPU_KERNEL( - maxout_grad, - ops::MaxOutGradKernel, - ops::MaxOutGradKernel); diff --git a/paddle/fluid/operators/maxout_op.cu.cc b/paddle/fluid/operators/maxout_op.cu.cc deleted file mode 100644 index be1e81bb869..00000000000 --- a/paddle/fluid/operators/maxout_op.cu.cc +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
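The maxouting.cc / maxouting.cu changes above collapse the separate CPU and GPU class specializations into a single functor templated on the device context, and then rely on explicit template instantiation so that builds against both the legacy platform::CPUDeviceContext / CUDADeviceContext and the new phi::CPUContext / GPUContext get object code. A minimal standalone illustration of that pattern follows; the types and names are made up for the example and are not Paddle's.

#include <iostream>

// Stand-ins for the two context families; purely illustrative, not Paddle types.
struct LegacyCPUContext {};
struct PhiCPUContext {};

// One functor body, templated on the context type, in the same spirit as the
// reworked MaxOutFunctor in maxouting.cc.
template <typename Context, typename T>
struct ScaleFunctor {
  T operator()(const Context& /*ctx*/, T x, T factor) const {
    return x * factor;
  }
};

// Explicit instantiations: the definition lives in one .cc file, and these
// lines force object code to be emitted for every context/type combination
// that other translation units link against.
template struct ScaleFunctor<LegacyCPUContext, float>;
template struct ScaleFunctor<LegacyCPUContext, double>;
template struct ScaleFunctor<PhiCPUContext, float>;
template struct ScaleFunctor<PhiCPUContext, double>;

int main() {
  PhiCPUContext ctx;
  std::cout << ScaleFunctor<PhiCPUContext, float>()(ctx, 2.0f, 0.5f) << "\n";  // prints 1
  return 0;
}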
*/ - -#include "paddle/fluid/operators/maxout_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - maxout, ops::MaxOutKernel, - ops::MaxOutKernel); -REGISTER_OP_CUDA_KERNEL( - maxout_grad, - ops::MaxOutGradKernel, - ops::MaxOutGradKernel); diff --git a/paddle/fluid/operators/maxout_op.h b/paddle/fluid/operators/maxout_op.h deleted file mode 100644 index 92299829394..00000000000 --- a/paddle/fluid/operators/maxout_op.h +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/maxouting.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class MaxOutKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - Tensor* out = context.Output("Out"); - int groups = context.template Attr("groups"); - int axis = context.template Attr("axis"); - if (axis < 0) { - axis += in_x->dims().size(); - } - - math::MaxOutFunctor maxout_forward; - maxout_forward(context.template device_context(), *in_x, out, - groups, axis); - } -}; - -template -class MaxOutGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - const Tensor* out = context.Input("Out"); - const Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - Tensor* in_x_grad = context.Output(framework::GradVarName("X")); - int groups = context.template Attr("groups"); - int axis = context.template Attr("axis"); - if (axis < 0) { - axis += in_x->dims().size(); - } - - auto& device_ctx = context.template device_context(); - phi::funcs::SetConstant zero; - if (in_x_grad) { - in_x_grad->mutable_data(context.GetPlace()); - zero(device_ctx, in_x_grad, static_cast(0.0)); - math::MaxOutGradFunctor maxout_backward; - maxout_backward(device_ctx, *in_x, in_x_grad, *out, *out_grad, groups, - axis); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/put_along_axis_op.cc b/paddle/fluid/operators/put_along_axis_op.cc index 6b0d6f332bc..54e31845ad4 100644 --- a/paddle/fluid/operators/put_along_axis_op.cc +++ b/paddle/fluid/operators/put_along_axis_op.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/put_along_axis_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" @@ -123,16 +124,3 @@ REGISTER_OPERATOR(put_along_axis, ops::PutAlongAxisOp, ops::PutAlongAxisOpMaker, paddle::operators::PutAlongAxisInplaceInferer); REGISTER_OPERATOR(put_along_axis_grad, ops::PutAlongAxisGradOp); - -REGISTER_OP_CPU_KERNEL(put_along_axis, ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel); - -REGISTER_OP_CPU_KERNEL(put_along_axis_grad, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel); diff --git a/paddle/fluid/operators/put_along_axis_op.cu b/paddle/fluid/operators/put_along_axis_op.cu deleted file mode 100644 index 5508023efad..00000000000 --- a/paddle/fluid/operators/put_along_axis_op.cu +++ /dev/null @@ -1,134 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/put_along_axis_op.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class PutAlongAxisCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisCUDAKernel only runs on GPU device.")); - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto value = ctx.Input("Value"); - auto index = ctx.Input("Index"); - auto reduce_op = ctx.Attr("Reduce"); - auto result = ctx.Output("Result"); - const platform::DeviceContext &device_ctx = ctx.device_context(); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - - framework::TensorCopy(*input, ctx.GetPlace(), result); - if (reduce_op == "add") { - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - gpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "multiply" || reduce_op == "mul") { - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - gpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "assign") { - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - 
gpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "can not support reduce_op: '%s' for scatter kernel, only " - "support reduce op: 'add‘, 'assign', 'mul' and 'multiply', the " - "defalut reduce op is 'assign' ", - reduce_op)); - return; - } - } -}; - -template -class PutAlongAxisGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisGradOpCUDAKernel only runs on GPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto value_grad = ctx.Output(framework::GradVarName("Value")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (input_grad) { - framework::TensorCopy(*result_grad, ctx.GetPlace(), input_grad); - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_input_grad_kernel( - *result_grad, axis, *index, *input_grad, ctx.device_context()); - } else { - gpu_scatter_input_grad_kernel( - *result_grad, axis, *index, *input_grad, ctx.device_context()); - } - } - if (value_grad) { - value_grad->Resize(index->dims()); - value_grad->mutable_data(ctx.GetPlace()); - if (index_type == framework::proto::VarType::INT32) { - gpu_gather_kernel( - *result_grad, axis, *index, *value_grad, - ctx.device_context()); // the gradient of scatter is gather - } else if (index_type == framework::proto::VarType::INT64) { - gpu_gather_kernel(*result_grad, axis, *index, *value_grad, - ctx.device_context()); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(put_along_axis, ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel); -REGISTER_OP_CUDA_KERNEL(put_along_axis_grad, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel); diff --git a/paddle/fluid/operators/put_along_axis_op.h b/paddle/fluid/operators/put_along_axis_op.h deleted file mode 100644 index 38487f5ce28..00000000000 --- a/paddle/fluid/operators/put_along_axis_op.h +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
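The put_along_axis kernels (the fluid versions being deleted here and the phi versions added later in this patch) first copy Input into the output and then combine Value into it at the positions selected by Index along Axis, using the Reduce attribute to choose between add, multiply and assign. A minimal standalone sketch for a 2-D tensor with axis == 0; the flattened-index layout and names are illustrative only, not the Paddle scatter helpers.

#include <cstdio>
#include <string>
#include <vector>

// put_along_axis for a rows x cols matrix with axis == 0:
// out[index[i][j]][j] is updated with value[i][j] according to `reduce`.
void PutAlongAxis0(std::vector<float>& out, int cols,
                   const std::vector<int>& index,
                   const std::vector<float>& value, int value_rows,
                   const std::string& reduce) {
  for (int i = 0; i < value_rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      int dst = index[i * cols + j] * cols + j;
      float v = value[i * cols + j];
      if (reduce == "add") {
        out[dst] += v;
      } else if (reduce == "mul" || reduce == "multiply") {
        out[dst] *= v;
      } else {  // "assign"
        out[dst] = v;
      }
    }
  }
}

int main() {
  std::vector<float> out = {1, 2, 3, 4, 5, 6};  // 2 x 3, already copied from Input
  std::vector<int> index = {1, 0, 1};           // 1 x 3
  std::vector<float> value = {10, 20, 30};      // 1 x 3
  PutAlongAxis0(out, 3, index, value, 1, "add");
  for (float v : out) std::printf("%g ", v);    // prints: 1 22 3 14 5 36
  std::printf("\n");
  return 0;
}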
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather_scatter_kernel.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class PutAlongAxisOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisOpKernel only runs on CPU.")); - - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto value = ctx.Input("Value"); - auto index = ctx.Input("Index"); - auto reduce_op = ctx.Attr("Reduce"); - auto result = ctx.Output("Result"); - - framework::TensorCopy(*input, ctx.GetPlace(), result); - const platform::DeviceContext &device_ctx = ctx.device_context(); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (reduce_op == "add") { - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "multiply" || reduce_op == "mul") { - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "assign") { - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "can not support reduce_op: '%s' for scatter kernel, only " - "support reduce op: 'add‘, 'assign', 'mul' and 'multiply', the " - "defalut reduce " - "op is 'assign' ", - reduce_op)); - return; - } - } -}; - -template -class PutAlongAxisGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisGradOpKernel only runs on CPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto value_grad = ctx.Output(framework::GradVarName("Value")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - - if (input_grad) { - framework::TensorCopy(*result_grad, ctx.GetPlace(), input_grad); - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_input_grad_kernel( - // Here passing an unused argument *result_grad, because it's - // convenient to instantiate a bunch of template function with the - // same arguments list. 
- *result_grad, axis, *index, *input_grad, ctx.device_context()); - } else { - cpu_scatter_input_grad_kernel( - *result_grad, axis, *index, *input_grad, ctx.device_context()); - } - } - - if (value_grad) { - value_grad->Resize(index->dims()); - value_grad->mutable_data(ctx.GetPlace()); - if (index_type == framework::proto::VarType::INT32) { - cpu_gather_kernel(*result_grad, axis, *index, *value_grad, - ctx.device_context()); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_gather_kernel(*result_grad, axis, *index, *value_grad, - ctx.device_context()); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/take_along_axis_op.cc b/paddle/fluid/operators/take_along_axis_op.cc index 664f1031915..fa8a5e92712 100644 --- a/paddle/fluid/operators/take_along_axis_op.cc +++ b/paddle/fluid/operators/take_along_axis_op.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/take_along_axis_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" @@ -139,16 +140,3 @@ REGISTER_OPERATOR(take_along_axis, ops::TakeAlongAxisOp, ops::TakeAlongAxisGradOpMaker); REGISTER_OPERATOR(take_along_axis_grad, ops::TakeAlongAxisGradOp); - -REGISTER_OP_CPU_KERNEL(take_along_axis, ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel); - -REGISTER_OP_CPU_KERNEL(take_along_axis_grad, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel); diff --git a/paddle/fluid/operators/take_along_axis_op.cu b/paddle/fluid/operators/take_along_axis_op.cu deleted file mode 100644 index b6c62d497b3..00000000000 --- a/paddle/fluid/operators/take_along_axis_op.cu +++ /dev/null @@ -1,97 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/take_along_axis_op.h" -#include "paddle/phi/core/ddim.h" - -namespace paddle { -namespace operators { - -template -class TakeAlongAxisCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto index = ctx.Input("Index"); - auto result = ctx.Output("Result"); - result->Resize(index->dims()); - result->mutable_data(ctx.GetPlace()); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - gpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } else if (index_type == framework::proto::VarType::INT64) { - gpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } - } -}; - -template -class TakeAlongAxisGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on GPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - // We need to know the shape of input matrix to determine the shape of grad - // matrix of input. - auto input = ctx.Input("Input"); - input_grad->Resize(input->dims()); - input_grad->mutable_data(ctx.GetPlace()); - - // Set to zero tensor. - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant functor; - functor(reinterpret_cast(dev_ctx), - input_grad, static_cast(0)); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_add_kernel( - *input_grad, axis, *index, *result_grad, - ctx.device_context()); // the gradient of gather is scatter - } else if (index_type == framework::proto::VarType::INT64) { - gpu_scatter_add_kernel(*input_grad, axis, *index, - *result_grad, ctx.device_context()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(take_along_axis, ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel); -REGISTER_OP_CUDA_KERNEL(take_along_axis_grad, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel); diff --git a/paddle/fluid/operators/take_along_axis_op.h b/paddle/fluid/operators/take_along_axis_op.h deleted file mode 100644 index fc781dbddf2..00000000000 --- a/paddle/fluid/operators/take_along_axis_op.h +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather_scatter_kernel.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class TakeAlongAxisOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto index = ctx.Input("Index"); - auto result = ctx.Output("Result"); - result->Resize(index->dims()); - result->mutable_data(ctx.GetPlace()); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - cpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } - } -}; - -template -class TakeAlongAxisGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - // We need to know the shape of input matrix to determine the shape of grad - // matrix of input. - auto input = ctx.Input("Input"); - input_grad->Resize(input->dims()); - input_grad->mutable_data(ctx.GetPlace()); - - // Set to zero tensor. - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant functor; - functor(reinterpret_cast(dev_ctx), - input_grad, static_cast(0)); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_add_kernel( - *input_grad, axis, *index, *result_grad, - ctx.device_context()); // the gradient of gather is scatter - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_add_kernel(*input_grad, axis, *index, - *result_grad, ctx.device_context()); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 58ea231beef..de3b5b53f46 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -27,11 +27,17 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel) # Some kernels depend on some targets that are not commonly used. # These targets are not suitable for common dependencies. # In this case, you need to manually generate them here. 
-set(MANUAL_BUILD_KERNELS math_kernel softmax_kernel softmax_grad_kernel triangular_solve_grad_kernel) +set(MANUAL_BUILD_KERNELS math_kernel softmax_kernel softmax_grad_kernel triangular_solve_grad_kernel maxout_kernel maxout_grad_kernel put_along_axis_kernel put_along_axis_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel) kernel_library(math_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel) kernel_library(softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_reduce) +kernel_library(maxout_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) +kernel_library(maxout_grad_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) +kernel_library(put_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) +kernel_library(put_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) +kernel_library(take_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) +kernel_library(take_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) # 4. auto parse and build kernel targets by cmake register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS ${COMMON_KERNEL_DEPS} ${COMMON_BAISC_KERNELS} ) diff --git a/paddle/phi/kernels/cpu/maxout_grad_kernel.cc b/paddle/phi/kernels/cpu/maxout_grad_kernel.cc new file mode 100644 index 00000000000..429344a362b --- /dev/null +++ b/paddle/phi/kernels/cpu/maxout_grad_kernel.cc @@ -0,0 +1,20 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/impl/maxout_grad_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + maxout_grad, CPU, ALL_LAYOUT, phi::MaxOutGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/maxout_kernel.cc b/paddle/phi/kernels/cpu/maxout_kernel.cc new file mode 100644 index 00000000000..e7cd3ab07ff --- /dev/null +++ b/paddle/phi/kernels/cpu/maxout_kernel.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
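The phi maxout kernels being added here forward to the MaxOutFunctor shown at the start of this patch: for each output channel and spatial position, the result is the maximum over `groups` consecutive input channels. A standalone sketch of the NCHW case (axis == 1), independent of Paddle:

#include <algorithm>
#include <cfloat>
#include <cstdio>
#include <vector>

// Maxout in NCHW layout (axis == 1): the input has c_out * groups channels
// and each output channel is the element-wise max over its group.
std::vector<float> MaxOutNCHW(const std::vector<float>& in, int n, int c_out,
                              int groups, int hw) {
  std::vector<float> out(static_cast<size_t>(n) * c_out * hw);
  for (int i = 0; i < n; ++i) {
    for (int c = 0; c < c_out; ++c) {
      for (int f = 0; f < hw; ++f) {
        float ele = -FLT_MAX;
        for (int ph = 0; ph < groups; ++ph) {
          // Same indexing as MaxOutFunctor: the `groups` input channels that
          // feed one output channel are stored contiguously.
          int idx = ((i * c_out + c) * groups + ph) * hw + f;
          ele = std::max(ele, in[idx]);
        }
        out[(i * c_out + c) * hw + f] = ele;
      }
    }
  }
  return out;
}

int main() {
  // n = 1, c_out = 1, groups = 2, hw = 3: two input channels collapse into one.
  std::vector<float> in = {1, 5, 3,   // group member 0
                           4, 2, 6};  // group member 1
  for (float v : MaxOutNCHW(in, 1, 1, 2, 3)) std::printf("%g ", v);  // prints: 4 5 6
  std::printf("\n");
  return 0;
}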
+ +#include "paddle/phi/kernels/impl/maxout_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(maxout, CPU, ALL_LAYOUT, phi::MaxOutKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc b/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc new file mode 100644 index 00000000000..e94d09e0337 --- /dev/null +++ b/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/put_along_axis_grad_kernel.h" + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/operators/gather_scatter_kernel.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" + +namespace phi { + +template +void PutAlongAxisGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + int axis, + const std::string& reduce, + DenseTensor* x_grad, + DenseTensor* value_grad) { + PADDLE_ENFORCE_EQ( + paddle::platform::is_cpu_place(dev_ctx.GetPlace()), + true, + errors::PreconditionNotMet("PutAlongAxisGradOpKernel only runs on CPU.")); + + const auto& index_type = + paddle::framework::TransToProtoVarType(index.dtype()); + if (x_grad) { + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); + if (index_type == paddle::framework::proto::VarType::INT32) { + paddle::operators::cpu_scatter_input_grad_kernel( + // Here passing an unused argument out_grad, because it's + // convenient to instantiate a bunch of template function with the + // same arguments list. + out_grad, + axis, + index, + *x_grad, + dev_ctx); + } else { + paddle::operators::cpu_scatter_input_grad_kernel( + out_grad, axis, index, *x_grad, dev_ctx); + } + } + + if (value_grad) { + value_grad->Resize(index.dims()); + value_grad->mutable_data(dev_ctx.GetPlace()); + if (index_type == paddle::framework::proto::VarType::INT32) { + paddle::operators::cpu_gather_kernel( + out_grad, axis, index, *value_grad, dev_ctx); + } else if (index_type == paddle::framework::proto::VarType::INT64) { + paddle::operators::cpu_gather_kernel( + out_grad, axis, index, *value_grad, dev_ctx); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(put_along_axis_grad, + CPU, + ALL_LAYOUT, + phi::PutAlongAxisGradKernel, + float, + double, + int, + uint8_t, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/put_along_axis_kernel.cc b/paddle/phi/kernels/cpu/put_along_axis_kernel.cc new file mode 100644 index 00000000000..83c9a915ee6 --- /dev/null +++ b/paddle/phi/kernels/cpu/put_along_axis_kernel.cc @@ -0,0 +1,87 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/put_along_axis_kernel.h" + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/operators/gather_scatter_kernel.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" + +namespace phi { + +template +void PutAlongAxisKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& value, + int axis, + const std::string& reduce, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + paddle::platform::is_cpu_place(dev_ctx.GetPlace()), + true, + errors::PreconditionNotMet("PutAlongAxisOpKernel only runs on CPU.")); + + phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); + const auto& index_type = + paddle::framework::TransToProtoVarType(index.dtype()); + if (reduce == "add") { + if (index_type == paddle::framework::proto::VarType::INT32) { + paddle::operators::cpu_scatter_add_kernel( + *out, axis, index, value, dev_ctx); + } else if (index_type == paddle::framework::proto::VarType::INT64) { + paddle::operators::cpu_scatter_add_kernel( + *out, axis, index, value, dev_ctx); + } + } else if (reduce == "multiply" || reduce == "mul") { + if (index_type == paddle::framework::proto::VarType::INT32) { + paddle::operators::cpu_scatter_mul_kernel( + *out, axis, index, value, dev_ctx); + } else if (index_type == paddle::framework::proto::VarType::INT64) { + paddle::operators::cpu_scatter_mul_kernel( + *out, axis, index, value, dev_ctx); + } + } else if (reduce == "assign") { + if (index_type == paddle::framework::proto::VarType::INT32) { + paddle::operators::cpu_scatter_assign_kernel( + *out, axis, index, value, dev_ctx); + } else if (index_type == paddle::framework::proto::VarType::INT64) { + paddle::operators::cpu_scatter_assign_kernel( + *out, axis, index, value, dev_ctx); + } + } else { + PADDLE_THROW(errors::InvalidArgument( + "can not support reduce: '%s' for scatter kernel, only " + "support reduce op: 'add', 'assign', 'mul' and 'multiply', the " + "defalut reduce " + "op is 'assign' ", + reduce)); + return; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(put_along_axis, + CPU, + ALL_LAYOUT, + phi::PutAlongAxisKernel, + float, + double, + int, + uint8_t, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc b/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc new file mode 100644 index 00000000000..4443383f402 --- /dev/null +++ b/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/take_along_axis_grad_kernel.h" + +#include "paddle/fluid/operators/gather_scatter_kernel.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void TakeAlongAxisGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad) { + PADDLE_ENFORCE_EQ( + paddle::platform::is_cpu_place(dev_ctx.GetPlace()), + true, + errors::PreconditionNotMet("This kernel only runs on CPU.")); + + // We need to know the shape of input matrix to determine the shape of grad + // matrix of input. + x_grad->Resize(x.dims()); + dev_ctx.template Alloc(x_grad); + + // Set to zero tensor. + phi::funcs::SetConstant functor; + functor(dev_ctx, x_grad, static_cast(0)); + + const auto& index_type = + paddle::framework::TransToProtoVarType(index.dtype()); + if (index_type == paddle::framework::proto::VarType::INT32) { + paddle::operators::cpu_scatter_add_kernel( + *x_grad, + axis, + index, + out_grad, + dev_ctx); // the gradient of gather is scatter + } else if (index_type == paddle::framework::proto::VarType::INT64) { + paddle::operators::cpu_scatter_add_kernel( + *x_grad, axis, index, out_grad, dev_ctx); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(take_along_axis_grad, + CPU, + ALL_LAYOUT, + phi::TakeAlongAxisGradKernel, + float, + double, + int, + uint8_t, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/take_along_axis_kernel.cc b/paddle/phi/kernels/cpu/take_along_axis_kernel.cc new file mode 100644 index 00000000000..502db8a22da --- /dev/null +++ b/paddle/phi/kernels/cpu/take_along_axis_kernel.cc @@ -0,0 +1,60 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/take_along_axis_kernel.h" + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/operators/gather_scatter_kernel.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void TakeAlongAxisKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + int axis, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + paddle::platform::is_cpu_place(dev_ctx.GetPlace()), + true, + errors::PreconditionNotMet("This kernel only runs on CPU.")); + + out->Resize(index.dims()); + dev_ctx.template Alloc(out); + + const auto& index_type = + paddle::framework::TransToProtoVarType(index.dtype()); + if (index_type == paddle::framework::proto::VarType::INT32) { + paddle::operators::cpu_gather_kernel( + x, axis, index, *out, dev_ctx); + } else if (index_type == paddle::framework::proto::VarType::INT64) { + paddle::operators::cpu_gather_kernel( + x, axis, index, *out, dev_ctx); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(take_along_axis, + CPU, + ALL_LAYOUT, + phi::TakeAlongAxisKernel, + float, + double, + int, + uint8_t, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/maxout_grad_kernel.cu b/paddle/phi/kernels/gpu/maxout_grad_kernel.cu new file mode 100644 index 00000000000..86ff09fd74b --- /dev/null +++ b/paddle/phi/kernels/gpu/maxout_grad_kernel.cu @@ -0,0 +1,20 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/impl/maxout_grad_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + maxout_grad, GPU, ALL_LAYOUT, phi::MaxOutGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/maxout_kernel.cu b/paddle/phi/kernels/gpu/maxout_kernel.cu new file mode 100644 index 00000000000..88776a49f19 --- /dev/null +++ b/paddle/phi/kernels/gpu/maxout_kernel.cu @@ -0,0 +1,19 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/impl/maxout_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(maxout, GPU, ALL_LAYOUT, phi::MaxOutKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu b/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu new file mode 100644 index 00000000000..f553da361f1 --- /dev/null +++ b/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu @@ -0,0 +1,79 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/put_along_axis_grad_kernel.h" + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/operators/gather_scatter_kernel.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" + +namespace phi { + +template +void PutAlongAxisGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + int axis, + const std::string& reduce, + DenseTensor* x_grad, + DenseTensor* value_grad) { + PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(dev_ctx.GetPlace()), + true, + errors::PreconditionNotMet( + "PutAlongAxisGradOpCUDAKernel only runs on GPU.")); + + const auto& index_type = + paddle::framework::TransToProtoVarType(index.dtype()); + if (x_grad) { + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); + if (index_type == paddle::framework::proto::VarType::INT32) { + paddle::operators::gpu_scatter_input_grad_kernel( + out_grad, axis, index, *x_grad, dev_ctx); + } else { + paddle::operators::gpu_scatter_input_grad_kernel( + out_grad, axis, index, *x_grad, dev_ctx); + } + } + if (value_grad) { + value_grad->Resize(index.dims()); + value_grad->mutable_data(dev_ctx.GetPlace()); + if (index_type == paddle::framework::proto::VarType::INT32) { + paddle::operators::gpu_gather_kernel( + out_grad, + axis, + index, + *value_grad, + dev_ctx); // the gradient of scatter is gather + } else if (index_type == paddle::framework::proto::VarType::INT64) { + paddle::operators::gpu_gather_kernel( + out_grad, axis, index, *value_grad, dev_ctx); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(put_along_axis_grad, + GPU, + ALL_LAYOUT, + phi::PutAlongAxisGradKernel, + float, + double, + int64_t, + int, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/put_along_axis_kernel.cu b/paddle/phi/kernels/gpu/put_along_axis_kernel.cu new file mode 100644 index 00000000000..d363c0c2836 --- /dev/null +++ b/paddle/phi/kernels/gpu/put_along_axis_kernel.cu @@ -0,0 +1,86 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/put_along_axis_kernel.h" + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/operators/gather_scatter_kernel.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" + +namespace phi { + +template +void PutAlongAxisKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& value, + int axis, + const std::string& reduce, + DenseTensor* out) { + PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(dev_ctx.GetPlace()), + true, + errors::PreconditionNotMet( + "PutAlongAxisCUDAKernel only runs on GPU device.")); + + const auto& index_type = + paddle::framework::TransToProtoVarType(index.dtype()); + + phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); + if (reduce == "add") { + if (index_type == paddle::framework::proto::VarType::INT32) { + paddle::operators::gpu_scatter_add_kernel( + *out, axis, index, value, dev_ctx); + } else if (index_type == paddle::framework::proto::VarType::INT64) { + paddle::operators::gpu_scatter_add_kernel( + *out, axis, index, value, dev_ctx); + } + } else if (reduce == "multiply" || reduce == "mul") { + if (index_type == paddle::framework::proto::VarType::INT32) { + paddle::operators::gpu_scatter_mul_kernel( + *out, axis, index, value, dev_ctx); + } else if (index_type == paddle::framework::proto::VarType::INT64) { + paddle::operators::gpu_scatter_mul_kernel( + *out, axis, index, value, dev_ctx); + } + } else if (reduce == "assign") { + if (index_type == paddle::framework::proto::VarType::INT32) { + paddle::operators::gpu_scatter_assign_kernel( + *out, axis, index, value, dev_ctx); + } else if (index_type == paddle::framework::proto::VarType::INT64) { + paddle::operators::gpu_scatter_assign_kernel( + *out, axis, index, value, dev_ctx); + } + } else { + PADDLE_THROW(errors::InvalidArgument( + "can not support reduce: '%s' for scatter kernel, only " + "support reduce op: 'add', 'assign', 'mul' and 'multiply', the " + "defalut reduce op is 'assign' ", + reduce)); + return; + } +} +} // namespace phi + +PD_REGISTER_KERNEL(put_along_axis, + GPU, + ALL_LAYOUT, + phi::PutAlongAxisKernel, + float, + double, + int64_t, + int, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu new file mode 100644 index 00000000000..e09cfd370a4 --- /dev/null +++ b/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu @@ -0,0 +1,72 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/take_along_axis_grad_kernel.h" + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/operators/gather_scatter_kernel.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void TakeAlongAxisGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad) { + PADDLE_ENFORCE_EQ( + paddle::platform::is_gpu_place(dev_ctx.GetPlace()), + true, + errors::PreconditionNotMet("This kernel only runs on GPU.")); + + // We need to know the shape of input matrix to determine the shape of grad + // matrix of input. + x_grad->Resize(x.dims()); + dev_ctx.template Alloc(x_grad); + + // Set to zero tensor. + phi::funcs::SetConstant functor; + functor(dev_ctx, x_grad, static_cast(0)); + const auto& index_type = + paddle::framework::TransToProtoVarType(index.dtype()); + + if (index_type == paddle::framework::proto::VarType::INT32) { + paddle::operators::gpu_scatter_add_kernel( + *x_grad, + axis, + index, + out_grad, + dev_ctx); // the gradient of gather is scatter + } else if (index_type == paddle::framework::proto::VarType::INT64) { + paddle::operators::gpu_scatter_add_kernel( + *x_grad, axis, index, out_grad, dev_ctx); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(take_along_axis_grad, + GPU, + ALL_LAYOUT, + phi::TakeAlongAxisGradKernel, + float, + double, + int64_t, + int, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu new file mode 100644 index 00000000000..63113e3e672 --- /dev/null +++ b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu @@ -0,0 +1,59 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/take_along_axis_kernel.h" + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/operators/gather_scatter_kernel.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void TakeAlongAxisKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + int axis, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + paddle::platform::is_gpu_place(dev_ctx.GetPlace()), + true, + errors::PreconditionNotMet("This kernel only runs on GPU device.")); + + out->Resize(index.dims()); + dev_ctx.template Alloc(out); + + const auto& index_type = + paddle::framework::TransToProtoVarType(index.dtype()); + if (index_type == paddle::framework::proto::VarType::INT32) { + paddle::operators::gpu_gather_kernel( + x, axis, index, *out, dev_ctx); + } else if (index_type == paddle::framework::proto::VarType::INT64) { + paddle::operators::gpu_gather_kernel( + x, axis, index, *out, dev_ctx); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(take_along_axis, + GPU, + ALL_LAYOUT, + phi::TakeAlongAxisKernel, + double, + int64_t, + int, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/impl/maxout_grad_kernel_impl.h b/paddle/phi/kernels/impl/maxout_grad_kernel_impl.h new file mode 100644 index 00000000000..546ea746742 --- /dev/null +++ b/paddle/phi/kernels/impl/maxout_grad_kernel_impl.h @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/maxout_grad_kernel.h" + +#include "paddle/fluid/operators/math/maxouting.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void MaxOutGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& out_grad, + int groups, + int axis, + DenseTensor* x_grad) { + if (axis < 0) { + axis += x.dims().size(); + } + + phi::funcs::SetConstant zero; + if (x_grad) { + dev_ctx.template Alloc(x_grad); + zero(dev_ctx, x_grad, static_cast(0.0)); + paddle::operators::math::MaxOutGradFunctor maxout_backward; + maxout_backward(dev_ctx, x, x_grad, out, out_grad, groups, axis); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/maxout_kernel_impl.h b/paddle/phi/kernels/impl/maxout_kernel_impl.h new file mode 100644 index 00000000000..da8c259ebf2 --- /dev/null +++ b/paddle/phi/kernels/impl/maxout_kernel_impl.h @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/maxout_kernel.h" + +#include "paddle/fluid/operators/math/maxouting.h" + +namespace phi { + +template +void MaxOutKernel(const Context& dev_ctx, + const DenseTensor& x, + int groups, + int axis, + DenseTensor* out) { + if (axis < 0) { + axis += x.dims().size(); + } + + paddle::operators::math::MaxOutFunctor maxout_forward; + maxout_forward(dev_ctx, x, out, groups, axis); +} + +} // namespace phi diff --git a/paddle/phi/kernels/maxout_grad_kernel.h b/paddle/phi/kernels/maxout_grad_kernel.h new file mode 100644 index 00000000000..1ee4e8cc896 --- /dev/null +++ b/paddle/phi/kernels/maxout_grad_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void MaxOutGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& out_grad, + int groups, + int axis, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/maxout_kernel.h b/paddle/phi/kernels/maxout_kernel.h new file mode 100644 index 00000000000..e582575678d --- /dev/null +++ b/paddle/phi/kernels/maxout_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void MaxOutKernel(const Context& dev_ctx, + const DenseTensor& x, + int groups, + int axis, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/put_along_axis_grad_kernel.h b/paddle/phi/kernels/put_along_axis_grad_kernel.h new file mode 100644 index 00000000000..2141443da7a --- /dev/null +++ b/paddle/phi/kernels/put_along_axis_grad_kernel.h @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void PutAlongAxisGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + int axis, + const std::string& reduce, + DenseTensor* x_grad, + DenseTensor* value_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/put_along_axis_kernel.h b/paddle/phi/kernels/put_along_axis_kernel.h new file mode 100644 index 00000000000..797d0e364b4 --- /dev/null +++ b/paddle/phi/kernels/put_along_axis_kernel.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void PutAlongAxisKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& value, + int axis, + const std::string& reduce, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/take_along_axis_grad_kernel.h b/paddle/phi/kernels/take_along_axis_grad_kernel.h new file mode 100644 index 00000000000..a312c235f66 --- /dev/null +++ b/paddle/phi/kernels/take_along_axis_grad_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void TakeAlongAxisGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/take_along_axis_kernel.h b/paddle/phi/kernels/take_along_axis_kernel.h new file mode 100644 index 00000000000..e8fb78556d9 --- /dev/null +++ b/paddle/phi/kernels/take_along_axis_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void TakeAlongAxisKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + int axis, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/maxout_sig.cc b/paddle/phi/ops/compat/maxout_sig.cc new file mode 100644 index 00000000000..d16dd1c8617 --- /dev/null +++ b/paddle/phi/ops/compat/maxout_sig.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature MaxoutArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("maxout", {"X"}, {"groups", "axis"}, {"Out"}); +} + +KernelSignature MaxoutGradArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("maxout_grad", + {"X", "Out", GradVarName("Out")}, + {"groups", "axis"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(maxout, phi::MaxoutArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(maxout_grad, phi::MaxoutGradArgumentMapping); diff --git a/paddle/phi/ops/compat/put_along_axis_sig.cc b/paddle/phi/ops/compat/put_along_axis_sig.cc new file mode 100644 index 00000000000..5f8dc1cf4cd --- /dev/null +++ b/paddle/phi/ops/compat/put_along_axis_sig.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
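The maxout argument mappings above only forward X, groups and axis from the legacy operator to the new phi kernels; a hedged sketch of the Python call that ends up exercising them:

    import paddle
    import paddle.nn.functional as F

    # 6 input channels split into 3 groups along axis 1; the max is taken
    # inside each group, giving 2 output channels.
    x = paddle.randn([2, 6, 8, 8])
    y = F.maxout(x, groups=3, axis=1)
    print(y.shape)  # [2, 2, 8, 8]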
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature PutAlongAxisArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("put_along_axis", + {"Input", "Index", "Value"}, + {"Axis", "Reduce"}, + {"Result"}); +} + +KernelSignature PutAlongAxisGradArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("put_along_axis_grad", + {"Input", "Index", GradVarName("Result")}, + {"Axis", "Reduce"}, + {GradVarName("Input"), GradVarName("Value")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(put_along_axis, phi::PutAlongAxisArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(put_along_axis_grad, + phi::PutAlongAxisGradArgumentMapping); diff --git a/paddle/phi/ops/compat/take_along_axis_sig.cc b/paddle/phi/ops/compat/take_along_axis_sig.cc new file mode 100644 index 00000000000..27a996a270d --- /dev/null +++ b/paddle/phi/ops/compat/take_along_axis_sig.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature TakeAlongAxisArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "take_along_axis", {"Input", "Index"}, {"Axis"}, {"Result"}); +} + +KernelSignature TakeAlongAxisGradArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("take_along_axis_grad", + {"Input", "Index", GradVarName("Result")}, + {"Axis"}, + {GradVarName("Input")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(take_along_axis, phi::TakeAlongAxisArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(take_along_axis_grad, + phi::TakeAlongAxisGradArgumentMapping); -- GitLab From d4a4eb9d68d1d6ca8025fefbfee1dfb98a9170d0 Mon Sep 17 00:00:00 2001 From: xiaoting <31891223+tink2123@users.noreply.github.com> Date: Tue, 8 Mar 2022 17:05:50 +0800 Subject: [PATCH 115/261] Fix fold python examples (#38636) * fix fold python examples, test=develop * fix size type, test=develop * fix python example, test=develop * fix fold shape check * fix fold dygraph mode, test=develop --- paddle/fluid/operators/fold_op.cc | 22 +++++- .../fluid/tests/unittests/test_fold_op.py | 10 +++ python/paddle/nn/functional/common.py | 68 +++++++++++-------- python/paddle/nn/layer/common.py | 15 ++-- 4 files changed, 76 insertions(+), 39 deletions(-) diff --git a/paddle/fluid/operators/fold_op.cc b/paddle/fluid/operators/fold_op.cc index 40ec9aef190..92f59e118c3 100644 --- a/paddle/fluid/operators/fold_op.cc +++ b/paddle/fluid/operators/fold_op.cc @@ -95,6 +95,17 @@ class FoldOp : public framework::OperatorWithKernel { "but recieved strides_height: %d strides_width: %d.", strides[0], strides[1])); // check dilations + PADDLE_ENFORCE_GT(output_height, 1, + platform::errors::InvalidArgument( + "The `output_height` should be greater than one, " + "but recieved output_height: %d .", + output_height)); + PADDLE_ENFORCE_GT(output_width, 1, + platform::errors::InvalidArgument( 
+ "The `output_width` should be greater than one, " + "but recieved output_width: %d .", + output_width)); + // check output size PADDLE_ENFORCE_GT( dilation_height, 0, platform::errors::InvalidArgument( @@ -146,7 +157,7 @@ class FoldOp : public framework::OperatorWithKernel { output_width)); PADDLE_ENFORCE_EQ( - blocks_height * blocks_width, in_dims[1], + blocks_height * blocks_width, in_dims[2], platform::errors::InvalidArgument( "Given input output_size (%d, %d), " "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d), " @@ -156,6 +167,15 @@ class FoldOp : public framework::OperatorWithKernel { strides[0], strides[1], dilations[0], dilations[1], blocks_height, blocks_width, blocks_height * blocks_width, in_dims[2])); + PADDLE_ENFORCE_EQ( + in_dims[1] % (kernel_sizes[0] * kernel_sizes[1]), 0, + platform::errors::InvalidArgument( + "Expected size of input's dimension 1 to be divisible by the" + "product of kernel_size, but got input.size(1)=%d and " + "kernel_size=( %d" + ", %d).", + in_dims[1], kernel_sizes[0], kernel_sizes[1])); + out_dims.push_back(output_height); out_dims.push_back(output_width); ctx->SetOutputDim("Y", phi::make_ddim(out_dims)); diff --git a/python/paddle/fluid/tests/unittests/test_fold_op.py b/python/paddle/fluid/tests/unittests/test_fold_op.py index 14a59b41338..44b94cd3b66 100644 --- a/python/paddle/fluid/tests/unittests/test_fold_op.py +++ b/python/paddle/fluid/tests/unittests/test_fold_op.py @@ -174,6 +174,15 @@ class TestFoldOpError(unittest.TestCase): x, output_sizes=[6, 6], kernel_sizes=[2, 2], strides=[1, 1]) + def test_output_size_2(): + # out_size must GT 1 + x = paddle.randn(shape=[2, 6, 6], dtype="float32") + out = fold( + x, + output_sizes=[0.1, 0.2], + kernel_sizes=[2, 2], + strides=[1, 1]) + def test_block_h_w(): # test_block_h_w GT 0 x = paddle.randn(shape=[2, 1, 1], dtype="float32") @@ -196,6 +205,7 @@ class TestFoldOpError(unittest.TestCase): self.assertRaises(AssertionError, test_dilations_shape) self.assertRaises(AssertionError, test_strides_shape) self.assertRaises(ValueError, test_output_size) + self.assertRaises(ValueError, test_output_size_2) self.assertRaises(ValueError, test_block_h_w) self.assertRaises(ValueError, test_GT_0) diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index ed668ed124c..9e78ca6be3f 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -351,7 +351,6 @@ def interpolate(x, out_shape = size scale = scale_factor - if out_shape is not None and scale is not None: raise ValueError("Only one of size or scale_factor should be defined.") if out_shape is not None: @@ -362,6 +361,8 @@ def interpolate(x, if in_dynamic_mode(): if isinstance(out_shape, Variable): out_shape = list(out_shape.numpy()) + else: + out_shape = list(out_shape) for i, dim in enumerate(out_shape): if isinstance(dim, Variable): out_shape[i] = dim.numpy()[0] @@ -1818,7 +1819,6 @@ def fold(x, can be calculated as following. .. math:: - H_out &= output_size[0] W_out &= output_size[1] C_out &= C_in / kernel\_sizes[0] / kernel\_sizes[1] @@ -1826,21 +1826,21 @@ def fold(x, Parameters: x(Tensor): 3-D Tensor, input tensor of format [N, C, L], data type can be float32 or float64 - output_sizes(list): The size of output size, should be [output_size_h, output_size_w] + output_sizes(int|list|tuple): The size of output size, should be [output_size_h, output_size_w] or an interger o treated as [o, o]. 
- kernel_sizes(int|list): The size of convolution kernel, should be [k_h, k_w] + kernel_sizes(int|list|tuple): The size of convolution kernel, should be [k_h, k_w] or an integer k treated as [k, k]. - strides(int|list): The strides, should be [stride_h, stride_w] + strides(int|list|tuple): The strides, should be [stride_h, stride_w] or an integer stride treated as [sride, stride]. For default, strides will be [1, 1]. - paddings(int|list): The paddings of each dimension, should be + paddings(int|list|tuple): The paddings of each dimension, should be [padding_top, padding_left, padding_bottom, padding_right] or [padding_h, padding_w] or an integer padding. If [padding_h, padding_w] was given, it will expanded to [padding_h, padding_w, padding_h, padding_w]. If an integer padding was given, [padding, padding, padding, padding] will be used. For default, paddings will be [0, 0, 0, 0] - dilations(int|list): the dilations of convolution kernel, should be + dilations(int|list|tuple): the dilations of convolution kernel, should be [dilation_h, dilation_w], or an integer dilation treated as [dilation, dilation]. For default, it will be [1, 1]. name(str, optional): The default value is None. @@ -1859,9 +1859,9 @@ def fold(x, import paddle import paddle.nn.functional as F - x = paddle.randn([2,12,9]) - y = F.fold(x, output_sizes=(4, 4), kernel_sizes=2) - # y.shape = [2,3,4,4] + x = paddle.randn([2,3*2*2,12]) + y = F.fold(x, output_sizes=[4, 5], kernel_sizes=2) + # y.shape = [2,3,4,5] """ @@ -1872,29 +1872,32 @@ def fold(x, assert len(x.shape) == 3, \ "input should be the format of [N, C, L]" + def _is_list_or_turple_(data): + return (isinstance(data, list) or isinstance(data, tuple)) + if isinstance(output_sizes, int): output_sizes = [output_sizes, output_sizes] else: - assert isinstance(output_sizes, list) and (len(output_sizes) == 2), \ - "output_sizes should either be an integer or a list of two integers" + assert _is_list_or_turple_(output_sizes) and (len(output_sizes) == 2), \ + "output_sizes should either be an integer or a list/tuple of two integers" if isinstance(kernel_sizes, int): kernel_sizes = [kernel_sizes, kernel_sizes] else: - assert isinstance(kernel_sizes, list) and (len(kernel_sizes) == 2), \ - "kernel_sizes should either be an integer or a list of two integers" + assert _is_list_or_turple_(kernel_sizes) and (len(kernel_sizes) == 2), \ + "kernel_sizes should either be an integer or a list/tuple of two integers" if isinstance(strides, int): strides = [strides, strides] else: - assert isinstance(strides, list) and (len(strides) == 2), \ - "strides should either be an integer or a list of two integers" + assert _is_list_or_turple_(strides) and (len(strides) == 2), \ + "strides should either be an integer or a list/tuple of two integers" if isinstance(dilations, int): dilations = [dilations, dilations] else: - assert isinstance(dilations, list) and (len(dilations) == 2), \ - "dilations should either be an integer or a list of two integers" + assert _is_list_or_turple_(dilations) and (len(dilations) == 2), \ + "dilations should either be an integer or a list/tuple of two integers" if isinstance(paddings, int): paddings = [paddings] * 4 @@ -1912,16 +1915,21 @@ def fold(x, "Unexpected type of paddings, it should be either an integer or a list" "of 2 or 4 integers") - out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op( - type="fold", - inputs={"X": x}, - outputs={"Y": out}, - attrs={ - "output_sizes": output_sizes, - "kernel_sizes": kernel_sizes, - "strides": 
strides, - "paddings": paddings, - "dilations": dilations - }) + if in_dynamic_mode(): + out = _C_ops.fold(x, "output_sizes", output_sizes, "kernel_sizes", + kernel_sizes, "strides", strides, "paddings", + paddings, "dilations", dilations) + else: + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type="fold", + inputs={"X": x}, + outputs={"Y": out}, + attrs={ + "output_sizes": output_sizes, + "kernel_sizes": kernel_sizes, + "strides": strides, + "paddings": paddings, + "dilations": dilations + }) return out diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index 19fbcd5b6f8..dac4cf5f272 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -1565,7 +1565,6 @@ class Fold(Layer): can be calculated as following. .. math:: - H_out &= output_size[0] W_out &= output_size[1] C_out &= C_in / kernel\_sizes[0] / kernel\_sizes[1] @@ -1573,19 +1572,19 @@ class Fold(Layer): Parameters: output_sizes(list): The size of output size, should be [output_size_h, output_size_w] or an interger o treated as [o, o]. - kernel_sizes(int|list): The size of convolution kernel, should be [k_h, k_w] + kernel_sizes(int|list|tuple): The size of convolution kernel, should be [k_h, k_w] or an integer k treated as [k, k]. - strides(int|list): The strides, should be [stride_h, stride_w] + strides(int|list|tuple): The strides, should be [stride_h, stride_w] or an integer stride treated as [sride, stride]. For default, strides will be [1, 1]. - paddings(int|list): The paddings of each dimension, should be + paddings(int|list|tuple): The paddings of each dimension, should be [padding_top, padding_left, padding_bottom, padding_right] or [padding_h, padding_w] or an integer padding. If [padding_h, padding_w] was given, it will expanded to [padding_h, padding_w, padding_h, padding_w]. If an integer padding was given, [padding, padding, padding, padding] will be used. For default, paddings will be [0, 0, 0, 0] - dilations(int|list): the dilations of convolution kernel, should be + dilations(int|list|tuple): the dilations of convolution kernel, should be [dilation_h, dilation_w], or an integer dilation treated as [dilation, dilation]. For default, it will be [1, 1]. name(str, optional): The default value is None. @@ -1604,10 +1603,10 @@ class Fold(Layer): import paddle import paddle.nn as nn - x = paddle.randn([2,12,9]) - fold = nn.Fold(output_sizes=(4, 4), kernel_sizes=2) + x = paddle.randn([2,3*2*2,12]) + fold = nn.Fold(output_sizes=[4, 5], kernel_sizes=2) y = fold(x) - # y.shape = [2,3,4,4] + # y.shape = [2,3,4,5] """ def __init__(self, -- GitLab From 2ce007cae0a2307997d8ffc43292fd505246e36b Mon Sep 17 00:00:00 2001 From: Kaipeng Deng Date: Tue, 8 Mar 2022 17:41:29 +0800 Subject: [PATCH 116/261] remove isinstance Dataset check. 
test=develop (#40184) --- python/paddle/fluid/dataloader/batch_sampler.py | 2 -- python/paddle/fluid/reader.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/python/paddle/fluid/dataloader/batch_sampler.py b/python/paddle/fluid/dataloader/batch_sampler.py index 3debeecfe4f..3a23c852563 100644 --- a/python/paddle/fluid/dataloader/batch_sampler.py +++ b/python/paddle/fluid/dataloader/batch_sampler.py @@ -113,8 +113,6 @@ class BatchSampler(Sampler): assert not shuffle, "shuffle should be False when sampler is set" self.sampler = sampler else: - assert isinstance(dataset, Dataset), \ - "dataset should be a paddle.io.Dataset" assert not isinstance(dataset, IterableDataset), \ "dataset should not be a paddle.io.IterableDataset" assert sampler is None, \ diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index 727ceca72d1..cbea289162c 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -332,8 +332,6 @@ class DataLoader(object): self.use_buffer_reader = use_buffer_reader self.worker_init_fn = worker_init_fn - assert isinstance(dataset, Dataset), \ - "dataset should be subclass instance of paddle.io.Dataset" self.dataset = dataset if not return_list and not in_dygraph_mode(): -- GitLab From 9aa6bfc7e1cfce657109789995d153b6bcdf74d7 Mon Sep 17 00:00:00 2001 From: Kaipeng Deng Date: Tue, 8 Mar 2022 17:42:29 +0800 Subject: [PATCH 117/261] fix yolov3 return value in dygraph mode. test=develop (#40185) --- python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py | 1 + python/paddle/vision/ops.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py index 1ec1d1527e1..3f0e4f7a400 100644 --- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py @@ -305,6 +305,7 @@ class TestYolov3LossDygraph(unittest.TestCase): use_label_smooth=True, scale_x_y=1.) 
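        # With gt_score left as None, the dygraph path calls _C_ops.yolov3_loss
        # directly; after this fix only the first of its three outputs (the
        # per-sample loss) is returned, which the added shape assert below checks.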
assert loss is not None + assert loss.shape == [2] paddle.enable_static() diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 03060e92bdb..4983ca49ac3 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -195,7 +195,7 @@ def yolo_loss(x, """ if in_dygraph_mode() and gt_score is None: - loss = _C_ops.yolov3_loss( + loss, _, _ = _C_ops.yolov3_loss( x, gt_box, gt_label, 'anchors', anchors, 'anchor_mask', anchor_mask, 'class_num', class_num, 'ignore_thresh', ignore_thresh, 'downsample_ratio', downsample_ratio, 'use_label_smooth', -- GitLab From 3a77d027b143b19a9c26bdc7e77e0902ff2a7feb Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 8 Mar 2022 20:18:03 +0800 Subject: [PATCH 118/261] [Phi] Remove gpudnn suffix & polish cmake (#40239) * remove gpudnn suffix & polish cmake * fix typo --- cmake/phi.cmake | 122 +++++++++--------- ...nel_gpudnn.cu => conv_grad_grad_kernel.cu} | 0 ...d_kernel_gpudnn.cu => conv_grad_kernel.cu} | 0 .../{conv_kernel_gpudnn.cu => conv_kernel.cu} | 0 ...ernel_gpudnn.cu => softmax_grad_kernel.cu} | 0 ...max_kernel_gpudnn.cu => softmax_kernel.cu} | 0 6 files changed, 59 insertions(+), 63 deletions(-) rename paddle/phi/kernels/gpudnn/{conv_grad_grad_kernel_gpudnn.cu => conv_grad_grad_kernel.cu} (100%) rename paddle/phi/kernels/gpudnn/{conv_grad_kernel_gpudnn.cu => conv_grad_kernel.cu} (100%) rename paddle/phi/kernels/gpudnn/{conv_kernel_gpudnn.cu => conv_kernel.cu} (100%) rename paddle/phi/kernels/gpudnn/{softmax_grad_kernel_gpudnn.cu => softmax_grad_kernel.cu} (100%) rename paddle/phi/kernels/gpudnn/{softmax_kernel_gpudnn.cu => softmax_kernel.cu} (100%) diff --git a/cmake/phi.cmake b/cmake/phi.cmake index f6e15758379..ebb686d8ad0 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -134,8 +134,8 @@ function(kernel_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) endif() - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) - list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu) + list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu) endif() endif() if (WITH_XPU) @@ -197,92 +197,88 @@ function(kernel_library TARGET) # kernel source file level # level 1: base device kernel - # - cpu_srcs / gpu_srcs / xpu_srcs / kps_srcs + # - cpu_srcs / gpu_srcs / xpu_srcs / gpudnn_srcs / kps_srcs # level 2: device-independent kernel # - common_srcs # level 3: Kernel implemented by reusing device-independent kernel # - selected_rows_srcs + set(base_device_kernels) + set(device_independent_kernel) + set(high_level_kernels) - # Build Target according different src organization - if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR - ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) AND - (${common_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0)) - # If the common_srcs/selected_rows_srcs depends on specific device srcs, build target using this rule. + # 1. 
Base device kernel compile + if (${cpu_srcs_len} GREATER 0) + cc_library(${TARGET}_cpu SRCS ${cpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + list(APPEND base_device_kernels ${TARGET}_cpu) + endif() + if (${gpu_srcs_len} GREATER 0) if (WITH_GPU) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - nv_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - nv_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() + nv_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) elseif (WITH_ROCM) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - hip_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() - elseif (WITH_XPU_KP) - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) - xpu_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - xpu_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() - else() - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) - cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - cc_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() + hip_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() - # If there are only specific device srcs, build target using this rule. - elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) + list(APPEND base_device_kernels ${TARGET}_gpu) + endif() + if (${xpu_srcs_len} GREATER 0) + cc_library(${TARGET}_xpu SRCS ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + list(APPEND base_device_kernels ${TARGET}_xpu) + endif() + if (${gpudnn_srcs_len} GREATER 0) if (WITH_GPU) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() + nv_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) elseif (WITH_ROCM) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() - elseif (WITH_XPU_KP) - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) - xpu_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() - else() - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) - cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() + hip_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() - # If the selected_rows_srcs depends on common_srcs, build target using this rule. 
- elseif (${common_srcs_len} GREATER 0 AND ${selected_rows_srcs_len} GREATER 0) + list(APPEND base_device_kernels ${TARGET}_gpudnn) + endif() + if (${kps_srcs_len} GREATER 0) + # only when WITH_XPU_KP, the kps_srcs_len can be > 0 + xpu_library(${TARGET}_kps SRCS ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + list(APPEND base_device_kernels ${TARGET}_kps) + endif() + + # 2. Device-independent kernel compile + if (${common_srcs_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + nv_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) elseif (WITH_ROCM) - hip_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + hip_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) elseif (WITH_XPU_KP) - xpu_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + xpu_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) else() - cc_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + cc_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) endif() - # If there are only common_srcs or selected_rows_srcs, build target using below rules. - elseif (${common_srcs_len} GREATER 0) + list(APPEND device_independent_kernel ${TARGET}_common) + endif() + + # 3. Reusing kernel compile + if (${selected_rows_srcs_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + nv_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) elseif (WITH_ROCM) - hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + hip_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) elseif (WITH_XPU_KP) - xpu_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + xpu_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) else() - cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + cc_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) endif() - elseif (${selected_rows_srcs_len} GREATER 0) + list(APPEND high_level_kernels ${TARGET}_sr) + endif() + + # 4. 
Unify target compile + list(LENGTH base_device_kernels base_device_kernels_len) + list(LENGTH device_independent_kernel device_independent_kernel_len) + list(LENGTH high_level_kernels high_level_kernels_len) + if (${base_device_kernels_len} GREATER 0 OR ${device_independent_kernel_len} GREATER 0 OR + ${high_level_kernels_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + nv_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) elseif (WITH_ROCM) - hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + hip_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) elseif (WITH_XPU_KP) - xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + xpu_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) else() - cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + cc_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) endif() else() set(target_build_flag 0) diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu similarity index 100% rename from paddle/phi/kernels/gpudnn/conv_grad_grad_kernel_gpudnn.cu rename to paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu similarity index 100% rename from paddle/phi/kernels/gpudnn/conv_grad_kernel_gpudnn.cu rename to paddle/phi/kernels/gpudnn/conv_grad_kernel.cu diff --git a/paddle/phi/kernels/gpudnn/conv_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu similarity index 100% rename from paddle/phi/kernels/gpudnn/conv_kernel_gpudnn.cu rename to paddle/phi/kernels/gpudnn/conv_kernel.cu diff --git a/paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/softmax_grad_kernel.cu similarity index 100% rename from paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu rename to paddle/phi/kernels/gpudnn/softmax_grad_kernel.cu diff --git a/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/softmax_kernel.cu similarity index 100% rename from paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu rename to paddle/phi/kernels/gpudnn/softmax_kernel.cu -- GitLab From c1d81ec13cec96729f3902455e1038eb6e6280cf Mon Sep 17 00:00:00 2001 From: chenjian Date: Tue, 8 Mar 2022 21:12:04 +0800 Subject: [PATCH 119/261] Add profiler statistic (#40249) * add python profiler package * update according to review * fix bug * fix bug * fix bug * add unit test * Revert "add unit test" This reverts commit 4e69ff71b0645e069afe5dd8fea0d07717852c48. 
* reduce for pr * add unit test * modify for pr * fix unittest * update for ci coverage * modify according to review * fix bug * improve coverage * add profiler code * add statistic code * reduce content for pr --- .../unittests/test_profiler_statistic.py | 199 +++++ python/paddle/profiler/profiler_statistic.py | 793 ++++++++++++++++++ 2 files changed, 992 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_profiler_statistic.py mode change 100644 => 100755 python/paddle/profiler/profiler_statistic.py diff --git a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py new file mode 100644 index 00000000000..838ccae37cf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py @@ -0,0 +1,199 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import paddle +import paddle.profiler as profiler + + +class HostPythonNode: + def __init__(self, name, type, start_ns, end_ns, process_id, thread_id): + self.name = name + self.type = type + self.start_ns = start_ns + self.end_ns = end_ns + self.process_id = process_id + self.thread_id = thread_id + self.children_node = [] + self.runtime_node = [] + self.device_node = [] + + +class DevicePythonNode: + def __init__(self, name, type, start_ns, end_ns, device_id, context_id, + stream_id): + self.name = name + self.type = type + self.start_ns = start_ns + self.end_ns = end_ns + self.device_id = device_id + self.context_id = context_id + self.stream_id = stream_id + + +class TestProfilerStatistic(unittest.TestCase): + def test_statistic_case1(self): + root_node = HostPythonNode('Root Node', + profiler.TracerEventType.UserDefined, 0, + float('inf'), 1000, 1001) + profilerstep_node = HostPythonNode('ProfileStep#1', + profiler.TracerEventType.ProfileStep, + 0, 400, 1000, 1001) + dataloader_node = HostPythonNode( + 'Dataloader', profiler.TracerEventType.Forward, 5, 15, 1000, 1001) + mobilenet_node = HostPythonNode( + 'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001) + yolonet_node = HostPythonNode( + 'Yolov3Net', profiler.TracerEventType.Forward, 50, 100, 1000, 1001) + backward_node = HostPythonNode('Gradient Backward', + profiler.TracerEventType.Backward, 120, + 200, 1000, 1001) + optimization_node = HostPythonNode( + 'Optimization', profiler.TracerEventType.Optimization, 220, 300, + 1000, 1001) + conv2d_node = HostPythonNode( + 'conv2d', profiler.TracerEventType.Operator, 25, 40, 1000, 1001) + sync_batch_norm_node = HostPythonNode('sync_batch_norm', + profiler.TracerEventType.Operator, + 60, 100, 1000, 1001) + conv2d_infer_shape = HostPythonNode( + 'conv2d::infer_shape', profiler.TracerEventType.OperatorInner, 25, + 30, 1000, 1001) + conv2d_compute = HostPythonNode('conv2d::compute', + profiler.TracerEventType.OperatorInner, + 30, 40, 1000, 1001) + conv2d_launchkernel = HostPythonNode( + 'cudalaunchkernel', 
profiler.TracerEventType.CudaRuntime, 30, 35, + 1000, 1001) + conv2d_MemCpy = HostPythonNode('AsyncMemcpy', + profiler.TracerEventType.UserDefined, 35, + 40, 1000, 1001) + conv2d_cudaMemCpy = HostPythonNode('cudaMemcpy', + profiler.TracerEventType.CudaRuntime, + 35, 40, 1000, 1001) + conv2d_kernel = DevicePythonNode( + 'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 50, 0, 0, 0) + conv2d_memcpy = DevicePythonNode( + 'conv2d_memcpy', profiler.TracerEventType.Memcpy, 50, 60, 0, 0, 0) + sync_batch_norm_infer_shape = HostPythonNode( + 'sync_batch_norm::infer_shape', + profiler.TracerEventType.OperatorInner, 60, 70, 1000, 1001) + sync_batch_norm_compute = HostPythonNode( + 'sync_batch_norm::compute', profiler.TracerEventType.OperatorInner, + 80, 100, 1000, 1001) + sync_batch_norm_launchkernel = HostPythonNode( + 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 80, 90, + 1000, 1001) + sync_batch_norm_MemCpy = HostPythonNode( + 'AsyncMemcpy', profiler.TracerEventType.UserDefined, 90, 100, 1000, + 1001) + sync_batch_norm_cudaMemCpy = HostPythonNode( + 'cudaMemcpy', profiler.TracerEventType.CudaRuntime, 90, 100, 1000, + 1001) + sync_batch_norm_kernel = DevicePythonNode( + 'sync_batch_norm_kernel', profiler.TracerEventType.Kernel, 95, 155, + 0, 0, 0) + sync_batch_norm_memcpy = DevicePythonNode( + 'sync_batch_norm_memcpy', profiler.TracerEventType.Memcpy, 150, 200, + 0, 0, 1) + root_node.children_node.append(profilerstep_node) + profilerstep_node.children_node.extend([ + dataloader_node, mobilenet_node, yolonet_node, backward_node, + optimization_node + ]) + mobilenet_node.children_node.append(conv2d_node) + yolonet_node.children_node.append(sync_batch_norm_node) + conv2d_node.children_node.extend( + [conv2d_infer_shape, conv2d_compute, conv2d_MemCpy]) + conv2d_compute.runtime_node.append(conv2d_launchkernel) + conv2d_MemCpy.runtime_node.append(conv2d_cudaMemCpy) + conv2d_launchkernel.device_node.append(conv2d_kernel) + conv2d_cudaMemCpy.device_node.append(conv2d_memcpy) + sync_batch_norm_node.children_node.extend([ + sync_batch_norm_infer_shape, sync_batch_norm_compute, + sync_batch_norm_MemCpy + ]) + sync_batch_norm_compute.runtime_node.append( + sync_batch_norm_launchkernel) + sync_batch_norm_MemCpy.runtime_node.append(sync_batch_norm_cudaMemCpy) + sync_batch_norm_launchkernel.device_node.append(sync_batch_norm_kernel) + sync_batch_norm_cudaMemCpy.device_node.append(sync_batch_norm_memcpy) + thread_tree = {'thread1001': root_node} + extra_info = { + 'Process Cpu Utilization': '1.02', + 'System Cpu Utilization': '0.68' + } + statistic_data = profiler.profiler_statistic.StatisticData(thread_tree, + extra_info) + time_range_summary = statistic_data.time_range_summary + event_summary = statistic_data.event_summary + + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.ProfileStep), 400) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.Forward), 90) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.Backward), 80) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.Optimization), 80) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.Operator), 55) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.OperatorInner), 45) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.CudaRuntime), 30) + self.assertEqual( + 
time_range_summary.get_gpu_range_sum( + 0, profiler.TracerEventType.Kernel), 75) + self.assertEqual( + time_range_summary.get_gpu_range_sum( + 0, profiler.TracerEventType.Memcpy), 60) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.UserDefined), 15) + self.assertEqual(len(event_summary.items), 2) + self.assertEqual(len(event_summary.userdefined_items), 0) + self.assertEqual(len(event_summary.model_perspective_items), 3) + self.assertEqual(len(event_summary.memory_manipulation_items), 1) + self.assertEqual(event_summary.items['conv2d'].cpu_time, 15) + self.assertEqual(event_summary.items['conv2d'].gpu_time, 25) + self.assertEqual( + event_summary.model_perspective_items['Forward'].cpu_time, 90) + self.assertEqual( + event_summary.model_perspective_items['Forward'].gpu_time, 135) + self.assertEqual( + event_summary.model_perspective_items['Backward'].gpu_time, 0) + self.assertEqual( + event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15) + self.assertEqual( + event_summary.memory_manipulation_items['AsyncMemcpy'].gpu_time, 60) + print( + profiler.profiler_statistic._build_table( + statistic_data, + sorted_by=profiler.SortedKeys.CPUTotal, + op_detail=True, + thread_sep=False, + time_unit='ms')) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py old mode 100644 new mode 100755 index 29d586268a0..e39871c7365 --- a/python/paddle/profiler/profiler_statistic.py +++ b/python/paddle/profiler/profiler_statistic.py @@ -16,6 +16,20 @@ from enum import Enum from paddle.fluid.core import TracerEventType +from .statistic_helper import * + +_AllTracerEventType = [ + TracerEventType.Operator, TracerEventType.Dataloader, + TracerEventType.ProfileStep, TracerEventType.CudaRuntime, + TracerEventType.Kernel, TracerEventType.Memcpy, TracerEventType.Memset, + TracerEventType.UserDefined, TracerEventType.OperatorInner, + TracerEventType.Forward, TracerEventType.Backward, + TracerEventType.Optimization, TracerEventType.Communication, + TracerEventType.PythonOp, TracerEventType.PythonUserDefined +] + +_CommunicationOpName = ['reduce', 'broadcast', 'rpc'] + class SortedKeys(Enum): r""" @@ -29,3 +43,782 @@ class SortedKeys(Enum): GPUAvg = 5 GPUMax = 6 GPUMin = 7 + + +class HostStatisticNode: + r''' + Wrap original node for calculating statistic metrics. 
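    cpu_time and gpu_time are inclusive: cpu_time is the node's wall duration,
    and gpu_time sums the device activity of its children, its correlated
    runtime calls and its own device nodes, while the self_* fields are meant
    to exclude the spans of children and runtime nodes (see cal_statistic).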
+ ''' + + def __init__(self, hostnode): + self.hostnode = hostnode + self.children_node = [] + self.runtime_node = [] + self.cpu_time = 0 + self.self_cpu_time = 0 + self.gpu_time = 0 + self.self_gpu_time = 0 + + def cal_statistic(self): + for child in self.children_node: + child.cal_statistic() + for rt in self.runtime_node: + rt.cal_statistic() + + self.cpu_time = self.hostnode.end_ns - self.hostnode.start_ns + for child in self.children_node: + self.gpu_time += child.gpu_time + self.self_cpu_time -= (child.end_ns - child.start_ns) + for rt in self.runtime_node: + self.self_cpu_time -= (rt.end_ns - rt.start_ns) + self.gpu_time += rt.gpu_time + self.self_gpu_time += rt.gpu_time + for device in self.hostnode.device_node: + self.gpu_time += (device.end_ns - device.start_ns) + self.self_gpu_time += (device.end_ns - device.start_ns) + + @property + def end_ns(self): + return self.hostnode.end_ns + + @property + def start_ns(self): + return self.hostnode.start_ns + + def __getattr__(self, name): + return getattr(self.hostnode, name) + + +def traverse_tree(nodetrees): + results = collections.defaultdict(list) + for thread_id, rootnode in nodetrees.items(): + stack = [] + stack.append(rootnode) + threadlist = results[thread_id] + while stack: + current_node = stack.pop() + threadlist.append(current_node) + for childnode in current_node.children_node: + stack.append(childnode) + return results + + +def wrap_tree(nodetrees): + ''' + Using HostStatisticNode to wrap original profiler result tree, and calculate node statistic metrics. + ''' + node_statistic_tree = {} + results = collections.defaultdict(list) + newresults = collections.defaultdict(list) + for thread_id, rootnode in nodetrees.items(): + stack = [] + stack.append(rootnode) + root_statistic_node = HostStatisticNode(rootnode) + newstack = [] + newstack.append(root_statistic_node) + node_statistic_tree[thread_id] = root_statistic_node + threadlist = results[thread_id] + newthreadlist = newresults[thread_id] + while stack: + current_node = stack.pop() + threadlist.append(current_node) + current_statistic_node = newstack.pop() + newthreadlist.append(current_statistic_node) + for childnode in current_node.children_node: + stack.append(childnode) + child_statistic_node = HostStatisticNode(childnode) + current_statistic_node.children_node.append( + child_statistic_node) + newstack.append(child_statistic_node) + for runtimenode in current_node.runtime_node: + runtime_statistic_node = HostStatisticNode(runtimenode) + current_statistic_node.runtime_node.append( + runtime_statistic_node) + # recursive calculate node statistic values + for thread_id, root_statistic_node in node_statistic_tree.items(): + root_statistic_node.cal_statistic() + + return node_statistic_tree, newresults + + +class TimeRangeSummary: + r""" + Analyse time ranges for each TracerEventType, and summarize the time. + """ + + def __init__(self): + self.CPUTimeRange = collections.defaultdict(list) + self.GPUTimeRange = collections.defaultdict( + lambda: collections.defaultdict(list) + ) # GPU events should be divided into different devices + self.CPUTimeRangeSum = collections.defaultdict(int) + self.GPUTimeRangeSum = collections.defaultdict( + lambda: collections.defaultdict(int)) + self.call_times = collections.defaultdict(int) + + def parse(self, nodetrees): + r""" + Analysis node trees in profiler result, and get time range for different tracer event type. 
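        CPU ranges are merged per event type; GPU ranges are grouped by device,
        event type and stream, merged within each stream and then across
        streams, so the reported sums measure covered wall time rather than a
        naive total of overlapping durations.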
+ """ + thread2hostnodes = traverse_tree(nodetrees) + for threadid, hostnodes in thread2hostnodes.items(): + CPUTimeRange = collections.defaultdict(list) + GPUTimeRange = collections.defaultdict( + lambda: collections.defaultdict(lambda: collections.defaultdict(list)) + ) # device_id/type/stream_id + for hostnode in hostnodes[1:]: #skip root node + CPUTimeRange[hostnode.type].append( + (hostnode.start_ns, hostnode.end_ns)) + self.call_times[hostnode.type] += 1 + if hostnode.type == TracerEventType.Operator and any( + [name in hostnode.name for name in + _CommunicationOpName]): # special case, communication op + CPUTimeRange[TracerEventType.Communication].append( + (hostnode.start_ns, hostnode.end_ns)) + self.call_times[TracerEventType.Communication] += 1 + is_communication_node = ( + hostnode.type == TracerEventType.Communication + ) or (hostnode.type == TracerEventType.Operator and any( + [name in hostnode.name for name in _CommunicationOpName])) + for runtimenode in hostnode.runtime_node: + CPUTimeRange[runtimenode.type].append( + (runtimenode.start_ns, runtimenode.end_ns)) + self.call_times[runtimenode.type] += 1 + for devicenode in runtimenode.device_node: + GPUTimeRange[devicenode.device_id][devicenode.type][ + devicenode.stream_id].append( + (devicenode.start_ns, devicenode.end_ns)) + self.call_times[devicenode.type] += 1 + if is_communication_node: # gpu activity for communication node + GPUTimeRange[devicenode.device_id][ + TracerEventType.Communication][ + devicenode.stream_id].append(( + devicenode.start_ns, devicenode.end_ns)) + self.call_times[TracerEventType.Communication] += 1 + + for event_type, time_ranges in CPUTimeRange.items(): + time_ranges = merge_self_ranges(time_ranges, is_sorted=False) + self.CPUTimeRange[event_type] = merge_ranges( + self.CPUTimeRange[event_type], time_ranges, is_sorted=True) + for device_id, device_time_ranges in GPUTimeRange.items(): + for event_type, event_time_ranges in device_time_ranges.items(): + for stream_id, time_ranges in event_time_ranges.items(): + time_ranges = merge_self_ranges( + time_ranges, is_sorted=False) + self.GPUTimeRange[device_id][event_type] = merge_ranges( + self.GPUTimeRange[device_id][event_type], + time_ranges, + is_sorted=True) + + for event_type, time_ranges in self.CPUTimeRange.items(): + self.CPUTimeRangeSum[event_type] = sum_ranges(time_ranges) + for device_id, device_time_ranges in self.GPUTimeRange.items(): + for event_type, time_ranges in device_time_ranges.items(): + self.GPUTimeRangeSum[device_id][event_type] = sum_ranges( + time_ranges) + + def get_gpu_devices(self): + return self.GPUTimeRange.keys() + + def get_gpu_range_sum(self, device_id, event_type): + return self.GPUTimeRangeSum[device_id][event_type] + + def get_cpu_range_sum(self, event_type): + return self.CPUTimeRangeSum[event_type] + + +class EventSummary: + r""" + Analyse operator event in profiling data, correlate with its device event. 
+ """ + + class DeviceItem: + def __init__(self, name): + self.name = name + self.call = 0 + self.gpu_time = 0 + self.max_gpu_time = 0 + self.min_gpu_time = float('inf') + + @property + def avg_gpu_time(self): + return self.gpu_time / self.call + + def add_gpu_time(self, time): + if time > self.max_gpu_time: + self.max_gpu_time = time + if time < self.min_gpu_time: + self.min_gpu_time = time + self.gpu_time += time + + def add_item(self, node): + self.call += 1 + self.add_gpu_time(node.end_ns - node.start_ns) + + class OperatorItem: + def __init__(self, name): + self.name = name + self.call = 0 + self.cpu_time = 0 + self.gpu_time = 0 + self.max_cpu_time = 0 + self.min_cpu_time = float('inf') + self.max_gpu_time = 0 + self.min_gpu_time = float('inf') + self.devices = {} + self.operator_inners = {} + + @property + def avg_cpu_time(self): + return self.cpu_time / self.call + + @property + def avg_gpu_time(self): + return self.gpu_time / self.call + + def add_cpu_time(self, time): + if time > self.max_cpu_time: + self.max_cpu_time = time + if time < self.min_cpu_time: + self.min_cpu_time = time + self.cpu_time += time + + def add_gpu_time(self, time): + if time > self.max_gpu_time: + self.max_gpu_time = time + if time < self.min_gpu_time: + self.min_gpu_time = time + self.gpu_time += time + + def add_call(self): + self.call += 1 + + def add_item(self, node): + self.add_call() + self.add_cpu_time(node.cpu_time) + self.add_gpu_time(node.gpu_time) + for child in node.children_node: + if child.name not in self.operator_inners: + self.operator_inners[ + child.name] = EventSummary.OperatorItem(child.name) + self.operator_inners[child.name].add_item(child) + + for runtimenode in node.runtime_node: + for devicenode in runtimenode.device_node: + if devicenode.name not in self.devices: + self.devices[devicenode.name] = EventSummary.DeviceItem( + devicenode.name) + self.devices[devicenode.name].add_item(devicenode) + + class GeneralItem: + def __init__(self, name): + self.name = name + self.call = 0 + self.cpu_time = 0 + self.max_cpu_time = 0 + self.min_cpu_time = float('inf') + self.gpu_time = 0 + self.max_gpu_time = 0 + self.min_gpu_time = float('inf') + + @property + def avg_cpu_time(self): + return self.cpu_time / self.call + + @property + def avg_gpu_time(self): + return self.gpu_time / self.call + + def add_cpu_time(self, time): + if time > self.max_cpu_time: + self.max_cpu_time = time + if time < self.min_cpu_time: + self.min_cpu_time = time + self.cpu_time += time + + def add_gpu_time(self, time): + if time > self.max_gpu_time: + self.max_gpu_time = time + if time < self.min_gpu_time: + self.min_gpu_time = time + self.gpu_time += time + + def add_call(self): + self.call += 1 + + def add_item(self, node): + self.add_call() + self.add_cpu_time(node.cpu_time) + self.add_gpu_time(node.gpu_time) + + def __init__(self): + self.items = {} # for operator summary + self.thread_items = collections.defaultdict( + dict) # for operator summary + self.userdefined_items = {} # for userdefined summary + self.userdefined_thread_items = collections.defaultdict( + dict) # for userdefined summary + self.model_perspective_items = {} # for model summary + self.memory_manipulation_items = {} # for memory manipulation summary + + def parse(self, nodetrees): + r""" + Analysis operator event in the nodetress. 
+ """ + node_statistic_trees, thread2host_statistic_nodes = wrap_tree(nodetrees) + for threadid, host_statistic_nodes in thread2host_statistic_nodes.items( + ): + for host_statistic_node in host_statistic_nodes[ + 1:]: #skip root node + if host_statistic_node.type == TracerEventType.Operator: + self.add_operator_item(host_statistic_node) + if host_statistic_node.type == TracerEventType.UserDefined\ + or host_statistic_node.type == TracerEventType.PythonUserDefined: + if 'memcpy' in host_statistic_node.name.lower() or 'memorycopy' in host_statistic_node.name.lower()\ + or 'memset' in host_statistic_node.name.lower(): + self.add_memory_manipulation_item(host_statistic_node) + else: + self.add_userdefined_item(host_statistic_node) + + for threadid, root_statistic_node in node_statistic_trees.items(): + deque = collections.deque() + deque.append(root_statistic_node) + while deque: + current_node = deque.popleft() + for child in current_node.children_node: + if child.type == TracerEventType.Forward or child.type == TracerEventType.Dataloader\ + or child.type == TracerEventType.Backward or child.type == TracerEventType.Optimization: + self.add_model_perspective_item( + child) #find first model perspective node + else: + deque.append(child) + + def add_operator_item(self, operator_node): + if operator_node.name not in self.items: + self.items[operator_node.name] = EventSummary.OperatorItem( + operator_node.name) + + self.items[operator_node.name].add_item(operator_node) + + if operator_node.name not in self.thread_items[operator_node.thread_id]: + self.thread_items[operator_node.thread_id][ + operator_node.name] = EventSummary.OperatorItem( + operator_node.name) + self.thread_items[operator_node.thread_id][operator_node.name].add_item( + operator_node) + + def add_userdefined_item(self, userdefined_node): + if userdefined_node.name not in self.userdefined_items: + self.userdefined_items[ + userdefined_node.name] = EventSummary.GeneralItem( + userdefined_node.name) + + self.userdefined_items[userdefined_node.name].add_item(userdefined_node) + + if userdefined_node.name not in self.userdefined_thread_items[ + userdefined_node.thread_id]: + self.userdefined_thread_items[userdefined_node.thread_id][ + userdefined_node.name] = EventSummary.GeneralItem( + userdefined_node.name) + self.userdefined_thread_items[userdefined_node.thread_id][ + userdefined_node.name].add_item(userdefined_node) + + def add_memory_manipulation_item(self, memory_manipulation_node): + if memory_manipulation_node.name not in self.memory_manipulation_items: + self.memory_manipulation_items[ + memory_manipulation_node.name] = EventSummary.GeneralItem( + memory_manipulation_node.name) + self.memory_manipulation_items[memory_manipulation_node.name].add_item( + memory_manipulation_node) + + def add_model_perspective_item(self, model_perspective_node): + if model_perspective_node.type == TracerEventType.Forward: + name = 'Forward' + elif model_perspective_node.type == TracerEventType.Backward: + name = 'Backward' + elif model_perspective_node.type == TracerEventType.Optimization: + name = 'Optimization' + elif model_perspective_node.type == TracerEventType.Dataloader: + name = 'Dataloader' + else: + return + if name not in self.model_perspective_items: + self.model_perspective_items[name] = EventSummary.GeneralItem(name) + self.model_perspective_items[name].add_item(model_perspective_node) + + +class StatisticData: + r""" + Hold all analysed results. 
+ """ + + def __init__(self, node_trees, extra_info): + self.node_trees = node_trees + self.extra_info = extra_info + self.time_range_summary = TimeRangeSummary() + self.event_summary = EventSummary() + self.time_range_summary.parse(node_trees) + self.event_summary.parse(node_trees) + + +def _build_table(statistic_data, + sorted_by=SortedKeys.CPUTotal, + op_detail=True, + thread_sep=False, + time_unit='ms', + row_limit=100, + max_src_column_width=75): + """Prints a summary of events.""" + # format table row + SPACING_SIZE = 2 + row_format_list = [""] + header_sep_list = [""] + line_length_list = [-SPACING_SIZE] + + def add_column(padding, text_dir='<'): + row_format_list[0] += '{: ' + text_dir + str(padding) + '}' + ( + ' ' * SPACING_SIZE) + header_sep_list[0] += '-' * padding + (' ' * SPACING_SIZE) + line_length_list[0] += padding + SPACING_SIZE + + def add_title(padding, text): + left_length = padding - len(text) + half = left_length // 2 + return '-' * half + text + '-' * (left_length - half) + + result = [] + + def append(s): + result.append(s) + result.append('\n') + + def format_time(time, unit='ms', indent=0): + r""" + Transform time in ns to time in unit. + """ + if time == float('inf'): + return '-' + else: + result = float(time) + if unit == 's': + result /= 1e9 + elif unit == 'ms': + result /= 1e6 + elif unit == 'us': + result /= 1e3 + return '{}{:.2f}'.format(' ' * indent, result) + + def format_ratio(ratio, indent=0): + r""" + Transform ratio within [0, 1] to percentage presentation. + """ + return '{}{:.2f}'.format(' ' * indent, ratio * 100) + + total_time = statistic_data.time_range_summary.get_cpu_range_sum( + TracerEventType.ProfileStep) + ###### Print Device Summary ###### + headers = ['Device', 'Utilization (%)'] + name_column_width = 30 + DEFAULT_COLUMN_WIDTH = 20 + add_column(name_column_width) + for _ in headers[1:]: + add_column(DEFAULT_COLUMN_WIDTH) + + row_format = row_format_list[0] + header_sep = header_sep_list[0] + line_length = line_length_list[0] + + # construct table string + + append(add_title(line_length, "Device Summary")) + append('Time unit: {}'.format(time_unit)) + append(header_sep) + append(row_format.format(*headers)) + append(header_sep) + row_values = [ + 'CPU(Process)', format_ratio( + float(statistic_data.extra_info['Process Cpu Utilization'])) + ] + append(row_format.format(*row_values)) + row_values = [ + 'CPU(System)', format_ratio( + float(statistic_data.extra_info['System Cpu Utilization'])) + ] + append(row_format.format(*row_values)) + for gpu_name in statistic_data.time_range_summary.get_gpu_devices(): + gpu_time = float( + statistic_data.time_range_summary.get_gpu_range_sum( + gpu_name, TracerEventType.Kernel)) + utilization = gpu_time / total_time + row_values = ['GPU{}'.format(gpu_name), format_ratio(utilization)] + append(row_format.format(*row_values)) + + append(header_sep) + append( + "Note:\nCPU(Process) Utilization = Current process CPU time over all cpu cores / elapsed time, so max utilization can be reached 100% * number of cpu cores.\n" + "CPU(System) Utilization = All processes CPU time over all cpu cores(busy time) / (busy time + idle time).\n" + "GPU Utilization = Current process GPU time / elapsed time") + append('-' * line_length) + append('') + append('') + + if total_time == 0: + return ''.join(result) + + ###### Print Overview Summary ###### + headers = ['Event Type', 'CPU Time', 'Ratio (%)'] + row_format_list = [""] + header_sep_list = [""] + line_length_list = [-SPACING_SIZE] + + DEFAULT_COLUMN_WIDTH = 25 + for _ in 
headers: + add_column(DEFAULT_COLUMN_WIDTH) + + row_format = row_format_list[0] + header_sep = header_sep_list[0] + line_length = line_length_list[0] + + # construct table string + append(add_title(line_length, "Overview Summary")) + append('Time unit: {}'.format(time_unit)) + append(header_sep) + append(row_format.format(*headers)) + append(header_sep) + row_values = [ + 'Total Time', format_time( + total_time, unit=time_unit), format_ratio(1) + ] + append(row_format.format(*row_values)) + cpu_type_time = collections.defaultdict(int) + gpu_type_time = collections.defaultdict(int) + for event_type, value in statistic_data.time_range_summary.CPUTimeRangeSum.items( + ): + cpu_type_time[event_type] = value + + gpu_time_range = collections.defaultdict(list) + for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items( + ): + for event_type, time_range in device_time_ranges.items(): + gpu_time_range[event_type] = merge_ranges( + gpu_time_range[event_type], time_range, is_sorted=True) + for event_type, time_range in gpu_time_range.items(): + gpu_type_time[event_type] = sum_ranges(time_range) + + sorted_items = sorted( + cpu_type_time.items(), key=lambda x: x[1], reverse=True) + for event_type, time in sorted_items: + row_values = [ + ' {}'.format(str(event_type).split('.')[1]), format_time( + time, unit=time_unit), format_ratio(float(time) / total_time) + ] + append(row_format.format(*row_values)) + append(header_sep) + headers = ['', 'GPU Time', 'Ratio (%)'] + append(row_format.format(*headers)) + append(header_sep) + for event_type, time in gpu_type_time.items(): + row_values = [ + ' {}'.format(str(event_type).split('.')[1]), format_time( + time, unit=time_unit), format_ratio(float(time) / total_time) + ] + append(row_format.format(*row_values)) + + append(header_sep) + append( + "Note:\nIn this table, We sum up all collected events in terms of event type.\n" + "The time of events collected on host are presented as CPU Time, and as GPU Time if on device.\n" + "ratio = CPU(GPU) Time / Total Time." + "Events with different types may overlap or inclusion, e.g. 
Operator includes OperatorInner, so the sum of ratios is not 100%.\n" + "The time of events in the same type with overlap will not calculate twice, and all time is summed after merged.\n" + "Example:\n" + "Thread 1:\n" + " Operator: |___________| |__________|\n" + "Thread 2:\n" + " Operator: |____________| |___|\n" + "After merged:\n" + " Result: |______________| |__________|\n") + append('-' * line_length) + append('') + append('') + + ###### Print Operator Summary Report ###### + if statistic_data.event_summary.items: + headers = [ + 'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)', + 'GPU Total / Avg / Max / Min / Ratio(%)' + ] + row_format_list = [""] + header_sep_list = [""] + line_length_list = [-SPACING_SIZE] + name_column_width = 50 + add_column(name_column_width) + add_column(6) + add_column(40) + add_column(40) + + row_format = row_format_list[0] + header_sep = header_sep_list[0] + line_length = line_length_list[0] + + # construct table string + append(add_title(line_length, "Operator Summary")) + append('Time unit: {}'.format(time_unit)) + append(header_sep) + append(row_format.format(*headers)) + append(header_sep) + if thread_sep == True: + thread_items = statistic_data.event_summary.thread_items + else: + thread_items = { + 'All threads merged': statistic_data.event_summary.items + } + for thread_id, items in thread_items.items(): + append(add_title(line_length, "Thread: {}".format(thread_id))) + if sorted_by == SortedKeys.CPUTotal: + sorted_items = sorted( + items.items(), key=lambda x: x[1].cpu_time, reverse=True) + elif sorted_by == SortedKeys.CPUAvg: + sorted_items = sorted( + items.items(), + key=lambda x: x[1].avg_cpu_time, + reverse=True) + elif sorted_by == SortedKeys.CPUMax: + sorted_items = sorted( + items.items(), + key=lambda x: x[1].max_cpu_time, + reverse=True) + elif sorted_by == SortedKeys.CPUMin: + sorted_items = sorted( + items.items(), key=lambda x: x[1].min_cpu_time) + elif sorted_by == SortedKeys.GPUTotal: + sorted_items = sorted( + items.items(), key=lambda x: x[1].gpu_time, reverse=True) + elif sorted_by == SortedKeys.GPUAvg: + sorted_items = sorted( + items.items(), + key=lambda x: x[1].avg_gpu_time, + reverse=True) + elif sorted_by == SortedKeys.GPUMax: + sorted_items = sorted( + items.items(), + key=lambda x: x[1].max_gpu_time, + reverse=True) + elif sorted_by == SortedKeys.GPUMin: + sorted_items = sorted( + items.items(), key=lambda x: x[1].min_gpu_time) + + total_cpu_time = 0 + total_gpu_time = 0 + for name, item in sorted_items: + total_cpu_time += item.cpu_time + total_gpu_time += item.gpu_time + for name, item in sorted_items: + row_values = [ + name, item.call, '{} / {} / {} / {} / {}'.format( + format_time( + item.cpu_time, unit=time_unit), + format_time( + item.avg_cpu_time, unit=time_unit), + format_time( + item.max_cpu_time, unit=time_unit), + format_time( + item.min_cpu_time, unit=time_unit), + format_ratio(float(item.cpu_time) / total_cpu_time)), + '{} / {} / {} / {} / {}'.format( + format_time( + item.gpu_time, unit=time_unit), + format_time( + item.avg_gpu_time, unit=time_unit), + format_time( + item.max_gpu_time, unit=time_unit), + format_time( + item.min_gpu_time, unit=time_unit), + format_ratio(float(item.gpu_time) / total_gpu_time)) + ] + append(row_format.format(*row_values)) + if op_detail: + for innerop_name, innerop_node in item.operator_inners.items( + ): + row_values = [ + ' {}'.format(innerop_name), innerop_node.call, + '{} / {} / {} / {} / {}'.format( + format_time( + innerop_node.cpu_time, unit=time_unit), + 
format_time( + innerop_node.avg_cpu_time, unit=time_unit), + format_time( + innerop_node.max_cpu_time, unit=time_unit), + format_time( + innerop_node.min_cpu_time, unit=time_unit), + format_ratio( + float(innerop_node.cpu_time) / + total_cpu_time)), + '{} / {} / {} / {} / {}'.format( + format_time( + innerop_node.gpu_time, unit=time_unit), + format_time( + innerop_node.avg_gpu_time, unit=time_unit), + format_time( + innerop_node.max_gpu_time, unit=time_unit), + format_time( + innerop_node.min_gpu_time, unit=time_unit), + format_ratio( + float(innerop_node.gpu_time) / + total_gpu_time)) + ] + append(row_format.format(*row_values)) + for device_node_name, devicenode in innerop_node.devices.items( + ): + if len(device_node_name) + 4 > name_column_width: + device_node_name = device_node_name[: + name_column_width + - 7] + device_node_name += "..." + row_values = [ + ' {}'.format(device_node_name), + devicenode.call, '- / - / - / - / -', + '{} / {} / {} / {} / {}'.format( + format_time( + devicenode.gpu_time, unit=time_unit), + format_time( + devicenode.avg_gpu_time, + unit=time_unit), + format_time( + devicenode.max_gpu_time, + unit=time_unit), + format_time( + devicenode.min_gpu_time, + unit=time_unit), + format_ratio( + float(devicenode.gpu_time) / + total_gpu_time)) + ] + append(row_format.format(*row_values)) + for device_node_name, device_node in item.devices.items(): + if len(device_node_name) + 2 > name_column_width: + device_node_name = device_node_name[: + name_column_width + - 5] + device_node_name += "..." + row_values = [ + ' {}'.format(device_node_name), devicenode.call, + '- / - / - / - / -', + '{} / {} / {} / {} / {}'.format( + format_time( + devicenode.gpu_time, unit=time_unit), + format_time( + devicenode.avg_gpu_time, unit=time_unit), + format_time( + devicenode.max_gpu_time, unit=time_unit), + format_time( + devicenode.min_gpu_time, unit=time_unit), + format_ratio( + float(devicenode.gpu_time) / + total_gpu_time)) + ] + append(row_format.format(*row_values)) + append(header_sep) + append('') + append('') + return ''.join(result) -- GitLab From 688743bf7ce7846873481dc5fdc2454c6e2de4f6 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Tue, 8 Mar 2022 21:22:17 +0800 Subject: [PATCH 120/261] Rename phi::func::TensorReduceImpl to phi::func::ReduceKernel. 
(#40183) --- .../fluid/operators/reduce_ops/reduce_op.cu.h | 4 +-- paddle/phi/kernels/funcs/matrix_reduce.cu | 9 ++---- paddle/phi/kernels/funcs/reduce_function.h | 12 ++++---- .../gpu/broadcast_tensors_grad_kernel.cu | 5 ++-- paddle/phi/kernels/gpu/compare_kernel.cu | 4 +-- paddle/phi/kernels/gpu/elementwise_grad.h | 29 +++++++------------ paddle/phi/kernels/gpu/reduce.h | 24 +++++---------- ...d_cross_entropy_with_logits_grad_kernel.cu | 17 +++-------- ...igmoid_cross_entropy_with_logits_kernel.cu | 18 +++--------- paddle/phi/kernels/gpu/trace_kernel.cu | 5 ++-- .../kernels/impl/matmul_grad_kernel_impl.h | 5 ++-- 11 files changed, 44 insertions(+), 88 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index eb76eee1048..16061769533 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -36,9 +36,9 @@ void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx, gpuStream_t stream) { y->mutable_data(x.place()); - phi::funcs::TensorReduceImpl( + phi::funcs::ReduceKernel( static_cast(dev_ctx), x, y, transform, - origin_reduce_dims, stream); + origin_reduce_dims); } } // namespace operators diff --git a/paddle/phi/kernels/funcs/matrix_reduce.cu b/paddle/phi/kernels/funcs/matrix_reduce.cu index 5e288c6e9c2..5c3ebd6bb01 100644 --- a/paddle/phi/kernels/funcs/matrix_reduce.cu +++ b/paddle/phi/kernels/funcs/matrix_reduce.cu @@ -45,13 +45,8 @@ class MatrixReduceSumFunctor { out_reduce_dims.push_back(idx); } } - TensorReduceImpl>( - dev_ctx, - in, - out, - kps::IdentityFunctor(), - out_reduce_dims, - dev_ctx.stream()); + ReduceKernel>( + dev_ctx, in, out, kps::IdentityFunctor(), out_reduce_dims); } }; diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h index ce6bb0d559c..5834f091d9a 100644 --- a/paddle/phi/kernels/funcs/reduce_function.h +++ b/paddle/phi/kernels/funcs/reduce_function.h @@ -1087,12 +1087,12 @@ template class ReduceOp, typename TransformOp> -void TensorReduceImpl(const phi::GPUContext& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* y, - const TransformOp& transform, - const std::vector& origin_reduce_dims, - KPStream stream) { +void ReduceKernel(const phi::GPUContext& dev_ctx, + const phi::DenseTensor& x, + phi::DenseTensor* y, + const TransformOp& transform, + const std::vector& origin_reduce_dims) { + auto stream = dev_ctx.stream(); dev_ctx.Alloc(y); auto x_dim = phi::vectorize(x.dims()); diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu index 926dffc7450..d4850b74477 100644 --- a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu @@ -87,13 +87,12 @@ void BroadcastTensorsGradKernel(const Context& ctx, *input_tensor, ctx.GetPlace(), ctx, output_tensor); } else { // reduce_sum implementation on CUDA - funcs::TensorReduceImpl>( + funcs::ReduceKernel>( ctx, *input_tensor, output_tensor, kps::IdentityFunctor(), - reduce_dims_vec, - ctx.stream()); + reduce_dims_vec); } } } diff --git a/paddle/phi/kernels/gpu/compare_kernel.cu b/paddle/phi/kernels/gpu/compare_kernel.cu index 9c02627e546..225164687b7 100644 --- a/paddle/phi/kernels/gpu/compare_kernel.cu +++ b/paddle/phi/kernels/gpu/compare_kernel.cu @@ -80,8 +80,8 @@ inline void CompareAllKernelImpl(const Context& ctx, for (int i = 0; i < reduce_dims.size(); ++i) { reduce_dims[i] = i; } - 
funcs::TensorReduceImpl>( - ctx, tmp, out, kps::IdentityFunctor(), reduce_dims, ctx.stream()); + funcs::ReduceKernel>( + ctx, tmp, out, kps::IdentityFunctor(), reduce_dims); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h index b356f19555f..98df65c92f3 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad.h +++ b/paddle/phi/kernels/gpu/elementwise_grad.h @@ -29,13 +29,8 @@ void ReduceWrapper(const GPUContext &dev_ctx, DenseTensor *dst) { std::vector reduce_dims = funcs::GetReduceDim(dst->dims(), src->dims(), axis); - funcs::TensorReduceImpl>( - dev_ctx, - *src, - dst, - kps::IdentityFunctor(), - reduce_dims, - dev_ctx.stream()); + funcs::ReduceKernel>( + dev_ctx, *src, dst, kps::IdentityFunctor(), reduce_dims); } template @@ -172,9 +167,8 @@ void DefaultElementwiseAddGrad(const GPUContext &ctx, } std::vector reduce_dims = funcs::GetReduceDim(x.dims(), out.dims(), axis); - gpuStream_t stream = ctx.stream(); - funcs::TensorReduceImpl>( - ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); + funcs::ReduceKernel>( + ctx, dout, dx, kps::IdentityFunctor(), reduce_dims); } } // dy @@ -187,9 +181,8 @@ void DefaultElementwiseAddGrad(const GPUContext &ctx, } else { std::vector reduce_dims = funcs::GetReduceDim(y.dims(), out.dims(), axis); - gpuStream_t stream = ctx.stream(); - funcs::TensorReduceImpl>( - ctx, dout, dy, kps::IdentityFunctor(), reduce_dims, stream); + funcs::ReduceKernel>( + ctx, dout, dy, kps::IdentityFunctor(), reduce_dims); } } } @@ -285,9 +278,8 @@ void default_elementwise_sub_grad(const GPUContext &ctx, } std::vector reduce_dims = funcs::GetReduceDim(x.dims(), out.dims(), axis); - gpuStream_t stream = ctx.stream(); - funcs::TensorReduceImpl>( - ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); + funcs::ReduceKernel>( + ctx, dout, dx, kps::IdentityFunctor(), reduce_dims); } } // dy @@ -306,9 +298,8 @@ void default_elementwise_sub_grad(const GPUContext &ctx, } else { std::vector reduce_dims = funcs::GetReduceDim(y.dims(), out.dims(), axis); - gpuStream_t stream = ctx.stream(); - funcs::TensorReduceImpl>( - ctx, dout, dy, kps::InverseFunctor(), reduce_dims, stream); + funcs::ReduceKernel>( + ctx, dout, dy, kps::InverseFunctor(), reduce_dims); } } } diff --git a/paddle/phi/kernels/gpu/reduce.h b/paddle/phi/kernels/gpu/reduce.h index 0319de7558e..da5315f3447 100644 --- a/paddle/phi/kernels/gpu/reduce.h +++ b/paddle/phi/kernels/gpu/reduce.h @@ -39,8 +39,6 @@ void Reduce(const KPDevice& dev_ctx, reduce_num *= (x.dims())[i]; } - KPStream stream = dev_ctx.stream(); - if (out_dtype != phi::DataType::UNDEFINED && out_dtype != x.dtype()) { auto tmp_tensor = phi::Cast(dev_ctx, x, out_dtype); PD_VISIT_BOOL_AND_FLOATING_AND_COMPLEX_AND_3_TYPES( @@ -48,29 +46,23 @@ void Reduce(const KPDevice& dev_ctx, phi::DataType::INT64, phi::DataType::FLOAT16, out_dtype, - "TensorReduceImpl", + "ReduceKernel", ([&] { using MPType = typename kps::details::MPTypeTrait::Type; - phi::funcs::TensorReduceImpl>( + phi::funcs::ReduceKernel>( dev_ctx, tmp_tensor, out, TransformOp(reduce_num), - reduce_dims, - stream); + reduce_dims); })); } else { using MPType = typename kps::details::MPTypeTrait::Type; - phi::funcs::TensorReduceImpl>( - dev_ctx, - x, - out, - TransformOp(reduce_num), - reduce_dims, - stream); + phi::funcs::ReduceKernel>( + dev_ctx, x, out, TransformOp(reduce_num), reduce_dims); } } } // namespace phi diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu 
b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu index 598b0138fb3..6fc65006ae2 100644 --- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu @@ -69,17 +69,12 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context &dev_ctx, dev_ctx.template Alloc(counts_tensor); counts_tensor->Resize(in_grad->dims()); - int limit = in_grad->numel(); - int blocks = NumBlocks(limit); - int threads = kNumCUDAThreads; std::vector ins = {&x, &label, &out_grad}; std::vector outs = {in_grad, counts_tensor}; auto functor = SigmoidBwdFunctor(ignore_index); - constexpr int Size = 2; - phi::funcs::ElementwiseKernel( + phi::funcs::ElementwiseKernel( dev_ctx, ins, &outs, functor); if (normalize) { - T *counts = dev_ctx.template Alloc(counts_tensor); DenseTensor *norm_tensor = new DenseTensor(); norm_tensor->Resize({sizeof(T)}); dev_ctx.template Alloc(norm_tensor); @@ -89,13 +84,8 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context &dev_ctx, reduce_dim.push_back(i); } - funcs::TensorReduceImpl>( - dev_ctx, - *counts_tensor, - norm_tensor, - NonzeroFunctor(), - reduce_dim, - dev_ctx.stream()); + funcs::ReduceKernel>( + dev_ctx, *counts_tensor, norm_tensor, NonzeroFunctor(), reduce_dim); T *norm = dev_ctx.template Alloc(norm_tensor); auto norm_cpu_mem = paddle::memory::Alloc(phi::CPUPlace(), sizeof(T)); T *norm_cpu_ptr = reinterpret_cast(norm_cpu_mem->ptr()); @@ -114,6 +104,7 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context &dev_ctx, phi::funcs::ElementwiseKernel(dev_ctx, div_ins, &div_outs, div_functor); delete norm_tensor; } + delete counts_tensor; } } // namespace phi diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu index 13d63f8d97e..4b6e5628c72 100644 --- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu +++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu @@ -69,17 +69,12 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context &dev_ctx, dev_ctx.template Alloc(counts_tensor); counts_tensor->Resize(out->dims()); - int limit = out->numel(); - int blocks = NumBlocks(limit); - int threads = kNumCUDAThreads; std::vector ins = {&x, &label}; std::vector outs = {out, counts_tensor}; auto functor = SigmoidFwdFunctor(ignore_index); - constexpr int Size = 2; - phi::funcs::ElementwiseKernel( + phi::funcs::ElementwiseKernel( dev_ctx, ins, &outs, functor); if (normalize) { - T *counts = dev_ctx.template Alloc(counts_tensor); DenseTensor *norm_tensor = new DenseTensor(); norm_tensor->Resize({sizeof(T)}); dev_ctx.template Alloc(norm_tensor); @@ -89,13 +84,8 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context &dev_ctx, reduce_dim.push_back(i); } - funcs::TensorReduceImpl>( - dev_ctx, - *counts_tensor, - norm_tensor, - NonzeroFunctor(), - reduce_dim, - dev_ctx.stream()); + funcs::ReduceKernel>( + dev_ctx, *counts_tensor, norm_tensor, NonzeroFunctor(), reduce_dim); T *norm = dev_ctx.template Alloc(norm_tensor); auto norm_cpu_mem = paddle::memory::Alloc(phi::CPUPlace(), sizeof(T)); T *norm_cpu_ptr = reinterpret_cast(norm_cpu_mem->ptr()); @@ -114,8 +104,8 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context &dev_ctx, phi::funcs::ElementwiseKernel(dev_ctx, div_ins, &div_outs, div_functor); delete norm_tensor; - delete counts_tensor; } + delete counts_tensor; } } // namespace phi diff --git a/paddle/phi/kernels/gpu/trace_kernel.cu 
b/paddle/phi/kernels/gpu/trace_kernel.cu index 4266f0174ff..4a749c5b334 100644 --- a/paddle/phi/kernels/gpu/trace_kernel.cu +++ b/paddle/phi/kernels/gpu/trace_kernel.cu @@ -31,11 +31,10 @@ void TraceKernel(const Context& ctx, T* out_data = ctx.template Alloc(out); auto diag = funcs::Diagonal(ctx, &x, offset, axis1, axis2); if (diag.numel() > 0) { - auto stream = ctx.stream(); std::vector reduce_dims; reduce_dims.push_back(out->dims().size()); - funcs::TensorReduceImpl>( - ctx, diag, out, kps::IdentityFunctor(), reduce_dims, stream); + funcs::ReduceKernel>( + ctx, diag, out, kps::IdentityFunctor(), reduce_dims); } else { phi::funcs::SetConstant functor; functor(ctx, out, static_cast(0)); diff --git a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h index d06bdc55030..495b93f2a4e 100644 --- a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h @@ -59,9 +59,8 @@ struct ReduceSumForMatmulGrad { const DenseTensor& input, DenseTensor* output, const std::vector& reduce_dims) { - auto stream = dev_ctx.stream(); - funcs::TensorReduceImpl>( - dev_ctx, input, output, kps::IdentityFunctor(), reduce_dims, stream); + funcs::ReduceKernel>( + dev_ctx, input, output, kps::IdentityFunctor(), reduce_dims); } }; #endif -- GitLab From e548f65f96697830035a28f9070b40829408ccdb Mon Sep 17 00:00:00 2001 From: Roc <30228238+sljlp@users.noreply.github.com> Date: Tue, 8 Mar 2022 22:26:02 +0800 Subject: [PATCH 121/261] support ema optimizer in sharding optimizers (#39860) --- .../paddle/distributed/fleet/meta_optimizers/sharding/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py index d04a3a53db3..b42f21989ab 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py @@ -901,9 +901,10 @@ def save_persistables(exe, dirname, main_program, filename=None): def is_opt_vars(var): # NOTE(JZ-LIANG): The checks should be updated when add new compatible optimizer # now only Momentum and adam are compatible with sharding + # support EMA optimizer checks = [ "_moment1_0", "_moment2_0", "_beta1_pow_acc_0", "_beta2_pow_acc_0", - "_velocity_0" + "_velocity_0", "_ema_0" ] for check in checks: if var.name.endswith(check) and var.persistable: -- GitLab From fcae3430808576c6a143562410f2527cc793bc70 Mon Sep 17 00:00:00 2001 From: Yang <3349368+m3ngyang@users.noreply.github.com> Date: Wed, 9 Mar 2022 10:10:55 +0800 Subject: [PATCH 122/261] fix take_along_axis cuda op register bug (#40270) * fix take_along_axis cuda op register bug * add comma after float Co-authored-by: Chen Weihang --- paddle/phi/kernels/gpu/take_along_axis_kernel.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu index 63113e3e672..9665a917d9d 100644 --- a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu +++ b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu @@ -53,6 +53,7 @@ PD_REGISTER_KERNEL(take_along_axis, GPU, ALL_LAYOUT, phi::TakeAlongAxisKernel, + float, double, int64_t, int, -- GitLab From fb4215b2d1765e305f687d2d1ca5f19c90f7eeb1 Mon Sep 17 00:00:00 2001 From: JingZhuangzhuang <75348594+JZZ-NOTE@users.noreply.github.com> Date: Wed, 9 Mar 2022 10:21:50 +0800 Subject: [PATCH 123/261] fix batch_norm op kernel 
(#40171) --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 6ad12245d2a..49b550f51e6 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -460,10 +460,14 @@ void BatchNormKernel(const Context &ctx, void *reserve_space_ptr = nullptr; void *workspace_ptr = nullptr; DenseTensor workspace_tensor; + DenseTensor reserve_space_tensor; // Create reserve space and workspace for batch norm. // Create tensor for each batchnorm op, it will be used in the // backward. Thus this tensor shouldn't be temp. // auto *reserve_space = ctx.Output("ReserveSpace"); + if (reserve_space == nullptr) { + reserve_space = &reserve_space_tensor; + } PADDLE_ENFORCE_NOT_NULL( reserve_space, phi::errors::NotFound( -- GitLab From 8031a4dc8b05dcfee95af2ca613fc736fc7f9830 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Wed, 9 Mar 2022 10:27:30 +0800 Subject: [PATCH 124/261] [Phi] move Reduce max kernel into phi (#40225) * add reduce_max kernel * add reduce max kernel * update reduce max Argumentmapping * remove reduce_max kernel * remove reduce_max kernel * add reduce max infermeta * rename reduce infermeta --- .../operators/reduce_ops/reduce_max_op.cc | 31 ++++++++---- .../operators/reduce_ops/reduce_max_op.cu | 23 --------- .../operators/reduce_ops/reduce_mean_op.cc | 2 +- .../operators/reduce_ops/reduce_sum_op.cc | 2 +- paddle/phi/core/compat/op_utils.h | 1 + paddle/phi/infermeta/unary.cc | 50 ++++++++++++------- paddle/phi/infermeta/unary.h | 22 ++++---- paddle/phi/kernels/cpu/reduce_max_kernel.cc | 39 +++++++++++++++ paddle/phi/kernels/funcs/reduce_functor.h | 8 +++ paddle/phi/kernels/gpu/reduce_max_kernel.cu | 37 ++++++++++++++ paddle/phi/kernels/math_kernel.h | 2 +- paddle/phi/kernels/reduce_max_kernel.cc | 39 +++++++++++++++ paddle/phi/kernels/reduce_max_kernel.h | 38 ++++++++++++++ paddle/phi/ops/compat/reduce_sig.cc | 24 ++++++++- python/paddle/utils/code_gen/api.yaml | 2 +- 15 files changed, 252 insertions(+), 68 deletions(-) delete mode 100644 paddle/fluid/operators/reduce_ops/reduce_max_op.cu create mode 100644 paddle/phi/kernels/cpu/reduce_max_kernel.cc create mode 100644 paddle/phi/kernels/gpu/reduce_max_kernel.cu create mode 100644 paddle/phi/kernels/reduce_max_kernel.cc create mode 100644 paddle/phi/kernels/reduce_max_kernel.h diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc index cb438b4a805..41df8e4a15f 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc @@ -14,15 +14,28 @@ #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" -REGISTER_REDUCE_OP(reduce_max); -REGISTER_OP_CPU_KERNEL( - reduce_max, ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +namespace ops = paddle::operators; + +class ReduceMaxOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_max"; } + virtual std::string GetOpType() const { return "Reduce reduce_max"; } +}; + +DECLARE_INFER_SHAPE_FUNCTOR(reduce_max, ReduceMaxInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); + +REGISTER_OPERATOR( + reduce_max, ops::ReduceOp, ReduceMaxOpMaker, + 
paddle::framework::DefaultGradOpMaker, + paddle::framework::DefaultGradOpMaker, + ReduceMaxInferShapeFunctor); +REGISTER_OPERATOR(reduce_max_grad, ops::ReduceGradOp) + REGISTER_OP_CPU_KERNEL( reduce_max_grad, ops::ReduceGradKernel, diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu deleted file mode 100644 index 8194805ddc3..00000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" - -// reduce_max -REGISTER_OP_CUDA_KERNEL( - reduce_max, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index 894106883cb..4a183309138 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -97,7 +97,7 @@ class __reduce_meanMaker__ : public ops::ReduceOpMaker { }; DECLARE_INFER_SHAPE_FUNCTOR(reduce_mean, ReduceMeanInferShapeFunctor, - PD_INFER_META(phi::MeanRawInferMeta)); + PD_INFER_META(phi::ReduceInferMetaBase)); REGISTER_OPERATOR(reduce_mean, ops::ReduceOp, __reduce_meanMaker__, ops::ReduceMeanOpGradMaker, diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index 6559ed479c8..6441d53239e 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -103,7 +103,7 @@ class ReduceSumOpMaker : public ops::ReduceOpMaker { }; DECLARE_INFER_SHAPE_FUNCTOR(reduce_sum, ReduceSumInferShapeFunctor, - PD_INFER_META(phi::ReduceInferMetaBase)); + PD_INFER_META(phi::SumRawInferMeta)); REGISTER_OPERATOR(reduce_sum, ops::ReduceOp, ReduceSumOpMaker, ops::ReduceSumVarTypeInference, diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index 9947e00ecb5..1ab718c0794 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -47,6 +47,7 @@ const std::unordered_set deprecated_op_names({"diag", "matmul_grad", "matmul_grad_grad", "mean", + "max", "reshape", "reshape_grad", "expand", diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 17edc846187..32744659163 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -406,7 +406,7 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x, ReshapeInferMeta(x, shape, out, config); } -/* Why not use ReduceInferMetaBase directly? +/* Why not use SumRawInferMeta directly? 
Because we need make InferMetaFunction's args follow the design of api.yaml */ void SumInferMeta(const MetaTensor& x, @@ -415,15 +415,13 @@ void SumInferMeta(const MetaTensor& x, bool keep_dim, MetaTensor* out) { bool reduce_all = false; - ReduceInferMetaBase(x, axis, keep_dim, reduce_all, dtype, out); + SumRawInferMeta(x, axis, keep_dim, reduce_all, dtype, out); } -void ReduceInferMetaBase(const MetaTensor& x, - const std::vector& axis, - bool keep_dim, - bool reduce_all, - DataType dtype, - MetaTensor* out) { +DDim ReduceInferDim(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + bool reduce_all) { auto x_rank = x.dims().size(); std::vector formated_axis = axis; @@ -486,6 +484,17 @@ void ReduceInferMetaBase(const MetaTensor& x, } DDim out_dim = phi::make_ddim(out_dim_vector); + return out_dim; +} + +void SumRawInferMeta(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + bool reduce_all, + DataType dtype, + MetaTensor* out) { + DDim out_dim = ReduceInferDim(x, axis, keep_dim, reduce_all); + DataType out_dtype; if (dtype != DataType::UNDEFINED) { out_dtype = dtype; @@ -503,20 +512,23 @@ void ReduceInferMetaBase(const MetaTensor& x, out->set_layout(x.layout()); } -void MeanRawInferMeta(const MetaTensor& x, - const std::vector& axis, - bool keep_dim, - bool reduce_all, - MetaTensor* out) { - ReduceInferMetaBase(x, axis, keep_dim, reduce_all, DataType::UNDEFINED, out); +void ReduceInferMetaBase(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + bool reduce_all, + MetaTensor* out) { + DDim out_dim = ReduceInferDim(x, axis, keep_dim, reduce_all); + out->set_dims(out_dim); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); } -void MeanInferMeta(const MetaTensor& x, - const std::vector& axis, - bool keep_dim, - MetaTensor* out) { +void ReduceInferMeta(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + MetaTensor* out) { bool reduce_all = false; - ReduceInferMetaBase(x, axis, keep_dim, reduce_all, DataType::UNDEFINED, out); + ReduceInferMetaBase(x, axis, keep_dim, reduce_all, out); } void TransferLayoutInferMeta(const MetaTensor& x, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index dac7c19cf9b..735a77faefe 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -94,23 +94,23 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void SumRawInferMeta(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + bool reduce_all, + DataType dtype, + MetaTensor* out); + void ReduceInferMetaBase(const MetaTensor& x, const std::vector& axis, bool keep_dim, bool reduce_all, - DataType dtype, MetaTensor* out); -void MeanRawInferMeta(const MetaTensor& x, - const std::vector& axis, - bool keep_dim, - bool reduce_all, - MetaTensor* out); - -void MeanInferMeta(const MetaTensor& x, - const std::vector& axis, - bool keep_dim, - MetaTensor* out); +void ReduceInferMeta(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + MetaTensor* out); void SumInferMeta(const MetaTensor& x, const std::vector& axis, diff --git a/paddle/phi/kernels/cpu/reduce_max_kernel.cc b/paddle/phi/kernels/cpu/reduce_max_kernel.cc new file mode 100644 index 00000000000..f9ea0aa0faf --- /dev/null +++ b/paddle/phi/kernels/cpu/reduce_max_kernel.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_max_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" + +namespace phi { + +template +void MaxRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + max_raw, CPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/funcs/reduce_functor.h b/paddle/phi/kernels/funcs/reduce_functor.h index aebd155ac59..4e83d0fa371 100644 --- a/paddle/phi/kernels/funcs/reduce_functor.h +++ b/paddle/phi/kernels/funcs/reduce_functor.h @@ -41,5 +41,13 @@ struct ProdFunctor { } }; +//////// Max Functor /////// +struct MaxFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->maximum(dim); + } +}; + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/gpu/reduce_max_kernel.cu b/paddle/phi/kernels/gpu/reduce_max_kernel.cu new file mode 100644 index 00000000000..98c3986c51d --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_max_kernel.cu @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/reduce_max_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/reduce.h" + +namespace phi { + +template +void MaxRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + max_raw, GPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/math_kernel.h b/paddle/phi/kernels/math_kernel.h index fe8f3b749cd..7569cbcff08 100644 --- a/paddle/phi/kernels/math_kernel.h +++ b/paddle/phi/kernels/math_kernel.h @@ -156,7 +156,7 @@ DenseTensor Mean(const Context& dev_ctx, bool keep_dim) { DenseTensor dense_out; MetaTensor meta_out(&dense_out); - ReduceInferMetaBase(x, axis, keep_dim, false, x.dtype(), &meta_out); + SumRawInferMeta(x, axis, keep_dim, false, x.dtype(), &meta_out); MeanKernel(dev_ctx, x, axis, keep_dim, &dense_out); return dense_out; } diff --git a/paddle/phi/kernels/reduce_max_kernel.cc b/paddle/phi/kernels/reduce_max_kernel.cc new file mode 100644 index 00000000000..de172a12d72 --- /dev/null +++ b/paddle/phi/kernels/reduce_max_kernel.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_max_kernel.h" + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void MaxKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + MaxRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + max, CPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL( + max, GPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} +#endif diff --git a/paddle/phi/kernels/reduce_max_kernel.h b/paddle/phi/kernels/reduce_max_kernel.h new file mode 100644 index 00000000000..7560473d43c --- /dev/null +++ b/paddle/phi/kernels/reduce_max_kernel.h @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/binary.h" +#include "paddle/phi/infermeta/unary.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { + +template +void MaxRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void MaxKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/ops/compat/reduce_sig.cc b/paddle/phi/ops/compat/reduce_sig.cc index 92839fb3030..36798abe4c1 100644 --- a/paddle/phi/ops/compat/reduce_sig.cc +++ b/paddle/phi/ops/compat/reduce_sig.cc @@ -21,7 +21,7 @@ KernelSignature ReduceSumOpArgumentMapping(const ArgumentMappingContext& ctx) { bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in // InferShape, so we must return the "sum_raw" KernelSignature. - // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with + // And the InferMeta function(i.e. SumRawInferMeta) is accordance with // the "sum_raw" KernelSignature if (ctx.IsForInferShape() || reduce_all) { return KernelSignature("sum_raw", @@ -40,7 +40,8 @@ KernelSignature ReduceMeanOpArgumentMapping(const ArgumentMappingContext& ctx) { bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in // InferShape, so we must return the "mean_raw" KernelSignature. - // And the InferMeta function(i.e. MeanRawInferMeta) is accordance with the + // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with + // the // "mean_raw" KernelSignature if (ctx.IsForInferShape() || reduce_all) { return KernelSignature( @@ -56,11 +57,30 @@ KernelSignature ReduceProdOpArgumentMapping(const ArgumentMappingContext& ctx) { "reduce_prod", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); } +KernelSignature ReduceMaxOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorInput("X")) { + bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); + // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in + // InferShape, so we must return the "max_raw" KernelSignature. + // And the InferMeta function(i.e. 
ReduceInferMetaBase) is accordance with + // the + // "max_raw" KernelSignature + if (ctx.IsForInferShape() || reduce_all) { + return KernelSignature( + "max_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); + } + return KernelSignature("max", {"X"}, {"dim", "keep_dim"}, {"Out"}); + } + return KernelSignature("unregistered", {}, {}, {}); +} + } // namespace phi PD_REGISTER_BASE_KERNEL_NAME(reduce_sum, sum); PD_REGISTER_BASE_KERNEL_NAME(reduce_mean, mean); +PD_REGISTER_BASE_KERNEL_NAME(reduce_max, max); PD_REGISTER_ARG_MAPPING_FN(reduce_sum, phi::ReduceSumOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_mean, phi::ReduceMeanOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_prod, phi::ReduceProdOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_max, phi::ReduceMaxOpArgumentMapping); diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 8c68ca4d7e0..6c27d465cb1 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -124,7 +124,7 @@ args : (Tensor x, int64[] axis={}, bool keep_dim=false) output : Tensor infer_meta : - func : MeanInferMeta + func : ReduceInferMeta kernel : func : mean -- GitLab From 041c4bca832ef342679b17783c67f5d7294b1f6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=20Wei=20=28=E4=BB=BB=E5=8D=AB=29?= Date: Wed, 9 Mar 2022 10:29:37 +0800 Subject: [PATCH 125/261] build documents if public apis modified, meanwhile their samplecodes should be tested (#39728) * run document_preview when samplecodes be tested * run document_preview when samplecodes be tested * sphinx-build symbol link; and build-doc default * FLUIDDOCDIR typo * download the required configirations and some other scripts * install required python packages. * clone specified branch of docs repo, and if failed, clone the default branch * clean workspace for docs repo * use the conf.py imported by https://github.com/PaddlePaddle/docs/pull/4222/ * download and install the boscmd * Optimaze the code comments. * specify the pypi index server * only do doc-build when running in cpu mode * pull docs pr git log paddle_pr_info * install jq * force using sphinx-build under py3.7 * using our new domain name for preview * install python package error * don't build doc default --- tools/document_preview.sh | 170 ++++++++++++++++++++++++++++++++++---- tools/sampcd_processor.py | 45 ++++++++++ 2 files changed, 198 insertions(+), 17 deletions(-) diff --git a/tools/document_preview.sh b/tools/document_preview.sh index 83c758d0aa8..424169bbc51 100755 --- a/tools/document_preview.sh +++ b/tools/document_preview.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,19 +14,155 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-PADDLE_ROOT=/home -mkdir ${PADDLE_ROOT} -cd ${PADDLE_ROOT} -pip install /paddle/build/opt/paddle/share/wheels/*.whl -git clone https://github.com/PaddlePaddle/FluidDoc -git clone https://github.com/tianshuo78520a/PaddlePaddle.org.git -cd ${PADDLE_ROOT}/PaddlePaddle.org -git reset 3feaa68376d8423e41d076814e901e6bf108c705 -cd ${PADDLE_ROOT}/FluidDoc/doc/fluid/api -sh gen_doc.sh -apt-get update && apt-get install -y python-dev build-essential -cd ${PADDLE_ROOT}/PaddlePaddle.org/portal -pip install -r requirements.txt -#If the default port is not occupied, you can use port 8000, you need to replace it with a random port on the CI. -sed -i "s#8000#$1#g" runserver -nohup ./runserver --paddle ${PADDLE_ROOT}/FluidDoc & +is_shell_attribute_set() { # attribute, like "x" + case "$-" in + *"$1"*) return 0 ;; + *) return 1 ;; + esac +} +function get_docs_pr_num_from_paddle_pr_info(){ + # get_repo_pr_info's output + pr_info_file=$1 + if [ ! -r ${pr_info_file} ] ; then + return 1 + fi + + declare -A arr_kv + while read line + do + echo "$line" | grep '^\w\+\s*=\s*.*' > /dev/null + if [ $? = 0 ] ; then + kv=($(echo $line | sed 's/=/\n/g')) + k=($(echo "${kv[0]}" | sed 's/\s//g')) + v=($(echo "${kv[1]}" | sed 's/^\s*//g' | sed 's/\s*$//g')) + # arr_kv[${kv[1]}]=${kv[2]} + arr_kv[${k}]=${v} + fi + done < <(jq -r '.body' ${pr_info_file}) + + echo ${arr_kv[PADDLEDOCS_PR]} + return 0 +} + +# Attention: +# 1. /FluidDoc will be used as the workspace of PaddlePaddle/docs. +# 2. And /docs is used as the output of doc-build process. +# 3. If conflicted with yours, please modify the defination of FLUIDDOCDIR and +# OUTPUTDIR in the subsequent codes. +# 4. The doc-build process is controlled under EnvVar BUILD_DOC and UPLOAD_DOC. +# All the Chinese and English docs will be generated, and then uploaded. + +PREVIEW_URL_PROMPT="ipipe_log_param_preview_url: None" +BUILD_DOC=${BUILD_DOC:=false} +UPLOAD_DOC=${UPLOAD_DOC:=false} + +CURPWD=${PWD} + +if [ -f /usr/local/python3.7.0/bin/sphinx-build ] ; then + if [ -f /usr/local/bin/sphinx-build ] ; then + rm /usr/local/bin/sphinx-build + fi + ln -s /usr/local/python3.7.0/bin/sphinx-build /usr/local/bin/sphinx-build +fi + +if [ "${BUILD_DOC}" = "true" ] && [ -x /usr/local/bin/sphinx-build ] ; then + export FLUIDDOCDIR=${FLUIDDOCDIR:=/FluidDoc} + export OUTPUTDIR=${OUTPUTDIR:=/docs} + export VERSIONSTR=$(echo ${BRANCH} | sed 's@release/@@g') + + if [ -d ${FLUIDDOCDIR} ] ; then + echo "${FLUIDDOCDIR} exists, git clone will be skipped, but git clean will be done." + cd ${FLUIDDOCDIR} + git reset --hard + git clean -dfx + cd ${CURPWD} + else + git clone -b ${BRANCH} --depth=1 https://github.com/PaddlePaddle/docs.git ${FLUIDDOCDIR} + if [ ! "$?" 
= "0" ] ; then + git clone --depth=1 https://github.com/PaddlePaddle/docs.git ${FLUIDDOCDIR} + fi + fi + if [ -d ${OUTPUTDIR} ] ; then + echo "$0: rm -rf ${OUTPUTDIR}" + rm -rf ${OUTPUTDIR} + mkdir -p ${OUTPUTDIR} + fi + + # install requirements + export no_proxy=mirror.baidu.com,${no_proxy} + apt-get install -y --no-install-recommends doxygen jq + echo 'beautifulsoup4 +Markdown +sphinx-sitemap +sphinx-markdown-tables +breathe +exhale +sphinx_design +nbsphinx +' >/tmp/doc-build.requirements && \ + pip install --no-cache-dir -i https://mirror.baidu.com/pypi/simple -r /tmp/doc-build.requirements && \ + rm /tmp/doc-build.requirements + + + source ${FLUIDDOCDIR}/ci_scripts/utils.sh + paddle_pr_info=$(get_repo_pr_info "PaddlePaddle/Paddle" ${GIT_PR_ID}) + docs_pr_id=$(get_docs_pr_num_from_paddle_pr_info ${paddle_pr_info}) + if [ -n "${docs_pr_id}" ] ; then + cd ${FLUIDDOCDIR} + git fetch --depth=1 origin pull/${docs_pr_id}/head + git checkout -b "pr${docs_pr_id}" FETCH_HEAD + git log --pretty=oneline -10 + fi + echo "docs_pr_id=${docs_pr_id}" + + + # build doc + /bin/bash -x ${FLUIDDOCDIR}/ci_scripts/gendoc.sh + if [ $? -ne 0 ];then + echo 'gendoc error' + exit 1 + fi + + if [ "${UPLOAD_DOC}" = "true" ] ; then + curl -o /tmp/linux-bcecmd-0.3.0.zip https://sdk.bce.baidu.com/console-sdk/linux-bcecmd-0.3.0.zip && \ + python -m zipfile -e /tmp/linux-bcecmd-0.3.0.zip /opt && \ + chmod +x /opt/linux-bcecmd-0.3.0/bcecmd && \ + rm /tmp/linux-bcecmd-0.3.0.zip && \ + curl -o /tmp/boscmdconfig.tgz https://paddle-dev-tools-open.bj.bcebos.com/fluiddoc-preview/boscmdconfig.tgz && \ + tar xzf /tmp/boscmdconfig.tgz -C /opt/linux-bcecmd-0.3.0/ && \ + rm /tmp/boscmdconfig.tgz + + # credentials file is empty, please build it if need. + BCECMD=/opt/linux-bcecmd-0.3.0/bcecmd + BCECMD_CONFIG=/opt/linux-bcecmd-0.3.0/boscmdconfig + + is_shell_attribute_set x + xdebug_setted=$? 
+ if [ $xdebug_setted ] ; then + set +x + fi + if [ -n "${BOS_CREDENTIAL_AK}" ] && [ -n "${BOS_CREDENTIAL_SK}" ] ; then + echo "Ak = ${BOS_CREDENTIAL_AK}" >> ${BCECMD_CONFIG}/credentials + echo "Sk = ${BOS_CREDENTIAL_SK}" >> ${BCECMD_CONFIG}/credentials + fi + if [ $xdebug_setted ] ; then + set -x + fi + + PREVIEW_JOB_NAME="preview-paddle-pr-${GIT_PR_ID}" + BOSBUCKET=${BOSBUCKET:=paddle-site-web-dev} + ${BCECMD} --conf-path ${BCECMD_CONFIG} bos sync "${OUTPUTDIR}/en/${VERSIONSTR}" "bos:/${BOSBUCKET}/documentation/en/${PREVIEW_JOB_NAME}" \ + --delete --yes --exclude "${OUTPUTDIR}/en/${VERSIONSTR}/_sources/" + ${BCECMD} --conf-path ${BCECMD_CONFIG} bos sync "${OUTPUTDIR}/en/${VERSIONSTR}" "bos:/${BOSBUCKET}/documentation/en/${PREVIEW_JOB_NAME}" \ + --delete --yes --exclude "${OUTPUTDIR}/en/${VERSIONSTR}/_sources/" + ${BCECMD} --conf-path ${BCECMD_CONFIG} bos sync "${OUTPUTDIR}/zh/${VERSIONSTR}" "bos:/${BOSBUCKET}/documentation/zh/${PREVIEW_JOB_NAME}" \ + --delete --yes --exclude "${OUTPUTDIR}/zh/${VERSIONSTR}/_sources/" + ${BCECMD} --conf-path ${BCECMD_CONFIG} bos sync "${OUTPUTDIR}/zh/${VERSIONSTR}" "bos:/${BOSBUCKET}/documentation/zh/${PREVIEW_JOB_NAME}" \ + --delete --yes --exclude "${OUTPUTDIR}/zh/${VERSIONSTR}/_sources/" + PREVIEW_URL_PROMPT="ipipe_log_param_preview_url: http://${PREVIEW_JOB_NAME}.${PREVIEW_SITE:-paddle.run}/documentation/docs/zh/api/index_cn.html" + fi +fi + +cd ${CURPWD} +# print the preview url +echo "${PREVIEW_URL_PROMPT}" diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py index d8cb70c9dd1..2d8692c5bc7 100644 --- a/tools/sampcd_processor.py +++ b/tools/sampcd_processor.py @@ -550,6 +550,42 @@ def get_incrementapi(): f.write('\n') +def exec_gen_doc(): + result = True + cmd = ["bash", "document_preview.sh"] + logger.info("----exec gen_doc----") + start_time = time.time() + subprc = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + output, error = subprc.communicate() + msg = "".join(output.decode(encoding='utf-8')) + err = "".join(error.decode(encoding='utf-8')) + end_time = time.time() + + if subprc.returncode != 0: + logger.info("----gen_doc msg----") + logger.info(msg) + logger.error("----gen_doc error msg----") + logger.error(err) + logger.error("----exec gen_doc failed----") + result = False + else: + logger.info("----gen_doc msg----") + logger.info(msg) + logger.info("----exec gen_doc success----") + + for fn in [ + '/docs/en/develop/index_en.html', '/docs/zh/develop/index_cn.html' + ]: + if os.path.exists(fn): + logger.info('%s exists.', fn) + else: + logger.error('%s not exists.', fn) + + # msg is the returned code execution report + return result, msg, end_time - start_time + + arguments = [ # flags, dest, type, default, help ['--gpu_id', 'gpu_id', int, 0, 'GPU device id to use [0]'], @@ -570,6 +606,11 @@ def parse_args(): parser.add_argument('--debug', dest='debug', action="store_true") parser.add_argument('--full-test', dest='full_test', action="store_true") parser.add_argument('mode', type=str, help='run on device', default='cpu') + parser.add_argument( + '--build-doc', + dest='build_doc', + action='store_true', + help='build doc if need.') for item in arguments: parser.add_argument( item[0], dest=item[1], help=item[4], type=item[2], default=item[3]) @@ -702,3 +743,7 @@ if __name__ == '__main__': exit(1) logger.info("Sample code check is successful!") + + if args.mode == "cpu": + # As cpu mode is also run with the GPU whl, so skip it in gpu mode. 
+ exec_gen_doc() -- GitLab From b5a8a0d96b594ae16ae95b645aa38e3bbc78ec76 Mon Sep 17 00:00:00 2001 From: fwenguang <95677191+fwenguang@users.noreply.github.com> Date: Wed, 9 Mar 2022 11:22:21 +0800 Subject: [PATCH 126/261] [MLU] add mlu buffer reader (#40131) --- .../fluid/operators/reader/buffered_reader.cc | 68 +++++++++++++++++++ .../fluid/operators/reader/buffered_reader.h | 12 ++++ .../fluid/platform/stream_callback_manager.cc | 8 +-- 3 files changed, 84 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 21c23a7f602..4b6759ea165 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -70,9 +70,25 @@ BufferedReader::BufferedReader( stream_ = platform::NpuStreamResourcePool::Instance().New(dev_idx); } #endif + +#ifdef PADDLE_WITH_MLU + if (platform::is_mlu_place(place_)) { + int dev_idx = place_.device; + compute_stream_ = + ((platform::MLUDeviceContext *)(platform::DeviceContextPool::Instance() + .Get(place_))) + ->stream(); + events_.resize(buffer_size); + for (auto &event : events_) { + event = platform::MluEventResourcePool::Instance().New(dev_idx); + } + stream_ = platform::MluStreamResourcePool::Instance().New(dev_idx); + } +#endif cpu_buffer_.resize(buffer_size); cuda_buffer_.resize(buffer_size); npu_buffer_.resize(buffer_size); + mlu_buffer_.resize(buffer_size); ReadTillBufferFullAsync(); } @@ -256,6 +272,56 @@ void BufferedReader::ReadAsync(size_t i) { platform::NPUStreamSync(stream_.get()); } #endif + +#ifdef PADDLE_WITH_MLU + if (platform::is_mlu_place(place_)) { + TensorVec &mlu = mlu_buffer_[i]; + if (mlu.empty()) { + mlu.resize(cpu.size()); + } else { + PADDLE_ENFORCE_EQ( + mlu.size(), cpu.size(), + platform::errors::InvalidArgument( + "Input tensor number on MLU and CPU devices are not matched. 
" + "The number on MLU is %d, on CPU is %d", + mlu.size(), cpu.size())); + } + + std::vector mlu_ptrs; + mlu_ptrs.reserve(cpu.size()); + for (size_t i = 0; i < cpu.size(); ++i) { + mlu[i].Resize(cpu[i].dims()); + mlu[i].set_layout(cpu[i].layout()); + mlu_ptrs.emplace_back(mlu[i].mutable_data(place_, cpu[i].type())); + } + + platform::SetMLUDeviceId(place_.device); + PADDLE_ENFORCE_MLU_SUCCESS( + cnPlaceNotifier(events_[i].get(), compute_stream_)); + PADDLE_ENFORCE_MLU_SUCCESS(cnWaitNotifier(events_[i].get())); + + platform::RecordEvent record_event("BufferedReader:MemoryCopy", + platform::TracerEventType::UserDefined, + 1); + for (size_t i = 0; i < cpu.size(); ++i) { + auto cpu_place = cpu[i].place(); + auto cpu_ptr = cpu[i].data(); + auto mlu_ptr = mlu_ptrs[i]; + auto size = + cpu[i].numel() * paddle::framework::DataTypeSize(cpu[i].dtype()); + if ((platform::is_mlu_place(cpu_place))) { + memory::Copy(place_, mlu_ptr, cpu_place, cpu_ptr, size, + stream_.get()); + } else { + memory::Copy(place_, mlu_ptr, cpu_place, cpu_ptr, size, + stream_.get()); + platform::MLUStreamSync(stream_.get()); + } + mlu[i].set_lod(cpu[i].lod()); + } + platform::MLUStreamSync(stream_.get()); + } +#endif return i; })); } @@ -291,6 +357,8 @@ void BufferedReader::ReadNextImpl(std::vector *out) { *out = std::move(cuda_buffer_[i]); } else if (platform::is_npu_place(place_)) { *out = std::move(npu_buffer_[i]); + } else if (platform::is_mlu_place(place_)) { + *out = std::move(mlu_buffer_[i]); } else { *out = std::move(cpu_buffer_[i]); } diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index 3d42486c6df..f0f3b6b7f9f 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -29,6 +29,11 @@ #include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/device/npu/npu_resource_pool.h" #endif +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h" +#endif + namespace paddle { namespace operators { namespace reader { @@ -70,6 +75,7 @@ class BufferedReader : public framework::DecoratedReader { std::vector cpu_buffer_; std::vector cuda_buffer_; std::vector npu_buffer_; + std::vector mlu_buffer_; size_t prev_pos_{-1UL}; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t compute_stream_; @@ -82,6 +88,12 @@ class BufferedReader : public framework::DecoratedReader { std::shared_ptr stream_; std::vector> events_; #endif + +#ifdef PADDLE_WITH_MLU + mluStream compute_stream_; + std::shared_ptr stream_; + std::vector> events_; +#endif }; } // namespace reader diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index 7fce0296d43..7148afee273 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -80,10 +80,10 @@ void StreamCallbackManager::AddCallback( #endif #if PADDLE_WITH_MLU - VLOG(3) << "MLULaunchCallback at stream: " << stream_; - LOG(ERROR) << "failed to call MLULaunchCallback, " - << "because mlu not support StreamAddCallback yet. " - << "function: " << func; + VLOG(3) << "MLULaunchCallback at stream: " << stream_ + << " Failed to call MLULaunchCallback, " + << "because mlu not support StreamAddCallback yet. 
" + << "function: " << func; #endif } -- GitLab From 86effa0ce1309ea27f29af6a28dd5bb3d4aa1ac5 Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Wed, 9 Mar 2022 11:23:02 +0800 Subject: [PATCH 127/261] [IPU] update ipu unittests p3 (#40072) * update ipu UTs part3 * rename uts * sync api changes * update uts for new api * update use_ipumodel() * split pr --- .../unittests/ipu/test_matmul_v2_op_ipu.py | 186 ++++++++++++++++++ .../tests/unittests/ipu/test_mean_op_ipu.py | 109 ++++------ ...pipeline.py => test_model_pipeline_ipu.py} | 16 +- .../tests/unittests/ipu/test_mul_op_ipu.py | 112 +++++------ .../unittests/ipu/test_pool_avg_op_ipu.py | 84 ++++---- .../unittests/ipu/test_pool_max_op_ipu.py | 128 ++++++------ .../tests/unittests/ipu/test_pow_op_ipu.py | 140 +++++++------ .../tests/unittests/ipu/test_print_op_ipu.py | 143 ++++++++++++++ .../unittests/ipu/test_reduce_x_op_ipu.py | 124 ++++++------ .../ipu/test_reshape_inplace_op_ipu.py | 84 ++++---- .../unittests/ipu/test_reshape_op_ipu.py | 83 ++++---- ...est_save_load.py => test_save_load_ipu.py} | 105 +++++++--- .../tests/unittests/ipu/test_scale_op_ipu.py | 152 +++++++------- 13 files changed, 934 insertions(+), 532 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py rename python/paddle/fluid/tests/unittests/ipu/{test_ipu_model_pipeline.py => test_model_pipeline_ipu.py} (86%) create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py rename python/paddle/fluid/tests/unittests/ipu/{test_save_load.py => test_save_load_ipu.py} (58%) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py new file mode 100644 index 00000000000..9f1c115403a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py @@ -0,0 +1,186 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + x = np.random.uniform(size=[2, 3]) + y = np.random.uniform(size=[3, 2]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = {"transpose_x": False, "transpose_y": False} + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], + shape=self.feed_shape[1], + dtype='float32') + + out = paddle.matmul(x, y, **self.attrs) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] + + def test_base(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() + + self.check(output_dict) + + +class TestCase1(TestBase): + def set_op_attrs(self): + self.attrs = { + "transpose_x": True, + "transpose_y": True, + } + + +class TestCase3(TestBase): + def set_data_feed(self): + x = np.random.uniform(size=[5, 4, 2, 3]) + y = np.random.uniform(size=[5, 4, 3, 2]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + +class TestCase4(TestBase): + def set_data_feed(self): + x = np.random.uniform(size=[4, 2, 3]) + y = np.random.uniform(size=[4, 3, 2]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + +class TestCase5(TestBase): + def set_data_feed(self): + x = np.random.uniform(size=[4, 2, 3]) + y = np.random.uniform(size=[3, 2]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + +class 
TestCase6(TestBase): + def set_data_feed(self): + x = np.random.uniform(size=[3]) + y = np.random.uniform(size=[3]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + +@unittest.skip("not supported") +class TestCase6_2(TestCase6): + def set_data_feed(self): + x = np.random.uniform(size=[3]) + y = np.random.uniform(size=[3]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + def set_op_attrs(self): + self.attrs = {"transpose_x": True, "transpose_y": True} + + +class TestCase7(TestBase): + def set_data_feed(self): + x = np.random.uniform(size=[3, 1]) + y = np.random.uniform(size=[1, 2]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + +@unittest.skip("dim > 4 is not supported") +class TestCase8(TestBase): + def set_data_feed(self): + self.feed = { + "x": np.random.uniform(size=[6, 5, 4, 2, 3]).astype('float32'), + "y": np.random.uniform(size=[6, 5, 4, 3, 2]).astype('float32'), + } + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py index f04d712755d..b9dd7358b79 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py @@ -16,13 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,97 +26,79 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() - self.set_attrs() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() - def set_feed(self): - self.feed_shape = [] - self.feed_shape.append([1, 3, 10, 10]) + @property + def fp16_enabled(self): + return True - self.feed = {} - self.feed["in_0"] = np.random.uniform( - size=self.feed_shape[0]).astype(np.float32) + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} - self.feed_list = list(self.feed.keys()) + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} - self.attrs['axis'] = None - self.attrs['keepdim'] = False - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') - out = paddle.mean(x, **self.attrs) - fetch_list = 
[out.name] + out = paddle.fluid.layers.mean(x) - if run_ipu: - place = paddle.IPUPlace() - else: + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) - + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() -class TestCase1(TestBase): - def set_attrs(self): - self.attrs = {} - self.attrs['axis'] = 1 - self.attrs['keepdim'] = False - - -class TestCase2(TestBase): - def set_attrs(self): - self.attrs = {} - self.attrs['axis'] = 2 - self.attrs['keepdim'] = False - - -class TestCase3(TestBase): - def set_attrs(self): - self.attrs = {} - self.attrs['axis'] = 2 - self.attrs['keepdim'] = True - - -class TestCase4(TestBase): - def set_attrs(self): - self.attrs = {} - self.attrs['axis'] = None - self.attrs['keepdim'] = True + self.check(output_dict) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_model_pipeline.py b/python/paddle/fluid/tests/unittests/ipu/test_model_pipeline_ipu.py similarity index 86% rename from python/paddle/fluid/tests/unittests/ipu/test_ipu_model_pipeline.py rename to python/paddle/fluid/tests/unittests/ipu/test_model_pipeline_ipu.py index e1ed7603ed6..7e702399640 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_model_pipeline.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_model_pipeline_ipu.py @@ -17,8 +17,7 @@ from __future__ import print_function import numpy as np import unittest import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler +import paddle.static paddle.enable_static() SEED = 2021 @@ -28,7 +27,7 @@ SEED = 2021 "core is not compiled with IPU") class TestCastNet(unittest.TestCase): def _test(self, run_ipu=True): - scope = fluid.core.Scope() + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() main_prog.random_seed = SEED @@ -37,14 +36,14 @@ class TestCastNet(unittest.TestCase): np_image = np.random.rand(1, 3, 10, 10).astype(np.float32) - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): image = paddle.static.data( name='image', shape=[1, 3, 10, 10], dtype='float32') - with fluid.ipu_shard(ipu_index=0): + with paddle.static.ipu_shard_guard(index=0): conv1 = paddle.static.nn.conv2d( image, num_filters=3, filter_size=3, bias_attr=False) - with fluid.ipu_shard(ipu_index=1): 
+ with paddle.static.ipu_shard_guard(index=1): conv2 = paddle.static.nn.conv2d( conv1, num_filters=3, filter_size=3, bias_attr=False) loss = paddle.mean(conv2) @@ -60,9 +59,10 @@ class TestCastNet(unittest.TestCase): feed_list = [image.name] fetch_list = [loss.name] ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig( + ipu_strategy.set_graph_config( num_ipus=2, is_training=False, enable_manual_shard=True) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_pipelining_config(enable_pipelining=False) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py index 78a2589d9ac..7a9135626df 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py @@ -16,14 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,90 +26,98 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[2, 5]).astype('float32'), - "y": np.random.uniform(size=[5, 3]).astype('float32'), - } + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + x = np.random.uniform(size=[2, 5]) + y = np.random.uniform(size=[5, 3]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "x_num_col_dims": 1, "y_num_col_dims": 1, } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') y = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) + dtype='float32') + out = paddle.fluid.layers.mul(x, y, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() 
+ exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) - - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 2, 5]).astype('float32'), - "y": np.random.uniform(size=[5, 3]).astype('float32'), - } + def set_data_feed(self): + x = np.random.uniform(size=[1, 2, 5]) + y = np.random.uniform(size=[5, 3]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "x_num_col_dims": 2, "y_num_col_dims": 1, @@ -123,13 +125,13 @@ class TestCase1(TestBase): class TestCase2(TestBase): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[3, 4, 2, 9]).astype('float32'), - "y": np.random.uniform(size=[3, 6, 1, 2, 3]).astype('float32'), - } + def set_data_feed(self): + x = np.random.uniform(size=[3, 4, 2, 9]) + y = np.random.uniform(size=[3, 6, 1, 2, 3]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} - def set_attrs(self): + def set_op_attrs(self): self.attrs = { 'x_num_col_dims': 2, 'y_num_col_dims': 2, diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py index e81591ad683..4288b82832e 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py @@ -16,14 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,23 +26,25 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), - } + @property + def fp16_enabled(self): + return True + + def 
set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "pool_size": 3, "pool_type": 'avg', @@ -60,53 +56,59 @@ class TestBase(IPUOpTest): "data_format": 'NCHW', } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') + out = paddle.fluid.layers.pool2d(x, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py index a7c45c6686f..911a163b8aa 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py @@ -16,14 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from 
paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,23 +26,25 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), - } + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "pool_size": 3, "pool_type": 'max', @@ -60,120 +56,126 @@ class TestBase(IPUOpTest): "data_format": 'NCHW', } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') + out = paddle.fluid.layers.pool2d(x, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['pool_size'] = 3 class TestCase1_2(TestBase): - def 
set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['pool_size'] = [3, 1] class TestCase2(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['pool_stride'] = 2 class TestCase2_2(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['pool_stride'] = [2, 1] class TestCase3(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['pool_padding'] = [1, 1] class TestCase3_2(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['pool_padding'] = [1, 1, 2, 2] @unittest.skip('auto_pad is not currently supported') class TestCase3_3(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['pool_padding'] = 'VALID' @unittest.skip('auto_pad is not currently supported') class TestCase3_4(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['pool_padding'] = 'SAME' class TestCase4(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['global_pooling'] = True class TestCase5(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['ceil_mode'] = True class TestCase6(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['exclusive'] = False diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py index 5059de7ba77..b3562d722c4 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py @@ -16,14 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,124 +26,146 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 2, 2]).astype('float32'), - } + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 2, 2]) + self.feed_fp32 = {"x": data.astype(np.float32)} + self.feed_fp16 = {"x": data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"factor": 2.0} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = 
paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') + out = paddle.fluid.layers.pow(x, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) - - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 2, 2]).astype('float32'), - "y": np.array([2.0]).astype('float32'), + def set_data_feed(self): + data1 = np.random.uniform(size=[1, 3, 2, 2]) + data2 = np.array([2.0]) + + self.feed_fp32 = { + "x": data1.astype(np.float32), + "y": data2.astype(np.float32) + } + self.feed_fp16 = { + "x": data1.astype(np.float16), + "y": data2.astype(np.float16) } - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') factor = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) + dtype='float32') + out = paddle.fluid.layers.pow(x, factor=factor, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) 
exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] diff --git a/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py new file mode 100644 index 00000000000..c9454e5945f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py @@ -0,0 +1,143 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_data_feed(self): + self.feed = { + "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float32'), + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed.values()] + self.feed_list = list(self.feed.keys()) + self.feed_dtype = [x.dtype for x in self.feed.values()] + + def set_op_attrs(self): + self.attrs = {} + + def _test_base(self, run_ipu=True): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype=self.feed_dtype[0]) + out = paddle.fluid.layers.conv2d( + x, num_filters=3, filter_size=3) + out = paddle.fluid.layers.Print(out, **self.attrs) + + if self.is_training: + loss = paddle.mean(out) + adam = paddle.optimizer.Adam(learning_rate=1e-2) + adam.minimize(loss) + fetch_list = [loss.name] + else: + fetch_list = [out.name] + + if run_ipu: + place = paddle.IPUPlace() + else: + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if run_ipu: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + program = paddle.static.IpuCompiledProgram( + main_prog, + 
ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + if self.is_training: + result = [] + for _ in range(self.epoch): + loss_res = exe.run(program, + feed=self.feed, + fetch_list=fetch_list) + result.append(loss_res[0]) + return np.array(result) + else: + result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + return result[0] + + def test(self): + res0 = self._test_base(False) + res1 = self._test_base(True) + + self.assertTrue( + np.allclose( + res0.flatten(), res1.flatten(), atol=self.atol)) + + self.assertTrue(res0.shape == res1.shape) + + +class TestCase1(TestBase): + def set_op_attrs(self): + self.attrs = {"message": "input_data"} + + +class TestTrainCase1(TestBase): + def set_op_attrs(self): + # "forward" : print forward + # "backward" : print forward and backward + # "both": print forward and backward + self.attrs = {"message": "input_data2", "print_phase": "both"} + + def set_training(self): + self.is_training = True + self.epoch = 2 + + +@unittest.skip("attrs are not supported") +class TestCase2(TestBase): + def set_op_attrs(self): + self.attrs = { + "first_n": 10, + "summarize": 10, + "print_tensor_name": True, + "print_tensor_type": True, + "print_tensor_shape": True, + "print_tensor_layout": True, + "print_tensor_lod": True + } + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py index ac8ad08e8b2..929ee51b650 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py @@ -16,14 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,125 +26,137 @@ class TestMean(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.init_op() + self.set_test_op() + + @property + def fp16_enabled(self): + return True - def init_op(self): + def set_test_op(self): self.op = paddle.fluid.layers.reduce_mean def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] - - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = self.op(x, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == 
ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def run_test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) - - def set_feed0(self): - self.feed = {} - self.feed["in_0"] = np.random.uniform(size=[2, 4]).astype(np.float32) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() + + self.check(output_dict) + + def set_data_feed0(self): + data = np.random.uniform(size=[2, 4]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} self.set_feed_attr() - def set_feed1(self): - self.feed = {} - self.feed["in_0"] = np.random.uniform(size=[2, 2, 2]).astype(np.float32) + def set_data_feed1(self): + data = np.random.uniform(size=[2, 2, 2]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} self.set_feed_attr() - def set_attr0(self): + def set_op_attr0(self): self.attrs = {} self.attrs['dim'] = None self.attrs['keep_dim'] = False def test_case0(self): - self.set_feed0() - self.set_attr0() + self.set_data_feed0() + self.set_op_attr0() self.run_test_base() def test_case1(self): - self.set_feed0() - self.set_attr0() + self.set_data_feed0() + self.set_op_attr0() self.attrs['dim'] = 0 self.run_test_base() def test_case2(self): - self.set_feed0() - self.set_attr0() + self.set_data_feed0() + self.set_op_attr0() self.attrs['dim'] = -1 self.run_test_base() def test_case3(self): - self.set_feed0() - self.set_attr0() + self.set_data_feed0() + self.set_op_attr0() self.attrs['dim'] = 1 self.run_test_base() def test_case4(self): - self.set_feed0() + self.set_data_feed0() self.attrs = {} self.attrs['dim'] = 1 self.attrs['keep_dim'] = True self.run_test_base() def test_case5(self): - self.set_feed1() + self.set_data_feed1() self.attrs = {} self.attrs['dim'] = [1, 2] self.attrs['keep_dim'] = False self.run_test_base() def test_case6(self): - self.set_feed1() + self.set_data_feed1() self.attrs = {} self.attrs['dim'] = [0, 1] self.attrs['keep_dim'] = False self.run_test_base() def test_case7(self): - self.set_feed1() + self.set_data_feed1() self.attrs = {} self.attrs['dim'] = [0, 1] self.attrs['keep_dim'] = True @@ -158,22 +164,22 @@ class TestMean(IPUOpTest): class TestMax(TestMean): - def init_op(self): + def set_test_op(self): self.op = paddle.fluid.layers.reduce_max class TestMin(TestMean): - def init_op(self): + def set_test_op(self): self.op = paddle.fluid.layers.reduce_min class TestProd(TestMean): - def init_op(self): + def 
set_test_op(self): self.op = paddle.fluid.layers.reduce_prod class TestSum(TestMean): - def init_op(self): + def set_test_op(self): self.op = paddle.fluid.layers.reduce_sum diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py index f312b7b69ad..9ddf5c7537f 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py @@ -16,14 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,76 +26,84 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), - } + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {"x": data.astype(np.float32)} + self.feed_fp16 = {"x": data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "shape": [30, 10], "inplace": True, } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') + add = paddle.fluid.layers.elementwise_add(x, x) out = paddle.fluid.layers.reshape(add, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, 
feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode) - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict, check_shape=True) class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py index 5163838bc0c..11977193170 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py @@ -16,13 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,82 +26,92 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() - self.set_attrs() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() - def set_feed(self): - self.feed_shape = [] - self.feed_shape.append([2, 4, 6]) + @property + def fp16_enabled(self): + return True - self.feed = {} - self.feed["in_0"] = np.random.uniform( - size=self.feed_shape[0]).astype(np.float32) + def set_data_feed(self): + data = np.random.uniform(size=[2, 4, 6]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} - self.feed_list = list(self.feed.keys()) + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['shape'] = [6, 8] self.attrs['inplace'] = False - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.reshape(x=x, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == 
ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode) - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict, check_shape=True) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['shape'] = [2, 3, -1, 2] self.attrs['inplace'] = False class TestCase2(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['shape'] = [-1, 0, 3, 2] self.attrs['inplace'] = False diff --git a/python/paddle/fluid/tests/unittests/ipu/test_save_load.py b/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py similarity index 58% rename from python/paddle/fluid/tests/unittests/ipu/test_save_load.py rename to python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py index 24bb8e11184..3a694873062 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_save_load.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py @@ -12,55 +12,52 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import tempfile import unittest -import shutil import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -paddle.enable_static() - @unittest.skipIf(not paddle.is_compiled_with_ipu(), "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): self.set_atol() - self.set_feed() - self.set_attrs() - - def set_feed(self): - self.feed_shape = [] - self.feed_shape.append([1, 3, 10, 10]) + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() - self.feed = {} - self.feed["in_0"] = np.random.uniform( - size=self.feed_shape[0]).astype(np.float32) + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} - self.feed_list = list(self.feed.keys()) + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['steps'] = 100 self.attrs['save_at_step'] = 20 self.attrs['is_training'] = True self.attrs['opt_type'] = 'sgd' + self.attrs['enable_fp16'] = False + self.attrs['model_path'] = tempfile.TemporaryDirectory() def _test_base(self, save_otherwise_load): - scope = fluid.core.Scope() + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() main_prog.random_seed = self.SEED startup_prog.random_seed = self.SEED - generator = fluid.unique_name.UniqueNameGenerator() + generator = paddle.fluid.unique_name.UniqueNameGenerator() - with fluid.unique_name.guard(generator): - with fluid.scope_guard(scope): + with paddle.fluid.unique_name.guard(generator): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], @@ -91,12 +88,17 @@ class TestBase(IPUOpTest): exe.run(startup_prog) if not save_otherwise_load: - paddle.static.load(main_prog, "model/model") + paddle.static.load(main_prog, self.attrs['model_path'].name) ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig( + ipu_strategy.set_graph_config( is_training=self.attrs['is_training']) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_precision_config( + enable_fp16=self.attrs['enable_fp16']) + ipu_strategy.set_options({ + 'save_per_n_step': self.attrs['save_at_step'] + }) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile( self.feed_list, fetch_list) @@ -104,16 +106,17 @@ class TestBase(IPUOpTest): run_steps = self.attrs['steps'] if save_otherwise_load \ else self.attrs['steps'] - self.attrs['save_at_step'] + feed = self.feed_fp16 if self.attrs[ + 'enable_fp16'] else self.feed_fp32 for i in range(run_steps): - tmp = exe.run(program, - feed=self.feed, - fetch_list=fetch_list) + tmp = exe.run(program, feed=feed, fetch_list=fetch_list) # currently, we update opt state every sess.run, # will optimize if save_otherwise_load and \ i == self.attrs['save_at_step'] - 1: - paddle.static.save(main_prog, "model/model") + paddle.static.save(main_prog, + self.attrs['model_path'].name) if save_otherwise_load and i >= self.attrs['save_at_step']: result.append(tmp) @@ -129,25 +132,65 @@ class TestBase(IPUOpTest): self.assertTrue( np.allclose( res0.flatten(), res1.flatten(), atol=self.atol)) - shutil.rmtree("model", 
True) + self.attrs['model_path'].cleanup() class TestAdam(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['steps'] = 100 self.attrs['save_at_step'] = 20 self.attrs['is_training'] = True self.attrs['opt_type'] = 'adam' + self.attrs['enable_fp16'] = False + self.attrs['model_path'] = tempfile.TemporaryDirectory() class TestLamb(TestBase): - def set_attrs(self): + def set_op_attrs(self): + self.attrs = {} + self.attrs['steps'] = 100 + self.attrs['save_at_step'] = 20 + self.attrs['is_training'] = True + self.attrs['opt_type'] = 'lamb' + self.attrs['enable_fp16'] = False + self.attrs['model_path'] = tempfile.TemporaryDirectory() + + +@unittest.skipIf(IPUOpTest.use_ipumodel(), "skip for ipumodel") +class TestSGDFP16(TestBase): + def set_op_attrs(self): + self.attrs = {} + self.attrs['steps'] = 100 + self.attrs['save_at_step'] = 20 + self.attrs['is_training'] = True + self.attrs['opt_type'] = 'sgd' + self.attrs['enable_fp16'] = True + self.attrs['model_path'] = tempfile.TemporaryDirectory() + + +@unittest.skipIf(IPUOpTest.use_ipumodel(), "skip for ipumodel") +class TestAdamFP16(TestBase): + def set_op_attrs(self): + self.attrs = {} + self.attrs['steps'] = 100 + self.attrs['save_at_step'] = 20 + self.attrs['is_training'] = True + self.attrs['opt_type'] = 'adam' + self.attrs['enable_fp16'] = True + self.attrs['model_path'] = tempfile.TemporaryDirectory() + + +@unittest.skipIf(IPUOpTest.use_ipumodel(), "skip for ipumodel") +class TestLambFP16(TestBase): + def set_op_attrs(self): self.attrs = {} self.attrs['steps'] = 100 self.attrs['save_at_step'] = 20 self.attrs['is_training'] = True self.attrs['opt_type'] = 'lamb' + self.attrs['enable_fp16'] = True + self.attrs['model_path'] = tempfile.TemporaryDirectory() if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py index 6ad2a89a738..49714eba8d4 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py @@ -16,14 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,80 +26,88 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), - } + @property + def fp16_enabled(self): + return False + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {"x": data.astype(np.float32)} + self.feed_fp16 = {"x": data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "scale": 1.0, "bias": 
0.0, "bias_after_scale": True, } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') + out = paddle.fluid.layers.scale(x, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "scale": 5.0, "bias": 0.0, @@ -114,7 +116,7 @@ class TestCase1(TestBase): class TestCase2(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "scale": 1.0, "bias": 0.5, @@ -123,7 +125,16 @@ class TestCase2(TestBase): class TestCase3(TestBase): - def set_attrs(self): + def set_op_attrs(self): + self.attrs = { + "scale": 5.0, + "bias": 0.7, + "bias_after_scale": True, + } + + +class TestCase4(TestBase): + def set_op_attrs(self): self.attrs = { "scale": 1.0, "bias": 0.0, @@ -131,59 +142,66 @@ class TestCase3(TestBase): } -class TestCase4(TestBase): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[3, 3, 10, 10]).astype('float32'), - "y": np.array([3.0]).astype('float32'), - } +class TestCase5(TestBase): + def set_data_feed(self): + x = np.random.uniform(size=[3, 3, 10, 10]) + y = np.array([3.0]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "bias": 0.0, "bias_after_scale": True, } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() 
- SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') y = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) + dtype='float32') + out = paddle.fluid.layers.scale(x, scale=y, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] -- GitLab From fe765cb34e5a3970119f73472ab8cdd250924f11 Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Wed, 9 Mar 2022 11:23:12 +0800 Subject: [PATCH 128/261] [IPU] update ipu unittests p1 (#39923) * update ipu UTs part1 * rename ut * sync api changes * update uts for new api * update use_ipumodel() * update use_ipumodel() * split pr --- .../unittests/ipu/test_dropout_op_ipu.py | 88 +++++----- .../unittests/ipu/test_elemetwise_x_op_ipu.py | 150 +++++++++------- .../tests/unittests/ipu/test_equal_op_ipu.py | 114 +++++++------ .../tests/unittests/ipu/test_expand_op_ipu.py | 135 ++++++++------- .../ipu/test_fill_any_like_op_ipu.py | 111 ++++++++++++ .../ipu/test_fill_constant_op_ipu.py | 68 ++++---- .../ipu/test_fp16_inference_io_ipu.py | 160 ++++++++++++++++++ .../tests/unittests/ipu/test_gather_op_ipu.py | 97 +++++------ .../tests/unittests/ipu/test_gelu_op_ipu.py | 93 +++++----- .../unittests/ipu/test_greater_op_ipu.py | 140 +++++++++++++++ .../unittests/ipu/test_groupnorm_op_ipu.py | 112 ++++++------ ...l_io.py => test_inference_model_io_ipu.py} | 71 ++++---- .../unittests/ipu/test_instancenorm_op_ipu.py | 104 ++++++------ 13 files changed, 960 insertions(+), 483 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_fp16_inference_io_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py rename python/paddle/fluid/tests/unittests/ipu/{test_ipu_inference_model_io.py => test_inference_model_io_ipu.py} (78%) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py index 8b1560edfd8..e34da7f7016 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py @@ -16,14 +16,9 @@ import 
unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, + IPUOpTest) @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,81 +27,88 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32') - } + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {'x': data.astype(np.float32)} + self.feed_fp16 = {'x': data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "dropout_prob": 0.5, "is_test": True, "dropout_implementation": "downgrade_in_infer" } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') + dropout = paddle.fluid.layers.dropout(x, **self.attrs) out = paddle.fluid.layers.elementwise_add(dropout, dropout) fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 - def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = 
self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "dropout_prob": 0.5, "is_test": True, @@ -115,7 +117,7 @@ class TestCase1(TestBase): class TestCase2(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "dropout_prob": 0.0, "is_test": False, diff --git a/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py index 07b06d77c90..a9d6d230832 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py @@ -16,14 +16,9 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, + IPUOpTest) @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,101 +27,136 @@ class TestMul(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.init_op() + self.set_test_op() + + @property + def fp16_enabled(self): + if IPUOpTest.use_ipumodel(): + return False + else: + return True - def init_op(self): + def set_test_op(self): self.op = paddle.fluid.layers.elementwise_mul def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] - - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') y = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) + dtype='float32') + out = self.op(x, y, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > 
ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def run_test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) def test_case0(self): - self.feed = { - "x": np.random.uniform(size=(2, 3, 4, 5)).astype('float32'), - "y": np.random.uniform(size=(2, 3, 4, 5)).astype('float32'), + data_x = np.random.uniform(size=(2, 3, 4, 5)) + data_y = np.random.uniform(size=(2, 3, 4, 5)) + + self.feed_fp32 = { + "x": data_x.astype('float32'), + "y": data_y.astype('float32'), + } + self.feed_fp16 = { + "x": data_x.astype('float16'), + "y": data_y.astype('float16'), } self.attrs = {} self.set_feed_attr() self.run_test_base() def test_case1(self): - self.feed = { - "x": np.random.uniform(size=(2, 3, 4, 5)).astype('float32'), - "y": np.random.uniform(size=(3, 4)).astype('float32'), + data_x = np.random.uniform(size=(2, 3, 4, 5)) + data_y = np.random.uniform(size=(3, 4)) + self.feed_fp32 = { + "x": data_x.astype('float32'), + "y": data_y.astype('float32'), + } + self.feed_fp16 = { + "x": data_x.astype('float16'), + "y": data_y.astype('float16'), } self.set_feed_attr() self.attrs = {"axis": 1} self.run_test_base() def test_case2(self): - self.feed = { - "x": np.random.uniform(size=(2, 3, 4, 5)).astype('float32'), - "y": np.random.uniform(size=(5)).astype('float32'), + data_x = np.random.uniform(size=(2, 3, 4, 5)) + data_y = np.random.uniform(size=(5)) + self.feed_fp32 = { + "x": data_x.astype('float32'), + "y": data_y.astype('float32'), + } + self.feed_fp16 = { + "x": data_x.astype('float16'), + "y": data_y.astype('float16'), } self.set_feed_attr() self.attrs = {"axis": -1} self.run_test_base() def test_case3(self): - self.feed = { - "x": np.random.uniform(size=(2, 3, 4, 5)).astype('float32'), - "y": np.random.uniform(size=(2)).astype('float32'), + data_x = np.random.uniform(size=(2, 3, 4, 5)) + data_y = np.random.uniform(size=(2)) + self.feed_fp32 = { + "x": data_x.astype('float32'), + "y": data_y.astype('float32'), + } + self.feed_fp16 = { + "x": data_x.astype('float16'), + "y": data_y.astype('float16'), } self.set_feed_attr() self.attrs = {"axis": 0} @@ -134,37 +164,43 @@ class TestMul(IPUOpTest): class TestAdd(TestMul): - def init_op(self): + def set_test_op(self): self.op = paddle.fluid.layers.elementwise_add class TestSub(TestMul): - def init_op(self): + def set_test_op(self): self.op = paddle.fluid.layers.elementwise_sub class TestDiv(TestMul): - def init_op(self): + def set_test_op(self): self.op = paddle.fluid.layers.elementwise_div class TestMin(TestMul): - def init_op(self): + def set_test_op(self): self.op = paddle.fluid.layers.elementwise_min class TestMax(TestMul): - def init_op(self): + def set_test_op(self): self.op = paddle.fluid.layers.elementwise_max class TestPow(TestMul): - def init_op(self): + def set_test_op(self): self.op = paddle.fluid.layers.elementwise_pow class TestMod(TestMul): - def init_op(self): + def set_atol(self): + self.atol = 1e-7 + self.rtol = 1e-5 + self.atol_fp16 = 1e-2 + self.rtol_fp16 = 1e-3 + + def set_test_op(self): self.op = paddle.fluid.layers.elementwise_mod diff --git a/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py 
b/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py index c319894bfae..5b18c738513 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py @@ -16,14 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,94 +26,106 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() - - def set_feed(self): - self.feed = { - "x": np.ones([1, 10]).astype('float32'), - "y": np.zeros([1, 10]).astype('float32'), + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + x = np.ones([1, 10]) + y = np.zeros([1, 10]) + self.feed_fp32 = { + "x": x.astype(np.float32), + "y": y.astype(np.float32), + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "y": y.astype(np.float16), } def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): - # XX x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') y = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) + dtype='float32') + out = paddle.fluid.layers.equal(x, y, **self.attrs) fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 - def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) + result = exe.run(program, 
feed=feed, fetch_list=fetch_list) + return result[0] - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten().astype(np.int32) - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): - def set_feed(self): - self.feed = { - "x": np.ones([1, 10]).astype('float32'), - "y": np.ones([1, 10]).astype('float32'), - } + def set_data_feed(self): + x = np.ones([1, 10]) + y = np.ones([1, 10]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} class TestCase2(TestBase): - def set_feed(self): - self.feed = { - "x": np.ones([1, 10]).astype('float32'), - "y": np.arange(0, 10).reshape([1, 10]).astype('float32'), - } + def set_data_feed(self): + x = np.ones([1, 10]) + y = np.arange(0, 10).reshape([1, 10]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py index 5b7ea61568e..966dfdef87b 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py @@ -16,14 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,125 +26,142 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True - def set_feed(self): - self.feed = {"x": np.random.uniform(size=[2, 3, 1]).astype('float32')} + def set_data_feed(self): + data = np.random.uniform(size=[2, 3, 1]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"expand_times": [1, 2, 2]} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], 
shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype="float32") + out = paddle.fluid.layers.expand(x, **self.attrs) fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 - def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): - def set_feed(self): - self.feed = {"x": np.random.uniform(size=[2, 2]).astype('float32')} + def set_data_feed(self): + x = np.random.uniform(size=[2, 2]) + self.feed_fp32 = {"x": x.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) - expand_times = fluid.layers.fill_constant( + dtype="float32") + + expand_times = paddle.fluid.layers.fill_constant( shape=[len(self.feed_shape[0])], dtype="int32", value=2) out = paddle.fluid.layers.expand( x, expand_times=expand_times, **self.attrs) fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - 
ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] diff --git a/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py new file mode 100644 index 00000000000..00b855a5a7a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py @@ -0,0 +1,111 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data = np.random.uniform(size=[2, 3, 1]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = {'fill_value': 0.3, 'dtype': 'float32'} + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + x_fill = paddle.full_like(x, **self.attrs) + out = paddle.fluid.layers.elementwise_add(x_fill, x_fill) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + 
ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] + + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() + + self.check(output_dict) + + +class TestCase1(TestBase): + def set_op_attrs(self): + self.attrs = {'fill_value': 3, 'dtype': 'int32'} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py index c62e0c08f9c..3a1c202bf11 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py @@ -16,14 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,21 +26,23 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True - def set_feed(self): + def set_data_feed(self): self.feed = {} def set_feed_attr(self): self.feed_shape = [x.shape for x in self.feed.values()] self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_dtype = [x.dtype for x in self.feed.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = { 'name': 'x', 'shape': [1, 3, 3, 3], @@ -54,33 +50,34 @@ class TestBase(IPUOpTest): 'value': 0.3, } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.fluid.layers.fill_constant(**self.attrs) out = paddle.fluid.layers.elementwise_add(x, x) - fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: @@ -89,19 +86,18 @@ class 
TestBase(IPUOpTest): result = exe.run(program, feed=self.feed, fetch_list=fetch_list) return result[0] - def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = { 'name': 'x', 'shape': [1, 3, 3, 3], diff --git a/python/paddle/fluid/tests/unittests/ipu/test_fp16_inference_io_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fp16_inference_io_ipu.py new file mode 100644 index 00000000000..cd29ff705b8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_fp16_inference_io_ipu.py @@ -0,0 +1,160 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import shutil + +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-5 + self.atol_fp16 = 1e-2 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = {} + self.attrs['steps'] = 100 + self.attrs['save_at_step'] = 20 + self.attrs['is_training'] = True + self.attrs['opt_type'] = 'sgd' + self.attrs['path'] = 'model' + self.attrs['model_name'] = 'test' + + def _test_save(self): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + generator = paddle.fluid.unique_name.UniqueNameGenerator() + self.full_name = '/'.join( + [self.attrs['path'], self.attrs['model_name']]) + + with paddle.fluid.unique_name.guard(generator): + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + scale = paddle.fluid.layers.scale( + x, scale=1.0, bias=0.0, bias_after_scale=True) + conv = paddle.static.nn.conv2d( + scale, + num_filters=3, + filter_size=3, + bias_attr=False, + name='conv2d') + loss = paddle.mean(conv) + + if self.attrs['is_training']: 
+ if self.attrs['opt_type'] == 'sgd': + sgd = paddle.optimizer.SGD(learning_rate=1e-2) + sgd.minimize(loss) + elif self.attrs['opt_type'] == 'adam': + adam = paddle.optimizer.Adam(learning_rate=1e-2) + adam.minimize(loss) + elif self.attrs['opt_type'] == 'lamb': + lamb = paddle.optimizer.Lamb(learning_rate=1e-2) + lamb.minimize(loss) + + fetch_list = [loss.name] + + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=True) + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, ipu_strategy=ipu_strategy).compile( + self.feed_list, fetch_list) + + for _ in range(self.attrs['steps']): + exe.run(program, feed=self.feed_fp16, fetch_list=fetch_list) + + paddle.static.save_inference_model( + self.full_name, x, loss, exe, program=program.org_program) + + def _test_load(self, run_ipu): + if run_ipu: + place = paddle.IPUPlace() + else: + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + + [inference_program, feed_target_names, fetch_targets] = ( + paddle.static.load_inference_model(self.full_name, exe)) + + if run_ipu: + feed_list = feed_target_names + fetch_list = [fetch_targets[0].name] + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=False) + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + inference_program, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = inference_program + + feed = self.feed_fp16 if run_ipu else self.feed_fp32 + result = [] + for i in range(10): + feed["in_0"] += np.array([1.1 * i]).astype(feed["in_0"].dtype) + out = exe.run(program, feed=feed, fetch_list=[fetch_targets]) + result.append(out) + + return np.array(result) + + def test_base(self): + self._test_save() + cpu_res = self._test_load(False) + ipu_res = self._test_load(True).astype(np.float32) + + self.assertTrue( + np.allclose( + cpu_res, ipu_res, rtol=self.rtol_fp16, atol=self.atol_fp16)) + + shutil.rmtree(self.attrs['path'], True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py index d5be8ae0cf7..01a56fd14be 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py @@ -16,14 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,85 +26,92 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[10, 20]).astype('float32'), - "y": np.array([1, 3, 5]).astype('int32'), - } + def set_data_feed(self): + x = np.random.uniform(size=[10, 20]) + y = np.array([1, 3, 5]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.int32)} + self.feed_fp16 = {"x": 
x.astype(np.float16), "y": y.astype(np.int32)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') y = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) + dtype='int32') + out = paddle.fluid.layers.gather(x, index=y, **self.attrs) fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 - def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[100]).astype('float32'), - "y": np.array([1, 3, 5]).astype('int32'), - } + def set_data_feed(self): + x = np.random.uniform(size=[100]) + y = np.array([1, 3, 5]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.int32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.int32)} if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py index ca8c0935d78..602289f3f19 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py @@ -16,14 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import 
paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,80 +26,89 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() - def set_atol(self): - self.atol = 1e-3 + @property + def fp16_enabled(self): + return True - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32') - } + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"approximate": False} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') + out = paddle.fluid.layers.gelu(x, **self.attrs) fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 - def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) -@unittest.skip('approximate=True is not supported') class TestCase1(TestBase): - def 
set_attrs(self): + def set_atol(self): + self.atol = 1e-10 + self.rtol = 1e-6 + self.atol_fp16 = 2e-3 + self.rtol_fp16 = 1e-3 + + def set_op_attrs(self): self.attrs = {"approximate": True} diff --git a/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py new file mode 100644 index 00000000000..05a37dcb3d5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py @@ -0,0 +1,140 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + x = np.random.randn(3, 4, 5) + y = np.random.randn(3, 4, 5) + self.feed_fp32 = { + "x": x.astype(np.float32), + "y": y.astype(np.float32), + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "y": y.astype(np.float16), + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = {} + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], + shape=self.feed_shape[1], + dtype='float32') + + out = paddle.fluid.layers.greater_than(x, y, **self.attrs) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] + + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + 
output_dict[mode] = self._test_base(mode).flatten().astype(np.int32) + + self.check(output_dict) + + +class TestCase1(TestBase): + def set_data_feed(self): + x = np.ones([1, 10]) + y = np.ones([10]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + +class TestCase2(TestBase): + def set_data_feed(self): + x = np.ones([1, 10]) + y = np.zeros([1, 10]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + +class TestCase3(TestBase): + def set_data_feed(self): + x = np.zeros([1, 10]) + y = np.ones([1, 10]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py index eb644c2c667..102e764cb2f 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py @@ -16,14 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,43 +26,49 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 8, 10, 10]).astype('float32'), - } + @property + def fp16_enabled(self): + return True + + def set_atol(self): + self.atol = 3e-6 + self.rtol = 1e-6 + self.atol_fp16 = 4e-3 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + data = np.random.uniform(size=[1, 8, 10, 10]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "groups": 8, "epsilon": 1e-05, "data_layout": 'NCHW', } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') if self.is_training: ch = self.feed_shape[0][1] @@ -78,62 +78,68 @@ class TestBase(IPUOpTest): bias = paddle.ParamAttr(trainable=True) out = 
paddle.fluid.layers.nn.group_norm( conv1, param_attr=scale, bias_attr=bias, **self.attrs) + loss = paddle.mean(out) + adam = paddle.optimizer.Adam(learning_rate=1e-2) + adam.minimize(loss) else: - scale = True - bias = True out = paddle.fluid.layers.nn.group_norm( - x, param_attr=scale, bias_attr=bias, **self.attrs) + x, param_attr=True, bias_attr=True, **self.attrs) if self.is_training: - loss = paddle.mean(out) - adam = paddle.optimizer.Adam(learning_rate=1e-2) - adam.minimize(loss) fetch_list = [loss.name] else: fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + if self.is_training: result = [] for _ in range(self.epoch): loss_res = exe.run(program, - feed=self.feed, + feed=feed, fetch_list=fetch_list) result.append(loss_res[0]) return np.array(result) else: - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + if mode > ExecutionMode.IPU_FP32 and self.is_training: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) - - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "groups": 4, "epsilon": 1e-05, @@ -147,11 +153,15 @@ class TestTrainCase1(TestBase): self.epoch = 10 +@unittest.skipIf(IPUOpTest.use_ipumodel(), "skip for ipumodel") class TestTrainCase2(TestBase): def set_atol(self): - self.atol = 1e-3 + self.atol = 7e-4 + self.rtol = 1e-6 + self.atol_fp16 = 4e-3 + self.rtol_fp16 = 1e-3 - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "groups": 4, "epsilon": 1e-05, @@ -163,7 +173,5 @@ class TestTrainCase2(TestBase): self.epoch = 10 -# not support `group_norm(x, param_attr=False, bias_attr=False, **self.attrs)` - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_inference_model_io.py b/python/paddle/fluid/tests/unittests/ipu/test_inference_model_io_ipu.py similarity index 78% rename from python/paddle/fluid/tests/unittests/ipu/test_ipu_inference_model_io.py rename to python/paddle/fluid/tests/unittests/ipu/test_inference_model_io_ipu.py index 0a331d80454..33a63a80e3b 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_inference_model_io.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_inference_model_io_ipu.py @@ -12,59 +12,59 @@ # See the License for the specific language governing permissions and # limitations under 
the License. +import tempfile import unittest -import shutil import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -paddle.enable_static() - @unittest.skipIf(not paddle.is_compiled_with_ipu(), "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): self.set_atol() - self.set_feed() - self.set_attrs() - - def set_feed(self): - self.feed_shape = [] - self.feed_shape.append([1, 3, 10, 10]) - - self.feed = {} - self.feed["in_0"] = np.random.uniform( - size=self.feed_shape[0]).astype(np.float32) - + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-5 + self.atol_fp16 = 1e-2 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed = {"in_0": data.astype(np.float32)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed.values()] self.feed_list = list(self.feed.keys()) - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['steps'] = 100 self.attrs['save_at_step'] = 20 self.attrs['is_training'] = True self.attrs['opt_type'] = 'sgd' - self.attrs['path'] = 'model' + self.attrs['path'] = tempfile.TemporaryDirectory() self.attrs['model_name'] = 'test' def _test_save(self): - scope = fluid.core.Scope() + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() main_prog.random_seed = self.SEED startup_prog.random_seed = self.SEED - generator = fluid.unique_name.UniqueNameGenerator() + generator = paddle.fluid.unique_name.UniqueNameGenerator() self.full_name = '/'.join( - [self.attrs['path'], self.attrs['model_name']]) + [self.attrs['path'].name, self.attrs['model_name']]) - with fluid.unique_name.guard(generator): - with fluid.scope_guard(scope): + with paddle.fluid.unique_name.guard(generator): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], @@ -88,16 +88,16 @@ class TestBase(IPUOpTest): elif self.attrs['opt_type'] == 'lamb': lamb = paddle.optimizer.Lamb(learning_rate=1e-2) lamb.minimize(loss) - fetch_list = [loss.name] + fetch_list = [loss.name] place = paddle.IPUPlace() exe = paddle.static.Executor(place) exe.run(startup_prog) ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig( + ipu_strategy.set_graph_config( is_training=self.attrs['is_training']) - program = compiler.IPUCompiledProgram( + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile( self.feed_list, fetch_list) @@ -125,8 +125,8 @@ class TestBase(IPUOpTest): feed_list = feed_target_names fetch_list = [fetch_targets[0].name] ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=False) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=False) + program = paddle.static.IpuCompiledProgram( inference_program, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: @@ -134,7 +134,7 @@ class TestBase(IPUOpTest): tmp = exe.run(program, feed=self.feed, fetch_list=[fetch_targets]) - return tmp + return np.array(tmp) def test_base(self): self._test_save() @@ -142,27 +142,26 @@ class TestBase(IPUOpTest): ipu_res = self._test_load(True) self.assertTrue(np.allclose(cpu_res, ipu_res, atol=self.atol)) - - 
shutil.rmtree(self.attrs['path'], True) + self.attrs['path'].cleanup() class TestAdam(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['steps'] = 100 self.attrs['is_training'] = True self.attrs['opt_type'] = 'adam' - self.attrs['path'] = 'model' + self.attrs['path'] = tempfile.TemporaryDirectory() self.attrs['model_name'] = 'test' class TestLamb(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['steps'] = 100 self.attrs['is_training'] = True self.attrs['opt_type'] = 'lamb' - self.attrs['path'] = 'model' + self.attrs['path'] = tempfile.TemporaryDirectory() self.attrs['model_name'] = 'test' diff --git a/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py index ee9cd875cf2..ed8f3950ace 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py @@ -16,14 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,39 +26,45 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-5 + self.atol_fp16 = 1e-2 + self.rtol_fp16 = 1e-3 - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), - } + def set_data_feed(self): + x = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {"x": x.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"epsilon": 1e-05} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') if self.is_training: ch = self.feed_shape[0][1] @@ -74,58 +74,64 @@ class TestBase(IPUOpTest): bias = paddle.ParamAttr(trainable=True) out = paddle.fluid.layers.nn.instance_norm( conv1, param_attr=scale, bias_attr=bias, **self.attrs) + loss = paddle.mean(out) + adam = paddle.optimizer.Adam(learning_rate=1e-2) + adam.minimize(loss) else: - scale = True - bias = True out = paddle.fluid.layers.nn.instance_norm( - x, param_attr=scale, 
bias_attr=bias, **self.attrs) + x, param_attr=True, bias_attr=True, **self.attrs) if self.is_training: - loss = paddle.mean(out) - adam = paddle.optimizer.Adam(learning_rate=1e-2) - adam.minimize(loss) fetch_list = [loss.name] else: fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + if self.is_training: result = [] for _ in range(self.epoch): loss_res = exe.run(program, - feed=self.feed, + feed=feed, fetch_list=fetch_list) result.append(loss_res) return np.array(result) else: - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] - def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + if mode > ExecutionMode.IPU_FP32 and self.is_training: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) - - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestTrainCase1(TestBase): @@ -134,7 +140,5 @@ class TestTrainCase1(TestBase): self.epoch = 10 -# not support `instance_norm(x, param_attr=False, bias_attr=False, **self.attrs)` - if __name__ == "__main__": unittest.main() -- GitLab From 0b597754e27113129e9969e6be8d2a588def032e Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Wed, 9 Mar 2022 11:39:41 +0800 Subject: [PATCH 129/261] add ipu uts (#40205) --- .../unittests/ipu/test_flatten_op_ipu.py | 118 +++++++++++++ .../tests/unittests/ipu/test_optimizer_ipu.py | 165 ++++++++++++++++++ 2 files changed, 283 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py diff --git a/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py new file mode 100644 index 00000000000..6f0cafc6680 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py @@ -0,0 +1,118 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data = np.random.uniform(size=[2, 2, 4, 6]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = {} + self.attrs['axis'] = 1 + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + out = paddle.fluid.layers.flatten(x=x, **self.attrs) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] + + def test_base(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode) + + self.check(output_dict, check_shape=True) + + +class TestCase1(TestBase): + def set_op_attrs(self): + self.attrs = {} + self.attrs['axis'] = 0 + + +class TestCase2(TestBase): + def set_op_attrs(self): + self.attrs = {} + self.attrs['axis'] = 2 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py new file mode 100644 index 00000000000..1cc10da3d73 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py @@ -0,0 +1,165 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_data_feed() + self.set_feed_attr() + self.set_attrs() + + def set_atol(self): + self.atol = 1e-6 + + def set_data_feed(self): + self.feed = { + "image": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed.values()] + self.feed_list = list(self.feed.keys()) + self.feed_dtype = [x.dtype for x in self.feed.values()] + + def set_attrs(self): + self.attrs = { + "optimizer": 'sgd', + "weight_decay": 0.0, + "loss_scaling": 1.0, + } + + def _test_optimizer(self, run_ipu=True): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + np.random.seed(self.SEED) + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + image = paddle.static.data( + name='image', shape=[1, 3, 10, 10], dtype='float32') + conv1 = paddle.static.nn.conv2d( + image, num_filters=3, filter_size=3, bias_attr=False) + loss = paddle.mean(conv1) + + weight_decay = self.attrs['weight_decay'] + opt = paddle.optimizer.SGD(learning_rate=1e-1, + weight_decay=weight_decay) + if self.attrs['optimizer'] == 'adam': + opt = paddle.optimizer.Adam( + learning_rate=1e-1, weight_decay=weight_decay) + elif self.attrs['optimizer'] == 'lamb': + + opt = paddle.optimizer.Lamb( + learning_rate=1e-1, lamb_weight_decay=weight_decay) + opt.minimize(loss) + + if run_ipu: + place = paddle.IPUPlace() + else: + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if run_ipu: + feed_list = [image.name] + fetch_list = [loss.name] + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=True) + ipu_strategy.loss_scaling = self.attrs["loss_scaling"] + program = paddle.static.IpuCompiledProgram( + main_prog, ipu_strategy=ipu_strategy).compile(feed_list, + fetch_list) + else: + program = main_prog + + result = [] + for epoch in range(100): + loss_res = exe.run(program, feed=self.feed, fetch_list=[loss]) + result.append(loss_res) + + return np.array(result) + + def test(self): + # cpu and ipu dimenstion mismatch, cpu:(100, 1, 1), ipu:(100, 1) + ipu_loss = self._test_optimizer(True).flatten() + cpu_loss = self._test_optimizer(False).flatten() + + self.assertTrue(np.allclose(ipu_loss, cpu_loss, atol=self.atol)) + + +@unittest.skip('do not support L2 regularization') +class TestSGD(TestBase): + def set_attrs(self): + self.attrs = { + "optimizer": 'sgd', + "weight_decay": 0.1, + "loss_scaling": 2.0, + } + + +@unittest.skip('do not support L2 regularization') +class TestAdamCase1(TestBase): + def set_attrs(self): + self.attrs = { + "optimizer": 'adam', + "weight_decay": 0.1, + "loss_scaling": 3.0, + } + + +class TestAdamCase2(TestBase): + def set_attrs(self): + self.attrs = { + "optimizer": 'adam', + "weight_decay": 0.0, + "loss_scaling": 4.0, + } + + +@unittest.skip('seems cpu output wrong') +class TestLambCase1(TestBase): + def set_attrs(self): + self.attrs = { + "optimizer": 'lamb', + 
"weight_decay": 0.0, + "loss_scaling": 5.0, + } + + +@unittest.skip('seems cpu output wrong') +class TestLamb(TestBase): + def set_attrs(self): + self.attrs = { + "optimizer": 'lamb', + "weight_decay": 0.1, + "loss_scaling": 6.0, + } + + +if __name__ == "__main__": + unittest.main() -- GitLab From 2037fa68db8a79ff4869afcf0ce6864d7e05449f Mon Sep 17 00:00:00 2001 From: xiongkun Date: Wed, 9 Mar 2022 11:49:44 +0800 Subject: [PATCH 130/261] [optest]: fix transpose, support different parameter name between python_api and KernelSignature. (#40258) * optest: fix transpose * fix --- .../paddle/fluid/tests/unittests/op_test.py | 75 ++++++++++++++----- 1 file changed, 57 insertions(+), 18 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 6455da92475..457f20ac5b0 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -50,6 +50,7 @@ from paddle.fluid.tests.unittests.white_list import ( no_check_set_white_list, op_threshold_white_list, no_grad_set_white_list, ) +from paddle.fluid.dygraph.dygraph_to_static.utils import parse_arg_and_kwargs def check_out_dtype(api_fn, in_specs, expect_dtypes, target_index=0, **configs): @@ -698,19 +699,55 @@ class OpTest(unittest.TestCase): self.__class__.__name__) def _calc_python_api_output(self, place): - def prepare_python_api_arguments(op_proto_ins, op_proto_attrs, + def prepare_python_api_arguments(api, op_proto_ins, op_proto_attrs, kernel_sig): """ map from `op proto inputs and attrs` to `api input list and api attrs dict` """ + + class Empty: + pass + + def is_empty(a): + return isinstance(a, Empty) + + def get_default(idx, all_params_number, defaults): + related_idx = idx - all_params_number + len(defaults) + assert related_idx >= 0, "%d-th arguments don't have default value" % idx + return defaults[related_idx] + + def remove_name(x): + if isinstance(x, list): return [i for i in x if i != 'name'] + if isinstance(x, dict): + return {k: v for k, v in x.items() if k != 'name'} + assert False, "Only support list or dict." + + def to_defaults_list(params, defaults): + return [defaults[p] for p in params if p in defaults] + # NOTE(xiongkun): why don't use input arguments dicts ? # Because we don't know the python api name of each arguments. + # using parse_arg_and_kwargs, we can get the all api information we need. + api_params, api_defaults = [ + remove_name(item) for item in parse_arg_and_kwargs(api) + ] + api_defaults = to_defaults_list(api_params, api_defaults) inputs_sig, attrs_sig, outputs_sig = kernel_sig - input_arguments = [op_proto_ins[name] for name in inputs_sig] - attr_arguments = { - name: op_proto_attrs[name] - for name in attrs_sig if name in op_proto_attrs - } - return input_arguments, attr_arguments + inputs_and_attrs = inputs_sig + attrs_sig + assert ( + len(api_params) == len(inputs_and_attrs) + ), "inputs and attrs length must equals to python api length. 
(May be output is in argument list?)" + input_arguments = [op_proto_ins[name] for name in inputs_sig] + [ + op_proto_attrs[name] if name in op_proto_attrs else Empty() + for name in attrs_sig + ] + results = [] + for idx, arg in enumerate(input_arguments): + if is_empty(arg): + results.append( + get_default(idx, len(input_arguments), api_defaults)) + else: + results.append(arg) + return results def construct_output_dict_by_kernel_sig(ret_tuple, output_sig): if not isinstance(ret_tuple, (tuple, list)): @@ -720,25 +757,27 @@ class OpTest(unittest.TestCase): len(output_sig), len(ret_tuple)) return {a: b for a, b in zip(output_sig, ret_tuple)} - def assumption_assert_and_transform(args, argvs): + def assumption_assert_and_transform(args, inp_num): """ - transform by the following rules: + transform inputs by the following rules: 1. [Tensor] -> Tensor 2. [Tensor, Tensor, ...] -> list of Tensors only support "X" is list of Tensor, currently don't support other structure like dict. """ - for inp in args: + for inp in args[:inp_num]: assert isinstance( inp, list ), "currently only support `X` is [Tensor], don't support other structure." - args = [inp[0] if len(inp) == 1 else inp for inp in args] - return args, argvs + args = [ + inp[0] if len(inp) == 1 else inp for inp in args[:inp_num] + ] + args[inp_num:] + return args - def cal_python_api(python_api, args, argvs, kernel_sig): - args, argvs = assumption_assert_and_transform(args, argvs) + def cal_python_api(python_api, args, kernel_sig): inputs_sig, attrs_sig, outputs_sig = kernel_sig - ret_tuple = python_api(*args, **argvs) + args = assumption_assert_and_transform(args, len(inputs_sig)) + ret_tuple = python_api(*args) return construct_output_dict_by_kernel_sig(ret_tuple, outputs_sig) with fluid.dygraph.base.guard(place=place): @@ -764,11 +803,11 @@ class OpTest(unittest.TestCase): assert hasattr( self, "python_api" ), "Please set the `self.python_api` if you want to compare python api output." - arg, argv = prepare_python_api_arguments(inputs, attrs_outputs, - kernel_sig) + args = prepare_python_api_arguments(self.python_api, inputs, + attrs_outputs, kernel_sig) """ we directly return the cal_python_api value because the value is already tensor. 
""" - return cal_python_api(self.python_api, arg, argv, kernel_sig) + return cal_python_api(self.python_api, args, kernel_sig) def _calc_dygraph_output(self, place, parallel=False, no_check_set=None): self.__class__.op_type = self.op_type # for ci check, please not delete it for now -- GitLab From 9968c56321a74c51fb762cb583f80bac6de90e6f Mon Sep 17 00:00:00 2001 From: chenenquan Date: Wed, 9 Mar 2022 11:53:36 +0800 Subject: [PATCH 131/261] [Phi] Migrate linspace op to phi (#40124) * [Phi] Migrate linspace op * [Phi] Migrate linspace op * [Phi] Fix linspace op * [PHI] rename data_tranform to data_type_transform * [PHI] Fix DECLARE and PD --- paddle/fluid/operators/linspace_op.cc | 45 ++------ paddle/fluid/operators/linspace_op.cu | 104 ------------------ paddle/fluid/operators/linspace_op.h | 76 ------------- paddle/phi/infermeta/ternary.cc | 29 +++++ paddle/phi/infermeta/ternary.h | 5 + paddle/phi/kernels/cpu/linspace_kernel.cc | 71 ++++++++++++ .../phi/kernels/funcs/data_type_transform.h | 58 ++++++++++ paddle/phi/kernels/gpu/linspace_kernel.cu | 97 ++++++++++++++++ paddle/phi/kernels/linspace_kernel.h | 26 +++++ 9 files changed, 298 insertions(+), 213 deletions(-) delete mode 100644 paddle/fluid/operators/linspace_op.cu delete mode 100644 paddle/fluid/operators/linspace_op.h create mode 100644 paddle/phi/kernels/cpu/linspace_kernel.cc create mode 100644 paddle/phi/kernels/funcs/data_type_transform.h create mode 100644 paddle/phi/kernels/gpu/linspace_kernel.cu create mode 100644 paddle/phi/kernels/linspace_kernel.h diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc index fe271fa5e89..378c7573d61 100644 --- a/paddle/fluid/operators/linspace_op.cc +++ b/paddle/fluid/operators/linspace_op.cc @@ -12,9 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/linspace_op.h" #include + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -23,33 +27,6 @@ class LinspaceOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Start"), "Input", "Start", "linspace"); - OP_INOUT_CHECK(ctx->HasInput("Stop"), "Input", "Stop", "linspace"); - OP_INOUT_CHECK(ctx->HasInput("Num"), "Input", "Num", "linspace"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "linspace"); - - auto s_dims = ctx->GetInputDim("Start"); - PADDLE_ENFORCE_EQ((s_dims.size() == 1) && (s_dims[0] == 1), true, - platform::errors::InvalidArgument( - "The shape of Input(Start) must be [1]," - "but received input shape is [%s].", - s_dims)); - auto e_dims = ctx->GetInputDim("Stop"); - PADDLE_ENFORCE_EQ((e_dims.size() == 1) && (e_dims[0] == 1), true, - platform::errors::InvalidArgument( - "The shape of Input(Stop) must be [1]," - "but received input shape is [%s].", - e_dims)); - auto step_dims = ctx->GetInputDim("Num"); - PADDLE_ENFORCE_EQ( - (step_dims.size() == 1) && (step_dims[0] == 1), true, - platform::errors::InvalidArgument("The shape of Input(Num) must be [1]," - "but received input shape is [%s].", - step_dims)); - ctx->SetOutputDim("Out", {-1}); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -88,11 +65,13 @@ class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(linspace, ops::LinspaceOp, ops::LinspaceOpMaker); -REGISTER_OP_CPU_KERNEL(linspace, ops::CPULinspaceKernel, - ops::CPULinspaceKernel, - ops::CPULinspaceKernel, - ops::CPULinspaceKernel); +DECLARE_INFER_SHAPE_FUNCTOR(linspace, LinspaceInferShapeFunctor, + PD_INFER_META(phi::LinspaceInferMeta)); +REGISTER_OPERATOR( + linspace, ops::LinspaceOp, ops::LinspaceOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + LinspaceInferShapeFunctor); REGISTER_OP_VERSION(linspace) .AddCheckpoint( diff --git a/paddle/fluid/operators/linspace_op.cu b/paddle/fluid/operators/linspace_op.cu deleted file mode 100644 index aa625a7f5b9..00000000000 --- a/paddle/fluid/operators/linspace_op.cu +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/linspace_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -__global__ void LinspaceKernel(T start, T stop, double step, int64_t size, - T* out) { - int64_t index = blockIdx.x * blockDim.x + threadIdx.x; - - for (; index < size; index += blockDim.x * gridDim.x) { - if (index < size / 2) { - out[index] = static_cast(start + step * index); - } else { - out[index] = static_cast(stop - step * (size - index - 1)); - } - } -} - -template -__global__ void LinspaceSpecialKernel(T start, T* out) { - out[0] = static_cast(start); -} - -template -class CUDALinspaceKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* pre_start = context.Input("Start"); - auto* pre_stop = context.Input("Stop"); - auto* num_t = context.Input("Num"); - auto* out = context.Output("Out"); - auto dtype = static_cast( - context.Attr("dtype")); - - Tensor start_t; - Tensor stop_t; - auto start_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_start->dtype()), context.GetPlace()); - auto stop_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_stop->dtype()), context.GetPlace()); - auto out_dtype = framework::OpKernelType(dtype, context.GetPlace()); - framework::TransDataType(start_dtype, out_dtype, *pre_start, &start_t); - framework::TransDataType(stop_dtype, out_dtype, *pre_stop, &stop_t); - - framework::Tensor n_start; - framework::Tensor n_stop; - framework::Tensor n_num; - framework::TensorCopy(start_t, platform::CPUPlace(), &n_start); - T start = n_start.data()[0]; - framework::TensorCopy(stop_t, platform::CPUPlace(), &n_stop); - T stop = n_stop.data()[0]; - framework::TensorCopy(*num_t, platform::CPUPlace(), &n_num); - int64_t num = static_cast(n_num.data()[0]); - - PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument( - "The num of linspace op should be larger " - "than 0, but received num is %d", - num)); - - out->Resize(phi::make_ddim({num})); - T* out_data = out->mutable_data(context.GetPlace()); - - double step = 0; - auto stream = context.cuda_device_context().stream(); - int block = 512; - int grid = (num + block - 1) / block; - if (num != 1) { - step = (static_cast(stop - start)) / (num - 1); - LinspaceKernel<<>>(start, stop, step, num, - out_data); - } else { - LinspaceSpecialKernel<<>>(start, out_data); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(linspace, ops::CUDALinspaceKernel, - ops::CUDALinspaceKernel, - ops::CUDALinspaceKernel, - ops::CUDALinspaceKernel); diff --git a/paddle/fluid/operators/linspace_op.h b/paddle/fluid/operators/linspace_op.h deleted file mode 100644 index ae51f1221cc..00000000000 --- a/paddle/fluid/operators/linspace_op.h +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class CPULinspaceKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* pre_start = context.Input("Start"); - auto* pre_stop = context.Input("Stop"); - int32_t num = context.Input("Num")->data()[0]; - auto* out = context.Output("Out"); - auto dtype = static_cast( - context.Attr("dtype")); - - Tensor start_t; - Tensor stop_t; - auto start_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_start->dtype()), context.GetPlace()); - auto stop_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_stop->dtype()), context.GetPlace()); - auto out_dtype = framework::OpKernelType(dtype, context.GetPlace()); - framework::TransDataType(start_dtype, out_dtype, *pre_start, &start_t); - framework::TransDataType(stop_dtype, out_dtype, *pre_stop, &stop_t); - - T start = start_t.data()[0]; - T stop = stop_t.data()[0]; - PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument( - "The num of linspace op should be larger " - "than 0, but received num is %d", - num)); - - out->Resize(phi::make_ddim({num})); - - T* out_data = out->mutable_data(context.GetPlace()); - - if (num > 1) { - // step should be of double type for all types - double step = (static_cast(stop - start)) / (num - 1); - int half_num = num / 2; - for (int i = 0; i < num; ++i) { - if (i < half_num) { - out_data[i] = static_cast(start + step * i); - } else { - out_data[i] = static_cast(stop - step * (num - i - 1)); - } - } - } else { - out_data[0] = static_cast(start); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index c3472a24801..eb807ad4615 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -209,4 +209,33 @@ void LerpInferMeta(const MetaTensor& x, out->share_lod(x); } +void LinspaceInferMeta(const MetaTensor& start, + const MetaTensor& stop, + const MetaTensor& number, + MetaTensor* out) { + auto s_dims = start.dims(); + PADDLE_ENFORCE_EQ( + (s_dims.size() == 1) && (s_dims[0] == 1), + true, + phi::errors::InvalidArgument("The shape of Input(Start) must be [1]," + "but received input shape is [%s].", + s_dims)); + auto e_dims = stop.dims(); + PADDLE_ENFORCE_EQ( + (e_dims.size() == 1) && (e_dims[0] == 1), + true, + phi::errors::InvalidArgument("The shape of Input(Stop) must be [1]," + "but received input shape is [%s].", + e_dims)); + auto step_dims = number.dims(); + PADDLE_ENFORCE_EQ( + (step_dims.size() == 1) && (step_dims[0] == 1), + true, + phi::errors::InvalidArgument("The shape of Input(Num) must be [1]," + "but received input shape is [%s].", + step_dims)); + out->set_dims(phi::make_ddim({-1})); + out->set_dtype(start.dtype()); +} + } // namespace phi diff --git a/paddle/phi/infermeta/ternary.h 
b/paddle/phi/infermeta/ternary.h index cff57e1ba70..4dec1442516 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -58,4 +58,9 @@ void LerpInferMeta(const MetaTensor& x, const MetaTensor& weight, MetaTensor* out); +void LinspaceInferMeta(const MetaTensor& start, + const MetaTensor& stop, + const MetaTensor& number, + MetaTensor* out); + } // namespace phi diff --git a/paddle/phi/kernels/cpu/linspace_kernel.cc b/paddle/phi/kernels/cpu/linspace_kernel.cc new file mode 100644 index 00000000000..4b8b7f7a2e0 --- /dev/null +++ b/paddle/phi/kernels/cpu/linspace_kernel.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/linspace_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/data_type_transform.h" + +namespace phi { + +template +void LinspaceKernel(const Context& ctx, + const DenseTensor& start, + const DenseTensor& stop, + const DenseTensor& number, + DataType dtype, + DenseTensor* out) { + int32_t num = number.data()[0]; + auto start_t = phi::funcs::TransDataType(ctx, start, dtype); + auto stop_t = phi::funcs::TransDataType(ctx, stop, dtype); + + T start_data = start_t.template data()[0]; + T stop_data = stop_t.template data()[0]; + PADDLE_ENFORCE_GT( + num, + 0, + phi::errors::InvalidArgument("The num of linspace op should be larger " + "than 0, but received num is %d", + num)); + + out->Resize(phi::make_ddim({num})); + T* out_data = ctx.template Alloc(out); + + if (num > 1) { + // step should be of double type for all types + double step = (static_cast(stop_data - start_data)) / (num - 1); + int half_num = num / 2; + for (int i = 0; i < num; ++i) { + if (i < half_num) { + out_data[i] = static_cast(start_data + step * i); + } else { + out_data[i] = static_cast(stop_data - step * (num - i - 1)); + } + } + } else { + out_data[0] = static_cast(start_data); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(linspace, + CPU, + ALL_LAYOUT, + phi::LinspaceKernel, + float, + int32_t, + int64_t, + double) {} diff --git a/paddle/phi/kernels/funcs/data_type_transform.h b/paddle/phi/kernels/funcs/data_type_transform.h new file mode 100644 index 00000000000..ad7f2aa192c --- /dev/null +++ b/paddle/phi/kernels/funcs/data_type_transform.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/cast_kernel.h" + +namespace phi { +namespace funcs { + +template +phi::DenseTensor TransDataType(const Context& dev_ctx, + const phi::DenseTensor& x, + DataType dtype) { + VLOG(3) << "TransDataType " + << "src type:" << x.dtype() << "; dst typoe: " << dtype; + + switch (x.dtype()) { + case DataType::FLOAT32: + return phi::Cast(dev_ctx, x, dtype); + case DataType::FLOAT64: + return phi::Cast(dev_ctx, x, dtype); + case DataType::INT32: + return phi::Cast(dev_ctx, x, dtype); + case DataType::INT64: + return phi::Cast(dev_ctx, x, dtype); + case DataType::FLOAT16: + return phi::Cast(dev_ctx, x, dtype); + case DataType::BFLOAT16: + return phi::Cast(dev_ctx, x, dtype); + case DataType::BOOL: + return phi::Cast(dev_ctx, x, dtype); + case DataType::INT16: + return phi::Cast(dev_ctx, x, dtype); + case DataType::UINT8: + return phi::Cast(dev_ctx, x, dtype); + default: + PADDLE_THROW(phi::errors::Unimplemented( + "Data type (%s) is not supported when casting data type.", + x.dtype())); + } +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/linspace_kernel.cu b/paddle/phi/kernels/gpu/linspace_kernel.cu new file mode 100644 index 00000000000..3a6ff365c11 --- /dev/null +++ b/paddle/phi/kernels/gpu/linspace_kernel.cu @@ -0,0 +1,97 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/linspace_kernel.h" + +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/data_type_transform.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +__global__ void LinspaceKernelInner( + T start, T stop, double step, int64_t size, T* out) { + int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + + for (; index < size; index += blockDim.x * gridDim.x) { + if (index < size / 2) { + out[index] = static_cast(start + step * index); + } else { + out[index] = static_cast(stop - step * (size - index - 1)); + } + } +} + +template +__global__ void LinspaceSpecialKernel(T start, T* out) { + out[0] = static_cast(start); +} + +template +void LinspaceKernel(const Context& ctx, + const DenseTensor& start, + const DenseTensor& stop, + const DenseTensor& number, + DataType dtype, + DenseTensor* out) { + auto start_t = phi::funcs::TransDataType(ctx, start, dtype); + auto stop_t = phi::funcs::TransDataType(ctx, stop, dtype); + + DenseTensor n_start; + DenseTensor n_stop; + DenseTensor n_num; + phi::Copy(ctx, start_t, phi::CPUPlace(), false, &n_start); + T start_data = n_start.data()[0]; + phi::Copy(ctx, stop_t, phi::CPUPlace(), false, &n_stop); + T stop_data = n_stop.data()[0]; + phi::Copy(ctx, number, phi::CPUPlace(), false, &n_num); + int64_t num = static_cast(n_num.data()[0]); + + PADDLE_ENFORCE_GT( + num, + 0, + phi::errors::InvalidArgument("The num of linspace op should be larger " + "than 0, but received num is %d", + num)); + + out->Resize(phi::make_ddim({num})); + T* out_data = ctx.template Alloc(out); + + double step = 0; + auto stream = ctx.stream(); + int block = 512; + int grid = (num + block - 1) / block; + if (num != 1) { + step = (static_cast(stop_data - start_data)) / (num - 1); + LinspaceKernelInner<<>>( + start_data, stop_data, step, num, out_data); + } else { + LinspaceSpecialKernel<<>>(start_data, out_data); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(linspace, + GPU, + ALL_LAYOUT, + phi::LinspaceKernel, + float, + int32_t, + int64_t, + double) {} diff --git a/paddle/phi/kernels/linspace_kernel.h b/paddle/phi/kernels/linspace_kernel.h new file mode 100644 index 00000000000..ca2b940aef9 --- /dev/null +++ b/paddle/phi/kernels/linspace_kernel.h @@ -0,0 +1,26 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LinspaceKernel(const Context& ctx, + const DenseTensor& start, + const DenseTensor& stop, + const DenseTensor& number, + DataType dtype, + DenseTensor* out); + +} // namespace phi -- GitLab From 05ff6cc52d309ccfba217225f62b1bc427d626e2 Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Wed, 9 Mar 2022 12:18:38 +0800 Subject: [PATCH 132/261] bypass eager mode (#40245) --- .../paddle/fluid/tests/unittests/test_function_hook.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_function_hook.py b/python/paddle/fluid/tests/unittests/test_function_hook.py index d45ef528261..55981b01c40 100644 --- a/python/paddle/fluid/tests/unittests/test_function_hook.py +++ b/python/paddle/fluid/tests/unittests/test_function_hook.py @@ -20,6 +20,7 @@ import numpy as np import paddle.fluid.core as core from paddle import _C_ops +from paddle.fluid.framework import _test_eager_guard class TestCapture: @@ -41,7 +42,7 @@ def grad_hook(grad): class TestBakcwardFunctionHookError(unittest.TestCase): - def test_hook(self): + def func_hook(self): input_data = np.ones([4, 4]).astype('float32') x = paddle.to_tensor(input_data.astype(np.float32), stop_gradient=False) @@ -58,6 +59,12 @@ class TestBakcwardFunctionHookError(unittest.TestCase): assert test_cap.list == [1, 2, 1] + def test_hook(self): + # _register_void_function_post_hook do not support in eager mode + with _test_eager_guard(): + pass + self.func_hook() + if __name__ == "__main__": unittest.main() -- GitLab From c1116b657ee99f8501ff065578fe8b07de97e889 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Wed, 9 Mar 2022 12:57:20 +0800 Subject: [PATCH 133/261] Fix code style (#40344) * fix code style * test=document_fix * fix code style --- python/paddle/profiler/profiler_statistic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py index e39871c7365..7400f21e913 100755 --- a/python/paddle/profiler/profiler_statistic.py +++ b/python/paddle/profiler/profiler_statistic.py @@ -170,9 +170,9 @@ class TimeRangeSummary: CPUTimeRange[hostnode.type].append( (hostnode.start_ns, hostnode.end_ns)) self.call_times[hostnode.type] += 1 - if hostnode.type == TracerEventType.Operator and any( - [name in hostnode.name for name in - _CommunicationOpName]): # special case, communication op + if hostnode.type == TracerEventType.Operator and any([ + name in hostnode.name for name in _CommunicationOpName + ]): # special case, communication op CPUTimeRange[TracerEventType.Communication].append( (hostnode.start_ns, hostnode.end_ns)) self.call_times[TracerEventType.Communication] += 1 -- GitLab From e0866dc630dc8dc81567d0644c0688976132eb2c Mon Sep 17 00:00:00 2001 From: WangXi Date: Wed, 9 Mar 2022 13:53:03 +0800 Subject: [PATCH 134/261] [hybrid] fused_feedforward op support tensor model parallel (#40160) --- .../operators/fused/fused_feedforward_op.cc | 2 + .../operators/fused/fused_feedforward_op.cu | 48 ++- .../fluid/tests/unittests/CMakeLists.txt | 2 + ...static_model_parallel_fused_feedforward.py | 384 ++++++++++++++++++ ...static_model_parallel_fused_feedforward.py | 45 ++ 5 files changed, 476 insertions(+), 5 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py create mode 100644 
python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_feedforward.py diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cc b/paddle/fluid/operators/fused/fused_feedforward_op.cc index 0c8eae42604..f3f8f174275 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cc +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cc @@ -195,6 +195,8 @@ class FusedFeedForwardOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(false); AddAttr("dropout1_seed", "Dropout1 random seed.").SetDefault(0); AddAttr("dropout2_seed", "Dropout2 random seed.").SetDefault(0); + AddAttr("ring_id", "ring id for tensor model parallel.") + .SetDefault(-1); AddComment(R"DOC( the function of fused_feedforward operator is the same as the following pseudo code: residual = src; diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu index 3131269955b..c38d9f7d4bc 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cu +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -21,11 +21,39 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fused_dropout_helper.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#endif + namespace paddle { namespace operators { using Tensor = framework::Tensor; +template +static void AllReduce(framework::Tensor& tensor, // NOLINT + const int ring_id, + const platform::CUDADeviceContext& ctx) { + if (ring_id == -1) return; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto dtype = + platform::ToNCCLDataType(framework::TransToProtoVarType(tensor.dtype())); + int64_t numel = tensor.numel(); + const void* sendbuff = tensor.data(); + auto place = ctx.GetPlace(); + void* recvbuff = tensor.mutable_data(place); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + auto stream = ctx.stream(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "PaddlePaddle should compile with NCCL or RCCL when used tensor model " + "parallel op.")); +#endif +} + template class FusedFeedForwardKernel : public framework::OpKernel { public: @@ -56,7 +84,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { framework::Tensor* dropout1_out, framework::Tensor* dropout2_out, const int bsz_seq, const int d_model, const int dim_feedforward, const std::string& act_method, const bool pre_layer_norm, - const float epsilon1, const float epsilon2, + const float epsilon1, const float epsilon2, const int ring_id, const DropoutParam& dropout_param1, const DropoutParam& dropout_param2, const platform::CUDADeviceContext& ctx) const { @@ -95,6 +123,10 @@ class FusedFeedForwardKernel : public framework::OpKernel { framework::Tensor linear2_out; linear2_out.mutable_data({bsz_seq, d_model}, place); MatMul(ctx, *dropout1_out, linear2_weight, &linear2_out); + + // tensor model parallel + AllReduce(linear2_out, ring_id, ctx); + if (!pre_layer_norm) { fused_dropout_layernorm_helper.LayernormResidualDropoutBias( ctx, linear2_out.data(), x.data(), linear2_bias_ptr, @@ -150,6 +182,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { const float epsilon1 = context.Attr("ln1_epsilon"); const float epsilon2 = 
context.Attr("ln2_epsilon"); + const int ring_id = context.Attr("ring_id"); DropoutParam dropout_param1(context, 1); DropoutParam dropout_param2(context, 2); @@ -186,7 +219,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { dropout2_mask, ln1_mean, ln1_variance, ln2_mean, ln2_variance, linear1_out, ln1_out, dropout1_out, dropout2_out, bsz_seq, d_model, dim_feedforward, act_method, pre_layer_norm, epsilon1, epsilon2, - dropout_param1, dropout_param2, context.cuda_device_context()); + ring_id, dropout_param1, dropout_param2, context.cuda_device_context()); } }; @@ -231,7 +264,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { const int dim_feedforward, const DropoutParam& dropout_param1, const DropoutParam& dropout_param2, const std::string& act_method, const bool pre_layer_norm, const float epsilon1, const float epsilon2, - const platform::CUDADeviceContext& ctx) const { + const int ring_id, const platform::CUDADeviceContext& ctx) const { FusedDropoutLayerNormHelper pre_layernorm_helper( bsz_seq, d_model, epsilon1); FusedDropoutHelper fused_act_dropout_helper( @@ -295,13 +328,16 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { d_ln1_out.mutable_data({bsz_seq, d_model}, place); MatMulGrad(ctx, d_linear1_out, *ln1_out, linear1_weight, &d_ln1_out, d_linear1_weight); - + // tensor model parallel + AllReduce(d_ln1_out, ring_id, ctx); pre_layernorm_helper.LayerNormGrad( ctx, d_ln1_out.data(), x.data(), ln1_gamma_ptr, ln1_mean->data(), ln1_variance->data(), d_x->data(), d_ln1_gamma_ptr, d_ln1_beta_ptr); } else { MatMulGrad(ctx, d_linear1_out, x, linear1_weight, d_x, d_linear1_weight); + // tensor model parallel + AllReduce(*d_x, ring_id, ctx); } std::vector ins(2); std::vector outs(1); @@ -376,6 +412,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { const float epsilon1 = context.Attr("ln1_epsilon"); const float epsilon2 = context.Attr("ln2_epsilon"); + const int ring_id = context.Attr("ring_id"); const std::string act_method = context.Attr("act_method"); DropoutParam dropout_param1(context, 1); DropoutParam dropout_param2(context, 2); @@ -419,7 +456,8 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { d_linear1_bias, d_linear2_weight, d_linear2_bias, d_ln1_scale, d_ln1_bias, d_ln2_scale, d_ln2_bias, bsz_seq, d_model, dim_feedforward, dropout_param1, dropout_param2, act_method, - pre_layer_norm, epsilon1, epsilon2, context.cuda_device_context()); + pre_layer_norm, epsilon1, epsilon2, ring_id, + context.cuda_device_context()); } }; } // namespace operators diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index f8102ec4080..be91fb4fdf6 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -23,6 +23,7 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_mnist) list(APPEND DIST_TEST_OPS test_pipeline) list(APPEND DIST_TEST_OPS test_ir_pass_pipeline) list(APPEND DIST_TEST_OPS test_static_model_parallel) +list(APPEND DIST_TEST_OPS test_static_model_parallel_fused_feedforward) list(APPEND DIST_TEST_OPS test_parallel_dygraph_se_resnext) list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding) list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding_over_height) @@ -1150,6 +1151,7 @@ if((WITH_ROCM OR WITH_GPU) AND NOT WIN32) set_tests_properties(test_pipeline PROPERTIES TIMEOUT 120) set_tests_properties(test_ir_pass_pipeline PROPERTIES TIMEOUT 120) 
set_tests_properties(test_static_model_parallel PROPERTIES TIMEOUT 240) + set_tests_properties(test_static_model_parallel_fused_feedforward PROPERTIES TIMEOUT 120) set_tests_properties(test_collective_split_embedding test_collective_split_embedding_none_divisible test_collective_split_row_linear diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py new file mode 100644 index 00000000000..5f467da6a64 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py @@ -0,0 +1,384 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np + +import paddle +import paddle.fluid as fluid +from test_dist_base import TestDistRunnerBase, runtime_main +import paddle.distributed.fleet as fleet + +from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype +from paddle.fluid.dygraph.layers import Layer +from paddle.fluid.layer_helper import LayerHelper +from paddle.nn.initializer import Constant + +paddle.enable_static() + +DTYPE = "float32" +MODEL_PARALLEL_SIZE = 2 +IN_SIZE = 2 * MODEL_PARALLEL_SIZE +OUT_SIZE = 2 * MODEL_PARALLEL_SIZE + + +def fused_feedforward(x, + linear1_weight, + linear2_weight, + linear1_bias=None, + linear2_bias=None, + ln1_scale=None, + ln1_bias=None, + ln2_scale=None, + ln2_bias=None, + dropout1_rate=0.5, + dropout2_rate=0.5, + activation="relu", + ln1_epsilon=1e-5, + ln2_epsilon=1e-5, + pre_layer_norm=False, + training=True, + mode='upscale_in_train', + ring_id=-1, + name=None): + seed = None + if mode not in ('downscale_in_infer', 'upscale_in_train'): + raise ValueError( + "mode argument should be 'downscale_in_infer' or 'upscale_in_train'") + mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode #semantic transfer + + helper = LayerHelper("fused_feedforward") + dtype = x.dtype + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'fused_feedforward') + check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'], + 'fused_feedforward') + + out = helper.create_variable_for_type_inference(x.dtype) + dropout1_mask = helper.create_variable_for_type_inference( + 'uint8', stop_gradient=True) + dropout2_mask = helper.create_variable_for_type_inference( + 'uint8', stop_gradient=True) + ln1_mean = helper.create_variable_for_type_inference( + x.dtype, stop_gradient=True) + ln1_variance = helper.create_variable_for_type_inference( + x.dtype, stop_gradient=True) + ln2_mean = helper.create_variable_for_type_inference( + x.dtype, stop_gradient=True) + ln2_variance = helper.create_variable_for_type_inference( + x.dtype, stop_gradient=True) + linear1_out = helper.create_variable_for_type_inference( + x.dtype, stop_gradient=True) + ln1_out = helper.create_variable_for_type_inference( + x.dtype, stop_gradient=True) + dropout1_out = helper.create_variable_for_type_inference( + 
x.dtype, stop_gradient=True) + dropout2_out = helper.create_variable_for_type_inference( + x.dtype, stop_gradient=True) + + if (seed is None or seed == 0) and helper.main_program.random_seed != 0: + seed = helper.main_program.random_seed + + helper.append_op( + type='fused_feedforward', + inputs={ + 'X': x, + 'Linear1Weight': linear1_weight, + 'Linear1Bias': linear1_bias, + 'Linear2Weight': linear2_weight, + 'Linear2Bias': linear2_bias, + 'Ln1Scale': ln1_scale, + 'Ln1Bias': ln1_bias, + 'Ln2Scale': ln2_scale, + 'Ln2Bias': ln2_bias, + }, + outputs={ + 'Out': out, + 'Dropout1Mask': dropout1_mask, + 'Dropout2Mask': dropout2_mask, + 'Ln1Mean': ln1_mean, + 'Ln1Variance': ln1_variance, + 'Ln2Mean': ln2_mean, + 'Ln2Variance': ln2_variance, + 'Linear1Out': linear1_out, + 'Ln1Out': ln1_out, + 'Dropout1Out': dropout1_out, + 'Dropout2Out': dropout2_out, + }, + attrs={ + 'dropout1_rate': dropout1_rate, + 'dropout2_rate': dropout2_rate, + 'act_method': activation, + 'pre_layer_norm': pre_layer_norm, + 'ln1_epsilon': ln1_epsilon, + 'ln2_epsilon': ln2_epsilon, + 'dropout1_is_test': not training, + 'dropout2_is_test': not training, + 'dropout1_fix_seed': seed is not None, + 'dropout2_fix_seed': seed is not None, + 'dropout1_seed': seed if seed is not None else 0, + 'dropout2_seed': seed if seed is not None else 0, + 'dropout1_implementation': mode, + 'dropout2_implementation': mode, + 'ring_id': ring_id, + }) + return out + + +def _set_var_distributed(var): + if var is None: + return + + var.is_distributed = True + + # NOTE: use current_block and find_var_recursive to support while_loop + startup_block = paddle.static.default_startup_program().current_block() + main_block = paddle.static.default_main_program().current_block() + startup_block._find_var_recursive(var.name).is_distributed = True + main_block._find_var_recursive(var.name).is_distributed = True + + +class ParallelFusedFeedForward(Layer): + def __init__(self, + d_model, + dim_feedforward, + dropout_rate=0.1, + epsilon=1e-05, + activation="relu", + act_dropout_rate=None, + normalize_before=False, + linear1_weight_attr=None, + linear1_bias_attr=None, + linear2_weight_attr=None, + linear2_bias_attr=None, + ln1_scale_attr=None, + ln1_bias_attr=None, + ln2_scale_attr=None, + ln2_bias_attr=None, + nranks=1, + ring_id=-1, + name=None): + super(ParallelFusedFeedForward, self).__init__() + assert d_model > 0, ( + "Expected d_model to be greater than 0, but recieved {}".format( + d_model)) + assert dim_feedforward > 0, ( + "Expected dim_feedforward to be greater than 0, but recieved {}". 
+ format(dim_feedforward)) + + self._dtype = self._helper.get_default_dtype() + self._d_model = d_model + + assert dim_feedforward % nranks == 0 + dim_feedforward = dim_feedforward // nranks + self._dim_feedforward = dim_feedforward + self._dropout_rate = dropout_rate + self._act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate + self._act_method = activation + self._normalize_before = normalize_before + self._epsilon = epsilon + self._ring_id = ring_id + + self._linear1_weight = self.create_parameter( + shape=[d_model, dim_feedforward], + attr=linear1_weight_attr, + dtype=self._dtype, + is_bias=False) + self._linear1_bias = self.create_parameter( + shape=[dim_feedforward], + attr=linear1_bias_attr, + dtype=self._dtype, + is_bias=True) + + self._linear2_weight = self.create_parameter( + shape=[dim_feedforward, d_model], + attr=linear2_weight_attr, + dtype=self._dtype, + is_bias=False) + + self._linear2_bias = self.create_parameter( + shape=[d_model], + attr=linear2_bias_attr, + dtype=self._dtype, + is_bias=True) + + if nranks > 1: + assert ring_id != -1 + # column parallel + _set_var_distributed(self._linear1_weight) + _set_var_distributed(self._linear1_bias) + _set_var_distributed(self._linear2_weight) + + if normalize_before: + self._ln1_scale = self.create_parameter( + shape=[d_model], + attr=ln1_scale_attr, + is_bias=False, + default_initializer=Constant(1.0)) + self._ln1_bias = self.create_parameter( + shape=[d_model], attr=ln1_bias_attr, is_bias=True) + self._ln2_scale = None + self._ln2_bias = None + else: + self._ln1_bias = None + self._ln2_bias = None + self._ln2_scale = self.create_parameter( + shape=[d_model], + attr=ln2_scale_attr, + is_bias=False, + default_initializer=Constant(1.0)) + self._ln2_bias = self.create_parameter( + shape=[d_model], attr=ln2_bias_attr, is_bias=True) + + self.name = name + + def forward(self, src, cache=None): + out = fused_feedforward( + src, + self._linear1_weight, + self._linear2_weight, + self._linear1_bias, + self._linear2_bias, + self._ln1_scale, + self._ln1_bias, + self._ln2_scale, + self._ln2_bias, + dropout1_rate=self._act_dropout_rate, + dropout2_rate=self._dropout_rate, + activation=self._act_method, + ln1_epsilon=self._epsilon, + ln2_epsilon=self._epsilon, + pre_layer_norm=self._normalize_before, + training=self.training, + ring_id=self._ring_id, + name=self.name) + return out + + +def get_param_attr(weight, bias): + weight_attr = paddle.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer(weight)) + bias_attr = paddle.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer(bias)) + return weight_attr, bias_attr + + +def create_model(data, rank): + np.random.seed(2021) + ln_w = np.random.uniform(-1, 1, size=(IN_SIZE, )).astype(DTYPE) + ln_b = np.random.uniform(-1, 1, size=(IN_SIZE, )).astype(DTYPE) + w0 = np.random.uniform(-1, 1, size=(IN_SIZE, OUT_SIZE)).astype(DTYPE) + b0 = np.random.uniform(-1, 1, size=(OUT_SIZE, )).astype(DTYPE) + w1 = np.random.uniform(-1, 1, size=(OUT_SIZE, IN_SIZE)).astype(DTYPE) + b1 = np.random.uniform(-1, 1, size=(IN_SIZE, )).astype(DTYPE) + data.stop_gradient = False + if rank is not None: + start = 0 if rank == 0 else OUT_SIZE // MODEL_PARALLEL_SIZE + end = start + OUT_SIZE // MODEL_PARALLEL_SIZE + col_w0 = w0[:, start:end] + col_b0 = b0[start:end] + row_w1 = w1[start:end, :] + + ln_w_attr, ln_b_attr = get_param_attr(ln_w, ln_b) + w0_attr, b0_attr = get_param_attr(col_w0, col_b0) + w1_attr, b1_attr = get_param_attr(row_w1, b1) + + ffn = 
ParallelFusedFeedForward( + IN_SIZE, + OUT_SIZE, + dropout_rate=0.0, + activation='gelu', + normalize_before=True, + linear1_weight_attr=w0_attr, + linear1_bias_attr=b0_attr, + linear2_weight_attr=w1_attr, + linear2_bias_attr=b1_attr, + ln1_scale_attr=ln_w_attr, + ln1_bias_attr=ln_b_attr, + nranks=MODEL_PARALLEL_SIZE, + ring_id=0) + #ffn.eval() + result = ffn(data) + else: + ln_w_attr, ln_b_attr = get_param_attr(ln_w, ln_b) + w0_attr, b0_attr = get_param_attr(w0, b0) + w1_attr, b1_attr = get_param_attr(w1, b1) + + ffn = ParallelFusedFeedForward( + IN_SIZE, + OUT_SIZE, + dropout_rate=0.0, + activation='gelu', + normalize_before=True, + linear1_weight_attr=w0_attr, + linear1_bias_attr=b0_attr, + linear2_weight_attr=w1_attr, + linear2_bias_attr=b1_attr, + ln1_scale_attr=ln_w_attr, + ln1_bias_attr=ln_b_attr) + #ffn.eval() + result = ffn(data) + + predict = paddle.sum(result) + return predict + + +class TestModelParallel(TestDistRunnerBase): + def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None): + # Input data + seq_len = 2 + data_in = fluid.data( + name='data_in', shape=[batch_size, seq_len, IN_SIZE], dtype=DTYPE) + + if dist_strategy: + data_loader = fluid.io.DataLoader.from_generator( + feed_list=[data_in], + capacity=64, + use_double_buffer=False, + iterable=False) + + if dist_strategy: + fleet.init(is_collective=True) + strategy = fleet.DistributedStrategy() + strategy.tensor_parallel = True + strategy.tensor_parallel_configs = {'tensor_parallel_degree': 2} + + rank = fleet.worker_index() if dist_strategy else None + avg_cost = create_model(data_in, rank) + opt = fluid.optimizer.SGD(0.1) + + if dist_strategy: + dist_opt = fleet.distributed_optimizer( + optimizer=opt, strategy=strategy) + dist_opt.minimize(avg_cost) + else: + opt.minimize(avg_cost) + + def gen_data(): + np.random.seed(2021) + while True: + data = [np.random.random([seq_len, IN_SIZE]).astype(DTYPE)] + yield data + + train_reader = paddle.batch(gen_data, batch_size=batch_size) + + if dist_strategy: + return None, avg_cost, train_reader, None, None, None, data_loader + else: + return None, avg_cost, train_reader, None, None, None + + +if __name__ == "__main__": + runtime_main(TestModelParallel) diff --git a/python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_feedforward.py b/python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_feedforward.py new file mode 100644 index 00000000000..1a6b637e1b4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_feedforward.py @@ -0,0 +1,45 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
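The create_model path above shards the dense parameters per rank before handing them to ParallelFusedFeedForward: linear1 is column parallel (weight columns and bias sharded), linear2 is row parallel (weight rows sharded, bias kept whole). A standalone restatement of that slicing, with sizes assumed here only for illustration:

import numpy as np

OUT_SIZE, MODEL_PARALLEL_SIZE = 8, 2

def shard_ffn_params(w0, b0, w1, rank):
    # linear1: column parallel, so slice its weight columns and its bias.
    # linear2: row parallel, so slice its weight rows and keep its bias whole.
    shard = OUT_SIZE // MODEL_PARALLEL_SIZE
    start, end = rank * shard, (rank + 1) * shard
    return w0[:, start:end], b0[start:end], w1[start:end, :]

w0 = np.zeros((4, OUT_SIZE))
b0 = np.zeros(OUT_SIZE)
w1 = np.zeros((OUT_SIZE, 4))
col_w0, col_b0, row_w1 = shard_ffn_params(w0, b0, w1, rank=1)
print(col_w0.shape, col_b0.shape, row_w1.shape)  # (4, 4) (4,) (4, 4)
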
+ +from __future__ import print_function +import unittest +from test_dist_base import TestDistBase + +import os +import paddle + +paddle.enable_static() +flag_name = os.path.splitext(__file__)[0] + + +class TestStaticModelParallel(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_reduce = False + self._use_reader_alloc = False + self._nccl_comm_num = 1 + self._pipeline_mode = True + + def test_dist_static_model_parallel_fused_feedforward(self): + import paddle.fluid as fluid + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "static_model_parallel_fused_feedforward.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +if __name__ == '__main__': + unittest.main() -- GitLab From 7ea9235c4b3cfcb80fbc1cf286fedaafa3ff1221 Mon Sep 17 00:00:00 2001 From: ShenLiang <1422485404@qq.com> Date: Wed, 9 Mar 2022 14:13:49 +0800 Subject: [PATCH 135/261] Fix time of utest in distributed (#40163) * fix time of utest --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index be91fb4fdf6..5d861cddea2 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1114,10 +1114,10 @@ set_tests_properties(test_split_program PROPERTIES TIMEOUT 120) if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_dataparallel PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 120) - set_tests_properties(test_parallel_dygraph_control_flow PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_control_flow PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_no_sync_gradient_check PROPERTIES TIMEOUT 30) - set_tests_properties(test_parallel_dygraph_pipeline_parallel PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_pipeline_parallel PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_sharding_optimizer_stage2 PROPERTIES TIMEOUT 120) -- GitLab From aeaf69b36de112e57e8e5bd01caa0e43a497c31b Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 9 Mar 2022 14:20:12 +0800 Subject: [PATCH 136/261] remove determinant deps for svd helper (#40235) --- paddle/fluid/operators/determinant_op.h | 71 +++++++++++++------------ paddle/phi/kernels/full_kernel.h | 12 +++++ 2 files changed, 49 insertions(+), 34 deletions(-) diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h index 463a707ccf1..f89ecd37222 100644 --- a/paddle/fluid/operators/determinant_op.h +++ b/paddle/fluid/operators/determinant_op.h @@ -19,11 +19,17 @@ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/diag_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" +#include 
"paddle/phi/kernels/funcs/unsqueeze.h" +#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/matmul_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" namespace paddle { namespace operators { @@ -172,7 +178,7 @@ template class DeterminantGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = context.template device_context(); + auto& orig_dev_ctx = context.template device_context(); const auto* input = context.Input("Input"); const auto* det = context.Input("Out"); const auto* grad = @@ -200,15 +206,18 @@ class DeterminantGradKernel : public framework::OpKernel { // checked in forward, pass } + auto& dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE&>( + orig_dev_ctx); + // Check Whether the matrix is invertible // (matrix A not invertible) == (det(A)=0) if (!CheckMatrixInvertible(context, det)) { // The matrix is not invertible VLOG(3) << "The input matrix not invertible!"; ddet->Resize(input->dims()); - ddet->mutable_data(context.GetPlace()); - phi::funcs::SetConstant zero; - zero(dev_ctx, ddet, static_cast(0.0f)); + phi::Full(dev_ctx, phi::vectorize(input->dims()), static_cast(0.0f), + ddet); return; } @@ -218,8 +227,6 @@ class DeterminantGradKernel : public framework::OpKernel { // we set d|A| = unsqueeze(dA * |A|, [-1, -2]) * inverse(A).transpose(-2, // -1) - math::DeviceIndependenceTensorOperations helper(context); - // First: inverse(A) framework::Tensor inverse_A; // A must be square matrices! @@ -227,26 +234,28 @@ class DeterminantGradKernel : public framework::OpKernel { inverse_A.mutable_data(context.GetPlace()); phi::funcs::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *input, &inverse_A); + mat_inv(orig_dev_ctx, *input, &inverse_A); VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); // Second: inverse(A).transpose(-2, -1) - framework::Tensor transpose_inverse_A = helper.Transpose(inverse_A); + framework::Tensor transpose_inverse_A = + phi::TransposeLast2Dim(dev_ctx, inverse_A); + VLOG(3) << "(dA * |A|).transpose(-2, -1) dims: " << transpose_inverse_A.dims(); // Third: dA * |A| - auto mul_dA_detA = helper.Mul(*grad, *det); + auto mul_dA_detA = phi::Multiply(dev_ctx, *grad, *det); VLOG(3) << "dA * |A| dims: " << mul_dA_detA.dims(); // Fourth: unsqueeze(dA * |A|, [-1, -2]) - auto unsqueeze1 = helper.Unsqueeze(mul_dA_detA, -1); - auto unsqueeze2 = helper.Unsqueeze(unsqueeze1, -2); + auto unsqueeze1 = phi::funcs::Unsqueeze(mul_dA_detA, -1); + auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2); VLOG(3) << "unsqueezed(dA * |A|) dims: " << unsqueeze2.dims(); // Finally: unsqueeze(dA * |A|) * inverse(A) - auto res = helper.Mul(unsqueeze2, transpose_inverse_A); + auto res = phi::Multiply(dev_ctx, unsqueeze2, transpose_inverse_A); VLOG(3) << "unsqueeze(dA * |A|) * inverse(A) dims: " << res.dims(); @@ -331,7 +340,7 @@ template class SlogDeterminantGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = context.template device_context(); + auto& orig_dev_ctx = context.template device_context(); const auto* input = context.Input("Input"); const auto* slogdet = context.Input("Out"); const auto* grad = @@ -353,6 +362,10 @@ class SlogDeterminantGradKernel : public framework::OpKernel { input->dims().size() - grad->dims().size())); } + auto& dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE&>( + orig_dev_ctx); + // Check Whether the 
matrix is invertible // (matrix A not invertible) == (absslogdet(A)=0) auto slogdet_vec = slogdet->Split(1, 0); @@ -361,9 +374,8 @@ class SlogDeterminantGradKernel : public framework::OpKernel { // The matrix is not invertible VLOG(3) << "The input matrix not invertible!"; dslogdet->Resize(input->dims()); - dslogdet->mutable_data(context.GetPlace()); - phi::funcs::SetConstant zero; - zero(dev_ctx, dslogdet, std::numeric_limits::quiet_NaN()); + phi::Full(dev_ctx, phi::vectorize(input->dims()), + std::numeric_limits::quiet_NaN(), dslogdet); return; } @@ -373,8 +385,6 @@ class SlogDeterminantGradKernel : public framework::OpKernel { // we set dsl|A| = unsqueeze(dslA, [-1, -2]) * // inverse(A).conj().transpose(-2, -1) - math::DeviceIndependenceTensorOperations helper(context); - // First: inverse(A) framework::Tensor inverse_A; // A must be square matrices! @@ -382,25 +392,18 @@ class SlogDeterminantGradKernel : public framework::OpKernel { inverse_A.mutable_data(context.GetPlace()); phi::funcs::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *input, &inverse_A); + mat_inv(orig_dev_ctx, *input, &inverse_A); VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); // Second: inverse(A).conj() - framework::Tensor conj_inverse_A; - conj_inverse_A.Resize(inverse_A.dims()); - auto numel = input->numel(); - auto* conj_data = conj_inverse_A.mutable_data(context.GetPlace(), - size_t(numel * sizeof(T))); - - platform::ForRange for_range(dev_ctx, numel); - phi::funcs::ConjFunctor functor(inverse_A.data(), numel, conj_data); - for_range(functor); + auto conj_inverse_A = phi::Conj(dev_ctx, inverse_A); VLOG(3) << "inverse(A).conj() dims: " << conj_inverse_A.dims(); // Third: inverse(A).conj().transpose(-2, -1) - framework::Tensor transpose_inverse_A = helper.Transpose(conj_inverse_A); + framework::Tensor transpose_inverse_A = + phi::TransposeLast2Dim(dev_ctx, conj_inverse_A); VLOG(3) << "inverse(A).conj().transpose(-2, -1) dims: " << transpose_inverse_A.dims(); @@ -417,12 +420,12 @@ class SlogDeterminantGradKernel : public framework::OpKernel { det_grad.Resize(det_grad.dims().reshape(det_grad_vec)); // Fifth: unsqueeze(dslA, [-1, -2]) - auto unsqueeze1 = helper.Unsqueeze(det_grad, -1); - auto unsqueeze2 = helper.Unsqueeze(unsqueeze1, -2); + auto unsqueeze1 = phi::funcs::Unsqueeze(det_grad, -1); + auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2); VLOG(3) << "unsqueezed(dslA, [-1, -2]) dims: " << unsqueeze2.dims(); // Finally: unsqueeze(dslA) * inverse(A) - auto res = helper.Mul(unsqueeze2, transpose_inverse_A); + auto res = phi::Multiply(dev_ctx, unsqueeze2, transpose_inverse_A); VLOG(3) << "unsqueeze(dslA) * inverse(A) dims: " << res.dims(); framework::TensorCopy(res, context.GetPlace(), dslogdet); diff --git a/paddle/phi/kernels/full_kernel.h b/paddle/phi/kernels/full_kernel.h index c44f048051d..41fc96b6db1 100644 --- a/paddle/phi/kernels/full_kernel.h +++ b/paddle/phi/kernels/full_kernel.h @@ -37,6 +37,18 @@ void FullLikeKernel(const Context& dev_ctx, DataType dtype, DenseTensor* out); +template +void Full(const Context& dev_ctx, + const ScalarArray& shape, + const Scalar& val, + DenseTensor* out) { + FullKernel(dev_ctx, + shape, + val, + paddle::experimental::CppTypeToDataType::Type(), + out); +} + template DenseTensor Full(const Context& dev_ctx, const ScalarArray& shape, -- GitLab From 767647ceaf145787df66f3a5a00806f76946bfd4 Mon Sep 17 00:00:00 2001 From: huzhiqiang <912790387@qq.com> Date: Wed, 9 Mar 2022 00:26:58 -0600 Subject: [PATCH 137/261] [Infrt]Update kernel dialect (#40141) --- .gitignore 
| 1 + .../pybind/kernel_signature_generator.cc | 38 ++++--- .../infrt/dialect/phi/pass/phi_op_cvt_pass.cc | 6 +- paddle/infrt/host_context/paddle_mlir.cc | 17 ++-- .../infershaped/infershape_launchers_test.cc | 2 +- .../infrt/tests/dialect/phi/dense_tensor.mlir | 2 +- paddle/scripts/infrt_build.sh | 5 +- tools/infrt/generate_phi_kernel_dialect.py | 98 +++++++++++-------- tools/infrt/get_compat_kernel_signature.py | 77 +++++++++++++++ tools/infrt/get_phi_kernel_info.py | 4 +- 10 files changed, 174 insertions(+), 76 deletions(-) create mode 100644 tools/infrt/get_compat_kernel_signature.py diff --git a/.gitignore b/.gitignore index 21222678f04..801790d0a47 100644 --- a/.gitignore +++ b/.gitignore @@ -56,6 +56,7 @@ paddle/infrt/dialect/pd_ops.td paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td tools/infrt/kernels.json +tools/infrt/kernel_signature.json paddle/infrt/dialect/pd_ops_info.h .lit_test_times.txt paddle/infrt/tests/dialect/Output diff --git a/paddle/fluid/pybind/kernel_signature_generator.cc b/paddle/fluid/pybind/kernel_signature_generator.cc index f0d5a4e477f..8d78adaf5a4 100644 --- a/paddle/fluid/pybind/kernel_signature_generator.cc +++ b/paddle/fluid/pybind/kernel_signature_generator.cc @@ -44,35 +44,41 @@ int main(int argc, char **argv) { paddle::framework::InitDefaultKernelSignatureMap(); auto &kernel_signature_map = phi::DefaultKernelSignatureMap::Instance(); auto &kernel_factory = phi::KernelFactory::Instance(); - std::cout << "{"; + std::string kernel_signature_map_str{"{"}; for (const auto &op_kernel_pair : kernel_factory.kernels()) { if (kernel_signature_map.Has(op_kernel_pair.first)) { - std::cout << "\"" << op_kernel_pair.first << "\":{"; + kernel_signature_map_str = + kernel_signature_map_str + "\"" + op_kernel_pair.first + "\":{"; auto &args = kernel_signature_map.Get(op_kernel_pair.first).args; - std::cout << "\"inputs\":["; + kernel_signature_map_str += "\"inputs\":["; auto inputs_ = std::get<0>(args); - if (inputs_.size() > 0) std::cout << inputs_[0]; - for (size_t i = 1; i < inputs_.size(); i++) { - std::cout << ",\"" << inputs_[i] << "\""; + for (size_t i = 0; i < inputs_.size(); i++) { + kernel_signature_map_str = + kernel_signature_map_str + "\"" + inputs_[i] + "\","; } + if (inputs_.size()) kernel_signature_map_str.pop_back(); - std::cout << "],\"attrs\":["; + kernel_signature_map_str += "],\"attrs\":["; auto attrs_ = std::get<1>(args); - if (attrs_.size() > 0) std::cout << attrs_[0]; - for (size_t i = 1; i < attrs_.size(); i++) { - std::cout << ",\"" << attrs_[i] << "\""; + for (size_t i = 0; i < attrs_.size(); i++) { + kernel_signature_map_str = + kernel_signature_map_str + "\"" + attrs_[i] + "\","; } - - std::cout << "],\"outputs\":["; + if (attrs_.size()) kernel_signature_map_str.pop_back(); + kernel_signature_map_str += "],\"outputs\":["; auto outputs_ = std::get<2>(args); - for (size_t i = 1; i < outputs_.size(); i++) { - std::cout << ",\"" << outputs_[i] << "\""; + for (size_t i = 0; i < outputs_.size(); i++) { + kernel_signature_map_str = + kernel_signature_map_str + "\"" + outputs_[i] + "\","; } - std::cout << "]},"; + if (outputs_.size()) kernel_signature_map_str.pop_back(); + kernel_signature_map_str += "]},"; } } - std::cout << "}" << std::endl; + kernel_signature_map_str.pop_back(); + kernel_signature_map_str += "}\n"; + std::cout << kernel_signature_map_str; return 0; } diff --git a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc index 
376ab31938a..4347ec19e81 100644 --- a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc +++ b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc @@ -125,10 +125,8 @@ void phiOpCvtPass::diapatchStage() { kernel_name = getPhiTargetPrefix(phi_kernel_desc.kernelType.target) + kernel_name + - getPhiLayoutSuffix(phi_kernel_desc.kernelType.layout) + - getPhiPrecisionSuffix(phi_kernel_desc.kernelType.precision); - - // mlir::OperationName operation_name = kernel_op.getOperation()->getName(); + getPhiPrecisionSuffix(phi_kernel_desc.kernelType.precision) + + getPhiLayoutSuffix(phi_kernel_desc.kernelType.layout); mlir::OperationName operation_name(kernel_name, kernel_op.getContext()); mlir::OperationState operation_state(kernel_op.getLoc(), operation_name); diff --git a/paddle/infrt/host_context/paddle_mlir.cc b/paddle/infrt/host_context/paddle_mlir.cc index 1c36b04f366..83a2a4269c3 100644 --- a/paddle/infrt/host_context/paddle_mlir.cc +++ b/paddle/infrt/host_context/paddle_mlir.cc @@ -56,6 +56,7 @@ mlir::ModuleOp MLIRModelGenImpl::ImportPaddleModel( UpdateModelParams(program, &mainFunc); UpdateModelOps(program); UpdateModelOutputs(program); + return module_; } @@ -143,13 +144,14 @@ void MLIRModelGenImpl::UpdateModelParams( const infrt::paddle::framework_proto::ProgramDesc &program, mlir::FuncOp *mainFunc) { // update input vars + int input_index = 1; for (auto &op_desc : main_block_.ops()) { if (op_desc.type() == "feed") { for (int var_idx = 0; var_idx < op_desc.outputs_size(); ++var_idx) { // update input variables auto &in = op_desc.outputs()[var_idx]; std::string input_var_name = in.arguments(0); - ::mlir::Value input_ = mainFunc->getArgument(1); + ::mlir::Value input_ = mainFunc->getArgument(input_index++); params_map_.insert( std::pair(input_var_name, input_)); } @@ -211,7 +213,6 @@ void MLIRModelGenImpl::buildOperation( const infrt::paddle::framework_proto::OpDesc &op_) { const std::string &op_name = "pd." 
+ op_.type(); mlir::Location loc = mlir::UnknownLoc::get(context_); - llvm::SmallVector operands = GetOpInputValue(op_); llvm::SmallVector resultTypes = GetOpOutputType(op_); llvm::SmallVector attrs = GetOpAttributes(op_); @@ -227,7 +228,6 @@ llvm::SmallVector MLIRModelGenImpl::GetOpInputValue( std::unordered_map inputs_info = {}; if (pd_dialect_inputs_info_map_.count(op_.type())) inputs_info = pd_dialect_inputs_info_map_.at(op_.type()); - for (int var_idx = 0; var_idx < op_.inputs_size(); ++var_idx) { auto &var = op_.inputs(var_idx); if (!var.arguments().empty()) { @@ -249,10 +249,8 @@ llvm::SmallVector MLIRModelGenImpl::GetOpOutputType( // update op outputs info for (int var_idx = 0; var_idx < op_.outputs_size(); ++var_idx) { auto &var_name = op_.outputs(var_idx).arguments()[0]; - if (!pd_dialect_outputs_info.count(op_.outputs(var_idx).parameter())) continue; - // update persistable tensors for (int i = 0; i < main_block_.vars_size(); i++) { auto var_desc = main_block_.vars(i); @@ -315,7 +313,6 @@ llvm::SmallVector MLIRModelGenImpl::GetOpAttributes( llvm::ArrayRef attr_names_ = registered_op_name_.getAttributeNames(); std::vector attr_names_vec_ = attr_names_.vec(); - // update attrs for (int attrs_num = 0; attrs_num < op_.attrs_size(); attrs_num++) { auto attr_name_ = op_.attrs(attrs_num).name(); @@ -351,11 +348,17 @@ llvm::SmallVector MLIRModelGenImpl::GetOpAttributes( void MLIRModelGenImpl::RegisterOpOutputVars( const infrt::paddle::framework_proto::OpDesc &op_, mlir::Operation *mlir_op_) { + std::unordered_map pd_dialect_outputs_info = + pd_dialect_outputs_info_map_.at(op_.type()); + // op outputs for (int var_idx = 0; var_idx < op_.outputs_size(); ++var_idx) { + if (!pd_dialect_outputs_info.count(op_.outputs(var_idx).parameter())) + continue; auto &var_name = op_.outputs(var_idx).arguments()[0]; + int index = pd_dialect_outputs_info[op_.outputs(var_idx).parameter()]; // output name - auto var_ = mlir_op_->getResult(var_idx); + auto var_ = mlir_op_->getResult(index); params_map_.insert(std::pair(var_name, var_)); } } diff --git a/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc index 37f9197edb7..08c2e19dedd 100644 --- a/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc +++ b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc @@ -54,7 +54,7 @@ TEST(ElementwiseAdd, launcher_registry) { host_context::KernelRegistry registry; RegisterInferShapeLaunchers(®istry); ASSERT_GE(registry.size(), 1UL); - auto creator = registry.GetKernel("phi_cpu.add.any.float32"); + auto creator = registry.GetKernel("phi_cpu.add.float32.any"); const phi::DDim dims({1, 2}); const phi::DataType dtype{phi::DataType::FLOAT32}; diff --git a/paddle/infrt/tests/dialect/phi/dense_tensor.mlir b/paddle/infrt/tests/dialect/phi/dense_tensor.mlir index 586af7a9c50..b2e1cc52be6 100644 --- a/paddle/infrt/tests/dialect/phi/dense_tensor.mlir +++ b/paddle/infrt/tests/dialect/phi/dense_tensor.mlir @@ -6,7 +6,7 @@ func @sign_any_float32_execute() { %ctx = "phi_dt.create_context.cpu" (%allocator): (!phi.allocator) -> !phi.context %t = "phi_dt.create_dense_tensor.cpu.f32.nchw" (%allocator) {dims=[1:i64], lod=[1:i64]}: (!phi.allocator) -> (!infrt.dense_tensor) "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor) -> () - %e = "phi_cpu.sign.any.float32"(%ctx, %t) : (!phi.context, !infrt.dense_tensor) -> (!infrt.dense_tensor) + %e = "phi_cpu.sign.float32.any"(%ctx, %t) : (!phi.context, 
!infrt.dense_tensor) -> (!infrt.dense_tensor) // CHECK: dense_tensor: shape=shape[1], values=[1] "phi_dt.print_tensor" (%e) : (!infrt.dense_tensor) -> () diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh index fb7be82d1c5..0ba2dae9096 100755 --- a/paddle/scripts/infrt_build.sh +++ b/paddle/scripts/infrt_build.sh @@ -33,16 +33,17 @@ function update_pd_ops() { rm -rf ${PADDLE_ROOT}/build && mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build cmake .. -DWITH_PYTHON=ON -DWITH_GPU=OFF -DPYTHON_EXECUTABLE=`which python3` -DWITH_XBYAK=OFF -DWITH_NCCL=OFF -DWITH_RCCL=OFF -DWITH_CRYPTO=OFF - make -j8 paddle_python print_pten_kernels + make -j8 paddle_python print_pten_kernels kernel_signature_generator cd ${PADDLE_ROOT}/build ./paddle/phi/tools/print_pten_kernels > ../tools/infrt/kernels.json + ./paddle/fluid/pybind/kernel_signature_generator > ../tools/infrt/kernel_signature.json cd python/dist/ python3 -m pip uninstall -y paddlepaddle python3 -m pip install *whl # update pd_ops.td cd ${PADDLE_ROOT}/tools/infrt/ python3 generate_pd_op_dialect_from_paddle_op_maker.py - python3 generate_phi_kernel_dialect.py ./kernels.json + python3 generate_phi_kernel_dialect.py } function init() { diff --git a/tools/infrt/generate_phi_kernel_dialect.py b/tools/infrt/generate_phi_kernel_dialect.py index f3a78a8d4e8..36561d4e71d 100644 --- a/tools/infrt/generate_phi_kernel_dialect.py +++ b/tools/infrt/generate_phi_kernel_dialect.py @@ -14,9 +14,16 @@ import json import sys - -attr_type_converter = {"i": 'SI32Attr', "b": 'BoolAttr', "l": 'SI64Attr'} -supported_kernels = ['sign', 'dot', 'digamma', 'conj', 'abs', 'add_raw'] +import os +from get_compat_kernel_signature import get_compat_kernels_info + +#TODO @DannyIsFunny: more attr types need to be supported. 
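The renamed kernel aliases above follow a fixed <target>.<op>.<precision>.<layout> layout (phi_cpu.sign.float32.any, phi_cpu.add.float32.any), matching the suffix order the op-convert pass now emits. A minimal sketch of that naming rule, restricted to the targets that appear in this patch:

target_of = {"CPU": "cpu", "GPU": "gpu"}

def kernel_alias(op_name, target, precision, layout):
    # target prefix, then op name, then precision, then layout.
    return "phi_{}.{}.{}.{}".format(target_of[target], op_name, precision, layout)

print(kernel_alias("sign", "CPU", "float32", "any"))  # phi_cpu.sign.float32.any
print(kernel_alias("add", "CPU", "float32", "any"))   # phi_cpu.add.float32.any
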
+attr_type_converter = { + "i": 'SI32Attr', + "b": 'BoolAttr', + "l": 'SI64Attr', + "f": 'F32Attr' +} target_type_converter = {"CPU": "CPU", "GPU": "GPU"} layout_type_converter = { @@ -39,40 +46,34 @@ precision_type_converter = { "bool": "BOOL" } +kernel_types_info_file = "./kernels.json" +kernel_signature_info_file = "./kernel_signature.json" + def generate_kernel_name(op_name, place_str): [target_, layout_, precision_] = place_str[1:-1].split(',') target_ = target_type_converter[target_.strip()] layout_ = layout_type_converter[layout_.strip()] precision_ = precision_type_converter[precision_.strip()] + class_name_ = "{}{}".format( + op_name.replace("_", "").title(), "".join([ + target_.strip().title(), precision_.strip(), layout_.strip().title() + .title() + ])) alias_ = "{}.{}".format(op_name, ".".join( - [target_.strip(), layout_.strip(), precision_.strip()])) - return alias_ + [target_.strip(), precision_.strip(), layout_.strip()])) + return alias_, class_name_ def generate_attrs_info(op_name, attrs_info): - kernel_attrs_names = { - 'split': ['sections', 'num', 'axis', 'mkldnn_data_type'], - 'sign': [], - 'masked_select': [], - 'trace': ['offset', 'axis1', 'axis2'], - 'concat': ['axis'], - 'empty': ['shape', 'dtype'], - 'conj': [], - 'norm': ['axis', 'epsilon', 'is_test'], - 'histogram': ['bins', 'min', 'max'], - 'dot': [], - 'scale': ['scale', 'bias', 'bias_after_scale'], - 'digamma': [], - 'lerp': [], - 'cast': ['out_dtype', 'in_dtype'], - 'abs': [], - 'add_raw': ['axis'], - } + kernel_attrs_names = {} attrs_args_ = "" - if len(kernel_attrs_names[op_name]) == len(attrs_info): + with open(kernel_signature_info_file) as f: + kernel_attrs_names = json.load(f) + kernel_attrs_names.update(get_compat_kernels_info()) + if len(kernel_attrs_names[op_name]["attrs"]) == len(attrs_info): for index in range(len(attrs_info)): - attr_name = kernel_attrs_names[op_name][index] + attr_name = kernel_attrs_names[op_name]["attrs"][index] attr_type = attr_type_converter[attrs_info[index]] attrs_args_ += '{type_}:${name_},'.format( type_=attr_type, name_=attr_name) @@ -97,7 +98,11 @@ def generate_arguments_info(op_name, input_info, attr_info): input_args = generate_inputs_info(input_info) attr_args = generate_attrs_info(op_name, attr_info) context_args = "Context:$dev_ctx" - argument_ = "{},{},{}".format(context_args, input_args, attr_args) + argument_list = [context_args] + input_args.split(",") + attr_args.split( + ",") + while ("" in argument_list): + argument_list.remove("") + argument_ = ",".join(argument_list) return (("let arguments = (ins {});".format(argument_.strip(",")))) @@ -116,6 +121,10 @@ def generate_results_info(output_info): def generate_supported_kernel_list(load_dict): supported_kernels_list_ = [] + kernel_attrs_names = {} + with open(kernel_signature_info_file) as f: + kernel_attrs_names = json.load(f) + kernel_attrs_names.update(get_compat_kernels_info()) for op_name in load_dict: kernel_list = load_dict[op_name] for kernel_info in kernel_list: @@ -125,13 +134,10 @@ def generate_supported_kernel_list(load_dict): for attribute in attributes: if attribute not in attr_type_converter: flag = False - if flag: + if flag and op_name in kernel_attrs_names: supported_kernels_list_.append(op_name) - - alias_ = generate_kernel_dialect(op_name, kernel_alias_, - kernel_info[kernel_alias_]) supported_kernels_list_ = list(set(supported_kernels_list_)) - print(supported_kernels_list_) + return supported_kernels_list_ def scan_kernel_info(load_dict): @@ -156,16 +162,14 @@ def 
scan_kernel_info(load_dict): def generate_cpu_kernel_dialect(op_name, kernel_alias_, kernel_info): - alias = generate_kernel_name(op_name, kernel_alias_) + alias, class_name = generate_kernel_name(op_name, kernel_alias_) summary = 'let summary = "{name}";'.format(name=alias) dialect_name = alias.split(".") dialect_name = dialect_name[0] + "." + dialect_name[2] + "." + dialect_name[ 3] header = 'def {kernel_name} : PDTCPU_Kernel<"{name}",[NoSideEffect]> {left_brace}'.format( - kernel_name=alias.replace(".", ""), - name=dialect_name.lower(), - left_brace="{") + kernel_name=class_name, name=dialect_name.lower(), left_brace="{") inputs_ = kernel_info["input"] attributes = kernel_info["attribute"] @@ -185,16 +189,14 @@ def generate_cpu_kernel_dialect(op_name, kernel_alias_, kernel_info): def generate_gpu_kernel_dialect(op_name, kernel_alias_, kernel_info): - alias = generate_kernel_name(op_name, kernel_alias_) + alias, class_name = generate_kernel_name(op_name, kernel_alias_) summary = 'let summary = "{name}";'.format(name=alias) dialect_name = alias.split(".") dialect_name = dialect_name[0] + "." + dialect_name[2] + "." + dialect_name[ 3] header = 'def {kernel_name} : PDTGPU_Kernel<"{name}",[NoSideEffect]> {left_brace}'.format( - kernel_name=alias.replace(".", ""), - name=dialect_name.lower(), - left_brace="{") + kernel_name=class_name, name=dialect_name.lower(), left_brace="{") inputs_ = kernel_info["input"] attributes = kernel_info["attribute"] arguments = generate_arguments_info(op_name, inputs_, attributes) @@ -236,14 +238,17 @@ def get_kernel_target(kernel_alias_): return target[0] -def main(path_): - with open(path_, "r") as f: +def main(): + with open(kernel_types_info_file, "r") as f: load_dict = json.load(f) head = generate_dialect_head() cpu_registry_ = "" gpu_registry_ = "" + supported_kernels = generate_supported_kernel_list(load_dict) + print("Supported kernels:") + print(supported_kernels) for op_name in load_dict: if op_name not in supported_kernels: continue @@ -273,5 +278,12 @@ def main(path_): if __name__ == '__main__': - path = sys.argv[1] - main(path) + if not os.path.exists(kernel_types_info_file): + print("Error: '{file_name}' not exist!".format( + file_name=kernel_types_info_file)) + if not os.path.exists(kernel_signature_info_file): + print("Error: '{file_name}' not exist!".format( + file_name=kernel_signature_info_file)) + if os.path.exists(kernel_types_info_file) and os.path.exists( + kernel_signature_info_file): + main() diff --git a/tools/infrt/get_compat_kernel_signature.py b/tools/infrt/get_compat_kernel_signature.py new file mode 100644 index 00000000000..78d59c2aef1 --- /dev/null +++ b/tools/infrt/get_compat_kernel_signature.py @@ -0,0 +1,77 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
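The compat parser defined below merges its registrations into the same record shape that kernel_signature_generator writes to kernel_signature.json: one entry per op with inputs, attrs and outputs lists. An illustrative record (the scale attribute list mirrors the one that used to be hard-coded in generate_attrs_info; the input and output names are given as an assumed example):

import json

sample = {
    "scale": {
        "inputs": ["X"],
        "attrs": ["scale", "bias", "bias_after_scale"],
        "outputs": ["Out"],
    }
}
print(json.dumps(sample))
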
+ +import os +import re +import json + + +def parse_compat_registry(kernel_info): + name, inputs_str, attrs_str, outputs_str = kernel_info.split(",{") + kernel_info = {} + kernel_info["inputs"] = inputs_str[:-1].split(",") + kernel_info["attrs"] = attrs_str[:-1].split(",") + kernel_info["outputs"] = outputs_str[:-1].split(",") + return name, kernel_info + + +def remove_grad_registry(kernels_registry): + clean_kernel_registry = {} + for registry in kernels_registry: + if (not "_grad" in registry): + clean_kernel_registry[registry] = kernels_registry[registry] + return clean_kernel_registry + + +def get_compat_kernels_info(): + kernels_info = {} + compat_files = os.listdir("../../paddle/phi/ops/compat") + for file_ in compat_files: + if not ".cc" in file_: + compat_files.remove(file_) + + for file_ in compat_files: + with open("../../paddle/phi/ops/compat/" + file_) as in_file: + txt = in_file.readlines() + content = "" + registry = False + for line in txt: + if ("KernelSignature(" in line): + content = "" + registry = True + if (registry): + content += line + if (registry and ";" in line): + data = content.replace("\n", "").replace( + " ", "").strip("return").strip( + "KernelSignature(").strip("\);").replace("\"", "") + registry = False + name, registry_info = parse_compat_registry(data) + + if name in kernels_info: + cur_reg = kernels_info[name] + kernels_info[name]["inputs"] = list( + set(registry_info["inputs"] + kernels_info[name][ + "inputs"])) + kernels_info[name]["attrs"] = list( + set(registry_info["attrs"] + kernels_info[name][ + "attrs"])) + kernels_info[name]["outputs"] = list( + set(registry_info["outputs"] + kernels_info[name][ + "outputs"])) + else: + kernels_info[name] = registry_info + + compat_registry_ = remove_grad_registry(kernels_info) + return compat_registry_ diff --git a/tools/infrt/get_phi_kernel_info.py b/tools/infrt/get_phi_kernel_info.py index 9ea3fef0030..774f6cd6bf3 100644 --- a/tools/infrt/get_phi_kernel_info.py +++ b/tools/infrt/get_phi_kernel_info.py @@ -219,8 +219,8 @@ def gen_register_info(resources: List[List[str]]): for ir_dtype, origin_dtype in zip(ir_dtypes, origin_dtypes): kernel_func = gen_kernel_func(update_item[3], ctx_name, origin_dtype) - ir_name = 'phi_cpu.' + update_item[0].lower() + '.' + update_item[ - 2].lower() + '.' + ir_dtype + ir_name = 'phi_cpu.' + update_item[0].lower( + ) + '.' + ir_dtype + '.' 
+ update_item[2].lower() res += f""" registry->AddKernel("{ir_name}",""" -- GitLab From 68af310be1a7efc53c61d3982aa080b68e0e5263 Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Wed, 9 Mar 2022 14:27:46 +0800 Subject: [PATCH 138/261] add MobileNetV3 (#38653) * add mobilenetv3 --- python/paddle/tests/test_pretrained_model.py | 2 + python/paddle/tests/test_vision_models.py | 6 + python/paddle/vision/__init__.py | 4 + python/paddle/vision/models/__init__.py | 8 + python/paddle/vision/models/mobilenetv2.py | 16 +- python/paddle/vision/models/mobilenetv3.py | 445 +++++++++++++++++++ python/paddle/vision/models/utils.py | 32 ++ python/paddle/vision/ops.py | 56 ++- 8 files changed, 554 insertions(+), 15 deletions(-) create mode 100644 python/paddle/vision/models/mobilenetv3.py create mode 100644 python/paddle/vision/models/utils.py diff --git a/python/paddle/tests/test_pretrained_model.py b/python/paddle/tests/test_pretrained_model.py index bbde64f2e60..4441faee14e 100644 --- a/python/paddle/tests/test_pretrained_model.py +++ b/python/paddle/tests/test_pretrained_model.py @@ -61,6 +61,8 @@ class TestPretrainedModel(unittest.TestCase): arches = [ 'mobilenet_v1', 'mobilenet_v2', + 'mobilenet_v3_small', + 'mobilenet_v3_large', 'squeezenet1_0', 'shufflenet_v2_x0_25', ] diff --git a/python/paddle/tests/test_vision_models.py b/python/paddle/tests/test_vision_models.py index 547c5334599..dc98fc3219b 100644 --- a/python/paddle/tests/test_vision_models.py +++ b/python/paddle/tests/test_vision_models.py @@ -40,6 +40,12 @@ class TestVisonModels(unittest.TestCase): def test_mobilenetv1(self): self.models_infer('mobilenet_v1') + def test_mobilenetv3_small(self): + self.models_infer('mobilenet_v3_small') + + def test_mobilenetv3_large(self): + self.models_infer('mobilenet_v3_large') + def test_vgg11(self): self.models_infer('vgg11') diff --git a/python/paddle/vision/__init__.py b/python/paddle/vision/__init__.py index 37520175a71..3749e0f64fc 100644 --- a/python/paddle/vision/__init__.py +++ b/python/paddle/vision/__init__.py @@ -40,6 +40,10 @@ from .models import MobileNetV1 # noqa: F401 from .models import mobilenet_v1 # noqa: F401 from .models import MobileNetV2 # noqa: F401 from .models import mobilenet_v2 # noqa: F401 +from .models import MobileNetV3Small # noqa: F401 +from .models import MobileNetV3Large # noqa: F401 +from .models import mobilenet_v3_small # noqa: F401 +from .models import mobilenet_v3_large # noqa: F401 from .models import SqueezeNet # noqa: F401 from .models import squeezenet1_0 # noqa: F401 from .models import squeezenet1_1 # noqa: F401 diff --git a/python/paddle/vision/models/__init__.py b/python/paddle/vision/models/__init__.py index 044be6a42b7..5ff3562e56e 100644 --- a/python/paddle/vision/models/__init__.py +++ b/python/paddle/vision/models/__init__.py @@ -24,6 +24,10 @@ from .mobilenetv1 import MobileNetV1 # noqa: F401 from .mobilenetv1 import mobilenet_v1 # noqa: F401 from .mobilenetv2 import MobileNetV2 # noqa: F401 from .mobilenetv2 import mobilenet_v2 # noqa: F401 +from .mobilenetv3 import MobileNetV3Small # noqa: F401 +from .mobilenetv3 import MobileNetV3Large # noqa: F401 +from .mobilenetv3 import mobilenet_v3_small # noqa: F401 +from .mobilenetv3 import mobilenet_v3_large # noqa: F401 from .vgg import VGG # noqa: F401 from .vgg import vgg11 # noqa: F401 from .vgg import vgg13 # noqa: F401 @@ -79,6 +83,10 @@ __all__ = [ #noqa 'mobilenet_v1', 'MobileNetV2', 'mobilenet_v2', + 'MobileNetV3Small', + 'MobileNetV3Large', + 'mobilenet_v3_small', + 'mobilenet_v3_large', 
'LeNet', 'DenseNet', 'densenet121', diff --git a/python/paddle/vision/models/mobilenetv2.py b/python/paddle/vision/models/mobilenetv2.py index 74071fc1216..6c486037c7d 100644 --- a/python/paddle/vision/models/mobilenetv2.py +++ b/python/paddle/vision/models/mobilenetv2.py @@ -12,14 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np import paddle - import paddle.nn as nn -import paddle.nn.functional as F - from paddle.utils.download import get_weights_path_from_url +from .utils import _make_divisible + __all__ = [] model_urls = { @@ -29,16 +27,6 @@ model_urls = { } -def _make_divisible(v, divisor, min_value=None): - if min_value is None: - min_value = divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - - if new_v < 0.9 * v: - new_v += divisor - return new_v - - class ConvBNReLU(nn.Sequential): def __init__(self, in_planes, diff --git a/python/paddle/vision/models/mobilenetv3.py b/python/paddle/vision/models/mobilenetv3.py new file mode 100644 index 00000000000..da7ae010c58 --- /dev/null +++ b/python/paddle/vision/models/mobilenetv3.py @@ -0,0 +1,445 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +from paddle.utils.download import get_weights_path_from_url +from functools import partial + +from .utils import _make_divisible +from ..ops import ConvNormActivation + +__all__ = [] + +model_urls = { + "mobilenet_v3_small_x1.0": + ("https://paddle-hapi.bj.bcebos.com/models/mobilenet_v3_small_x1.0.pdparams", + "34fe0e7c1f8b00b2b056ad6788d0590c"), + "mobilenet_v3_large_x1.0": + ("https://paddle-hapi.bj.bcebos.com/models/mobilenet_v3_large_x1.0.pdparams", + "118db5792b4e183b925d8e8e334db3df"), +} + + +class SqueezeExcitation(nn.Layer): + """ + This block implements the Squeeze-and-Excitation block from https://arxiv.org/abs/1709.01507 (see Fig. 1). + Parameters ``activation``, and ``scale_activation`` correspond to ``delta`` and ``sigma`` in in eq. 3. + This code is based on the torchvision code with modifications. + You can also see at https://github.com/pytorch/vision/blob/main/torchvision/ops/misc.py#L127 + Args: + input_channels (int): Number of channels in the input image + squeeze_channels (int): Number of squeeze channels + activation (Callable[..., paddle.nn.Layer], optional): ``delta`` activation. Default: ``paddle.nn.ReLU`` + scale_activation (Callable[..., paddle.nn.Layer]): ``sigma`` activation. 
Default: ``paddle.nn.Sigmoid`` + """ + + def __init__(self, + input_channels, + squeeze_channels, + activation=nn.ReLU, + scale_activation=nn.Sigmoid): + super().__init__() + self.avgpool = nn.AdaptiveAvgPool2D(1) + self.fc1 = nn.Conv2D(input_channels, squeeze_channels, 1) + self.fc2 = nn.Conv2D(squeeze_channels, input_channels, 1) + self.activation = activation() + self.scale_activation = scale_activation() + + def _scale(self, input): + scale = self.avgpool(input) + scale = self.fc1(scale) + scale = self.activation(scale) + scale = self.fc2(scale) + return self.scale_activation(scale) + + def forward(self, input): + scale = self._scale(input) + return scale * input + + +class InvertedResidualConfig: + def __init__(self, + in_channels, + kernel, + expanded_channels, + out_channels, + use_se, + activation, + stride, + scale=1.0): + self.in_channels = self.adjust_channels(in_channels, scale=scale) + self.kernel = kernel + self.expanded_channels = self.adjust_channels( + expanded_channels, scale=scale) + self.out_channels = self.adjust_channels(out_channels, scale=scale) + self.use_se = use_se + if activation is None: + self.activation_layer = None + elif activation == "relu": + self.activation_layer = nn.ReLU + elif activation == "hardswish": + self.activation_layer = nn.Hardswish + else: + raise RuntimeError("The activation function is not supported: {}". + format(activation)) + self.stride = stride + + @staticmethod + def adjust_channels(channels, scale=1.0): + return _make_divisible(channels * scale, 8) + + +class InvertedResidual(nn.Layer): + def __init__(self, in_channels, expanded_channels, out_channels, + filter_size, stride, use_se, activation_layer, norm_layer): + super().__init__() + self.use_res_connect = stride == 1 and in_channels == out_channels + self.use_se = use_se + self.expand = in_channels != expanded_channels + + if self.expand: + self.expand_conv = ConvNormActivation( + in_channels=in_channels, + out_channels=expanded_channels, + kernel_size=1, + stride=1, + padding=0, + norm_layer=norm_layer, + activation_layer=activation_layer) + + self.bottleneck_conv = ConvNormActivation( + in_channels=expanded_channels, + out_channels=expanded_channels, + kernel_size=filter_size, + stride=stride, + padding=int((filter_size - 1) // 2), + groups=expanded_channels, + norm_layer=norm_layer, + activation_layer=activation_layer) + + if self.use_se: + self.mid_se = SqueezeExcitation( + expanded_channels, + _make_divisible(expanded_channels // 4), + scale_activation=nn.Hardsigmoid) + + self.linear_conv = ConvNormActivation( + in_channels=expanded_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + norm_layer=norm_layer, + activation_layer=None) + + def forward(self, x): + identity = x + if self.expand: + x = self.expand_conv(x) + x = self.bottleneck_conv(x) + if self.use_se: + x = self.mid_se(x) + x = self.linear_conv(x) + if self.use_res_connect: + x = paddle.add(identity, x) + return x + + +class MobileNetV3(nn.Layer): + """MobileNetV3 model from + `"Searching for MobileNetV3" `_. + + Args: + config (list[InvertedResidualConfig]): MobileNetV3 depthwise blocks config. + last_channel (int): The number of channels on the penultimate layer. + scale (float, optional): Scale of channels in each layer. Default: 1.0. + num_classes (int, optional): Output dim of last fc layer. If num_classes <=0, last fc layer + will not be defined. Default: 1000. + with_pool (bool, optional): Use pool before the last fc layer or not. Default: True. 
+ """ + + def __init__(self, + config, + last_channel, + scale=1.0, + num_classes=1000, + with_pool=True): + super().__init__() + + self.config = config + self.scale = scale + self.last_channel = last_channel + self.num_classes = num_classes + self.with_pool = with_pool + self.firstconv_in_channels = config[0].in_channels + self.lastconv_in_channels = config[-1].in_channels + self.lastconv_out_channels = self.lastconv_in_channels * 6 + norm_layer = partial(nn.BatchNorm2D, epsilon=0.001, momentum=0.99) + + self.conv = ConvNormActivation( + in_channels=3, + out_channels=self.firstconv_in_channels, + kernel_size=3, + stride=2, + padding=1, + groups=1, + activation_layer=nn.Hardswish, + norm_layer=norm_layer) + + self.blocks = nn.Sequential(*[ + InvertedResidual( + in_channels=cfg.in_channels, + expanded_channels=cfg.expanded_channels, + out_channels=cfg.out_channels, + filter_size=cfg.kernel, + stride=cfg.stride, + use_se=cfg.use_se, + activation_layer=cfg.activation_layer, + norm_layer=norm_layer) for cfg in self.config + ]) + + self.lastconv = ConvNormActivation( + in_channels=self.lastconv_in_channels, + out_channels=self.lastconv_out_channels, + kernel_size=1, + stride=1, + padding=0, + groups=1, + norm_layer=norm_layer, + activation_layer=nn.Hardswish) + + if with_pool: + self.avgpool = nn.AdaptiveAvgPool2D(1) + + if num_classes > 0: + self.classifier = nn.Sequential( + nn.Linear(self.lastconv_out_channels, self.last_channel), + nn.Hardswish(), + nn.Dropout(p=0.2), + nn.Linear(self.last_channel, num_classes)) + + def forward(self, x): + x = self.conv(x) + x = self.blocks(x) + x = self.lastconv(x) + + if self.with_pool: + x = self.avgpool(x) + + if self.num_classes > 0: + x = paddle.flatten(x, 1) + x = self.classifier(x) + + return x + + +class MobileNetV3Small(MobileNetV3): + """MobileNetV3 Small architecture model from + `"Searching for MobileNetV3" `_. + + Args: + scale (float, optional): Scale of channels in each layer. Default: 1.0. + num_classes (int, optional): Output dim of last fc layer. If num_classes <=0, last fc layer + will not be defined. Default: 1000. + with_pool (bool, optional): Use pool before the last fc layer or not. Default: True. + + Examples: + .. 
code-block:: python + + import paddle + from paddle.vision.models import MobileNetV3Small + + # build model + model = MobileNetV3Small(scale=1.0) + + x = paddle.rand([1, 3, 224, 224]) + out = model(x) + + print(out.shape) + """ + + def __init__(self, scale=1.0, num_classes=1000, with_pool=True): + config = [ + InvertedResidualConfig(16, 3, 16, 16, True, "relu", 2, scale), + InvertedResidualConfig(16, 3, 72, 24, False, "relu", 2, scale), + InvertedResidualConfig(24, 3, 88, 24, False, "relu", 1, scale), + InvertedResidualConfig(24, 5, 96, 40, True, "hardswish", 2, scale), + InvertedResidualConfig(40, 5, 240, 40, True, "hardswish", 1, scale), + InvertedResidualConfig(40, 5, 240, 40, True, "hardswish", 1, scale), + InvertedResidualConfig(40, 5, 120, 48, True, "hardswish", 1, scale), + InvertedResidualConfig(48, 5, 144, 48, True, "hardswish", 1, scale), + InvertedResidualConfig(48, 5, 288, 96, True, "hardswish", 2, scale), + InvertedResidualConfig(96, 5, 576, 96, True, "hardswish", 1, scale), + InvertedResidualConfig(96, 5, 576, 96, True, "hardswish", 1, scale), + ] + last_channel = _make_divisible(1024 * scale, 8) + super().__init__( + config, + last_channel=last_channel, + scale=scale, + with_pool=with_pool, + num_classes=num_classes) + + +class MobileNetV3Large(MobileNetV3): + """MobileNetV3 Large architecture model from + `"Searching for MobileNetV3" `_. + + Args: + scale (float, optional): Scale of channels in each layer. Default: 1.0. + num_classes (int, optional): Output dim of last fc layer. If num_classes <=0, last fc layer + will not be defined. Default: 1000. + with_pool (bool, optional): Use pool before the last fc layer or not. Default: True. + + Examples: + .. code-block:: python + + import paddle + from paddle.vision.models import MobileNetV3Large + + # build model + model = MobileNetV3Large(scale=1.0) + + x = paddle.rand([1, 3, 224, 224]) + out = model(x) + + print(out.shape) + """ + + def __init__(self, scale=1.0, num_classes=1000, with_pool=True): + config = [ + InvertedResidualConfig(16, 3, 16, 16, False, "relu", 1, scale), + InvertedResidualConfig(16, 3, 64, 24, False, "relu", 2, scale), + InvertedResidualConfig(24, 3, 72, 24, False, "relu", 1, scale), + InvertedResidualConfig(24, 5, 72, 40, True, "relu", 2, scale), + InvertedResidualConfig(40, 5, 120, 40, True, "relu", 1, scale), + InvertedResidualConfig(40, 5, 120, 40, True, "relu", 1, scale), + InvertedResidualConfig(40, 3, 240, 80, False, "hardswish", 2, + scale), + InvertedResidualConfig(80, 3, 200, 80, False, "hardswish", 1, + scale), + InvertedResidualConfig(80, 3, 184, 80, False, "hardswish", 1, + scale), + InvertedResidualConfig(80, 3, 184, 80, False, "hardswish", 1, + scale), + InvertedResidualConfig(80, 3, 480, 112, True, "hardswish", 1, + scale), + InvertedResidualConfig(112, 3, 672, 112, True, "hardswish", 1, + scale), + InvertedResidualConfig(112, 5, 672, 160, True, "hardswish", 2, + scale), + InvertedResidualConfig(160, 5, 960, 160, True, "hardswish", 1, + scale), + InvertedResidualConfig(160, 5, 960, 160, True, "hardswish", 1, + scale), + ] + last_channel = _make_divisible(1280 * scale, 8) + super().__init__( + config, + last_channel=last_channel, + scale=scale, + with_pool=with_pool, + num_classes=num_classes) + + +def _mobilenet_v3(arch, pretrained=False, scale=1.0, **kwargs): + if arch == "mobilenet_v3_large": + model = MobileNetV3Large(scale=scale, **kwargs) + else: + model = MobileNetV3Small(scale=scale, **kwargs) + if pretrained: + arch = "{}_x{}".format(arch, scale) + assert ( + arch in model_urls + ), 
"{} model do not have a pretrained model now, you should set pretrained=False".format( + arch) + weight_path = get_weights_path_from_url(model_urls[arch][0], + model_urls[arch][1]) + + param = paddle.load(weight_path) + model.set_dict(param) + return model + + +def mobilenet_v3_small(pretrained=False, scale=1.0, **kwargs): + """MobileNetV3 Small architecture model from + `"Searching for MobileNetV3" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False. + scale (float, optional): Scale of channels in each layer. Default: 1.0. + + Examples: + .. code-block:: python + + import paddle + from paddle.vision.models import mobilenet_v3_small + + # build model + model = mobilenet_v3_small() + + # build model and load imagenet pretrained weight + # model = mobilenet_v3_small(pretrained=True) + + # build mobilenet v3 small model with scale=0.5 + model = mobilenet_v3_small(scale=0.5) + + x = paddle.rand([1, 3, 224, 224]) + out = model(x) + + print(out.shape) + + """ + model = _mobilenet_v3( + "mobilenet_v3_small", scale=scale, pretrained=pretrained, **kwargs) + return model + + +def mobilenet_v3_large(pretrained=False, scale=1.0, **kwargs): + """MobileNetV3 Large architecture model from + `"Searching for MobileNetV3" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False. + scale (float, optional): Scale of channels in each layer. Default: 1.0. + + Examples: + .. code-block:: python + + import paddle + from paddle.vision.models import mobilenet_v3_large + + # build model + model = mobilenet_v3_large() + + # build model and load imagenet pretrained weight + # model = mobilenet_v3_large(pretrained=True) + + # build mobilenet v3 large model with scale=0.5 + model = mobilenet_v3_large(scale=0.5) + + x = paddle.rand([1, 3, 224, 224]) + out = model(x) + + print(out.shape) + + """ + model = _mobilenet_v3( + "mobilenet_v3_large", scale=scale, pretrained=pretrained, **kwargs) + return model diff --git a/python/paddle/vision/models/utils.py b/python/paddle/vision/models/utils.py new file mode 100644 index 00000000000..f61d0d601a4 --- /dev/null +++ b/python/paddle/vision/models/utils.py @@ -0,0 +1,32 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def _make_divisible(v, divisor=8, min_value=None): + """ + This function ensures that all layers have a channel number that is divisible by divisor + You can also see at https://github.com/keras-team/keras/blob/8ecef127f70db723c158dbe9ed3268b3d610ab55/keras/applications/mobilenet_v2.py#L505 + + Args: + divisor (int): The divisor for number of channels. Default: 8. + min_value (int, optional): The minimum value of number of channels, if it is None, + the default is divisor. Default: None. + """ + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. 
+ if new_v < 0.9 * v: + new_v += divisor + return new_v diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 4983ca49ac3..b65bfa502c4 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -17,7 +17,7 @@ from ..fluid.layer_helper import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype from ..fluid import core, layers from ..fluid.layers import nn, utils -from ..nn import Layer +from ..nn import Layer, Conv2D, Sequential, ReLU, BatchNorm2D from ..fluid.initializer import Normal from paddle.common_ops_import import * @@ -1297,3 +1297,57 @@ class RoIAlign(Layer): output_size=self._output_size, spatial_scale=self._spatial_scale, aligned=aligned) + + +class ConvNormActivation(Sequential): + """ + Configurable block used for Convolution-Normalization-Activation blocks. + This code is based on the torchvision code with modifications. + You can also see at https://github.com/pytorch/vision/blob/main/torchvision/ops/misc.py#L68 + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the Convolution-Normalization-Activation block + kernel_size (int, optional): Size of the convolving kernel. Default: 3 + stride (int, optional): Stride of the convolution. Default: 1 + padding (int, tuple or str, optional): Padding added to all four sides of the input. Default: None, + in which case it will be calculated as ``padding = (kernel_size - 1) // 2 * dilation`` + groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1 + norm_layer (Callable[..., paddle.nn.Layer], optional): Norm layer that will be stacked on top of the convolution layer. + If ``None`` this layer won't be used. Default: ``paddle.nn.BatchNorm2D`` + activation_layer (Callable[..., paddle.nn.Layer], optional): Activation function which will be stacked on top of the normalization + layer (if not ``None``), otherwise on top of the conv layer. If ``None`` this layer won't be used. Default: ``paddle.nn.ReLU`` + dilation (int): Spacing between kernel elements. Default: 1 + bias (bool, optional): Whether to use bias in the convolution layer. By default, biases are included if ``norm_layer is None``.
+ """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=None, + groups=1, + norm_layer=BatchNorm2D, + activation_layer=ReLU, + dilation=1, + bias=None): + if padding is None: + padding = (kernel_size - 1) // 2 * dilation + if bias is None: + bias = norm_layer is None + layers = [ + Conv2D( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation=dilation, + groups=groups, + bias_attr=bias) + ] + if norm_layer is not None: + layers.append(norm_layer(out_channels)) + if activation_layer is not None: + layers.append(activation_layer()) + super().__init__(*layers) -- GitLab From 7f43055dfa12831cd467314fe53ae4af65dce662 Mon Sep 17 00:00:00 2001 From: Jiabin Yang <360788950@qq.com> Date: Wed, 9 Mar 2022 14:43:42 +0800 Subject: [PATCH 139/261] remove additional deps of phi (#40251) --- paddle/fluid/eager/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index 8cb69caf663..698a698fc6d 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -1,4 +1,4 @@ -set(eager_deps phi phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node) +set(eager_deps phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node) set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy) set(generated_deps dygraph_function dygraph_node) @@ -10,11 +10,11 @@ endif() add_subdirectory(api) add_subdirectory(accumulation) -cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi phi_api) +cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi_api phi_tensor) cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator) -cc_library(autograd_meta SRCS autograd_meta.cc DEPS phi phi_api) -cc_library(utils SRCS utils.cc DEPS phi phi_api global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils) +cc_library(autograd_meta SRCS autograd_meta.cc DEPS phi_api phi_tensor) +cc_library(utils SRCS utils.cc DEPS phi_api phi_tensor global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils) cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info) add_subdirectory(tests) -- GitLab From 55bfc6cb8372041fee5749902bc00322f965dcdd Mon Sep 17 00:00:00 2001 From: xiongkun Date: Wed, 9 Mar 2022 14:50:22 +0800 Subject: [PATCH 140/261] [phi] transfer the nll_loss kernel to phi and pass the test (#39936) * transfer the nll_loss_op and pass the CI * push * fix by self-review * fix by cr * add nll_loss * fix code --- paddle/fluid/operators/nll_loss_op.cc | 9 +- paddle/fluid/operators/nll_loss_op.h | 306 ----------------- .../phi/kernels/cpu/nll_loss_grad_kernel.cc | 171 ++++++++++ paddle/phi/kernels/cpu/nll_loss_kernel.cc | 202 +++++++++++ .../kernels/gpu/nll_loss.h} | 316 ++++++------------ .../phi/kernels/gpu/nll_loss_grad_kernel.cu | 114 +++++++ paddle/phi/kernels/gpu/nll_loss_kernel.cu | 116 +++++++ paddle/phi/kernels/nll_loss_grad_kernel.h | 31 ++ paddle/phi/kernels/nll_loss_kernel.cc | 41 +++ paddle/phi/kernels/nll_loss_kernel.h | 33 ++ paddle/phi/ops/compat/nll_loss_sig.cc | 39 +++ 11 files changed, 849 insertions(+), 529 deletions(-) delete mode 100644 
paddle/fluid/operators/nll_loss_op.h create mode 100644 paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/nll_loss_kernel.cc rename paddle/{fluid/operators/nll_loss_op.cu => phi/kernels/gpu/nll_loss.h} (50%) create mode 100644 paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/nll_loss_kernel.cu create mode 100644 paddle/phi/kernels/nll_loss_grad_kernel.h create mode 100644 paddle/phi/kernels/nll_loss_kernel.cc create mode 100644 paddle/phi/kernels/nll_loss_kernel.h create mode 100644 paddle/phi/ops/compat/nll_loss_sig.cc diff --git a/paddle/fluid/operators/nll_loss_op.cc b/paddle/fluid/operators/nll_loss_op.cc index f510c7bebec..6c35ad29e97 100644 --- a/paddle/fluid/operators/nll_loss_op.cc +++ b/paddle/fluid/operators/nll_loss_op.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/nll_loss_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -264,10 +264,3 @@ REGISTER_OPERATOR(nll_loss, ops::NLLLossOp, ops::NLLLossOpMaker, ops::NLLLossGradMaker, ops::NLLLossGradMaker); REGISTER_OPERATOR(nll_loss_grad, ops::NLLLossGradOp); -REGISTER_OP_CPU_KERNEL( - nll_loss, ops::NLLLossOpKernel, - ops::NLLLossOpKernel); -REGISTER_OP_CPU_KERNEL( - nll_loss_grad, - ops::NLLLossGradOpKernel, - ops::NLLLossGradOpKernel); diff --git a/paddle/fluid/operators/nll_loss_op.h b/paddle/fluid/operators/nll_loss_op.h deleted file mode 100644 index be6f4422d4a..00000000000 --- a/paddle/fluid/operators/nll_loss_op.h +++ /dev/null @@ -1,306 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -static void nll_loss_1D(T* out_data, T* total_weight_data, const T* x_data, - const int64_t* label_data, const T* weight_data, - const int64_t batch_size, const int64_t n_classes, - const std::string reduction, - const int64_t ignore_index) { - if (reduction == "none") { - for (int64_t i = 0; i < batch_size; ++i) { - const auto cur_label = label_data[i]; - if (cur_label == ignore_index) { - out_data[i] = 0; - continue; - } - PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, - platform::errors::InvalidArgument( - "Label value is out of range. " - "Expected label value in range of [0, %d), but " - "received value is %d.", - n_classes, cur_label)); - - const auto cur_weight = - weight_data ? 
weight_data[cur_label] : static_cast(1); - out_data[i] = -x_data[i * n_classes + cur_label] * cur_weight; - } - return; - } - - T output_val = 0; - T total_weight_val = 0; - - for (int64_t i = 0; i < batch_size; i++) { - const auto cur_label = label_data[i]; - if (cur_label == ignore_index) { - out_data[i] = 0; - continue; - } - PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, - platform::errors::InvalidArgument( - "label should not be out of bounds.")); - - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - total_weight_val += cur_weight; - output_val -= x_data[i * n_classes + cur_label] * cur_weight; - } - if (reduction == "mean" && total_weight_val != 0) { - output_val /= total_weight_val; - } - *out_data = output_val; - *total_weight_data = total_weight_val; -} - -template -static void nll_loss_2D(T* out_data, T* total_weight_data, const T* x_data, - const int64_t* label_data, const T* weight_data, - const int64_t batch_size, const int64_t n_classes, - const int64_t in_dim2, const int64_t in_dim3, - const std::string reduction, - const int64_t ignore_index) { - const auto map_size = in_dim2 * in_dim3; - const auto sample_size = n_classes * map_size; - if (reduction == "none") { - for (int i = 0; i < batch_size; i++) { - for (int h = 0; h < in_dim2; h++) { - for (int w = 0; w < in_dim3; w++) { - const auto index = i * map_size + h * in_dim3 + w; - const auto cur_label = label_data[index]; - if (cur_label == ignore_index) { - out_data[index] = 0; - continue; - } - PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, - platform::errors::InvalidArgument( - "label should not be out of bounds.")); - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - out_data[index] = -x_data[i * sample_size + cur_label * map_size + - h * in_dim3 + w] * - cur_weight; - } - } - } - return; - } - - T output_val = 0; - T total_weight_val = 0; - - for (int i = 0; i < batch_size; i++) { - for (int h = 0; h < in_dim2; h++) { - for (int w = 0; w < in_dim3; w++) { - const auto index = i * map_size + h * in_dim3 + w; - const auto cur_label = label_data[index]; - if (cur_label == ignore_index) { - out_data[index] = 0; - continue; - } - PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, - platform::errors::InvalidArgument( - "label should not be out of bounds.")); - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - total_weight_val += cur_weight; - output_val -= - x_data[i * sample_size + cur_label * map_size + h * in_dim3 + w] * - cur_weight; - } - } - } - - if (reduction == "mean" && total_weight_val != 0) { - output_val /= total_weight_val; - } - *out_data = output_val; - *total_weight_data = total_weight_val; -} - -template -class NLLLossOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* weight = ctx.Input("Weight"); - auto* out = ctx.Output("Out"); - auto* total_weight = ctx.Output("Total_weight"); - auto reduction = ctx.Attr("reduction"); - auto ignore_index = ctx.Attr("ignore_index"); - - auto x_data = x->data(); - auto label_data = labels->data(); - auto weight_data = weight ? 
weight->data() : nullptr; - auto out_data = out->mutable_data(ctx.GetPlace()); - auto total_weight_data = total_weight->mutable_data(ctx.GetPlace()); - *total_weight_data = 0; - - auto x_dims = x->dims(); - const auto batch_size = x_dims[0]; - const auto n_classes = x_dims[1]; - - if (x_dims.size() == 2) { - nll_loss_1D(out_data, total_weight_data, x_data, label_data, - weight_data, batch_size, n_classes, reduction, - ignore_index); - } else if (x_dims.size() == 4) { - const auto in_dim2 = x_dims[2]; - const auto in_dim3 = x_dims[3]; - nll_loss_2D(out_data, total_weight_data, x_data, label_data, - weight_data, batch_size, n_classes, in_dim2, in_dim3, - reduction, ignore_index); - } - } -}; - -template -static void nll_loss_grad_1D(T* dx_data, const T* dout_data, - const int64_t* label_data, const T* weight_data, - const T* total_weight_data, - const int64_t batch_size, const int64_t n_classes, - const std::string reduction, - const int64_t ignore_index) { - if (reduction == "none") { - for (int i = 0; i < batch_size; i++) { - const auto cur_label = label_data[i]; - if (cur_label == ignore_index) { - continue; - } - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - dx_data[i * n_classes + cur_label] = -dout_data[i] * cur_weight; - } - return; - } - - const T dout_val = *dout_data; - const T total_weight_val = *total_weight_data; - for (int i = 0; i < batch_size; i++) { - const auto cur_label = label_data[i]; - if (cur_label == ignore_index) { - continue; - } - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - dx_data[i * n_classes + cur_label] = -dout_val * cur_weight; - if (reduction == "mean") { - dx_data[i * n_classes + cur_label] /= total_weight_val; - } - } -} - -template -static void nll_loss_grad_2D(T* dx_data, const T* dout_data, - const int64_t* label_data, const T* weight_data, - const T* total_weight_data, - const int64_t batch_size, const int64_t n_classes, - const int64_t in_dim2, const int64_t in_dim3, - const std::string reduction, - const int64_t ignore_index) { - const auto map_size = in_dim2 * in_dim3; - const auto sample_size = n_classes * map_size; - - if (reduction == "none") { - for (int i = 0; i < batch_size; i++) { - for (int h = 0; h < in_dim2; h++) { - for (int w = 0; w < in_dim3; w++) { - const auto index = i * map_size + h * in_dim3 + w; - const auto cur_label = label_data[index]; - if (cur_label == ignore_index) { - continue; - } - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - dx_data[i * sample_size + cur_label * map_size + h * in_dim3 + w] = - -cur_weight * dout_data[index]; - } - } - } - return; - } - - const T dout_val = *dout_data; - const T total_weight_val = *total_weight_data; - for (int i = 0; i < batch_size; i++) { - for (int h = 0; h < in_dim2; h++) { - for (int w = 0; w < in_dim3; w++) { - const auto index = i * map_size + h * in_dim3 + w; - const auto cur_label = label_data[index]; - if (cur_label == ignore_index) { - continue; - } - const auto cur_weight = - weight_data ? 
weight_data[cur_label] : static_cast(1); - const auto dx_index = - i * sample_size + cur_label * map_size + h * in_dim3 + w; - dx_data[dx_index] = -dout_val * cur_weight; - if (reduction == "mean") { - dx_data[dx_index] /= total_weight_val; - } - } - } - } -} - -template -class NLLLossGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* weight = ctx.Input("Weight"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* total_weight = ctx.Input("Total_weight"); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto ignore_index = ctx.Attr("ignore_index"); - auto reduction = ctx.Attr("reduction"); - - auto dx_data = dx->mutable_data(ctx.GetPlace()); - auto dout_data = dout->data(); - auto label_data = labels->data(); - auto weight_data = weight ? weight->data() : nullptr; - auto total_weight_data = total_weight->data(); - memset(dx_data, 0, dx->numel() * sizeof(T)); - - const auto x_dims = x->dims(); - const auto batch_size = x_dims[0]; - const auto n_classes = x_dims[1]; - - if (x_dims.size() == 2) { - nll_loss_grad_1D(dx_data, dout_data, label_data, weight_data, - total_weight_data, batch_size, n_classes, reduction, - ignore_index); - } else if (x_dims.size() == 4) { - const auto in_dim2 = x_dims[2]; - const auto in_dim3 = x_dims[3]; - nll_loss_grad_2D(dx_data, dout_data, label_data, weight_data, - total_weight_data, batch_size, n_classes, in_dim2, - in_dim3, reduction, ignore_index); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc new file mode 100644 index 00000000000..e7d74759f51 --- /dev/null +++ b/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc @@ -0,0 +1,171 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/nll_loss_grad_kernel.h" + +#include +#include +#include "paddle/fluid/operators/math.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +template +static void nll_loss_grad_1D(T* dx_data, + const T* dout_data, + const int64_t* label_data, + const T* weight_data, + const T* total_weight_data, + const int64_t batch_size, + const int64_t n_classes, + const std::string reduction, + const int64_t ignore_index) { + if (reduction == "none") { + for (int i = 0; i < batch_size; i++) { + const auto cur_label = label_data[i]; + if (cur_label == ignore_index) { + continue; + } + const auto cur_weight = + weight_data ? 
weight_data[cur_label] : static_cast(1); + dx_data[i * n_classes + cur_label] = -dout_data[i] * cur_weight; + } + return; + } + + const T dout_val = *dout_data; + const T total_weight_val = *total_weight_data; + for (int i = 0; i < batch_size; i++) { + const auto cur_label = label_data[i]; + if (cur_label == ignore_index) { + continue; + } + const auto cur_weight = + weight_data ? weight_data[cur_label] : static_cast(1); + dx_data[i * n_classes + cur_label] = -dout_val * cur_weight; + if (reduction == "mean") { + dx_data[i * n_classes + cur_label] /= total_weight_val; + } + } +} + +template +static void nll_loss_grad_2D(T* dx_data, + const T* dout_data, + const int64_t* label_data, + const T* weight_data, + const T* total_weight_data, + const int64_t batch_size, + const int64_t n_classes, + const int64_t in_dim2, + const int64_t in_dim3, + const std::string& reduction, + const int64_t ignore_index) { + const auto map_size = in_dim2 * in_dim3; + const auto sample_size = n_classes * map_size; + + if (reduction == "none") { + for (int i = 0; i < batch_size; i++) { + for (int h = 0; h < in_dim2; h++) { + for (int w = 0; w < in_dim3; w++) { + const auto index = i * map_size + h * in_dim3 + w; + const auto cur_label = label_data[index]; + if (cur_label == ignore_index) { + continue; + } + const auto cur_weight = + weight_data ? weight_data[cur_label] : static_cast(1); + dx_data[i * sample_size + cur_label * map_size + h * in_dim3 + w] = + -cur_weight * dout_data[index]; + } + } + } + return; + } + + const T dout_val = *dout_data; + const T total_weight_val = *total_weight_data; + for (int i = 0; i < batch_size; i++) { + for (int h = 0; h < in_dim2; h++) { + for (int w = 0; w < in_dim3; w++) { + const auto index = i * map_size + h * in_dim3 + w; + const auto cur_label = label_data[index]; + if (cur_label == ignore_index) { + continue; + } + const auto cur_weight = + weight_data ? weight_data[cur_label] : static_cast(1); + const auto dx_index = + i * sample_size + cur_label * map_size + h * in_dim3 + w; + dx_data[dx_index] = -dout_val * cur_weight; + if (reduction == "mean") { + dx_data[dx_index] /= total_weight_val; + } + } + } + } +} + +template +void NllLossGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& labels, + const DenseTensor& total_weight, + paddle::optional weight, + const DenseTensor& d_out, + int64_t ignore_index, + const std::string& reduction, + DenseTensor* dx) { + auto dx_data = dev_ctx.template Alloc(dx); + auto dout_data = d_out.data(); + auto label_data = labels.data(); + auto weight_data = weight.get_ptr() ? 
weight.get_ptr()->data() : nullptr; + auto total_weight_data = total_weight.data(); + memset(dx_data, 0, dx->numel() * sizeof(T)); + + const auto x_dims = x.dims(); + const auto batch_size = x_dims[0]; + const auto n_classes = x_dims[1]; + + if (x_dims.size() == 2) { + nll_loss_grad_1D(dx_data, + dout_data, + label_data, + weight_data, + total_weight_data, + batch_size, + n_classes, + reduction, + ignore_index); + } else if (x_dims.size() == 4) { + const auto in_dim2 = x_dims[2]; + const auto in_dim3 = x_dims[3]; + nll_loss_grad_2D(dx_data, + dout_data, + label_data, + weight_data, + total_weight_data, + batch_size, + n_classes, + in_dim2, + in_dim3, + reduction, + ignore_index); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + nll_loss_grad, CPU, ALL_LAYOUT, phi::NllLossGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/nll_loss_kernel.cc b/paddle/phi/kernels/cpu/nll_loss_kernel.cc new file mode 100644 index 00000000000..334b0082bde --- /dev/null +++ b/paddle/phi/kernels/cpu/nll_loss_kernel.cc @@ -0,0 +1,202 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/nll_loss_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +static void nll_loss_1D(T* out_data, + T* total_weight_data, + const T* x_data, + const int64_t* label_data, + const T* weight_data, + const int64_t batch_size, + const int64_t n_classes, + const std::string& reduction, + const int64_t ignore_index) { + if (reduction == "none") { + for (int64_t i = 0; i < batch_size; ++i) { + const auto cur_label = label_data[i]; + if (cur_label == ignore_index) { + out_data[i] = 0; + continue; + } + PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, + true, + phi::errors::InvalidArgument( + "Label value is out of range. " + "Expected label value in range of [0, %d), but " + "received value is %d.", + n_classes, + cur_label)); + + const auto cur_weight = + weight_data ? weight_data[cur_label] : static_cast(1); + out_data[i] = -x_data[i * n_classes + cur_label] * cur_weight; + } + return; + } + + T output_val = 0; + T total_weight_val = 0; + + for (int64_t i = 0; i < batch_size; i++) { + const auto cur_label = label_data[i]; + if (cur_label == ignore_index) { + out_data[i] = 0; + continue; + } + PADDLE_ENFORCE_EQ( + cur_label >= 0 && cur_label < n_classes, + true, + phi::errors::InvalidArgument("label should not be out of bounds.")); + + const auto cur_weight = + weight_data ? 
weight_data[cur_label] : static_cast(1); + total_weight_val += cur_weight; + output_val -= x_data[i * n_classes + cur_label] * cur_weight; + } + if (reduction == "mean" && total_weight_val != 0) { + output_val /= total_weight_val; + } + *out_data = output_val; + *total_weight_data = total_weight_val; +} + +template +static void nll_loss_2D(T* out_data, + T* total_weight_data, + const T* x_data, + const int64_t* label_data, + const T* weight_data, + const int64_t batch_size, + const int64_t n_classes, + const int64_t in_dim2, + const int64_t in_dim3, + const std::string& reduction, + const int64_t ignore_index) { + const auto map_size = in_dim2 * in_dim3; + const auto sample_size = n_classes * map_size; + if (reduction == "none") { + for (int i = 0; i < batch_size; i++) { + for (int h = 0; h < in_dim2; h++) { + for (int w = 0; w < in_dim3; w++) { + const auto index = i * map_size + h * in_dim3 + w; + const auto cur_label = label_data[index]; + if (cur_label == ignore_index) { + out_data[index] = 0; + continue; + } + PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, + true, + phi::errors::InvalidArgument( + "label should not be out of bounds.")); + const auto cur_weight = + weight_data ? weight_data[cur_label] : static_cast(1); + out_data[index] = -x_data[i * sample_size + cur_label * map_size + + h * in_dim3 + w] * + cur_weight; + } + } + } + return; + } + + T output_val = 0; + T total_weight_val = 0; + + for (int i = 0; i < batch_size; i++) { + for (int h = 0; h < in_dim2; h++) { + for (int w = 0; w < in_dim3; w++) { + const auto index = i * map_size + h * in_dim3 + w; + const auto cur_label = label_data[index]; + if (cur_label == ignore_index) { + out_data[index] = 0; + continue; + } + PADDLE_ENFORCE_EQ( + cur_label >= 0 && cur_label < n_classes, + true, + phi::errors::InvalidArgument("label should not be out of bounds.")); + const auto cur_weight = + weight_data ? weight_data[cur_label] : static_cast(1); + total_weight_val += cur_weight; + output_val -= + x_data[i * sample_size + cur_label * map_size + h * in_dim3 + w] * + cur_weight; + } + } + } + + if (reduction == "mean" && total_weight_val != 0) { + output_val /= total_weight_val; + } + *out_data = output_val; + *total_weight_data = total_weight_val; +} + +template +void NllLossRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& labels, + paddle::optional weight, + int64_t ignore_index, + const std::string& reduction, + DenseTensor* out, + DenseTensor* total_weight) { + auto x_data = x.data(); + auto label_data = labels.data(); + auto weight_data = weight.get_ptr() ? 
weight.get_ptr()->data() : nullptr; + auto out_data = dev_ctx.template Alloc(out); + auto total_weight_data = dev_ctx.template Alloc(total_weight); + *total_weight_data = 0; + + auto x_dims = x.dims(); + const auto batch_size = x_dims[0]; + const auto n_classes = x_dims[1]; + + if (x_dims.size() == 2) { + nll_loss_1D(out_data, + total_weight_data, + x_data, + label_data, + weight_data, + batch_size, + n_classes, + reduction, + ignore_index); + } else if (x_dims.size() == 4) { + const auto in_dim2 = x_dims[2]; + const auto in_dim3 = x_dims[3]; + nll_loss_2D(out_data, + total_weight_data, + x_data, + label_data, + weight_data, + batch_size, + n_classes, + in_dim2, + in_dim3, + reduction, + ignore_index); + } +} +} // namespace phi + +PD_REGISTER_KERNEL( + nll_loss, CPU, ALL_LAYOUT, phi::NllLossRawKernel, float, double) {} diff --git a/paddle/fluid/operators/nll_loss_op.cu b/paddle/phi/kernels/gpu/nll_loss.h similarity index 50% rename from paddle/fluid/operators/nll_loss_op.cu rename to paddle/phi/kernels/gpu/nll_loss.h index fd8a44cc05d..a457264498f 100644 --- a/paddle/fluid/operators/nll_loss_op.cu +++ b/paddle/phi/kernels/gpu/nll_loss.h @@ -1,37 +1,39 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
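In plain Python terms, the 1-D forward computation that the CUDA kernels in this header implement is roughly the sketch below. This is an illustrative reference only; the function and variable names are not part of the patch.

def nll_loss_1d_reference(x, label, weight, reduction, ignore_index):
    # x: per-sample log-probabilities, shape [batch, n_classes] (list of lists)
    # label: target class index per sample; weight: per-class weights or None
    losses, total_weight = [], 0.0
    for i, cur in enumerate(label):
        if cur == ignore_index:
            losses.append(0.0)
            continue
        w = weight[cur] if weight is not None else 1.0
        total_weight += w
        losses.append(-x[i][cur] * w)
    if reduction == 'none':
        return losses
    out = sum(losses)
    if reduction == 'mean' and total_weight != 0:
        out /= total_weight
    return out, total_weight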
+ +#pragma once +#include #include #include #include #include "paddle/fluid/operators/math.h" -#include "paddle/fluid/operators/nll_loss_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/hostdevice.h" -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - +namespace phi { static constexpr int kNumCUDAThreads = 512; static constexpr int kNumMaxinumNumBlocks = 4096; static const int NTHREADS = 32; - static inline int NumBlocks(const int N) { return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, kNumMaxinumNumBlocks); } template -__global__ void GPUNLLLossForward1D_no_reduce(T* out_data, const T* x_data, +__global__ void GPUNLLLossForward1D_no_reduce(T* out_data, + const T* x_data, const int64_t* label_data, const T* weight_data, const int64_t batch_size, @@ -51,11 +53,15 @@ __global__ void GPUNLLLossForward1D_no_reduce(T* out_data, const T* x_data, } template -__global__ void GPUNLLLossForward1D_with_reduce( - T* out_data, T* total_weight_data, const T* x_data, - const int64_t* label_data, const T* weight_data, const int64_t batch_size, - const int64_t n_classes, const int64_t size_average, - const int64_t ignore_index) { +__global__ void GPUNLLLossForward1D_with_reduce(T* out_data, + T* total_weight_data, + const T* x_data, + const int64_t* label_data, + const T* weight_data, + const int64_t batch_size, + const int64_t n_classes, + const int64_t size_average, + const int64_t ignore_index) { __shared__ T sharedInputs[NTHREADS], sharedWeights[NTHREADS]; sharedInputs[threadIdx.x] = 0; sharedWeights[threadIdx.x] = 0; @@ -99,9 +105,11 @@ __global__ void GPUNLLLossForward1D_with_reduce( // then __syncthreads is needed either before or afterwards to prevent non-0 // threads overriding smem in the next loop before num-0 thread reads from it. template -__device__ void reduceNValuesInBlock(T* smem, T threadVals[N], +__device__ void reduceNValuesInBlock(T* smem, + T threadVals[N], const unsigned int numVals, - ReduceOp reduceOp, T init) { + ReduceOp reduceOp, + T init) { if (numVals == 0) { #pragma unroll for (int i = 0; i < N; ++i) { @@ -175,18 +183,26 @@ __device__ void reduceNValuesInBlock(T* smem, T threadVals[N], // then __syncthreads is needed either before or afterwards to prevent non-0 // threads overriding smem in the next loop before num-0 thread reads from it. 
template -__device__ T reduceBlock(T* smem, const unsigned int numVals, T threadVal, - ReduceOp reduceOp, T init) { - reduceNValuesInBlock(smem, &threadVal, numVals, reduceOp, - init); +__device__ T reduceBlock(T* smem, + const unsigned int numVals, + T threadVal, + ReduceOp reduceOp, + T init) { + reduceNValuesInBlock( + smem, &threadVal, numVals, reduceOp, init); return threadVal; } template -__global__ void GPUNLLLossForward2D_no_reduce( - T* out_data, const T* x_data, const int64_t* label_data, - const T* weight_data, const int64_t batch_size, const int64_t n_classes, - const int64_t in_dim2, const int64_t in_dim3, const int64_t ignore_index) { +__global__ void GPUNLLLossForward2D_no_reduce(T* out_data, + const T* x_data, + const int64_t* label_data, + const T* weight_data, + const int64_t batch_size, + const int64_t n_classes, + const int64_t in_dim2, + const int64_t in_dim3, + const int64_t ignore_index) { const int64_t map_size = in_dim2 * in_dim3; const int64_t sample_size = n_classes * map_size; const int64_t out_numel = batch_size * map_size; @@ -211,11 +227,16 @@ __global__ void GPUNLLLossForward2D_no_reduce( } template -__global__ void GPUNLLLossForward2D_with_reduce( - T* out_data, T* total_weight_data, const T* x_data, - const int64_t* label_data, const T* weight_data, const int64_t batch_size, - const int64_t n_classes, const int64_t map_nelem, - const int64_t blocks_per_sample, const int64_t ignore_index) { +__global__ void GPUNLLLossForward2D_with_reduce(T* out_data, + T* total_weight_data, + const T* x_data, + const int64_t* label_data, + const T* weight_data, + const int64_t batch_size, + const int64_t n_classes, + const int64_t map_nelem, + const int64_t blocks_per_sample, + const int64_t ignore_index) { __shared__ T partial_sums[kNumCUDAThreads]; int64_t i; T input_sum = 0; @@ -228,7 +249,8 @@ __global__ void GPUNLLLossForward2D_with_reduce( int64_t ioffset = sample * map_nelem * n_classes; int64_t step = blockDim.x * blocks_per_sample; for (i = (blockIdx.x % blocks_per_sample) * blockDim.x + threadIdx.x; - i < map_nelem; i += step) { + i < map_nelem; + i += step) { const int64_t cur_label = label_data[toffset + i]; if (cur_label != ignore_index) { PADDLE_ENFORCE(cur_label >= 0 && cur_label < n_classes, @@ -242,8 +264,8 @@ __global__ void GPUNLLLossForward2D_with_reduce( input_sum = reduceBlock(partial_sums, blockDim.x, input_sum, thrust::plus(), (T)0); __syncthreads(); - acc_weight = reduceBlock(partial_sums, blockDim.x, acc_weight, - thrust::plus(), (T)0); + acc_weight = reduceBlock( + partial_sums, blockDim.x, acc_weight, thrust::plus(), (T)0); if (threadIdx.x == 0) { paddle::platform::CudaAtomicAdd(total_weight_data, acc_weight); @@ -258,12 +280,14 @@ __global__ void GPUNLLLossForward2D_size_average(T* out_data, *out_data /= *total_weight_data; } } - template -__global__ void GPUNLLLossBackward1D_no_reduce( - T* dx_data, const int64_t* label_data, const T* weight_data, - const T* dout_data, const int64_t batch_size, const int64_t n_classes, - const int64_t ignore_index) { +__global__ void GPUNLLLossBackward1D_no_reduce(T* dx_data, + const int64_t* label_data, + const T* weight_data, + const T* dout_data, + const int64_t batch_size, + const int64_t n_classes, + const int64_t ignore_index) { CUDA_KERNEL_LOOP(i, batch_size) { const int64_t cur_label = label_data[i]; if (cur_label == ignore_index) { @@ -275,11 +299,15 @@ __global__ void GPUNLLLossBackward1D_no_reduce( } template -__global__ void GPUNLLLossBackward1D_with_reduce( - T* dx_data, const T* 
total_weight_data, const int64_t* label_data, - const T* weight_data, const T* dout_data, const int64_t batch_size, - const int64_t n_classes, const int64_t size_average, - const int64_t ignore_index) { +__global__ void GPUNLLLossBackward1D_with_reduce(T* dx_data, + const T* total_weight_data, + const int64_t* label_data, + const T* weight_data, + const T* dout_data, + const int64_t batch_size, + const int64_t n_classes, + const int64_t size_average, + const int64_t ignore_index) { if (*total_weight_data <= 0) { return; } @@ -295,10 +323,15 @@ __global__ void GPUNLLLossBackward1D_with_reduce( } template -__global__ void GPUNLLLossBackward2D_no_reduce( - T* dx_data, const int64_t* label_data, const T* weight_data, - const T* dout_data, const int64_t batch_size, const int64_t n_classes, - const int64_t in_dim2, const int64_t in_dim3, const int64_t ignore_index) { +__global__ void GPUNLLLossBackward2D_no_reduce(T* dx_data, + const int64_t* label_data, + const T* weight_data, + const T* dout_data, + const int64_t batch_size, + const int64_t n_classes, + const int64_t in_dim2, + const int64_t in_dim3, + const int64_t ignore_index) { const int64_t map_size = in_dim2 * in_dim3; const int64_t sample_size = n_classes * map_size; const int64_t out_numel = batch_size * map_size; @@ -319,10 +352,16 @@ __global__ void GPUNLLLossBackward2D_no_reduce( template __global__ void GPUNLLLossBackward2D_with_reduce( - T* dx_data, const T* total_weight_data, const int64_t* label_data, - const T* weight_data, const T* dout_data, const int64_t batch_size, - const int64_t n_classes, const int64_t map_nelem, - const int64_t blocks_per_sample, const int64_t size_average, + T* dx_data, + const T* total_weight_data, + const int64_t* label_data, + const T* weight_data, + const T* dout_data, + const int64_t batch_size, + const int64_t n_classes, + const int64_t map_nelem, + const int64_t blocks_per_sample, + const int64_t size_average, const int64_t ignore_index) { if (*total_weight_data <= 0) { return; @@ -334,7 +373,8 @@ __global__ void GPUNLLLossBackward2D_with_reduce( int toffset = sample * map_nelem; int ioffset = sample * map_nelem * n_classes; for (i = (blockIdx.x % blocks_per_sample) * blockDim.x + threadIdx.x; - i < map_nelem; i += step) { + i < map_nelem; + i += step) { const int64_t cur_label = label_data[toffset + i]; if (cur_label != ignore_index) { dx_data[ioffset + i + map_nelem * cur_label] = @@ -343,158 +383,4 @@ __global__ void GPUNLLLossBackward2D_with_reduce( } } -template -class NLLLossCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* weight = ctx.Input("Weight"); - auto* out = ctx.Output("Out"); - auto* total_weight = ctx.Output("Total_weight"); - auto ignore_index = ctx.Attr("ignore_index"); - auto reduction = ctx.Attr("reduction"); - - auto x_data = x->data(); - auto out_data = out->mutable_data(ctx.GetPlace()); - auto total_weight_data = total_weight->mutable_data(ctx.GetPlace()); - auto label_data = labels->data(); - auto weight_data = weight ? 
weight->data() : nullptr; -#ifdef PADDLE_WITH_HIP - hipMemset(total_weight_data, 0, sizeof(T)); -#else - cudaMemset(total_weight_data, 0, sizeof(T)); -#endif - auto x_dims = x->dims(); - auto batch_size = x_dims[0]; - auto n_classes = x_dims[1]; - int64_t size_average = (int64_t)(reduction == "mean"); - - if (x_dims.size() == 2) { - int blocks = NumBlocks(batch_size); - int threads = kNumCUDAThreads; - auto& dev_ctx = ctx.cuda_device_context(); - if (reduction == "none") { - GPUNLLLossForward1D_no_reduce< - T><<>>( - out_data, x_data, label_data, weight_data, batch_size, n_classes, - ignore_index); - } else { - GPUNLLLossForward1D_with_reduce< - T><<<1, NTHREADS, 0, dev_ctx.stream()>>>( - out_data, total_weight_data, x_data, label_data, weight_data, - batch_size, n_classes, size_average, ignore_index); - } - } else if (x_dims.size() == 4) { - const auto in_dim2 = x_dims[2]; - const auto in_dim3 = x_dims[3]; - const auto map_size = in_dim2 * in_dim3; - const auto out_numel = batch_size * in_dim2 * in_dim3; - int blocks = NumBlocks(out_numel); - int threads = kNumCUDAThreads; - auto& dev_ctx = ctx.cuda_device_context(); - if (reduction == "none") { - GPUNLLLossForward2D_no_reduce< - T><<>>( - out_data, x_data, label_data, weight_data, batch_size, n_classes, - in_dim2, in_dim3, ignore_index); - } else { - int blocks_per_sample = NumBlocks(map_size) / 128; - blocks_per_sample = (blocks_per_sample == 0) ? 1 : blocks_per_sample; - int total_blocks = blocks_per_sample * batch_size; - GPUNLLLossForward2D_with_reduce< - T><<>>( - out_data, total_weight_data, x_data, label_data, weight_data, - batch_size, n_classes, map_size, blocks_per_sample, ignore_index); - if (size_average) { - GPUNLLLossForward2D_size_average<<<1, 1, 0, dev_ctx.stream()>>>( - out_data, total_weight_data); - } - } - } - } -}; - -template -class NLLLossGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* weight = ctx.Input("Weight"); - auto* total_weight = ctx.Input("Total_weight"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto dx_data = dx->mutable_data(ctx.GetPlace()); - auto dout_data = dout->data(); - auto label_data = labels->data(); - auto weight_data = weight ? 
weight->data() : nullptr; - auto total_weight_data = total_weight->data(); - auto ignore_index = ctx.Attr("ignore_index"); - auto reduction = ctx.Attr("reduction"); -#ifdef PADDLE_WITH_HIP - hipMemset(dx_data, 0, dx->numel() * sizeof(T)); -#else - cudaMemset(dx_data, 0, dx->numel() * sizeof(T)); -#endif - - int64_t size_average = (int64_t)(reduction == "mean"); - auto x_dims = x->dims(); - auto batch_size = x_dims[0]; - auto n_classes = x_dims[1]; - - if (x_dims.size() == 2) { - int blocks = NumBlocks(batch_size); - int threads = kNumCUDAThreads; - auto& dev_ctx = ctx.cuda_device_context(); - if (reduction == "none") { - GPUNLLLossBackward1D_no_reduce< - T><<>>( - dx_data, label_data, weight_data, dout_data, batch_size, n_classes, - ignore_index); - } else { - GPUNLLLossBackward1D_with_reduce< - T><<<1, NTHREADS, 0, dev_ctx.stream()>>>( - dx_data, total_weight_data, label_data, weight_data, dout_data, - batch_size, n_classes, size_average, ignore_index); - } - } else if (x_dims.size() == 4) { - const auto in_dim2 = x_dims[2]; - const auto in_dim3 = x_dims[3]; - const auto map_size = in_dim2 * in_dim3; - const auto out_numel = batch_size * in_dim2 * in_dim3; - - int blocks = NumBlocks(out_numel); - int threads = kNumCUDAThreads; - auto& dev_ctx = ctx.cuda_device_context(); - if (reduction == "none") { - GPUNLLLossBackward2D_no_reduce< - T><<>>( - dx_data, label_data, weight_data, dout_data, batch_size, n_classes, - in_dim2, in_dim3, ignore_index); - } else { - int blocks_per_sample = NumBlocks(map_size) / 128; - blocks_per_sample = (blocks_per_sample == 0) ? 1 : blocks_per_sample; - int total_blocks = blocks_per_sample * batch_size; - GPUNLLLossBackward2D_with_reduce< - T><<>>( - dx_data, total_weight_data, label_data, weight_data, dout_data, - batch_size, n_classes, map_size, blocks_per_sample, size_average, - ignore_index); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - nll_loss, - ops::NLLLossCUDAKernel, - ops::NLLLossCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - nll_loss_grad, - ops::NLLLossGradCUDAKernel, - ops::NLLLossGradCUDAKernel); +} // namespace phi diff --git a/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu b/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu new file mode 100644 index 00000000000..9a2d9c6e479 --- /dev/null +++ b/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu @@ -0,0 +1,114 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
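The backward kernels in this file mirror the forward bookkeeping: only the slot of the target class receives a gradient. As an illustrative reference only (names are not from the patch), the 1-D case reduces to:

def nll_loss_1d_grad_reference(dout, label, weight, total_weight,
                               n_classes, reduction, ignore_index):
    # dout is per-sample for reduction='none', otherwise a single reduced value
    dx = [[0.0] * n_classes for _ in range(len(label))]
    for i, cur in enumerate(label):
        if cur == ignore_index:
            continue
        w = weight[cur] if weight is not None else 1.0
        g = dout[i] if reduction == 'none' else dout[0]
        if reduction == 'mean' and total_weight != 0:
            g = g / total_weight
        dx[i][cur] = -g * w
    return dx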
+ +#include "paddle/phi/kernels/nll_loss_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/nll_loss.h" + +namespace phi { +template +void NllLossGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& labels, + const DenseTensor& total_weight, + paddle::optional weight, + const DenseTensor& dout, + int64_t ignore_index, + const std::string& reduction, + DenseTensor* dx) { + auto dx_data = dev_ctx.template Alloc(dx); + auto dout_data = dout.data(); + auto label_data = labels.data(); + auto weight_data = weight.get_ptr() ? weight.get_ptr()->data() : nullptr; + auto total_weight_data = total_weight.data(); +#ifdef PADDLE_WITH_HIP + hipMemset(dx_data, 0, dx->numel() * sizeof(T)); +#else + cudaMemset(dx_data, 0, dx->numel() * sizeof(T)); +#endif + + int64_t size_average = (int64_t)(reduction == "mean"); + auto x_dims = x.dims(); + auto batch_size = x_dims[0]; + auto n_classes = x_dims[1]; + + if (x_dims.size() == 2) { + int blocks = NumBlocks(batch_size); + int threads = kNumCUDAThreads; + if (reduction == "none") { + GPUNLLLossBackward1D_no_reduce< + T><<>>(dx_data, + label_data, + weight_data, + dout_data, + batch_size, + n_classes, + ignore_index); + } else { + GPUNLLLossBackward1D_with_reduce<<<1, NTHREADS, 0, dev_ctx.stream()>>>( + dx_data, + total_weight_data, + label_data, + weight_data, + dout_data, + batch_size, + n_classes, + size_average, + ignore_index); + } + } else if (x_dims.size() == 4) { + const auto in_dim2 = x_dims[2]; + const auto in_dim3 = x_dims[3]; + const auto map_size = in_dim2 * in_dim3; + const auto out_numel = batch_size * in_dim2 * in_dim3; + + int blocks = NumBlocks(out_numel); + int threads = kNumCUDAThreads; + if (reduction == "none") { + GPUNLLLossBackward2D_no_reduce< + T><<>>(dx_data, + label_data, + weight_data, + dout_data, + batch_size, + n_classes, + in_dim2, + in_dim3, + ignore_index); + } else { + int blocks_per_sample = NumBlocks(map_size) / 128; + blocks_per_sample = (blocks_per_sample == 0) ? 1 : blocks_per_sample; + int total_blocks = blocks_per_sample * batch_size; + GPUNLLLossBackward2D_with_reduce< + T><<>>(dx_data, + total_weight_data, + label_data, + weight_data, + dout_data, + batch_size, + n_classes, + map_size, + blocks_per_sample, + size_average, + ignore_index); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + nll_loss_grad, GPU, ALL_LAYOUT, phi::NllLossGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/nll_loss_kernel.cu b/paddle/phi/kernels/gpu/nll_loss_kernel.cu new file mode 100644 index 00000000000..6b0e1fef7ba --- /dev/null +++ b/paddle/phi/kernels/gpu/nll_loss_kernel.cu @@ -0,0 +1,116 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
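For the 4-D case handled below, the input has shape [N, C, H, W] and the label has shape [N, H, W]; each spatial location picks out its target class in the same way as the 1-D path. Illustrative formula only, not code from the patch:

# out[n, h, w] = -x[n, label[n, h, w], h, w] * weight[label[n, h, w]]
# (0 where label equals ignore_index); for reduction='mean' the summed loss
# is divided by the summed weights of the non-ignored locations.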
+ +#include "paddle/phi/kernels/nll_loss_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/nll_loss.h" + +namespace phi { + +template +void NllLossRawKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + paddle::optional weight, + int64_t ignore_index, + const std::string& reduction, + DenseTensor* out, + DenseTensor* total_weight) { + auto* x = &input; + auto x_data = x->data(); + auto out_data = dev_ctx.template Alloc(out); + auto total_weight_data = dev_ctx.template Alloc(total_weight); + auto label_data = label.data(); + auto weight_data = weight.get_ptr() ? weight.get_ptr()->data() : nullptr; +#ifdef PADDLE_WITH_HIP + hipMemset(total_weight_data, 0, sizeof(T)); +#else + cudaMemset(total_weight_data, 0, sizeof(T)); +#endif + auto x_dims = x->dims(); + auto batch_size = x_dims[0]; + auto n_classes = x_dims[1]; + int64_t size_average = (int64_t)(reduction == "mean"); + + if (x_dims.size() == 2) { + int blocks = NumBlocks(batch_size); + int threads = kNumCUDAThreads; + if (reduction == "none") { + GPUNLLLossForward1D_no_reduce< + T><<>>(out_data, + x_data, + label_data, + weight_data, + batch_size, + n_classes, + ignore_index); + } else { + GPUNLLLossForward1D_with_reduce<<<1, NTHREADS, 0, dev_ctx.stream()>>>( + out_data, + total_weight_data, + x_data, + label_data, + weight_data, + batch_size, + n_classes, + size_average, + ignore_index); + } + } else if (x_dims.size() == 4) { + const auto in_dim2 = x_dims[2]; + const auto in_dim3 = x_dims[3]; + const auto map_size = in_dim2 * in_dim3; + const auto out_numel = batch_size * in_dim2 * in_dim3; + int blocks = NumBlocks(out_numel); + int threads = kNumCUDAThreads; + if (reduction == "none") { + GPUNLLLossForward2D_no_reduce< + T><<>>(out_data, + x_data, + label_data, + weight_data, + batch_size, + n_classes, + in_dim2, + in_dim3, + ignore_index); + } else { + int blocks_per_sample = NumBlocks(map_size) / 128; + blocks_per_sample = (blocks_per_sample == 0) ? 1 : blocks_per_sample; + int total_blocks = blocks_per_sample * batch_size; + GPUNLLLossForward2D_with_reduce< + T><<>>(out_data, + total_weight_data, + x_data, + label_data, + weight_data, + batch_size, + n_classes, + map_size, + blocks_per_sample, + ignore_index); + if (size_average) { + GPUNLLLossForward2D_size_average<<<1, 1, 0, dev_ctx.stream()>>>( + out_data, total_weight_data); + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + nll_loss, GPU, ALL_LAYOUT, phi::NllLossRawKernel, float, double) {} diff --git a/paddle/phi/kernels/nll_loss_grad_kernel.h b/paddle/phi/kernels/nll_loss_grad_kernel.h new file mode 100644 index 00000000000..127dc2f961f --- /dev/null +++ b/paddle/phi/kernels/nll_loss_grad_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void NllLossGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& label, + const DenseTensor& total_weight, + paddle::optional weight, + const DenseTensor& d_out, + int64_t ignore_index, + const std::string& reduction, + DenseTensor* d_x); +} // namespace phi diff --git a/paddle/phi/kernels/nll_loss_kernel.cc b/paddle/phi/kernels/nll_loss_kernel.cc new file mode 100644 index 00000000000..b271f0f4d06 --- /dev/null +++ b/paddle/phi/kernels/nll_loss_kernel.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/nll_loss_kernel.h" + +namespace phi { +template +void NllLossKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + paddle::optional weight, + int64_t ignore_index, + const std::string& reduction, + DenseTensor* out) { + DenseTensor total_weight; + total_weight.set_meta( + DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), {1})); + dev_ctx.template Alloc(total_weight); + NllLossRawKernel(dev_ctx, + input, + label, + weight, + ignore_index, + reduction, + out, + &total_weight); +} +} // namespace phi + +// TODO(xiongkun): add the non-raw kernel register here. diff --git a/paddle/phi/kernels/nll_loss_kernel.h b/paddle/phi/kernels/nll_loss_kernel.h new file mode 100644 index 00000000000..90083e1d684 --- /dev/null +++ b/paddle/phi/kernels/nll_loss_kernel.h @@ -0,0 +1,33 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void NllLossRawKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + paddle::optional weight, + int64_t ignore_index, + const std::string& reduction, + DenseTensor* out, + DenseTensor* total_weight); + +} // namespace phi diff --git a/paddle/phi/ops/compat/nll_loss_sig.cc b/paddle/phi/ops/compat/nll_loss_sig.cc new file mode 100644 index 00000000000..f274d7f77c5 --- /dev/null +++ b/paddle/phi/ops/compat/nll_loss_sig.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature NllLossOpArgumentMapping(const ArgumentMappingContext& ctx) { + // TODO(xiongkun): can't remove the forward mapping, because the Weight is + // optional + return KernelSignature("nll_loss", + {"X", "Label", "Weight"}, + {"ignore_index", "reduction"}, + {"Out", "Total_weight"}); +} + +KernelSignature NllLossGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "nll_loss_grad", + {"X", "Label", "Total_weight", "Weight", GradVarName("Out")}, + {"ignore_index", "reduction"}, + {GradVarName("X")}); +} + +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(nll_loss_grad, phi::NllLossGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(nll_loss, phi::NllLossOpArgumentMapping); -- GitLab From 7b18c55b1663bc3fc25818c7ecaf0c3e143ed352 Mon Sep 17 00:00:00 2001 From: wawltor Date: Wed, 9 Mar 2022 15:30:12 +0800 Subject: [PATCH 141/261] fix the document of ones_like, zeros_like (#40233) --- python/paddle/tensor/creation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index bddc45bc961..6555ba0812d 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -283,7 +283,7 @@ def ones_like(x, dtype=None, name=None): Args: x(Tensor): The input tensor which specifies shape and dtype. The dtype of ``x`` can be bool, float16, float32, float64, int32, int64. - dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of the + dtype(str|np.dtype, optional): The data type of the output tensor. Supported data types: bool, float16, float32, float64, int32, int64. If ``dtype`` is None, the data type is the same as ``x``. Default is None. @@ -358,7 +358,7 @@ def zeros_like(x, dtype=None, name=None): Args: x(Tensor): The input tensor which specifies shape and dtype. The dtype of ``x`` can be bool, float16, float32, float64, int32, int64. - dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of the + dtype(str|np.dtype, optional): The data type of the output tensor. Supported data types: bool, float16, float32, float64, int32, int64. If ``dtype`` is None, the data type is the same as ``x``. Default is None. 
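For reference alongside this docstring-only change, a minimal usage sketch (not part of the patch itself) of the two dtype forms the updated docstring documents, a str and an np.dtype; the input tensor and dtypes here are arbitrary examples:

    import numpy as np
    import paddle

    x = paddle.to_tensor([1.0, 2.0, 3.0])
    ones = paddle.ones_like(x, dtype='int32')                # dtype passed as a str
    zeros = paddle.zeros_like(x, dtype=np.dtype('float64'))  # dtype passed as an np.dtype
    print(ones.dtype, zeros.dtype)                           # expect int32 / float64

Passing no dtype keeps the dtype of x, as stated in the docstring.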
-- GitLab From 3e9601ba4943b36da375fdf50238474da760abab Mon Sep 17 00:00:00 2001 From: 0x45f <23097963+0x45f@users.noreply.github.com> Date: Wed, 9 Mar 2022 15:50:42 +0800 Subject: [PATCH 142/261] adapt run_program OP for eager (#40198) * adapt run_program OP for eager * fix program_id * refine code * fix test --- .../auto_code_generator/eager_generator.cc | 12 +- .../final_state_generator/eager_gen.py | 2 + .../eager/to_static/run_program_op_func.h | 82 +++ .../eager/to_static/run_program_op_node.h | 468 ++++++++++++++++++ .../fluid/pybind/custom_handwrite_op_funcs.h | 51 ++ .../pybind/eager_op_function_generator.cc | 24 +- paddle/fluid/pybind/eager_utils.cc | 60 +++ paddle/fluid/pybind/eager_utils.h | 7 + paddle/fluid/pybind/pybind.cc | 7 +- .../tests/unittests/test_eager_run_program.py | 120 +++++ 10 files changed, 823 insertions(+), 10 deletions(-) create mode 100644 paddle/fluid/eager/to_static/run_program_op_func.h create mode 100644 paddle/fluid/eager/to_static/run_program_op_node.h create mode 100644 paddle/fluid/pybind/custom_handwrite_op_funcs.h create mode 100644 python/paddle/fluid/tests/unittests/test_eager_run_program.py diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 2fc846cccc2..dc79a8a45a2 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -47,6 +47,9 @@ std::unordered_map> static std::unordered_map operators_with_attrs = {}; +/* --- Black Ops list that's NO NEED to apply code generation --- */ +static std::unordered_set black_ops_list = {"run_program"}; + static std::string LegalizeVariableName(const std::string& var_name) { std::string ret = var_name; std::replace(ret.begin(), ret.end(), '-', '_'); // replace all '-' to '_' @@ -73,12 +76,6 @@ static bool IgnoreGradAttribute(const std::string& op_type, } static void PrepareAttrMapForOps() { - // Handle "run_program_op" - static framework::ProgramDesc fake_prog; - operators_with_attrs["run_program"] = {}; - operators_with_attrs["run_program"]["global_block"] = - fake_prog.MutableBlock(0); - // Handle "fused_elemwise_add_activation" std::vector functor_list = {"a", "b"}; operators_with_attrs["fused_elemwise_add_activation"] = {}; @@ -2349,6 +2346,9 @@ static void DygraphCodeGeneration(const std::string& output_dir) { if (!CheckOpProto(op_proto)) continue; const std::string& op_type = op_proto->type(); + if (black_ops_list.count(op_type)) { + continue; + } /* ----------------------------- */ /* ---- Collect Information ---- */ diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 81d0c9b7bed..b594faa80a8 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -1000,6 +1000,7 @@ def GenerateNodeCCFile(filepath, node_definition_str): #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" +#include "paddle/fluid/eager/to_static/run_program_op_node.h" """ file_contents += node_definition_str @@ -1042,6 +1043,7 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str): #include "paddle/phi/api/all.h" #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/framework/op_registry.h" +#include 
"paddle/fluid/eager/to_static/run_program_op_func.h" """ file_contents += GenerateCoreOpInfoDeclaration() diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h new file mode 100644 index 00000000000..6f8bccd64e4 --- /dev/null +++ b/paddle/fluid/eager/to_static/run_program_op_func.h @@ -0,0 +1,82 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/eager_tensor.h" +#include "paddle/fluid/eager/to_static/run_program_op_node.h" +#include "paddle/fluid/eager/utils.h" + +inline void run_program_dygraph_function( + const std::vector& x, + const std::vector& params, + std::vector& out, // NOLINT + std::vector& step_scope, // NOLINT + std::vector& dout, // NOLINT + const paddle::framework::AttributeMap& attrs) { + VLOG(2) << "start run run_program"; + // Call forward function + RunProgramAPI(x, params, out, step_scope, dout, attrs); + VLOG(2) << "start run run_program grad"; + + // Prepare Autograd Meta + auto deref_out = details::DereferenceTensors(out); + std::vector p_autograd_x = + egr::EagerUtils::nullable_autograd_meta(x); + std::vector p_autograd_params = + egr::EagerUtils::nullable_autograd_meta(params); + std::vector p_autograd_outs = + egr::EagerUtils::nullable_autograd_meta(deref_out); + + bool trace_backward = egr::Controller::Instance().HasGrad(); + bool require_any_grad = egr::EagerUtils::ComputeRequireGrad( + trace_backward, &p_autograd_x, &p_autograd_params); + + if (require_any_grad) { + std::vector out_names; + for (auto& t : deref_out) { + out_names.emplace_back(t.name()); + } + + egr::EagerUtils::PassStopGradient(false, &p_autograd_outs); + // Create GradOpNode (1 means [out_grad], 2 means [x_grad, paramx_grad]) + auto grad_node = std::make_shared(1, 2); + + grad_node->SetFwdOutNames(out_names); + // Set Attributes + grad_node->SetAttrMap(attrs); + // Set TensorWrappers + grad_node->SetFwdX(x); + grad_node->SetFwdParams(params); + grad_node->SetStepScope(step_scope); + + // Set Grad out rank as same as fwd input and set stop gradient to bwd + grad_node->SetGradOutMeta(&p_autograd_x, /*slot id*/ 0); + grad_node->SetGradOutMeta(&p_autograd_params, /*slot id*/ 1); + + grad_node->SetGradInMeta(&p_autograd_outs, 0); + // Set Next Edges + grad_node->AddEdges(&p_autograd_x, /*slot id*/ 0); + grad_node->AddEdges(&p_autograd_params, /*slot id*/ 1); + + egr::EagerUtils::SetOutRankWithSlot(&p_autograd_outs, 0); + + // Set History for output set current Grad Node for + egr::EagerUtils::SetHistory(&p_autograd_outs, grad_node); + egr::EagerUtils::CheckAndRetainGrad(deref_out); + } +} diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h new file mode 100644 index 00000000000..ae5d86664a3 --- /dev/null +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -0,0 +1,468 @@ +// Copyright (c) 
2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/tensor_wrapper.h" + +#include "paddle/fluid/operators/run_program_op.h" +#include "paddle/fluid/platform/enforce.h" + +namespace details { +using Tensor = paddle::experimental::Tensor; + +static std::vector DereferenceTensors( + const std::vector &tensor_ptr) { + std::vector res; + for (auto *t : tensor_ptr) { + res.emplace_back(*t); + } + return res; +} + +static std::vector GetTensorsName(const std::vector &ins) { + std::vector in_names; + for (auto &in_t : ins) { + in_names.emplace_back(in_t.name()); + } + return in_names; +} + +static std::vector GetTensorsName( + const std::vector &ins) { + std::vector in_names; + for (auto *in_t : ins) { + in_names.emplace_back(in_t->name()); + } + return in_names; +} + +static void CheckInputVarStatus(const Tensor &tensor) { + PADDLE_ENFORCE_EQ( + tensor.defined() && phi::DenseTensor::classof(tensor.impl().get()), true, + paddle::platform::errors::InvalidArgument( + "The input tensor %s of " + "RunProgram(Grad)Op holds " + "wrong type. Expect type is DenseTensor.", + tensor.name())); + + PADDLE_ENFORCE_EQ(tensor.initialized(), true, + paddle::platform::errors::InvalidArgument( + "The tensor in input tensor %s of " + "RunProgram(Grad)Op " + "is not initialized.", + tensor.name())); +} + +static void CheckOutputVarStatus(const paddle::framework::Variable &src_var, + const Tensor &dst_tensor) { + auto name = dst_tensor.name(); + PADDLE_ENFORCE_EQ(dst_tensor.defined(), true, + paddle::platform::errors::InvalidArgument( + "dst_tensor shall be defined.")); + + if (phi::DenseTensor::classof(dst_tensor.impl().get())) { + auto &src_tensor = src_var.Get(); + PADDLE_ENFORCE_EQ(phi::DenseTensor::classof(&src_tensor), true, + paddle::platform::errors::InvalidArgument( + "The output tensor %s get from " + "RunProgram(Grad)Op's internal scope holds " + "wrong type. Expect type is DenseTensor", + name)); + PADDLE_ENFORCE_EQ(src_tensor.initialized(), true, + paddle::platform::errors::InvalidArgument( + "The tensor in output tensor %s get from " + "RunProgram(Grad)Op's internal " + "scope is not initialized.", + name)); + } else if (phi::SelectedRows::classof(dst_tensor.impl().get())) { + auto &src_tensor = src_var.Get(); + PADDLE_ENFORCE_EQ(phi::SelectedRows::classof(&src_tensor), true, + paddle::platform::errors::InvalidArgument( + "The output tensodfr %s get from " + "RunProgram(Grad)Op's internal scope holds " + "wrong type. 
Expect type is SelectedRows", + name)); + PADDLE_ENFORCE_EQ(src_tensor.initialized(), true, + paddle::platform::errors::InvalidArgument( + "The tensor in output tensor %s get from " + "RunProgram(Grad)Op's " + "internal scope is not initialized.", + name)); + + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "The RunProgram(Grad)Op only support output " + "variable of type LoDTensor or SelectedRows", + name)); + } +} + +static void ShareTensorsIntoScope(const std::vector &tensors, + paddle::framework::Scope *scope) { + for (size_t i = 0; i < tensors.size(); ++i) { + auto name = tensors[i].name(); + if (name == "Fake_var" || !tensors[i].is_initialized()) { + continue; + } + auto *var = scope->Var(name); + CheckInputVarStatus(tensors[i]); + // share tensor + auto tensor_base = tensors[i].impl(); + if (phi::DenseTensor::classof(tensor_base.get())) { + auto *dst_tensor = var->GetMutable(); + auto t = std::dynamic_pointer_cast(tensor_base); + *dst_tensor = *t; + } else if (phi::SelectedRows::classof(tensor_base.get())) { + auto *dst_tensor = var->GetMutable(); + auto t = std::dynamic_pointer_cast(tensor_base); + *dst_tensor = *t; + } + } +} + +static void ShareTensorsFromScope( + const std::vector &tensors, + const paddle::framework::BlockDesc &global_block, + paddle::framework::Scope *scope) { + for (size_t i = 0; i < tensors.size(); ++i) { + // NOTE: In case of setting out_tmp.stop_gradient = True in model code, all + // parameters before generating out_tmp have no @GRAD, it will raise error + // because we can't find them in scope. So we skip sharing these vars or + // var@GRAD if they don't appear in global block. + auto &name = tensors[i]->name(); + if (name == paddle::framework::kEmptyVarName || name == "Fake_var" || + !global_block.HasVar(name)) { + VLOG(2) << "find tensor name is " << name << ", skip it!"; + continue; + } + // NOTE: Here skip not found var is dangerous, if a bug is caused here, + // the result is grad calculation error, which will be very hidden! + auto *var = scope->FindVar(name); + PADDLE_ENFORCE_NOT_NULL(var, paddle::platform::errors::NotFound( + "The output tensor %s is not in " + "RunProgram(Grad)Op'" + "s internal scope.", + name)); + CheckOutputVarStatus(*var, *tensors[i]); + // share tensor + // TODO(dev): Determine Tensor type by scope.var + // auto tensor_base = tensors[i]->impl(); + // if (phi::DenseTensor::classof(tensor_base.get())) { + if (var->IsType()) { + auto &src_tensor = var->Get(); + auto *dst_tensor = const_cast( + dynamic_cast(tensors[i]->impl().get())); + VLOG(2) << "share " << name << " from scope"; + *dst_tensor = src_tensor; + } else if (var->IsType()) { + // } else if (phi::SelectedRows::classof(tensor_base.get())) { + auto &src_tensor = var->Get(); + auto *dst_tensor = const_cast( + dynamic_cast(tensors[i]->impl().get())); + *dst_tensor = src_tensor; + } + } +} + +} // namespace details + +inline void RunProgramAPI( + const std::vector &x, + const std::vector ¶ms, + std::vector &out, // NOLINT + std::vector &step_scope, // NOLINT + std::vector &dout, // NOLINT + const paddle::framework::AttributeMap &attrs) { + VLOG(2) << "RunProgramOpKernel Compute"; + auto start_op_index = BOOST_GET_CONST(int64_t, attrs.at("start_op_index")); + auto end_op_index = BOOST_GET_CONST(int64_t, attrs.at("end_op_index")); + auto is_test = BOOST_GET_CONST(bool, attrs.at("is_test")); + auto program_id = BOOST_GET_CONST(int64_t, attrs.at("program_id")); + + // NOTE(chenweihang): In order not to add new variable type, use vector + // here. 
Originally, here can use scope directly. + auto *out_scope_vec = &step_scope; + PADDLE_ENFORCE_EQ( + out_scope_vec->size(), 1, + paddle::platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should only hold one scope.")); + + // Step 2. prepare executor and init persistable variables + + // NOTE(Aurelius84): While training some models, forward can be called many + // times and then apply backpropagation all at once, such as Reinforcement + // Learning. Tensor data in multi-step training should be saved into single + // scope separately. Otherwise, the gradients can be miscalculated because + // always using the Tensor data of the last step in forward. + paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); + VLOG(2) << "The number of sub scopes before forward: " + << out_scope_vec->front()->kids().size(); + paddle::framework::Scope &scope = global_inner_scope->NewScope(); + + // share input_vars & parameters into scope + details::ShareTensorsIntoScope(x, &scope); + details::ShareTensorsIntoScope(params, &scope); + + auto *global_block = + BOOST_GET_CONST(paddle::framework::BlockDesc *, attrs.at("global_block")); + const auto &place = egr::Controller::Instance().GetExpectedPlace(); + + if (end_op_index > start_op_index) { + auto input_names = details::GetTensorsName(x); + auto output_names = details::GetTensorsName(out); + auto dout_names = details::GetTensorsName(dout); + auto *program = global_block->Program(); + + auto cache_info = paddle::framework::GetExecutorInfoFromCache( + *program, place, start_op_index, end_op_index, + /*is_grad=*/false, program_id, &scope); + auto ¶llel_executor = cache_info.first; + // all out_vars are skip_eager_var + auto &skip_eager_delete_vars = + paddle::framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( + program_id, false); + if (cache_info.second /*is_new_created*/) { + parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, input_names); + skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), + output_names.begin(), output_names.end()); + skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), + dout_names.begin(), dout_names.end()); + paddle::framework::details::ParseSafeEagerDeletionSkipVars( + *program, end_op_index, output_names, &skip_eager_delete_vars); + } + + // Step 3. run ops + parallel_executor->RunWithoutFetch(skip_eager_delete_vars); + } + // Step 4. Get Output + details::ShareTensorsFromScope(out, *global_block, &scope); + details::ShareTensorsFromScope(dout, *global_block, &scope); + + // Debug info: scope info when run end + VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front()); + // Step 5. Drop all children scopes while testing. + if (is_test) { + out_scope_vec->front()->DropKids(); + } + VLOG(2) << "The number of sub scopes after forward: " + << out_scope_vec->front()->kids().size(); + // #ifdef PADDLE_WITH_MKLDNN + // if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place); + // #endif +} + +inline void RunProgramGradAPI( + const std::vector &x, + const std::vector ¶ms, + const std::vector &out_grad, + const std::vector &step_scope, // NOLINT + const paddle::framework::AttributeMap &attrs, + std::vector &x_grad, // NOLINT + std::vector ¶ms_grad // NOLINT + ) { + // if all output vars are set to stop_gradient, grad op no need to executed + if (x_grad.empty() && params_grad.empty()) return; + + // TODO(dev): Remove this line hard code. And need to deal with the out_grad + // name problem. 
+ // const_cast(out_grad[0]) + // .set_name("matmul_v2_0.tmp_0@GRAD"); + + auto *global_block = + BOOST_GET_CONST(paddle::framework::BlockDesc *, attrs.at("global_block")); + auto orig_end_op_index = BOOST_GET_CONST(int64_t, attrs.at("end_op_index")); + + auto program_id = BOOST_GET_CONST(int64_t, attrs.at("program_id")); + // NOTE: skip `shape` and `fill_constant` op created by + // fluid.backward.gradients, one forward output will generate one `shape` + // and `fill_constant` + int64_t start_op_index = orig_end_op_index + (out_grad.size() * 2); + int64_t end_op_index = global_block->OpSize(); + + auto *out_scope_vec = &step_scope; + PADDLE_ENFORCE_EQ( + out_scope_vec->size(), 1, + paddle::platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should only hold one scope.")); + + paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); + auto sub_scope_num = global_inner_scope->kids().size(); + VLOG(2) << "The number of sub scopes before backward: " << sub_scope_num; + PADDLE_ENFORCE_GT(sub_scope_num, 0, + paddle::platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should hold at " + "least one sub scope.")); + + auto &scope = *(global_inner_scope->kids().front()); + const auto &place = egr::Controller::Instance().GetExpectedPlace(); + + if (end_op_index > start_op_index) { + auto out_grad_names = details::GetTensorsName(out_grad); + // NOTE: after PR22939 [Add double grad] merged, the grad op maker's + // SetOutput will set to None if the input var stop_gradient=True, + // it will cause an NotFound error when ctx.OutputNames() is called + std::vector x_grad_names; + std::vector param_grad_names; + if (!x_grad.empty()) { + x_grad_names = details::GetTensorsName(x_grad); + } + if (!params_grad.empty()) { + param_grad_names = details::GetTensorsName(params_grad); + } + + // Step 2. prepare executor and scope + auto *program = global_block->Program(); + auto cache_info = paddle::framework::GetExecutorInfoFromCache( + *program, place, start_op_index, end_op_index, + /*is_grad*/ true, program_id, &scope); + auto ¶llel_executor = cache_info.first; + + auto &skip_eager_delete_vars = + paddle::framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( + program_id, true); + if (cache_info.second /*is_new_created*/) { + parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, out_grad_names); + + skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), + x_grad_names.begin(), x_grad_names.end()); + paddle::framework::details::AppendSkipDeletionVars( + param_grad_names, &skip_eager_delete_vars); + } + + details::ShareTensorsIntoScope(out_grad, &scope); + // Debug info: scope info when run end + VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front()); + + // Step 3. run ops + parallel_executor->RunWithoutFetch( + /*skip_eager_delete_vars=*/skip_eager_delete_vars); + } + + // Step 4. get outputs + details::ShareTensorsFromScope(x_grad, *global_block, &scope); + details::ShareTensorsFromScope(params_grad, *global_block, &scope); + + // Step5. 
drop current scope + // global_inner_scope->DeleteScope(&scope); + VLOG(2) << "The number of sub scopes after backward: " + << global_inner_scope->kids().size(); +} + +class GradNodeRunProgram : public egr::GradNodeBase { + public: + GradNodeRunProgram(size_t bwd_in_slot_num, size_t bwd_out_slot_num) + : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {} + + ~GradNodeRunProgram() override = default; + // Functor: perform backward computations + virtual std::vector> operator()( + const std::vector> &grads) + override { + VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram"; + PADDLE_ENFORCE_EQ( + grads.size(), 1, + paddle::platform::errors::InvalidArgument( + "The out_grads.size() of RunProgramGradOp should be equal to 1.")); + + VLOG(3) << "out_grads[0].size() : " << grads[0].size(); + std::vector x_grad; + std::vector params_grad; + ConstructGradTensors(x_, &x_grad); + ConstructGradTensors(params_, ¶ms_grad); + std::vector x_grad_ptr; + std::vector params_grad_ptr; + for (auto &i : x_grad) { + x_grad_ptr.emplace_back(&i); + } + for (auto &i : params_grad) { + params_grad_ptr.emplace_back(&i); + } + + // auto x_grad_ptr = ConstructGradTensors(x_); + // auto params_grad_ptr = ConstructGradTensors(params_); + + PADDLE_ENFORCE_EQ( + grads[0].size(), fwd_out_names_.size(), + paddle::platform::errors::InvalidArgument( + "The grads[0].size() and fwd_out_names_.size() should be equal.")); + for (size_t i = 0; i < fwd_out_names_.size(); ++i) { + const_cast(grads[0][i]) + .set_name(fwd_out_names_[i] + "@GRAD"); + } + + RunProgramGradAPI(x_, params_, grads[0], step_scope_, attrs_, x_grad_ptr, + params_grad_ptr); + VLOG(3) << "End Eager Backward Node: GradNodeRunProgram"; + return {x_grad, params_grad}; + // return {x_grad, details::DereferenceTensors(params_grad_ptr)}; + } + + // SetAttrMap + void SetAttrMap(const paddle::framework::AttributeMap &attrs) { + attrs_ = attrs; + } + + void SetFwdX(const std::vector &tensors) { + x_ = tensors; + } + + void SetFwdParams(const std::vector &tensors) { + params_ = tensors; + } + + void SetStepScope(const std::vector &scopes) { + step_scope_ = scopes; + } + + void SetFwdOutNames(std::vector out_names) { + fwd_out_names_ = out_names; + } + + protected: + void ConstructGradTensors( + const std::vector &fwd_tensors, + std::vector *grad_tensors) { + // TODO(dev): Need an elegant way to determine inforamtion of grad_tensor, + // such as: name, tensor type(DenseTensor or SelectedRows). + VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size(); + for (auto &fwd_t : fwd_tensors) { + grad_tensors->emplace_back(fwd_t.impl()); + auto &grad_t = grad_tensors->back(); + grad_t.set_name(fwd_t.name() + "@GRAD"); + } + } + + void ConstructGradTensors( + const std::vector &fwd_tensors) { + VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size(); + for (auto &fwd_t : fwd_tensors) { + auto grad_tesnor = egr::EagerUtils::unsafe_autograd_meta(fwd_t)->Grad(); + grad_tesnor.set_name(fwd_t.name() + "@GRAD"); + } + } + + private: + // TensorWrappers + std::vector x_; + std::vector params_; + std::vector step_scope_; + + std::vector fwd_out_names_; + + // Attribute Map + paddle::framework::AttributeMap attrs_; +}; diff --git a/paddle/fluid/pybind/custom_handwrite_op_funcs.h b/paddle/fluid/pybind/custom_handwrite_op_funcs.h new file mode 100644 index 00000000000..7a276df0d5b --- /dev/null +++ b/paddle/fluid/pybind/custom_handwrite_op_funcs.h @@ -0,0 +1,51 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include + +static PyObject *eager_api_run_program(PyObject *self, PyObject *args, + PyObject *kwargs) { + PyThreadState *tstate = nullptr; + try { + auto X = GetTensorListFromArgs("run_program", "X", args, 0, false); + auto Params = GetTensorListFromArgs("run_program", "Params", args, 1, true); + auto Out = GetTensorPtrListFromArgs("run_program", "Out", args, 2, false); + auto OutScope = + GetScopePtrListFromArgs("run_program", "OutScope", args, 3, false); + auto DOut = GetTensorPtrListFromArgs("run_program", "DOut", args, 4, true); + framework::AttributeMap attrs; + ConstructAttrMapFromPyArgs("run_program", args, 5, PyTuple_GET_SIZE(args), + attrs); + + tstate = PyEval_SaveThread(); + run_program_dygraph_function(X, Params, Out, OutScope, DOut, attrs); + std::cout << "end run_program_dygraph_function" << std::endl; + PyEval_RestoreThread(tstate); + tstate = nullptr; + } catch (...) { + if (tstate) { + PyEval_RestoreThread(tstate); + } + ThrowExceptionToPython(std::current_exception()); + } + Py_RETURN_NONE; +} + +static PyMethodDef CustomEagerFinalStateMethods[] = { + {"run_program", (PyCFunction)(void (*)(void))eager_api_run_program, + METH_VARARGS | METH_KEYWORDS, + "C++ interface function for run_program in dygraph."}, + + {nullptr, nullptr, 0, nullptr}}; diff --git a/paddle/fluid/pybind/eager_op_function_generator.cc b/paddle/fluid/pybind/eager_op_function_generator.cc index c15c171799f..102cdbb91ab 100644 --- a/paddle/fluid/pybind/eager_op_function_generator.cc +++ b/paddle/fluid/pybind/eager_op_function_generator.cc @@ -17,6 +17,7 @@ #include #include #include +#include #ifndef _WIN32 #include #endif @@ -129,6 +130,12 @@ static PyObject * %s(PyObject *self, PyObject *args, PyObject *kwargs) const char* PYBIND_ITEM_TEMPLATE = R"( {"%s", (PyCFunction)(void(*)(void))%s, METH_VARARGS | METH_KEYWORDS, "C++ interface function for %s in dygraph."},)"; +// These operators will skip automatical code generatrion and +// need to be handwritten in CUSTOM_HANDWRITE_OP_FUNC_FILE +std::unordered_set CUSTOM_HANDWRITE_OPS_SET = {"run_program"}; +const char* CUSTOM_HANDWRITE_OP_FUNC_FILE = + "#include \"paddle/fluid/pybind/custom_handwrite_op_funcs.h\"\n"; + // clang-format on static inline bool FindInsMap(const std::string& op_type, const std::string& in_name) { @@ -355,7 +362,7 @@ GenerateOpFunctions() { std::vector op_function_list, bind_function_list; auto& all_kernels = paddle::framework::OperatorWithKernel::AllOpKernels(); - + bool append_custom_head_file = false; for (auto& pair : op_info_map) { auto& op_info = pair.second; auto op_proto = op_info.proto_; @@ -363,7 +370,12 @@ GenerateOpFunctions() { continue; } auto& op_type = op_proto->type(); - // Skip ooerator which is not inherit form OperatorWithKernel, like while, + // Skip operators that will be handwriten in CUSTOM_HANDWRITE_OP_FUNC_FILE. 
+ if (CUSTOM_HANDWRITE_OPS_SET.count(op_type)) { + append_custom_head_file = true; + continue; + } + // Skip operator which is not inherit form OperatorWithKernel, like while, // since only OperatorWithKernel can run in dygraph mode. // if the phi lib contains op kernel, we still generate ops method if (!all_kernels.count(op_type) && @@ -380,6 +392,9 @@ GenerateOpFunctions() { op_function_list.emplace_back(std::move(op_function_str)); bind_function_list.emplace_back(std::move(bind_function_str)); } + if (append_custom_head_file) { + op_function_list.emplace_back(CUSTOM_HANDWRITE_OP_FUNC_FILE); + } return std::make_tuple(op_function_list, bind_function_list); } @@ -449,6 +464,11 @@ int main(int argc, char* argv[]) { << " PADDLE_THROW(platform::errors::Fatal (\"Add functions to " "core.eager.ops failed!\"));\n" << " }\n\n" + << " if (PyModule_AddFunctions(m.ptr(), CustomEagerFinalStateMethods) < " + "0) {\n" + << " PADDLE_THROW(platform::errors::Fatal (\"Add functions to " + "core.eager.ops failed!\"));\n" + << " }\n\n" << "}\n\n" << "} // namespace pybind\n" << "} // namespace paddle\n"; diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 0cfb08345b6..f4e148cf8dc 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope_guard.h" #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/operators/py_func_op.h" @@ -35,6 +36,7 @@ namespace pybind { extern PyTypeObject* p_tensor_type; +extern PyTypeObject* g_framework_scope_pytype; extern PyTypeObject* g_vartype_pytype; extern PyTypeObject* g_place_pytype; extern PyTypeObject* g_cudaplace_pytype; @@ -830,6 +832,64 @@ paddle::experimental::ScalarArray CastPyArg2ScalarArray( return paddle::experimental::ScalarArray({1}); } +paddle::framework::Scope* CastPyArg2ScopePtr(PyObject* obj) { + if (PyObject_IsInstance( + obj, reinterpret_cast(g_framework_scope_pytype))) { + return ::pybind11::handle(obj).cast(); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "PyObject can not be cast into framework::Scope")); + } +} + +std::vector GetScopePtrListFromArgs( + const std::string& op_type, const std::string& arg_name, PyObject* args, + ssize_t arg_idx, bool dispensable) { + PyObject* list = PyTuple_GET_ITEM(args, arg_idx); + if (list == nullptr) { + if (!dispensable) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of scope, but got " + "None", + op_type, arg_name, arg_idx)); + } + } + + std::vector result; + if (PyList_Check(list)) { + Py_ssize_t len = PyList_Size(list); + if (len == 0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of scope, but got " + "empty list", + op_type, arg_name, arg_idx)); + } + for (Py_ssize_t i = 0; i < len; i++) { + result.emplace_back(CastPyArg2ScopePtr(PyList_GetItem(list, i))); + } + } else if (PyTuple_Check(list)) { + Py_ssize_t len = PyTuple_Size(list); + if (len == 0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of scope, but got " + "empty list", + op_type, arg_name, arg_idx)); + } + for (Py_ssize_t i = 0; i < len; i++) { + result.emplace_back(CastPyArg2ScopePtr(PyList_GetItem(list, i))); + } 
+ } else if (list == Py_None) { + return {}; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensors, but got " + "%s", + op_type, arg_name, arg_idx, + (reinterpret_cast(list->ob_type))->tp_name)); + } + return result; +} + paddle::experimental::Backend CastPyArg2Backend(PyObject* obj, const std::string& op_type, ssize_t arg_pos) { diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index c5da1bb37af..966a920377b 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -20,6 +20,10 @@ limitations under the License. */ #include "pybind11/pybind11.h" #include "pybind11/stl.h" namespace paddle { +namespace framework { +class Scope; +} + namespace pybind { typedef struct { @@ -134,6 +138,9 @@ std::vector GetTensorPtrListFromArgs( ssize_t arg_idx, bool dispensable = false); // end of Slice related methods +std::vector GetScopePtrListFromArgs( + const std::string& op_type, const std::string& arg_name, PyObject* args, + ssize_t arg_idx, bool dispensable); } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index fcfc3e6a379..566e38b7a21 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -175,6 +175,7 @@ namespace paddle { namespace pybind { PyTypeObject *g_place_pytype = nullptr; +PyTypeObject *g_framework_scope_pytype = nullptr; PyTypeObject *g_cudaplace_pytype = nullptr; PyTypeObject *g_cpuplace_pytype = nullptr; PyTypeObject *g_xpuplace_pytype = nullptr; @@ -1352,7 +1353,7 @@ All parameter, weight, gradient are variables in Paddle. BindReader(&m); - py::class_(m, "_Scope", R"DOC( + py::class_ _Scope(m, "_Scope", R"DOC( Scope is an association of a name to Variable. All variables belong to Scope. Variables in a parent scope can be retrieved from local scope. @@ -1372,7 +1373,9 @@ All parameter, weight, gradient are variables in Paddle. param_array = np.full((height, row_numel), 5.0).astype("float32") param.set(param_array, place) - )DOC") + )DOC"); + g_framework_scope_pytype = reinterpret_cast(_Scope.ptr()); + _Scope .def("_remove_from_pool", [](Scope &self) { ScopePool::Instance().Remove(&self); }) .def("var", diff --git a/python/paddle/fluid/tests/unittests/test_eager_run_program.py b/python/paddle/fluid/tests/unittests/test_eager_run_program.py new file mode 100644 index 00000000000..fc6a5d60eca --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_run_program.py @@ -0,0 +1,120 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import numpy as np +from paddle import _C_ops +from paddle.fluid.framework import _test_eager_guard, Variable +from paddle.fluid import core +from paddle.fluid.layers.utils import _hash_with_id +import paddle.compat as cpt + +import unittest + + +def _append_backward_desc(main_program, outs): + # make sure all status of is_test are False in train mode. 
+ program = main_program.clone() + targets = [] + for out in outs: + if isinstance(out, Variable): + targets.append(program.global_block().var(out.name)) + + if targets: + paddle.fluid.backward.gradients(targets=targets, inputs=[]) + + return program + + +# def _set_grad_type(params, train_program): +# # NOTE: if user set sparse gradient mode, the param's gradient +# # will be SelectedRows, not LoDTensor. But tracer will just +# # set param grad VarBase by forward VarBase(LoDTensor) +# # If we don't change grad_var type here, RunProgramOp need +# # transform SelectedRows to LoDTensor forcibly, it may not +# # be user wanted result. +# for param in params: +# grad_name = param.name + core.grad_var_suffix() +# grad_var = train_program.desc.block(0).find_var( +# cpt.to_bytes(grad_name)) +# # NOTE: cannot find var desc maybe no problem, such as in batch_norm +# if grad_var is None: +# continue +# param._set_grad_type(grad_var.type()) + + +def _create_out(var): + assert isinstance(var, Variable) + var_desc = var.desc + varbase = None + if not core._in_eager_mode(): + var_base = core.VarBase(var_desc.dtype(), + var_desc.shape(), + var_desc.name(), var_desc.type(), False) + else: + var_base = core.eager.Tensor(var_desc.dtype(), + var_desc.shape(), + var_desc.name(), var_desc.type(), False) + return var_base + + +class TestRunProgram(unittest.TestCase): + def test_eager(self): + paddle.set_device('cpu') + paddle.enable_static() + # step 1: construct program + x = paddle.static.data(shape=[2, 4], name='x') + x.stop_gradient = False + y = paddle.static.data(shape=[4, 2], name='y') + y.stop_gradient = False + out = paddle.matmul(x, y) + + main_program = paddle.static.default_main_program() + program = _append_backward_desc(main_program, [out]) + + paddle.disable_static('cpu') + # step 2: call run_program in eager mode + with _test_eager_guard(): + x_t = paddle.ones([2, 4]) + x_t.name = "x" + x_t.stop_gradient = False + y_t = paddle.ones([4, 2]) + y_t.name = "y" + y_t.stop_gradient = False + + fake_var = paddle.zeros([1]) + fake_var.name = 'Fake_var' + + out_t = _create_out(out) + + scope = core.Scope() + attrs = ('global_block', program.desc.block(0), 'start_op_index', 0, + 'end_op_index', main_program.desc.block(0).op_size(), + 'is_test', False, 'program_id', _hash_with_id(program)) + + _C_ops.run_program([x_t, y_t], [fake_var], [out_t], [scope], + [fake_var], *attrs) + + loss = paddle.mean(out_t) + loss.backward() + + self.assertTrue(np.array_equal(np.ones([2, 2]) * 4, out_t.numpy())) + self.assertTrue( + np.array_equal(np.ones([2, 4]) * 0.5, x_t.grad.numpy())) + self.assertTrue( + np.array_equal(np.ones([4, 2]) * 0.5, y_t.grad.numpy())) + + +if __name__ == '__main__': + unittest.main() -- GitLab From 60b86b2ffb5fd442277fe81d7b1846a50eb2b599 Mon Sep 17 00:00:00 2001 From: zhangkaihuo Date: Wed, 9 Mar 2022 16:32:47 +0800 Subject: [PATCH 143/261] Sparse Conv3d gpu backward (#40143) Sparse conv3d backward(gpu) --- .../kernels/sparse/convolution_grad_kernel.h | 6 +- .../phi/kernels/sparse/convolution_kernel.h | 18 +- paddle/phi/kernels/sparse/cpu/convolution.h | 5 - .../sparse/cpu/convolution_grad_kernel.cc | 11 +- .../kernels/sparse/cpu/convolution_kernel.cc | 5 - .../phi/kernels/sparse/gpu/convolution.cu.h | 139 +++++++++++ .../sparse/gpu/convolution_grad_kernel.cu | 217 ++++++++++++++++++ .../kernels/sparse/gpu/convolution_kernel.cu | 172 +++----------- .../kernels/test_sparse_conv3d_dev_api.cc | 72 +++--- 9 files changed, 430 insertions(+), 215 deletions(-) create mode 100644 
paddle/phi/kernels/sparse/gpu/convolution.cu.h create mode 100644 paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu diff --git a/paddle/phi/kernels/sparse/convolution_grad_kernel.h b/paddle/phi/kernels/sparse/convolution_grad_kernel.h index 3ada3473355..f4265d303d7 100644 --- a/paddle/phi/kernels/sparse/convolution_grad_kernel.h +++ b/paddle/phi/kernels/sparse/convolution_grad_kernel.h @@ -45,8 +45,10 @@ std::vector Conv3dGrad(const Context& dev_ctx, const std::vector& dilations, const std::vector& strides, const int groups) { - DenseTensor x_grad = phi::Empty(dev_ctx); - DenseTensor kernel_grad = phi::Empty(dev_ctx); + DenseTensor x_grad = + phi::Empty(dev_ctx, DenseTensorMeta(x.dtype(), {1}, x.layout())); + DenseTensor kernel_grad = phi::Empty( + dev_ctx, DenseTensorMeta(kernel.dtype(), {1}, kernel.layout())); // TODO(zhangkaihuo): call InferMeta func here Conv3dGradKernel(dev_ctx, x, diff --git a/paddle/phi/kernels/sparse/convolution_kernel.h b/paddle/phi/kernels/sparse/convolution_kernel.h index 1c1e62c8306..cfb451afdcb 100644 --- a/paddle/phi/kernels/sparse/convolution_kernel.h +++ b/paddle/phi/kernels/sparse/convolution_kernel.h @@ -20,18 +20,6 @@ limitations under the License. */ #include "paddle/phi/kernels/empty_kernel.h" namespace phi { - -template -DenseTensor Empty(const Context& dev_ctx) { - phi::DenseTensor dense_out( - phi::make_intrusive( - dev_ctx.GetPlace()), - {paddle::experimental::CppTypeToDataType::Type(), - {-1}, - DataLayout::NCHW}); - return dense_out; -} - namespace sparse { struct Dims4D { @@ -149,8 +137,10 @@ SparseCooTensor Conv3d(const Context& dev_ctx, const std::vector& strides, const int groups, DenseTensor* rulebook) { - DenseTensor indices = phi::Empty(dev_ctx); - DenseTensor values = phi::Empty(dev_ctx); + DenseTensor indices = phi::Empty( + dev_ctx, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW)); + DenseTensor values = + phi::Empty(dev_ctx, DenseTensorMeta(x.dtype(), {1}, x.layout())); SparseCooTensor coo(indices, values, x.dims()); Conv3dKernel( dev_ctx, x, kernel, paddings, dilations, strides, groups, &coo, rulebook); diff --git a/paddle/phi/kernels/sparse/cpu/convolution.h b/paddle/phi/kernels/sparse/cpu/convolution.h index 1031f769179..bcb6db40788 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution.h +++ b/paddle/phi/kernels/sparse/cpu/convolution.h @@ -45,9 +45,6 @@ void ProductRuleBook(const Context& dev_ctx, const int64_t non_zero_num = x.nnz(); const auto& non_zero_indices = x.non_zero_indices(); const int* indices_ptr = non_zero_indices.data(); - dev_ctx.Alloc(counter_per_kernel, - counter_per_kernel->dtype(), - sizeof(int) * counter_per_kernel->numel()); int* counter_ptr = counter_per_kernel->data(); int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; memset(counter_ptr, 0, kernel_size * sizeof(int)); @@ -138,8 +135,6 @@ void UpdateRulebookAndOutIndex(const Context& dev_ctx, x.dtype(), {out_non_zero_num, out_channels}, x.layout()); phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta)); phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta)); - dev_ctx.Alloc( - &out_indices, out_indices.dtype(), out_indices.numel() * sizeof(int)); int* out_indices_ptr = out_indices.data(); int i = 0; for (auto it = out_indexs.begin(); it != out_indexs.end(); it++, i++) { diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc index cb6cf435435..6ee265a3296 100644 --- 
a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/sparse/cpu/convolution.h" namespace phi { @@ -60,15 +61,8 @@ void Conv3dGradKernel(const Context& dev_ctx, phi::DenseTensor out_grad_features = phi::Empty(dev_ctx, std::move(out_grad_features_meta)); - dev_ctx.Alloc( - &in_features, in_features.dtype(), sizeof(T) * in_features.numel()); T* in_features_ptr = in_features.data(); - dev_ctx.Alloc( - &d_x_features, d_x_features.dtype(), sizeof(T) * d_x_features.numel()); T* d_x_features_ptr = d_x_features.data(); - dev_ctx.Alloc(&out_grad_features, - out_grad_features.dtype(), - sizeof(T) * out_grad_features.numel()); T* out_grad_features_ptr = out_grad_features.data(); kernel_grad->Resize(kernel_dims); dev_ctx.Alloc( @@ -156,12 +150,11 @@ void Conv3dGradKernel(const Context& dev_ctx, } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(sparse_conv_grad, +PD_REGISTER_KERNEL(sparse_conv3d_grad, CPU, ALL_LAYOUT, phi::sparse::Conv3dGradKernel, float, double) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); - kernel->InputAt(3).SetDataLayout(phi::DataLayout::SPARSE_COO); } diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc index 93397d4c931..64ef068e03a 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc @@ -81,8 +81,6 @@ void Conv3dKernel(const Context& dev_ctx, phi::Empty(dev_ctx, std::move(in_features_meta)); phi::DenseTensor out_features = phi::Empty(dev_ctx, std::move(out_features_meta)); - dev_ctx.Alloc(&in_features, x.dtype(), sizeof(T) * in_features.numel()); - dev_ctx.Alloc(&out_features, x.dtype(), sizeof(T) * out_features.numel()); T* in_features_ptr = in_features.data(); T* out_features_ptr = out_features.data(); @@ -128,9 +126,6 @@ void Conv3dKernel(const Context& dev_ctx, } // 4. scatter - dev_ctx.Alloc(out->mutable_non_zero_elements(), - out->mutable_non_zero_elements()->dtype(), - sizeof(T) * in_features.numel()); T* out_values_ptr = out->mutable_non_zero_elements()->data(); memset(out_values_ptr, 0, sizeof(T) * out->nnz() * out_channels); Scatter(out_features_ptr, diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h new file mode 100644 index 00000000000..03a6aaa6894 --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -0,0 +1,139 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/kernels/funcs/index_impl.cu.h" +#include "paddle/phi/kernels/sparse/convolution_kernel.h" + +namespace phi { +namespace sparse { + +// TODO(zhangkaihuo): After the GatherCUDAKernel is migrated to phi, replace +// this kernel with phi::GatherCUDAKernel; +// Vectorization can be used to improve read and write bandwidth +/** + * brief: gather data from params according to indices + * params: the inputs + * indices: the indices you want to gather + * output: the outputs + * index_size: the size of indices + * slice_size: slice size corresponding to each index, here is the channel size +**/ +template +__global__ void GatherKernel(const T* params, + const IndexT* indices, + T* output, + size_t index_size, + size_t slice_size) { + CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { + int64_t indices_i = i / slice_size; + int64_t slice_i = i - indices_i * slice_size; // offset inside the slice + IndexT gather_i = indices[indices_i]; + int64_t params_i = gather_i * slice_size + slice_i; + *(output + i) = *(params + params_i); + } +} + +/** + * brief: scatter add + * input: the inputs + * unique_value: refer to UpdateIndexKernel notes + * out_index: the output feature index + * non_zero_num: the number of output features + * rulebook_len: the length of rulebook + * channels: the output channel size + * out: the outputs +**/ +template +__global__ void ScatterKernel(const T* input, + const int* unique_value, + const int* out_index, + const int non_zero_num, + const int rulebook_len, + const int channels, + T* out) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + for (int i = tid; i < non_zero_num * channels; i += gridDim.x * blockDim.x) { + int indices_i = i / channels; + int channels_i = i - indices_i * channels; + + int start = unique_value[indices_i]; + int end = indices_i == non_zero_num - 1 ? rulebook_len + : unique_value[indices_i + 1]; + // max(end-start) = kernel_size + T sum = static_cast(0); + for (int j = start; j < end; j++) { + const int out_feature_i = out_index[j]; + sum += input[out_feature_i * channels + channels_i]; + } + out[indices_i * channels + channels_i] = sum; + } +} + +template +inline int* SortedAndUniqueIndex(const Context& dev_ctx, + const int* rulebook_ptr, + const int len, + DenseTensor* out_index, + DenseTensor* unique_key, + DenseTensor* unique_value) { + phi::IndexKernel>( + dev_ctx, out_index, kps::IdentityFunctor()); + phi::IndexKernel>( + dev_ctx, unique_value, kps::IdentityFunctor()); + + phi::backends::gpu::GpuMemcpyAsync(unique_key->data(), + rulebook_ptr, + sizeof(int) * len, +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToDevice, +#else + cudaMemcpyDeviceToDevice, +#endif + dev_ctx.stream()); +// compared with thrust::sort_by_key, thrust::merge_by_key may achieved higher +// performance, but thrust::merge_by_key limited by data size +#ifdef PADDLE_WITH_HIP + thrust::sort_by_key(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::sort_by_key(thrust::cuda::par.on(dev_ctx.stream()), +#endif + unique_key->data(), + unique_key->data() + len, + out_index->data()); + + // 4. 
unique + thrust::pair new_end = +#ifdef PADDLE_WITH_HIP + thrust::unique_by_key(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::unique_by_key(thrust::cuda::par.on(dev_ctx.stream()), +#endif + unique_key->data(), + unique_key->data() + len, + unique_value->data()); + return new_end.first; +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu new file mode 100644 index 00000000000..861f18f36e6 --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -0,0 +1,217 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" +#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" + +namespace phi { +namespace sparse { + +// rulebook[3, rulebook_len]: +//[ +// [kernel_index], +// [in_i], +// [out_i], +//] +// x_grad = out_grad * transpose(kenrel) +// kernel_grad = transpose(x) * out_grad +template +void Conv3dGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const DenseTensor& kernel, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + DenseTensor* x_grad, + DenseTensor* kernel_grad) { + const auto& kernel_dims = kernel.dims(); + const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; + const int in_channels = kernel_dims[3]; + const int out_channels = kernel_dims[4]; + const int* rulebook_ptr = rulebook.data(); + + const int rulebook_len = rulebook.dims()[1]; + + DenseTensorMeta in_features_meta( + x.dtype(), {rulebook_len, in_channels}, DataLayout::NCHW); + DenseTensorMeta d_x_features_meta( + x.dtype(), {rulebook_len, in_channels}, DataLayout::NCHW); + DenseTensorMeta out_grad_features_meta( + x.dtype(), {rulebook_len, out_channels}, DataLayout::NCHW); + phi::DenseTensor in_features = + phi::Empty(dev_ctx, std::move(in_features_meta)); + phi::DenseTensor d_x_features = + phi::Empty(dev_ctx, std::move(d_x_features_meta)); + phi::DenseTensor out_grad_features = + phi::Empty(dev_ctx, std::move(out_grad_features_meta)); + + T* in_features_ptr = in_features.data(); + T* d_x_features_ptr = d_x_features.data(); + T* out_grad_features_ptr = out_grad_features.data(); + kernel_grad->Resize(kernel_dims); + dev_ctx.Alloc( + kernel_grad, kernel_grad->dtype(), kernel_grad->numel() * sizeof(T)); + T* d_kernel_ptr = kernel_grad->data(); + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, kernel_grad, static_cast(0.0f)); + + auto config = 
phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * in_channels, 1); + GatherKernel<<>>(x.non_zero_elements().data(), + rulebook_ptr + rulebook_len, + in_features_ptr, + rulebook_len, + in_channels); + + config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * out_channels, 1); + GatherKernel<<>>( + out_grad.non_zero_elements().data(), + rulebook_ptr + rulebook_len * 2, + out_grad_features_ptr, + rulebook_len, + out_channels); + + auto blas = phi::funcs::GetBlas(dev_ctx); + std::vector offsets(kernel_size + 1), counter(kernel_size, 0), + h_counter(rulebook_len, 0); + phi::backends::gpu::GpuMemcpyAsync(&h_counter[0], + rulebook_ptr, + rulebook_len * sizeof(int), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else + cudaMemcpyDeviceToHost, +#endif + + dev_ctx.stream()); + dev_ctx.Wait(); + + for (int i = 0; i < rulebook_len; i++) { + counter[h_counter[i]] += 1; + } + int offset = 0; + for (int i = 0; i < kernel_size; i++) { + offsets[i] = offset; + offset += counter[i]; + } + offsets[kernel_size] = offset; + + const T* kernel_ptr = kernel.data(); + for (int i = 0; i < kernel_size; i++) { + if (counter[i] <= 0) { + continue; + } + + const int M = counter[i]; + const int K = in_channels; + const int N = out_channels; + T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels; + T* tmp_out_grad_ptr = out_grad_features_ptr + offsets[i] * out_channels; + const T* tmp_kernel_ptr = kernel_ptr + i * in_channels * out_channels; + T* tmp_d_x_ptr = d_x_features_ptr + offsets[i] * out_channels; + T* tmp_d_kernel_ptr = d_kernel_ptr + i * in_channels * out_channels; + + // call gemm: d_kernel = transpose(x) * out_grad + // (in_channels, n) * (n, out_channels) + blas.GEMM(CblasTrans, + CblasNoTrans, + M, + N, + K, + static_cast(1), + tmp_in_ptr, + tmp_out_grad_ptr, + static_cast(0), + tmp_d_kernel_ptr); + + // call gemm: d_x = out_grad * transpose(kernel) + // (n, out_channels) * (out_channels, in_channels) + blas.GEMM(CblasNoTrans, + CblasTrans, + M, + K, + N, + static_cast(1), + tmp_out_grad_ptr, + tmp_kernel_ptr, + static_cast(0), + tmp_d_x_ptr); + } + + // 4. 
scatter + x_grad->Resize(x.non_zero_elements().dims()); + dev_ctx.Alloc(x_grad, x_grad->dtype(), sizeof(T) * x_grad->numel()); + T* x_grad_values_ptr = x_grad->data(); + + DenseTensor out_index = phi::Empty( + dev_ctx, + DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW)); + DenseTensor unique_key = phi::Empty( + dev_ctx, + DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW)); + DenseTensor unique_value = phi::Empty( + dev_ctx, + DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW)); + + SortedAndUniqueIndex(dev_ctx, + rulebook_ptr + rulebook_len, + rulebook_len, + &out_index, + &unique_key, + &unique_value); + + config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * in_channels, 1); + + ScatterKernel<<>>(d_x_features_ptr, + unique_value.data(), + out_index.data(), + x.nnz(), + rulebook_len, + in_channels, + x_grad_values_ptr); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_conv3d_grad, + GPU, + ALL_LAYOUT, + phi::sparse::Conv3dGradKernel, + float, + double, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index aeb9409c417..4a533d9d1d5 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -17,7 +17,6 @@ limitations under the License. */ #include #include -#include "glog/logging.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" @@ -28,19 +27,11 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/primitive/compute_primitives.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" +#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" namespace phi { namespace sparse { -// TODO(zhangkaihuo) replace this kernel with KP::InitWithDataIndex -__global__ void InitByIndexKernel(const int n, int* out1, int* out2) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - for (int i = tid; i < n; i += gridDim.x * blockDim.x) { - out1[i] = i; - out2[i] = i; - } -} - /** * @brief: update the out index and indices * unique_keys: save the index of the output feature list @@ -124,7 +115,7 @@ __global__ void ProductRuleBookKernel(const int* x_indices, int in_z = x_indices[i + non_zero_num]; int in_y = x_indices[i + 2 * non_zero_num]; int in_x = x_indices[i + 3 * non_zero_num]; - int in_i = -1, out_index = -1; + int in_i = -1, out_index = -1, kernel_i = -1; if (Check(x_dims, kernel_dims, paddings, @@ -143,9 +134,11 @@ __global__ void ProductRuleBookKernel(const int* x_indices, out_index = PointToIndex(batch, out_x, out_y, out_z, out_dims); atomicAdd(&counter_buf[kernel_index], 1); + kernel_i = kernel_index; } - rulebook[kernel_index * non_zero_num + i] = in_i; - rulebook[kernel_index * non_zero_num + offset + i] = out_index; + rulebook[kernel_index * non_zero_num + i] = kernel_i; + rulebook[kernel_index * non_zero_num + offset + i] = in_i; + rulebook[kernel_index * non_zero_num + offset * 2 + i] = out_index; ++kernel_index; } } @@ -157,68 +150,6 @@ __global__ void ProductRuleBookKernel(const int* x_indices, } } -// TODO(zhangkaihuo): After the GatherCUDAKernel is migrated to phi, replace -// this kernel with phi::GatherCUDAKernel; -// Vectorization can be used to improve read and write bandwidth -/** - * brief: gather 
data from params according to indices - * params: the inputs - * indices: the indices you want to gather - * output: the outputs - * index_size: the size of indices - * slice_size: slice size corresponding to each index, here is the channel size -**/ -template -__global__ void GatherKernel(const T* params, - const IndexT* indices, - T* output, - size_t index_size, - size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; // offset inside the slice - IndexT gather_i = indices[indices_i]; - int64_t params_i = gather_i * slice_size + slice_i; - *(output + i) = *(params + params_i); - } -} - -/** - * brief: scatter add - * input: the inputs - * unique_value: refer to UpdateIndexKernel notes - * out_index: the output feature index - * non_zero_num: the number of output features - * rulebook_len: the length of rulebook - * channels: the output channel size - * out: the outputs -**/ -template -__global__ void ScatterKernel(const T* input, - const int* unique_value, - const int* out_index, - const int non_zero_num, - const int rulebook_len, - const int channels, - T* out) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - for (int i = tid; i < non_zero_num * channels; i += gridDim.x * blockDim.x) { - int indices_i = i / channels; - int channels_i = i - indices_i * channels; - - int start = unique_value[indices_i]; - int end = indices_i == non_zero_num - 1 ? rulebook_len - : unique_value[indices_i + 1]; - // max(end-start) = kernel_size - T sum = static_cast(0); - for (int j = start; j < end; j++) { - const int out_feature_i = out_index[j]; - sum += input[out_feature_i * channels + channels_i]; - } - out[indices_i * channels + channels_i] = sum; - } -} - // brief: calculation the distance between start and end __global__ void DistanceKernel(const int* start, const int* end, @@ -264,16 +195,12 @@ int ProductRuleBook(const Context& dev_ctx, const int64_t non_zero_num = x.nnz(); const auto& non_zero_indices = x.non_zero_indices(); const int* indices_ptr = non_zero_indices.data(); - dev_ctx.Alloc(counter_per_kernel, - counter_per_kernel->dtype(), - sizeof(int) * counter_per_kernel->numel()); int* counter_ptr = counter_per_kernel->data(); - dev_ctx.Alloc(offsets_per_kernel, - offsets_per_kernel->dtype(), - sizeof(int) * offsets_per_kernel->numel()); int* offsets_ptr = offsets_per_kernel->data(); int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; - rulebook->ResizeAndAllocate({2, kernel_size * non_zero_num}); + const int rulebook_rows = 3; + const int rulebook_cols = kernel_size * non_zero_num; + rulebook->ResizeAndAllocate({rulebook_rows, rulebook_cols}); dev_ctx.Alloc(rulebook, rulebook->dtype(), sizeof(int) * rulebook->numel()); int* rulebook_ptr = rulebook->data(); @@ -312,7 +239,7 @@ int ProductRuleBook(const Context& dev_ctx, int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), #endif rulebook_ptr, - rulebook_ptr + 2 * kernel_size * non_zero_num, + rulebook_ptr + rulebook_rows * rulebook_cols, -1); #ifdef PADDLE_WITH_HIP @@ -350,6 +277,7 @@ int ProductRuleBook(const Context& dev_ctx, dev_ctx.Wait(); int rulebook_len = (*h_counter)[kernel_size - 1] + (*h_offsets)[kernel_size - 1]; + rulebook->Resize({rulebook_rows, rulebook_len}); // 3. 
sorted or merge the out index out_index->ResizeAndAllocate({rulebook_len}); @@ -365,66 +293,30 @@ int ProductRuleBook(const Context& dev_ctx, unique_key, unique_key->dtype(), sizeof(int) * unique_key->numel()); int* unique_key_ptr = unique_key->data(); - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); - InitByIndexKernel<<>>( - rulebook_len, out_index_ptr, unique_value_ptr); - -#ifdef PADDLE_WITH_HIP - phi::backends::gpu::GpuMemcpyAsync(unique_key_ptr, - rulebook_ptr + rulebook_len, - rulebook_len * sizeof(int), - hipMemcpyDeviceToDevice, - dev_ctx.stream()); -#else - phi::backends::gpu::GpuMemcpyAsync(unique_key_ptr, - rulebook_ptr + rulebook_len, - rulebook_len * sizeof(int), - cudaMemcpyDeviceToDevice, - dev_ctx.stream()); -#endif - -// compared with thrust::sort_by_key, thrust::merge_by_key may achieved higher -// performance, but thrust::merge_by_key limited by data size -#ifdef PADDLE_WITH_HIP - thrust::sort_by_key(thrust::hip::par.on(dev_ctx.stream()), -#else - thrust::sort_by_key(thrust::cuda::par.on(dev_ctx.stream()), -#endif - unique_key_ptr, - unique_key_ptr + rulebook_len, - out_index_ptr); - - // 4. unique - thrust::pair new_end = -#ifdef PADDLE_WITH_HIP - thrust::unique_by_key(thrust::hip::par.on(dev_ctx.stream()), -#else - thrust::unique_by_key(thrust::cuda::par.on(dev_ctx.stream()), -#endif - unique_key_ptr, - unique_key_ptr + rulebook_len, - unique_value_ptr); + int* new_end = SortedAndUniqueIndex(dev_ctx, + rulebook_ptr + 2 * rulebook_len, + rulebook_len, + out_index, + unique_key, + unique_value); // thrust::distance doesn't support stream parameters // const int out_non_zero_num = thrust::distance(unique_key_ptr, // new_end.first); DistanceKernel<<<1, 1>>>(unique_key_ptr, - new_end.first, - rulebook_ptr + 2 * kernel_size * non_zero_num - 1); + new_end, + rulebook_ptr + rulebook_rows * rulebook_cols - 1); int out_non_zero_num = 0; #ifdef PADDLE_WITH_HIP phi::backends::gpu::GpuMemcpyAsync( &out_non_zero_num, - rulebook_ptr + 2 * kernel_size * non_zero_num - 1, + rulebook_ptr + rulebook_rows * rulebook_cols - 1, sizeof(int), hipMemcpyDeviceToHost, dev_ctx.stream()); #else phi::backends::gpu::GpuMemcpyAsync( &out_non_zero_num, - rulebook_ptr + 2 * kernel_size * non_zero_num - 1, + rulebook_ptr + rulebook_rows * rulebook_cols - 1, sizeof(int), cudaMemcpyDeviceToHost, dev_ctx.stream()); @@ -440,8 +332,6 @@ int ProductRuleBook(const Context& dev_ctx, phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta)); phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta)); - dev_ctx.Alloc( - &out_indices, out_indices.dtype(), sizeof(int) * out_indices.numel()); int* out_indices_ptr = out_indices.data(); config = @@ -456,7 +346,7 @@ int ProductRuleBook(const Context& dev_ctx, rulebook_len, d_out_dims, out_indices_ptr, - rulebook_ptr + rulebook_len); + rulebook_ptr + 2 * rulebook_len); out->SetMember(out_indices, out_values, out_dims, true); return rulebook_len; } @@ -499,9 +389,12 @@ void Conv3dKernel(const Context& dev_ctx, DataType::INT32, {kernel_size}, DataLayout::NCHW); DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(offsets_meta)); - DenseTensor out_index = phi::Empty(dev_ctx); - DenseTensor unique_key = phi::Empty(dev_ctx); - DenseTensor unique_value = phi::Empty(dev_ctx); + DenseTensor out_index = phi::Empty( + dev_ctx, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW)); + DenseTensor unique_key = phi::Empty( + 
dev_ctx, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW)); + DenseTensor unique_value = phi::Empty( + dev_ctx, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW)); int n = ProductRuleBook(dev_ctx, x, @@ -522,6 +415,7 @@ void Conv3dKernel(const Context& dev_ctx, const int* counter_ptr = counter_per_kernel.data(); const int* offsets_ptr = counter_per_kernel.data(); + const int* rulebook_ptr = rulebook->data(); // 2. gather DenseTensorMeta in_features_meta( @@ -532,11 +426,7 @@ void Conv3dKernel(const Context& dev_ctx, phi::Empty(dev_ctx, std::move(in_features_meta)); phi::DenseTensor out_features = phi::Empty(dev_ctx, std::move(out_features_meta)); - dev_ctx.Alloc( - &in_features, in_features.dtype(), sizeof(T) * in_features.numel()); T* in_features_ptr = in_features.data(); - dev_ctx.Alloc( - &out_features, out_features.dtype(), sizeof(T) * out_features.numel()); T* out_features_ptr = out_features.data(); auto config = @@ -545,7 +435,7 @@ void Conv3dKernel(const Context& dev_ctx, config.thread_per_block.x, 0, dev_ctx.stream()>>>(x.non_zero_elements().data(), - rulebook->data(), + rulebook_ptr + n, in_features_ptr, n, in_channels); @@ -553,8 +443,6 @@ void Conv3dKernel(const Context& dev_ctx, // 3. call gemm for every werght auto blas = phi::funcs::GetBlas(dev_ctx); auto* out_values = out->mutable_non_zero_elements(); - dev_ctx.Alloc( - out_values, out_values->dtype(), sizeof(T) * out_values->numel()); T* out_values_ptr = out_values->data(); const T* kernel_ptr = kernel.data(); diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index ace95b55055..c1a8b853b32 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -78,9 +78,6 @@ void TestConv3dBase(const std::vector& indices, DenseTensor indices_tensor = phi::Empty( dev_ctx_cpu, DenseTensorMeta(DataType::INT32, {4, non_zero_num}, DataLayout::NCHW)); - dev_ctx_cpu.Alloc(&indices_tensor, - indices_tensor.dtype(), - sizeof(int) * indices_tensor.numel()); memcpy( indices_tensor.data(), indices.data(), indices.size() * sizeof(int)); DenseTensor features_tensor = phi::Empty( @@ -88,9 +85,6 @@ void TestConv3dBase(const std::vector& indices, DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), {non_zero_num, in_channels}, DataLayout::NHWC)); - dev_ctx_cpu.Alloc(&features_tensor, - features_tensor.dtype(), - features_tensor.numel() * sizeof(T)); memcpy( features_tensor.data(), features.data(), features.size() * sizeof(T)); @@ -101,12 +95,18 @@ void TestConv3dBase(const std::vector& indices, DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), kernel_dims, DataLayout::NHWC)); - dev_ctx_cpu.Alloc( - &kernel_tensor, kernel_tensor.dtype(), kernel_tensor.numel() * sizeof(T)); memcpy(kernel_tensor.data(), kernel.data(), kernel.size() * sizeof(T)); + auto f_verify = [&](const T* real_data, const std::vector& correct_data) { + for (uint64_t i = 0; i < correct_data.size(); i++) { + float tmp = std::fabs(static_cast(correct_data[i] - real_data[i])); + ASSERT_LT(tmp, diff); + } + }; + if (!std::is_same::value) { - DenseTensor rulebook = phi::Empty(dev_ctx_cpu); + DenseTensor rulebook = phi::Empty( + dev_ctx_cpu, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW)); SparseCooTensor out = sparse::Conv3d(dev_ctx_cpu, x_tensor, kernel_tensor, @@ -127,15 +127,6 @@ void TestConv3dBase(const std::vector& indices, correct_out_indices.size() * sizeof(int)); ASSERT_EQ(cmp_indices, 0); 
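(The angle-bracket template arguments in this test hunk do not survive in this rendering, e.g. std::vector& and static_cast(...). A plausible reconstruction of the shared verification lambda introduced above, assuming T is the test's element type and diff its float tolerance, is:)

auto f_verify = [&](const T* real_data, const std::vector<T>& correct_data) {
  // element-wise absolute difference against the expected values
  for (uint64_t i = 0; i < correct_data.size(); i++) {
    float tmp = std::fabs(static_cast<float>(correct_data[i] - real_data[i]));
    ASSERT_LT(tmp, diff);
  }
};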
- auto f_verify = [&](const T* real_data, - const std::vector& correct_data) { - for (uint64_t i = 0; i < correct_data.size(); i++) { - float tmp = - std::fabs(static_cast(correct_data[i] - real_data[i])); - ASSERT_LT(tmp, diff); - } - }; - f_verify(out.non_zero_elements().data(), correct_out_features); if (backward) { @@ -170,9 +161,6 @@ void TestConv3dBase(const std::vector& indices, DenseTensor d_indices_tensor = phi::Empty( dev_ctx_gpu, DenseTensorMeta(DataType::INT32, {4, non_zero_num}, DataLayout::NCHW)); - dev_ctx_gpu.Alloc(&d_indices_tensor, - d_indices_tensor.dtype(), - sizeof(int) * d_indices_tensor.numel()); phi::Copy( dev_ctx_gpu, indices_tensor, phi::GPUPlace(), true, &d_indices_tensor); @@ -181,9 +169,6 @@ void TestConv3dBase(const std::vector& indices, DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), {non_zero_num, in_channels}, DataLayout::NHWC)); - dev_ctx_gpu.Alloc(&d_features_tensor, - d_features_tensor.dtype(), - sizeof(T) * d_features_tensor.numel()); phi::Copy( dev_ctx_gpu, features_tensor, phi::GPUPlace(), true, &d_features_tensor); @@ -194,13 +179,11 @@ void TestConv3dBase(const std::vector& indices, DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), kernel_dims, DataLayout::NHWC)); - dev_ctx_gpu.Alloc(&d_kernel_tensor, - d_kernel_tensor.dtype(), - sizeof(T) * d_kernel_tensor.numel()); phi::Copy( dev_ctx_gpu, kernel_tensor, phi::GPUPlace(), true, &d_kernel_tensor); - DenseTensor d_rulebook = phi::Empty(dev_ctx_gpu); + DenseTensor d_rulebook = phi::Empty( + dev_ctx_gpu, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW)); SparseCooTensor d_out = sparse::Conv3d(dev_ctx_gpu, d_x_tensor, d_kernel_tensor, @@ -219,9 +202,6 @@ void TestConv3dBase(const std::vector& indices, DenseTensor h_indices_tensor = phi::Empty( dev_ctx_cpu, DenseTensorMeta(DataType::INT32, {4, d_out.nnz()}, DataLayout::NCHW)); - dev_ctx_cpu.Alloc(&h_indices_tensor, - h_indices_tensor.dtype(), - sizeof(int) * h_indices_tensor.numel()); phi::Copy(dev_ctx_gpu, d_out.non_zero_indices(), phi::CPUPlace(), @@ -239,18 +219,34 @@ void TestConv3dBase(const std::vector& indices, {d_out.nnz()}, d_out.layout())); - dev_ctx_cpu.Alloc(&h_features_tensor, - h_features_tensor.dtype(), - sizeof(T) * h_features_tensor.numel()); phi::Copy(dev_ctx_gpu, d_out.non_zero_elements(), phi::CPUPlace(), true, &h_features_tensor); - for (uint64_t i = 0; i < correct_out_features.size(); i++) { - float tmp = std::fabs(static_cast(correct_out_features[i] - - h_features_tensor.data()[i])); - ASSERT_LT(tmp, diff); + f_verify(h_features_tensor.data(), correct_out_features); + + if (backward) { + std::vector grads = sparse::Conv3dGrad(dev_ctx_gpu, + d_x_tensor, + d_rulebook, + d_kernel_tensor, + d_out, + paddings, + dilations, + strides, + 1); + DenseTensor h_features_grad = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta(grads[0].dtype(), grads[0].dims(), grads[0].layout())); + phi::Copy(dev_ctx_gpu, grads[0], phi::CPUPlace(), true, &h_features_grad); + f_verify(h_features_grad.data(), features_grad); + + DenseTensor h_kernel_grad = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta(grads[1].dtype(), grads[1].dims(), grads[1].layout())); + phi::Copy(dev_ctx_gpu, grads[1], phi::CPUPlace(), true, &h_kernel_grad); + f_verify(h_kernel_grad.data(), kernel_grad); } #endif } -- GitLab From 95c343d3c74ff3b2c0733d4f935d9995281d019b Mon Sep 17 00:00:00 2001 From: huangxu96 <46740794+huangxu96@users.noreply.github.com> Date: Wed, 9 Mar 2022 17:37:01 +0800 Subject: [PATCH 144/261] Fix a bug which might occur OOM problem 
(#40226) * Add wait after Copy * fix wrong place delete --- .../gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu | 2 ++ .../phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu | 1 + 2 files changed, 3 insertions(+) diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu index 6fc65006ae2..f61cd2c3967 100644 --- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu @@ -95,6 +95,7 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context &dev_ctx, norm, sizeof(T), dev_ctx.stream()); + dev_ctx.Wait(); auto eps = static_cast(1e-5); *norm_cpu_ptr = *norm_cpu_ptr > eps ? *norm_cpu_ptr : eps; @@ -102,6 +103,7 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context &dev_ctx, std::vector div_outs = {in_grad}; auto div_functor = DivFunctor(*norm_cpu_ptr); phi::funcs::ElementwiseKernel(dev_ctx, div_ins, &div_outs, div_functor); + delete norm_tensor; } delete counts_tensor; diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu index 4b6e5628c72..b0e9efe5bba 100644 --- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu +++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu @@ -95,6 +95,7 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context &dev_ctx, norm, sizeof(T), dev_ctx.stream()); + dev_ctx.Wait(); auto eps = static_cast(1e-5); *norm_cpu_ptr = *norm_cpu_ptr > eps ? *norm_cpu_ptr : eps; -- GitLab From ec582895501b1ae4da110ce6b9fcb61ddcacb718 Mon Sep 17 00:00:00 2001 From: wawltor Date: Wed, 9 Mar 2022 17:57:32 +0800 Subject: [PATCH 145/261] fix the full_like with fill the value of inf (#40232) * fix the full_like with fill the value of inf * update the test case for the fill_any_like * updae the comments for the full_like --- paddle/phi/kernels/cpu/full_kernel.cc | 20 ++++++++++++++----- paddle/phi/kernels/gpu/full_kernel.cu | 20 ++++++++++++++----- .../tests/unittests/test_fill_any_like_op.py | 13 ------------ .../tests/unittests/test_full_like_op.py | 9 +++++++++ 4 files changed, 39 insertions(+), 23 deletions(-) diff --git a/paddle/phi/kernels/cpu/full_kernel.cc b/paddle/phi/kernels/cpu/full_kernel.cc index 86576a861aa..556de3adcf4 100644 --- a/paddle/phi/kernels/cpu/full_kernel.cc +++ b/paddle/phi/kernels/cpu/full_kernel.cc @@ -54,12 +54,22 @@ void FullLikeKernel(const Context& dev_ctx, auto common_type_value = static_cast(value); - PADDLE_ENFORCE_EQ( - (common_type_value >= + // Check whether the filled value is valid + bool is_out_range = true; + if (std::isinf(value) || std::isnan(value)) { + is_out_range = false; + } + + if ((common_type_value >= static_cast(std::numeric_limits::lowest())) && - (common_type_value <= - static_cast(std::numeric_limits::max())), - true, + (common_type_value <= + static_cast(std::numeric_limits::max()))) { + is_out_range = false; + } + + PADDLE_ENFORCE_EQ( + is_out_range, + false, phi::errors::InvalidArgument( "The filled value is out of range for target type, " "current kernel type is %s, the range should between %f " diff --git a/paddle/phi/kernels/gpu/full_kernel.cu b/paddle/phi/kernels/gpu/full_kernel.cu index a905979f08b..852d209ee01 100644 --- a/paddle/phi/kernels/gpu/full_kernel.cu +++ b/paddle/phi/kernels/gpu/full_kernel.cu @@ -71,12 +71,22 @@ void FullLikeKernel(const Context& dev_ctx, auto 
common_type_value = static_cast(value); - PADDLE_ENFORCE_EQ( - (common_type_value >= + // Check whether the filled value is valid + bool is_out_range = true; + if (std::isinf(value) || std::isnan(value)) { + is_out_range = false; + } + + if ((common_type_value >= static_cast(std::numeric_limits::lowest())) && - (common_type_value <= - static_cast(std::numeric_limits::max())), - true, + (common_type_value <= + static_cast(std::numeric_limits::max()))) { + is_out_range = false; + } + + PADDLE_ENFORCE_EQ( + is_out_range, + false, phi::errors::InvalidArgument( "The filled value is out of range for target type, " "current kernel type is %s, the range should between %f " diff --git a/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py index 9be2e57ff0c..95537d43327 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py @@ -98,19 +98,6 @@ class TestFillAnyLikeOpType(TestFillAnyLikeOp): } -class TestFillAnyLikeOpOverflow(TestFillAnyLikeOp): - def init(self): - self.value = 1e100 - - def test_check_output(self): - exception = None - try: - self.check_output(check_dygraph=False) - except ValueError as ex: - exception = ex - self.assertIsNotNone(exception) - - class TestFillAnyLikeOpFloat16(TestFillAnyLikeOp): def init(self): self.dtype = np.float16 diff --git a/python/paddle/fluid/tests/unittests/test_full_like_op.py b/python/paddle/fluid/tests/unittests/test_full_like_op.py index be6abb17c3c..3ae2e9ff6bd 100644 --- a/python/paddle/fluid/tests/unittests/test_full_like_op.py +++ b/python/paddle/fluid/tests/unittests/test_full_like_op.py @@ -62,6 +62,15 @@ class TestFullOp(unittest.TestCase): self.assertTrue((out.numpy() == out_numpy).all(), True) paddle.enable_static() + def test_full_like_fill_inf(self): + paddle.disable_static() + input = paddle.arange(6, 10, dtype='float32') + out = paddle.full_like(input, fill_value=float('inf')) + out_numpy = np.random.random((4)).astype("float32") + out_numpy.fill(float('inf')) + self.assertTrue((out.numpy() == out_numpy).all(), True) + paddle.enable_static() + class TestFullOpError(unittest.TestCase): def test_errors(self): -- GitLab From 2aca8d90813170d364ed0dde6580ffc08451597a Mon Sep 17 00:00:00 2001 From: crystal <62974595+Zjq9409@users.noreply.github.com> Date: Wed, 9 Mar 2022 18:51:47 +0800 Subject: [PATCH 146/261] =?UTF-8?q?=E3=80=90phi=E3=80=91migrate=20eigh=20o?= =?UTF-8?q?p=20to=20phi=20(#40213)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * migrate eigh to phi * optimize code * modify code according to comment * conflict resolution --- paddle/fluid/operators/eigh_op.cc | 63 +-- paddle/fluid/operators/eigh_op.cu | 32 -- paddle/fluid/operators/eigh_op.h | 74 ---- paddle/phi/infermeta/unary.cc | 32 ++ paddle/phi/infermeta/unary.h | 5 + paddle/phi/kernels/CMakeLists.txt | 3 +- paddle/phi/kernels/cpu/eigh_grad_kernel.cc | 28 ++ paddle/phi/kernels/cpu/eigh_kernel.cc | 43 ++ paddle/phi/kernels/eigh_grad_kernel.h | 29 ++ paddle/phi/kernels/eigh_kernel.h | 29 ++ .../kernels/funcs/values_vectors_functor.h | 386 ++++++++++++++++++ paddle/phi/kernels/gpu/eigh_grad_kernel.cu | 29 ++ paddle/phi/kernels/gpu/eigh_kernel.cu | 48 +++ .../phi/kernels/impl/eigh_grad_kernel_impl.h | 79 ++++ paddle/phi/ops/compat/eigh_sig.cc | 31 ++ 15 files changed, 751 insertions(+), 160 deletions(-) delete mode 100644 paddle/fluid/operators/eigh_op.cu delete mode 100644 
paddle/fluid/operators/eigh_op.h create mode 100644 paddle/phi/kernels/cpu/eigh_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/eigh_kernel.cc create mode 100644 paddle/phi/kernels/eigh_grad_kernel.h create mode 100644 paddle/phi/kernels/eigh_kernel.h create mode 100644 paddle/phi/kernels/funcs/values_vectors_functor.h create mode 100644 paddle/phi/kernels/gpu/eigh_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/eigh_kernel.cu create mode 100644 paddle/phi/kernels/impl/eigh_grad_kernel_impl.h create mode 100644 paddle/phi/ops/compat/eigh_sig.cc diff --git a/paddle/fluid/operators/eigh_op.cc b/paddle/fluid/operators/eigh_op.cc index 553d0e679cc..4e33c567eb6 100644 --- a/paddle/fluid/operators/eigh_op.cc +++ b/paddle/fluid/operators/eigh_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/eigh_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -22,42 +25,9 @@ using framework::Tensor; class EighOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Eigh"); - OP_INOUT_CHECK(ctx->HasOutput("Eigenvalues"), "Output", "Eigenvalues", - "Eigh"); - OP_INOUT_CHECK(ctx->HasOutput("Eigenvectors"), "Output", "Eigenvectors", - "Eigh"); - - auto input_dim = ctx->GetInputDim("X"); - auto rank = input_dim.size(); - - PADDLE_ENFORCE_GE(rank, 2, - platform::errors::InvalidArgument( - "The Input(X) should have at least 2 dimensions." - "But received a %d dimension tensor.", - rank)); - PADDLE_ENFORCE_EQ( - input_dim[rank - 2], input_dim[rank - 1], - platform::errors::InvalidArgument( - "Eigh op is designed for square matrix, consequently" - "inner-most 2 dimensions of Input(X) should be symmetric." 
- "But received X's shape[-2] = %d and shape[-1] = %d.", - input_dim[rank - 2], input_dim[rank - 1])); - - std::vector values_dim; - - for (auto i = 0; i < rank - 1; i++) { - values_dim.emplace_back(input_dim[i]); - } - - ctx->SetOutputDim("Eigenvalues", phi::make_ddim(values_dim)); - ctx->SetOutputDim("Eigenvectors", input_dim); - } }; -class EignOpMaker : public framework::OpProtoAndCheckerMaker { +class EighOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", @@ -140,24 +110,11 @@ class EighGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(eigh, EighInferShapeFunctor, + PD_INFER_META(phi::EighInferMeta)); -REGISTER_OPERATOR(eigh, ops::EighOp, ops::EignOpMaker, +REGISTER_OPERATOR(eigh, ops::EighOp, ops::EighOpMaker, ops::EighGradOpMaker, - ops::EighGradOpMaker); + ops::EighGradOpMaker, + EighInferShapeFunctor); REGISTER_OPERATOR(eigh_grad, ops::EighGradOp); - -REGISTER_OP_CPU_KERNEL( - eigh, ops::EighKernel, - ops::EighKernel, - ops::EighKernel>, - ops::EighKernel>); - -REGISTER_OP_CPU_KERNEL( - eigh_grad, ops::EighGradKernel, - ops::EighGradKernel, - ops::EighGradKernel>, - ops::EighGradKernel>); diff --git a/paddle/fluid/operators/eigh_op.cu b/paddle/fluid/operators/eigh_op.cu deleted file mode 100644 index 827c551637d..00000000000 --- a/paddle/fluid/operators/eigh_op.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/eigh_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - eigh, ops::EighKernel, - ops::EighKernel, - ops::EighKernel>, - ops::EighKernel>); - -REGISTER_OP_CUDA_KERNEL( - eigh_grad, ops::EighGradKernel, - ops::EighGradKernel, - ops::EighGradKernel>, - ops::EighGradKernel>); diff --git a/paddle/fluid/operators/eigh_op.h b/paddle/fluid/operators/eigh_op.h deleted file mode 100644 index 5279ec75093..00000000000 --- a/paddle/fluid/operators/eigh_op.h +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/eigen_values_vectors.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class EighKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto input = ctx.Input("X"); - auto output_w = ctx.Output("Eigenvalues"); - auto output_v = ctx.Output("Eigenvectors"); - std::string lower = ctx.Attr("UPLO"); - bool is_lower = (lower == "L"); - math::MatrixEighFunctor functor; - functor(ctx, *input, output_w, output_v, is_lower, true); - } -}; - -template -class EighGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using ValueType = phi::dtype::Real; - auto& x_grad = *ctx.Output(framework::GradVarName("X")); - x_grad.mutable_data(ctx.GetPlace()); - auto& output_w = *ctx.Input("Eigenvalues"); - auto& output_v = *ctx.Input("Eigenvectors"); - auto& output_w_grad = - *ctx.Input(framework::GradVarName("Eigenvalues")); - auto& output_v_grad = - *ctx.Input(framework::GradVarName("Eigenvectors")); - - auto& dims = output_v.dims(); - const int m = dims[dims.size() - 1]; - auto dito = - math::DeviceIndependenceTensorOperations( - ctx); - auto tV = dito.Transpose(dito.Conj(output_v)); - auto W = dito.template Sub(dito.Unsqueeze(output_w, -2), - dito.Unsqueeze(output_w, -1)); - Tensor result = dito.Matmul(tV, output_v_grad); - result.mutable_data(dims, ctx.GetPlace()); - std::vector out_shape = phi::vectorize(dims); - auto constant = dito.Fill(out_shape, 0.5); - result = dito.Sub(result, dito.Conj(dito.Transpose(result))); - result = dito.Mul(result, constant); - result = dito.Div(result, W); - result = dito.DiagFill(m, m, m, 0, output_w_grad, result); - x_grad = dito.Matmul(output_v, dito.Matmul(result, tV)); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 32744659163..544a5593014 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -1123,6 +1123,38 @@ void TransposeInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } +void EighInferMeta(const MetaTensor& x, + const std::string& uplo, + MetaTensor* out_w, + MetaTensor* out_v) { + auto input_dim = x.dims(); + auto rank = input_dim.size(); + + PADDLE_ENFORCE_GE(rank, + 2, + phi::errors::InvalidArgument( + "The Input(X) should have at least 2 dimensions." + "But received a %d dimension tensor.", + rank)); + PADDLE_ENFORCE_EQ( + input_dim[rank - 2], + input_dim[rank - 1], + phi::errors::InvalidArgument( + "Eigh op is designed for square matrix, consequently" + "inner-most 2 dimensions of Input(X) should be symmetric." 
+ "But received X's shape[-2] = %d and shape[-1] = %d.", + input_dim[rank - 2], + input_dim[rank - 1])); + + std::vector values_dim; + + for (auto i = 0; i < rank - 1; i++) { + values_dim.emplace_back(input_dim[i]); + } + out_w->set_dims(phi::make_ddim(values_dim)); + out_v->set_dims(input_dim); +} + } // namespace phi PD_REGISTER_INFER_META_FN(copy_to, phi::CopyToInferMeta); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 735a77faefe..c57e1bdec8d 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -163,4 +163,9 @@ void TransposeInferMeta(const MetaTensor& x, const std::vector& axis, MetaTensor* out); +void EighInferMeta(const MetaTensor& x, + const std::string& uplo, + MetaTensor* out_w, + MetaTensor* out_v); + } // namespace phi diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index de3b5b53f46..71e0d9e3479 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -27,7 +27,7 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel) # Some kernels depend on some targets that are not commonly used. # These targets are not suitable for common dependencies. # In this case, you need to manually generate them here. -set(MANUAL_BUILD_KERNELS math_kernel softmax_kernel softmax_grad_kernel triangular_solve_grad_kernel maxout_kernel maxout_grad_kernel put_along_axis_kernel put_along_axis_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel) +set(MANUAL_BUILD_KERNELS math_kernel softmax_kernel softmax_grad_kernel triangular_solve_grad_kernel maxout_kernel maxout_grad_kernel put_along_axis_kernel put_along_axis_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel eigh_kernel) kernel_library(math_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel) kernel_library(softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) @@ -38,6 +38,7 @@ kernel_library(put_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_k kernel_library(put_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) kernel_library(take_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) kernel_library(take_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) +kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function) # 4. auto parse and build kernel targets by cmake register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS ${COMMON_KERNEL_DEPS} ${COMMON_BAISC_KERNELS} ) diff --git a/paddle/phi/kernels/cpu/eigh_grad_kernel.cc b/paddle/phi/kernels/cpu/eigh_grad_kernel.cc new file mode 100644 index 00000000000..5135778db56 --- /dev/null +++ b/paddle/phi/kernels/cpu/eigh_grad_kernel.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/eigh_grad_kernel.h" +#include "paddle/phi/kernels/impl/eigh_grad_kernel_impl.h" + +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(eigh_grad, + CPU, + ALL_LAYOUT, + phi::EighGradKernel, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/eigh_kernel.cc b/paddle/phi/kernels/cpu/eigh_kernel.cc new file mode 100644 index 00000000000..92fd20ca9b8 --- /dev/null +++ b/paddle/phi/kernels/cpu/eigh_kernel.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/eigh_kernel.h" +#include "paddle/phi/kernels/funcs/values_vectors_functor.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" + +namespace phi { + +template +void EighKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::string& uplo, + DenseTensor* out_w, + DenseTensor* out_v) { + bool is_lower = (uplo == "L"); + phi::funcs::MatrixEighFunctor functor; + functor(dev_ctx, x, out_w, out_v, is_lower, true); +} + +} // namespace phi + +PD_REGISTER_KERNEL(eigh, + CPU, + ALL_LAYOUT, + phi::EighKernel, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/eigh_grad_kernel.h b/paddle/phi/kernels/eigh_grad_kernel.h new file mode 100644 index 00000000000..73df76e676a --- /dev/null +++ b/paddle/phi/kernels/eigh_grad_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void EighGardKernel(const Context& dev_ctx, + const DenseTensor& out_w, + const DenseTensor& out_v, + const DenseTensor& dout_w, + const DenseTensor& dout_v, + DenseTensor* dx); + +} // namespace phi diff --git a/paddle/phi/kernels/eigh_kernel.h b/paddle/phi/kernels/eigh_kernel.h new file mode 100644 index 00000000000..dd28752d929 --- /dev/null +++ b/paddle/phi/kernels/eigh_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" + +namespace phi { + +template +void EighKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::string& uplo, + DenseTensor* out_w, + DenseTensor* out_v); + +} // namespace phi diff --git a/paddle/phi/kernels/funcs/values_vectors_functor.h b/paddle/phi/kernels/funcs/values_vectors_functor.h new file mode 100644 index 00000000000..b3189fc5cc3 --- /dev/null +++ b/paddle/phi/kernels/funcs/values_vectors_functor.h @@ -0,0 +1,386 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/memory/memory.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/phi/backends/dynload/cusolver.h" +#endif // PADDLE_WITH_CUDA +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/lapack/lapack_function.h" +#include "paddle/phi/kernels/transpose_kernel.h" + +namespace phi { +namespace funcs { + +inline int64_t GetBatchSize(phi::DDim dims) { + int64_t batch_size = 1; + auto dim_size = dims.size(); + for (int i = 0; i < dim_size - 2; i++) { + batch_size *= dims[i]; + } + return batch_size; +} + +static void CheckEighResult(const int batch, const int info) { + PADDLE_ENFORCE_LE( + info, + 0, + phi::errors::PreconditionNotMet( + "For batch [%d]: the [%d] off-diagonal elements of an intermediate" + "tridiagonal form did not converge to zero", + batch, + info)); + PADDLE_ENFORCE_GE( + info, + 0, + phi::errors::PreconditionNotMet( + "For batch [%d]: the [%d] argument had an illegal value", + batch, + info)); +} + +template +struct MatrixEighFunctor { + void operator()(const DeviceContext &dev_ctx, + const DenseTensor &input, + DenseTensor *eigen_values, + DenseTensor *eigen_vectors, + bool is_lower, + bool has_vectors); +}; + +// Calculates the eigenvalues ​​and eigenvectors of Hermitian or real +// symmetric matrices, and uses the variable has_vectors to +// control whether to return the eigenvectors. 
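(A minimal sketch of how this functor is driven by the eigh kernels elsewhere in this patch; the template arguments shown here are assumed, since angle brackets do not survive in this rendering:)

template <typename T, typename Context>
void EighKernel(const Context& dev_ctx,
                const DenseTensor& x,
                const std::string& uplo,
                DenseTensor* out_w,
                DenseTensor* out_v) {
  bool is_lower = (uplo == "L");
  // the eigh op always needs eigenvectors, hence has_vectors = true
  phi::funcs::MatrixEighFunctor<Context, T> functor;
  functor(dev_ctx, x, out_w, out_v, is_lower, /*has_vectors=*/true);
}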
+template +struct MatrixEighFunctor { + public: + void operator()(const CPUContext &dev_ctx, + const DenseTensor &input, + DenseTensor *eigen_values, + DenseTensor *eigen_vectors, + bool is_lower, + bool has_vectors) { + using ValueType = phi::dtype::Real; + ValueType *out_value = dev_ctx.template Alloc(eigen_values); + + DenseTensor input_trans; + // lapack is a column-major storge, transpose make the input to + // have a continuous memory layout + input_trans = phi::TransposeLast2Dim(dev_ctx, input); + T *input_vector = input_trans.data(); + + auto dims = input.dims(); + int dim_size = dims.size(); + int64_t batch_size = GetBatchSize(dims); + + int vector_stride = dims[dim_size - 1] * dims[dim_size - 2]; + int values_stride = dims[dim_size - 1]; + char uplo = is_lower ? 'L' : 'U'; + char jobz = has_vectors ? 'V' : 'N'; + int n = dims[dim_size - 1]; + int64_t lda = std::max(1, n); + // if work = -1, it means that you need to use the lapack function to query + // the optimal value + int lwork = -1; // The length of the array work + int lrwork = -1; // The dimension of the array rwork,rwork is REAL array + int liwork = -1; // The dimension of the array iwork + int iwork_opt = -1; // The optimal length of the array liwork + T lwork_opt = static_cast(-1); // The optimal length of the array work + ValueType rwork_opt = + static_cast(-1); // The optimal length of the array rwork + + int info = 0; + // Call lapackEigh to get the optimal size of work data + phi::funcs::lapackEigh(jobz, + uplo, + n, + input_vector, + lda, + out_value, + &lwork_opt, + lwork, + &rwork_opt, + lrwork, + &iwork_opt, + liwork, + &info); + lwork = std::max(1, static_cast(lwork_opt)); + liwork = std::max(1, iwork_opt); + + DenseTensor rwork_tensor; + ValueType *rwork_data = nullptr; + + // complex type + if (input.type() == phi::DataType::COMPLEX64 || + input.type() == phi::DataType::COMPLEX128) { + lrwork = std::max(1, static_cast(rwork_opt)); + + rwork_tensor.Resize(phi::make_ddim({lrwork})); + rwork_data = dev_ctx.template Alloc(&rwork_tensor); + } + + DenseTensor iwork_tensor, work_tensor; + + iwork_tensor.Resize(phi::make_ddim({liwork})); + int *iwork_data = dev_ctx.template Alloc(&iwork_tensor); + + work_tensor.Resize(phi::make_ddim({lwork})); + T *work_data = dev_ctx.template Alloc(&work_tensor); + + for (auto i = 0; i < batch_size; i++) { + auto *value_data = out_value + i * values_stride; + auto *input_data = input_vector + i * vector_stride; + phi::funcs::lapackEigh(jobz, + uplo, + n, + input_data, + lda, + value_data, + work_data, + lwork, + rwork_data, + lrwork, + iwork_data, + liwork, + &info); + CheckEighResult(i, info); + } + if (has_vectors) { + PADDLE_ENFORCE_NOT_NULL(eigen_vectors, + phi::errors::InvalidArgument( + "When has_vectors is true," + "the eigenvectors needs to be calculated, " + "so the eigenvectors must be provided.")); + input_trans = phi::TransposeLast2Dim(dev_ctx, input_trans); + eigen_vectors->ShareDataWith(input_trans); + } + } +}; + +#ifdef PADDLE_WITH_CUDA + +// Calculates the eigenvalues ​​and eigenvectors of Hermitian or real +// symmetric matrices on GPU, and uses the variable has_vectors +// to control whether to return the eigenvectors. 
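(The GPU specialization below follows the usual two-step cuSOLVER workflow: query the workspace size, run the solver, then check the device-side info flag. A minimal standalone sketch, assuming float data, a valid cusolverDnHandle_t, and column-major device buffers d_A of size n x n and d_W of size n:)

int lwork = 0;
cusolverDnSsyevd_bufferSize(handle, CUSOLVER_EIG_MODE_VECTOR,
                            CUBLAS_FILL_MODE_LOWER, n, d_A, lda, d_W, &lwork);
float* d_work = nullptr;
cudaMalloc(&d_work, sizeof(float) * lwork);
int* dev_info = nullptr;
cudaMalloc(&dev_info, sizeof(int));
// eigenvalues are written to d_W; eigenvectors overwrite d_A in place
cusolverDnSsyevd(handle, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_LOWER,
                 n, d_A, lda, d_W, d_work, lwork, dev_info);
// copy *dev_info back to the host and verify it is zero, as CheckEighResult does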
+template +struct MatrixEighFunctor { + public: + void operator()(const GPUContext &dev_ctx, + const DenseTensor &input, + DenseTensor *eigen_values, + DenseTensor *eigen_vectors, + bool is_lower, + bool has_vectors) { + using ValueType = phi::dtype::Real; + ValueType *out_value = dev_ctx.template Alloc(eigen_values); + + DenseTensor input_trans; + input_trans = phi::TransposeLast2Dim(dev_ctx, input); + T *input_vector = input_trans.data(); + auto &dims = input.dims(); + int dim_size = dims.size(); + int64_t batch_size = GetBatchSize(dims); + + cublasFillMode_t uplo = + is_lower ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; + cusolverEigMode_t jobz = + has_vectors ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR; + + int n = dims[dim_size - 1]; + int lda = std::max(1, n); + auto vector_stride = dims[dim_size - 1] * dims[dim_size - 2]; + auto values_stride = dims[dim_size - 1]; + int lwork = 0; + auto info = paddle::memory::Alloc(dev_ctx, sizeof(int) * batch_size); + auto *info_ptr = reinterpret_cast(info->ptr()); + + // When the input type is float32, and the feature value input dimension + // is greater than or equal to [*,32,32] and less than or equal to + // [*,512,512], Syevj has better performance. + bool use_syevj = (input.dtype() == phi::DataType::FLOAT32 && + values_stride >= 32 && values_stride <= 512); + syevjInfo_t syevj_params; + if (use_syevj) { + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cusolverDnCreateSyevjInfo(&syevj_params)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj_bufferSize( + dev_ctx.cusolver_dn_handle(), + jobz, + uplo, + n, + reinterpret_cast(input_vector), + lda, + reinterpret_cast(out_value), + &lwork, + syevj_params)); + } else { + EvdBuffer(dev_ctx.cusolver_dn_handle(), + jobz, + uplo, + n, + input_vector, + lda, + out_value, + &lwork); + } + auto work = paddle::memory::Alloc(dev_ctx, sizeof(T) * lwork); + auto *work_ptr = reinterpret_cast(work->ptr()); + for (auto i = 0; i < batch_size; i++) { + auto *input_data = input_vector + i * vector_stride; + auto *value_data = out_value + i * values_stride; + auto handle = dev_ctx.cusolver_dn_handle(); + if (use_syevj) { + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cusolverDnSsyevj(handle, + jobz, + uplo, + n, + reinterpret_cast(input_data), + lda, + reinterpret_cast(value_data), + reinterpret_cast(work_ptr), + lwork, + info_ptr, + syevj_params)); + } else { + Evd(handle, + jobz, + uplo, + n, + input_data, + lda, + value_data, + work_ptr, + lwork, + info_ptr); + } + int error_info = 0; + paddle::memory::Copy(phi::CPUPlace(), + &error_info, + dev_ctx.GetPlace(), + info_ptr, + sizeof(int), + dev_ctx.stream()); + CheckEighResult(i, error_info); + } + + if (use_syevj) { + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cusolverDnDestroySyevjInfo(syevj_params)); + } + if (has_vectors) { + PADDLE_ENFORCE_NOT_NULL(eigen_vectors, + phi::errors::InvalidArgument( + "When has_vectors is true," + "the eigenvectors needs to be calculated," + "so the eigenvectors must be provided.")); + // input_trans = dito.Transpose(input_trans); + input_trans = phi::TransposeLast2Dim(dev_ctx, input_trans); + eigen_vectors->ShareDataWith(input_trans); + } + } + + using ValueType = phi::dtype::Real; + inline void EvdBuffer(cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const T *A, + int lda, + const ValueType *W, + int *lwork) const; + + inline void Evd(cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + T *A, + int lda, + ValueType *W, + T *work, + int lwork, + 
int *devInfo) const; +}; + +using phi::dtype::complex; + +#define FUNC_WITH_TYPES(m) \ + m(float, Ssy, float) m(double, Dsy, double) m( \ + complex, Che, cuComplex) m(complex, Zhe, cuDoubleComplex) + +#define EVDBUFFER_INSTANCE(T, C, CastType) \ + template <> \ + inline void MatrixEighFunctor::EvdBuffer( \ + cusolverDnHandle_t handle, \ + cusolverEigMode_t jobz, \ + cublasFillMode_t uplo, \ + int n, \ + const T *A, \ + int lda, \ + const ValueType *W, \ + int *lwork) const { \ + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDn##C##evd_bufferSize( \ + handle, \ + jobz, \ + uplo, \ + n, \ + reinterpret_cast(A), \ + lda, \ + W, \ + lwork)); \ + } + +FUNC_WITH_TYPES(EVDBUFFER_INSTANCE); + +#define EVD_INSTANCE(T, C, CastType) \ + template <> \ + inline void MatrixEighFunctor::Evd(cusolverDnHandle_t handle, \ + cusolverEigMode_t jobz, \ + cublasFillMode_t uplo, \ + int n, \ + T *A, \ + int lda, \ + ValueType *W, \ + T *work, \ + int lwork, \ + int *devInfo) const { \ + PADDLE_ENFORCE_GPU_SUCCESS( \ + dynload::cusolverDn##C##evd(handle, \ + jobz, \ + uplo, \ + n, \ + reinterpret_cast(A), \ + lda, \ + W, \ + reinterpret_cast(work), \ + lwork, \ + devInfo)); \ + } + +FUNC_WITH_TYPES(EVD_INSTANCE); + +#undef FUNC_WITH_TYPES +#undef EVDBUFFER_INSTANCE +#undef EVD_INSTANCE + +#endif // PADDLE_WITH_CUDA + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/eigh_grad_kernel.cu b/paddle/phi/kernels/gpu/eigh_grad_kernel.cu new file mode 100644 index 00000000000..fdf61dc7399 --- /dev/null +++ b/paddle/phi/kernels/gpu/eigh_grad_kernel.cu @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/eigh_grad_kernel.h" +#include "paddle/phi/kernels/impl/eigh_grad_kernel_impl.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" + +PD_REGISTER_KERNEL(eigh_grad, + GPU, + ALL_LAYOUT, + phi::EighGradKernel, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/eigh_kernel.cu b/paddle/phi/kernels/gpu/eigh_kernel.cu new file mode 100644 index 00000000000..4ff3b371b6a --- /dev/null +++ b/paddle/phi/kernels/gpu/eigh_kernel.cu @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
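(To make the instantiation macros above easier to read, here is roughly what EVDBUFFER_INSTANCE(float, Ssy, float) expands to; a sketch only, with the GPUContext/float template arguments assumed because angle brackets do not survive in this rendering:)

template <>
inline void MatrixEighFunctor<GPUContext, float>::EvdBuffer(
    cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
    int n, const float *A, int lda, const float *W, int *lwork) const {
  // Ssy spliced into cusolverDn##C##evd_bufferSize selects the
  // single-precision symmetric eigensolver workspace query
  PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevd_bufferSize(
      handle, jobz, uplo, n, reinterpret_cast<const float *>(A), lda, W,
      lwork));
}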
+ +#ifndef PADDLE_WITH_HIP +// HIP not support cusolver + +#include "paddle/phi/kernels/eigh_kernel.h" +#include "paddle/phi/kernels/funcs/values_vectors_functor.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" + +namespace phi { + +template +void EighKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::string& uplo, + DenseTensor* out_w, + DenseTensor* out_v) { + bool is_lower = (uplo == "L"); + phi::funcs::MatrixEighFunctor functor; + functor(dev_ctx, x, out_w, out_v, is_lower, true); +} + +} // namespace phi + +PD_REGISTER_KERNEL(eigh, // cuda_only + GPU, + ALL_LAYOUT, + phi::EighKernel, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} + +#endif // not PADDLE_WITH_HIP diff --git a/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h b/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h new file mode 100644 index 00000000000..2f0530b638f --- /dev/null +++ b/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h @@ -0,0 +1,79 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/kernels/cast_kernel.h" +#include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/diag_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/unsqueeze.h" +#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/matmul_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" + +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +template +void EighGradKernel(const Context& dev_ctx, + const DenseTensor& out_w, + const DenseTensor& out_v, + const DenseTensor& dout_w, + const DenseTensor& dout_v, + DenseTensor* dx) { + dev_ctx.template Alloc(dx); + auto& dims = out_v.dims(); + const int m = dims[dims.size() - 1]; + DenseTensor tV = + phi::TransposeLast2Dim(dev_ctx, phi::Conj(dev_ctx, out_v)); + DenseTensor W = + phi::Subtract>(dev_ctx, + phi::funcs::Unsqueeze(out_w, -2), + phi::funcs::Unsqueeze(out_w, -1)); + DenseTensor result = phi::Matmul(dev_ctx, tV, dout_v); + result.Resize(dims); + dev_ctx.template Alloc(&result); + + std::vector out_shape = phi::vectorize(dims); + DenseTensor constant; + constant.Resize(phi::make_ddim(out_shape)); + dev_ctx.template Alloc(&constant); + phi::funcs::SetConstant()(dev_ctx, &constant, T(0.5)); + result = phi::Subtract( + dev_ctx, + result, + phi::Conj(dev_ctx, phi::TransposeLast2Dim(dev_ctx, result))); + result = phi::Multiply(dev_ctx, result, constant); + if (result.type() != W.type()) { + auto x_vector = EigenVector::Flatten(result); + auto y_vector = EigenVector>::Flatten(W); + auto out_vector = EigenVector::Flatten(result); + auto& place = 
*dev_ctx.eigen_device(); + out_vector.device(place) = x_vector / y_vector; + } else { + result = phi::Divide(dev_ctx, result, W); + } + result = phi::funcs::DiagFill>( + dev_ctx, m, m, m, 0, dout_w, result); + *dx = phi::Matmul(dev_ctx, out_v, phi::Matmul(dev_ctx, result, tV)); +} + +} // namespace phi diff --git a/paddle/phi/ops/compat/eigh_sig.cc b/paddle/phi/ops/compat/eigh_sig.cc new file mode 100644 index 00000000000..e50a9a5a12a --- /dev/null +++ b/paddle/phi/ops/compat/eigh_sig.cc @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature EighGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("eigh_grad", + {"Eigenvalues", + "Eigenvectors", + GradVarName("Eigenvalues"), + GradVarName("Eigenvectors")}, + {}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(eigh_grad, phi::EighGradOpArgumentMapping); -- GitLab From 1defc8f3b3b4aaf7c1e3c517730cefea23766316 Mon Sep 17 00:00:00 2001 From: feng_shuai Date: Wed, 9 Mar 2022 18:59:14 +0800 Subject: [PATCH 147/261] change timeout for pool (#40341) --- python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index 6169509e895..8f7b73fc0e0 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -80,7 +80,7 @@ if(WITH_NV_JETSON) set_tests_properties(test_trt_pool_op PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 450) set_tests_properties(test_trt_pool3d_op PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 450) else() - set_tests_properties(test_trt_pool_op PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 120) + set_tests_properties(test_trt_pool_op PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 300) set_tests_properties(test_trt_pool3d_op PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 45) endif() set_tests_properties(test_trt_reduce_mean_op PROPERTIES TIMEOUT 60) -- GitLab From f40ed5f421d64028c9781c2b77aeb2958327b090 Mon Sep 17 00:00:00 2001 From: Baibaifan <39549453+Baibaifan@users.noreply.github.com> Date: Wed, 9 Mar 2022 19:04:50 +0800 Subject: [PATCH 148/261] add_sharding_api (#40129) --- python/paddle/distributed/__init__.py | 1 + .../sharding_optimizer_stage2.py | 6 +- .../meta_parallel/sharding/sharding_stage2.py | 19 +- .../meta_parallel/sharding/sharding_stage3.py | 9 +- .../paddle/distributed/sharding/__init__.py | 17 ++ .../distributed/sharding/group_sharded.py | 211 ++++++++++++++++++ .../fluid/tests/unittests/CMakeLists.txt | 3 + .../unittests/dygraph_group_sharded_api.py | 147 ++++++++++++ 
.../unittests/dygraph_sharding_stage3.py | 8 +- .../test_dygraph_group_sharded_api.py | 31 +++ python/paddle/framework/io.py | 8 +- python/setup.py.in | 1 + 12 files changed, 437 insertions(+), 24 deletions(-) create mode 100644 python/paddle/distributed/sharding/__init__.py create mode 100644 python/paddle/distributed/sharding/group_sharded.py create mode 100644 python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py create mode 100644 python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index fc299bc7b55..a0ae9bc29da 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -55,6 +55,7 @@ from paddle.fluid.dygraph.parallel import ParallelEnv # noqa: F401 from . import cloud_utils # noqa: F401 from . import utils # noqa: F401 +from .sharding import * # noqa: F401 __all__ = [ # noqa "spawn", diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py index 112c3887fcf..a31f8bbfed0 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py @@ -40,8 +40,6 @@ align = { Type.fp32.value: 4, } -__all__ = ["ShardingOptimizerStage2"] - class ShardingOptimizerStage2(Optimizer): """ @@ -136,7 +134,7 @@ class ShardingOptimizerStage2(Optimizer): # Update optimizer parameters and adjust parameter storage and use according to rank. self._update_opt_status() - @paddle.no_grad() + @paddle.autograd.no_grad() def _sync_params_and_buffers(self): """ Sync all model states for all ranks @@ -392,7 +390,7 @@ class ShardingOptimizerStage2(Optimizer): self._dtype_rank_params.clear() self._param2rank.clear() - @fluid.dygraph.no_grad + @paddle.autograd.no_grad() def _broadcast_params(self): """Broadcast the parameters of the current rank to each rank""" diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py index 392a7f3ac5d..548f036067e 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py @@ -63,8 +63,7 @@ class ShardingStage2(nn.Layer): sync_buffers=False, buffer_max_size=2**23, #8MB auto_refresh_trainable=True, - device="gpu", - use_grad_storage=True): + device="gpu"): super().__init__() # training options @@ -102,9 +101,10 @@ class ShardingStage2(nn.Layer): # Set grad storage size & Display param sizes and model sizes model_size = sum( [np.prod(p.shape) for p in self._layer.parameters()]).item() + assert buffer_max_size >= 0, "buffer_max_size must be GE than 0." self._buffer_max_size = self._rank_buffer_size(buffer_max_size, model_size) - self._use_grad_storage = use_grad_storage + self._use_grad_storage = buffer_max_size > 0 self._grad_storages = {} # {dtype: {rank: GradStorage}} self._has_grad_storage = [] self._grad_storage_list = [] @@ -255,7 +255,7 @@ class ShardingStage2(nn.Layer): # wait next func hook support self._setup_backward_hooks() - @paddle.no_grad() + @paddle.autograd.no_grad() def __sync_buffers(self): """ Sync all the param buffers from all ranks (exp: batch norm statistics). 
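After this change, ShardingStage2 no longer takes a separate use_grad_storage flag: gradient bucketing is controlled by buffer_max_size alone, with 0 disabling GradStorage and any positive value enabling buckets (the default of 2**23 bytes is 8MB). A minimal usage sketch of both configurations, assuming layer, sharded_optimizer and group are prepared as in the unit tests later in this series (the variable names here are illustrative, not part of the patch):

# Bucketed reduction: gradients are packed into GradStorage buffers sized from buffer_max_size.
model_bucketed = ShardingStage2(layer, sharded_optimizer, group=group, buffer_max_size=2**23)

# Per-parameter reduction: buffer_max_size=0 turns _use_grad_storage off, so each gradient is reduced on its own.
model_unbucketed = ShardingStage2(layer, sharded_optimizer, group=group, buffer_max_size=0)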
@@ -277,7 +277,7 @@ class ShardingStage2(nn.Layer): except AttributeError: return getattr(self._layer, name) - @paddle.no_grad() + @paddle.autograd.no_grad() def _clear_counters(self): """Reset all the grad reduce and call counters.""" if self.training: @@ -290,13 +290,13 @@ class ShardingStage2(nn.Layer): def _get_reduce_fn(self, index, param, dst_rank): """ There are two ways to reduce gradient. - - 1. Do not use use_grad_storage or exceeded buffer_max_size will be reduced separately. + - 1. Do not use self._use_grad_storage or exceeded buffer_max_size will be reduced separately. - 2. Use grad_storage Reduce the storage to get the full gradient from different ranks. """ if not self._use_grad_storage or not self._has_grad_storage[index]: # Direct reduction - @paddle.no_grad() + @paddle.autograd.no_grad() def reduce(*_): # Skip gradient reduction, do not change status information if self._grad_reduced[index]: @@ -336,7 +336,7 @@ class ShardingStage2(nn.Layer): else: # Buffer reduction - @paddle.no_grad() + @paddle.autograd.no_grad() def reduce(*_): # Skip gradient reduction, do not change status information if self._grad_reduced[index]: @@ -421,9 +421,6 @@ class ShardingStage2(nn.Layer): Integrate the parameters gradient into a continuous memory according to rank, and support the update of training parameters. """ - if not self._use_grad_storage: - return - # According to parameters's numel sort, allocate memory of parameter gradient to continuous memory according to rank self._grad_storages = {} self._has_grad_storage = [False for _ in self._trainable_params] diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py index de69836fdba..bcf63a54cc4 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py @@ -84,6 +84,7 @@ class ShardingStage3(nn.Layer): self._offload = offload self._sync_comm = sync_comm # segmentation size + assert segment_size >= 0, "segment_size must be GE than 0." self._segment_size = segment_size global DEV @@ -158,7 +159,7 @@ class ShardingStage3(nn.Layer): self._redefine_opt_step() self._redefine_opt_clear() - @paddle.no_grad() + @paddle.autograd.no_grad() def _sync_params_and_buffers(self): """ Sync all model states for all ranks @@ -408,7 +409,7 @@ class ShardingStage3(nn.Layer): # register post forward hooks sub_layer.register_forward_post_hook(_forward_post_hook) - @paddle.no_grad() + @paddle.autograd.no_grad() def _sync_buffers(self): """ Sync all the param buffers from all ranks (exp: batch norm statistics). 
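The _get_reduce_fn comment earlier in this patch distinguishes two reduction paths: reducing a single parameter's gradient directly, or reducing a packed GradStorage buffer. A simplified sketch of the direct path (path 1), using the same paddle.autograd.no_grad decorator these hunks switch to; param, dst_rank, rank and group stand for values the real hook closes over via the optimizer's param-to-rank mapping (illustrative only, not part of the patch):

@paddle.autograd.no_grad()
def reduce(*_):
    # Path 1: this parameter has no GradStorage slot, so reduce its gradient straight to the owning rank.
    paddle.distributed.reduce(param.grad, dst=dst_rank, group=group)
    if rank != dst_rank:
        # Ranks that do not own this parameter can release the gradient once it is accumulated on dst_rank.
        param.clear_gradient()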
@@ -521,7 +522,7 @@ class ShardingStage3(nn.Layer): param._register_backward_hook(allreduce_function) def _get_allreduce_fn(self, param): - @paddle.no_grad() + @paddle.autograd.no_grad() def reduce(*_): if param.name in self._task_flow.full_grad.keys(): full_grad = self._task_flow.full_grad[param.name] @@ -840,7 +841,7 @@ def _allgather_buffer(trainable_params, return task_flow -@paddle.no_grad() +@paddle.autograd.no_grad() def _create_params_grad(trainable_params, param2buffer_size, task_flow): for param in trainable_params: if param.name in task_flow.full_grad.keys(): diff --git a/python/paddle/distributed/sharding/__init__.py b/python/paddle/distributed/sharding/__init__.py new file mode 100644 index 00000000000..d14e3dd099f --- /dev/null +++ b/python/paddle/distributed/sharding/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .group_sharded import group_sharded_parallel, save_group_sharded_model # noqa: F401 + +__all__ = ['group_sharded_parallel', 'save_group_sharded_model'] diff --git a/python/paddle/distributed/sharding/group_sharded.py b/python/paddle/distributed/sharding/group_sharded.py new file mode 100644 index 00000000000..2fdb20600f6 --- /dev/null +++ b/python/paddle/distributed/sharding/group_sharded.py @@ -0,0 +1,211 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import logging +from enum import Enum + +import paddle + +from paddle.optimizer import Optimizer +from paddle.distributed.utils import get_logger +from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 +from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2 +from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import ShardingStage3 +from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler + +logger_ = get_logger(logging.INFO) + + +def group_sharded_parallel(model, + optimizer, + level, + scaler=None, + group=None, + offload=False, + sync_buffers=False, + buffer_max_size=2**23, + segment_size=2**20, + sync_comm=False): + """ + Use this module to configure and wrap up the parameters of the group shared module. + + Args: + model (Layer): The layer to be wrapped with group_sharded_parallel. + optimizer (Optimizer): The optimizer to be wrapped with group_sharded_parallel. 
+ level (str): The different level of the group sharded. Such as `os`, `os_g`, `p_g_os`. + scaler (GradScaler, optional): The scaler to be wrapped with group_sharded_parallel. Defaults to None. + group (Group, optional): The group instance. Defaults to None.d + offload (bool, optional): Whether to perform optimizer state and gradient transfer CPU. Defaults to False. + sync_buffers (bool, optional): Whether to broadcast model buffers. Defaults to False. + buffer_max_size (int, optional): The max size of the buffer used to integrate gradient in `os_g`. Defaults to 2**23. + segment_size (int, optional): The smallest size of parameter to be sharded in `p_g_os`. Defaults to 2**20. + sync_comm (bool, optional): Whether to use synchronous communication, only in `p_g_os` used. Defaults to False. + + Returns: + model: A wrapper for group sharded given model. + optimizer: A wrapper for group sharded given optimizer. + scaler: A wrapper for group sharded given scaler. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + from paddle.fluid.dygraph.nn import Linear + from paddle.distributed import fleet + from paddle.distributed.sharding import group_sharded_parallel + + fleet.init(is_collective=True) + group = paddle.distributed.new_group([0, 1]) + model = Linear(1000, 1000) + + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) + optimizer = paddle.optimizer.AdamW(learning_rate=0.001, parameters=model.parameters(), weight_decay=0.00001, grad_clip=clip) + + # wrap sharding model, optimizer and scaler + model, optimizer, scaler = group_sharded_parallel(model, optimizer, "p_g", scaler=scaler) + + img, label = data + label.stop_gradient = True + img.stop_gradient = True + + out = model(img) + loss = paddle.nn.functional.cross_entropy(input=out, label=label) + + loss.backward() + optimizer.step() + optimizer.clear_grad() + """ + # check optition type + assert isinstance( + model, + paddle.nn.Layer), "The model must be the instance of paddle.nn.Layer." + assert isinstance( + optimizer, Optimizer + ), "The optimizer must be the instance of paddle.optimizer.Optimizer." + assert level in ['os', 'os_g', 'p_g_os' + ], "The level must be os, os_g or p_g_os." + + def check_dtype(param): + return param.dtype == paddle.float16 + + params_fp16 = filter(check_dtype, model.parameters()) + if scaler is None and len(params_fp16) > 0: + raise ValueError("Please enter the correct scaler.") + # convert model/optimizer/scaler + if level in ['os', 'os_g']: + logger_.info("*" * 30) + logger_.info("Sharded level os uses sharded level os_g achieved now.") + logger_.info("*" * 30) + optimizer = ShardingOptimizerStage2( + params=model.parameters(), + optim=optimizer, + group=group, + offload=offload) + model = ShardingStage2( + model, + optimizer, + group=group, + sync_buffers=sync_buffers, + buffer_max_size=buffer_max_size) + elif level == 'p_g_os': + model = ShardingStage3( + model, + optimizer=optimizer, + group=group, + sync_buffers=sync_buffers, + segment_size=segment_size, + offload=offload, + sync_comm=sync_comm) + else: + raise ValueError("Please enter the correct level.") + if params_fp16 and isinstance(scaler, paddle.amp.GradScaler): + scaler = ShardingScaler(scaler) + logger_.info("*" * 30) + logger_.info( + "If there is a communication hang using group sharded, please check whether the communication operations of each process are unified." 
+ ) + logger_.info("*" * 30) + + return model, optimizer, scaler + + +def save_group_sharded_model(model, output, optimizer=None): + """ + Group sharded encapsulated model and optimizer state saving module. + + Args: + model (Layer): A wrapper for group sharded given model. + output (str): Save directory. + optimizer (Optimizer, optional): Group sharded encapsulated optimizer. Defaults to None. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + from paddle.fluid.dygraph.nn import Linear + from paddle.distributed import fleet + from paddle.distributed.sharding import group_sharded_parallel, save_group_sharded_model + + fleet.init(is_collective=True) + group = paddle.distributed.new_group([0, 1]) + model = Linear(1000, 1000) + + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) + optimizer = paddle.optimizer.AdamW(learning_rate=0.001, parameters=model.parameters(), weight_decay=0.00001, grad_clip=clip) + + # wrap sharding model, optimizer and scaler + model, optimizer, scaler = group_sharded_parallel(model, optimizer, "p_g", scaler=scaler) + + img, label = data + label.stop_gradient = True + img.stop_gradient = True + + out = model(img) + loss = paddle.nn.functional.cross_entropy(input=out, label=label) + + loss.backward() + optimizer.step() + optimizer.clear_grad() + + # save model and optimizer state_dict + save_group_sharded_model(model, optimizer,output=output_dir) + """ + logger_.info( + "==========Begin to save group sharded model and optimizer==========") + assert not os.path.isfile( + output + ), "Saving directory ({}) should be a directory, not a file".format(output) + os.makedirs(output, exist_ok=True) + output_model = os.path.join(output, "model.pdmodel") + if isinstance(model, ShardingStage2): + paddle.save(model._layer.state_dict(), output_model) + elif isinstance(model, ShardingStage3): + convert2cpu = True if model._offload else False + model.get_all_parameters(convert2cpu=convert2cpu) + paddle.save(model._layer.state_dict(), output_model) + else: + raise ValueError( + "Please use the layer which is wrapped with group_sharded_parallel.") + + if optimizer is not None: + assert hasattr( + optimizer, "_optim" + ), "Please use the optimizer which is wrapped with group_sharded_parallel." 
+ output_opt = os.path.join(output, "model.pdopt") + paddle.save(optimizer._optim.state_dict(), output_opt) + logger_.info( + "==========End to save group sharded model and optimizer==========") diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 5d861cddea2..9b0c857576b 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -47,6 +47,7 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_sharding_parallel) list(APPEND DIST_TEST_OPS test_dygraph_sharding_optimizer_stage2) list(APPEND DIST_TEST_OPS test_dygraph_sharding_stage2) list(APPEND DIST_TEST_OPS test_dygraph_sharding_stage3) +list(APPEND DIST_TEST_OPS test_dygraph_group_sharded_api) list(APPEND DIST_TEST_OPS test_auto_parallel_parallelizer) list(APPEND DIST_TEST_OPS test_parallel_dygraph_mp_layers) list(APPEND DIST_TEST_OPS test_hybrid_parallel_inference_helper) @@ -282,6 +283,7 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_optimizer_stage2) list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage2) list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage3) + list(REMOVE_ITEM TEST_OPS test_dygraph_group_sharded_api) list(REMOVE_ITEM TEST_OPS test_auto_parallel_parallelizer) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_mp_layers) LIST(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision) @@ -1123,6 +1125,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_dygraph_sharding_optimizer_stage2 PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT 120) + set_tests_properties(test_dygraph_group_sharded_api PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_parallelizer PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120) set_tests_properties(test_hybrid_parallel_inference_helper PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py new file mode 100644 index 00000000000..d4832782c32 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py @@ -0,0 +1,147 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
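One usage note for the save helper defined above: its positional order is save_group_sharded_model(model, output, optimizer=None), so passing the optimizer second without a keyword would bind it to output. The keyword form, which the unit test below also uses, is the safe pattern; a short sketch (output_dir is any writable directory, and os/paddle are assumed to be imported):

save_group_sharded_model(model, output=output_dir, optimizer=optimizer)
# The helper writes output_dir/model.pdmodel and output_dir/model.pdopt; both load back with the standard paddle.load.
model_state = paddle.load(os.path.join(output_dir, "model.pdmodel"))
opt_state = paddle.load(os.path.join(output_dir, "model.pdopt"))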
+ +import time +import shutil +import tempfile +import numpy as np + +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear +from paddle.distributed import fleet +from paddle.fluid.dygraph import nn +from paddle.distributed.sharding import group_sharded_parallel, save_group_sharded_model + +epoch = 10 +paddle.seed(2022) +np.random.seed(2022) +base_lr = 0.1 +momentum_rate = 0.9 +l2_decay = 1e-4 +batch_size = 100 +fleet.init(is_collective=True) + + +class MLP(fluid.Layer): + def __init__(self, linear_size=1000, param_attr=None, bias_attr=None): + super(MLP, self).__init__() + + self._linear1 = Linear(linear_size, linear_size) + self._linear2 = Linear(linear_size, linear_size) + self._linear3 = Linear(linear_size, 10) + + def forward(self, inputs): + y = self._linear1(inputs) + y = self._linear2(y) + y = self._linear3(y) + return y + + +def reader_decorator(linear_size=1000): + def __reader__(): + for _ in range(100): + img = np.random.rand(linear_size).astype('float32') + label = np.ones(1).astype('int64') + yield img, label + + return __reader__ + + +def optimizer_setting(model, use_pure_fp16, opt_group=False): + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) + optimizer = paddle.optimizer.Momentum( + parameters=[{ + "params": list(model.parameters()) + }] if opt_group else list(model.parameters()), + learning_rate=0.001, + weight_decay=0.00001, + grad_clip=clip, + multi_precision=use_pure_fp16) + + return optimizer + + +def train_mlp(model, shard_level, use_pure_fp16, output_dir): + group = paddle.distributed.new_group([0, 1]) + + optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16) + model = paddle.amp.decorate(models=model, level='O2', save_dtype='float32') + scaler = paddle.amp.GradScaler(init_loss_scaling=32768) + + model, optimizer, scaler = group_sharded_parallel( + model=model, optimizer=optimizer, level=shard_level, scaler=scaler) + + train_reader = paddle.batch( + reader_decorator(), batch_size=batch_size, drop_last=True) + + train_loader = paddle.io.DataLoader.from_generator( + capacity=32, + use_double_buffer=True, + iterable=True, + return_list=True, + use_multiprocess=True) + train_loader.set_sample_list_generator(train_reader) + + for eop in range(epoch): + model.train() + for batch_id, data in enumerate(train_loader()): + img, label = data + label.stop_gradient = True + img.stop_gradient = True + with paddle.amp.auto_cast(True, level='O2'): + out = model(img) + loss = paddle.nn.functional.cross_entropy( + input=out, label=label) + avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32)) + + if not use_pure_fp16: + avg_loss.backward() + optimizer.step() + else: + scaler.scale(avg_loss).backward() + scaler.step(optimizer) + scaler.update() + + optimizer.clear_grad() + + save_group_sharded_model(model, output=output_dir, optimizer=optimizer) + return model.parameters() + + +def test_sharding_api(): + mlp, mlp1, mlp2 = MLP(), MLP(), MLP() + state_dict = mlp.state_dict() + mlp1.set_state_dict(state_dict) + mlp2.set_state_dict(state_dict) + + output_dir = tempfile.mkdtemp() + + # fp16 + stage2_params = train_mlp( + mlp1, shard_level="os_g", use_pure_fp16=True, output_dir=output_dir) + stage3_params = train_mlp( + mlp2, shard_level="p_g_os", use_pure_fp16=True, output_dir=output_dir) + + for i in range(len(stage3_params)): + np.testing.assert_allclose( + stage2_params[i].numpy(), + stage3_params[i].numpy(), + rtol=1e-4, + atol=1e-3) + shutil.rmtree(output_dir) + + +if __name__ == '__main__': + test_sharding_api() diff 
--git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py index 6b755cf4c2b..bbbcb621fd4 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py @@ -83,7 +83,7 @@ def train_mlp(model, accumulate_grad=False, batch_size=100, opt_group=False, - recompute=False, + sync_comm=False, test_minimize=False): group = paddle.distributed.new_group([0, 1]) if opt_group: @@ -104,7 +104,7 @@ def train_mlp(model, model, optimizer, group=group, buffer_max_size=2**21) elif sharding_stage == 3: model = ShardingStage3( - model, optimizer=optimizer, group=group, sync_comm=recompute) + model, optimizer=optimizer, group=group, sync_comm=sync_comm) # check optimizer.minimize() error if test_minimize: @@ -225,7 +225,7 @@ def test_stage2_stage3(): rtol=1e-4, atol=1e-3) - # fp16 recompute + # fp16 sync_comm stage3_params = train_mlp( mlp7, sharding_stage=3, use_pure_fp16=True, opt_group=False) stage3_params_re = train_mlp( @@ -233,7 +233,7 @@ def test_stage2_stage3(): sharding_stage=3, use_pure_fp16=True, opt_group=False, - recompute=True) + sync_comm=True) for i in range(len(stage3_params)): np.testing.assert_allclose( stage3_params[i].numpy(), stage3_params_re[i].numpy(), rtol=1e-6) diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py b/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py new file mode 100644 index 00000000000..7c296c7e40e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py @@ -0,0 +1,31 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle.fluid as fluid + +from test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestDygraphGroupSharded(TestMultipleGpus): + + # check group sharded logic as well as the accuracy with single mode + def test_dygraph_group_sharded(self): + self.run_mnist_2gpu('dygraph_group_sharded_api.py') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index 94b8bd29b2c..f2d41b5e9b1 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -46,6 +46,10 @@ def _build_saved_state_dict(state_dict): if value.type == core.VarDesc.VarType.VOCAB: save_dict[key] = value.value().get_map_tensor() else: + if not value.value().get_tensor()._is_initialized(): + raise ValueError( + "The saved tensor is not initialized. If you used group sharded, please use save_group_sharded_model." + ) save_dict[key] = value.numpy() name_table[key] = value.name else: @@ -466,7 +470,9 @@ def _parse_load_result(obj, return_numpy): def _save_lod_tensor(tensor, file_name): if not tensor._is_initialized(): - raise ValueError("The saved tensor is not initialized.") + raise ValueError( + "The saved tensor is not initialized. 
If you used group sharded, please use save_group_sharded_model firstly." + ) if _is_file_path(file_name): _seek = core.save_lod_tensor(tensor, file_name) # '_seek' is the end position of this tensor in the file. diff --git a/python/setup.py.in b/python/setup.py.in index 118f617361f..3ce22892b6e 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -280,6 +280,7 @@ packages=['paddle', 'paddle.incubate.nn', 'paddle.incubate.passes', 'paddle.distribution', + 'paddle.distributed.sharding', 'paddle.distributed.fleet', 'paddle.distributed.fleet.base', 'paddle.distributed.fleet.elastic', -- GitLab From 63fb0347eb9dc72aafdff654a250e22be333de1e Mon Sep 17 00:00:00 2001 From: zyfncg Date: Wed, 9 Mar 2022 19:06:55 +0800 Subject: [PATCH 149/261] [PHI] Fix some bug of code auto-gen in C++ API (#40262) * support code auto-gene for sparse backward api * fix bug of intermediate api and name of return var --- .../final_state_generator/eager_gen.py | 2 +- paddle/phi/api/CMakeLists.txt | 2 +- paddle/phi/api/lib/CMakeLists.txt | 12 ++++- python/paddle/utils/code_gen/api_base.py | 18 +++++-- python/paddle/utils/code_gen/api_gen.py | 53 ++++++++++++++++--- .../paddle/utils/code_gen/backward_api_gen.py | 16 +++--- .../paddle/utils/code_gen/sparse_api_gen.py | 12 ++--- .../utils/code_gen/sparse_bw_api_gen.py | 16 +++--- 8 files changed, 92 insertions(+), 39 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index b594faa80a8..f56cf8ef24c 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -671,7 +671,7 @@ def GenerateNodeCreationCodes( else: # Tuple api_result if IsPlainTensorType(rtype): - outputs_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);" + output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);" else: assert IsVectorTensorType(rtype) output_autograd_meta = f" std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);\n" diff --git a/paddle/phi/api/CMakeLists.txt b/paddle/phi/api/CMakeLists.txt index d632db046d1..a1b0af609ca 100644 --- a/paddle/phi/api/CMakeLists.txt +++ b/paddle/phi/api/CMakeLists.txt @@ -1,2 +1,2 @@ add_subdirectory(lib) -cc_library(phi_api SRCS all.cc DEPS phi_function_api phi_bw_function_api sparse_api) +cc_library(phi_api SRCS all.cc DEPS phi_function_api phi_bw_function_api sparse_api sparse_bw_api) diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 926ddf8ba49..42bf7a8103f 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -37,8 +37,16 @@ set(sparse_api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_ set(sparse_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml) set(sparse_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/sparse_api.h) set(sparse_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/sparse_api.cc) -set(sparse_api_header_file_tmp ${api_header_file}.tmp) -set(sparse_api_source_file_tmp ${api_source_file}.tmp) +set(sparse_api_header_file_tmp ${sparse_api_header_file}.tmp) +set(sparse_api_source_file_tmp ${sparse_api_source_file}.tmp) + +# sparse bw api file +set(sparse_bw_api_gen_file 
${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api_gen.py) +set(sparse_bw_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml) +set(sparse_bw_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/backward/sparse_bw_api.h) +set(sparse_bw_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/sparse_bw_api.cc) +set(sparse_bw_api_header_file_tmp ${sparse_bw_api_header_file}.tmp) +set(sparse_bw_api_source_file_tmp ${sparse_bw_api_source_file}.tmp) # sparse bw api file set(sparse_bw_api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api_gen.py) diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index 68127fb522c..fe68548a22a 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -301,12 +301,12 @@ class BaseAPI(object): def gene_api_declaration(self): api_declaration = f""" -PADDLE_API {self.outputs['return_type']} {self.get_api_func_name()}({self.args_str['args_declare']}); +PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name()}({self.args_str['args_declare']}); """ if self.is_base_api and self.inplace_map is not None: api_declaration = api_declaration + f""" -PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self.args_str['args_declare']}); +PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self.args_str['args_declare']}); """ return api_declaration @@ -675,6 +675,14 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. return input_tensor_code, kernel_args[:-2], kernel_signature + # Override by child class + def gene_return_type_code(self): + return self.outputs['return_type'] + + # Override by child class + def gene_return_code(self): + return "api_output" + # Override by child class def gene_output(self, output_type_list, @@ -703,7 +711,7 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. {code_indent} auto* kernel_fn = kernel.GetVariadicKernelFn(); {code_indent} (*kernel_fn)({kernel_args}, {outputs_args}); -{code_indent} return out;""" +{code_indent} return {self.gene_return_code()};""" def gen_selected_rows_kernel_code(self, code_indent, inplace_flag=False): input_tensors, kernel_args, kernel_signature = self.get_selected_rows_kernel_args( @@ -726,12 +734,12 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. 
{code_indent} auto* kernel_fn = kernel.GetVariadicKernelFn(); {code_indent} (*kernel_fn)({kernel_args}, {outputs_args}); -{code_indent} return out;""" +{code_indent} return {self.gene_return_code()};""" def gene_base_api_code(self, inplace_flag=False): api_func_name = self.get_api_func_name() + ('_' if inplace_flag else '') api_code = f""" -PADDLE_API {self.outputs['return_type']} {api_func_name}({self.args_str["args_define"]}) {{ +PADDLE_API {self.gene_return_type_code()} {api_func_name}({self.args_str["args_define"]}) {{ {self.gene_kernel_select()} """ diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index 1bdfa8b6697..058cc08465f 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -23,7 +23,8 @@ from api_base import BaseAPI class ForwardAPI(BaseAPI): def __init__(self, api_item_yaml): super(ForwardAPI, self).__init__(api_item_yaml) - self.is_dygraph_api = self.parse_intermediate(api_item_yaml) + self.is_dygraph_api, self.intermediate_outs = self.parse_intermediate( + api_item_yaml) def get_api_func_name(self): if self.is_dygraph_api: @@ -33,15 +34,47 @@ class ForwardAPI(BaseAPI): def parse_intermediate(self, api_item_yaml): if 'intermediate' in api_item_yaml: - return True + intermediate_outs = [ + item.strip() + for item in api_item_yaml['intermediate'].split(',') + ] + return True, intermediate_outs else: - return False + return False, [] def get_return_type(self, out_type_list): return out_type_list[0] if len( out_type_list) == 1 else "std::tuple<" + ",".join( out_type_list) + ">" + def gene_return_type_code(self): + if self.is_dygraph_api or len(self.intermediate_outs) == 0: + return self.outputs['return_type'] + else: + return_out_list = [] + for i, name in enumerate(self.outputs['names']): + if name not in self.intermediate_outs: + return_out_list.append(self.outputs['types'][i]) + return return_out_list[0] if len( + return_out_list) == 1 else "std::tuple<" + ",".join( + return_out_list) + ">" + + def gene_return_code(self): + if self.is_dygraph_api or len(self.intermediate_outs) == 0: + return "api_output" + else: + return_out_list = [] + for i, name in enumerate(self.outputs['names']): + if name not in self.intermediate_outs: + return_out_list.append(i) + if len(return_out_list) == 1: + return f"std::get<{return_out_list[0]}>(api_output)" + else: + selected_code = [ + f"std::get<{i}>(api_output)" for i in return_out_list + ] + return '{' + ", ".join(selected_code) + '}' + def gene_output(self, output_type_list, set_out_func, @@ -58,12 +91,12 @@ class ForwardAPI(BaseAPI): 0]] if inplace_flag and self.inplace_map is not None and self.outputs[ 'names'][0] in self.inplace_map else "" output_create = f""" -{code_indent} {self.outputs['return_type']} out{inplace_assign}; -{code_indent} auto kernel_out = {set_out_func}(kernel_backend, &out);""" +{code_indent} {self.outputs['return_type']} api_output{inplace_assign}; +{code_indent} auto kernel_out = {set_out_func}(kernel_backend, &api_output);""" elif len(output_type_list) > 1: output_create = f""" -{code_indent} {self.outputs['return_type']} out;""" +{code_indent} {self.outputs['return_type']} api_output;""" for i in range(len(output_type_list)): kernel_output = kernel_output + f'kernel_out_{i}, ' @@ -71,10 +104,10 @@ class ForwardAPI(BaseAPI): if inplace_flag and self.inplace_map is not None and self.outputs[ 'names'][i] in self.inplace_map: output_create = output_create + f""" -{code_indent} std::get<{i}>(out) = 
{self.inplace_map[self.outputs['names'][i]]};""" +{code_indent} std::get<{i}>(api_output) = {self.inplace_map[self.outputs['names'][i]]};""" output_create = output_create + f""" -{code_indent} auto kernel_out_{i} = {set_out_func}(kernel_backend, &std::get<{i}>(out));""" +{code_indent} auto kernel_out_{i} = {set_out_func}(kernel_backend, &std::get<{i}>(api_output));""" kernel_output = kernel_output[:-2] else: @@ -169,6 +202,10 @@ def generate_api(api_yaml_path, header_file_path, source_file_path, if foward_api.is_dygraph_api: dygraph_header_file.write(foward_api.gene_api_declaration()) dygraph_source_file.write(foward_api.gene_api_code()) + + foward_api.is_dygraph_api = False + header_file.write(foward_api.gene_api_declaration()) + source_file.write(foward_api.gene_api_code()) else: header_file.write(foward_api.gene_api_declaration()) source_file.write(foward_api.gene_api_code()) diff --git a/python/paddle/utils/code_gen/backward_api_gen.py b/python/paddle/utils/code_gen/backward_api_gen.py index 7bd488cc114..7417d6bb030 100644 --- a/python/paddle/utils/code_gen/backward_api_gen.py +++ b/python/paddle/utils/code_gen/backward_api_gen.py @@ -87,33 +87,33 @@ class BackwardAPI(BaseAPI): 0]] if inplace_flag and self.inplace_map is not None and self.outputs[ 'names'][0] in self.inplace_map else "" output_create = f""" -{code_indent} {self.outputs['return_type']} out{inplace_assign}; -{code_indent} auto kernel_out = {set_out_func}(kernel_backend, &out);""" +{code_indent} {self.outputs['return_type']} api_output{inplace_assign}; +{code_indent} auto kernel_out = {set_out_func}(kernel_backend, &api_output);""" elif len(output_type_list) > 1: output_create = f""" -{code_indent} {self.outputs['return_type']} out({len(output_type_list)});""" +{code_indent} {self.outputs['return_type']} api_output({len(output_type_list)});""" for i, out_type_item in enumerate(output_type_list): kernel_output = kernel_output + f'kernel_out_{i}, ' output_names.append(f'kernel_out_{i}') if out_type_item == 'Tensor': - get_out_code = f'&out[{i}][0]' + get_out_code = f'&api_output[{i}][0]' if inplace_flag and self.inplace_map is not None and self.outputs[ 'names'][i] in self.inplace_map: output_create = output_create + f""" -{code_indent} out[{i}].emplace_back({self.inplace_map[self.outputs['names'][i]]});""" +{code_indent} api_output[{i}].emplace_back({self.inplace_map[self.outputs['names'][i]]});""" else: output_create = output_create + f""" -{code_indent} out[{i}].emplace_back();""" +{code_indent} api_output[{i}].emplace_back();""" else: - get_out_code = f'&out[{i}]' + get_out_code = f'&api_output[{i}]' if inplace_flag and self.inplace_map is not None and self.outputs[ 'names'][i] in self.inplace_map: output_create = output_create + f""" -{code_indent} out[{i}] = {self.inplace_map[self.outputs['names'][i]]};""" +{code_indent} api_output[{i}] = {self.inplace_map[self.outputs['names'][i]]};""" output_create = output_create + f""" {code_indent} auto kernel_out_{i} = {set_out_func}(kernel_backend, {get_out_code});""" diff --git a/python/paddle/utils/code_gen/sparse_api_gen.py b/python/paddle/utils/code_gen/sparse_api_gen.py index d845653f488..8ba090f8ca8 100644 --- a/python/paddle/utils/code_gen/sparse_api_gen.py +++ b/python/paddle/utils/code_gen/sparse_api_gen.py @@ -60,12 +60,12 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name()}({self.args_s 0]] if inplace_flag and self.inplace_map is not None and self.outputs[ 'names'][0] in self.inplace_map else "" output_create = f""" - {self.outputs['return_type']} 
out{inplace_assign}; - auto* kernel_out = {set_out_func}(&out, {self.get_kernel_tensor_out_type(self.outputs['names'][0])});""" + {self.outputs['return_type']} api_output{inplace_assign}; + auto* kernel_out = {set_out_func}(&api_output, {self.get_kernel_tensor_out_type(self.outputs['names'][0])});""" elif len(output_type_list) > 1: output_create = f""" - {self.outputs['return_type']} out;""" + {self.outputs['return_type']} api_output;""" for i in range(len(output_type_list)): kernel_output = kernel_output + f'kernel_out_{i}, ' @@ -73,10 +73,10 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name()}({self.args_s if inplace_flag and self.inplace_map is not None and self.outputs[ 'names'][i] in self.inplace_map: output_create = output_create + f""" - std::get<{i}>(out) = {self.inplace_map[self.outputs['names'][i]]};""" + std::get<{i}>(api_output) = {self.inplace_map[self.outputs['names'][i]]};""" output_create = output_create + f""" - auto* kernel_out_{i} = {set_out_func}(&std::get<{i}>(out), {self.get_kernel_tensor_out_type(self.outputs['names'][i])});""" + auto* kernel_out_{i} = {set_out_func}(&std::get<{i}>(api_output), {self.get_kernel_tensor_out_type(self.outputs['names'][i])});""" kernel_output = kernel_output[:-2] else: @@ -155,7 +155,7 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name()}({self.args_s {kernel_context_code} phi_kernel(&kernel_context); - return out;""" + return api_output;""" def gene_base_api_code(self, inplace_flag=False): api_func_name = self.get_api_func_name() diff --git a/python/paddle/utils/code_gen/sparse_bw_api_gen.py b/python/paddle/utils/code_gen/sparse_bw_api_gen.py index 6ef294caa14..ff87968f86d 100644 --- a/python/paddle/utils/code_gen/sparse_bw_api_gen.py +++ b/python/paddle/utils/code_gen/sparse_bw_api_gen.py @@ -53,33 +53,33 @@ class SparseBackwardAPI(SparseAPI, BackwardAPI): 0]] if inplace_flag and self.inplace_map is not None and self.outputs[ 'names'][0] in self.inplace_map else "" output_create = f""" - {self.outputs['return_type']} out{inplace_assign}; - auto kernel_out = {set_out_func}(&out, {self.get_kernel_tensor_out_type(self.outputs['names'][0])});""" + {self.outputs['return_type']} api_output{inplace_assign}; + auto kernel_out = {set_out_func}(&api_output, {self.get_kernel_tensor_out_type(self.outputs['names'][0])});""" elif len(output_type_list) > 1: output_create = f""" - {self.outputs['return_type']} out({len(output_type_list)});""" + {self.outputs['return_type']} api_output({len(output_type_list)});""" for i, out_type_item in enumerate(output_type_list): kernel_output = kernel_output + f'kernel_out_{i}, ' output_names.append(f'kernel_out_{i}') if out_type_item == 'Tensor': - get_out_code = f'&out[{i}][0]' + get_out_code = f'&api_output[{i}][0]' if inplace_flag and self.inplace_map is not None and self.outputs[ 'names'][i] in self.inplace_map: output_create = output_create + f""" - out[{i}].emplace_back({self.inplace_map[self.outputs['names'][i]]});""" + api_output[{i}].emplace_back({self.inplace_map[self.outputs['names'][i]]});""" else: output_create = output_create + f""" - out[{i}].emplace_back();""" + api_output[{i}].emplace_back();""" else: - get_out_code = f'&out[{i}]' + get_out_code = f'&api_output[{i}]' if inplace_flag and self.inplace_map is not None and self.outputs[ 'names'][i] in self.inplace_map: output_create = output_create + f""" - out[{i}] = {self.inplace_map[self.outputs['names'][i]]};""" + api_output[{i}] = {self.inplace_map[self.outputs['names'][i]]};""" output_create = output_create + 
f""" auto kernel_out_{i} = {set_out_func}({get_out_code}, {self.get_kernel_tensor_out_type(self.outputs['names'][i])});""" -- GitLab From cd28cddbfb5f5643947291e9a640ecd414dc8dae Mon Sep 17 00:00:00 2001 From: zyfncg Date: Wed, 9 Mar 2022 20:11:10 +0800 Subject: [PATCH 150/261] [PHI] Move set_value kernel to phi (#40195) * save code * fix bug of set_value * add coverage test --- paddle/fluid/framework/operator.cc | 65 +- paddle/fluid/framework/operator.h | 4 +- paddle/fluid/imperative/execution_context.h | 5 + paddle/fluid/imperative/prepared_operator.h | 61 +- paddle/fluid/operators/set_value_op.cc | 7 - paddle/fluid/operators/set_value_op.cu | 7 - paddle/fluid/operators/set_value_op.h | 195 ----- paddle/phi/core/kernel_utils.h | 1 + paddle/phi/kernels/cpu/set_value_kernel.cc | 38 + paddle/phi/kernels/gpu/set_value_kernel.cu | 38 + .../phi/kernels/impl/set_value_kernel_impl.h | 337 ++++++++ paddle/phi/kernels/set_value_kernel.h | 49 ++ paddle/phi/ops/compat/set_value_sig.cc | 736 ++++++++++++++++++ paddle/phi/tests/ops/test_op_signature.cc | 370 +++++++++ 14 files changed, 1701 insertions(+), 212 deletions(-) create mode 100644 paddle/phi/kernels/cpu/set_value_kernel.cc create mode 100644 paddle/phi/kernels/gpu/set_value_kernel.cu create mode 100644 paddle/phi/kernels/impl/set_value_kernel_impl.h create mode 100644 paddle/phi/kernels/set_value_kernel.h create mode 100644 paddle/phi/ops/compat/set_value_sig.cc diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index eff6d9a9102..f8e30c1ee29 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -539,6 +539,20 @@ bool ExecutionContext::HasInput(const std::string& name) const { return var != nullptr; } +bool ExecutionContext::HasInputs(const std::string& name) const { + const auto& ins = ctx_.inputs; + auto it = ins.find(name); + if (it == ins.end() || it->second.empty()) { + return false; + } + for (const auto* input : it->second) { + if (input == nullptr) { + return false; + } + } + return true; +} + bool ExecutionContext::HasOutput(const std::string& name) const { auto* var = OutputVar(name); return var != nullptr; @@ -2189,6 +2203,51 @@ void OperatorWithKernel::BuildPhiKernelContext( std::move(experimental::MakePhiScalarFromVar(*ins_vector.front()))); } + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + auto& attr = Attrs().at(attr_names[i]); + if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = 
BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector when " + "construct KernelContext.", + attr_names[i])); + } } else { // TODO(chenweihang): support other attrs later auto& attr = Attrs().at(attr_names[i]); @@ -2212,7 +2271,11 @@ void OperatorWithKernel::BuildPhiKernelContext( } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { + std::type_index(typeid(std::vector))) { + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { // Emplace Back Attr according to the type of Phi_Kernel args. const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); const std::vector vector_int64_attr(vector_int_attr.begin(), diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index e33d4feb82a..1a1171f1dba 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -295,6 +295,8 @@ class ExecutionContext { virtual bool HasInput(const std::string& name) const; + virtual bool HasInputs(const std::string& name) const; + virtual bool HasOutput(const std::string& name) const; virtual size_t InputSize(const std::string& name) const { @@ -449,7 +451,7 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext { : ctx_(ctx) {} bool HasInput(const std::string& name) const override { - return ctx_.HasInput(name); + return ctx_.HasInputs(name); } bool HasOutput(const std::string& name) const override { diff --git a/paddle/fluid/imperative/execution_context.h b/paddle/fluid/imperative/execution_context.h index fe5ac73b004..fbc47f81fd3 100644 --- a/paddle/fluid/imperative/execution_context.h +++ b/paddle/fluid/imperative/execution_context.h @@ -133,6 +133,11 @@ class DygraphExecutionContext : public framework::ExecutionContext { return (it != var_map_in_.end() && it->second.size() > 0); } + bool HasInputs(const std::string& name) const override { + auto it = var_map_in_.find(name); + return (it != var_map_in_.end() && it->second.size() > 0); + } + bool HasOutput(const std::string& name) const override { auto it = var_map_out_.find(name); return (it != var_map_out_.end() && it->second.size() > 0); diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 30dbe07d7af..d7c0c8cc547 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -332,6 +332,7 @@ void BuildDygraphPhiKernelContext( } for (size_t i = 0; i < attr_names.size(); ++i) { + VLOG(1) << "############## attr_name: " << i << " : " << attr_names[i]; if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) { if (attrs.find(attr_names[i]) != attrs.end()) { // shape is in the attribute @@ -409,6 +410,60 @@ void BuildDygraphPhiKernelContext( experimental::MakePhiScalarFromVar(ins_vector[0]->Var()))); } + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); + if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + 
std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector when " + "construct KernelContext.", + attr_names[i])); + } } else { // TODO(chenweihang): support other attrs later auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); @@ -432,7 +487,11 @@ void BuildDygraphPhiKernelContext( } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { + std::type_index(typeid(std::vector))) { + kernel_ctx->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { // Emplace Back Attr according to the type of Phi_Kernel args. 
const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); const std::vector vector_int64_attr(vector_int_attr.begin(), diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index ec3e04e71fa..7d0d782b837 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -241,13 +241,6 @@ REGISTER_OPERATOR(set_value, ops::SetValue, ops::SetValueMaker, ops::SetValueGradMaker, ops::SetValueOpInplaceInferer); -REGISTER_OP_CPU_KERNEL( - set_value, ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel); - REGISTER_OPERATOR(set_value_grad, ops::SetValueGrad); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/set_value_op.cu b/paddle/fluid/operators/set_value_op.cu index f9701b0acaa..9f291a863c0 100644 --- a/paddle/fluid/operators/set_value_op.cu +++ b/paddle/fluid/operators/set_value_op.cu @@ -16,13 +16,6 @@ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - set_value, ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel); - REGISTER_OP_CUDA_KERNEL( set_value_grad, ops::SetValueGradKernel, diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h index 9dd72795920..4d459f8c01b 100644 --- a/paddle/fluid/operators/set_value_op.h +++ b/paddle/fluid/operators/set_value_op.h @@ -121,201 +121,6 @@ inline void CheckIsDimsMatch(const framework::DDim first, "of target shape: %d, but now shape is %d.", second.to_str(), first.to_str())); } - -template -class SetValueKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const int rank = ctx.Input("Input")->dims().size(); - - // TODO(liym27): A more elegent code to do this. C++ has to make template - // integer as constant, but we had better have alternative writing in the - // future. 
- switch (rank) { - case 1: - SetValueCompute<1>(ctx); - break; - case 2: - SetValueCompute<2>(ctx); - break; - case 3: - SetValueCompute<3>(ctx); - break; - case 4: - SetValueCompute<4>(ctx); - break; - case 5: - SetValueCompute<5>(ctx); - break; - case 6: - SetValueCompute<6>(ctx); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "The rank of input should be less than 7, but received %d.", rank)); - } - } - - private: - template - void SetValueCompute(const framework::ExecutionContext& ctx) const { - auto* in = ctx.Input("Input"); - auto* value_tensor = ctx.Input("ValueTensor"); - auto* out = ctx.Output("Out"); - - auto starts_tensor_list = - ctx.MultiInput("StartsTensorList"); - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); - auto steps_tensor_list = - ctx.MultiInput("StepsTensorList"); - - auto axes = ctx.Attr>("axes"); - auto starts = ctx.Attr>("starts"); - auto ends = ctx.Attr>("ends"); - auto steps = ctx.Attr>("steps"); - auto shape = ctx.Attr>("shape"); - auto decrease_axes = ctx.Attr>("decrease_axes"); - auto none_axes = ctx.Attr>("none_axes"); - - if (!starts_tensor_list.empty()) { - starts = GetDataFromTensorList(starts_tensor_list); - } - if (!ends_tensor_list.empty()) { - ends = GetDataFromTensorList(ends_tensor_list); - } - if (!steps_tensor_list.empty()) { - steps = GetDataFromTensorList(steps_tensor_list); - } - - auto in_dims = in->dims(); - CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &steps); - auto slice_dims = GetSliceDims(in_dims, axes, starts, ends, &steps); - auto decrease_slice_dims = GetDecreasedDims(slice_dims, decrease_axes); - - auto slice_dims_for_assign = decrease_slice_dims; - if (!none_axes.empty()) { - std::vector slice_dims_with_none; - - size_t none_axes_cur = 0, decrease_axes_cur = 0; - for (int i = 0; i < slice_dims.size(); ++i) { - while (none_axes_cur < none_axes.size() && - none_axes[none_axes_cur] <= i) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - if (decrease_axes_cur < decrease_axes.size() && - decrease_axes[decrease_axes_cur] == i) { - decrease_axes_cur++; - } else { - slice_dims_with_none.push_back(slice_dims[i]); - } - } - while (none_axes_cur < none_axes.size()) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - - slice_dims_for_assign = phi::make_ddim(slice_dims_with_none); - } - - auto place = ctx.GetPlace(); - auto& eigen_place = - *ctx.template device_context().eigen_device(); - - // Here copy data from input to avoid data loss at PE and Graph level. - // TODO(liym27): Speed up in the future version. - // - Q: Why don't call ShareDataWith to speed up? - // - A: Because it's not supported to ShareDataWith on OP's input and output - // https://github.com/PaddlePaddle/Paddle/wiki/ShareDataWith-and-ShareBufferWith-are-prohibited-in-OP - // - Q: Why don't delete Input, after all, the input and output are the same - // Tensor at program level? - // - A: If deleting Input, the graph will be complex, such as there will - // be two ops points to the output in graph: op1 -> output <- set_value. - // In this case, we have to find a way to handle the running order of - // set_value is what we want. 
- paddle::framework::TensorCopy(*in, place, out); - - Tensor slice_tensor(in->dtype()), pad_tensor(in->dtype()); - slice_tensor.mutable_data(slice_dims, place); - pad_tensor.mutable_data(in_dims, place); - - auto pad_e = framework::EigenTensor::From(pad_tensor, in_dims); - auto out_e = framework::EigenTensor::From(*out); - auto slice_e = framework::EigenTensor::From(slice_tensor, slice_dims); - - // Step 1: Set the value of out at `_index` to zero - slice_e.device(eigen_place) = slice_e.constant(T(0)); - - auto starts_indices = Eigen::DSizes(); - auto ends_indices = Eigen::DSizes(); - auto strides_indices = Eigen::DSizes(); - - for (size_t i = 0; i < D; ++i) { - starts_indices[i] = 0; - ends_indices[i] = slice_dims[i]; - strides_indices[i] = 1; - } - for (size_t i = 0; i < axes.size(); i++) { - int axis_index = axes[i]; - starts_indices[axis_index] = starts[i]; - ends_indices[axis_index] = ends[i]; - strides_indices[axis_index] = steps[i]; - if (starts[i] == ends[i]) { // slice is empty, data will not be changed - return; - } - } - - out_e.stridedSlice(starts_indices, ends_indices, strides_indices) - .device(eigen_place) = slice_e; - - // Step 2: Set a tensor with the same shape as out tensor. And its data at - // '_index' is the same as value_tensor, and data out of '_index' to zero - - // - Step 2.1 Set slice tensor with value - - // NOTE(liym27): [ Why resize slice_tensor here? ] - // A: When do broadcasting on slice_tensor and value_tensor, the shape of - // slice_tensor should be decreased dims. - // e.g. - // x[:,0] = value_tensor - // x's shape = [3, 4], value_tensor's shape = [3] - // We get slice_dims = [3, 1], decrease_slice_dims = [3] - // If do broadcasting on Tensor with shape [3, 1] and [3], the result's - // shape is [3, 3], which cross the border; - // If do broadcasting on Tensor with shape [3] and [3], the result's shape - // is [3], which is right. 
- - slice_tensor.Resize(slice_dims_for_assign); - if (value_tensor != nullptr) { - CheckIsDimsMatch(slice_dims_for_assign, value_tensor->dims()); - // ElementwiseComputeEx can do broadcasting - ElementwiseComputeEx, DeviceContext, T>( - ctx, &slice_tensor, value_tensor, -1, SubFunctor(), &slice_tensor); - } else { - Tensor value_t(in->dtype()); - auto value_dims = phi::make_ddim(shape); - CheckIsDimsMatch(slice_dims_for_assign, value_dims); - - value_t.mutable_data(value_dims, place); - auto value_name = - GetValueName(framework::TransToProtoVarType(in->dtype())); - CopyVecotorToTensor(value_name.c_str(), &value_t, ctx); - value_t.Resize(value_dims); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &slice_tensor, &value_t, -1, SubFunctor(), &slice_tensor); - } - slice_tensor.Resize(slice_dims); - - // - Step 2.2 Pad slice tensor with 0 - pad_e.device(eigen_place) = pad_e.constant(T(0)); - pad_e.stridedSlice(starts_indices, ends_indices, strides_indices) - .device(eigen_place) = slice_e; - - // Step 3: Set out tensor with value_tensor - out_e.device(eigen_place) = out_e - pad_e; - } -}; - template class SetValueGradKernel : public framework::OpKernel { public: diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index baa549d7a66..2cc82772cf8 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -252,6 +252,7 @@ struct KernelImpl { PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); /* Output Helpers */ diff --git a/paddle/phi/kernels/cpu/set_value_kernel.cc b/paddle/phi/kernels/cpu/set_value_kernel.cc new file mode 100644 index 00000000000..dcf278cd94e --- /dev/null +++ b/paddle/phi/kernels/cpu/set_value_kernel.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/set_value_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/set_value_kernel_impl.h" + +PD_REGISTER_KERNEL(set_value, + CPU, + ALL_LAYOUT, + phi::SetValueKernel, + float, + double, + int, + int64_t, + bool) {} +PD_REGISTER_KERNEL(set_value_with_tensor, + CPU, + ALL_LAYOUT, + phi::SetTensorValueKernel, + float, + double, + int, + int64_t, + bool) {} diff --git a/paddle/phi/kernels/gpu/set_value_kernel.cu b/paddle/phi/kernels/gpu/set_value_kernel.cu new file mode 100644 index 00000000000..f788da010b6 --- /dev/null +++ b/paddle/phi/kernels/gpu/set_value_kernel.cu @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/set_value_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/set_value_kernel_impl.h" + +PD_REGISTER_KERNEL(set_value, + GPU, + ALL_LAYOUT, + phi::SetValueKernel, + float, + double, + int, + int64_t, + bool) {} +PD_REGISTER_KERNEL(set_value_with_tensor, + GPU, + ALL_LAYOUT, + phi::SetTensorValueKernel, + float, + double, + int, + int64_t, + bool) {} diff --git a/paddle/phi/kernels/impl/set_value_kernel_impl.h b/paddle/phi/kernels/impl/set_value_kernel_impl.h new file mode 100644 index 00000000000..5aebffe51b5 --- /dev/null +++ b/paddle/phi/kernels/impl/set_value_kernel_impl.h @@ -0,0 +1,337 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" + +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/slice_utils.h" + +namespace phi { + +// check whether the tensor with dimension of second can assign to the +// tensor with dimension of first +inline void CheckIsDimsMatch(const DDim& first, const DDim& second) { + int ignore_axis1 = 0, ignore_axis2 = 0; + for (; ignore_axis1 < first.size(); ++ignore_axis1) { + if (first[ignore_axis1] != 1) { + break; + } + } + for (; ignore_axis2 < second.size(); ++ignore_axis2) { + if (second[ignore_axis2] != 1) { + break; + } + } + + if (second.size() == ignore_axis2) { + // second tensor has only one value + return; + } + + if (first.size() - ignore_axis1 >= second.size() - ignore_axis2) { + auto idx1 = first.size() - 1; + auto idx2 = second.size() - 1; + bool is_match = true; + for (; idx2 >= ignore_axis2; idx2--) { + if (first[idx1--] != second[idx2] && second[idx2] != 1) { + is_match = false; + break; + } + } + if (is_match) { + return; + } + } + PADDLE_THROW(errors::InvalidArgument( + "The shape of tensor assigned value must match the shape " + "of target shape: %d, but now shape is %d.", + second.to_str(), + first.to_str())); +} + +template +void SetValueImpl(const Context& dev_ctx, + const DenseTensor& in, + const DenseTensor& value, + const ScalarArray& starts, + const ScalarArray& ends, + const ScalarArray& steps, + const std::vector& axes, + const std::vector& decrease_axes, + const std::vector& none_axes, + DenseTensor* out) { + auto in_dims = in.dims(); + std::vector starts_local = starts.GetData(); + std::vector ends_local = ends.GetData(); + std::vector steps_local = steps.GetData(); + paddle::operators::CheckAndUpdateSliceAttrs( + in_dims, axes, &starts_local, &ends_local, &steps_local); + auto slice_dims = paddle::operators::GetSliceDims( + in_dims, axes, starts_local, ends_local, &steps_local); + auto decrease_slice_dims = + paddle::operators::GetDecreasedDims(slice_dims, decrease_axes); + + auto slice_dims_for_assign = decrease_slice_dims; + if (!none_axes.empty()) { + std::vector slice_dims_with_none; + + size_t none_axes_cur = 0, decrease_axes_cur = 0; + for (int i = 0; i < slice_dims.size(); ++i) { + while (none_axes_cur < none_axes.size() && + none_axes[none_axes_cur] <= i) { + slice_dims_with_none.push_back(1); + none_axes_cur++; + } + if (decrease_axes_cur < decrease_axes.size() && + decrease_axes[decrease_axes_cur] == i) { + decrease_axes_cur++; + } else { + slice_dims_with_none.push_back(slice_dims[i]); + } + } + while (none_axes_cur < none_axes.size()) { + slice_dims_with_none.push_back(1); + none_axes_cur++; + } + + slice_dims_for_assign = phi::make_ddim(slice_dims_with_none); + } + + auto place = dev_ctx.GetPlace(); + auto& eigen_place = *dev_ctx.eigen_device(); + + // Here copy data from input to avoid data loss at PE and Graph level. + // TODO(liym27): Speed up in the future version. + // - Q: Why don't call ShareDataWith to speed up? 
+ // - A: Because it's not supported to ShareDataWith on OP's input and output + // https://github.com/PaddlePaddle/Paddle/wiki/ShareDataWith-and-ShareBufferWith-are-prohibited-in-OP + // - Q: Why don't delete Input, after all, the input and output are the same + // Tensor at program level? + // - A: If deleting Input, the graph will be complex, such as there will + // be two ops points to the output in graph: op1 -> output <- set_value. + // In this case, we have to find a way to handle the running order of + // set_value is what we want. + Copy(dev_ctx, in, place, false, out); + + DenseTensor slice_tensor = + Empty(dev_ctx, ScalarArray{slice_dims.Get(), slice_dims.size()}); + DenseTensor pad_tensor = + Empty(dev_ctx, ScalarArray{in_dims.Get(), in_dims.size()}); + + auto pad_e = EigenTensor::From(pad_tensor, in_dims); + auto out_e = EigenTensor::From(*out); + auto slice_e = EigenTensor::From(slice_tensor, slice_dims); + + // Step 1: Set the value of out at `_index` to zero + slice_e.device(eigen_place) = slice_e.constant(T(0)); + + auto starts_indices = Eigen::DSizes(); + auto ends_indices = Eigen::DSizes(); + auto strides_indices = Eigen::DSizes(); + + for (size_t i = 0; i < RANK; ++i) { + starts_indices[i] = 0; + ends_indices[i] = slice_dims[i]; + strides_indices[i] = 1; + } + for (size_t i = 0; i < axes.size(); i++) { + int axis_index = axes[i]; + starts_indices[axis_index] = starts_local[i]; + ends_indices[axis_index] = ends_local[i]; + strides_indices[axis_index] = steps_local[i]; + if (starts_local[i] == + ends_local[i]) { // slice is empty, data will not be changed + return; + } + } + + out_e.stridedSlice(starts_indices, ends_indices, strides_indices) + .device(eigen_place) = slice_e; + + // Step 2: Set a tensor with the same shape as out tensor. And its data at + // '_index' is the same as value, and data out of '_index' to zero + + // - Step 2.1 Set slice tensor with value + + // NOTE(liym27): [ Why resize slice_tensor here? ] + // A: When do broadcasting on slice_tensor and value, the shape of + // slice_tensor should be decreased dims. + // e.g. + // x[:,0] = value + // x's shape = [3, 4], value's shape = [3] + // We get slice_dims = [3, 1], decrease_slice_dims = [3] + // If do broadcasting on Tensor with shape [3, 1] and [3], the result's + // shape is [3, 3], which cross the border; + // If do broadcasting on Tensor with shape [3] and [3], the result's shape + // is [3], which is right. 
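  // Worked shape trace for the note above, following its own example
  // x[:, 0] = value with x of shape [3, 4] and value of shape [3]:
  //   in_dims              = [3, 4]
  //   slice_dims           = [3, 1]   (used for the strided-slice indices)
  //   decrease_slice_dims  = [3]      (becomes slice_dims_for_assign)
  //   value.dims()         = [3]
  // Broadcasting [3] against [3] keeps the result at [3], which matches the
  // slice; broadcasting [3, 1] against [3] would give [3, 3] and cross the
  // border, which is why slice_tensor is resized before the subtraction below.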
+ + slice_tensor.Resize(slice_dims_for_assign); + CheckIsDimsMatch(slice_dims_for_assign, value.dims()); + // ElementwiseComputeEx can do broadcasting + funcs::ElementwiseCompute, T>( + dev_ctx, + slice_tensor, + value, + -1, + funcs::SubtractFunctor(), + &slice_tensor); + + slice_tensor.Resize(slice_dims); + + // - Step 2.2 Pad slice tensor with 0 + pad_e.device(eigen_place) = pad_e.constant(T(0)); + pad_e.stridedSlice(starts_indices, ends_indices, strides_indices) + .device(eigen_place) = slice_e; + + // Step 3: Set out tensor with value + out_e.device(eigen_place) = out_e - pad_e; +} + +template +void SetTensorValueKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& value, + const ScalarArray& starts, + const ScalarArray& ends, + const ScalarArray& steps, + const std::vector& axes, + const std::vector& decrease_axes, + const std::vector& none_axes, + DenseTensor* out) { + const int rank = x.dims().size(); + + switch (rank) { + case 1: + SetValueImpl(dev_ctx, + x, + value, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + out); + break; + case 2: + SetValueImpl(dev_ctx, + x, + value, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + out); + break; + case 3: + SetValueImpl(dev_ctx, + x, + value, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + out); + break; + case 4: + SetValueImpl(dev_ctx, + x, + value, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + out); + break; + case 5: + SetValueImpl(dev_ctx, + x, + value, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + out); + break; + case 6: + SetValueImpl(dev_ctx, + x, + value, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + out); + break; + default: + PADDLE_THROW(errors::InvalidArgument( + "The rank of input should be less than 7, but received %d.", rank)); + } +} + +template +void SetValueKernel(const Context& dev_ctx, + const DenseTensor& x, + const ScalarArray& starts, + const ScalarArray& ends, + const ScalarArray& steps, + const std::vector& axes, + const std::vector& decrease_axes, + const std::vector& none_axes, + const std::vector& shape, + const std::vector& values, + DenseTensor* out) { + std::vector assgin_values; + assgin_values.reserve(values.size()); + for (const auto& val : values) { + assgin_values.push_back(val.to()); + } + DenseTensor value_tensor = Empty(dev_ctx, shape); + paddle::framework::TensorFromVector(assgin_values, dev_ctx, &value_tensor); + value_tensor.Resize(phi::make_ddim(shape)); + + SetTensorValueKernel(dev_ctx, + x, + value_tensor, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + out); +} + +} // namespace phi diff --git a/paddle/phi/kernels/set_value_kernel.h b/paddle/phi/kernels/set_value_kernel.h new file mode 100644 index 00000000000..271691b1a35 --- /dev/null +++ b/paddle/phi/kernels/set_value_kernel.h @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
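SetTensorValueKernel above dispatches on the runtime rank by repeating the same SetValueImpl call six times. A sketch of how that switch body could be folded into a local macro; the macro name and the spelled-out template argument order <T, Context, RANK> are illustrative assumptions, not taken verbatim from the patch:

#define SET_VALUE_DISPATCH_CASE(rank)                               \
  case rank:                                                        \
    SetValueImpl<T, Context, rank>(dev_ctx, x, value, starts, ends, \
                                   steps, axes, decrease_axes,      \
                                   none_axes, out);                 \
    break;

  switch (x.dims().size()) {
    SET_VALUE_DISPATCH_CASE(1)
    SET_VALUE_DISPATCH_CASE(2)
    SET_VALUE_DISPATCH_CASE(3)
    SET_VALUE_DISPATCH_CASE(4)
    SET_VALUE_DISPATCH_CASE(5)
    SET_VALUE_DISPATCH_CASE(6)
    default:
      PADDLE_THROW(errors::InvalidArgument(
          "The rank of input should be less than 7, but received %d.",
          x.dims().size()));
  }
#undef SET_VALUE_DISPATCH_CASE

Whether the macro or the explicit repetition reads better is a style call; the patch keeps the explicit switch.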
+ +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/unary.h" + +namespace phi { + +template +void SetTensorValueKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& value, + const ScalarArray& starts, + const ScalarArray& ends, + const ScalarArray& steps, + const std::vector& axes, + const std::vector& decrease_axes, + const std::vector& none_axes, + DenseTensor* out); + +template +void SetValueKernel(const Context& dev_ctx, + const DenseTensor& x, + const ScalarArray& starts, + const ScalarArray& ends, + const ScalarArray& steps, + const std::vector& axes, + const std::vector& decrease_axes, + const std::vector& none_axes, + const std::vector& shape, + const std::vector& values, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/set_value_sig.cc b/paddle/phi/ops/compat/set_value_sig.cc new file mode 100644 index 00000000000..eacfff26d53 --- /dev/null +++ b/paddle/phi/ops/compat/set_value_sig.cc @@ -0,0 +1,736 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorInput("Input")) { + if (ctx.HasInput("StartsTensorList")) { + if (ctx.HasInput("EndsTensorList")) { + if (ctx.HasInput("StepsTensorList")) { + if (ctx.HasInput("ValueTensor")) { + return KernelSignature("set_value_with_tensor", + {"Input", "ValueTensor"}, + {"StartsTensorList", + "EndsTensorList", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes"}, + {"Out"}); + } else if (ctx.HasAttr("fp32_values") && + !paddle::any_cast>( + ctx.Attr("fp32_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"StartsTensorList", + "EndsTensorList", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes", + "shape", + "fp32_values"}, + {"Out"}); + } else if (ctx.HasAttr("fp64_values") && + !paddle::any_cast>( + ctx.Attr("fp64_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"StartsTensorList", + "EndsTensorList", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes", + "shape", + "fp64_values"}, + {"Out"}); + } else if (ctx.HasAttr("int32_values") && + !paddle::any_cast>( + ctx.Attr("int32_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"StartsTensorList", + "EndsTensorList", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes", + "shape", + "int32_values"}, + {"Out"}); + } else if (ctx.HasAttr("int64_values") && + !paddle::any_cast>( + ctx.Attr("int64_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"StartsTensorList", + "EndsTensorList", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes", + "shape", + "int64_values"}, + {"Out"}); + } else if (ctx.HasAttr("bool_values") 
&& + !paddle::any_cast>( + ctx.Attr("bool_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"StartsTensorList", + "EndsTensorList", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes", + "shape", + "bool_values"}, + {"Out"}); + } + } else { + if (ctx.HasInput("ValueTensor")) { + return KernelSignature("set_value_with_tensor", + {"Input", "ValueTensor"}, + {"StartsTensorList", + "EndsTensorList", + "steps", + "axes", + "decrease_axes", + "none_axes"}, + {"Out"}); + } else if (ctx.HasAttr("fp32_values") && + !paddle::any_cast>( + ctx.Attr("fp32_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"StartsTensorList", + "EndsTensorList", + "steps", + "axes", + "decrease_axes", + "none_axes", + "shape", + "fp32_values"}, + {"Out"}); + } else if (ctx.HasAttr("fp64_values") && + !paddle::any_cast>( + ctx.Attr("fp64_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"StartsTensorList", + "EndsTensorList", + "steps", + "axes", + "decrease_axes", + "none_axes", + "shape", + "fp64_values"}, + {"Out"}); + } else if (ctx.HasAttr("int32_values") && + !paddle::any_cast>( + ctx.Attr("int32_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"StartsTensorList", + "EndsTensorList", + "steps", + "axes", + "decrease_axes", + "none_axes", + "shape", + "int32_values"}, + {"Out"}); + } else if (ctx.HasAttr("int64_values") && + !paddle::any_cast>( + ctx.Attr("int64_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"StartsTensorList", + "EndsTensorList", + "steps", + "axes", + "decrease_axes", + "none_axes", + "shape", + "int64_values"}, + {"Out"}); + } else if (ctx.HasAttr("bool_values") && + !paddle::any_cast>( + ctx.Attr("bool_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"StartsTensorList", + "EndsTensorList", + "steps", + "axes", + "decrease_axes", + "none_axes", + "shape", + "bool_values"}, + {"Out"}); + } + } + } else { + if (ctx.HasInput("StepsTensorList")) { + if (ctx.HasInput("ValueTensor")) { + return KernelSignature("set_value_with_tensor", + {"Input", "ValueTensor"}, + {"StartsTensorList", + "ends", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes"}, + {"Out"}); + } else if (ctx.HasAttr("fp32_values") && + !paddle::any_cast>( + ctx.Attr("fp32_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"StartsTensorList", + "ends", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes", + "shape", + "fp32_values"}, + {"Out"}); + } else if (ctx.HasAttr("fp64_values") && + !paddle::any_cast>( + ctx.Attr("fp64_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"StartsTensorList", + "ends", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes", + "shape", + "fp64_values"}, + {"Out"}); + } else if (ctx.HasAttr("int32_values") && + !paddle::any_cast>( + ctx.Attr("int32_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"StartsTensorList", + "ends", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes", + "shape", + "int32_values"}, + {"Out"}); + } else if (ctx.HasAttr("int64_values") && + !paddle::any_cast>( + ctx.Attr("int64_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"StartsTensorList", + "ends", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes", + "shape", + "int64_values"}, + {"Out"}); + } else if (ctx.HasAttr("bool_values") && + !paddle::any_cast>( + 
ctx.Attr("bool_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"StartsTensorList", + "ends", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes", + "shape", + "bool_values"}, + {"Out"}); + } + } else { + if (ctx.HasInput("ValueTensor")) { + return KernelSignature("set_value_with_tensor", + {"Input", "ValueTensor"}, + {"StartsTensorList", + "ends", + "steps", + "axes", + "decrease_axes", + "none_axes"}, + {"Out"}); + } else if (ctx.HasAttr("fp32_values") && + !paddle::any_cast>( + ctx.Attr("fp32_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"StartsTensorList", + "ends", + "steps", + "axes", + "decrease_axes", + "none_axes", + "shape", + "fp32_values"}, + {"Out"}); + } else if (ctx.HasAttr("fp64_values") && + !paddle::any_cast>( + ctx.Attr("fp64_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"StartsTensorList", + "ends", + "steps", + "axes", + "decrease_axes", + "none_axes", + "shape", + "fp64_values"}, + {"Out"}); + } else if (ctx.HasAttr("int32_values") && + !paddle::any_cast>( + ctx.Attr("int32_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"StartsTensorList", + "ends", + "steps", + "axes", + "decrease_axes", + "none_axes", + "shape", + "int32_values"}, + {"Out"}); + } else if (ctx.HasAttr("int64_values") && + !paddle::any_cast>( + ctx.Attr("int64_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"StartsTensorList", + "ends", + "steps", + "axes", + "decrease_axes", + "none_axes", + "shape", + "int64_values"}, + {"Out"}); + } else if (ctx.HasAttr("bool_values") && + !paddle::any_cast>( + ctx.Attr("bool_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"StartsTensorList", + "ends", + "steps", + "axes", + "decrease_axes", + "none_axes", + "shape", + "bool_values"}, + {"Out"}); + } + } + } + } else { + if (ctx.HasInput("EndsTensorList")) { + if (ctx.HasInput("StepsTensorList")) { + if (ctx.HasInput("ValueTensor")) { + return KernelSignature("set_value_with_tensor", + {"Input", "ValueTensor"}, + {"starts", + "EndsTensorList", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes"}, + {"Out"}); + } else if (ctx.HasAttr("fp32_values") && + !paddle::any_cast>( + ctx.Attr("fp32_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"starts", + "EndsTensorList", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes", + "shape", + "fp32_values"}, + {"Out"}); + } else if (ctx.HasAttr("fp64_values") && + !paddle::any_cast>( + ctx.Attr("fp64_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"starts", + "EndsTensorList", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes", + "shape", + "fp64_values"}, + {"Out"}); + } else if (ctx.HasAttr("int32_values") && + !paddle::any_cast>( + ctx.Attr("int32_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"starts", + "EndsTensorList", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes", + "shape", + "int32_values"}, + {"Out"}); + } else if (ctx.HasAttr("int64_values") && + !paddle::any_cast>( + ctx.Attr("int64_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"starts", + "EndsTensorList", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes", + "shape", + "int64_values"}, + {"Out"}); + } else if (ctx.HasAttr("bool_values") && + !paddle::any_cast>( + ctx.Attr("bool_values")) + .empty()) { + return 
KernelSignature("set_value", + {"Input"}, + {"starts", + "EndsTensorList", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes", + "shape", + "bool_values"}, + {"Out"}); + } + } else { + if (ctx.HasInput("ValueTensor")) { + return KernelSignature("set_value_with_tensor", + {"Input", "ValueTensor"}, + {"starts", + "EndsTensorList", + "steps", + "axes", + "decrease_axes", + "none_axes"}, + {"Out"}); + } else if (ctx.HasAttr("fp32_values") && + !paddle::any_cast>( + ctx.Attr("fp32_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"starts", + "EndsTensorList", + "steps", + "axes", + "decrease_axes", + "none_axes", + "shape", + "fp32_values"}, + {"Out"}); + } else if (ctx.HasAttr("fp64_values") && + !paddle::any_cast>( + ctx.Attr("fp64_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"starts", + "EndsTensorList", + "steps", + "axes", + "decrease_axes", + "none_axes", + "shape", + "fp64_values"}, + {"Out"}); + } else if (ctx.HasAttr("int32_values") && + !paddle::any_cast>( + ctx.Attr("int32_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"starts", + "EndsTensorList", + "steps", + "axes", + "decrease_axes", + "none_axes", + "shape", + "int32_values"}, + {"Out"}); + } else if (ctx.HasAttr("int64_values") && + !paddle::any_cast>( + ctx.Attr("int64_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"starts", + "EndsTensorList", + "steps", + "axes", + "decrease_axes", + "none_axes", + "shape", + "int64_values"}, + {"Out"}); + } else if (ctx.HasAttr("bool_values") && + !paddle::any_cast>( + ctx.Attr("bool_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"starts", + "EndsTensorList", + "steps", + "axes", + "decrease_axes", + "none_axes", + "shape", + "bool_values"}, + {"Out"}); + } + } + } else { + if (ctx.HasInput("StepsTensorList")) { + if (ctx.HasInput("ValueTensor")) { + return KernelSignature("set_value_with_tensor", + {"Input", "ValueTensor"}, + {"starts", + "ends", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes"}, + {"Out"}); + } else if (ctx.HasAttr("fp32_values") && + !paddle::any_cast>( + ctx.Attr("fp32_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"starts", + "ends", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes", + "shape", + "fp32_values"}, + {"Out"}); + } else if (ctx.HasAttr("fp64_values") && + !paddle::any_cast>( + ctx.Attr("fp64_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"starts", + "ends", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes", + "shape", + "fp64_values"}, + {"Out"}); + } else if (ctx.HasAttr("int32_values") && + !paddle::any_cast>( + ctx.Attr("int32_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"starts", + "ends", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes", + "shape", + "int32_values"}, + {"Out"}); + } else if (ctx.HasAttr("int64_values") && + !paddle::any_cast>( + ctx.Attr("int64_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"starts", + "ends", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes", + "shape", + "int64_values"}, + {"Out"}); + } else if (ctx.HasAttr("bool_values") && + !paddle::any_cast>( + ctx.Attr("bool_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"starts", + "ends", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes", + "shape", + "bool_values"}, + 
{"Out"}); + } + } else { + if (ctx.HasInput("ValueTensor")) { + return KernelSignature("set_value_with_tensor", + {"Input", "ValueTensor"}, + {"starts", + "ends", + "steps", + "axes", + "decrease_axes", + "none_axes"}, + {"Out"}); + } else if (ctx.HasAttr("fp32_values") && + !paddle::any_cast>( + ctx.Attr("fp32_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"starts", + "ends", + "steps", + "axes", + "decrease_axes", + "none_axes", + "shape", + "fp32_values"}, + {"Out"}); + } else if (ctx.HasAttr("fp64_values") && + !paddle::any_cast>( + ctx.Attr("fp64_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"starts", + "ends", + "steps", + "axes", + "decrease_axes", + "none_axes", + "shape", + "fp64_values"}, + {"Out"}); + } else if (ctx.HasAttr("int32_values") && + !paddle::any_cast>( + ctx.Attr("int32_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"starts", + "ends", + "steps", + "axes", + "decrease_axes", + "none_axes", + "shape", + "int32_values"}, + {"Out"}); + } else if (ctx.HasAttr("int64_values") && + !paddle::any_cast>( + ctx.Attr("int64_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"starts", + "ends", + "steps", + "axes", + "decrease_axes", + "none_axes", + "shape", + "int64_values"}, + {"Out"}); + } else if (ctx.HasAttr("bool_values") && + !paddle::any_cast>( + ctx.Attr("bool_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"starts", + "ends", + "steps", + "axes", + "decrease_axes", + "none_axes", + "shape", + "bool_values"}, + {"Out"}); + } + } + } + } + } + return KernelSignature("unregistered", {}, {}, {}); +} +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(set_value, phi::SetValueOpArgumentMapping); diff --git a/paddle/phi/tests/ops/test_op_signature.cc b/paddle/phi/tests/ops/test_op_signature.cc index a6c9a27de7d..88c9193a8f8 100644 --- a/paddle/phi/tests/ops/test_op_signature.cc +++ b/paddle/phi/tests/ops/test_op_signature.cc @@ -114,5 +114,375 @@ TEST(ARG_MAP, fill_constant) { ASSERT_EQ(signature9.name, "full_sr"); } +TEST(ARG_MAP, set_value) { + TestArgumentMappingContext arg_case( + {"Input", "StartsTensorList", "EndsTensorList", "StepsTensorList"}, + {}, + {{"fp32_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case).name, + "set_value"); + + TestArgumentMappingContext arg_case1( + {"Input", "StartsTensorList", "EndsTensorList", "StepsTensorList"}, + {}, + {{"fp64_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case1).name, + "set_value"); + + TestArgumentMappingContext arg_case2( + {"Input", "StartsTensorList", "EndsTensorList", "StepsTensorList"}, + {}, + {{"int32_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case2).name, + "set_value"); + + TestArgumentMappingContext arg_case3( + {"Input", "StartsTensorList", "EndsTensorList", "StepsTensorList"}, + {}, + {{"int64_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case3).name, + "set_value"); + + TestArgumentMappingContext arg_case4( + {"Input", "StartsTensorList", "EndsTensorList", "StepsTensorList"}, + {}, + {{"bool_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + 
OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case4).name, + "set_value"); + + TestArgumentMappingContext arg_case5( + {"Input", "StartsTensorList", "EndsTensorList", "ValueTensor"}, + {}, + {}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case5).name, + "set_value_with_tensor"); + + TestArgumentMappingContext arg_case6( + {"Input", "StartsTensorList", "EndsTensorList"}, + {}, + {{"fp64_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case6).name, + "set_value"); + + TestArgumentMappingContext arg_case7( + {"Input", "StartsTensorList", "EndsTensorList"}, + {}, + {{"int32_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case7).name, + "set_value"); + + TestArgumentMappingContext arg_case8( + {"Input", "StartsTensorList", "EndsTensorList"}, + {}, + {{"int64_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case8).name, + "set_value"); + + TestArgumentMappingContext arg_case9( + {"Input", "StartsTensorList", "EndsTensorList"}, + {}, + {{"bool_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case9).name, + "set_value"); + + TestArgumentMappingContext arg_case10( + {"Input", "StartsTensorList", "StepsTensorList", "ValueTensor"}, + {}, + {}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case10).name, + "set_value_with_tensor"); + + TestArgumentMappingContext arg_case11( + {"Input", "StartsTensorList", "StepsTensorList"}, + {}, + {{"fp64_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case11).name, + "set_value"); + + TestArgumentMappingContext arg_case12( + {"Input", "StartsTensorList", "StepsTensorList"}, + {}, + {{"int32_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case12).name, + "set_value"); + + TestArgumentMappingContext arg_case13( + {"Input", "StartsTensorList", "StepsTensorList"}, + {}, + {{"int64_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case13).name, + "set_value"); + + TestArgumentMappingContext arg_case14( + {"Input", "StartsTensorList", "StepsTensorList"}, + {}, + {{"bool_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case14).name, + "set_value"); + + TestArgumentMappingContext arg_case15( + {"Input", "StartsTensorList", "ValueTensor"}, {}, {}, {"Out"}, {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case15).name, + "set_value_with_tensor"); + + TestArgumentMappingContext arg_case16( + {"Input", "StartsTensorList", "StepsTensorList"}, + {}, + {{"fp32_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case16).name, + "set_value"); + + TestArgumentMappingContext arg_case17( + {"Input", "StartsTensorList", "StepsTensorList"}, + {}, + {{"fp64_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + 
OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case17).name, + "set_value"); + + TestArgumentMappingContext arg_case18( + {"Input", "StartsTensorList", "StepsTensorList"}, + {}, + {{"int32_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case18).name, + "set_value"); + + TestArgumentMappingContext arg_case19( + {"Input", "StartsTensorList", "StepsTensorList"}, + {}, + {{"int64_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case19).name, + "set_value"); + + TestArgumentMappingContext arg_case20( + {"Input", "StartsTensorList", "StepsTensorList"}, + {}, + {{"bool_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case20).name, + "set_value"); + + TestArgumentMappingContext arg_case21( + {"Input", "EndsTensorList", "StepsTensorList", "ValueTensor"}, + {}, + {}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case21).name, + "set_value_with_tensor"); + + TestArgumentMappingContext arg_case22( + {"Input", "EndsTensorList", "StepsTensorList"}, + {}, + {{"fp64_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case22).name, + "set_value"); + + TestArgumentMappingContext arg_case23( + {"Input", "EndsTensorList", "StepsTensorList"}, + {}, + {{"int32_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case23).name, + "set_value"); + + TestArgumentMappingContext arg_case24( + {"Input", "EndsTensorList", "StepsTensorList"}, + {}, + {{"int64_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case24).name, + "set_value"); + + TestArgumentMappingContext arg_case25( + {"Input", "EndsTensorList", "StepsTensorList"}, + {}, + {{"bool_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case25).name, + "set_value"); + + TestArgumentMappingContext arg_case26( + {"Input", "EndsTensorList", "ValueTensor"}, {}, {}, {"Out"}, {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case26).name, + "set_value_with_tensor"); + + TestArgumentMappingContext arg_case27( + {"Input", "EndsTensorList"}, + {}, + {{"fp32_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case27).name, + "set_value"); + + TestArgumentMappingContext arg_case28( + {"Input", "EndsTensorList"}, + {}, + {{"fp64_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case28).name, + "set_value"); + + TestArgumentMappingContext arg_case29( + {"Input", "EndsTensorList"}, + {}, + {{"int32_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case29).name, + "set_value"); + + TestArgumentMappingContext arg_case30( + {"Input", "EndsTensorList"}, + {}, + {{"int64_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case30).name, + "set_value"); + + TestArgumentMappingContext 
arg_case31( + {"Input", "EndsTensorList"}, + {}, + {{"bool_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case31).name, + "set_value"); + + TestArgumentMappingContext arg_case32( + {"Input", "StepsTensorList", "ValueTensor"}, {}, {}, {"Out"}, {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case32).name, + "set_value_with_tensor"); + + TestArgumentMappingContext arg_case33( + {"Input", "StepsTensorList"}, + {}, + {{"fp32_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case33).name, + "set_value"); + + TestArgumentMappingContext arg_case34( + {"Input", "StepsTensorList"}, + {}, + {{"fp64_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case34).name, + "set_value"); + + TestArgumentMappingContext arg_case35( + {"Input", "StepsTensorList"}, + {}, + {{"int32_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case35).name, + "set_value"); + + TestArgumentMappingContext arg_case36( + {"Input", "StepsTensorList"}, + {}, + {{"int64_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case36).name, + "set_value"); + + TestArgumentMappingContext arg_case37( + {"Input", "StepsTensorList"}, + {}, + {{"bool_values", paddle::any{std::vector{1}}}}, + {"Out"}, + {}); + ASSERT_EQ( + OpUtilsMap::Instance().GetArgumentMappingFn("set_value")(arg_case37).name, + "set_value"); +} + } // namespace tests } // namespace phi -- GitLab From 0604df9e70dfe7be8a21df6a80d9fa6d4939bd9d Mon Sep 17 00:00:00 2001 From: 0x45f <23097963+0x45f@users.noreply.github.com> Date: Wed, 9 Mar 2022 20:40:34 +0800 Subject: [PATCH 151/261] [Dy2st]Fix Exception in utils.py function "is_paddle_module" (#40243) --- python/paddle/fluid/dygraph/dygraph_to_static/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py index 04474dcdfe5..d440e387da5 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py @@ -191,7 +191,7 @@ def is_api_in_module(node, module_prefix): return eval("_is_api_in_module_helper({}, '{}')".format(func_str, module_prefix)) - except NameError: + except Exception: return False @@ -227,7 +227,7 @@ def is_numpy_api(node): # TODO: find a better way if not module_result: return func_str.startswith("numpy.") or func_str.startswith("np.") - except NameError: + except Exception: return False -- GitLab From 452c75b8034e485a2626e22cac39c95c07b883b4 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Wed, 9 Mar 2022 21:37:32 +0800 Subject: [PATCH 152/261] move elementwise mul grad (#40252) --- .../new_executor/standalone_executor_test.cc | 2 +- .../elementwise/elementwise_functor.h | 41 --- .../elementwise/elementwise_mul_op.cc | 49 ---- .../elementwise/elementwise_mul_op.cu | 68 ----- .../elementwise/elementwise_mul_op.h | 238 --------------- .../kernels/cpu/elementwise_grad_kernel.cc | 61 +++- paddle/phi/kernels/elementwise_grad_kernel.h | 39 +++ .../phi/kernels/funcs/elementwise_functor.h | 44 +++ paddle/phi/kernels/gpu/elementwise_grad.h | 37 +++ 
.../kernels/gpu/elementwise_grad_kernel.cu | 54 ++++ .../impl/elementwise_grad_kernel_impl.h | 273 ++++++++++++++++++ paddle/phi/ops/compat/elementwise_sig.cc | 34 +++ 12 files changed, 539 insertions(+), 401 deletions(-) diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 62d87b6917e..a69cc0d6b86 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -46,7 +46,7 @@ USE_OP(matmul_grad); USE_OP(square); USE_OP(transpose2_grad); USE_OP(concat_grad); -USE_OP(elementwise_mul_grad); +USE_OP_ITSELF(elementwise_mul_grad); USE_OP(sigmoid_grad); USE_OP(tanh_grad); USE_OP(sum); diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h index 8e0bf78e9b7..14baeaa74d2 100644 --- a/paddle/fluid/operators/elementwise/elementwise_functor.h +++ b/paddle/fluid/operators/elementwise/elementwise_functor.h @@ -196,47 +196,6 @@ struct MinGradXYFunctor { } }; -template -struct MulGradFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { return a * b; } -}; -template -struct MulGradFunctor> { - inline HOSTDEVICE Complex operator()(const Complex a, - const Complex b) const { - Complex b_conj(b.real, -b.imag); - return a * b_conj; - } -}; - -template -struct MulGradXYFunctor { - inline HOSTDEVICE phi::Array operator()(const InT a, const InT b, - const InT c) { - phi::Array outs; - // dx = dout * y - outs[0] = a * b; - // dy = dout * x - outs[1] = a * c; - return outs; - } -}; - -template -struct MulGradXYFunctor, Complex> { - inline HOSTDEVICE phi::Array, 2> operator()( - const Complex a, const Complex b, const Complex c) { - phi::Array, 2> outs; - // dx = dout * y - Complex b_conj(b.real, -b.imag); - outs[0] = a * b_conj; - // dy = dout * x - Complex c_conj(c.real, -c.imag); - outs[1] = a * c_conj; - return outs; - } -}; - // Ternary compare template struct MaxGradXFunctor { diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc index e172279145e..830e09eeae4 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc @@ -173,55 +173,6 @@ REGISTER_OP_CPU_KERNEL( paddle::platform::complex>, ops::ElementwiseMulKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel>, - ops::ElementwiseMulGradKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_mul_grad_grad, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel>, - ops::ElementwiseMulDoubleGradKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_mul_triple_grad, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel>, - ops::ElementwiseMulTripleGradKernel>); REGISTER_OP_VERSION(elementwise_mul) 
.AddCheckpoint( diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index 45c87a27a18..f7b9fd1e265 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -63,33 +63,6 @@ class ElementwiseMulKernel } }; -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - const auto& dev_ctx = - ctx.template device_context(); - const auto place = ctx.GetPlace(); - - if (dx != nullptr && dy != nullptr) { - std::vector ins = {dout, y, x}; - GetGradXAndYOut( - dev_ctx, place, axis, ins, dout, dx, dy, MulGradXYFunctor()); - } else if (dx != nullptr && dy == nullptr) { - std::vector ins = {dout, y}; - GetGradXOrYOut(dev_ctx, place, axis, ins, dout, - dx, MulGradFunctor()); - } else if (dx == nullptr && dy != nullptr) { - std::vector ins = {dout, x}; - GetGradXOrYOut(dev_ctx, place, axis, ins, dout, - dy, MulGradFunctor()); - } -} - } // namespace operators } // namespace paddle @@ -103,44 +76,3 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulKernel, ops::ElementwiseMulKernel>, ops::ElementwiseMulKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel>, - ops::ElementwiseMulGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_mul_grad_grad, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel>, - ops::ElementwiseMulDoubleGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_mul_triple_grad, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel>, - ops::ElementwiseMulTripleGradKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index c81266d5844..58a3123c7e3 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -137,244 +137,6 @@ class ElementwiseMulKernel : public framework::OpKernel { } } }; -template -struct MulGradDX { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; } -}; - -template -struct MulGradDX> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex y_conj(y.real, -y.imag); - return dout * y_conj; - } -}; - -template -struct MulGradDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; } -}; - -template -struct MulGradDY> { - HOSTDEVICE 
paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex x_conj(x.real, -x.imag); - return dout * x_conj; - } -}; -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - ElemwiseGradCompute, MulGradDY>( - ctx, *x, *y, *out, *dout, axis, dx, dy, MulGradDX(), MulGradDY()); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy); -#endif - -template -class ElementwiseMulGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* out = dout; // out is not necessary - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - ElementwiseMulGrad(ctx, x, y, out, dout, dx, dy); - } -}; - -template -class ElementwiseMulDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input("DOut"); - auto* ddx = ctx.Input("DDX"); - auto* ddy = ctx.Input("DDY"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - auto* ddout = ctx.Output("DDOut"); - - if (ddout) ddout->mutable_data(ctx.GetPlace()); - - Tensor ddx_safe, ddy_safe; - GetDoubleGradSafeTensor(ctx, x, ddx, &ddx_safe); - GetDoubleGradSafeTensor(ctx, y, ddy, &ddy_safe); - - // dx = dout * ddy - // dy = dout * ddx - // ddout = ddx * y + x * ddy - // change computation sequence to save memory, so ddout can inplace ddx and - // dx can be used as 'tmp' tensor - // (1) dx = x * ddy - // (2) dy = dout * ddx - // (3) ddout = ddx * y - // (4) ddout = ddout + dx - // (5) dx = dout * ddy - if (ddout) { - int axis = ctx.Attr("axis"); - auto& place = - *ctx.template device_context().eigen_device(); - // size(ddout) > size(ddx), ddout can't use memory of ddx using inplace - if (ddout->numel() > ddx->numel()) { - ElemwiseGradCompute, MulGradDY>( - ctx, ddx_safe, ddy_safe, *dout, *dout, axis, dx, dy, MulGradDX(), - MulGradDY()); - - Tensor ddout_tmp; - ddout_tmp.mutable_data(ddout->dims(), ctx.GetPlace()); - - default_elementwise_mul(ctx, y, &ddx_safe, ddout); - default_elementwise_mul(ctx, &ddy_safe, x, - &ddout_tmp); - - auto ddout_t = framework::EigenVector::Flatten(*ddout); - auto ddout_tmp_t = framework::EigenVector::Flatten(ddout_tmp); - ddout_t.device(place) = ddout_t + ddout_tmp_t; - } else { - // use dx to save memory, other than alloc tmp tensor - Tensor* ddout_tmp = dx; - - default_elementwise_mul(ctx, x, &ddy_safe, ddout_tmp); - // NOTE: in the following ElemwiseGradCompute, for the - // first output 
tensor is nullptr, the branch to calculate first - // output tensor will not be activated, DivGradDx function will not - // be called and can be ignored, the first branch has little effect - // on running speed. - ElemwiseGradCompute, MulGradDY>( - ctx, ddx_safe, ddy_safe, *dout, *dout, axis, nullptr, dy, - MulGradDX(), MulGradDY()); - default_elementwise_mul(ctx, &ddx_safe, y, ddout); - - auto ddout_t = framework::EigenVector::Flatten(*ddout); - auto ddout_tmp_t = framework::EigenVector::Flatten(*ddout_tmp); - ddout_t.device(place) = ddout_t + ddout_tmp_t; - default_elementwise_mul(ctx, dout, &ddy_safe, dx); - } - } - } -}; - -template -class ElementwiseMulTripleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - // get input - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input("DOut"); - auto* ddx = ctx.Input("DDX"); - auto* ddy = ctx.Input("DDY"); - - auto* d_dx = ctx.Input("D_DX"); - auto* d_dy = ctx.Input("D_DY"); - auto* d_ddout = ctx.Input("D_DDOut"); - - // get output - auto* out_d_x = ctx.Output("D_X"); - auto* out_d_y = ctx.Output("D_Y"); - auto* out_d_dout = ctx.Output("D_DOut"); - - auto* out_d_ddx = ctx.Output("D_DDX"); - auto* out_d_ddy = ctx.Output("D_DDY"); - - if (out_d_x) out_d_x->mutable_data(x->dims(), ctx.GetPlace()); - if (out_d_y) out_d_y->mutable_data(y->dims(), ctx.GetPlace()); - if (out_d_dout) out_d_dout->mutable_data(dout->dims(), ctx.GetPlace()); - if (out_d_ddx) out_d_ddx->mutable_data(x->dims(), ctx.GetPlace()); - if (out_d_ddy) out_d_ddy->mutable_data(y->dims(), ctx.GetPlace()); - - auto& place = *ctx.template device_context().eigen_device(); - - Tensor ddx_safe, ddy_safe; - GetDoubleGradSafeTensor(ctx, x, ddx, &ddx_safe); - GetDoubleGradSafeTensor(ctx, y, ddy, &ddy_safe); - - if (d_ddout) { - if (out_d_x) { - // out_d_x = ddy * d_ddout - default_elementwise_mul(ctx, &ddy_safe, d_ddout, - out_d_x); - } - if (out_d_y) { - // out_d_y = ddx * d_ddout - default_elementwise_mul(ctx, &ddx_safe, d_ddout, - out_d_y); - } - } - - if (out_d_dout) { - // get out_d_dout - // out_d_dout = ddy * d_dx + d_dy * ddx - Tensor out_d_dout_tmp; - out_d_dout_tmp.mutable_data(dout->dims(), ctx.GetPlace()); - default_elementwise_mul(ctx, d_dy, &ddx_safe, - out_d_dout); - default_elementwise_mul(ctx, &ddy_safe, d_dx, - &out_d_dout_tmp); - auto out_d_dout_t = framework::EigenVector::Flatten(*out_d_dout); - auto out_d_dout_tmp_t = - framework::EigenVector::Flatten(out_d_dout_tmp); - out_d_dout_t.device(place) = out_d_dout_t + out_d_dout_tmp_t; - } - - if (out_d_ddx) { - // get out_d_ddx - // out_d_ddx = dout * d_dy + y * d_ddout - Tensor out_d_ddx_tmp; - out_d_ddx_tmp.mutable_data(ddx->dims(), ctx.GetPlace()); - default_elementwise_mul(ctx, dout, d_dy, out_d_ddx); - default_elementwise_mul(ctx, y, d_ddout, - &out_d_ddx_tmp); - auto out_d_ddx_t = framework::EigenVector::Flatten(*out_d_ddx); - auto out_d_ddx_tmp_t = framework::EigenVector::Flatten(out_d_ddx_tmp); - out_d_ddx_t.device(place) = out_d_ddx_t + out_d_ddx_tmp_t; - } - - if (out_d_ddy) { - // get out_d_ddy - // out_d_ddy = dout * d_dx + x * d_ddout - Tensor out_d_ddy_tmp; - out_d_ddy_tmp.mutable_data(ddy->dims(), ctx.GetPlace()); - default_elementwise_mul(ctx, dout, d_dx, out_d_ddy); - default_elementwise_mul(ctx, x, d_ddout, - &out_d_ddy_tmp); - auto out_d_ddy_t = framework::EigenVector::Flatten(*out_d_ddy); - auto out_d_ddy_tmp_t = framework::EigenVector::Flatten(out_d_ddy_tmp); - 
out_d_ddy_t.device(place) = out_d_ddy_t + out_d_ddy_tmp_t; - } - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc index c9177f1c46e..cd513e809fd 100644 --- a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc @@ -121,6 +121,20 @@ void DivideGradKernel(const Context& dev_ctx, dev_ctx, x, y, out, dout, axis, dx, dy, DivGradDX(), DivGradDY()); } +template +void MultiplyGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + funcs::ElementwiseGradPreProcess(dout, dx); + auto* out = &dout; // out is not necessary + phi::funcs::ElemwiseGradCompute, MulGradDY>( + dev_ctx, x, y, *out, dout, axis, dx, dy, MulGradDX(), MulGradDY()); +} + } // namespace phi PD_REGISTER_KERNEL(add_grad, @@ -193,8 +207,8 @@ PD_REGISTER_KERNEL(divide_grad, double, int, int64_t, - paddle::platform::complex, - paddle::platform::complex) {} + phi::dtype::complex, + phi::dtype::complex) {} PD_REGISTER_KERNEL(divide_double_grad, CPU, @@ -204,5 +218,44 @@ PD_REGISTER_KERNEL(divide_double_grad, double, int, int64_t, - paddle::platform::complex, - paddle::platform::complex) {} + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(multiply_grad, + CPU, + ALL_LAYOUT, + phi::MultiplyGradKernel, + float, + double, + int, + int64_t, + bool, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(multiply_double_grad, + CPU, + ALL_LAYOUT, + phi::MultiplyDoubleGradKernel, + float, + double, + int, + int64_t, + bool, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(multiply_triple_grad, + CPU, + ALL_LAYOUT, + phi::MultiplyTripleGradKernel, + float, + double, + int, + int64_t, + bool, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/elementwise_grad_kernel.h b/paddle/phi/kernels/elementwise_grad_kernel.h index bcd5a98f07e..58ae11a9c42 100644 --- a/paddle/phi/kernels/elementwise_grad_kernel.h +++ b/paddle/phi/kernels/elementwise_grad_kernel.h @@ -85,4 +85,43 @@ void DivideDoubleGradKernel(const Context& dev_ctx, DenseTensor* dy, DenseTensor* dout, DenseTensor* ddout); + +template +void MultiplyGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy); + +template +void MultiplyDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + paddle::optional ddx, + paddle::optional ddy, + int axis, + DenseTensor* dx, + DenseTensor* dy, + DenseTensor* ddout); + +template +void MultiplyTripleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + paddle::optional ddx, + paddle::optional ddy, + const DenseTensor& d_dx, + const DenseTensor& d_dy, + paddle::optional d_ddout, + int axis, + DenseTensor* d_x, + DenseTensor* d_y, + DenseTensor* d_dout, + DenseTensor* d_ddx, + DenseTensor* d_ddy); + } // namespace phi diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h index 5615a450b5c..b01d50015f0 100644 --- a/paddle/phi/kernels/funcs/elementwise_functor.h +++ b/paddle/phi/kernels/funcs/elementwise_functor.h @@ -160,5 +160,49 @@ struct DivGradYFunctor> 
{ } }; +template +struct MultiplyGradFunctor { + inline HOSTDEVICE T operator()(const T a, const T b) const { return a * b; } +}; +template +struct MultiplyGradFunctor> { + inline HOSTDEVICE ComplexType operator()(const ComplexType a, + const ComplexType b) const { + ComplexType b_conj(b.real, -b.imag); + return a * b_conj; + } +}; + +template +struct MultiplyGradXYFunctor { + inline HOSTDEVICE phi::Array operator()(const InT a, + const InT b, + const InT c) { + phi::Array outs; + // dx = dout * y + outs[0] = a * b; + // dy = dout * x + outs[1] = a * c; + return outs; + } +}; + +template +struct MultiplyGradXYFunctor, ComplexType> { + inline HOSTDEVICE phi::Array, 2> operator()( + const ComplexType a, + const ComplexType b, + const ComplexType c) { + phi::Array, 2> outs; + // dx = dout * y + ComplexType b_conj(b.real, -b.imag); + outs[0] = a * b_conj; + // dy = dout * x + ComplexType c_conj(c.real, -c.imag); + outs[1] = a * c_conj; + return outs; + } +}; + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h index 98df65c92f3..e5432b5f918 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad.h +++ b/paddle/phi/kernels/gpu/elementwise_grad.h @@ -360,4 +360,41 @@ void ElementwiseDivGrad(const GPUContext &dev_ctx, } } +/* +****************************** + Mul Grad +****************************** +*/ + +template +void ElementwiseMulGrad(const GPUContext &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy, + int axis) { + const auto place = dev_ctx.GetPlace(); + + if (dx != nullptr && dy != nullptr) { + std::vector ins = {&dout, &y, &x}; + GetGradXAndYOut( + dev_ctx, + place, + axis, + ins, + dout, + dx, + dy, + funcs::MultiplyGradXYFunctor()); + } else if (dx != nullptr && dy == nullptr) { + std::vector ins = {&dout, &y}; + GetGradXOrYOut( + dev_ctx, place, axis, ins, dout, dx, funcs::MultiplyGradFunctor()); + } else if (dx == nullptr && dy != nullptr) { + std::vector ins = {&dout, &x}; + GetGradXOrYOut( + dev_ctx, place, axis, ins, dout, dy, funcs::MultiplyGradFunctor()); + } +} } // namespace phi diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index 45c8b9a2163..81f7fac1088 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ -136,6 +136,18 @@ void DivideGradKernel(const Context& dev_ctx, } } +template +void MultiplyGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + funcs::ElementwiseGradPreProcess(dout, dx); + ElementwiseMulGrad(dev_ctx, x, y, dout, dx, dy, axis); +} + } // namespace phi PD_REGISTER_KERNEL(add_grad, @@ -228,3 +240,45 @@ PD_REGISTER_KERNEL(divide_double_grad, int64_t, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(multiply_grad, + GPU, + ALL_LAYOUT, + phi::MultiplyGradKernel, + float, + phi::dtype::float16, + double, + int, + int64_t, + bool, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(multiply_double_grad, + GPU, + ALL_LAYOUT, + phi::MultiplyDoubleGradKernel, + float, + phi::dtype::float16, + double, + int, + int64_t, + bool, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(multiply_triple_grad, + GPU, + ALL_LAYOUT, + phi::MultiplyTripleGradKernel, + float, + 
phi::dtype::float16, + double, + int, + int64_t, + bool, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index e8831f90213..65427e87506 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -259,4 +259,277 @@ void DivideDoubleGradKernel(const Context& dev_ctx, } } +template +struct MulGradDX { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; } +}; + +template +struct MulGradDX> { + HOSTDEVICE phi::dtype::complex operator()( + phi::dtype::complex x, + phi::dtype::complex y, + phi::dtype::complex out, + phi::dtype::complex dout) const { + phi::dtype::complex y_conj(y.real, -y.imag); + return dout * y_conj; + } +}; + +/* +****************************** + Multiply Grad +****************************** +*/ + +template +struct MulGradDY { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; } +}; + +template +struct MulGradDY> { + HOSTDEVICE phi::dtype::complex operator()( + phi::dtype::complex x, + phi::dtype::complex y, + phi::dtype::complex out, + phi::dtype::complex dout) const { + phi::dtype::complex x_conj(x.real, -x.imag); + return dout * x_conj; + } +}; + +template +void MultiplyDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + paddle::optional ddx, + paddle::optional ddy, + int axis, + DenseTensor* dx, + DenseTensor* dy, + DenseTensor* ddout) { + if (ddout) dev_ctx.template Alloc(ddout); + + DenseTensor ddx_safe, ddy_safe; + funcs::GetDoubleGradSafeTensor( + dev_ctx, x, ddx.get_ptr(), &ddx_safe); + funcs::GetDoubleGradSafeTensor( + dev_ctx, y, ddy.get_ptr(), &ddy_safe); + + // dx = dout * ddy + // dy = dout * ddx + // ddout = ddx * y + x * ddy + // change computation sequence to save memory, so ddout can inplace ddx and + // dx can be used as 'tmp' tensor + // (1) dx = x * ddy + // (2) dy = dout * ddx + // (3) ddout = ddx * y + // (4) ddout = ddout + dx + // (5) dx = dout * ddy + if (ddout) { + auto& place = *dev_ctx.eigen_device(); + // size(ddout) > size(ddx), ddout can't use memory of ddx using inplace + if (ddout->numel() > ddx.get_ptr()->numel()) { + phi::funcs::ElemwiseGradCompute, MulGradDY>( + dev_ctx, + ddx_safe, + ddy_safe, + dout, + dout, + axis, + dx, + dy, + MulGradDX(), + MulGradDY()); + + DenseTensor ddout_tmp; + ddout_tmp.Resize(ddout->dims()); + dev_ctx.template Alloc(&ddout_tmp); + + funcs::DefaultElementwiseOperator, + funcs::InverseMultiplyFunctor>( + dev_ctx, y, ddx_safe, ddout, axis); + funcs::DefaultElementwiseOperator, + funcs::InverseMultiplyFunctor>( + dev_ctx, ddy_safe, x, &ddout_tmp, axis); + + auto ddout_t = phi::EigenVector::Flatten(*ddout); + auto ddout_tmp_t = phi::EigenVector::Flatten(ddout_tmp); + ddout_t.device(place) = ddout_t + ddout_tmp_t; + } else { + // use dx to save memory, other than alloc tmp tensor + DenseTensor* ddout_tmp = dx; + funcs::DefaultElementwiseOperator, + funcs::InverseMultiplyFunctor>( + dev_ctx, x, ddy_safe, ddout_tmp, axis); + // NOTE: in the following ElemwiseGradCompute, for the + // first output tensor is nullptr, the branch to calculate first + // output tensor will not be activated, DivGradDx function will not + // be called and can be ignored, the first branch has little effect + // on running speed. 
+ phi::funcs::ElemwiseGradCompute, MulGradDY>( + dev_ctx, + ddx_safe, + ddy_safe, + dout, + dout, + axis, + nullptr, + dy, + MulGradDX(), + MulGradDY()); + funcs::DefaultElementwiseOperator, + funcs::InverseMultiplyFunctor>( + dev_ctx, ddx_safe, y, ddout, axis); + + auto ddout_t = phi::EigenVector::Flatten(*ddout); + auto ddout_tmp_t = phi::EigenVector::Flatten(*ddout_tmp); + ddout_t.device(place) = ddout_t + ddout_tmp_t; + funcs::DefaultElementwiseOperator, + funcs::InverseMultiplyFunctor>( + dev_ctx, dout, ddy_safe, dx, axis); + } + } +} + +template +void MultiplyTripleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + paddle::optional ddx, + paddle::optional ddy, + const DenseTensor& d_dx, + const DenseTensor& d_dy, + paddle::optional d_ddout, + int axis, + DenseTensor* d_x, + DenseTensor* d_y, + DenseTensor* d_dout, + DenseTensor* d_ddx, + DenseTensor* d_ddy) { + if (d_x) { + d_x->Resize(x.dims()); + dev_ctx.template Alloc(d_x); + } + if (d_y) { + d_y->Resize(y.dims()); + dev_ctx.template Alloc(d_y); + } + if (d_dout) { + d_dout->Resize(dout.dims()); + dev_ctx.template Alloc(d_dout); + } + if (d_ddx) { + d_ddx->Resize(x.dims()); + dev_ctx.template Alloc(d_ddx); + } + if (d_ddy) { + d_ddy->Resize(y.dims()); + dev_ctx.template Alloc(d_ddy); + } + + auto& place = *dev_ctx.eigen_device(); + + DenseTensor ddx_safe, ddy_safe; + funcs::GetDoubleGradSafeTensor( + dev_ctx, x, ddx.get_ptr(), &ddx_safe); + funcs::GetDoubleGradSafeTensor( + dev_ctx, y, ddy.get_ptr(), &ddy_safe); + + if (d_ddout.get_ptr()) { + if (d_x) { + // d_x = ddy * d_ddout + funcs::DefaultElementwiseOperator, + funcs::InverseMultiplyFunctor>( + dev_ctx, ddy_safe, *(d_ddout.get_ptr()), d_x, axis); + } + if (d_y) { + // d_y = ddx * d_ddout + funcs::DefaultElementwiseOperator, + funcs::InverseMultiplyFunctor>( + dev_ctx, ddx_safe, *(d_ddout.get_ptr()), d_y, axis); + } + } + + if (d_dout) { + // get d_dout + // d_dout = ddy * d_dx + d_dy * ddx + DenseTensor d_dout_tmp; + d_dout_tmp.Resize(dout.dims()); + dev_ctx.template Alloc(&d_dout_tmp); + funcs::DefaultElementwiseOperator, + funcs::InverseMultiplyFunctor>( + dev_ctx, d_dy, ddx_safe, d_dout, axis); + funcs::DefaultElementwiseOperator, + funcs::InverseMultiplyFunctor>( + dev_ctx, ddy_safe, d_dx, &d_dout_tmp, axis); + auto d_dout_t = phi::EigenVector::Flatten(*d_dout); + auto d_dout_tmp_t = phi::EigenVector::Flatten(d_dout_tmp); + d_dout_t.device(place) = d_dout_t + d_dout_tmp_t; + } + + if (d_ddx) { + // get d_ddx + // d_ddx = dout * d_dy + y * d_ddout + DenseTensor d_ddx_tmp; + d_ddx_tmp.Resize(ddx->dims()); + dev_ctx.template Alloc(&d_ddx_tmp); + funcs::DefaultElementwiseOperator, + funcs::InverseMultiplyFunctor>( + dev_ctx, dout, d_dy, d_ddx, axis); + funcs::DefaultElementwiseOperator, + funcs::InverseMultiplyFunctor>( + dev_ctx, y, *(d_ddout.get_ptr()), &d_ddx_tmp, axis); + auto d_ddx_t = phi::EigenVector::Flatten(*d_ddx); + auto d_ddx_tmp_t = phi::EigenVector::Flatten(d_ddx_tmp); + d_ddx_t.device(place) = d_ddx_t + d_ddx_tmp_t; + } + + if (d_ddy) { + // get d_ddy + // d_ddy = dout * d_dx + x * d_ddout + DenseTensor d_ddy_tmp; + d_ddy_tmp.Resize(ddy->dims()); + dev_ctx.template Alloc(&d_ddy_tmp); + funcs::DefaultElementwiseOperator, + funcs::InverseMultiplyFunctor>( + dev_ctx, dout, d_dx, d_ddy, axis); + funcs::DefaultElementwiseOperator, + funcs::InverseMultiplyFunctor>( + dev_ctx, x, *(d_ddout.get_ptr()), &d_ddy_tmp, axis); + auto d_ddy_t = phi::EigenVector::Flatten(*d_ddy); + auto d_ddy_tmp_t = 
phi::EigenVector::Flatten(d_ddy_tmp); + d_ddy_t.device(place) = d_ddy_t + d_ddy_tmp_t; + } +} + } // namespace phi diff --git a/paddle/phi/ops/compat/elementwise_sig.cc b/paddle/phi/ops/compat/elementwise_sig.cc index d4a25866907..fc890fa3a49 100644 --- a/paddle/phi/ops/compat/elementwise_sig.cc +++ b/paddle/phi/ops/compat/elementwise_sig.cc @@ -122,6 +122,31 @@ KernelSignature ElementwiseDivDoubleGradOpArgumentMapping( {GradVarName("Y"), "DOut", "DDOut"}); } +KernelSignature ElementwiseMulGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("multiply_grad", + {"X", "Y", GradVarName("Out")}, + {"axis"}, + {GradVarName("X"), GradVarName("Y")}); +} + +KernelSignature ElementwiseMulDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("multiply_double_grad", + {"X", "Y", "DOut", "DDX", "DDY"}, + {"axis"}, + {GradVarName("X"), GradVarName("Y"), "DDOut"}); +} + +KernelSignature ElementwiseMulTripleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "multiply_triple_grad", + {"X", "Y", "DOut", "DDX", "DDY", "D_DX", "D_DY", "D_DDOut"}, + {"axis"}, + {"D_X", "D_Y", "D_DOut", "D_DDX", "D_DDY"}); +} + } // namespace phi PD_REGISTER_BASE_KERNEL_NAME(elementwise_add, add); @@ -135,6 +160,9 @@ PD_REGISTER_BASE_KERNEL_NAME(elementwise_sub_grad, subtract_grad); PD_REGISTER_BASE_KERNEL_NAME(elementwise_sub_grad_grad, subtract_double_grad); PD_REGISTER_BASE_KERNEL_NAME(elementwise_div_grad, divide_grad); PD_REGISTER_BASE_KERNEL_NAME(elementwise_div_grad_grad, divide_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_mul_grad, multiply_grad); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_mul_grad_grad, multiply_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_mul_triple_grad, multiply_triple_grad); PD_REGISTER_ARG_MAPPING_FN(elementwise_add, phi::ElementwiseAddOpArgumentMapping); @@ -158,3 +186,9 @@ PD_REGISTER_ARG_MAPPING_FN(elementwise_div_grad, phi::ElementwiseDivGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elementwise_div_grad_grad, phi::ElementwiseDivDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elementwise_mul_grad, + phi::ElementwiseMulGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elementwise_mul_grad_grad, + phi::ElementwiseMulDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elementwise_mul_triple_grad, + phi::ElementwiseMulTripleGradOpArgumentMapping); -- GitLab From b97e6d13fd552df98bda8156e7851d21399c6579 Mon Sep 17 00:00:00 2001 From: Linjie Chen <40840292+linjieccc@users.noreply.github.com> Date: Wed, 9 Mar 2022 22:38:14 +0800 Subject: [PATCH 153/261] [phi] move viterbi_decode to phi (#40186) * move viterbi to phi * move infershape to phi * update infershape * fix * resolve conflicts --- paddle/fluid/operators/viterbi_decode_op.cc | 53 +-- paddle/fluid/operators/viterbi_decode_op.cu | 206 -------- paddle/fluid/operators/viterbi_decode_op.h | 438 ------------------ paddle/phi/infermeta/ternary.cc | 47 ++ paddle/phi/infermeta/ternary.h | 8 + .../phi/kernels/cpu/viterbi_decode_kernel.cc | 319 +++++++++++++ .../kernels/funcs/viterbi_decode_functor.h | 140 ++++++ .../phi/kernels/gpu/viterbi_decode_kernel.cu | 402 ++++++++++++++++ paddle/phi/kernels/viterbi_decode_kernel.h | 30 ++ 9 files changed, 953 insertions(+), 690 deletions(-) delete mode 100644 paddle/fluid/operators/viterbi_decode_op.cu delete mode 100644 paddle/fluid/operators/viterbi_decode_op.h create mode 100644 paddle/phi/kernels/cpu/viterbi_decode_kernel.cc create mode 100644 
paddle/phi/kernels/funcs/viterbi_decode_functor.h create mode 100644 paddle/phi/kernels/gpu/viterbi_decode_kernel.cu create mode 100644 paddle/phi/kernels/viterbi_decode_kernel.h diff --git a/paddle/fluid/operators/viterbi_decode_op.cc b/paddle/fluid/operators/viterbi_decode_op.cc index bf1cdeed65a..602376d54e0 100644 --- a/paddle/fluid/operators/viterbi_decode_op.cc +++ b/paddle/fluid/operators/viterbi_decode_op.cc @@ -9,8 +9,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/viterbi_decode_op.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -19,47 +21,6 @@ class ViterbiDecodeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "ViterbiDecode"); - OP_INOUT_CHECK(ctx->HasInput("Transition"), "Input", "Transition", - "ViterbiDecode"); - OP_INOUT_CHECK(ctx->HasInput("Length"), "Input", "Length", "ViterbiDecode"); - OP_INOUT_CHECK(ctx->HasOutput("Scores"), "Output", "Scores", - "ViterbiDecode"); - OP_INOUT_CHECK(ctx->HasOutput("Path"), "Output", "Path", "ViterbiDecode"); - auto in_dims = ctx->GetInputDim("Input"); - PADDLE_ENFORCE_EQ(in_dims.size(), 3, - platform::errors::InvalidArgument( - "The rank of Input in ViterbiDecode must be 3. But " - "received Input's rank is %d.", - in_dims.size())); - auto length_dims = ctx->GetInputDim("Length"); - PADDLE_ENFORCE_EQ(length_dims.size(), 1, - platform::errors::InvalidArgument( - "The rank of Length in ViterbiDecode must be 1. But " - "received Length's rank is %d.", - length_dims.size())); - auto transition_dims = ctx->GetInputDim("Transition"); - PADDLE_ENFORCE_EQ( - transition_dims.size(), 2, - platform::errors::InvalidArgument( - "The rank of Transition in ViterbiDecode must be 2. 
But " - "received Transition's rank is %d.", - transition_dims.size())); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - in_dims[0], length_dims[0], - platform::errors::InvalidArgument( - "The batch size of Input and Length should be equal.")); - PADDLE_ENFORCE_EQ(in_dims[2], transition_dims[0], - platform::errors::InvalidArgument( - "The number of tags of Input (%d) and Transition " - "(%d) should be equal.", - transition_dims[0], in_dims[2])); - } - ctx->SetOutputDim("Scores", length_dims); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -102,8 +63,8 @@ class ViterbiDecodeOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; namespace platform = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(viterbi_decode, ViterbiDecodeInferShapeFunctor, + PD_INFER_META(phi::ViterbiDecodeInferMeta)); REGISTER_OP_WITHOUT_GRADIENT(viterbi_decode, ops::ViterbiDecodeOp, - ops::ViterbiDecodeOpMaker); -REGISTER_OP_CPU_KERNEL( - viterbi_decode, ops::ViterbiDecodeKernel, - ops::ViterbiDecodeKernel); + ops::ViterbiDecodeOpMaker, + ViterbiDecodeInferShapeFunctor); diff --git a/paddle/fluid/operators/viterbi_decode_op.cu b/paddle/fluid/operators/viterbi_decode_op.cu deleted file mode 100644 index 68628fb2748..00000000000 --- a/paddle/fluid/operators/viterbi_decode_op.cu +++ /dev/null @@ -1,206 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/elementwise/elementwise_functor.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/operators/viterbi_decode_op.h" -#include "paddle/phi/kernels/funcs/gather.cu.h" - -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif - -namespace paddle { -namespace operators { - -#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...) \ - case (1 << (log2_block_dim)): { \ - constexpr auto kBlockDim = (1 << (log2_block_dim)); \ - __VA_ARGS__; \ - } break - -#define FIXED_BLOCK_DIM_CASE(...) \ - FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__); - -int64_t ComputeBlockSize(int64_t col) { - if (col > 512) - return 1024; - else if (col > 256) - return 512; - else if (col > 128) - return 256; - else if (col > 64) - return 128; - else if (col > 32) - return 64; - else if (col > 16) - return 32; - else if (col > 8) - return 16; - else - return 8; -} - -template