diff --git a/.gitignore b/.gitignore
index debec551d9cd7344a31efbbb709bfbb759a15d3f..a2009a1ed30a1c6a17627b06170734fc17390d31 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,9 +7,11 @@ paddle/fluid/op_use_default_grad_maker_DEV.spec
 paddle/fluid/op_use_default_grad_maker_PR.spec
 paddle/phi/api/backward/backward_api.h
 paddle/phi/api/include/api.h
+paddle/phi/api/include/sparse_api.h
 paddle/phi/api/lib/api.cc
 paddle/phi/api/lib/dygraph_api.*
 paddle/phi/api/lib/backward_api.cc
+paddle/phi/api/lib/sparse_api.cc
 paddle/phi/extension.h
 paddle/phi/include/*
 paddle/phi/infermeta/generated.*
diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt
index a5b40f8aa07d77e803f2cad36155b7de1bd03719..96bc4a710f8c1c3c38b049368b204daad5dcd3f2 100644
--- a/paddle/fluid/distributed/collective/CMakeLists.txt
+++ b/paddle/fluid/distributed/collective/CMakeLists.txt
@@ -1,4 +1,7 @@
 cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api)
+if (WITH_DISTRIBUTE)
+  cc_library(processgroup_gloo SRCS ProcessGroupGloo.cc DEPS phi phi_api eager_api gloo_wrapper)
+endif()
 cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup)
 
 if(WITH_NCCL)
diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h
index e4f272052024245bf7df7fc841d5e3b18978faf7..e43d0e8c183c7005f31b66c4c29dfc95361485e4 100644
--- a/paddle/fluid/distributed/collective/ProcessGroup.h
+++ b/paddle/fluid/distributed/collective/ProcessGroup.h
@@ -117,6 +117,35 @@ class ProcessGroup {
         "ProcessGroup%s does not support receive", GetBackendName()));
   }
 
+  virtual std::shared_ptr<ProcessGroup::Task> AllGather(
+      std::vector<Tensor>& in_tensors /* tensors */,     // NOLINT
+      std::vector<Tensor>& out_tensors /* tensors */) {  // NOLINT
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support AllGather", GetBackendName()));
+  }
+
+  virtual std::shared_ptr<ProcessGroup::Task> AllToAll(
+      std::vector<Tensor>& in /* tensors */,     // NOLINT
+      std::vector<Tensor>& out /* tensors */) {  // NOLINT
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support AllToAll", GetBackendName()));
+  }
+
+  virtual std::shared_ptr<ProcessGroup::Task> Reduce(
+      std::vector<Tensor>& tensors /* tensors */,  // NOLINT
+      const ReduceOptions& opts) {                 // NOLINT
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support Reduce", GetBackendName()));
+  }
+
+  virtual std::shared_ptr<ProcessGroup::Task> Scatter(
+      std::vector<Tensor>& in_tensors /* tensors */,   // NOLINT
+      std::vector<Tensor>& out_tensors /* tensors */,  // NOLINT
+      const ScatterOptions&) {                         // NOLINT
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support Scatter", GetBackendName()));
+  }
+
  protected:
   const int rank_;
   const int size_;
diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
new file mode 100644
index 0000000000000000000000000000000000000000..03ad48f560a0a85f791ad7358fb4e975269d6fa1
--- /dev/null
+++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
@@ -0,0 +1,308 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#ifdef _WIN32 +#include +#include +#include +#else +#include +#include +#include +#endif + +#include +#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h" +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +#ifdef _WIN32 +#define GENERATE_FUNC(type, func, ...) \ + switch (type) { \ + case experimental::DataType::FLOAT32: \ + func(__VA_ARGS__); \ + break; \ + case experimental::DataType::FLOAT64: \ + func(__VA_ARGS__); \ + break; \ + case experimental::DataType::FLOAT16: \ + func(__VA_ARGS__); \ + break; \ + case experimental::DataType::INT32: \ + func(__VA_ARGS__); \ + break; \ + case experimental::DataType::INT64: \ + func(__VA_ARGS__); \ + break; \ + default: \ + VLOG(0) << "Error: Unknown DataType."; \ + exit(-1); \ + } + +#define HOST_NAME_MAX 256 + +#else +#define GENERATE_FUNC(type, func, args...) \ + switch (type) { \ + case experimental::DataType::FLOAT32: \ + func(args); \ + break; \ + case experimental::DataType::FLOAT64: \ + func(args); \ + break; \ + case experimental::DataType::FLOAT16: \ + func(args); \ + break; \ + case experimental::DataType::INT32: \ + func(args); \ + break; \ + case experimental::DataType::INT64: \ + func(args); \ + break; \ + default: \ + VLOG(0) << "Error: Unknown DataType."; \ + exit(-1); \ + } +#endif + +typedef void (*reduce_func)(void*, const void*, const void*, size_t); + +template +reduce_func get_function(const ReduceOp& r) { + switch (r) { + case ReduceOp::SUM: + return reduce_func(&::gloo::sum); + case ReduceOp::PRODUCT: + return reduce_func(&::gloo::product); + case ReduceOp::MIN: + return reduce_func(&::gloo::min); + case ReduceOp::MAX: + return reduce_func(&::gloo::max); + case ReduceOp::AVG: + VLOG(0) << "Error: Unsupported ReduceOp::AVG."; + exit(-1); + } + + VLOG(0) << "Error: Unknown ReduceOp."; + exit(-1); +} + +bool CheckTensorsInCPUPlace(const std::vector& tensors) { + return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) { + return t.place() == PlaceType::kCPU; + }); +} + +template +T* get_data(const Tensor& tensor) { + auto raw_tensor = std::dynamic_pointer_cast(tensor.impl()); + return static_cast(raw_tensor->data()); +} + +template +std::vector get_multi_data(const std::vector& tensors) { + std::vector ret(tensors.size()); + for (size_t i = 0; i < tensors.size(); i++) { + ret[i] = get_data(tensors[i]); + } + return ret; +} + +template +void set_output(P& opts, const Tensor& tensor) { // NOLINT + opts.setOutput(get_data(tensor), tensor.numel()); +} + +template +void set_input(P& opts, const Tensor& tensor) { // NOLINT + opts.setInput(get_data(tensor), tensor.numel()); +} + +template +void set_outputs(P& opts, const std::vector& tensors) { // NOLINT + opts.setOutputs(get_multi_data(tensors), tensors[0].numel()); +} + +template +void set_inputs(P& opts, const std::vector& tensors) { // NOLINT + opts.setInputs(get_multi_data(tensors), tensors[0].numel()); +} + +ProcessGroupGloo::GlooTask::GlooTask(int rank, + const std::vector& inputs, + CommType comm_type) + : 
ProcessGroup::Task(rank, inputs, comm_type) { + PADDLE_ENFORCE_EQ(CheckTensorsInCPUPlace(inputs), true, + platform::errors::Fatal( + "Only CPU place is supported for ProcessGroupGloo.")); +} + +ProcessGroupGloo::ProcessGroupGloo(const std::shared_ptr& store, + int rank, int world_size, + const std::shared_ptr options) + : ProcessGroup(rank, world_size), _tag(0), _store(store) { + _context = std::make_shared(rank, world_size); + auto prefix_store = + ::gloo::rendezvous::PrefixStore(std::to_string(0), *_store); + _context->connectFullMesh(prefix_store, options->device); +} + +class BroadcastGlooTask : public ProcessGroupGloo::GlooTask { + public: + BroadcastGlooTask(const std::shared_ptr& context, + const std::vector& inputs, int rank, int root, + uint32_t tag) + : ProcessGroupGloo::GlooTask(rank, inputs, CommType::BROADCAST), + _context(context), + _root(root), + _inputs(inputs), + _tag(tag) {} + + void Run() override { _do_broadcast(_inputs[0]); } + + private: + std::shared_ptr _context; + const int _root; + std::vector _inputs{}; + const uint32_t _tag; + + void _do_broadcast(const Tensor& tensor) { + gloo::BroadcastOptions opts(_context); + const auto& dtype = tensor.type(); + GENERATE_FUNC(dtype, set_output, opts, tensor); + opts.setRoot(_root); + opts.setTag(_tag); + gloo::broadcast(opts); + } +}; + +std::shared_ptr ProcessGroupGloo::Broadcast( + std::vector& inputs, const BroadcastOptions& opts) { + auto root = opts.source_rank; + std::unique_ptr task; + auto tag = next_tag(); + auto context = get_context(); + task = std::make_unique(context, inputs, rank_, root, tag); + task->Run(); + return task; +} + +class AllreduceGlooTask : public ProcessGroupGloo::GlooTask { + public: + AllreduceGlooTask(int rank, const std::shared_ptr& context, + std::vector& inputs, ReduceOp reduce_op, // NOLINT + uint32_t tag) + : ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLREDUCE), + _context(context), + _inputs(inputs), + _reduce_op(reduce_op), + _tag(tag) {} + + void Run() override { _do_allreduce(_inputs); } + + private: + std::shared_ptr _context; + std::vector _inputs; + const ReduceOp _reduce_op; + uint32_t _tag; + + gloo::AllreduceOptions::Func _get_function(const experimental::DataType type, + const ReduceOp op) { + gloo::AllreduceOptions::Func fn; + GENERATE_FUNC(type, _get_function_impl, fn, op); + return fn; + } + + template + void _get_function_impl(gloo::AllreduceOptions::Func& fn, // NOLINT + const ReduceOp op) { + fn = get_function(op); + } + + void _do_allreduce(std::vector& tensors) { // NOLINT + const auto& dtype = tensors[0].type(); + gloo::AllreduceOptions opts(_context); + GENERATE_FUNC(dtype, set_inputs, opts, tensors); + GENERATE_FUNC(dtype, set_outputs, opts, tensors); + opts.setReduceFunction(_get_function(dtype, _reduce_op)); + opts.setTag(_tag); + gloo::allreduce(opts); + } +}; + +std::shared_ptr ProcessGroupGloo::AllReduce( + std::vector& inputs, const AllreduceOptions& opts) { + auto tag = next_tag(); + std::shared_ptr task; + auto context = get_context(); + task = std::make_shared(rank_, context, inputs, + opts.reduce_op, tag); + task->Run(); + return task; +} + +std::shared_ptr<::gloo::transport::Device> +ProcessGroupGloo::createDeviceForInterface(const std::string& ifname) { + ::gloo::transport::tcp::attr attr; + attr.iface = ifname; + return ::gloo::transport::tcp::CreateDevice(attr); +} + +std::shared_ptr<::gloo::transport::Device> +ProcessGroupGloo::createDeviceForHostname(const std::string& hostname) { + ::gloo::transport::tcp::attr attr; + attr.hostname = 
hostname; + return ::gloo::transport::tcp::CreateDevice(attr); +} + +std::shared_ptr<::gloo::transport::Device> +ProcessGroupGloo::createDefaultDevice() { + std::array hostname{}; + auto ret = ::gethostname(hostname.data(), HOST_NAME_MAX); + PADDLE_ENFORCE_EQ(ret, 0, platform::errors::Fatal( + "Get hostname error for createDefaultDevice.")); + ::addrinfo* result; + result = tcputils::get_addr_info(hostname.data(), "", 0, AF_UNSPEC); + ::addrinfo* cur; + for (cur = result; cur != nullptr; cur = cur->ai_next) { + SocketType socket = + ::socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol); + if (socket == -1) { + continue; + } + ret = ::bind(socket, cur->ai_addr, cur->ai_addrlen); +#ifdef _WIN32 + closesocket(socket); +#else + close(socket); +#endif + if (ret == -1) { + continue; + } + break; + } + freeaddrinfo(result); + if (cur != nullptr) { + return createDeviceForHostname(hostname.data()); + } + return createDeviceForHostname("127.0.0.1"); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.h b/paddle/fluid/distributed/collective/ProcessGroupGloo.h new file mode 100644 index 0000000000000000000000000000000000000000..d989939fcb8726ab207e93dbb49bcb5b5e7444dc --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h @@ -0,0 +1,138 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#include "paddle/fluid/distributed/collective/ProcessGroup.h" + +#ifdef PADDLE_WITH_GLOO +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#endif + +#include "paddle/fluid/distributed/store/store.h" +#include "paddle/fluid/distributed/store/tcp_store.h" + +constexpr const char* GLOO_BACKEND_NAME = "GLOO"; + +namespace paddle { +namespace distributed { + +class ProcessGroupGloo : public ProcessGroup { + public: + class GlooTask : public ProcessGroup::Task, + public std::enable_shared_from_this { + public: + explicit GlooTask(int rank, const std::vector& input_tensors, + CommType comm_type); + + ~GlooTask() = default; + + virtual void Run() = 0; + bool Wait(std::chrono::milliseconds timeout) override { return true; } + bool IsCompleted() override { return true; } + void Synchronize() override {} + + protected: + friend class ProcessGroupGloo; + }; + + class GlooStore : public ::gloo::rendezvous::Store { + public: + explicit GlooStore( + const std::shared_ptr& store) + : _store(store) {} + + ~GlooStore() = default; + + std::vector get(const std::string& key) override { + VLOG(3) << "GlooStore::get"; + auto value = _store->get(key); + return std::vector(value.begin(), value.end()); + } + + void wait(const std::vector& keys) override { + VLOG(3) << "GlooStore::wait"; + for (auto& key : keys) { + _store->wait(key); + } + } + + void set(const std::string& key, const std::vector& value) override { + VLOG(3) << "GlooStore::set"; + std::vector tmp(value.begin(), value.end()); + _store->set(key, tmp); + } + + void wait(const std::vector& keys, + const std::chrono::milliseconds& timeout) override { + VLOG(3) << "GlooStore::wait"; + for (auto& key : keys) { + _store->wait(key); + } + // wait(keys); + } + + protected: + std::shared_ptr _store; + }; + + class GlooOptions { + public: + GlooOptions() = default; + ~GlooOptions() = default; + static std::shared_ptr create() { + return std::make_shared(); + } + std::shared_ptr<::gloo::transport::Device> device; + }; + + explicit ProcessGroupGloo(const std::shared_ptr& store, int rank, + int world_size, + std::shared_ptr options); + + ~ProcessGroupGloo() = default; + + std::shared_ptr Broadcast( + std::vector& inputs, + const BroadcastOptions& = BroadcastOptions()) override; + + std::shared_ptr AllReduce( + std::vector& inputs, + const AllreduceOptions& opts = AllreduceOptions()) override; + + std::shared_ptr<::gloo::Context> get_context() { return _context; } + uint64_t next_tag() { return _tag++; } + + const std::string GetBackendName() const override { + return GLOO_BACKEND_NAME; + } + + // Helper functions for Gloo. 
+ static std::shared_ptr<::gloo::transport::Device> createDeviceForHostname( + const std::string& hostname); + static std::shared_ptr<::gloo::transport::Device> createDeviceForInterface( + const std::string& ifname); + static std::shared_ptr<::gloo::transport::Device> createDefaultDevice(); + + protected: + uint32_t _tag; + std::shared_ptr _context; + std::shared_ptr _store; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 5d96e730aa4b1aeae3fc242ca43f63d909325a4e..88d8fb69eb69808729b6e0ec3c374569b1575671 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -473,5 +473,148 @@ std::shared_ptr ProcessGroupNCCL::Recv( return task; } +std::shared_ptr ProcessGroupNCCL::AllGather( + std::vector& in_tensors, std::vector& out_tensors) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), true, + platform::errors::InvalidArgument("All outputs should be in CudaPlace.")); + return Collective( + in_tensors, out_tensors, + [&](const Tensor& input, Tensor& output, ncclComm_t comm, + const gpuStream_t& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::ncclAllGather( + input_tensor->data(), output_tensor->data(), input_tensor->numel(), + platform::ToNCCLDataType(input.type()), comm, stream); + }, + CommType::ALLGATHER); +} + +void* GetPointerByOffset(void* raw_pointer, size_t offset, + experimental::DataType type) { + if (type == experimental::DataType::FLOAT32) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else if (type == experimental::DataType::FLOAT64) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else if (type == experimental::DataType::INT32) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else if (type == experimental::DataType::INT64) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else if (type == experimental::DataType::FLOAT16) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "This datatype in nccl is not supported.")); + } +} + +std::shared_ptr ProcessGroupNCCL::AllToAll( + std::vector& in_tensors, std::vector& out_tensors) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + in_tensors, out_tensors, + [&](const Tensor& input, Tensor& output, ncclComm_t comm, + const gpuStream_t& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + size_t offset = 0; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto i = 0; i < size_; i++) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + GetPointerByOffset(input_tensor->data(), offset, input.type()), + input_tensor->numel() / size_, + platform::ToNCCLDataType(input.type()), i, comm, stream)); + 
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + GetPointerByOffset(output_tensor->data(), offset, input.type()), + input_tensor->numel() / size_, + platform::ToNCCLDataType(input.type()), i, comm, stream)); + offset += input_tensor->numel() / size_; + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + }, + CommType::ALLREDUCE); +} + +std::shared_ptr ProcessGroupNCCL::Reduce( + std::vector& tensors, const ReduceOptions& opts) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + tensors, tensors, + [&](const Tensor& input, Tensor& output, ncclComm_t comm, + const gpuStream_t& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( + input_tensor->data(), output_tensor->data(), input.numel(), + platform::ToNCCLDataType(input.type()), + ToNCCLRedType(opts.reduce_op), opts.root_rank, comm, stream)); + }, + CommType::REDUCE); +} + +std::shared_ptr ProcessGroupNCCL::Scatter( + std::vector& in_tensors, std::vector& out_tensors, + const ScatterOptions& opts) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + in_tensors, out_tensors, + [&](const Tensor& input, Tensor& output, ncclComm_t comm, + const gpuStream_t& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + size_t offset = 0; + if (rank_ == opts.root_rank) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto i = 0; i < size_; i++) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + GetPointerByOffset(input_tensor->data(), offset, input.type()), + input_tensor->numel() / size_, + platform::ToNCCLDataType(input.type()), i, comm, stream)); + offset += input_tensor->numel() / size_; + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + output_tensor->data(), input_tensor->numel() / size_, + platform::ToNCCLDataType(input.type()), opts.root_rank, comm, + stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + output_tensor->data(), input_tensor->numel() / size_, + platform::ToNCCLDataType(input.type()), opts.root_rank, comm, + stream)); + } + }, + CommType::SCATTER); +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index cfeb6467f0dbf21f116b1880f8b64a55bb2314a1..d63a5e768382c6fd9141ff9d96a3187b0adab7de 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -98,6 +98,20 @@ class ProcessGroupNCCL : public ProcessGroup { std::shared_ptr Recv(std::vector& tensors, int src_rank) override; + std::shared_ptr AllGather( + std::vector& in_tensors, + std::vector& out_tensors) override; + + std::shared_ptr AllToAll( + std::vector& in, std::vector& out) override; + + std::shared_ptr Reduce( + std::vector& tensors, const ReduceOptions& opts) override; + + std::shared_ptr Scatter(std::vector& 
in_tensors, + std::vector& out_tensors, + const ScatterOptions&) override; + protected: virtual std::shared_ptr CreateTask( std::vector places, int rank, CommType opType, diff --git a/paddle/fluid/distributed/collective/Types.h b/paddle/fluid/distributed/collective/Types.h index 699222ac452dbcc2f0b1b41c70c6036dc915a427..973f7c643542757c0bce68f8ccdefeadc97f15d4 100644 --- a/paddle/fluid/distributed/collective/Types.h +++ b/paddle/fluid/distributed/collective/Types.h @@ -36,5 +36,14 @@ struct BarrierOptions { std::vector place_ids; }; +struct ReduceOptions { + ReduceOp reduce_op = ReduceOp::SUM; + int root_rank = 0; +}; + +struct ScatterOptions { + int root_rank = 0; +}; + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/store/store.h b/paddle/fluid/distributed/store/store.h index 2673314d222d2b32e42c42a3a94df71a1887914a..2581a74d7e8187b0a38b27a2f27e9b84ddf26b53 100644 --- a/paddle/fluid/distributed/store/store.h +++ b/paddle/fluid/distributed/store/store.h @@ -32,6 +32,8 @@ class Store { virtual int64_t add(const std::string& key, int64_t value) = 0; virtual std::vector get(const std::string& key) = 0; virtual void wait(const std::string& key) = 0; + virtual void set(const std::string& key, + const std::vector& value) = 0; virtual const std::chrono::seconds& timeout() const { return _timeout; } diff --git a/paddle/fluid/distributed/store/tcp_store.cc b/paddle/fluid/distributed/store/tcp_store.cc index de85ac0d910e93257a308052ca1fcf193680a183..8675981955dacfb4917f32047681fea8b08d7bba 100644 --- a/paddle/fluid/distributed/store/tcp_store.cc +++ b/paddle/fluid/distributed/store/tcp_store.cc @@ -27,11 +27,13 @@ namespace detail { constexpr int INFTIME = -1; -std::unique_ptr MasterDaemon::start(SocketType socket) { - return std::make_unique(socket); +std::unique_ptr MasterDaemon::start(SocketType socket, + int nranks) { + return std::make_unique(socket, nranks); } -MasterDaemon::MasterDaemon(SocketType socket) : _listen_socket(socket) { +MasterDaemon::MasterDaemon(SocketType socket, int nranks) + : _listen_socket(socket), _nranks(nranks) { _background_thread = std::thread{&MasterDaemon::run, this}; } @@ -64,6 +66,13 @@ void MasterDaemon::_do_add(SocketType socket) { tcputils::send_value(socket, new_value); } +void MasterDaemon::_do_set(SocketType socket) { + VLOG(3) << "MasterDaemon::_do_set"; + std::string key = tcputils::receive_string(socket); + auto value = tcputils::receive_vector(socket); + _store[key] = value; +} + void MasterDaemon::_do_get(SocketType socket) { std::string key = tcputils::receive_string(socket); auto iter = _store.find(key); @@ -71,16 +80,15 @@ void MasterDaemon::_do_get(SocketType socket) { iter, _store.end(), platform::errors::InvalidArgument("Key %s not found in TCPStore.", key)); std::vector value = iter->second; - VLOG(3) << "TCPStore: value (" - << std::stoll(std::string(reinterpret_cast(value.data()), - value.size())) - << ") for key (" << key << ")."; tcputils::send_vector(socket, value); } void MasterDaemon::_do_stop(SocketType socket) { + VLOG(3) << "MasterDaemon::_do_stop"; ReplyType value = ReplyType::STOP_WAIT; - _stop = true; + if (--_nranks == 0) { + _stop = true; + } tcputils::send_value(socket, value); } @@ -140,21 +148,27 @@ void MasterDaemon::run() { case Command::GET: _do_get(fds[i].fd); break; + case Command::SET: + _do_set(fds[i].fd); + break; case Command::WAIT: _do_wait(fds[i].fd); break; case Command::STOP: _do_stop(fds[i].fd); break; + default: + VLOG(0) << "Unknow command: " << static_cast(command); + 
exit(-1); } } } } -std::unique_ptr TCPServer::create(uint16_t port) { +std::unique_ptr TCPServer::create(uint16_t port, int nranks) { int socket = tcputils::tcp_listen("", std::to_string(port), AF_INET); auto server = std::make_unique(); - server->_master_daemon = MasterDaemon::start(socket); + server->_master_daemon = MasterDaemon::start(socket, nranks); return server; } @@ -200,7 +214,7 @@ TCPStore::TCPStore(std::string host, uint16_t port, bool is_master, size_t num_workers, std::chrono::seconds timeout) : Store(timeout), _is_master(is_master), _num_workers(num_workers) { if (_is_master) { - _server = detail::TCPServer::create(port); + _server = detail::TCPServer::create(port, num_workers); } _client = detail::TCPClient::connect(host, port); @@ -213,36 +227,41 @@ void TCPStore::waitWorkers() { } add(_init_key, 1); - if (_server) { - auto begin = std::chrono::steady_clock::now(); - do { - auto value = get(_init_key); - int completed = std::stoi(std::string(value.begin(), value.end())); - VLOG(3) << completed << " worker ready, total " << _num_workers; - if (completed >= _num_workers) { - break; - } - const auto elapsed = std::chrono::duration_cast( - std::chrono::steady_clock::now() - begin); - - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - if (_timeout != tcputils::kNoTimeout && elapsed > _timeout) { - PADDLE_ENFORCE_EQ( - completed, _num_workers, - platform::errors::InvalidArgument( - "TCPStore timeouted and not all workers got ready.")); - } - } while (true); - } + auto begin = std::chrono::steady_clock::now(); + do { + auto value = get(_init_key); + int completed = std::stoi(std::string(value.begin(), value.end())); + VLOG(3) << completed << " worker ready, total " << _num_workers; + if (completed >= _num_workers) { + break; + } + const auto elapsed = std::chrono::duration_cast( + std::chrono::steady_clock::now() - begin); + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + if (_timeout != tcputils::kNoTimeout && elapsed > _timeout) { + PADDLE_ENFORCE_EQ( + completed, _num_workers, + platform::errors::InvalidArgument( + "TCPStore timeouted and not all workers got ready.")); + } + } while (true); VLOG(3) << "TCPStore initialized."; } int64_t TCPStore::add(const std::string& key, int64_t value) { + VLOG(3) << "TCPStore add."; _client->send_command_for_key(Command::ADD, _key_prefix + key); _client->send_value(value); return _client->receive_value(); } +void TCPStore::set(const std::string& key, const std::vector& value) { + VLOG(3) << "TCPStore set."; + _client->send_command_for_key(Command::SET, _key_prefix + key); + _client->send_vector(value); +} + std::vector TCPStore::get(const std::string& key) { wait(key); _client->send_command_for_key(Command::GET, _key_prefix + key); @@ -252,6 +271,7 @@ std::vector TCPStore::get(const std::string& key) { void TCPStore::wait(const std::string& key) { ReplyType reply; + VLOG(3) << "TCPStore wait."; do { _client->send_command_for_key(Command::WAIT, _key_prefix + key); @@ -262,6 +282,7 @@ void TCPStore::wait(const std::string& key) { TCPStore::~TCPStore() { _client->send_command_for_key(Command::STOP, ""); + VLOG(3) << "~TCPStore"; ReplyType ret = _client->receive_value(); PADDLE_ENFORCE_EQ(ret, ReplyType::STOP_WAIT, platform::errors::InvalidArgument( diff --git a/paddle/fluid/distributed/store/tcp_store.h b/paddle/fluid/distributed/store/tcp_store.h index cd706dd6640acf5e0b5b3714175dac7a6cecb25a..17c1d8ea30a421f04d054d59ac93c8c60406ef68 100644 --- a/paddle/fluid/distributed/store/tcp_store.h +++ 
b/paddle/fluid/distributed/store/tcp_store.h
@@ -27,15 +27,16 @@ namespace paddle {
 namespace distributed {
 
 enum class ReplyType { WAITING, STOP_WAIT };
-enum class Command { ADD, GET, WAIT, STOP };
+enum class Command { ADD, GET, SET, WAIT, STOP };
 
 namespace detail {
 
 class MasterDaemon {
  public:
-  static std::unique_ptr<MasterDaemon> start(SocketType listen_socket);
+  static std::unique_ptr<MasterDaemon> start(SocketType listen_socket,
+                                             int nranks);
   MasterDaemon() = delete;
-  explicit MasterDaemon(SocketType listen_socket);
+  explicit MasterDaemon(SocketType listen_socket, int nranks);
   ~MasterDaemon();
 
  private:
@@ -43,18 +44,20 @@ class MasterDaemon {
   void _do_add(SocketType socket);
   void _do_wait(SocketType socket);
   void _do_get(SocketType socket);
+  void _do_set(SocketType socket);
   void _do_stop(SocketType socket);
   SocketType _listen_socket;
   std::vector<SocketType> _sockets;
   std::unordered_map<std::string, std::vector<uint8_t>> _store;
   std::thread _background_thread{};
+  int _nranks;
   bool _stop = false;
 };
 
 class TCPServer {
  public:
   TCPServer() = default;
-  static std::unique_ptr<TCPServer> create(std::uint16_t port);
+  static std::unique_ptr<TCPServer> create(std::uint16_t port, int nranks);
 
  private:
   std::unique_ptr<MasterDaemon> _master_daemon;
@@ -97,6 +100,7 @@ class TCPStore : public Store {
   int64_t add(const std::string& key, int64_t value) override;
   std::vector<uint8_t> get(const std::string& key) override;
   void wait(const std::string& key) override;
+  void set(const std::string& key, const std::vector<uint8_t>& value) override;
 
  private:
   void waitWorkers();
diff --git a/paddle/fluid/distributed/store/tcp_utils.cc b/paddle/fluid/distributed/store/tcp_utils.cc
index d0561d0b9a9c5b01c32620e72d21ed562e42637e..a28cba288333d7f1c2a705049c29b59f43a70cc5 100644
--- a/paddle/fluid/distributed/store/tcp_utils.cc
+++ b/paddle/fluid/distributed/store/tcp_utils.cc
@@ -46,9 +46,10 @@ void close_socket(SocketType socket) {
   hints.ai_socktype = SOCK_STREAM;
 
   const char* node = host.empty() ? nullptr : host.c_str();
+  const char* port_cstr = port.empty() ? nullptr : port.c_str();
   int n;
-  n = ::getaddrinfo(node, port.c_str(), &hints, &res);
+  n = ::getaddrinfo(node, port_cstr, &hints, &res);
   const char* gai_err = ::gai_strerror(n);
   const char* proto =
       (family == AF_INET ? "IPv4" : family == AF_INET6 ? 
"IPv6" : ""); diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index 734cabdc3dc914349e2ad30b657bfb6542a7472a..07fa40165167ce2352018c0e1b1cb08222d5a181 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -24,11 +24,14 @@ class GradNodeAccumulation : public GradNodeBase { public: // Constructor: configure fwd input tensors to grad node explicit GradNodeAccumulation(AutogradMeta* meta) : GradNodeBase(1, 1) { + VLOG(6) << "Construct GradNodeAccumulation"; weak_grad_ = meta->WeakGrad(); SetDefaultGradInOutMeta(); } - ~GradNodeAccumulation() override = default; + ~GradNodeAccumulation() override { + VLOG(6) << "Destruct GradNodeAccumulation"; + } // Functor: perform backward computations virtual std::vector> operator()( diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h index c0150a1730d52b3410ba4ea0d31674fbfed596ae..247fde6ed1f869542969b068cdae9f59cedd732a 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h @@ -46,7 +46,7 @@ class GradNodeScale : public GradNodeBase { const std::vector& tensors); void SetAttributes_scale(float scale); - + std::string name() override { return ""; } // Members: define fwd input tensors // For Scale there is no fwd input tensor needed private: diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 102fad56373803a19f07afc7dda72e9704ac83d5..2fc846cccc22e8937f8865a5063c77321941582a 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -996,6 +996,29 @@ static std::string GenerateGradNodeCreationContent( // then generate: "egr::AutogradMeta* p_autograd_out = // egr::EagerUtils::autograd_meta("op_proto->outputs()[0].name()")" std::string get_autograd_meta_str = " // Prepare Autograd Meta \n"; + // If single output slotname and not duplicable, + // then generate: "egr::AutogradMeta* p_autograd_out = + // egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")" + for (const proto::OpProto::Var& output : out_vars) { + const std::string& output_name = output.name(); + const std::string& output_autograd_name = "p_autograd_" + output_name; + + if (output.duplicable()) { + const char* GET_MULTI_AUTOGRAD_META_TEMPLATE = + " std::vector %s = " + "egr::EagerUtils::autograd_meta(&%s);\n"; + get_autograd_meta_str += paddle::string::Sprintf( + GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); + } else { + const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = + " egr::AutogradMeta* %s = " + "egr::EagerUtils::autograd_meta(&%s);\n"; + get_autograd_meta_str += paddle::string::Sprintf( + GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); + } + } + VLOG(6) << "Generated outputs autograd_meta"; + for (const proto::OpProto::Var& input : in_vars) { const std::string& input_name = input.name(); const std::string& input_autograd_name = "p_autograd_" + input_name; @@ -1024,31 +1047,6 @@ static std::string GenerateGradNodeCreationContent( } VLOG(6) << "Generated inputs autograd_meta"; - // If single output slotname and not duplicable, - // then generate: "egr::AutogradMeta* p_autograd_out = - // 
egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")" - for (const proto::OpProto::Var& output : out_vars) { - const std::string& output_name = output.name(); - const std::string& output_autograd_name = "p_autograd_" + output_name; - - // Skip Intermediate Tensor - - if (output.duplicable()) { - const char* GET_MULTI_AUTOGRAD_META_TEMPLATE = - " std::vector %s = " - "egr::EagerUtils::autograd_meta(&%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( - GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); - } else { - const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = - " egr::AutogradMeta* %s = " - "egr::EagerUtils::autograd_meta(&%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( - GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); - } - } - VLOG(6) << "Generated outputs autograd_meta"; - std::string prepare_autograd_meta_str = ""; prepare_autograd_meta_str += get_autograd_meta_str; prepare_autograd_meta_str += "\n"; @@ -1204,11 +1202,12 @@ static std::string GenerateGradNodeCreationContent( " %s" " bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(%s);\n" " if(require_any_grad) {\n" + " VLOG(6) << \" Construct Grad for %s \"; \n" " egr::EagerUtils::PassStopGradient(%s);\n" "%s\n }"; std::string grad_node_creation_body_str = paddle::string::Sprintf( GRAD_NODE_CREATION_TEMPLATE, prepare_autograd_meta_str, - compute_require_grad_args, pass_stop_gradient_args, + compute_require_grad_args, op_type, pass_stop_gradient_args, grad_node_creation_str); return grad_node_creation_body_str; @@ -2083,22 +2082,24 @@ static std::string GenerateGradNodeHeaderContents( const char* GRAD_NODE_TEMPLATE = "class GradNode%s : public egr::GradNodeBase {\n" " public:\n" - " GradNode%s() : egr::GradNodeBase() {}\n" + " GradNode%s() : egr::GradNodeBase() { VLOG(7) << \" Construct " + "GradNode%s \"; }\n" " GradNode%s(size_t bwd_in_slot_num, size_t bwd_out_slot_num) : " - "egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {}\n" - " ~GradNode%s() override = default;\n" + "egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) { VLOG(7) << \" " + "Construct GradNode%s \"; }\n" + " ~GradNode%s() override { VLOG(6) << \" Destruct GradNode%s \"; }\n" "\n" " virtual std::vector> " "operator()(const " "std::vector>& grads) " "override;\n" "\n" + " std::string name() override { return \" GradNode%s \"; } \n " + "\n" " // SetX, SetY, ...\n" "%s\n" " // SetAttrMap\n" "%s\n" - " std::string name() { return \"GradNode%s\"; }\n" - "\n" " private:\n" " // TensorWrappers\n" "%s\n" @@ -2195,8 +2196,8 @@ static std::string GenerateGradNodeHeaderContents( VLOG(6) << "Generated TensorWrapper"; std::string grad_node_str = paddle::string::Sprintf( - GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, - set_tensor_wrappers_str, set_attr_map_str, op_type, + GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, op_type, op_type, + op_type, op_type, set_tensor_wrappers_str, set_attr_map_str, tensor_wrapper_members_str, attr_members_str); return grad_node_str; diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 02183e2ca5ce9f0996017eb7df59ee716b0f1ae2..af9540b6fb3adbda9087fb5f6a12a517e1cfea9b 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -213,8 +213,12 @@ def ParseYamlReturns(string): returns = [x.strip() for x in 
string.strip().split(",")] for i in range(len(returns)): - ret = returns[i] - returns_list.append(["", ret, i]) + ret_type = returns[i] + + assert ret_type in yaml_types_mapping.keys() + ret_type = yaml_types_mapping[ret_type] + + returns_list.append(["", ret_type, i]) return returns_list @@ -534,7 +538,7 @@ class {} : public egr::GradNodeBase {{ virtual std::vector> operator()( const std::vector>& grads) override; - + std::string name() override {{ return \" {} \"; }} // SetTensorWrapperX, SetTensorWrapperY, ... {} // SetAttributes @@ -549,8 +553,9 @@ class {} : public egr::GradNodeBase {{ """ node_declaration_str = NODE_DECLARATION_TEMPLATE.format( grad_node_name, grad_node_name, grad_node_name, grad_node_name, - set_tensor_wrapper_methods_str, set_attribute_methods_str, - tensor_wrapper_members_str, attribute_members_str) + grad_node_name, set_tensor_wrapper_methods_str, + set_attribute_methods_str, tensor_wrapper_members_str, + attribute_members_str) return node_declaration_str diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 356fdcaf054277085be57491eb1525beeac8d792..934497d7d179c1732bde68c147ed86661c25ddae 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -48,12 +48,16 @@ std::unordered_map getInDegreeMap( } visited.insert(node); + PADDLE_ENFORCE_NOT_NULL( + node, + paddle::platform::errors::Fatal( + "We got null node when we traverse the backward graph, and this " + "should not happened please check your code and contact us.")); // Find and append next nodes const std::vector>& edges = node->GetEdges(); for (const auto& edge_list : edges) { for (const Edge& edge : edge_list) { GradNodeBase* next_node = edge.GetMutableGradNode().get(); - // Next node could be nullptr if it is leaf tensor with no // AccumulationNode attached // Or it could also originated from dispensable inputs @@ -67,7 +71,6 @@ std::unordered_map getInDegreeMap( } } } - return node_in_degree_map; } diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index b1189106b8f871ab618972ad93e9812ce443e55d..427be83c3bbee31eaa0c7e3d26d2d9599b344450 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -30,6 +30,7 @@ namespace egr { GradNodeBase::GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num) { + VLOG(6) << "Construct GradNodeBase"; bwd_in_meta_.resize(bwd_in_slot_num); bwd_out_meta_.resize(bwd_out_slot_num); // adj_edges has the same num as backward outputs @@ -49,11 +50,15 @@ void GradNodeBase::AddEdges(std::vector* metas, size_t slot_id) { // its pre-ops if (meta && !meta->StopGradient()) { auto node = meta->GetMutableGradNode(); - if (node) { + if (node && node.get()) { + VLOG(6) << "Add Edges for slot: " << slot_id + << " which is: " << meta->GetMutableGradNode()->name(); adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); } else { meta->SetGradNode(std::make_shared(meta)); + VLOG(6) << "Add Edges for slot: " << slot_id + << " which is: " << meta->GetMutableGradNode()->name(); adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); } @@ -70,7 +75,7 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) { "inputs's slot num.")); if (meta && !meta->StopGradient()) { auto node = meta->GetMutableGradNode(); - if (node) { + if (node && node.get()) { VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " << this->name() << " to " << meta->GetMutableGradNode()->name(); 
adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index eeac1cca4acf33190ce30613e4a86e99a95b651b..16513f05e0777a8e57f54c925d68867dda656612 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -76,10 +76,10 @@ class GradSlotMeta { class GradNodeBase { public: - GradNodeBase() = default; + GradNodeBase() { VLOG(6) << "Construct GradNodeBase"; } GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num); // TODO(jiabin): Should we have other constructor here? - virtual ~GradNodeBase() = default; + virtual ~GradNodeBase() { VLOG(6) << "Destruct GradNodeBase"; } /** * operator() designed to contian the real backward execution logic, it should diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h index bb84e2dda81bafe624fe7734a0a47391eeb0adfa..535c93ac53b1751d9634476e47f32dc0cbe22708 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h @@ -30,6 +30,7 @@ class GradTestNode : public egr::GradNodeBase { GradTestNode(float val, int in_num, int out_num) : GradNodeBase(in_num, out_num), val_(val) {} GradTestNode() : GradNodeBase() { val_ = 1.0; } + std::string name() override { return "GradTestNode"; } std::vector> operator()( const std::vector>& grads) override { diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 39861c80522a920502fff91177256a4b7abf6dc6..8a57d2694535e9c27e88416468fe5a67ce020b43 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -122,9 +122,10 @@ paddle::experimental::Tensor* EagerUtils::mutable_grad( void EagerUtils::SetHistory(std::vector* autograd_metas, const std::shared_ptr& grad_node) { for (const auto& autograd_meta : *autograd_metas) { - if (dynamic_cast(autograd_meta->GradNode())) { - VLOG(6) << "Warning: Reseting GradNodeAccumulation for leaf tensor is " - "detected"; + if (autograd_meta->GradNode()) { + VLOG(7) << "Should not set grad node twice, original node is:" + << autograd_meta->GradNode()->name() + << "current is: " << grad_node->name(); } autograd_meta->SetGradNode(grad_node); } @@ -132,11 +133,11 @@ void EagerUtils::SetHistory(std::vector* autograd_metas, void EagerUtils::SetHistory(AutogradMeta* autograd_meta, const std::shared_ptr& grad_node) { - if (dynamic_cast(autograd_meta->GradNode())) { - VLOG(6) - << "Warning: Reseting GradNodeAccumulation for leaf tensor is detected"; + if (autograd_meta->GradNode()) { + VLOG(7) << "Should not set grad node twice, original node is:" + << autograd_meta->GradNode()->name() + << "current is: " << grad_node->name(); } - autograd_meta->SetGradNode(grad_node); } diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc index 96aa95bde337436dd6eb584b3eea5395b5301a34..11190309814e7c75777a6cddd7e4d24bfc7ba9e6 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc @@ -12,12 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include #include +#include -#include #include -#include -#include + +#include "gtest/gtest.h" #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" @@ -25,7 +26,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/place.h" -USE_OP(batch_norm); +USE_OP_ITSELF(batch_norm); USE_OP_DEVICE_KERNEL(batch_norm, MKLDNN); USE_OP(conv2d_transpose); USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN); diff --git a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h index 7b3916bafc93eda8cb1afbf54b706e032c5233dd..bc65231abe7371a931f709c9190b55fde24f0543 100644 --- a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h +++ b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h @@ -409,7 +409,7 @@ class ThreadPoolTempl { return false; } platform::RecordEvent("SleepWaitForWork", - platform::TracerEventType::UserDefined, 2); + platform::TracerEventType::UserDefined, 10); ec_.CommitWait(waiter); blocked_--; return true; diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 6414dd455db4f2e39d958760449e3eb9d7d362f0..b68748a687c529e13282ea1398a60defd6cdf83e 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2106,15 +2106,19 @@ void OperatorWithKernel::BuildPhiKernelContext( for (size_t offset = 0; offset < outs_vector.size(); ++offset) { phi::TensorBase* tensor_out = nullptr; auto* var = outs_vector[offset]; - if (var->template IsType()) { - tensor_out = var->template GetMutable(); - } else if (var->template IsType()) { - tensor_out = var->template GetMutable(); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported output `%s` type when call pt kernel.", - framework::ToTypeName(var->Type()))); + + if (var) { + if (var->template IsType()) { + tensor_out = var->template GetMutable(); + } else if (var->template IsType()) { + tensor_out = var->template GetMutable(); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported output `%s` type when call pt kernel.", + framework::ToTypeName(var->Type()))); + } } + pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } @@ -2215,8 +2219,6 @@ void OperatorWithKernel::BuildPhiKernelContext( vector_int_attr.end()); pt_kernel_context->EmplaceBackAttr(vector_int64_attr); } - // TODO(YuanRisheng) Need support vector attr - } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 3b5762720e7fb4a9eb0be157f6dabf07aa9353c2..30dbe07d7afca6473785d7a64be6864534b84e3c 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -314,15 +314,18 @@ void BuildDygraphPhiKernelContext( phi::TensorBase* tensor_out = nullptr; auto* var = outs_vector[offset]->MutableVar(); - if (var->template IsType()) { - tensor_out = var->template GetMutable(); - } else if (var->template IsType()) { - tensor_out = var->template GetMutable(); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported output `%s` type when call pt kernel.", - framework::ToTypeName(var->Type()))); + if (var) { + if (var->template IsType()) { + tensor_out = 
var->template GetMutable(); + } else if (var->template IsType()) { + tensor_out = var->template GetMutable(); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported output `%s` type when call pt kernel.", + framework::ToTypeName(var->Type()))); + } } + kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i); diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 949cf021cf0fa322970c210fa26f698fd2bc45b2..174207deb08b84194d6f20fe04e4c27245295caf 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -1289,15 +1289,3 @@ REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp, ops::BatchNormDoubleGradMaker); REGISTER_OPERATOR(batch_norm_grad_grad, ops::BatchNormDoubleGradOp, ops::BatchNormDoubleGradOpInplaceInferer); - -REGISTER_OP_CPU_KERNEL( - batch_norm, ops::BatchNormKernel, - ops::BatchNormKernel); -REGISTER_OP_CPU_KERNEL( - batch_norm_grad, - ops::BatchNormGradKernel, - ops::BatchNormGradKernel); -REGISTER_OP_CPU_KERNEL( - batch_norm_grad_grad, - ops::BatchNormDoubleGradKernel, - ops::BatchNormDoubleGradKernel); diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index d59396db1517faadaa2dd9e9af770d2e8a23ec56..a19b087245a89a4a12f062b1ce27835b98ecfd66 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -41,1327 +41,5 @@ using CudnnDataType = platform::CudnnDataType; template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; -template -static __global__ void BNForwardInference( - const T *x, const BatchNormParamType *mean, - const BatchNormParamType *variance, const BatchNormParamType *scale, - const BatchNormParamType *bias, const int C, const int N, const int HxW, - const double epsilon, T *y) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - int num = N * C * HxW; - for (int i = gid; i < num; i += stride) { - const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C; - BatchNormParamType x_sub_mean = - static_cast>(x[i]) - mean[c]; - BatchNormParamType inv_var = 1 / sqrt(variance[c] + epsilon); - y[i] = static_cast(scale[c] * x_sub_mean * inv_var + bias[c]); - } -} - -template -static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( - const T *x, const BatchNormParamType *scale, - const BatchNormParamType *bias, const int C, const int N, const int HxW, - const double epsilon, double exponentialAverageFactor, T *y, - BatchNormParamType *mean, BatchNormParamType *variance, - BatchNormParamType *save_mean, - BatchNormParamType *save_inv_variance) { - int outer_size = C; - int inner_size = N * HxW; - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage mean_storage; - __shared__ typename BlockReduce::TempStorage variance_storeage; - __shared__ BatchNormParamType mean_val; - __shared__ BatchNormParamType variance_val; - __shared__ BatchNormParamType inv_var_val; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType x_sum = static_cast>(0); - BatchNormParamType x_square_sum = static_cast>(0); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? 
(j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType x_i = static_cast>(x[index]); - x_sum += x_i; - x_square_sum += x_i * x_i; - } - x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); - x_square_sum = - BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); - if (threadIdx.x == 0) { - mean_val = x_sum / inner_size; - variance_val = x_square_sum / inner_size - mean_val * mean_val; - inv_var_val = 1 / sqrt(variance_val + epsilon); - - if (save_mean && save_inv_variance) { - save_mean[i] = mean_val; - save_inv_variance[i] = inv_var_val; - } - mean[i] = (1 - exponentialAverageFactor) * mean_val + - exponentialAverageFactor * mean[i]; - variance[i] = (1 - exponentialAverageFactor) * variance_val + - exponentialAverageFactor * variance[i]; - } - __syncthreads(); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType x_sub_mean = - static_cast>(x[index]) - mean_val; - y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; - } - } -} - -template -class BatchNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("It must use CUDAPlace.")); - double epsilon = static_cast(ctx.Attr("epsilon")); - float momentum = ctx.Attr("momentum"); - const bool is_test = ctx.Attr("is_test"); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool trainable_stats = ctx.Attr("trainable_statistics"); - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - - bool test_mode = is_test && (!trainable_stats); - - // Get the size for each dimension. - // NCHW [batch_size, in_channels, in_height, in_width] - const auto *x = ctx.Input("X"); - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ( - x_dims.size() >= 2 && x_dims.size() <= 5, true, - platform::errors::InvalidArgument( - "The size of input's dimensions should be between 2 and 5" - "But received: the size of input's dimensions is [%d]", - x_dims.size())); - - auto *y = ctx.Output("Y"); - y->mutable_data(ctx.GetPlace()); - - int N, C, H, W, D; - ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); - - auto dtype = platform::CudnnDataType::type; - -#ifdef PADDLE_WITH_HIP - auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC - : DataLayout::kNCHW; - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// HIP do not support compute format of NHWC -// auto compute_format = DataLayout::kNCHW; -#else - const bool fast_nhwc_batch_norm = - test_mode || - (dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent); - - auto compute_format = - fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC - ? 
DataLayout::kNHWC - : DataLayout::kNCHW; -#endif - - Tensor transformed_x(x->type()); - Tensor transformed_y(y->type()); - if (data_layout == DataLayout::kNHWC && - compute_format == DataLayout::kNCHW && x_dims.size() > 2) { - VLOG(3) << "Transform input tensor from NHWC to NCHW."; - ResizeToChannelFirst(ctx, x, - &transformed_x); - TransToChannelFirst(ctx, x, - &transformed_x); - ResizeToChannelFirst(ctx, y, - &transformed_y); - } else { - transformed_x.ShareDataWith(*x); - transformed_y.ShareDataWith(*y); - } - -// ------------------- cudnn descriptors --------------------- -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// miopenTensorDescriptor_t data_desc_; -// miopenTensorDescriptor_t bn_param_desc_; -// miopenBatchNormMode_t mode_; - -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); -#else - cudnnTensorDescriptor_t data_desc_; - cudnnTensorDescriptor_t bn_param_desc_; - cudnnBatchNormMode_t mode_; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); -#endif - - if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { - LOG(ERROR) << "Provided epsilon is smaller than " - << "CUDNN_BN_MIN_EPSILON. Setting it to " - << "CUDNN_BN_MIN_EPSILON instead."; - } - epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); - -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// mode_ = miopenBNSpatial; -#elif CUDNN_VERSION_MIN(7, 0, 1) - if (FLAGS_cudnn_batchnorm_spatial_persistent) { - mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - } else if (H == 1 && W == 1) { - mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; - } else { - mode_ = CUDNN_BATCHNORM_SPATIAL; - } -#else - if (H == 1 && W == 1) { - mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; - } else { - mode_ = CUDNN_BATCHNORM_SPATIAL; - } -#endif // CUDNN_VERSION_MIN(7, 0, 1) - - VLOG(3) << "Setting descriptors."; - std::vector dims; - std::vector strides; - if (compute_format == DataLayout::kNCHW) { - dims = {N, C, H, W, D}; - strides = {C * H * W * D, H * W * D, W * D, D, 1}; - } else { - dims = {N, C, H, W, D}; - strides = {H * W * D * C, 1, W * D * C, D * C, C}; - } - -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( -// data_desc_, CudnnDataType::type, -// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), -// const_cast(strides.data()))); -// Note: PERSISTENT not implemented for inference -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDeriveBNTensorDescriptor( -// bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); - // Note: PERSISTENT not implemented for inference - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( - bn_param_desc_, data_desc_, - test_mode ? 
CUDNN_BATCHNORM_SPATIAL : mode_)); -#endif - - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - auto &dev_ctx = ctx.template device_context(); - - auto handle = dev_ctx.cudnn_handle(); - - // Now, depending on whether we are running test or not, we have two paths. - // It is training mode when it's not reference AND not using pre-trained - // model. - bool training = !test_mode && !use_global_stats; - if (!training) { - // only when test we use input to do computation. - const auto *est_mean = ctx.Input("Mean"); - const auto *est_var = ctx.Input("Variance"); - // Run inference mode. - PADDLE_ENFORCE_EQ( - est_mean->dims().size(), 1UL, - platform::errors::InvalidArgument( - "The size of mean's dimensions must equal to 1." - "But received: the size of mean's dimensions mean is [%d]," - "the dimensions of mean is [%s].", - est_mean->dims().size(), est_mean->dims())); - PADDLE_ENFORCE_EQ( - est_var->dims().size(), 1UL, - platform::errors::InvalidArgument( - "The size of variance's dimensions must equal to 1." - "But received: the size of variance's dimensions is [%d]," - "the dimensions of variance is [%s].", - est_var->dims().size(), est_var->dims())); - PADDLE_ENFORCE_EQ( - est_mean->dims()[0], C, - platform::errors::InvalidArgument( - "The first dimension of mean must equal to the number of " - "Channels, which is [%d]. But received: the first dimension" - "of mean is [%d], the dimensions of mean is [%s].", - C, est_mean->dims()[0], est_mean->dims())); - PADDLE_ENFORCE_EQ( - est_var->dims()[0], C, - platform::errors::InvalidArgument( - "The first dimension of variance must equal to the number" - "of Channels, which is [%d]. But received: the first dimension of" - "variance is [%d], the dimensions of variance is [%s].", - C, est_var->dims()[0], est_var->dims())); - -#ifdef PADDLE_WITH_HIP - const int block_size = 256; - const int grid_size = (N * C * H * W * D + block_size - 1) / block_size; - if (compute_format == DataLayout::kNCHW) { - BNForwardInference< - T, - DataLayout::kNCHW><<>>( - transformed_x.template data(), - est_mean->template data>(), - est_var->template data>(), - scale->template data>(), - bias->template data>(), C, N, H * W * D, - epsilon, transformed_y.template data()); - } else { - BNForwardInference< - T, - DataLayout::kNHWC><<>>( - transformed_x.template data(), - est_mean->template data>(), - est_var->template data>(), - scale->template data>(), - bias->template data>(), C, N, H * W * D, - epsilon, transformed_y.template data()); - } - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenBatchNormalizationForwardInference( -// handle, miopenBNSpatial, -// const_cast( -// static_cast(CudnnDataType::kOne())), -// const_cast( -// static_cast(CudnnDataType::kZero())), -// data_desc_, -// static_cast(transformed_x.template data()), -// data_desc_, -// static_cast( -// transformed_y.template mutable_data(ctx.GetPlace())), -// bn_param_desc_, -// const_cast(static_cast( -// scale->template data>())), -// const_cast(static_cast( -// bias->template data>())), -// const_cast(static_cast( -// est_mean->template data>())), -// const_cast(static_cast( -// est_var->template data>())), -// epsilon)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationForwardInference( - handle, - // Note: PERSISTENT not implemented for inference - CUDNN_BATCHNORM_SPATIAL, CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template 
data(), data_desc_, - transformed_y.template mutable_data(ctx.GetPlace()), - bn_param_desc_, scale->template data>(), - bias->template data>(), - est_mean->template data>(), - est_var->template data>(), epsilon)); -#endif - } else { - // if MomentumTensor is set, use MomentumTensor value, momentum - // is only used in this training branch - if (ctx.HasInput("MomentumTensor")) { - const auto *mom_tensor = ctx.Input("MomentumTensor"); - Tensor mom_cpu; - paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(), - &mom_cpu); - momentum = mom_cpu.data()[0]; - } - - // Run training mode. - // obtain running mean and running inv var, and there is no need - // to initialize them. - - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - mean_out->mutable_data>(ctx.GetPlace()); - variance_out->mutable_data>(ctx.GetPlace()); - - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - saved_mean->mutable_data>(ctx.GetPlace()); - saved_variance->mutable_data>(ctx.GetPlace()); - - if ((N * H * W * D) == 1) { - // Only 1 element in normalization dimension, - // skip the batch norm calculation, let y = x. - framework::TensorCopy(*x, ctx.GetPlace(), y); - } else { - double this_factor = 1. - momentum; - - bool called = false; -#if CUDNN_VERSION_MIN(7, 4, 1) - called = true; - size_t workspace_size = 0; - size_t reserve_space_size = 0; - void *reserve_space_ptr = nullptr; - void *workspace_ptr = nullptr; - Tensor workspace_tensor; - // Create reserve space and workspace for batch norm. - // Create tensor for each batchnorm op, it will be used in the - // backward. Thus this tensor shouldn't be temp. - auto *reserve_space = ctx.Output("ReserveSpace"); - PADDLE_ENFORCE_NOT_NULL( - reserve_space, - platform::errors::NotFound( - "The argument ReserveSpace of batch_norm op is not found.")); - - // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload:: - cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( - /*handle=*/handle, - /*mode=*/mode_, - /*bnIps=*/CUDNN_BATCHNORM_OPS_BN, - /*xDesc=*/data_desc_, - /*zDesc=*/nullptr, - /*yDesc=*/data_desc_, - /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, - /*activationDesc=*/nullptr, - /*sizeInBytes=*/&workspace_size)); - - // -------------- cudnn batchnorm reserve space -------------- - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload:: - cudnnGetBatchNormalizationTrainingExReserveSpaceSize( - /*handle=*/handle, - /*mode=*/mode_, - /*bnOps=*/CUDNN_BATCHNORM_OPS_BN, - /*activationDesc=*/nullptr, - /*xDesc=*/data_desc_, - /*sizeInBytes=*/&reserve_space_size)); - - reserve_space_ptr = reserve_space->mutable_data( - ctx.GetPlace(), transformed_x.type(), reserve_space_size); - workspace_ptr = workspace_tensor.mutable_data( - ctx.GetPlace(), transformed_x.type(), workspace_size); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationForwardTrainingEx( - handle, mode_, CUDNN_BATCHNORM_OPS_BN, CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), nullptr, nullptr, data_desc_, - transformed_y.template data(), bn_param_desc_, - scale->template data>(), - bias->template data>(), this_factor, - mean_out->template mutable_data>( - ctx.GetPlace()), - variance_out->template mutable_data>( - ctx.GetPlace()), - epsilon, - saved_mean->template mutable_data>( - ctx.GetPlace()), - saved_variance->template mutable_data>( - ctx.GetPlace()), - nullptr, workspace_ptr, workspace_size, 
reserve_space_ptr, - reserve_space_size)); -#endif // CUDNN_VERSION_MIN(7, 4, 1) - if (!called) { -#ifdef PADDLE_WITH_HIP - const int num = transformed_x.numel(); - const int block = 256; - const int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - const int grid = std::min(C, max_blocks); - if (compute_format == DataLayout::kNCHW) { - BNForwardTraining< - T, block, - DataLayout::kNCHW><<>>( - transformed_x.template data(), - scale->template data>(), - bias->template data>(), C, N, H * W * D, - epsilon, this_factor, transformed_y.template data(), - mean_out->template data>(), - variance_out->template data>(), - saved_mean->template data>(), - saved_variance->template data>()); - } else { - BNForwardTraining< - T, block, - DataLayout::kNHWC><<>>( - transformed_x.template data(), - scale->template data>(), - bias->template data>(), C, N, H * W * D, - epsilon, this_factor, transformed_y.template data(), - mean_out->template data>(), - variance_out->template data>(), - saved_mean->template data>(), - saved_variance->template data>()); - } - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenBatchNormalizationForwardTraining( -// handle, mode_, const_cast(static_cast( -// CudnnDataType::kOne())), -// const_cast( -// static_cast(CudnnDataType::kZero())), -// data_desc_, -// static_cast(transformed_x.template data()), -// data_desc_, -// static_cast( -// transformed_y.template mutable_data(ctx.GetPlace())), -// bn_param_desc_, -// const_cast(static_cast( -// scale->template data>())), -// const_cast(static_cast( -// bias->template data>())), -// this_factor, -// static_cast( -// mean_out->template mutable_data>( -// ctx.GetPlace())), -// static_cast(variance_out->template mutable_data< -// BatchNormParamType>(ctx.GetPlace())), -// epsilon, -// static_cast( -// saved_mean->template mutable_data>( -// ctx.GetPlace())), -// static_cast(saved_variance->template mutable_data< -// BatchNormParamType>(ctx.GetPlace())))); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationForwardTraining( - handle, mode_, CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), data_desc_, - transformed_y.template mutable_data(ctx.GetPlace()), - bn_param_desc_, scale->template data>(), - bias->template data>(), this_factor, - mean_out->template mutable_data>( - ctx.GetPlace()), - variance_out->template mutable_data>( - ctx.GetPlace()), - epsilon, - saved_mean->template mutable_data>( - ctx.GetPlace()), - saved_variance->template mutable_data>( - ctx.GetPlace()))); -#endif - } - } - } - - if (data_layout == DataLayout::kNHWC && - compute_format == DataLayout::kNCHW && x_dims.size() > 2) { - VLOG(3) << "Transform batchnorm output from NCHW to NHWC"; - TransToChannelLast( - ctx, &transformed_y, y); - } -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// clean when exit. -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); -#else - // clean when exit. 
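// Illustrative sketch, not code from this diff: a host-side reference of what the
// BNForwardTraining fallback kernel above computes per channel, assuming NCHW
// layout and float data. The helper names (RunningStats, cpu_bn_forward_train)
// are hypothetical; the real kernel is launched with this_factor = 1 - momentum
// as the exponential-average factor.
#include <cmath>
#include <vector>

struct RunningStats {
  std::vector<float> mean, var;  // one entry per channel
};

void cpu_bn_forward_train(const float* x, const float* scale, const float* bias,
                          int N, int C, int HxW, float epsilon,
                          float exp_avg_factor, float* y, RunningStats* running,
                          std::vector<float>* saved_mean,
                          std::vector<float>* saved_inv_var) {
  const int inner = N * HxW;
  for (int c = 0; c < C; ++c) {
    double sum = 0.0, sq_sum = 0.0;
    for (int n = 0; n < N; ++n)
      for (int k = 0; k < HxW; ++k) {
        const float v = x[(n * C + c) * HxW + k];
        sum += v;
        sq_sum += v * v;
      }
    const float mean = static_cast<float>(sum / inner);
    const float var = static_cast<float>(sq_sum / inner) - mean * mean;  // biased variance
    const float inv_std = 1.0f / std::sqrt(var + epsilon);
    (*saved_mean)[c] = mean;
    (*saved_inv_var)[c] = inv_std;
    // mirrors `mean[i] = (1 - exponentialAverageFactor) * mean_val +
    //                    exponentialAverageFactor * mean[i]` in the kernel
    running->mean[c] = (1.0f - exp_avg_factor) * mean + exp_avg_factor * running->mean[c];
    running->var[c] = (1.0f - exp_avg_factor) * var + exp_avg_factor * running->var[c];
    for (int n = 0; n < N; ++n)
      for (int k = 0; k < HxW; ++k) {
        const int idx = (n * C + c) * HxW + k;
        y[idx] = scale[c] * (x[idx] - mean) * inv_std + bias[c];
      }
  }
}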
- PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); -#endif - } -}; - -template -static __global__ LAUNCH_BOUNDS(BlockDim) void KeBNBackwardScaleBias( - const T *dy, const T *x, const BatchNormParamType *mean, - const BatchNormParamType *variance, const double epsilon, const int N, - const int C, const int HxW, BatchNormParamType *dscale, - BatchNormParamType *dbias) { - const int outer_size = C; - const int inner_size = N * HxW; - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage ds_storage; - __shared__ typename BlockReduce::TempStorage db_storage; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType ds_sum = static_cast>(0); - BatchNormParamType db_sum = static_cast>(0); - - BatchNormParamType inv_var_i = 1.0 / sqrt(variance[i] + epsilon); - BatchNormParamType mean_i = mean[i]; - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - ds_sum += static_cast>(dy[index]) * - (static_cast>(x[index]) - mean_i); - db_sum += static_cast>(dy[index]); - } - ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); - db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); - if (threadIdx.x == 0) { - dscale[i] = ds_sum * inv_var_i; - dbias[i] = db_sum; - } - __syncthreads(); - } -} - -template -static __global__ void KeBNBackwardData(const T *dy, - const BatchNormParamType *scale, - const BatchNormParamType *variance, - const double epsilon, const int C, - const int HxW, const int num, T *dx) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (int i = gid; i < num; i += stride) { - const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C; - BatchNormParamType inv_var = 1.0 / sqrt(variance[c] + epsilon); - dx[i] = static_cast(static_cast>(dy[i]) * - scale[c] * inv_var); - } -} - -template -static __global__ void KeBNRestoreData(const framework::DataLayout layout, T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const BatchNormParamType *mean, - const BatchNormParamType *variance, - double epsilon, int C, int M, - const int num, const T *y) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (int i = gid; i < num; i += stride) { - const int c = layout == framework::DataLayout::kNCHW ? 
(i / M) % C : i % C; - auto y_i = static_cast>(y[i]); - auto x_i = (y_i - bias[c]) / scale[c] / variance[c] + mean[c]; - x[i] = static_cast(x_i); - } -} - -template -class InplaceHelper { - public: - void operator()(const framework::DataLayout layout, T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const BatchNormParamType *mean, - const BatchNormParamType *variance, double epsilon, int C, - int M, const int num, const T *y, int grid2, const int block, - const gpuStream_t &stream) { - PADDLE_ENFORCE_EQ(x, y, platform::errors::InvalidArgument( - "X and Y should be inplaced in inplace mode")); - KeBNRestoreData<<>>( - layout, x, scale, bias, mean, variance, epsilon, C, M, num, y); - } -}; - -template -static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackward( - const T *dy, const T *x, const BatchNormParamType *scale, - const BatchNormParamType *saved_mean, - const BatchNormParamType *saved_inv_variance, const int C, const int N, - const int HxW, const double epsilon, T *dx, BatchNormParamType *dscale, - BatchNormParamType *dbias) { - const int outer_size = C; - const int inner_size = N * HxW; - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage ds_storage; - __shared__ typename BlockReduce::TempStorage db_storage; - __shared__ typename BlockReduce::TempStorage mean_storage; - __shared__ typename BlockReduce::TempStorage variance_storeage; - __shared__ BatchNormParamType inv_var_val; - __shared__ BatchNormParamType mean_val; - __shared__ BatchNormParamType dscale_val; - __shared__ BatchNormParamType dbias_val; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType ds_sum = static_cast>(0); - BatchNormParamType db_sum = static_cast>(0); - - if (saved_mean && saved_inv_variance) { - if (threadIdx.x == 0) { - inv_var_val = saved_inv_variance[i]; - mean_val = saved_mean[i]; - } - } else { - BatchNormParamType x_sum = static_cast>(0); - BatchNormParamType x_square_sum = - static_cast>(0); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType x_i = - static_cast>(x[index]); - x_sum += x_i; - x_square_sum += x_i * x_i; - } - x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); - x_square_sum = - BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); - if (threadIdx.x == 0) { - mean_val = x_sum / inner_size; - inv_var_val = - 1 / sqrt(x_square_sum / inner_size - mean_val * mean_val + epsilon); - } - } - __syncthreads(); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType dy_i = - static_cast>(dy[index]); - ds_sum += - dy_i * (static_cast>(x[index]) - mean_val); - db_sum += dy_i; - } - ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); - db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); - if (threadIdx.x == 0) { - dscale_val = ds_sum * inv_var_val; - dbias_val = db_sum; - dscale[i] = dscale_val; - dbias[i] = dbias_val; - } - __syncthreads(); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? 
(j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - dx[index] = scale[i] * inv_var_val * - (static_cast>(dy[index]) - - dbias_val / static_cast>(inner_size) - - (static_cast>(x[index]) - mean_val) * - inv_var_val * dscale_val / inner_size); - } - } -} - -template -static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackwardData( - const T *dy, const BatchNormParamType *scale, - const BatchNormParamType *mean, const T *x, - const BatchNormParamType *variance, const int C, const int N, - const int HxW, T *dx) { - const int outer_size = C; - const int inner_size = N * HxW; - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage dy_storage; - __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage; - __shared__ BatchNormParamType dy_sum_val; - __shared__ BatchNormParamType dy_x_sub_mean_sum_val; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType inv_var_i = variance[i]; - BatchNormParamType mean_i = mean[i]; - BatchNormParamType dy_sum = static_cast>(0); - BatchNormParamType dy_x_sub_mean_sum = - static_cast>(0); - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType dy_i = - static_cast>(dy[index]); - dy_sum += dy_i; - dy_x_sub_mean_sum += - dy_i * (static_cast>(x[index]) - mean_i); - } - - dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); - dy_x_sub_mean_sum = BlockReduce(dy_x_sub_mean_storage) - .Reduce(dy_x_sub_mean_sum, cub::Sum()); - - if (threadIdx.x == 0) { - dy_sum_val = dy_sum; - dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; - } - __syncthreads(); - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? 
(j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - dx[index] = - (static_cast>(dy[index]) - - dy_sum_val / static_cast>(inner_size) - - (static_cast>(x[index]) - mean_i) * - dy_x_sub_mean_sum_val * inv_var_i * inv_var_i / inner_size) * - scale[i] * inv_var_i; - } - } -} - -template -class BatchNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("It must use CUDAPlace.")); - double epsilon = static_cast(ctx.Attr("epsilon")); - const std::string data_layout_str = ctx.Attr("data_layout"); - bool use_global_stats = ctx.Attr("use_global_stats"); - - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - // batch_norm with inplace as false will take X as grad input, which - // is same as cuDNN batch_norm backward calculation, batch_norm - // with inplace as true only take Y as input and X should be calculate - // by inverse operation of batch_norm on Y - const Tensor *x; - bool is_inplace; - if (ctx.HasInput("Y")) { - x = ctx.Input("Y"); - is_inplace = true; - if (d_x) { - PADDLE_ENFORCE_EQ(d_x, d_y, - platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD not inplace in inplace mode")); - } - } else { - x = ctx.Input("X"); - is_inplace = false; - if (d_x) { - PADDLE_ENFORCE_NE( - d_x, d_y, platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD inplaced in non-inplace mode")); - } - } - - const bool is_test = ctx.Attr("is_test"); - use_global_stats = is_test || use_global_stats; - - const auto &x_dims = x->dims(); - - PADDLE_ENFORCE_EQ( - x_dims.size() >= 2 && x_dims.size() <= 5, true, - platform::errors::InvalidArgument( - "The size of input's dimensions should be between 2 and 5." - "But received: the size of input's dimensions is [%d]," - "the dimensions of input is [%s]", - x_dims.size(), x_dims)); - int N, C, H, W, D; - ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); - - // init output - if (d_x) { - d_x->mutable_data(ctx.GetPlace()); - } - - if (d_scale && d_bias) { - d_scale->mutable_data>(ctx.GetPlace()); - d_bias->mutable_data>(ctx.GetPlace()); - } - PADDLE_ENFORCE_EQ( - scale->dims().size(), 1UL, - platform::errors::InvalidArgument( - "The size of scale's dimensions must equal to 1. But received: " - "the size of scale's dimensions is [%d], the dimensions of scale " - "is [%s].", - scale->dims().size(), scale->dims())); - PADDLE_ENFORCE_EQ( - scale->dims()[0], C, - platform::errors::InvalidArgument( - "The first dimension of scale must equal to Channels[%d]. But " - "received: the first dimension of scale is [%d]", - C, scale->dims()[0])); - - auto dtype = platform::CudnnDataType::type; - const auto *reserve_space = ctx.Input("ReserveSpace"); -#ifdef PADDLE_WITH_HIP - auto compute_format = data_layout == DataLayout::kNHWC ? 
DataLayout::kNHWC - : DataLayout::kNCHW; - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// HIP do not support compute format of NHWC -// auto compute_format = DataLayout::kNCHW; -#else - const bool fast_nhwc_batch_norm = - dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent && - reserve_space != nullptr; - auto compute_format = - fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC - ? DataLayout::kNHWC - : DataLayout::kNCHW; -#endif - - Tensor transformed_x(x->type()); - Tensor transformed_d_y(d_y->type()); - Tensor transformed_d_x; - if (data_layout == DataLayout::kNHWC && - compute_format == DataLayout::kNCHW && x_dims.size() > 2) { - VLOG(3) << "Transform input tensor from NHWC to NCHW."; - ResizeToChannelFirst(ctx, x, - &transformed_x); - TransToChannelFirst(ctx, x, - &transformed_x); - ResizeToChannelFirst(ctx, d_y, - &transformed_d_y); - TransToChannelFirst(ctx, d_y, - &transformed_d_y); - if (d_x) { - ResizeToChannelFirst(ctx, d_x, - &transformed_d_x); - } - } else { - transformed_x.ShareDataWith(*x); - transformed_d_y.ShareDataWith(*d_y); - if (d_x) { - transformed_d_x.ShareDataWith(*d_x); - } - } - - std::vector dims; - std::vector strides; - if (compute_format == DataLayout::kNCHW) { - dims = {N, C, H, W, D}; - strides = {C * H * W * D, H * W * D, W * D, D, 1}; - } else { - dims = {N, C, H, W, D}; - strides = {H * W * C * D, 1, W * D * C, D * C, C}; - } - - auto &dev_ctx = ctx.template device_context(); - const int num = transformed_x.numel(); -#ifdef HIPCC - const int block = 256; -#else - const int block = 512; -#endif - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - int grid1 = (num + block - 1) / block; - int grid2 = std::min(C, max_blocks); - auto stream = dev_ctx.stream(); - InplaceHelper inplace_functor; - - if (!use_global_stats) { - if ((N * H * W * D) == 1) { - if (d_x) { - framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); - } - phi::funcs::SetConstant> - functor; - functor(dev_ctx, d_scale, static_cast>(0)); - functor(dev_ctx, d_bias, static_cast>(0)); - return; - } - -// ------------------- cudnn descriptors --------------------- -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// miopenTensorDescriptor_t data_desc_; -// miopenTensorDescriptor_t bn_param_desc_; -// miopenBatchNormMode_t mode_; - -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); -#else - cudnnTensorDescriptor_t data_desc_; - cudnnTensorDescriptor_t bn_param_desc_; - cudnnBatchNormMode_t mode_; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); -#endif - if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { - LOG(ERROR) << "Provided epsilon is smaller than " - << "CUDNN_BN_MIN_EPSILON. 
Setting it to " - << "CUDNN_BN_MIN_EPSILON instead."; - } - epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// mode_ = miopenBNSpatial; -#elif CUDNN_VERSION_MIN(7, 0, 1) - if (FLAGS_cudnn_batchnorm_spatial_persistent) { - mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - } else if (H == 1 && W == 1) { - mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; - } else { - mode_ = CUDNN_BATCHNORM_SPATIAL; - } -#else - if (H == 1 && W == 1) { - mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; - } else { - mode_ = CUDNN_BATCHNORM_SPATIAL; - } -#endif // CUDNN_VERSION_MIN(7, 0, 1) - -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( -// data_desc_, CudnnDataType::type, -// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), -// const_cast(strides.data()))); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, -// data_desc_, mode_)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); -#endif - - const auto *saved_mean = ctx.Input("SavedMean"); - const auto *saved_var = ctx.Input("SavedVariance"); - const auto *saved_mean_data = - saved_mean->template data>(); - const auto *saved_var_data = - saved_var->template data>(); - - if (is_inplace) { - inplace_functor(compute_format, transformed_x.data(), - scale->template data>(), - bias->template data>(), - saved_mean_data, saved_var_data, epsilon, C, H * W * D, - num, transformed_x.data(), grid2, block, stream); - } - - // This branch calls CUDNN APIs - if (d_x && d_scale && d_bias) { - bool called = false; -#if CUDNN_VERSION_MIN(7, 4, 1) - called = true; - size_t workspace_size = 0; - void *workspace_ptr = nullptr; - Tensor workspace_tensor; - auto reserve_space_size = reserve_space->memory_size(); - // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload:: - cudnnGetBatchNormalizationBackwardExWorkspaceSize( - /*handle=*/dev_ctx.cudnn_handle(), - /*mode=*/mode_, - /*bnIps=*/CUDNN_BATCHNORM_OPS_BN, - /*xDesc=*/data_desc_, - /*yDesc=*/data_desc_, - /*dyDesc=*/data_desc_, - /*dzDesc=*/nullptr, - /*dxDesc=*/data_desc_, - /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, - /*activationDesc=*/nullptr, - /*sizeInBytes=*/&workspace_size)); - - workspace_ptr = workspace_tensor.mutable_data( - ctx.GetPlace(), transformed_x.type(), workspace_size); - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationBackwardEx( - /*handle=*/dev_ctx.cudnn_handle(), - /*mode=*/mode_, - /*bnOps=*/CUDNN_BATCHNORM_OPS_BN, - /*alphaDataDiff=*/CudnnDataType::kOne(), - /*betaDataDiff=*/CudnnDataType::kZero(), - /*alphaParamDiff=*/CudnnDataType::kOne(), - /*betaParamDiff=*/CudnnDataType::kZero(), - /*xDesc=*/data_desc_, - /*xData=*/transformed_x.template data(), - /*yDesc=*/nullptr, - /*yData=*/nullptr, - /*dyDesc=*/data_desc_, - /*dyData=*/transformed_d_y.template data(), - /*dzDesc=*/nullptr, - /*dzData=*/nullptr, - /*dxDesc=*/data_desc_, - /*dxData=*/transformed_d_x.template mutable_data( - ctx.GetPlace()), - /*dBnScaleBiasDesc=*/bn_param_desc_, - /*bnScaleData=*/scale->template data>(), - 
/*bnBiasData=*/nullptr, - /*dBnScaleData=*/d_scale - ->template mutable_data>( - ctx.GetPlace()), - /*dBnBiasData=*/d_bias - ->template mutable_data>( - ctx.GetPlace()), - /*epsilon=*/epsilon, - /*savedMean=*/saved_mean_data, - /*savedInvVariance=*/saved_var_data, - /*activationDesc=*/nullptr, - /*workspace=*/workspace_ptr, - /*workSpaceSizeInBytes=*/workspace_size, - /*reserveSpace=*/const_cast( - reserve_space->template data()), - /*reserveSpaceSizeInBytes=*/reserve_space_size)); -#endif // CUDNN_VERSION_MIN(7, 4, 1) - if (!called) { -#ifdef PADDLE_WITH_HIP - if (compute_format == DataLayout::kNCHW) { - BNBackward< - T, block, - DataLayout::kNCHW><<>>( - transformed_d_y.template data(), - transformed_x.template data(), - scale->template data>(), saved_mean_data, - saved_var_data, C, N, H * W * D, epsilon, - transformed_d_x.template data(), - d_scale->template mutable_data>( - ctx.GetPlace()), - d_bias->template mutable_data>( - ctx.GetPlace())); - } else { - BNBackward< - T, block, - DataLayout::kNHWC><<>>( - transformed_d_y.template data(), - transformed_x.template data(), - scale->template data>(), saved_mean_data, - saved_var_data, C, N, H * W * D, epsilon, - transformed_d_x.template data(), - d_scale->template mutable_data>( - ctx.GetPlace()), - d_bias->template mutable_data>( - ctx.GetPlace())); - } - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenBatchNormalizationBackward( -// dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), -// CudnnDataType::kZero(), CudnnDataType::kOne(), -// CudnnDataType::kZero(), data_desc_, -// transformed_x.template data(), data_desc_, -// transformed_d_y.template data(), data_desc_, -// transformed_d_x.template mutable_data(ctx.GetPlace()), -// bn_param_desc_, scale->template data>(), -// d_scale->template mutable_data>( -// ctx.GetPlace()), -// d_bias->template mutable_data>( -// ctx.GetPlace()), -// epsilon, saved_mean_data, saved_var_data)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationBackward( - dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), - CudnnDataType::kZero(), CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), data_desc_, - transformed_d_y.template data(), data_desc_, - transformed_d_x.template mutable_data(ctx.GetPlace()), - bn_param_desc_, scale->template data>(), - d_scale->template mutable_data>( - ctx.GetPlace()), - d_bias->template mutable_data>( - ctx.GetPlace()), - epsilon, saved_mean_data, saved_var_data)); -#endif - } - - if (data_layout == DataLayout::kNHWC && - compute_format == DataLayout::kNCHW) { - VLOG(3) << "Transform batchnorm output from NCHW to NHWC"; - TransToChannelLast( - ctx, &transformed_d_x, d_x); - } - } else { - // This branch call CUDA kernels - if (compute_format == DataLayout::kNCHW) { - if (d_x) { - BNBackwardData<<< - grid2, block, 0, dev_ctx.stream()>>>( - d_y->data(), scale->data>(), - saved_mean_data, x->data(), saved_var_data, C, N, H * W * D, - d_x->data()); - } - if (d_scale && d_bias) { - KeBNBackwardScaleBias< - T, block, - framework::DataLayout::kNCHW><<>>( - d_y->data(), x->data(), saved_mean_data, saved_var_data, - epsilon, N, C, H * W * D, - d_scale->data>(), - d_bias->data>()); - } - } else { - if (d_x) { - BNBackwardData<<< - grid2, block, 0, dev_ctx.stream()>>>( - d_y->data(), scale->data>(), - saved_mean_data, x->data(), saved_var_data, C, N, H * W * D, - d_x->data()); - } - if (d_scale && d_bias) { - KeBNBackwardScaleBias< 
- T, block, - framework::DataLayout::kNHWC><<>>( - d_y->data(), x->data(), saved_mean_data, saved_var_data, - epsilon, N, C, H * W * D, - d_scale->data>(), - d_bias->data>()); - } - } - } - -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// clean when exit. -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); -#else - // clean when exit. - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); -#endif - } else { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_var = ctx.Input("Variance"); - - const auto *running_mean_data = - running_mean->template data>(); - const auto *running_var_data = - running_var->template data>(); - - if (is_inplace) { - auto px = *x; - inplace_functor(data_layout, px.mutable_data(ctx.GetPlace()), - scale->template data>(), - bias->template data>(), - running_mean_data, running_var_data, epsilon, C, - H * W * D, num, x->data(), grid2, block, stream); - } - - if (compute_format == DataLayout::kNCHW) { - if (d_x) { - KeBNBackwardData< - T, framework::DataLayout::kNCHW><<>>( - d_y->data(), scale->data>(), - running_var_data, epsilon, C, H * W, num, d_x->data()); - } - if (d_scale && d_bias) { - KeBNBackwardScaleBias< - T, block, - framework::DataLayout::kNCHW><<>>( - d_y->data(), x->data(), running_mean_data, running_var_data, - epsilon, N, C, H * W * D, d_scale->data>(), - d_bias->data>()); - } - } else { - if (d_x) { - KeBNBackwardData< - T, framework::DataLayout::kNHWC><<>>( - d_y->data(), scale->data>(), - running_var_data, epsilon, C, H * W, num, d_x->data()); - } - if (d_scale && d_bias) { - KeBNBackwardScaleBias< - T, block, - framework::DataLayout::kNHWC><<>>( - d_y->data(), x->data(), running_mean_data, running_var_data, - epsilon, N, C, H * W * D, d_scale->data>(), - d_bias->data>()); - } - } - } - } -}; - -template -class BatchNormDoubleGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *X = ctx.Input("X"); - const auto *Scale = ctx.Input("Scale"); - const auto *dY = ctx.Input("DY"); - const auto *Saved_mean = ctx.Input("SavedMean"); - const auto *Saved_variance = ctx.Input("SavedVariance"); - const double epsilon = static_cast(ctx.Attr("epsilon")); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool is_test = ctx.Attr("is_test"); - - PADDLE_ENFORCE_EQ( - is_test, false, - platform::errors::InvalidArgument( - "`is_test = True` CANNOT be used in train program. 
If " - "you want to use global status in pre_train model, " - "please set `use_global_stats = True`")); - - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - - const auto *ddX = ctx.Input("DDX"); - const auto *ddScale = ctx.Input("DDScale"); - const auto *ddBias = ctx.Input("DDBias"); - - auto *dX = ctx.Output("DX"); - auto *dScale = ctx.Output("DScale"); - auto *ddY = ctx.Output("DDY"); - - NormDoubleGradFunctor( - ctx, data_layout, X, Scale, dY, Saved_mean, Saved_variance, epsilon, - use_global_stats, ddX, ddScale, ddBias, dX, dScale, ddY); - } -}; - } // namespace operators } // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_CUDA_KERNEL( - batch_norm, ops::BatchNormKernel, - ops::BatchNormKernel); -REGISTER_OP_CUDA_KERNEL( - batch_norm_grad, ops::BatchNormGradKernel, - ops::BatchNormGradKernel); -REGISTER_OP_CUDA_KERNEL( - batch_norm_grad_grad, - ops::BatchNormDoubleGradKernel); -#else -REGISTER_OP_CUDA_KERNEL( - batch_norm, ops::BatchNormKernel, - ops::BatchNormKernel, - ops::BatchNormKernel); -REGISTER_OP_CUDA_KERNEL( - batch_norm_grad, ops::BatchNormGradKernel, - ops::BatchNormGradKernel, - ops::BatchNormGradKernel); -REGISTER_OP_CUDA_KERNEL( - batch_norm_grad_grad, - ops::BatchNormDoubleGradKernel, - ops::BatchNormDoubleGradKernel); -#endif diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index dff60afd74c02f458b5b3c7428c2703197b61af0..2055bf560e69ca0ed354aadd00cdca331c22c76e 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -25,10 +25,10 @@ limitations under the License. */ #include "paddle/fluid/operators/conv_cudnn_helper.h" #endif #include "paddle/fluid/operators/conv_op.h" -#include "paddle/fluid/operators/math/padding.h" #include "paddle/fluid/platform/cudnn_workspace_helper.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/phi/kernels/funcs/padding.h" DECLARE_bool(cudnn_deterministic); DECLARE_uint64(conv_workspace_size_limit); @@ -148,7 +148,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { in_data_dims, strides, ksize); int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); + bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); Tensor transformed_input; std::vector padding_common(data_dim, 0); @@ -196,13 +196,13 @@ class CUDNNConvOpKernel : public framework::OpKernel { T pad_value(0.0); switch (rank) { case 4: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_input_channel, pad_value, &transformed_input); } break; case 5: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_input_channel, pad_value, &transformed_input); } break; default: @@ -488,7 +488,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { // cuDNN only supports padding the same amount on every dimension. // So we create a new padded input tensor. 
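// Illustrative sketch, not code from this diff: the padding hunks in this file
// only move IsSymmetricPadding/PadFunction from operators::math to phi::funcs
// and pass the device context instead of the execution context. The property
// they rely on is, roughly, that every spatial dimension has equal front and
// back padding, so a single per-dimension value can be handed to cuDNN; the
// function name and the [front, back] pair layout below are assumptions made
// for illustration only.
#include <vector>

bool IsSymmetricPaddingSketch(const std::vector<int>& paddings, int data_dim) {
  // paddings assumed stored as [d0_front, d0_back, d1_front, d1_back, ...]
  for (int i = 0; i < data_dim; ++i) {
    if (paddings[2 * i] != paddings[2 * i + 1]) return false;
  }
  return true;
}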
int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); + bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); Tensor transformed_input(input->type()); Tensor transformed_input_grad(input->type()); std::vector padding_common(data_dim, 0); @@ -544,13 +544,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { T pad_value(0.0); switch (rank) { case 4: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_input_channel, pad_value, &transformed_input); } break; case 5: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_input_channel, pad_value, &transformed_input); } break; default: @@ -956,7 +956,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { in_data_dims, strides, ksize); int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); + bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); Tensor transformed_X(X->type()); Tensor transformed_ddX(X->type()); @@ -1004,20 +1004,22 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { T pad_value(0.0); switch (rank) { case 4: { - math::PadFunction( - ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_X_channel, pad_value, + &transformed_X); if (ddX) { - math::PadFunction( - ctx, input_pad, transformed_ddX_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_ddX_channel, pad_value, &transformed_ddX); } } break; case 5: { - math::PadFunction( - ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_X_channel, pad_value, + &transformed_X); if (ddX) { - math::PadFunction( - ctx, input_pad, transformed_ddX_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_ddX_channel, pad_value, &transformed_ddX); } } break; diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu index 4b8f9d7e6ca8d2f1dae99f1d034c53daf948f922..141a99f60f104c3bf32e16a1254d0f5eec623645 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu @@ -21,8 +21,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/conv_cudnn_helper.h" #endif #include "paddle/fluid/operators/conv_transpose_op.h" -#include "paddle/fluid/operators/math/padding.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/padding.h" namespace paddle { namespace operators { @@ -108,7 +108,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { in_data_dims, strides, ksize); int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); + bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); std::vector input_pad(input_transpose.dims().size() * 2, 0); Tensor transformed_input; @@ -139,12 +139,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { T pad_value(0.0); switch (rank) { case 4: { - math::PadFunction( - ctx, input_pad, input_transpose, pad_value, &transformed_input); + phi::funcs::PadFunction( + dev_ctx, input_pad, input_transpose, pad_value, + &transformed_input); } break; case 5: { - math::PadFunction( - ctx, input_pad, input_transpose, pad_value, &transformed_input); + phi::funcs::PadFunction( + dev_ctx, input_pad, input_transpose, pad_value, + &transformed_input); } break; default: PADDLE_THROW(platform::errors::InvalidArgument( @@ -375,7 +377,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { in_data_dims, strides, ksize); int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); + bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); std::vector input_pad(input_transpose.dims().size() * 2, 0); Tensor transformed_output_grad; @@ -407,13 +409,13 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { T pad_value(0.0); switch (rank) { case 4: { - math::PadFunction( - ctx, input_pad, output_grad_transpose, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, output_grad_transpose, pad_value, &transformed_output_grad); } break; case 5: { - math::PadFunction( - ctx, input_pad, output_grad_transpose, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, output_grad_transpose, pad_value, &transformed_output_grad); } break; default: @@ -735,7 +737,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { in_data_dims, strides, ksize); int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); + bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); Tensor transformed_X(X->type()); Tensor transformed_ddX(X->type()); @@ -794,26 +796,28 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { T pad_value(0.0); switch (rank) { case 4: { - math::PadFunction( - ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_X_channel, pad_value, + &transformed_X); if (dO) { - math::PadFunction( - ctx, input_pad, transformed_dO_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_dO_channel, pad_value, &transformed_dO); } if (ddX) { - math::PadFunction( - ctx, input_pad, transformed_ddX_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_ddX_channel, pad_value, &transformed_ddX); } } break; case 5: { - math::PadFunction( - ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_X_channel, pad_value, + &transformed_X); if (ddX) { - math::PadFunction( - ctx, 
input_pad, transformed_ddX_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_ddX_channel, pad_value, &transformed_ddX); } } break; diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index 2fa956a2e6515e8b6a8e1c463c8ab8d1476f8d90..cdcf683fb92c5a5ef56f61da15e5979fd1364945 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -184,15 +184,15 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, bool is_fix_seed, int seed_val, const Tensor& x, const Tensor* seed, Tensor* mask, Tensor* y) { auto& place = *dev_ctx.eigen_device(); + int64_t x_numel = x.numel(); + auto stream = dev_ctx.stream(); + auto* x_data = x.data(); + auto* y_data = y->data(); if (!is_test) { - int64_t x_numel = x.numel(); - auto stream = dev_ctx.stream(); auto* mask_data = mask->data(); size_t size = phi::product(mask->dims()); - auto* x_data = x.data(); - auto* y_data = y->data(); if (dropout_prob == 1.0f) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( @@ -254,12 +254,24 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, } #endif } else { - auto X = EigenMatrix::Reshape(x, 1); - auto Y = EigenMatrix::Reshape(*y, 1); if (upscale_in_train) { - Y.device(place) = X; +// todo: can y share with data with x directly? +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemcpyAsync(y_data, x_data, sizeof(T) * x_numel, + hipMemcpyDeviceToDevice, stream)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemcpyAsync(y_data, x_data, sizeof(T) * x_numel, + cudaMemcpyDeviceToDevice, stream)); +#endif } else { - Y.device(place) = X * static_cast(1.0f - dropout_prob); + T factor = static_cast(1.0f - dropout_prob); + std::vector ins = {&x}; + std::vector outs = {y}; + auto functor = phi::funcs::ScaleFunctor(factor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } } } diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index bb5b363fe83995faf69f61b0a1a1693ff758fa37..5dbf4fb88b2a78838ce0fe95be653f68f4805416 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -17,8 +17,8 @@ limitations under the License. 
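// Illustrative sketch, not code from this diff: the dropout_impl.cu.h hunk above
// replaces the Eigen expressions in the is_test branch with a device-to-device
// copy (upscale_in_train) or an elementwise scale by (1 - dropout_prob). A
// host-side sketch of that inference-time semantics, with a hypothetical name:
#include <cstddef>
#include <vector>

std::vector<float> DropoutInferenceSketch(const std::vector<float>& x,
                                          float dropout_prob,
                                          bool upscale_in_train) {
  std::vector<float> y(x.size());
  if (upscale_in_train) {
    // training already rescaled kept activations by 1/(1 - p), so inference is identity
    y = x;
  } else {
    // downscale-at-inference convention: multiply every element by (1 - p)
    const float factor = 1.0f - dropout_prob;
    for (std::size_t i = 0; i < x.size(); ++i) y[i] = x[i] * factor;
  }
  return y;
}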
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/conv_op.h" -#include "paddle/fluid/operators/math/padding.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/kernels/funcs/padding.h" DECLARE_int64(cudnn_exhaustive_search_times); @@ -86,7 +86,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { in_data_dims, strides, ksize); int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); + bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); Tensor transformed_input; std::vector padding_common(data_dim, 0); @@ -118,13 +118,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { T pad_value(0.0); switch (rank) { case 4: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_input_channel, pad_value, &transformed_input); } break; case 5: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_input_channel, pad_value, &transformed_input); } break; default: diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index 6119af18ce153ac2bcd5d45a69ab7b5d86a3cc10..b3ac3606eaf8ee843a2be98b7a237037afaf524f 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -32,7 +32,7 @@ namespace platform = paddle::platform; namespace op = paddle::operators; using Tensor = paddle::framework::Tensor; -USE_OP(batch_norm); +USE_OP_ITSELF(batch_norm); USE_CUDA_ONLY_OP(fused_bn_add_activation); USE_CUDA_ONLY_OP(fused_bn_add_activation_grad); diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu index 8f1d9284c503813ef3dd9688891048a5bca57b29..e0db2f26d3e0534f924cc709b98689fb3f1a5cc6 100644 --- a/paddle/fluid/operators/gather_op.cu +++ b/paddle/fluid/operators/gather_op.cu @@ -45,6 +45,8 @@ class GatherOpCUDAKernel : public framework::OpKernel { axis = static_cast(cpu_axis.data()[0]); } else if (axis_type == framework::proto::VarType::INT64) { axis = static_cast(cpu_axis.data()[0]); + } else if (axis_type == framework::proto::VarType::INT16) { + axis = static_cast(cpu_axis.data()[0]); } } const auto &place = ctx.GetPlace(); @@ -57,6 +59,9 @@ class GatherOpCUDAKernel : public framework::OpKernel { } else if (index_type == framework::proto::VarType::INT64) { phi::funcs::GatherV2CUDAFunction(x, index, axis, output, dev_ctx); + } else if (index_type == framework::proto::VarType::INT16) { + phi::funcs::GatherV2CUDAFunction(x, index, axis, output, + dev_ctx); } return; } @@ -67,6 +72,8 @@ class GatherOpCUDAKernel : public framework::OpKernel { phi::funcs::GPUGather(dev_ctx, *x, *index, output); } else if (index_type == framework::proto::VarType::INT64) { phi::funcs::GPUGather(dev_ctx, *x, *index, output); + } else if (index_type == framework::proto::VarType::INT16) { + phi::funcs::GPUGather(dev_ctx, *x, *index, output); } } }; @@ -134,6 +141,7 @@ REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, + ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel); REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel, diff --git a/paddle/fluid/operators/gather_tree_op.cc 
b/paddle/fluid/operators/gather_tree_op.cc index 2868c3697eda19ed3e7cc1fb4c74e9beeaca9c0d..7f6c82032fe39da9d4de768330dcbcfc48610bcd 100644 --- a/paddle/fluid/operators/gather_tree_op.cc +++ b/paddle/fluid/operators/gather_tree_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -21,20 +24,6 @@ class GatherTreeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Ids"), "Input", "Ids", "GatherTree"); - OP_INOUT_CHECK(ctx->HasInput("Parents"), "Input", "Parents", "GatherTree"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GatherTree"); - - auto ids_dims = ctx->GetInputDim("Ids"); - auto parents_dims = ctx->GetInputDim("Parents"); - PADDLE_ENFORCE_EQ(ids_dims == parents_dims, true, - platform::errors::InvalidArgument( - "The shape of Input(Parents) must be same with the " - "shape of Input(Ids).")); - ctx->SetOutputDim("Out", ids_dims); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -72,4 +61,8 @@ selected ids. } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(gather_tree, ops::GatherTreeOp, ops::GatherTreeOpMaker); +DELCARE_INFER_SHAPE_FUNCTOR(gather_tree, GatherTreeInferShapeFunctor, + PT_INFER_META(phi::GatherTreeMeta)); + +REGISTER_OPERATOR(gather_tree, ops::GatherTreeOp, ops::GatherTreeOpMaker, + GatherTreeInferShapeFunctor); diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index 774ff0bd065995916562061784f5218336a9da93..6b559885c569d001233525c3d964fff2175950e3 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -26,27 +26,6 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -template -class CPUGaussianRandomKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - float mean = context.Attr("mean"); - float std = context.Attr("std"); - auto* tensor = context.Output("Out"); - - std::normal_distribution dist(mean, std); - auto shape = GetShape(context); - tensor->Resize(shape); - int64_t size = tensor->numel(); - T* data = tensor->mutable_data(context.GetPlace()); - unsigned int seed = static_cast(context.Attr("seed")); - auto engine = framework::GetCPURandomEngine(seed); - - for (int64_t i = 0; i < size; ++i) { - data[i] = dist(*engine); - } - } -}; // namespace operators template class CPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { @@ -194,8 +173,6 @@ Used to initialize tensors with gaussian random generator. 
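// Illustrative sketch, not code from this diff: the CPU gaussian_random kernel
// removed above (and its GPU counterpart removed below) were migrated to phi.
// This standalone sketch mirrors the sampling they performed: a normal
// distribution parameterized by the mean/std attributes, with seed == 0 meaning
// "draw a fresh seed", following the removed GPU kernel. The function name is
// hypothetical.
#include <cstddef>
#include <cstdint>
#include <random>
#include <vector>

std::vector<float> SampleGaussianSketch(std::int64_t size, float mean,
                                        float std_dev, unsigned int seed) {
  if (seed == 0) {
    std::random_device rd;
    seed = rd();  // mimic the seed_flag path of the removed GPU kernel
  }
  std::mt19937_64 engine(seed);
  std::normal_distribution<float> dist(mean, std_dev);
  std::vector<float> out(static_cast<std::size_t>(size));
  for (auto& v : out) v = dist(engine);
  return out;
}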
namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp, ops::GaussianRandomOpMaker); -REGISTER_OP_CPU_KERNEL(gaussian_random, ops::CPUGaussianRandomKernel, - ops::CPUGaussianRandomKernel); REGISTER_OP_CPU_KERNEL(gaussian_random_batch_size_like, ops::CPUGaussianRandomBatchSizeLikeKernel, ops::CPUGaussianRandomBatchSizeLikeKernel); diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index 21d827c79200c4a368ce7677b01b18ee4ddedb8d..d419bd70e67db27b49d9abccd3dba3227692337a 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -52,53 +52,6 @@ struct GaussianGenerator { } }; -template -class GPUGaussianRandomKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* tensor = context.Output("Out"); - unsigned int seed = static_cast(context.Attr("seed")); - bool seed_flag = false; - if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = true; - } - T mean = static_cast(context.Attr("mean")); - T std = static_cast(context.Attr("std")); - auto shape = GetShape(context); - tensor->Resize(shape); - - auto& dev_cxt = - context.template device_context(); - T* data = tensor->mutable_data(dev_cxt.GetPlace()); - - int64_t size = tensor->numel(); - - int device_id = context.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - - if (gen_cuda->GetIsInitPy() && seed_flag) { - if (FLAGS_use_curand) { - using MT = typename details::MPTypeTrait::Type; - distribution::normal_distribution dist; - distribution::normal_transform trans(mean, std); - distribution::distribution_and_transform(dev_cxt, tensor, dist, - trans); - } else { - auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - auto func = - GaussianGenerator(mean, std, seed_offset.first, gen_offset); - IndexKernel>(dev_cxt, tensor, func); - } - } else { - auto func = GaussianGenerator(mean, std, seed); - IndexKernel>(dev_cxt, tensor, func); - } - } -}; - template class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { public: @@ -136,11 +89,6 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL( - gaussian_random, - paddle::operators::GPUGaussianRandomKernel, - paddle::operators::GPUGaussianRandomKernel, - paddle::operators::GPUGaussianRandomKernel); REGISTER_OP_CUDA_KERNEL( gaussian_random_batch_size_like, paddle::operators::GPUGaussianRandomBatchSizeLikeKernel< diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index e0779249c41adc5005bbaba6e19127d2ced3a9ec..7f5136969980b887bb7bbe013690898e66abeac1 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -17,6 +17,8 @@ #include #include #include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/phi/kernels/batch_norm_grad_kernel.h" +#include "paddle/phi/kernels/batch_norm_kernel.h" namespace paddle { namespace operators { @@ -202,8 +204,7 @@ class InplaceABNOpGradMaker : public framework::SingleGradOpMaker { }; template -class InplaceABNKernel - : public paddle::operators::BatchNormKernel { +class InplaceABNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); @@ -213,7 +214,33 
@@ class InplaceABNKernel auto activation = GetInplaceABNActivationType(ctx.Attr("activation")); auto& place = *ctx.template device_context().eigen_device(); - BatchNormKernel::Compute(ctx); + + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* mean = ctx.Input("Mean"); + auto* variance = ctx.Input("Variance"); + + auto momentum = ctx.Attr("momentum"); + auto epsilon = ctx.Attr("epsilon"); + auto data_layout = ctx.Attr("data_layout"); + auto is_test = ctx.Attr("is_test"); + auto use_global_stats = ctx.Attr("use_global_stats"); + auto trainable_statistics = ctx.Attr("trainable_statistics"); + auto fuse_with_relu = ctx.Attr("fuse_with_relu"); + + auto* mean_out = ctx.Output("MeanOut"); + auto* variance_out = ctx.Output("VarianceOut"); + auto* saved_mean = ctx.Output("SavedMean"); + auto* saved_variance = ctx.Output("SavedVariance"); + auto* reserve_space = ctx.Output("ReserveSpace"); + + auto& dev_ctx = ctx.device_context(); + phi::BatchNormKernel( + static_cast::TYPE&>(dev_ctx), + *x, *scale, *bias, *mean, *variance, momentum, epsilon, data_layout, + is_test, use_global_stats, trainable_statistics, fuse_with_relu, y, + mean_out, variance_out, saved_mean, saved_variance, reserve_space); auto cur_y = EigenVector::Flatten(*y); InplaceABNActivation functor; @@ -222,8 +249,7 @@ class InplaceABNKernel }; template -class InplaceABNGradKernel - : public paddle::operators::BatchNormGradKernel { +class InplaceABNGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* y = ctx.Input("Y"); @@ -244,7 +270,52 @@ class InplaceABNGradKernel InplaceABNActivation functor; functor.GradCompute(ctx, activation, place, cur_y, cur_y, cur_dy, cur_dy); - BatchNormGradKernel::Compute(ctx); + // BatchNormGradKernel::Compute(ctx); + + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* saved_mean = ctx.Input("SavedMean"); + auto* saved_variance = ctx.Input("SavedVariance"); + + auto momentum = ctx.Attr("momentum"); + auto epsilon = ctx.Attr("epsilon"); + auto data_layout = ctx.Attr("data_layout"); + auto is_test = ctx.Attr("is_test"); + auto use_global_stats = ctx.Attr("use_global_stats"); + auto trainable_statistics = ctx.Attr("trainable_statistics"); + auto fuse_with_relu = ctx.Attr("fuse_with_relu"); + + auto* scale_grad = ctx.Output(framework::GradVarName("Scale")); + auto* bias_grad = ctx.Output(framework::GradVarName("Bias")); + + auto* reserve_space = ctx.Input("ReserveSpace"); + auto* mean = ctx.Input("ReserveSpace"); + auto* variance = ctx.Input("ReserveSpace"); + + paddle::optional space_opt = paddle::none; + paddle::optional mean_opt = paddle::none; + paddle::optional variance_opt = paddle::none; + + if (reserve_space != nullptr) { + space_opt = *reserve_space; + } + + if (mean != nullptr) { + mean_opt = *mean; + } + + if (variance != nullptr) { + variance_opt = *variance; + } + + auto& dev_ctx = ctx.device_context(); + phi::BatchNormGradRawKernel( + static_cast::TYPE&>(dev_ctx), + *d_y, *y, *scale, *bias, *saved_mean, *saved_variance, space_opt, + mean_opt, variance_opt, momentum, epsilon, data_layout, is_test, + use_global_stats, trainable_statistics, fuse_with_relu, true, d_x, + scale_grad, bias_grad); } }; diff --git a/paddle/fluid/operators/inplace_abn_op.cu b/paddle/fluid/operators/inplace_abn_op.cu index be7a7bd71711e379ef4d98eb1f9ac5ee2caaace6..db8f8c72d13f8e46f6f9e332c5c2f5164b6d0836 100644 --- a/paddle/fluid/operators/inplace_abn_op.cu +++ 
b/paddle/fluid/operators/inplace_abn_op.cu @@ -15,14 +15,15 @@ limitations under the License. */ #include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/inplace_abn_op.h" #include "paddle/fluid/operators/sync_batch_norm_op.cu.h" +#include "paddle/phi/kernels/batch_norm_grad_kernel.h" +#include "paddle/phi/kernels/batch_norm_kernel.h" namespace paddle { namespace operators { template class InplaceABNKernel - : public paddle::operators::SyncBatchNormKernel, - public paddle::operators::BatchNormKernel { + : public paddle::operators::SyncBatchNormKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* y = ctx.Output("Y"); @@ -36,7 +37,33 @@ class InplaceABNKernel if (ctx.Attr("use_sync_bn")) { SyncBatchNormKernel::Compute(ctx); } else { - BatchNormKernel::Compute(ctx); + // BatchNormKernel::Compute(ctx); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* mean = ctx.Input("Mean"); + auto* variance = ctx.Input("Variance"); + + auto momentum = ctx.Attr("momentum"); + auto epsilon = ctx.Attr("epsilon"); + auto data_layout = ctx.Attr("data_layout"); + auto is_test = ctx.Attr("is_test"); + auto use_global_stats = ctx.Attr("use_global_stats"); + auto trainable_statistics = ctx.Attr("trainable_statistics"); + auto fuse_with_relu = ctx.Attr("fuse_with_relu"); + + auto* mean_out = ctx.Output("MeanOut"); + auto* variance_out = ctx.Output("VarianceOut"); + auto* saved_mean = ctx.Output("SavedMean"); + auto* saved_variance = ctx.Output("SavedVariance"); + auto* reserve_space = ctx.Output("ReserveSpace"); + + auto& dev_ctx = ctx.device_context(); + phi::BatchNormKernel( + static_cast::TYPE&>(dev_ctx), + *x, *scale, *bias, *mean, *variance, momentum, epsilon, data_layout, + is_test, use_global_stats, trainable_statistics, fuse_with_relu, y, + mean_out, variance_out, saved_mean, saved_variance, reserve_space); } auto cur_y = EigenVector::Flatten(*y); @@ -49,8 +76,7 @@ class InplaceABNKernel // https://kevinzakka.github.io/2016/09/14/batch_normalization/ template class InplaceABNGradKernel - : public paddle::operators::SyncBatchNormGradKernel, - public paddle::operators::BatchNormGradKernel { + : public paddle::operators::SyncBatchNormGradKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const auto* y = ctx.Input("Y"); @@ -74,7 +100,50 @@ class InplaceABNGradKernel if (ctx.Attr("use_sync_bn")) { SyncBatchNormGradKernel::Compute(ctx); } else { - BatchNormGradKernel::Compute(ctx); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* saved_mean = ctx.Input("SavedMean"); + auto* saved_variance = ctx.Input("SavedVariance"); + + auto momentum = ctx.Attr("momentum"); + auto epsilon = ctx.Attr("epsilon"); + auto data_layout = ctx.Attr("data_layout"); + auto is_test = ctx.Attr("is_test"); + auto use_global_stats = ctx.Attr("use_global_stats"); + auto trainable_statistics = ctx.Attr("trainable_statistics"); + auto fuse_with_relu = ctx.Attr("fuse_with_relu"); + + auto* scale_grad = ctx.Output(framework::GradVarName("Scale")); + auto* bias_grad = ctx.Output(framework::GradVarName("Bias")); + + auto* reserve_space = ctx.Input("ReserveSpace"); + auto* mean = ctx.Input("ReserveSpace"); + auto* variance = ctx.Input("ReserveSpace"); + + paddle::optional space_opt = paddle::none; + paddle::optional mean_opt = paddle::none; + paddle::optional variance_opt = paddle::none; + + if (reserve_space != nullptr) { + space_opt = *reserve_space; + } + + if (mean != nullptr) { + mean_opt = 
*mean; + } + + if (variance != nullptr) { + variance_opt = *variance; + } + + auto& dev_ctx = ctx.device_context(); + phi::BatchNormGradRawKernel( + static_cast::TYPE&>(dev_ctx), + *d_y, *y, *scale, *bias, *saved_mean, *saved_variance, space_opt, + mean_opt, variance_opt, momentum, epsilon, data_layout, is_test, + use_global_stats, trainable_statistics, fuse_with_relu, true, d_x, + scale_grad, bias_grad); } } }; diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/fluid/operators/norm_utils.cu.h index c400a8f4239a605414bf0d99a6a89b0ddae6c535..0ed1f2719de25bd2c138c23dd69b914a66961464 100644 --- a/paddle/fluid/operators/norm_utils.cu.h +++ b/paddle/fluid/operators/norm_utils.cu.h @@ -389,11 +389,12 @@ __global__ void DoubleGradComputeDDYWithGlobal( } template -void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, +void NormDoubleGradFunctor(const DeviceContext &ctx, const DataLayout data_layout, const Tensor *X, const Tensor *Scale, const Tensor *dY, const Tensor *Saved_mean, - const Tensor *Saved_variance, const double epsilon, + const Tensor *Saved_variance, const Tensor *Mean, + const Tensor *Variance, const double epsilon, const bool use_global_stats, const Tensor *ddX, const Tensor *ddScale, const Tensor *ddBias, Tensor *dX, Tensor *dScale, Tensor *ddY) { @@ -404,8 +405,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data()); const T *ddbias_data = (ddBias == nullptr ? nullptr : ddBias->data()); - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant set_constant; + phi::funcs::SetConstant set_constant; auto &x_dims = X->dims(); const int C = (data_layout == DataLayout::kNCHW ? x_dims[1] @@ -416,7 +416,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, Tensor scale_tmp; if (!Scale) { scale_tmp.mutable_data({C}, ctx.GetPlace()); - set_constant(dev_ctx, &scale_tmp, static_cast(1)); + set_constant(ctx, &scale_tmp, static_cast(1)); } const T *scale_data = Scale ? 
Scale->data() : scale_tmp.data(); #ifdef __HIPCC__ @@ -424,15 +424,15 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, #else const int block = 512; #endif - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + int max_threads = ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); int grid = std::min(C, max_blocks); int grid1 = (num + block - 1) / block; const T *mean_data, *variance_data; if (use_global_stats) { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_var = ctx.Input("Variance"); + const auto *running_mean = Mean; + const auto *running_var = Variance; const auto *running_mean_data = running_mean->template data(); const auto *running_var_data = running_var->template data(); mean_data = running_mean_data; @@ -440,34 +440,35 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, } else { const T *smean_data = Saved_mean->data(); const T *svariance_data = Saved_variance->data(); + mean_data = smean_data; variance_data = svariance_data; } if (dX) { T *dx_data = dX->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, dX, static_cast(0)); + set_constant(ctx, dX, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDXWithGlobal< - T, DataLayout::kNHWC><<>>( + T, DataLayout::kNHWC><<>>( dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, dx_data); } else { DoubleGradComputeDXWithGlobal< - T, DataLayout::kNCHW><<>>( + T, DataLayout::kNCHW><<>>( dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, dx_data); } } else { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDX< - T, block, DataLayout::kNHWC><<>>( + T, block, DataLayout::kNHWC><<>>( x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, ddscale_data, N, C, sample_size, epsilon, dx_data); } else { DoubleGradComputeDX< - T, block, DataLayout::kNCHW><<>>( + T, block, DataLayout::kNCHW><<>>( x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, ddscale_data, N, C, sample_size, epsilon, dx_data); } @@ -475,28 +476,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, } if (dScale) { T *dscale_data = dScale->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, dScale, static_cast(0)); + set_constant(ctx, dScale, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDScaleWithGlobal< - T, block, DataLayout::kNHWC><<>>( + T, block, DataLayout::kNHWC><<>>( ddx_data, variance_data, dy_data, epsilon, N, C, sample_size, dscale_data); } else { DoubleGradComputeDScaleWithGlobal< - T, block, DataLayout::kNCHW><<>>( + T, block, DataLayout::kNCHW><<>>( ddx_data, variance_data, dy_data, epsilon, N, C, sample_size, dscale_data); } } else { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDScale< - T, block, DataLayout::kNHWC><<>>( + T, block, DataLayout::kNHWC><<>>( x_data, mean_data, variance_data, ddx_data, dy_data, N, C, sample_size, epsilon, dscale_data); } else { DoubleGradComputeDScale< - T, block, DataLayout::kNCHW><<>>( + T, block, DataLayout::kNCHW><<>>( x_data, mean_data, variance_data, ddx_data, dy_data, N, C, sample_size, epsilon, dscale_data); } @@ -504,28 +505,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, } if (ddY) { T *ddy_data = ddY->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, ddY, static_cast(0)); + set_constant(ctx, ddY, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { 
DoubleGradComputeDDYWithGlobal< - T, DataLayout::kNHWC><<>>( + T, DataLayout::kNHWC><<>>( ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data, ddscale_data, epsilon, C, sample_size, num, ddy_data); } else { DoubleGradComputeDDYWithGlobal< - T, DataLayout::kNCHW><<>>( + T, DataLayout::kNCHW><<>>( ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data, ddscale_data, epsilon, C, sample_size, num, ddy_data); } } else { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDDY< - T, block, DataLayout::kNHWC><<>>( + T, block, DataLayout::kNHWC><<>>( x_data, mean_data, variance_data, ddscale_data, ddbias_data, ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data); } else { DoubleGradComputeDDY< - T, block, DataLayout::kNCHW><<>>( + T, block, DataLayout::kNCHW><<>>( x_data, mean_data, variance_data, ddscale_data, ddbias_data, ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data); } diff --git a/paddle/fluid/operators/pad_constant_like_op.h b/paddle/fluid/operators/pad_constant_like_op.h index 5df167fdf726345074cdc40afd0c5b394467578f..0aedd800e1a237d4baf0092eef9bac9f7dbe862d 100644 --- a/paddle/fluid/operators/pad_constant_like_op.h +++ b/paddle/fluid/operators/pad_constant_like_op.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/padding.h" +#include "paddle/phi/kernels/funcs/padding.h" namespace paddle { namespace operators { @@ -50,8 +50,9 @@ class PadConstantLikeKernel : public framework::OpKernel { pads[j * 2 + 1] = static_cast(in_x->dims()[j] - in_y->dims()[j]); } - math::PaddingFunctor(rank, context, pads, pad_value, - *in_y, out); + phi::funcs::PaddingFunctor( + rank, context.template device_context(), pads, pad_value, + *in_y, out); } }; @@ -82,8 +83,9 @@ class PadConstantLikeGradKernel : public framework::OpKernel { pads[j * 2 + 1] = static_cast(in_dout->dims()[j] - in_y->dims()[j]); } - math::PaddingGradFunctor(rank, context, pads, *in_dout, - d_y); + phi::funcs::PaddingGradFunctor( + rank, context.template device_context(), pads, *in_dout, + d_y); } }; diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc index 39acba7e58aba51942d7d8de2d89e2783fd591f9..229e61ac9fe79d3c171d1f0612f22f3590587231 100644 --- a/paddle/fluid/operators/pad_op.cc +++ b/paddle/fluid/operators/pad_op.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
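// Editorial sketch (not from the patch): the pad_constant_like change above swaps
// operators::math::PaddingFunctor for the shared phi::funcs helper and hands it the
// concrete device context rather than the whole ExecutionContext. The operation itself
// is plain constant padding; a minimal standalone 1-D illustration follows
// (illustrative code and names, not Paddle's API).
#include <algorithm>
#include <cassert>
#include <vector>

// pads = {before, after}; new elements are filled with pad_value.
template <typename T>
std::vector<T> PadConstant1D(const std::vector<T>& src,
                             const std::vector<int>& pads, T pad_value) {
  std::vector<T> out(src.size() + pads[0] + pads[1], pad_value);
  std::copy(src.begin(), src.end(), out.begin() + pads[0]);
  return out;
}

int main() {
  auto out = PadConstant1D<float>({1.f, 2.f}, {1, 2}, 0.f);
  assert(out.size() == 5 && out[1] == 1.f && out[4] == 0.f);
  return 0;
}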
*/ -#include "paddle/fluid/operators/pad_op.h" #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/complex.h" namespace paddle { @@ -167,40 +167,3 @@ REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker, REGISTER_OPERATOR(pad_grad, ops::PadOpGrad, ops::PadOpDoubleGradMaker, ops::PadOpDoubleGradMaker); -REGISTER_OP_CPU_KERNEL( - pad, ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel>, - ops::PadKernel>); -REGISTER_OP_CPU_KERNEL( - pad_grad, ops::PadGradKernel, - ops::PadGradKernel, - ops::PadGradKernel>, - ops::PadGradKernel>); - -REGISTER_OP_CUDA_KERNEL( - pad, ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel>, - ops::PadKernel>); -REGISTER_OP_CUDA_KERNEL( - pad_grad, ops::PadGradKernel, - ops::PadGradKernel, - ops::PadGradKernel, - ops::PadGradKernel>, - ops::PadGradKernel>); diff --git a/paddle/fluid/operators/pad_op.h b/paddle/fluid/operators/pad_op.h deleted file mode 100644 index d494c954e1ef73b585761acf7490a5e35beccac4..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pad_op.h +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/padding.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class PadKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto pads = context.Attr>("paddings"); - float pad_value = context.Attr("pad_value"); - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - - int rank = x->dims().size(); - math::PaddingFunctor(rank, context, pads, - static_cast(pad_value), *x, out); - } -}; - -template -class PadGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto pads = context.Attr>("paddings"); - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* d_x = context.Output(framework::GradVarName("X")); - if (d_x == nullptr) { - return; - } - - d_x->mutable_data(context.GetPlace()); - int rank = d_out->dims().size(); - math::PaddingGradFunctor(rank, context, pads, *d_out, - d_x); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu index 40476d5e11f6a3b0cad21038a3f342d824f3575c..18402d908c4ad8d67bf7fc980a9e5c8917beb142 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu @@ -20,9 +20,11 @@ namespace cub = hipcub; #endif #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" namespace paddle { namespace operators { @@ -42,71 +44,86 @@ static inline int NumBlocks(const int N) { } template -__global__ void GPUSigmoidForward(const T *x_data, const T *label_data, - const int ignore_index, const int limit, - T *out_data, T *counts) { - CUDA_KERNEL_LOOP(i, limit) { - T x = x_data[i]; - T label = label_data[i]; - T eps = static_cast(1e-5); - T diff = label - static_cast(ignore_index); +struct NonzeroFunctor { + HOSTDEVICE explicit inline NonzeroFunctor() {} + HOSTDEVICE inline T operator()(const T x) const { + return static_cast(static_cast(x) != 0); + } +}; + +template +struct SigmoidFwdFunctor { + T ignore_index_; + T eps = static_cast(1e-5); + + HOSTDEVICE inline SigmoidFwdFunctor(const T ignore_index) + : ignore_index_(ignore_index) {} + + HOSTDEVICE inline phi::Array operator()(const T x, const T label) { + T counts; + T out_data; + + T diff = label - static_cast(ignore_index_); if ((diff > -eps) && (diff < eps)) { - out_data[i] = static_cast(0.); - counts[i] = 0; + out_data = static_cast(0.); + counts = 0; } else { T term1 = (x > 0) ? 
x : 0; T term2 = x * label; T term3 = real_log(static_cast(1) + real_exp(static_cast(-abs(x)))); - out_data[i] = term1 - term2 + term3; - counts[i] = 1; + + out_data = term1 - term2 + term3; + counts = 1; } - } -} + phi::Array outs; -template -__global__ void Sum(const T *counts, int num, const T eps, T *sum) { - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - T in = 0; - for (int i = threadIdx.x; i < num; i += BlockDim) { - in += counts[i]; + outs[0] = out_data; + outs[1] = counts; + return outs; } - __syncthreads(); - auto out = - BlockReduce(temp_storage).Reduce(static_cast(in), cub::Sum()); - __syncthreads(); - if (threadIdx.x == 0) { - T a = out > eps ? out : eps; - sum[0] = a; - } -} +}; template -__global__ void Div(T *loss, const int num, const T *norm) { - CUDA_KERNEL_LOOP(i, num) { loss[i] /= norm[0]; } -} +struct SigmoidBwdFunctor { + T ignore_index_; + T eps = static_cast(1e-5); -template -__global__ void GPUSigmoidBackward(const T *x_data, const T *label_data, - const int ignore_index, const T *dout_data, - const int limit, T *dx_data, T *counts) { - CUDA_KERNEL_LOOP(i, limit) { - T x = x_data[i]; - T label = label_data[i]; - T dout = dout_data[i]; - T eps = static_cast(1e-5); - T diff = label - static_cast(ignore_index); + HOSTDEVICE inline SigmoidBwdFunctor(const T ignore_index) + : ignore_index_(ignore_index) {} + + HOSTDEVICE inline phi::Array operator()(const T x, const T label, + const T dout) { + T counts; + T dx_data; + + T diff = label - static_cast(ignore_index_); if ((diff > -eps) && (diff < eps)) { - dx_data[i] = static_cast(0.); - counts[i] = 0; + dx_data = static_cast(0.); + counts = 0; } else { T simoid_x = static_cast(1) / (static_cast(1) + real_exp(-x)); T diff = simoid_x - label; - dx_data[i] = dout * diff; - counts[i] = 1; + dx_data = dout * diff; + counts = 1; } + phi::Array outs; + + outs[0] = dx_data; + outs[1] = counts; + return outs; } -} +}; + +template +struct DivFunctor { + const T norm_; + HOSTDEVICE inline DivFunctor(const T norm) : norm_(norm) {} + + HOSTDEVICE inline T operator()(T loss) { + loss /= norm_; + return loss; + } +}; // Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) template @@ -123,20 +140,48 @@ class GPUSigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { bool normalize = context.Attr("normalize"); // Temporary memory - auto cnt_ptr = memory::Alloc(dev_ctx, Labels->numel() * sizeof(T)); - T *counts = reinterpret_cast(cnt_ptr->ptr()); - + Tensor *counts_tensor = new Tensor(); + counts_tensor->mutable_data(context.GetPlace(), + Labels->numel() * sizeof(T)); + counts_tensor->Resize(Out->dims()); int limit = Out->numel(); int blocks = NumBlocks(limit); int threads = kNumCUDAThreads; - GPUSigmoidForward<<>>( - X->data(), Labels->data(), ignore_index, limit, out_data, counts); + std::vector ins = {X, Labels}; + std::vector outs = {Out, counts_tensor}; + auto functor = SigmoidFwdFunctor(ignore_index); + constexpr int Size = 2; + phi::funcs::ElementwiseKernel(dev_ctx, ins, + &outs, functor); if (normalize) { - auto norm_ptr = memory::Alloc(dev_ctx, sizeof(T)); - T *norm = reinterpret_cast(norm_ptr->ptr()); - Sum<<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>( - counts, limit, static_cast(1e-5), norm); - Div<<>>(out_data, limit, norm); + T *counts = counts_tensor->mutable_data(context.GetPlace()); + Tensor *norm_tensor = new Tensor(); + norm_tensor->mutable_data(context.GetPlace(), sizeof(T)); + auto dims = phi::vectorize(counts_tensor->dims()); + std::vector reduce_dim = 
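// Editorial sketch (not from the patch): the sigmoid_cross_entropy rewrite above replaces
// hand-written CUDA kernels with small functors; one pass yields both the per-element loss
// (or gradient) and a 0/1 "valid" count, the counts are reduced, and the sum normalizes the
// result. A minimal CPU version of that idea, assuming -100 marks ignored targets
// (illustrative code, not Paddle's kernels).
#include <algorithm>
#include <array>
#include <cmath>
#include <numeric>
#include <vector>

struct SigmoidFwdSketch {
  float ignore_index;
  // Returns {loss, valid_count} for one (logit, label) pair.
  std::array<float, 2> operator()(float x, float label) const {
    if (std::fabs(label - ignore_index) < 1e-5f) return {0.f, 0.f};
    float loss =
        std::max(x, 0.f) - x * label + std::log1p(std::exp(-std::fabs(x)));
    return {loss, 1.f};
  }
};

int main() {
  std::vector<float> x{0.5f, -1.0f}, label{1.0f, -100.0f};
  std::vector<float> loss(x.size()), counts(x.size());
  SigmoidFwdSketch fwd{-100.0f};
  for (size_t i = 0; i < x.size(); ++i) {
    auto out = fwd(x[i], label[i]);
    loss[i] = out[0];
    counts[i] = out[1];
  }
  // normalize == true: divide by the number of non-ignored elements.
  float norm =
      std::max(std::accumulate(counts.begin(), counts.end(), 0.f), 1e-5f);
  for (auto& l : loss) l /= norm;
  return 0;
}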
{}; + for (int i = 0; i < dims.size(); i++) { + reduce_dim.push_back(i); + } + + TensorReduceImpl>( + context.cuda_device_context(), *counts_tensor, norm_tensor, + NonzeroFunctor(), reduce_dim, dev_ctx.stream()); + T *norm = norm_tensor->mutable_data(context.GetPlace()); + auto norm_cpu_mem = memory::Alloc(platform::CPUPlace(), sizeof(T)); + T *norm_cpu_ptr = reinterpret_cast(norm_cpu_mem->ptr()); + memory::Copy(platform::CPUPlace(), norm_cpu_ptr, dev_ctx.GetPlace(), norm, + sizeof(T), dev_ctx.stream()); + auto eps = static_cast(1e-5); + *norm_cpu_ptr = *norm_cpu_ptr > eps ? *norm_cpu_ptr : eps; + + std::vector div_ins = {Out}; + std::vector div_outs = {Out}; + auto div_functor = DivFunctor(*norm_cpu_ptr); + phi::funcs::ElementwiseKernel(dev_ctx, div_ins, &div_outs, + div_functor); + + delete norm_tensor; + delete counts_tensor; } } }; @@ -157,22 +202,48 @@ class GPUSigmoidCrossEntropyWithLogitsGradKernel auto &dev_ctx = context.cuda_device_context(); // Temporary memory - auto cnt_ptr = memory::Alloc(dev_ctx, X->numel() * sizeof(T)); - T *counts = reinterpret_cast(cnt_ptr->ptr()); + Tensor *counts_tensor = new Tensor(); + counts_tensor->mutable_data(context.GetPlace(), + Labels->numel() * sizeof(T)); + counts_tensor->Resize(dX->dims()); int limit = dX->numel(); int blocks = NumBlocks(limit); int threads = kNumCUDAThreads; - GPUSigmoidBackward<<>>( - X->data(), Labels->data(), ignore_index, dOut->data(), limit, - dx_data, counts); + std::vector ins = {X, Labels, dOut}; + std::vector outs = {dX, counts_tensor}; + auto functor = SigmoidBwdFunctor(ignore_index); + constexpr int Size = 2; + phi::funcs::ElementwiseKernel(dev_ctx, ins, + &outs, functor); bool normalize = context.Attr("normalize"); if (normalize) { - auto norm_ptr = memory::Alloc(dev_ctx, sizeof(T)); - T *norm = reinterpret_cast(norm_ptr->ptr()); - Sum<<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>( - counts, limit, static_cast(1e-5), norm); - Div<<>>(dx_data, limit, norm); + T *counts = counts_tensor->mutable_data(context.GetPlace()); + Tensor *norm_tensor = new Tensor(); + norm_tensor->mutable_data(context.GetPlace(), sizeof(T)); + auto dims = phi::vectorize(counts_tensor->dims()); + std::vector reduce_dim = {}; + for (int i = 0; i < dims.size(); i++) { + reduce_dim.push_back(i); + } + + TensorReduceImpl>( + context.cuda_device_context(), *counts_tensor, norm_tensor, + NonzeroFunctor(), reduce_dim, dev_ctx.stream()); + T *norm = norm_tensor->mutable_data(context.GetPlace()); + auto norm_cpu_mem = memory::Alloc(platform::CPUPlace(), sizeof(T)); + T *norm_cpu_ptr = reinterpret_cast(norm_cpu_mem->ptr()); + memory::Copy(platform::CPUPlace(), norm_cpu_ptr, dev_ctx.GetPlace(), norm, + sizeof(T), dev_ctx.stream()); + auto eps = static_cast(1e-5); + *norm_cpu_ptr = *norm_cpu_ptr > eps ? 
*norm_cpu_ptr : eps; + + std::vector div_ins = {dX}; + std::vector div_outs = {dX}; + auto div_functor = DivFunctor(*norm_cpu_ptr); + phi::funcs::ElementwiseKernel(dev_ctx, div_ins, &div_outs, + div_functor); + delete norm_tensor; } } }; diff --git a/paddle/fluid/operators/spectral_op.h b/paddle/fluid/operators/spectral_op.h index 2bc5124843c38152d2f5d3ffcef5a5ca24534bfd..a60ec5a4df52b8275a17185a63c8a7d27dd8132b 100644 --- a/paddle/fluid/operators/spectral_op.h +++ b/paddle/fluid/operators/spectral_op.h @@ -23,9 +23,9 @@ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/conj_op.h" #include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/operators/math/padding.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/padding.h" #if defined(__NVCC__) || defined(__HIPCC__) #include "thrust/device_vector.h" #endif @@ -389,8 +389,9 @@ class FFTR2CGradKernel : public framework::OpKernel { std::vector pads(rank * 2, 0); pads[axes.back() * 2 + 1] = zero_length; - paddle::operators::math::PaddingFunctor( - rank, ctx, pads, static_cast(0), *dy, &full_dy); + phi::funcs::PaddingFunctor( + rank, ctx.template device_context(), pads, + static_cast(0), *dy, &full_dy); fft_c2c_func(dev_ctx, &full_dy, &complex_dx, axes, normalization, !forward); } diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.cc b/paddle/fluid/platform/device/xpu/xpu_op_list.cc index 6127fcfa8def6f2a6723416c6a29bd41a4871b74..b20e8ac9785cafea7e4f85fbfb9570d3cde5d1f5 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_list.cc +++ b/paddle/fluid/platform/device/xpu/xpu_op_list.cc @@ -23,12 +23,9 @@ namespace paddle { namespace platform { bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type) { - auto& ops = get_kl1_ops(); auto v = get_xpu_version(type.place_.device); - if (v == phi::backends::xpu::XPUVersion::XPU2) { - ops = get_kl2_ops(); - } - + auto& ops = (v == phi::backends::xpu::XPUVersion::XPU1) ? get_kl1_ops() + : get_kl2_ops(); if (ops.find(op_name) != ops.end() && ops[op_name].find(type) != ops[op_name].end()) { return true; @@ -78,12 +75,9 @@ bool is_in_xpu_black_list(const std::string& op_name) { #ifdef PADDLE_WITH_XPU_KP bool is_xpu_kp_support_op(const std::string& op_name, const pOpKernelType& type) { - auto& ops = get_kl1_ops(); auto v = get_xpu_version(type.place_.device); - if (v == phi::backends::xpu::XPUVersion::XPU2) { - ops = get_kp_ops(); - } - + auto& ops = (v == phi::backends::xpu::XPUVersion::XPU1) ? get_kl1_ops() + : get_kp_ops(); if (ops.find(op_name) != ops.end() && ops[op_name].find(type) != ops[op_name].end()) { return true; diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 71fd0d20143a08b065eb596d6f5f9ac6531057f4..372bfbce2aca232ca5704fa20f17b0c75359ceba 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -28,6 +28,7 @@ limitations under the License. 
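// Editorial sketch (not from the patch): the xpu_op_list change above avoids a reference
// pitfall. The old form bound `ops` to the KL1 table and later wrote `ops = get_kl2_ops()`,
// which assigns through the reference (copying the KL2 map into the referenced one) instead
// of re-binding it; binding the reference once via a conditional keeps both tables intact.
// A standalone illustration with made-up registry names:
#include <cassert>
#include <map>
#include <string>

static std::map<std::string, int>& RegistryKL1() {
  static std::map<std::string, int> r{{"op_kl1", 1}};
  return r;
}
static std::map<std::string, int>& RegistryKL2() {
  static std::map<std::string, int> r{{"op_kl2", 2}};
  return r;
}

int main() {
  bool is_xpu2 = true;
  // Old pattern (problematic): auto& ops = RegistryKL1(); if (is_xpu2) ops = RegistryKL2();
  // That assignment overwrites RegistryKL1's contents rather than re-binding `ops`.
  // New pattern: bind the reference exactly once.
  auto& ops = is_xpu2 ? RegistryKL2() : RegistryKL1();
  assert(ops.count("op_kl2") == 1);
  assert(RegistryKL1().count("op_kl1") == 1);  // KL1 registry left untouched
  return 0;
}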
*/ #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/init.h" +#include "paddle/fluid/platform/os_info.h" #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_XPU @@ -161,6 +162,8 @@ void LoadCustomDevice(const std::string &library_dir) { #endif void InitDevices() { + // set name at the entry point of Paddle + platform::SetCurrentThreadName("MainThread"); // CUPTI attribute should be set before any CUDA context is created (see CUPTI // documentation about CUpti_ActivityAttribute). #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/platform/os_info_test.cc b/paddle/fluid/platform/os_info_test.cc index b309bb985122d8bbe28a8014bca51b4a5a6b9b10..b3311f1d19e6304a0b232cd936397559224e9b96 100644 --- a/paddle/fluid/platform/os_info_test.cc +++ b/paddle/fluid/platform/os_info_test.cc @@ -30,8 +30,7 @@ TEST(ThreadInfo, TestThreadNameUtils) { using paddle::platform::GetCurrentThreadName; using paddle::platform::SetCurrentThreadName; using paddle::platform::GetAllThreadNames; - EXPECT_EQ("unset", GetCurrentThreadName()); - EXPECT_TRUE(SetCurrentThreadName("MainThread")); + SetCurrentThreadName("MainThread"); EXPECT_FALSE(SetCurrentThreadName("MainThread")); auto names = GetAllThreadNames(); EXPECT_TRUE(names.find(GetCurrentThreadStdId()) != names.end()); diff --git a/paddle/fluid/platform/profiler/host_event_recorder.h b/paddle/fluid/platform/profiler/host_event_recorder.h index 49f9362527591744dd0685375e0244673a7b3081..afd4135246556624cb022243e0e98b5ad9f9f6da 100644 --- a/paddle/fluid/platform/profiler/host_event_recorder.h +++ b/paddle/fluid/platform/profiler/host_event_recorder.h @@ -189,7 +189,10 @@ struct ThreadEventSection { class ThreadEventRecorder { public: - ThreadEventRecorder() { thread_id_ = GetCurrentThreadSysId(); } + ThreadEventRecorder() { + thread_id_ = GetCurrentThreadSysId(); + thread_name_ = GetCurrentThreadName(); + } DISABLE_COPY_AND_ASSIGN(ThreadEventRecorder); @@ -202,7 +205,7 @@ class ThreadEventRecorder { ThreadEventSection GatherEvents() { ThreadEventSection thr_sec; - thr_sec.thread_name = GetCurrentThreadName(); + thr_sec.thread_name = thread_name_; thr_sec.thread_id = thread_id_; thr_sec.events = std::move(base_evt_cntr_.Reduce()); return thr_sec; @@ -210,6 +213,7 @@ class ThreadEventRecorder { private: uint64_t thread_id_; + std::string thread_name_; EventContainer base_evt_cntr_; }; diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 48d42f803a8248f733c6b4b0a9a52c2c70a3ef32..5e61133510d6a27a97e618dcd87f46ce8a6cd26e 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -85,6 +85,9 @@ if(NOT ON_INFER) if (WITH_NCCL) set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl) endif() + if (WITH_GLOO) + set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_gloo) + endif() set(PYBIND_SRCS ${PYBIND_SRCS} distributed_py.cc) endif() diff --git a/paddle/fluid/pybind/communication.cc b/paddle/fluid/pybind/communication.cc index a0d2777f825dc592e19230bc2ba4412f943d0c2b..c01accaf598aa849cf5406e96cc9b5743b46e448 100644 --- a/paddle/fluid/pybind/communication.cc +++ b/paddle/fluid/pybind/communication.cc @@ -31,9 +31,15 @@ namespace pybind { using TCPStore = paddle::distributed::TCPStore; void BindTCPStore(py::module* m) { - py::class_(*m, "TCPStore") - .def( - py::init()) + py::class_>(*m, "TCPStore") + .def(py::init([](std::string hostname, uint16_t port, bool is_master, + size_t world_size, std::chrono::seconds timeout) { + 
return std::make_shared(hostname, port, is_master, + world_size, timeout); + }), + py::arg("hostname"), py::arg("port"), py::arg("is_master"), + py::arg("world_size"), py::arg("timeout"), + py::call_guard()) .def("add", &TCPStore::add) .def("get", &TCPStore::get); } diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index a4a1d07db2cb9771530ddb5be0696cef38b2c344..17512863357d8dfe342f1a841471e1fdf1ac8072 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -35,6 +35,11 @@ limitations under the License. */ #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" #endif +#if defined(PADDLE_WITH_GLOO) +#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h" +#include "paddle/fluid/distributed/store/tcp_store.h" +#endif + namespace py = pybind11; namespace paddle { @@ -42,6 +47,14 @@ namespace pybind { using Tensor = paddle::experimental::Tensor; +#if defined(PADDLE_WITH_GLOO) +using ProcessGroupGloo = paddle::distributed::ProcessGroupGloo; +using GlooStore = paddle::distributed::ProcessGroupGloo::GlooStore; +using GlooOptions = paddle::distributed::ProcessGroupGloo::GlooOptions; +#endif + +static std::string GLOO_SOCKET_IFNAME_ENV = "GLOO_SOCKET_IFNAME"; // NOLINT + void BindDistributed(py::module *m) { py::enum_(*m, "ReduceOp") .value("SUM", distributed::ReduceOp::SUM) @@ -64,6 +77,11 @@ void BindDistributed(py::module *m) { .def(py::init<>()) .def_readwrite("place_ids", &distributed::BarrierOptions::place_ids); + py::class_(*m, "ReduceOptions") + .def(py::init<>()) + .def_readwrite("reduce_op", &distributed::ReduceOptions::reduce_op) + .def_readwrite("source_root", &distributed::ReduceOptions::root_rank); + auto ProcessGroup = py::class_>(*m, "ProcessGroup") @@ -121,6 +139,58 @@ void BindDistributed(py::module *m) { return self.Recv(tensors, src); }, py::arg("tensor"), py::arg("src"), + py::call_guard()) + + .def("all_gather", + [](distributed::ProcessGroup &self, py::handle py_in_tensor, + py::handle py_out_tensor) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + std::vector in_tensors = {in_tensor}; + std::vector out_tensors = {out_tensor}; + return self.AllGather(in_tensors, out_tensors); + }, + py::arg("in"), py::arg("out"), + py::call_guard()) + + .def("alltoall", + [](distributed::ProcessGroup &self, py::handle py_in_tensor, + py::handle py_out_tensor) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + std::vector in_tensors = {in_tensor}; + std::vector out_tensors = {out_tensor}; + return self.AllToAll(in_tensors, out_tensors); + }, + py::arg("in"), py::arg("out"), + py::call_guard()) + + .def("reduce", + [](distributed::ProcessGroup &self, py::handle py_in_tensor, + int dst, distributed::ReduceOp op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + distributed::ReduceOptions opts; + opts.reduce_op = op; + opts.root_rank = dst; + std::vector tensors = {in_tensor}; + return self.Reduce(tensors, opts); + }, + py::arg("tensor"), py::arg("dst"), + py::arg("op") = distributed::ReduceOp::SUM, + py::call_guard()) + + .def("scatter", + [](distributed::ProcessGroup &self, py::handle py_in_tensor, + py::handle py_out_tensor, int src) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + distributed::ScatterOptions opts; + opts.root_rank = src; + std::vector 
in_tensors = {in_tensor}; + std::vector out_tensors = {out_tensor}; + return self.Scatter(in_tensors, out_tensors, opts); + }, + py::arg("in"), py::arg("out"), py::arg("src"), py::call_guard()); #if defined(PADDLE_WITH_NCCL) @@ -129,6 +199,7 @@ void BindDistributed(py::module *m) { *m, "ProcessGroupNCCL", ProcessGroup) .def(py::init(), py::call_guard()); +#endif py::class_>(*m, "task") @@ -138,7 +209,6 @@ void BindDistributed(py::module *m) { py::call_guard()) .def("synchronize", &distributed::ProcessGroup::Task::Synchronize, py::call_guard()); -#endif // define parallel strategy, it will be removed py::class_ pg_strategy( @@ -178,6 +248,45 @@ void BindDistributed(py::module *m) { self.nrings_ = nrings; }); +#if defined(PADDLE_WITH_GLOO) + py::class_(*m, "GlooOptions") + .def(py::init<>()) + .def_readwrite("_device", &GlooOptions::device) + .def_static("create", &GlooOptions::create); + + py::class_>(*m, "GlooStore") + .def(py::init( + [](const std::shared_ptr &store) { + return std::make_shared(store); + }), + py::call_guard()); + + py::class_>( + *m, "ProcessGroupGloo", ProcessGroup) + .def(py::init &, int, int, + std::shared_ptr &>(), + py::call_guard()) + .def(py::init([](const std::shared_ptr &store, int rank, + int world_size) { + auto opts = GlooOptions::create(); + char *ifname = getenv(GLOO_SOCKET_IFNAME_ENV.c_str()); + if (ifname && strlen(ifname) > 1) { + opts->device = ProcessGroupGloo::createDeviceForInterface( + std::string(ifname)); + } else { + opts->device = ProcessGroupGloo::createDefaultDevice(); + } + return std::make_shared(store, rank, world_size, + opts); + }), + py::arg("store"), py::arg("rank"), + py::arg("world_size"), // py::arg("timeout") = + // kProcessGroupDefaultTimeout, + py::call_guard()) + .def_static("create_default_device", + &ProcessGroupGloo::createDefaultDevice); +#endif + m->def("eager_assign_group_by_size", [](py::handle py_tensors, std::vector is_sparse_gradient, std::vector group_size_limits, diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index f11a2ab2517fb481f184c9b68b2558c999d88ec9..e5f22338dc61543a377d4a94307f834b774257d4 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/all.h" +#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/hooks.h" @@ -30,10 +31,12 @@ limitations under the License. 
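// Editorial sketch (not from the patch): the TCPStore and ProcessGroupGloo bindings above
// share one pybind11 pattern: a std::shared_ptr holder plus a py::init factory lambda, so
// Python-side construction can run extra logic (argument handling, GLOO_SOCKET_IFNAME
// lookup) before the C++ object is created. A minimal self-contained sketch of that
// pattern; `Widget` and the module name are illustrative, not Paddle code.
#include <pybind11/pybind11.h>

#include <cstdlib>
#include <memory>
#include <string>

namespace py = pybind11;

struct Widget {
  std::string iface;
  explicit Widget(std::string i) : iface(std::move(i)) {}
};

PYBIND11_MODULE(widget_ext, m) {
  py::class_<Widget, std::shared_ptr<Widget>>(m, "Widget")
      .def(py::init([](const std::string& fallback) {
             // Prefer the environment variable, mirroring the binding above.
             const char* env = std::getenv("GLOO_SOCKET_IFNAME");
             return std::make_shared<Widget>(env ? std::string(env) : fallback);
           }),
           py::arg("fallback"));
}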
*/ #include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/pybind/slice_utils.h" #include "paddle/phi/api/include/api.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" + namespace paddle { namespace pybind { @@ -119,6 +122,29 @@ extern void InitTensorWithNumpyValue(TensorObject* self, extern PyTypeObject* p_tensor_type; +Py_ssize_t GetSliceIndexFromPyObject(PyObject* obj) { + if (PyObject_IsInstance(obj, reinterpret_cast(p_tensor_type))) { + VLOG(6) << "Call GetSliceIndexFromTensor in Eager"; + paddle::experimental::Tensor tensor = CastPyArg2Tensor(obj, 0); + PADDLE_ENFORCE_EQ( + tensor.initialized(), true, + paddle::platform::errors::InvalidArgument( + "We can only support initialized tensor in slice, however we got " + "uninitialized tensor %s, please check your code.", + tensor.name())); + return GetSliceIndexFromTensor((*static_cast( + CastPyArg2Tensor(obj, 0).impl().get()))); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "We should only get paddle::experimental::Tensor or VarBase in this " + "method, when you reach this means we got another type index.")); + } +} + +bool PyCheckTensor(PyObject* obj) { + return PyObject_IsInstance(obj, reinterpret_cast(p_tensor_type)); +} + static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY @@ -468,16 +494,111 @@ static PyObject* tensor_method_get_underline_tensor(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } -// NOTE(wuweilong): Set value and not change self's original place -static PyObject* tensor_method_set_value(TensorObject* self, PyObject* args, - PyObject* kwargs) { +static PyObject* tensor__getitem_index_not_tensor(TensorObject* self, + PyObject* args, + PyObject* kwargs) { EAGER_TRY - VLOG(4) << "Value " << self->tensor.name(); - pybind11::object numpy_value = - pybind11::object(pybind11::handle(PyTuple_GET_ITEM(args, 0)), true); - InitTensorWithNumpyValue(self, numpy_value, false); - Py_INCREF(Py_None); - return Py_None; + PyObject* _index = PyTuple_GET_ITEM(args, 0); + VLOG(4) << "Call _getitem_index_not_tensor"; + std::vector slice_axes, slice_starts, slice_ends, slice_strides, + decrease_axis, none_axes, infer_flags, list_select_idxs; + // if index is a list, list_select_flag will be true + bool list_select_flag = false; + PADDLE_ENFORCE_EQ( + self->tensor.is_initialized(), true, + platform::errors::InvalidArgument( + "tensor %s has not been initialized, we can only slice initialized " + "tensor please init it first with numpy or other tensor.", + self->tensor.name())); + auto tensor = static_cast(self->tensor.impl().get()); + ParseIndexingSlice(tensor, _index, &slice_axes, &slice_starts, &slice_ends, + &slice_strides, &decrease_axis, &none_axes, &infer_flags, + &list_select_idxs, &list_select_flag); + + auto out = slice_axes.empty() && !list_select_flag + ? 
self->tensor + : paddle::experimental::Tensor( + egr::Controller::Instance().GenerateUniqueName()); + + if (!slice_axes.empty()) { + framework::AttributeMap attrs = {{"axes", slice_axes}, + {"starts", slice_starts}, + {"ends", slice_ends}, + {"infer_flags", infer_flags}, + {"decrease_axis", decrease_axis}}; + std::string op_type = "slice"; + for (auto stride : slice_strides) { + if (stride != 1) { + op_type = "strided_slice"; + attrs.insert({"strides", slice_strides}); + attrs.erase("decrease_axis"); + break; + } + } + if (op_type == "slice") { + out = slice_dygraph_function(self->tensor, paddle::experimental::Tensor(), + paddle::experimental::Tensor(), + std::move(attrs)); + } else if (op_type == "strided_slice") { + out = strided_slice_dygraph_function(self->tensor, attrs); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Slice is only support slice and strided_slice, but we got %s which " + "is impossible, please check your code first or contact us by " + "issue. ", + op_type)); + } + } + + if (!none_axes.empty()) { + // Deal with cases when all axes are decreased. + // After slice, the shape of out is [1], which should have been + // [], but Paddle doesn't support scalar. + // In order to ensure the correctness of the final shape of out, + // one dimension of out needs to be decreased. + // For example: + // # x.shape: (2,3,4) + // out = x[0, 1, 1, None] # out.shape : (1) + if (static_cast(decrease_axis.size()) == tensor->dims().size()) { + none_axes.pop_back(); + } + if (!none_axes.empty()) { + // Deal with cases that decrease_axes is not empty + // For example: + // # x.shape: (2,3,4) + // out = x[0, 0:2, None] # out.shape : (2, 1, 4) + for (auto& axis : none_axes) { + int len = 0; + for (int da : decrease_axis) { + if (da < axis) { + len++; + } + } + axis -= len; + } + + paddle::experimental::Tensor new_out; + framework::AttributeMap attrs = {{"axes", none_axes}}; + new_out = std::get<0>(unsqueeze2_dygraph_function(out, std::move(attrs))); + return ToPyObject(new_out); + } + } + + // the index is a list + if (list_select_flag) { + auto select_index = paddle::experimental::Tensor( + egr::Controller::Instance().GenerateUniqueName()); + auto idx_tensor = std::make_shared(); + auto* dev_ctx = platform::DeviceContextPool::Instance().Get( + egr::Controller::Instance().GetExpectedPlace()); + paddle::framework::TensorFromVector(list_select_idxs, *dev_ctx, + idx_tensor.get()); + framework::AttributeMap attrs = {{"dim", 0}}; + out = index_select_dygraph_function(self->tensor, select_index, + std::move(attrs)); + } + + return ToPyObject(out); EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -602,7 +723,8 @@ PyMethodDef variable_methods[] = { {"get_tensor", (PyCFunction)(void (*)(void))tensor_method_get_underline_tensor, METH_VARARGS | METH_KEYWORDS, NULL}, - {"_set_value", (PyCFunction)(void (*)(void))tensor_method_set_value, + {"_getitem_index_not_tensor", + (PyCFunction)(void (*)(void))tensor__getitem_index_not_tensor, METH_VARARGS | METH_KEYWORDS, NULL}, {"_register_grad_hook", (PyCFunction)(void (*)(void))tensor_register_grad_hook, diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index c1e8822eec22179266d69d3b97890aebe678b187..57f37621d3ba4471e2961651ad9a23c6567bb82b 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -16,8 +16,11 @@ limitations under the License. 
*/ #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/scope_guard.h" #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/operators/py_func_op.h" +#include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager_utils.h" @@ -184,6 +187,11 @@ paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos) { } } +std::shared_ptr CastPyArg2VarBase(PyObject* obj, + ssize_t arg_pos) { + return py::cast>(obj); +} + std::vector CastPyArg2VectorOfTensor( PyObject* obj, ssize_t arg_pos) { std::vector result; @@ -737,5 +745,6 @@ std::vector GetTensorPtrListFromArgs( return result; } + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 0c721d6124791edda7f41d46dcbbbfcccc80fb95..92afc3ae4875c94e6d179669f418d6ba4a325f00 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/phi/core/dense_tensor.h" #include "pybind11/pybind11.h" #include "pybind11/stl.h" - namespace paddle { namespace pybind { @@ -33,6 +32,8 @@ int64_t CastPyArg2AttrLong(PyObject* obj, ssize_t arg_pos); float CastPyArg2AttrFloat(PyObject* obj, ssize_t arg_pos); std::string CastPyArg2AttrString(PyObject* obj, ssize_t arg_pos); paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos); +std::shared_ptr CastPyArg2VarBase(PyObject* obj, + ssize_t arg_pos); std::vector CastPyArg2VectorOfTensor( PyObject* obj, ssize_t arg_pos); platform::Place CastPyArg2Place(PyObject* obj, ssize_t arg_pos); @@ -112,5 +113,7 @@ std::vector GetTensorPtrListFromArgs( const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable = false); +// end of Slice related methods + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 8c5ed2d11830195a6fb70c54d12c9ef3eb3fc8b2..3da17b95a66ba85ba8cf72d049c2397414a03ecd 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -54,6 +54,7 @@ limitations under the License. */ #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/pybind/op_function.h" #include "paddle/fluid/pybind/pybind_boost_headers.h" +#include "paddle/fluid/pybind/slice_utils.h" #include "paddle/fluid/pybind/tensor_py.h" namespace paddle { @@ -319,6 +320,23 @@ static std::string GetTypeName(const imperative::VarBase &var) { } } +Py_ssize_t GetSliceIndexFromPyObject(PyObject *obj) { + if (py::isinstance(obj)) { + VLOG(6) << "Call GetSliceIndexFromTensor in Imperative"; + return GetSliceIndexFromTensor( + py::cast>(obj) + ->Var() + .Get()); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "We should only get paddle::experimental::Tensor or VarBase in this " + "method, when you reach this means we got another type index.")); + } +} + +bool PyCheckTensor(PyObject *obj) { + return py::isinstance(obj); +} using PyNameVarBaseMap = std::unordered_map; // NOTE(zjl): py::handle is a very light wrapper of PyObject *. @@ -360,18 +378,6 @@ GetVarBaseListFromPyHandle(const py::handle &handle) { return result; } -static bool IsNumpyType(PyObject *obj) { - // It is not a good way to judge the type of obj by its type'name. 
Maybe using - // `PyArray_IsScalar` will be better. However, this interface cannot be used - // by including pybind11, and it needs to compile with numpy. - auto type_name = std::string(Py_TYPE(obj)->tp_name); - return type_name == "numpy.int64" || type_name == "numpy.longlong" || - type_name == "numpy.int32" || type_name == "numpy.int16"; -} - -static bool PyCheckTensor(PyObject *obj) { - return py::isinstance(obj); -} // cast numpy type form S to T, this may allocate new memory template @@ -429,260 +435,6 @@ static imperative::NameVarBaseMap ConvertToNameVarBaseMap( return result; } -static bool PyCheckInteger(PyObject *obj) { -#if PY_VERSION_HEX < 0x03000000 - return (PyLong_Check(obj) || PyInt_Check(obj)) && !PyBool_Check(obj); -#else - return PyLong_Check(obj) && !PyBool_Check(obj); -#endif -} - -static Py_ssize_t GetSliceIndexFromTensor( - const std::shared_ptr &tensor_index) { - const auto &tensor = tensor_index->Var().Get(); - if (tensor.numel() == 1) { - if (framework::TransToProtoVarType(tensor.dtype()) == - framework::proto::VarType::INT32) { - return static_cast(operators::GetValue(&tensor)); - } else if (framework::TransToProtoVarType(tensor.dtype()) == - framework::proto::VarType::INT64) { - return static_cast(operators::GetValue(&tensor)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Currently, the type of tensor in slice indices only allows " - "int32 and int64, please check the type of index tensor.")); - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Currently, tensor in slice indices only allows 1 element, " - "but received %d.", - tensor.numel())); - } -} - -// NOTE(zhiqiu): Revised version of PySlice_GetIndices. From: -// https://github.com/python/cpython/blob/8d21aa21f2cbc6d50aab3f420bb23be1d081dac4/Objects/sliceobject.c#L103 -// Original PySlice_GetIndices return wrong result when -// slice_item contains long int, such as arr[:180L]. -// NOT sure why this happens !!! -// Besides, PySlice_GetIndices cannot raise error when float in slice item. -// So, I make a revised version of PySlice_GetIndices, named to -// _PySlice_GetIndices. Try to use _PySlice_Unpack which is more robust than -// PySlice_GetIndices in the future. -static int _PySlice_GetIndices(PySliceObject *r, Py_ssize_t length, - Py_ssize_t *start, Py_ssize_t *stop, - Py_ssize_t *step) { - /* XXX support long ints */ - if (r->step == Py_None) { - *step = 1; - } else { - if (PyCheckInteger(r->step) || IsNumpyType(r->step)) { - *step = PyLong_AsLong(r->step); - } else if (PyCheckTensor(r->step)) { - *step = GetSliceIndexFromTensor( - py::cast>(r->step)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Currently, slice indices only allows None, integers, " - "tensor(int) and numpy(int) in slice item, but received %s.", - std::string(Py_TYPE(r->step)->tp_name))); - } - } - if (r->start == Py_None) { - *start = *step < 0 ? length - 1 : 0; - } else { - if (PyCheckInteger(r->start) || IsNumpyType(r->start)) { - *start = PyLong_AsLong(r->start); - } else if (PyCheckTensor(r->start)) { - *start = GetSliceIndexFromTensor( - py::cast>(r->start)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Currently, slice indices only allows None, integers, " - "tensor(int) and numpy(int) in slice item, but received %s.", - std::string(Py_TYPE(r->start)->tp_name))); - } - if (*start < 0) *start += length; - *start = std::max(*start, static_cast(0)); - } - if (r->stop == Py_None) { - *stop = *step < 0 ? 
-1 : length; - } else { - if (PyCheckInteger(r->stop) || IsNumpyType(r->stop)) { - *stop = PyLong_AsLong(r->stop); - } else if (PyCheckTensor(r->stop)) { - *stop = GetSliceIndexFromTensor( - py::cast>(r->stop)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Currently, slice indices only allows None, integers, " - "tensor(int) and numpy(int) in slice item, but received %s.", - std::string(Py_TYPE(r->stop)->tp_name))); - } - if (0 < *step && *stop < 0) *stop += length; - *stop = std::min(*stop, length); - } - if (*stop > length) return -1; - if (*start >= length) return -1; - if (*step == 0) return -1; - return 0; -} - -static void ParseIndexingSlice( - framework::LoDTensor *tensor, PyObject *_index, - std::vector *slice_axes, std::vector *slice_starts, - std::vector *slice_ends, std::vector *slice_strides, - std::vector *decrease_axis, std::vector *none_axes, - std::vector *infer_flags, std::vector *list_select_idxs, - bool *list_select_flag) { - // We allow indexing by Integers, Slices, Ellipsis, None, tuples of those - // types, and list of Bool and Integers. - // wrap to tuple - - // NOTE(zhiqiu): PyTuple_Pack increases refcount. - PyObject *index = !PyTuple_Check(_index) ? PyTuple_Pack(1, _index) : _index; - DEFINE_PADDLE_SCOPE_GUARD([index, _index]() { - if (!PyTuple_Check(_index)) { - Py_DECREF(index); - VLOG(4) << "Call Py_DECREF"; - } - }); - PADDLE_ENFORCE_EQ( - tensor->IsInitialized(), true, - platform::errors::InvalidArgument("tensor has not been initialized")); - const auto &shape = tensor->dims(); - const int rank = shape.size(); - const int size = PyTuple_GET_SIZE(index); - - // specified_dims is the number of dimensions which indexed by Interger, - // Slices. - int specified_dims = 0; - int ell_count = 0; - for (int dim = 0; dim < size; ++dim) { - PyObject *slice_item = PyTuple_GetItem(index, dim); - if (PyCheckInteger(slice_item) || PySlice_Check(slice_item)) { - specified_dims++; - } else if (slice_item == Py_Ellipsis) { - ell_count++; - } - } - - PADDLE_ENFORCE_LE(ell_count, 1, - platform::errors::InvalidArgument( - "An index can only have a single ellipsis ('...')")); - int none_count = 0; - for (int i = 0, dim = 0; i < size; ++i) { - PyObject *slice_item = PyTuple_GetItem(index, i); - - infer_flags->push_back(1); - int dim_len = shape[dim]; - if (PyCheckInteger(slice_item) || IsNumpyType(slice_item)) { - // integer, PyLong_AsLong supports both int and long - int start = static_cast(PyLong_AsLong(slice_item)); - auto s_t = start; - start = start < 0 ? 
start + dim_len : start; - if (start >= dim_len || start < 0) { - std::string str_error_message = - "The starting index " + std::to_string(s_t) + - " of slice is out of bounds in tensor " + std::to_string(dim) + - "-th axis, it shound be in the range of [" + - std::to_string(-dim_len) + ", " + std::to_string(dim_len) + ")"; - // py::index_error is corresponding to IndexError in Python - // Used to indicate out of bounds access in __getitem__, __setitem__ - throw py::index_error(str_error_message); - } - slice_axes->push_back(dim); - slice_starts->push_back(start); - slice_ends->push_back(start + 1); - slice_strides->push_back(1); - decrease_axis->push_back(dim); - dim++; - } else if (PySlice_Check(slice_item)) { - // slice item - Py_ssize_t start, end, step; - PySliceObject *p = reinterpret_cast(slice_item); - _PySlice_GetIndices(p, dim_len, &start, &end, &step); - - // :: or : or 0:dim_len:1 - if (start == 0 && end == dim_len && step == 1) { - dim++; - continue; - } - slice_axes->push_back(dim); - slice_starts->push_back(start); - slice_ends->push_back(end); - slice_strides->push_back(step); - dim++; - } else if (slice_item == Py_Ellipsis) { - dim += rank - specified_dims; - } else if (slice_item == Py_None) { - none_axes->push_back(dim + none_count); - none_count++; - } else if (PyList_Check(slice_item)) { - *list_select_flag = true; - PADDLE_ENFORCE_EQ( - size, 1, - platform::errors::InvalidArgument( - "When index contains a list, its length is excepted to 1, " - "but received %d", - size)); - bool all_bool = true; - int list_size = PyList_GET_SIZE(slice_item); - for (int j = 0; j < list_size; ++j) { - PyObject *list_item = PyList_GetItem(slice_item, j); - if (PyCheckInteger(list_item)) { - all_bool = false; - } else if (!PyBool_Check(list_item)) { - PADDLE_THROW(platform::errors::InvalidArgument( - "Only support int or bool in index list.")); - } - } - if (all_bool) { - PADDLE_ENFORCE_EQ( - list_size, shape[0], - platform::errors::InvalidArgument( - "The dimension of bool index doesn't match indexed array along " - "dimension 0, the target dimension is %d, but received %d.", - shape[0], list_size)); - - for (int j = 0; j < list_size; ++j) { - PyObject *list_item = PyList_GetItem(slice_item, j); - if (list_item == Py_True) { - list_select_idxs->push_back(j); - } - } - } else { - for (int j = 0; j < list_size; ++j) { - PyObject *list_item = PyList_GetItem(slice_item, j); - if (PyCheckInteger(list_item)) { - list_select_idxs->push_back( - static_cast(PyLong_AsLong(list_item))); - } else if (list_item == Py_True) { - list_select_idxs->push_back(1); - } else { - list_select_idxs->push_back(0); - } - } - } - - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Currently, Tensor.__indices__() only allows indexing " - "by Integers, Slices, Ellipsis, None, tuples of these types " - "and list of Bool and Integers, but received " - "%s in %dth slice item", - std::string(Py_TYPE(slice_item)->tp_name), i + 1)); - } - } - - // valid_index is the number of dimensions exclude None index - const int valid_indexs = size - none_axes->size() - ell_count; - PADDLE_ENFORCE_EQ(valid_indexs <= rank, true, - platform::errors::InvalidArgument( - "Too many indices (%d) for tensor of dimension %d.", - valid_indexs, rank)); -} - template static void VarBaseCopy(std::shared_ptr &src, // NOLINT imperative::VarBase &dst, // NOLINT diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 2d9272dd0ed27b7b6e129a8b4c1c689cd0442235..ffc42dc30edfb9d710db09b7d47406c7a604b1e3 100644 --- 
a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -80,6 +80,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/cuda_streams_py.h" #include "paddle/fluid/pybind/distributed_py.h" #include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/pybind/io.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/lod_utils.h" @@ -101,7 +102,6 @@ limitations under the License. */ #include "paddle/fluid/pybind/gloo_context_py.h" #include "paddle/fluid/pybind/gloo_wrapper_py.h" #include "paddle/fluid/pybind/heter_wrapper_py.h" -#include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/pybind/inference_api.h" #include "paddle/fluid/pybind/ir.h" #include "paddle/fluid/pybind/metrics_py.h" @@ -527,6 +527,7 @@ PYBIND11_MODULE(core_avx, m) { PYBIND11_MODULE(core_noavx, m) { #endif + BindImperative(&m); BindEager(&m); BindCudaStream(&m); @@ -741,8 +742,6 @@ PYBIND11_MODULE(core_noavx, m) { m.def("_promote_types_if_complex_exists", &paddle::framework::PromoteTypesIfComplexExists); - BindImperative(&m); - py::class_ framework_tensor(m, "Tensor", py::buffer_protocol()); g_framework_tensor_pytype = diff --git a/paddle/fluid/pybind/slice_utils.h b/paddle/fluid/pybind/slice_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..a037fa13eb53b94fd8d82413dad55d7f34b0006d --- /dev/null +++ b/paddle/fluid/pybind/slice_utils.h @@ -0,0 +1,294 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/scope_guard.h" +#include "paddle/fluid/operators/utils.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/phi/core/dense_tensor.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +static bool PyCheckTensor(PyObject* obj); +static Py_ssize_t GetSliceIndexFromPyObject(PyObject* obj); +// Slice related methods +static bool PyCheckInteger(PyObject* obj) { +#if PY_VERSION_HEX < 0x03000000 + return (PyLong_Check(obj) || PyInt_Check(obj)) && !PyBool_Check(obj); +#else + return PyLong_Check(obj) && !PyBool_Check(obj); +#endif +} + +static bool IsNumpyType(PyObject* obj) { + // It is not a good way to judge the type of obj by its type'name. Maybe using + // `PyArray_IsScalar` will be better. However, this interface cannot be used + // by including pybind11, and it needs to compile with numpy. 
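// Editorial sketch (not from the patch): the revised _PySlice_GetIndices that follows in
// this header normalizes slice fields the way CPython does -- defaults depend on the sign
// of step, negative start/stop are offset by the length, and the results are clamped. A
// standalone sketch of that arithmetic (simplified: plain integers, no error codes):
#include <algorithm>
#include <cassert>
#include <cstdint>

struct SliceBounds { int64_t start, stop; };

SliceBounds NormalizeSlice(int64_t length, int64_t step,
                           const int64_t* start_in, const int64_t* stop_in) {
  int64_t start, stop;
  if (start_in == nullptr) {
    start = step < 0 ? length - 1 : 0;
  } else {
    start = *start_in < 0 ? *start_in + length : *start_in;
    start = std::max<int64_t>(start, 0);
  }
  if (stop_in == nullptr) {
    stop = step < 0 ? -1 : length;
  } else {
    stop = (step > 0 && *stop_in < 0) ? *stop_in + length : *stop_in;
    stop = std::min(stop, length);
  }
  return {start, stop};
}

int main() {
  const int64_t neg_start = -3;
  auto b = NormalizeSlice(/*length=*/5, /*step=*/1, &neg_start, nullptr);  // x[-3:]
  assert(b.start == 2 && b.stop == 5);
  return 0;
}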
+ auto type_name = std::string(Py_TYPE(obj)->tp_name); + return type_name == "numpy.int64" || type_name == "numpy.longlong" || + type_name == "numpy.int32" || type_name == "numpy.int16"; +} + +static Py_ssize_t GetSliceIndexFromTensor(const phi::DenseTensor& tensor) { + if (tensor.numel() == 1) { + if (framework::TransToProtoVarType(tensor.type()) == + framework::proto::VarType::INT32) { + return static_cast(operators::GetValue(&tensor)); + } else if (framework::TransToProtoVarType(tensor.type()) == + framework::proto::VarType::INT64) { + return static_cast(operators::GetValue(&tensor)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Currently, the type of tensor in slice indices only allows " + "int32 and int64, please check the type of index tensor.")); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Currently, tensor in slice indices only allows 1 element, " + "but received %d.", + tensor.numel())); + } +} + +// NOTE(zhiqiu): Revised version of PySlice_GetIndices. From: +// https://github.com/python/cpython/blob/8d21aa21f2cbc6d50aab3f420bb23be1d081dac4/Objects/sliceobject.c#L103 +// Original PySlice_GetIndices return wrong result when +// slice_item contains long int, such as arr[:180L]. +// NOT sure why this happens !!! +// Besides, PySlice_GetIndices cannot raise error when float in slice item. +// So, I make a revised version of PySlice_GetIndices, named to +// _PySlice_GetIndices. Try to use _PySlice_Unpack which is more robust than +// PySlice_GetIndices in the future. +static int _PySlice_GetIndices(PySliceObject* r, Py_ssize_t length, + Py_ssize_t* start, Py_ssize_t* stop, + Py_ssize_t* step) { + /* XXX support long ints */ + if (r->step == Py_None) { + *step = 1; + } else { + if (PyCheckInteger(r->step) || IsNumpyType(r->step)) { + *step = PyLong_AsLong(r->step); + } else if (PyCheckTensor(r->step)) { + *step = GetSliceIndexFromPyObject(r->step); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Currently, slice indices only allows None, integers, " + "tensor(int) and numpy(int) in slice item, but received %s.", + std::string(Py_TYPE(r->step)->tp_name))); + } + } + if (r->start == Py_None) { + *start = *step < 0 ? length - 1 : 0; + } else { + if (PyCheckInteger(r->start) || IsNumpyType(r->start)) { + *start = PyLong_AsLong(r->start); + } else if (PyCheckTensor(r->start)) { + *start = GetSliceIndexFromPyObject(r->start); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Currently, slice indices only allows None, integers, " + "tensor(int) and numpy(int) in slice item, but received %s.", + std::string(Py_TYPE(r->start)->tp_name))); + } + if (*start < 0) *start += length; + *start = std::max(*start, static_cast(0)); + } + if (r->stop == Py_None) { + *stop = *step < 0 ? 
-1 : length; + } else { + if (PyCheckInteger(r->stop) || IsNumpyType(r->stop)) { + *stop = PyLong_AsLong(r->stop); + } else if (PyCheckTensor(r->stop)) { + *stop = GetSliceIndexFromPyObject(r->stop); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Currently, slice indices only allows None, integers, " + "tensor(int) and numpy(int) in slice item, but received %s.", + std::string(Py_TYPE(r->stop)->tp_name))); + } + if (0 < *step && *stop < 0) *stop += length; + *stop = std::min(*stop, length); + } + if (*stop > length) return -1; + if (*start >= length) return -1; + if (*step == 0) return -1; + return 0; +} + +static void ParseIndexingSlice( + framework::LoDTensor* tensor, PyObject* _index, + std::vector* slice_axes, std::vector* slice_starts, + std::vector* slice_ends, std::vector* slice_strides, + std::vector* decrease_axis, std::vector* none_axes, + std::vector* infer_flags, std::vector* list_select_idxs, + bool* list_select_flag) { + // We allow indexing by Integers, Slices, Ellipsis, None, tuples of those + // types, and list of Bool and Integers. + // wrap to tuple + + // NOTE(zhiqiu): PyTuple_Pack increases refcount. + PyObject* index = !PyTuple_Check(_index) ? PyTuple_Pack(1, _index) : _index; + DEFINE_PADDLE_SCOPE_GUARD([index, _index]() { + if (!PyTuple_Check(_index)) { + Py_DECREF(index); + VLOG(4) << "Call Py_DECREF"; + } + }); + PADDLE_ENFORCE_EQ( + tensor->IsInitialized(), true, + platform::errors::InvalidArgument("tensor has not been initialized")); + const auto& shape = tensor->dims(); + const int rank = shape.size(); + const int size = PyTuple_GET_SIZE(index); + + // specified_dims is the number of dimensions which indexed by Interger, + // Slices. + int specified_dims = 0; + int ell_count = 0; + for (int dim = 0; dim < size; ++dim) { + PyObject* slice_item = PyTuple_GetItem(index, dim); + if (PyCheckInteger(slice_item) || PySlice_Check(slice_item)) { + specified_dims++; + } else if (slice_item == Py_Ellipsis) { + ell_count++; + } + } + + PADDLE_ENFORCE_LE(ell_count, 1, + platform::errors::InvalidArgument( + "An index can only have a single ellipsis ('...')")); + int none_count = 0; + for (int i = 0, dim = 0; i < size; ++i) { + PyObject* slice_item = PyTuple_GetItem(index, i); + + infer_flags->push_back(1); + int dim_len = shape[dim]; + if (PyCheckInteger(slice_item) || IsNumpyType(slice_item)) { + // integer, PyLong_AsLong supports both int and long + int start = static_cast(PyLong_AsLong(slice_item)); + auto s_t = start; + start = start < 0 ? 
start + dim_len : start; + if (start >= dim_len || start < 0) { + std::string str_error_message = + "The starting index " + std::to_string(s_t) + + " of slice is out of bounds in tensor " + std::to_string(dim) + + "-th axis, it shound be in the range of [" + + std::to_string(-dim_len) + ", " + std::to_string(dim_len) + ")"; + // py::index_error is corresponding to IndexError in Python + // Used to indicate out of bounds access in __getitem__, __setitem__ + throw py::index_error(str_error_message); + } + slice_axes->push_back(dim); + slice_starts->push_back(start); + slice_ends->push_back(start + 1); + slice_strides->push_back(1); + decrease_axis->push_back(dim); + dim++; + } else if (PySlice_Check(slice_item)) { + // slice item + Py_ssize_t start, end, step; + PySliceObject* p = reinterpret_cast(slice_item); + _PySlice_GetIndices(p, dim_len, &start, &end, &step); + + // :: or : or 0:dim_len:1 + if (start == 0 && end == dim_len && step == 1) { + dim++; + continue; + } + slice_axes->push_back(dim); + slice_starts->push_back(start); + slice_ends->push_back(end); + slice_strides->push_back(step); + dim++; + } else if (slice_item == Py_Ellipsis) { + dim += rank - specified_dims; + } else if (slice_item == Py_None) { + none_axes->push_back(dim + none_count); + none_count++; + } else if (PyList_Check(slice_item)) { + *list_select_flag = true; + PADDLE_ENFORCE_EQ( + size, 1, + platform::errors::InvalidArgument( + "When index contains a list, its length is excepted to 1, " + "but received %d", + size)); + bool all_bool = true; + int list_size = PyList_GET_SIZE(slice_item); + for (int j = 0; j < list_size; ++j) { + PyObject* list_item = PyList_GetItem(slice_item, j); + if (PyCheckInteger(list_item)) { + all_bool = false; + } else if (!PyBool_Check(list_item)) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Only support int or bool in index list.")); + } + } + if (all_bool) { + PADDLE_ENFORCE_EQ( + list_size, shape[0], + platform::errors::InvalidArgument( + "The dimension of bool index doesn't match indexed array along " + "dimension 0, the target dimension is %d, but received %d.", + shape[0], list_size)); + + for (int j = 0; j < list_size; ++j) { + PyObject* list_item = PyList_GetItem(slice_item, j); + if (list_item == Py_True) { + list_select_idxs->push_back(j); + } + } + } else { + for (int j = 0; j < list_size; ++j) { + PyObject* list_item = PyList_GetItem(slice_item, j); + if (PyCheckInteger(list_item)) { + list_select_idxs->push_back( + static_cast(PyLong_AsLong(list_item))); + } else if (list_item == Py_True) { + list_select_idxs->push_back(1); + } else { + list_select_idxs->push_back(0); + } + } + } + + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Currently, Tensor.__indices__() only allows indexing " + "by Integers, Slices, Ellipsis, None, tuples of these types " + "and list of Bool and Integers, but received " + "%s in %dth slice item", + std::string(Py_TYPE(slice_item)->tp_name), i + 1)); + } + } + + // valid_index is the number of dimensions exclude None index + const int valid_indexs = size - none_axes->size() - ell_count; + PADDLE_ENFORCE_EQ(valid_indexs <= rank, true, + platform::errors::InvalidArgument( + "Too many indices (%d) for tensor of dimension %d.", + valid_indexs, rank)); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 5edb83f8c3fc01d198d3f63b64047b9e45cd747b..4f449c578bab00482fd91528496f4d8788f927b1 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ 
b/paddle/phi/api/lib/CMakeLists.txt @@ -32,6 +32,14 @@ set(bw_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/backward_api.cc) set(bw_api_header_file_tmp ${bw_api_header_file}.tmp) set(bw_api_source_file_tmp ${bw_api_source_file}.tmp) +# sparse api file +set(sparse_api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api_gen.py) +set(sparse_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml) +set(sparse_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/sparse_api.h) +set(sparse_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/sparse_api.cc) +set(sparse_api_header_file_tmp ${api_header_file}.tmp) +set(sparse_api_source_file_tmp ${api_source_file}.tmp) + # wrapped infermeta file set(wrapped_infermeta_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py) set(api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml) @@ -73,6 +81,19 @@ add_custom_command( DEPENDS ${bw_api_yaml_file} ${bw_api_gen_file} ${api_gen_base} VERBATIM) +# generate sparse api +add_custom_command( + OUTPUT ${sparse_api_header_file} ${sparse_api_source_file} + COMMAND ${PYTHON_EXECUTABLE} ${sparse_api_gen_file} + --api_yaml_path ${sparse_api_yaml_file} + --api_header_path ${sparse_api_header_file_tmp} + --api_source_path ${sparse_api_source_file_tmp} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_header_file_tmp} ${sparse_api_header_file} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_source_file_tmp} ${sparse_api_source_file} + COMMENT "copy_if_different ${sparse_api_header_file} ${sparse_sparse_api_source_file}" + DEPENDS ${sparse_api_yaml_file} ${sparse_api_gen_file} ${api_gen_base} + VERBATIM) + # generate wrapped infermeta add_custom_command( OUTPUT ${wrapped_infermeta_header_file} ${wrapped_infermeta_source_file} @@ -87,12 +108,14 @@ cc_library(op_meta_info SRCS op_meta_info.cc DEPS phi_tensor_raw) cc_library(wrapped_infermeta SRCS ${wrapped_infermeta_source_file} DEPS phi) cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS phi_tensor_raw phi_context kernel_factory) +cc_library(api_gen_utils SRCS api_gen_utils.cc DEPS phi_tensor_raw selected_rows sparse_csr_tensor sparse_coo_tensor) cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel data_device_transform) -cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform) +cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform) +cc_library(sparse_api_custom_impl SRCS sparse_api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform) -cc_library(sparse_api SRCS sparse_api.cc DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform) -cc_library(phi_function_api SRCS ${api_source_file} DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform api_custom_impl) -cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform) -cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch backward_infermeta phi_data_transform phi_function_api api_custom_impl) +cc_library(sparse_api SRCS sparse_api.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api_custom_impl) +cc_library(phi_function_api SRCS ${api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform api_custom_impl) 
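For a concrete picture of the C++ entry points this build wiring stitches together: the diff further below declares `to_sparse_coo_impl(const Tensor&, Backend, int64_t)`, `to_sparse_csr_impl(const Tensor&, Backend)` and `to_dense_impl(const Tensor&, Backend)` in `sparse_api_custom_impl.h`, and the generated `sparse_api.h` / `sparse_api.cc` wrap them. A rough usage sketch against the custom-impl signatures shown in this PR (treat the call site as illustrative only; whether these internal functions are meant to be called directly is an assumption here):

    #include "paddle/phi/api/include/tensor.h"
    #include "paddle/phi/api/lib/sparse_api_custom_impl.h"

    // Sketch: dense -> COO -> dense round trip on CPU using the impl entry points.
    paddle::experimental::Tensor SparseRoundTrip(
        const paddle::experimental::Tensor& dense) {
      namespace sparse = paddle::experimental::sparse;
      auto coo = sparse::to_sparse_coo_impl(
          dense, paddle::experimental::Backend::CPU, /*sparse_dim=*/2);
      return sparse::to_dense_impl(coo, paddle::experimental::Backend::CPU);
    }
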
+cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform) +cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta phi_data_transform phi_function_api api_custom_impl) cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api) diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index 19b113838eab5403aca00d9d97b278646228c512..fc1afb26bf4143e5c75398b3dc1042581e1f1546 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -14,8 +14,8 @@ limitations under the License. */ #include "paddle/phi/api/lib/api_custom_impl.h" +#include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/api_registry.h" -#include "paddle/phi/api/lib/api_utils.h" #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/utils/storage.h" diff --git a/paddle/phi/api/lib/api_utils.h b/paddle/phi/api/lib/api_gen_utils.cc similarity index 62% rename from paddle/phi/api/lib/api_utils.h rename to paddle/phi/api/lib/api_gen_utils.cc index 6c1fa97c0f52a697383a3526220cc758d778823d..f04e74b45fcd42cfeee860b05f52855ec15ef8f6 100644 --- a/paddle/phi/api/lib/api_utils.h +++ b/paddle/phi/api/lib/api_gen_utils.cc @@ -12,26 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once - -#include "paddle/phi/api/include/tensor.h" -#include "paddle/phi/api/lib/utils/storage.h" -#include "paddle/phi/core/compat/convert_utils.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/meta_tensor.h" -#include "paddle/phi/core/selected_rows.h" +#include "paddle/phi/api/lib/api_gen_utils.h" namespace paddle { namespace experimental { /* ------------------ for input ----------------------- */ -inline std::shared_ptr TensorToDenseTensor( - const Tensor& tensor) { +std::shared_ptr TensorToDenseTensor(const Tensor& tensor) { return std::dynamic_pointer_cast(tensor.impl()); } -inline std::shared_ptr TensorToDenseTensor( +std::shared_ptr TensorToDenseTensor( const paddle::optional& tensor) { if (tensor) { return std::dynamic_pointer_cast(tensor->impl()); @@ -39,7 +31,7 @@ inline std::shared_ptr TensorToDenseTensor( return nullptr; } -inline std::unique_ptr> TensorToDenseTensor( +std::unique_ptr> TensorToDenseTensor( const std::vector& tensors) { auto pt_tensors = std::make_unique>(); pt_tensors->reserve(tensors.size()); @@ -52,12 +44,11 @@ inline std::unique_ptr> TensorToDenseTensor( return std::move(pt_tensors); } -inline std::shared_ptr TensorToSelectedRows( - const Tensor& tensor) { +std::shared_ptr TensorToSelectedRows(const Tensor& tensor) { return std::dynamic_pointer_cast(tensor.impl()); } -inline std::shared_ptr TensorToSelectedRows( +std::shared_ptr TensorToSelectedRows( const paddle::optional& tensor) { if (tensor) { return std::dynamic_pointer_cast(tensor->impl()); @@ -67,11 +58,11 @@ inline std::shared_ptr TensorToSelectedRows( /* ----------------- for infer_meta --------------------- */ -inline phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor) { +phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor) { return phi::MetaTensor(tensor); } -inline paddle::optional MakeMetaTensor( +paddle::optional MakeMetaTensor( const paddle::optional& 
tensor) { if (tensor) { return {phi::MetaTensor(*tensor)}; @@ -79,7 +70,7 @@ inline paddle::optional MakeMetaTensor( return {paddle::none}; } -inline std::vector MakeMetaTensor( +std::vector MakeMetaTensor( const std::vector& tensors) { std::vector meta_tensors; meta_tensors.reserve(tensors.size()); @@ -89,11 +80,11 @@ inline std::vector MakeMetaTensor( return meta_tensors; } -inline phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor) { +phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor) { return phi::MetaTensor(tensor); } -inline paddle::optional MakeMetaTensor( +paddle::optional MakeMetaTensor( const paddle::optional& tensor) { if (tensor) { return {phi::MetaTensor(*tensor)}; @@ -103,7 +94,7 @@ inline paddle::optional MakeMetaTensor( /* ------------------ for output ----------------------- */ -inline phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) { +phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) { if (!out->initialized()) { auto dense_tensor = std::make_shared( phi::make_intrusive(phi::TransToPhiPlace(backend)), @@ -114,8 +105,9 @@ inline phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) { return static_cast(out->impl().get()); } -inline std::vector SetKernelOutput( - size_t out_size, Backend backend, std::vector* out) { +std::vector SetKernelOutput(size_t out_size, + Backend backend, + std::vector* out) { out->reserve(out_size); std::vector results(out_size); for (size_t i = 0; i < out_size; ++i) { @@ -129,8 +121,7 @@ inline std::vector SetKernelOutput( return results; } -inline phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend, - Tensor* out) { +phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend, Tensor* out) { if (!out->initialized()) { auto select_rows = std::make_shared(); out->set_impl(select_rows); @@ -139,5 +130,29 @@ inline phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend, return static_cast(out->impl().get()); } +phi::TensorBase* SetSparseKernelOutput(Tensor* out, TensorType type) { + if (!out->initialized()) { + if (type == TensorType::SPARSE_COO) { + auto sparse_tensor = std::make_shared( + phi::DenseTensor(), phi::DenseTensor(), phi::DDim{-1}); + out->set_impl(sparse_tensor); + return sparse_tensor.get(); + } else if (type == TensorType::SPARSE_CSR) { + auto sparse_tensor = + std::make_shared(phi::DenseTensor(), + phi::DenseTensor(), + phi::DenseTensor(), + phi::DDim{-1}); + out->set_impl(sparse_tensor); + return sparse_tensor.get(); + } else { + auto dense_tensor = std::make_shared(); + out->set_impl(dense_tensor); + return dense_tensor.get(); + } + } + return out->impl().get(); +} + } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/api_gen_utils.h b/paddle/phi/api/lib/api_gen_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..109c6e7ab71f5f889e63c410ee84aaad6c6b8110 --- /dev/null +++ b/paddle/phi/api/lib/api_gen_utils.h @@ -0,0 +1,74 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/api/lib/utils/storage.h" +#include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/meta_tensor.h" +#include "paddle/phi/core/selected_rows.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" + +namespace paddle { +namespace experimental { + +enum class TensorType { DENSE_TENSOR, SPARSE_CSR, SPARSE_COO }; + +/* ------------------ for input ----------------------- */ + +std::shared_ptr TensorToDenseTensor(const Tensor& tensor); + +std::shared_ptr TensorToDenseTensor( + const paddle::optional& tensor); + +std::unique_ptr> TensorToDenseTensor( + const std::vector& tensors); + +std::shared_ptr TensorToSelectedRows(const Tensor& tensor); + +std::shared_ptr TensorToSelectedRows( + const paddle::optional& tensor); + +/* ----------------- for infer_meta --------------------- */ + +phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor); + +paddle::optional MakeMetaTensor( + const paddle::optional& tensor); + +std::vector MakeMetaTensor( + const std::vector& tensors); + +phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor); + +paddle::optional MakeMetaTensor( + const paddle::optional& tensor); + +/* ------------------ for output ----------------------- */ + +phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out); + +std::vector SetKernelOutput(size_t out_size, + Backend backend, + std::vector* out); + +phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend, Tensor* out); + +phi::TensorBase* SetSparseKernelOutput(Tensor* out, TensorType type); + +} // namespace experimental +} // namespace paddle diff --git a/paddle/phi/api/lib/sparse_api.cc b/paddle/phi/api/lib/sparse_api_custom_impl.cc similarity index 86% rename from paddle/phi/api/lib/sparse_api.cc rename to paddle/phi/api/lib/sparse_api_custom_impl.cc index 9e1f59c0aa74329b15efcbff123b137fbf0b1360..832c19361e5eb03419fe988c9a30304b5993afdf 100644 --- a/paddle/phi/api/lib/sparse_api.cc +++ b/paddle/phi/api/lib/sparse_api_custom_impl.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/api/include/sparse_api.h" +#include "paddle/phi/api/lib/sparse_api_custom_impl.h" #include #include "glog/logging.h" @@ -20,31 +20,14 @@ limitations under the License. 
*/ #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/infermeta/unary.h" - -PD_DECLARE_KERNEL(dense_to_sparse_coo, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(sparse_csr_to_coo, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(dense_to_sparse_csr, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(sparse_coo_to_csr, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(sparse_coo_to_dense, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(sparse_csr_to_dense, CPU, ALL_LAYOUT); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_DECLARE_KERNEL(dense_to_sparse_coo, GPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(sparse_csr_to_coo, GPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(dense_to_sparse_csr, GPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(sparse_coo_to_csr, GPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(sparse_coo_to_dense, GPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(sparse_csr_to_dense, GPU, ALL_LAYOUT); -#endif namespace paddle { namespace experimental { namespace sparse { -PADDLE_API Tensor to_sparse_coo(const Tensor& x, - Backend backend, - const int64_t sparse_dim) { +Tensor to_sparse_coo_impl(const Tensor& x, + Backend backend, + const int64_t sparse_dim) { if (x.layout() == phi::DataLayout::SPARSE_COO) { return x; } @@ -105,7 +88,7 @@ PADDLE_API Tensor to_sparse_coo(const Tensor& x, return out; } -PADDLE_API Tensor to_sparse_csr(const Tensor& x, Backend backend) { +Tensor to_sparse_csr_impl(const Tensor& x, Backend backend) { if (x.layout() == phi::DataLayout::SPARSE_CSR) { return x; } @@ -171,7 +154,7 @@ PADDLE_API Tensor to_sparse_csr(const Tensor& x, Backend backend) { return out; } -PADDLE_API Tensor to_dense(const Tensor& x, Backend backend) { +Tensor to_dense_impl(const Tensor& x, Backend backend) { if (x.layout() != phi::DataLayout::SPARSE_CSR && x.layout() != phi::DataLayout::SPARSE_COO) { return x; diff --git a/paddle/phi/api/include/sparse_api.h b/paddle/phi/api/lib/sparse_api_custom_impl.h similarity index 74% rename from paddle/phi/api/include/sparse_api.h rename to paddle/phi/api/lib/sparse_api_custom_impl.h index a131804cd6f582c01586671a21851066910b21d4..293b2cfa3d33480ccccd0f601f8e15c639b93e1e 100644 --- a/paddle/phi/api/include/sparse_api.h +++ b/paddle/phi/api/lib/sparse_api_custom_impl.h @@ -21,13 +21,13 @@ namespace paddle { namespace experimental { namespace sparse { -PADDLE_API Tensor to_sparse_coo(const Tensor& x, - Backend backend, - const int64_t sparse_dim); +Tensor to_dense_impl(const Tensor& x, Backend backend); -PADDLE_API Tensor to_sparse_csr(const Tensor& x, Backend backend); +Tensor to_sparse_coo_impl(const Tensor& x, + Backend backend, + const int64_t sparse_dim); -PADDLE_API Tensor to_dense(const Tensor& x, Backend backend); +Tensor to_sparse_csr_impl(const Tensor& x, Backend backend); } // namespace sparse } // namespace experimental diff --git a/paddle/phi/core/sparse_coo_tensor.h b/paddle/phi/core/sparse_coo_tensor.h index 0dd5d543414fee444ee28994fcfd78fcdeee9e18..ca3290f33e61eb730d3f17a0b8cc72cbf1c0db58 100644 --- a/paddle/phi/core/sparse_coo_tensor.h +++ b/paddle/phi/core/sparse_coo_tensor.h @@ -145,6 +145,7 @@ class SparseCooTensor : public TensorBase, void* AllocateFrom(Allocator* allocator, DataType dtype, size_t requested_size = 0) override; + void set_dims(const DDim& dims) { this->dims_ = dims; } private: // save the indices of non zero elements in original dense tensor diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 
675e68af74339b508f589a55a9c3cf3aed37cecb..7682f6b3d49b9281f4fabef26137a7db1a5b6126 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -348,4 +348,17 @@ void BCELossInferMeta(const MetaTensor& input, out->share_lod(input); } +void GatherTreeMeta(const MetaTensor& ids, + const MetaTensor& parents, + MetaTensor* out) { + auto ids_dims = ids.dims(); + auto parents_dims = parents.dims(); + PADDLE_ENFORCE_EQ(ids_dims == parents_dims, + true, + phi::errors::InvalidArgument( + "The shape of Input(Parents) must be same with the " + "shape of Input(Ids).")); + out->set_dims(ids_dims); +} + } // namespace phi diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index a0140c9a5799f79af541b45847d5e44f982a3f58..5906e06b2935504babf993b657dbded403348175 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -68,4 +68,8 @@ void BCELossInferMeta(const MetaTensor& input, const MetaTensor& label, MetaTensor* out, MetaConfig config = MetaConfig()); + +void GatherTreeMeta(const MetaTensor& ids, + const MetaTensor& parents, + MetaTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/batch_norm_grad_kernel.h b/paddle/phi/kernels/batch_norm_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..c15dbd2f63f588e0bb20ae41a08146e3ac781458 --- /dev/null +++ b/paddle/phi/kernels/batch_norm_grad_kernel.h @@ -0,0 +1,90 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
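For readers of the batch-norm gradient kernels declared just below: the CPU implementation later in this diff computes the standard batch-norm backward pass (the same formulas appear there as numpy-style comments). Restated compactly as a reference, with x_hat = (x - mu) / sqrt(var + eps), y = gamma * x_hat + beta, and N_e = N * H * W elements per channel:

    \frac{\partial L}{\partial \beta}  = \sum dy, \qquad
    \frac{\partial L}{\partial \gamma} = \sum dy\,\hat{x},

    \frac{\partial L}{\partial x} =
        \frac{\gamma}{N_e\sqrt{\sigma^2+\varepsilon}}
        \left(N_e\,dy - \sum dy - \hat{x}\sum dy\,\hat{x}\right)
        \quad \text{(training)},

    \frac{\partial L}{\partial x} =
        \frac{\gamma\,dy}{\sqrt{\sigma^2+\varepsilon}}
        \quad \text{(use\_global\_stats)}.
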
+ +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BatchNormGradRawKernel(const Context& dev_ctx, + const DenseTensor& y_grad, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + paddle::optional reserve_space, + paddle::optional mean, + paddle::optional variance, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + bool is_inplace, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad); + +template +void BatchNormGradKernel(const Context& dev_ctx, + const DenseTensor& y_grad, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + paddle::optional reserve_space, + paddle::optional mean, + paddle::optional variance, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad); + +template +void BatchNormDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x_grad_grad, + const DenseTensor& scale_grad_grad, + const DenseTensor& bias_grad_grad, + const DenseTensor& y_grad, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + paddle::optional mean, + paddle::optional variance, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* y_grad_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/batch_norm_kernel.h b/paddle/phi/kernels/batch_norm_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..7ddf32e27c7d73a7249d92f7835afdf6b8f3ed5a --- /dev/null +++ b/paddle/phi/kernels/batch_norm_kernel.h @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
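For the forward kernel declared just below: in training mode the CPU implementation later in this diff computes per-channel batch statistics, normalizes with them, and folds them into the running estimates, while in test / use_global_stats mode the supplied running mean and variance are used instead. As a compact reference, with N_e = N * H * W elements per channel:

    \mu_c = \frac{1}{N_e}\sum_i x_{i,c}, \qquad
    \sigma_c^2 = \frac{1}{N_e}\sum_i (x_{i,c}-\mu_c)^2,

    y_{i,c} = \gamma_c\,\frac{x_{i,c}-\mu_c}{\sqrt{\sigma_c^2+\varepsilon}} + \beta_c,

    \text{running}_c \leftarrow \text{momentum}\cdot\text{running}_c
        + (1-\text{momentum})\cdot\text{batch}_c .
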
+ +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BatchNormKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& mean, + const DenseTensor& variance, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor* y, + DenseTensor* mean_out, + DenseTensor* variance_out, + DenseTensor* saved_mean, + DenseTensor* saved_variance, + DenseTensor* reserve_space); + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..de2343a384a5b413591fed981dc03e97bbb89fed --- /dev/null +++ b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc @@ -0,0 +1,674 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/batch_norm_kernel.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/gpu/batch_norm_utils.h" + +namespace phi { + +template +using EigenArrayMap = + Eigen::Map>; +template +using ConstEigenArrayMap = + Eigen::Map>; +template +using EigenVectorArrayMap = Eigen::Map>; +template +using ConstEigenVectorArrayMap = + Eigen::Map>; + +template +void BatchNormGradRawKernel(const Context& ctx, + const DenseTensor& y_grad, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + paddle::optional reserve_space, + paddle::optional mean, + paddle::optional variance, + float momentum, + float epsilon, + const std::string& data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + bool is_inplace, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad) { + const auto* d_y = &y_grad; + + DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + + auto* d_x = x_grad; + auto* d_scale = scale_grad; + auto* d_bias = bias_grad; + + use_global_stats = is_test || use_global_stats; + + // batch_norm with inplace as false will take X as grad input, which + // is same as cuDNN batch_norm backward calculation, batch_norm + // with inplace as true only take Y as input and X should be calculate + // by inverse operation of batch_norm on Y + + if (is_inplace) { + if (d_x) { + PADDLE_ENFORCE_EQ(d_x, + d_y, + phi::errors::InvalidArgument( + "X@GRAD and Y@GRAD inplaced in non-inplace mode")); + } + } else { + if (d_x) { + PADDLE_ENFORCE_NE(d_x, + d_y, + phi::errors::InvalidArgument( + "X@GRAD and Y@GRAD inplaced in non-inplace 
mode")); + } + } + + // Get the size for each dimension. + // NCHW [batch_size, in_channels, in_height, in_width] + const auto& x_dims = x.dims(); + PADDLE_ENFORCE_GE( + x_dims.size(), + 2, + phi::errors::InvalidArgument( + "The size of input X's dimensions should be larger than 1." + "But received: the size of input X's dimensions is [%d]", + x_dims.size())); + PADDLE_ENFORCE_LE( + x_dims.size(), + 5, + phi::errors::InvalidArgument( + "The size of input X's dimensions should be less than 6." + "But received: the size of input X's dimensions is [%d]", + x_dims.size())); + const int N = x_dims[0]; + const int C = (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); + const int sample_size = x.numel() / N / C; + + // input dimension is 2 and the format is NCHW. The input can be regarded as + // NHWC format + if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { + data_layout = DataLayout::kNHWC; + } + + // init output + if (d_x) { + ctx.template Alloc(d_x); + } + + const T* mean_data = saved_mean.data(); + const T* inv_var_data = saved_variance.data(); + DenseTensor inv_var_tensor; + if (use_global_stats) { + const auto* running_mean = mean.get_ptr(); + const auto* running_variance = variance.get_ptr(); + mean_data = running_mean->data(); + inv_var_tensor.Resize({C}); + T* running_inv_var_data = ctx.template Alloc(&inv_var_tensor); + EigenVectorArrayMap inv_var_tmp(running_inv_var_data, C); + ConstEigenVectorArrayMap var_arr(running_variance->data(), C); + + inv_var_tmp = (var_arr + epsilon).sqrt().inverse(); + inv_var_data = running_inv_var_data; + } + + ConstEigenVectorArrayMap scale_arr(scale.data(), C); + ConstEigenVectorArrayMap bias_arr(bias.data(), C); + ConstEigenVectorArrayMap mean_arr(mean_data, C); + ConstEigenVectorArrayMap inv_var_arr(inv_var_data, C); + + T* d_bias_data = nullptr; + T* d_scale_data = nullptr; + if (d_scale && d_bias) { + d_bias_data = ctx.template Alloc(d_bias); + d_scale_data = ctx.template Alloc(d_scale); + } + + // d_bias = np.sum(d_y, axis=0) + // d_scale = np.sum((X - mean) / inv_std * dy, axis=0) + // d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0) + // - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0)) + EigenVectorArrayMap d_bias_arr(d_bias_data, C); + EigenVectorArrayMap d_scale_arr(d_scale_data, C); + + if (d_scale && d_bias) { + d_bias_arr.setZero(); + d_scale_arr.setZero(); + } + + if (d_x && (N * sample_size) == 1 && !use_global_stats) { + paddle::framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); + return; + } + + int scale_coefff = use_global_stats ? 
1 : N * sample_size; + const auto scale_inv_var_nhw = scale_arr * inv_var_arr / scale_coefff; + + DenseTensor dy_sum; + dy_sum.Resize({C}); + auto dy_sum_data = ctx.template Alloc(&dy_sum); + EigenVectorArrayMap dy_sum_arr(dy_sum_data, C); + + DenseTensor dy_mul_x_sub_mean_mul_invstd_sum; + dy_mul_x_sub_mean_mul_invstd_sum.Resize({C}); + auto dy_mul_x_sub_mean_mul_invstd_sum_data = + ctx.template Alloc(&dy_mul_x_sub_mean_mul_invstd_sum); + EigenVectorArrayMap dy_mul_x_sub_mean_mul_invstd_sum_arr( + dy_mul_x_sub_mean_mul_invstd_sum_data, C); + + dy_sum_arr.setZero(); + dy_mul_x_sub_mean_mul_invstd_sum_arr.setZero(); + + // inplace calculation + // Y: ((x - est_mean) * (inv_var) * scale + bias + // formula transform ====> + // (x * inv_var * scale) + (bias - est_mean * inv_var * scale) + // X: (y - bias) / scale / (inv_var) + est_mean + // formula transform ====> + // (y - bias) / (scale * inv_var) + est_mean + switch (data_layout) { + case DataLayout::kNCHW: { + if (is_inplace) { + auto px = x; + EigenArrayMap x_data(ctx.template Alloc(&px), sample_size, N * C); + ConstEigenArrayMap y_data(x.data(), sample_size, N * C); + for (int nc = 0; nc < N * C; ++nc) { + x_data.col(nc) = (y_data.col(nc) - bias_arr(nc % C)) / + scale_inv_var_nhw(nc % C) / scale_coefff + + mean_arr(nc % C); + } + } + ConstEigenArrayMap x_arr(x.data(), sample_size, N * C); + ConstEigenArrayMap d_y_arr(d_y->data(), sample_size, N * C); + + for (int nc = 0; nc < N * C; ++nc) { + int c = nc % C; + dy_sum_arr(c) += d_y_arr.col(nc).sum(); + dy_mul_x_sub_mean_mul_invstd_sum_arr(c) += + ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc)) + .sum(); + } + + if (d_scale && d_bias) { + d_bias_arr = dy_sum_arr; + d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr; + } + + if (d_x) { + EigenArrayMap d_x_arr( + ctx.template Alloc(d_x), sample_size, N * C); + if (!use_global_stats) { + for (int nc = 0; nc < N * C; ++nc) { + int c = nc % C; + d_x_arr.col(nc) = + scale_inv_var_nhw(c) * + (d_y_arr.col(nc) * N * sample_size - dy_sum_arr(c) - + (x_arr.col(nc) - mean_arr[c]) * + dy_mul_x_sub_mean_mul_invstd_sum_arr(c) * inv_var_arr(c)); + } + } else { + for (int nc = 0; nc < N * C; ++nc) { + int c = nc % C; + d_x_arr.col(nc) = scale_inv_var_nhw(c) * d_y_arr.col(nc); + } + } + } + break; + } + case DataLayout::kNHWC: { + if (is_inplace) { + auto px = x; + EigenArrayMap x_data(ctx.template Alloc(&px), C, N * sample_size); + ConstEigenArrayMap y_data(x.data(), C, N * sample_size); + for (int nhw = 0; nhw < N * sample_size; nhw++) { + x_data.col(nhw) = + (y_data.col(nhw) - bias_arr) / scale_inv_var_nhw / scale_coefff + + mean_arr; + } + } + ConstEigenArrayMap x_arr(x.data(), C, N * sample_size); + ConstEigenArrayMap d_y_arr(d_y->data(), C, N * sample_size); + + for (int nhw = 0; nhw < N * sample_size; ++nhw) { + dy_sum_arr += d_y_arr.col(nhw); + dy_mul_x_sub_mean_mul_invstd_sum_arr += + (x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw); + } + + if (d_scale && d_bias) { + d_bias_arr = dy_sum_arr; + d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr; + } + + if (d_x) { + EigenArrayMap d_x_arr( + ctx.template Alloc(d_x), C, N * sample_size); + if (!use_global_stats) { + for (int nhw = 0; nhw < N * sample_size; ++nhw) { + d_x_arr.col(nhw) = + scale_inv_var_nhw * + (d_y_arr.col(nhw) * N * sample_size - dy_sum_arr - + (x_arr.col(nhw) - mean_arr) * + dy_mul_x_sub_mean_mul_invstd_sum_arr * inv_var_arr); + } + } else { + for (int nhw = 0; nhw < N * sample_size; ++nhw) { + d_x_arr.col(nhw) = scale_inv_var_nhw * d_y_arr.col(nhw); 
+ } + } + } + break; + } + default: + PADDLE_THROW(phi::errors::InvalidArgument("Unknown storage order: %s", + data_layout_str)); + } +} + +template +void BatchNormGradKernel(const Context& dev_ctx, + const DenseTensor& y_grad, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + paddle::optional reserve_space, + paddle::optional mean, + paddle::optional variance, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad) { + BatchNormGradRawKernel(dev_ctx, + y_grad, + x, + scale, + bias, + saved_mean, + saved_variance, + reserve_space, + mean, + variance, + momentum, + epsilon, + data_layout, + is_test, + use_global_stats, + trainable_statistics, + fuse_with_relu, + false, + x_grad, + scale_grad, + bias_grad); +} + +template +void BatchNormDoubleGradKernel(const Context& ctx, + const DenseTensor& x_grad_grad, + const DenseTensor& scale_grad_grad, + const DenseTensor& bias_grad_grad, + const DenseTensor& y_grad, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + paddle::optional mean, + paddle::optional variance, + float momentum, + float epsilon, + const std::string& data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* y_grad_grad) { + const auto* X = &x; + const auto* Scale = &scale; + const auto* dY = &y_grad; + const auto* Saved_mean = &saved_mean; + const auto* Saved_variance = &saved_variance; + + PADDLE_ENFORCE_EQ(is_test, + false, + phi::errors::InvalidArgument( + "`is_test = True` CANNOT be used in train program. If " + "you want to use global status in pre_train model, " + "please set `use_global_stats = True`")); + + const auto data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + + const auto* ddX = &x_grad_grad; + const auto* ddScale = &scale_grad_grad; + const auto* ddBias = &bias_grad_grad; + + auto* dX = x_grad; + auto* dScale = scale_grad; + auto* ddY = y_grad_grad; + ctx.template Alloc(dX); + ctx.template Alloc(ddY); + + const auto& x_dims = X->dims(); + const int C = (data_layout == DataLayout::kNCHW ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); + const int sample_size = X->numel() / C; + phi::funcs::SetConstant set_constant; + + const T* mean_data = Saved_mean->data(); + const T* inv_var_data = Saved_variance->data(); + + DenseTensor inv_var_tensor; + if (use_global_stats) { + const auto* running_mean = mean.get_ptr(); + const auto* running_variance = variance.get_ptr(); + mean_data = running_mean->data(); + inv_var_tensor.Resize({C}); + + T* running_inv_var_data = ctx.template Alloc(&inv_var_tensor); + EigenVectorArrayMap inv_var_tmp(running_inv_var_data, C); + ConstEigenVectorArrayMap var_arr(running_variance->data(), C); + + inv_var_tmp = (var_arr + epsilon).sqrt().inverse(); + inv_var_data = running_inv_var_data; + } + + // transpose NCHW -> NHWC for easy calculate + DenseTensor transformed_x(X->type()); + DenseTensor transformed_dy(dY->type()); + DenseTensor transformed_ddx(ddX->type()); + + DenseTensor transformed_dx(dX->type()); + DenseTensor transformed_ddy(ddY->type()); + if (data_layout == DataLayout::kNCHW && x_dims.size() > 2) { + VLOG(3) << "Transform batchnorm output from NCHW to NHWC"; + // Input Tensor + ResizeToChannelLast(ctx, X, &transformed_x); + TransToChannelLast(ctx, X, &transformed_x); + ResizeToChannelLast(ctx, dY, &transformed_dy); + TransToChannelLast(ctx, dY, &transformed_dy); + ResizeToChannelLast(ctx, ddX, &transformed_ddx); + TransToChannelLast(ctx, ddX, &transformed_ddx); + // Output Tensor + ResizeToChannelLast(ctx, dX, &transformed_dx); + ResizeToChannelLast(ctx, ddY, &transformed_ddy); + } else { + transformed_x.ShareDataWith(*X); + transformed_dy.ShareDataWith(*dY); + transformed_ddx.ShareDataWith(*ddX); + + transformed_dx.ShareDataWith(*dX); + transformed_ddy.ShareDataWith(*ddY); + } + + ConstEigenArrayMap x_arr(transformed_x.data(), C, sample_size); + ConstEigenVectorArrayMap mean_arr(mean_data, C); + ConstEigenVectorArrayMap inv_var_arr(inv_var_data, C); + + Tensor mean_tile; + mean_tile.Resize({C, sample_size}); + EigenArrayMap mean_tile_data( + ctx.template Alloc(&mean_tile), C, sample_size); + + DenseTensor inv_var_tile; + inv_var_tile.Resize({C, sample_size}); + EigenArrayMap inv_var_tile_data( + ctx.template Alloc(&inv_var_tile), C, sample_size); + + mean_tile_data = mean_arr.replicate(1, sample_size); + inv_var_tile_data = inv_var_arr.replicate(1, sample_size); + + DenseTensor Scale_data; + if (!Scale) { + Scale_data.Resize({C}); + ctx.template Alloc(&Scale_data); + set_constant(ctx, &Scale_data, static_cast(1)); + } + ConstEigenVectorArrayMap scale_arr( + Scale ? 
Scale->data() : Scale_data.data(), C); + + Tensor scale_tile; + scale_tile.Resize({C, sample_size}); + EigenArrayMap scale_tile_data( + ctx.template Alloc(&scale_tile), C, sample_size); + scale_tile_data = scale_arr.replicate(1, sample_size); + + ConstEigenArrayMap dy_arr(transformed_dy.data(), C, sample_size); + ConstEigenArrayMap ddx_arr(transformed_ddx.data(), C, sample_size); + + DenseTensor x_sub_mean_mul_invstd; + x_sub_mean_mul_invstd.Resize({C, sample_size}); + + EigenArrayMap x_sub_mean_mul_invstd_arr( + ctx.template Alloc(&x_sub_mean_mul_invstd), C, sample_size); + x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data; + + if (dX) { + ctx.template Alloc(dX); + EigenArrayMap dx_arr( + ctx.template Alloc(&transformed_dx), C, sample_size); + dx_arr.setZero(); + if (use_global_stats) { + // math: dx = (ddscale * dy) * inv_var + if (ddScale) { + ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); + Tensor ddscale_tile; + ddscale_tile.Resize({C, sample_size}); + EigenArrayMap ddscale_tile_data( + ctx.template Alloc(&ddscale_tile), C, sample_size); + ddscale_tile_data = ddscale_arr.replicate(1, sample_size); + + dx_arr = dy_arr * ddscale_tile_data * inv_var_tile_data; + } + } else { + // math: dx = scale * ((x - mean) * inv_var / NxHxW * (np.mean(ddx, + // axis=(n,h,w)) * + // np.sum(dy, axis=(n,h,w)) - + // np.sum(dy * ddx, axis=(n,h,w)) + 3 * np.mean(dy * (x - + // mean), + // axis=(n,h,w)) * inv_var.pow(2) * + // np.sum(ddx * (x - mean), axis=(n,h,w))) + inv_var.pow(3) / + // NxHxW * + // np.sum(ddx * (x - mean)) * + // (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW * + // np.sum(dy, + // axis=(n,h,w)) * (x - mean) * + // (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var - + // inv_var + // * + // np.mean(dy, axis=(n,h,w)) - + // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), + // axis=(n,h,w))) + + if (ddX) { + dx_arr += + (x_sub_mean_mul_invstd_arr * inv_var_tile_data * inv_var_tile_data / + sample_size) + .colwise() * + (ddx_arr.rowwise().sum() * dy_arr.rowwise().sum() / sample_size - + (dy_arr * ddx_arr).rowwise().sum() + + 3. 
* (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() * + (ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / + sample_size); + + dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() * + (ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / + sample_size * (dy_arr.rowwise().sum() / sample_size - dy_arr); + + dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() * + (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / + sample_size * + (ddx_arr.rowwise().sum() / sample_size - ddx_arr); + + dx_arr = scale_tile_data * dx_arr; + } + if (ddScale) { + ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); + Tensor ddscale_tile; + ddscale_tile.Resize({C, sample_size}); + EigenArrayMap ddscale_tile_data( + ctx.template Alloc(&ddscale_tile), C, sample_size); + ddscale_tile_data = ddscale_arr.replicate(1, sample_size); + + dx_arr += + (dy_arr * inv_var_tile_data - + (dy_arr.rowwise().sum().replicate(1, sample_size) / sample_size) * + inv_var_tile_data - + x_sub_mean_mul_invstd_arr * inv_var_tile_data * + (dy_arr * x_sub_mean_mul_invstd_arr) + .rowwise() + .sum() + .replicate(1, sample_size) / + sample_size) * + ddscale_tile_data; + } + } + if (data_layout == DataLayout::kNCHW) { + VLOG(3) << "Transform batchnorm output from NHWC to NCHW"; + TransToChannelFirst(ctx, &transformed_dx, dX); + } + } + if (dScale) { + EigenVectorArrayMap dscale_arr(ctx.template Alloc(dScale), C); + dscale_arr.setZero(); + if (use_global_stats) { + // math: dscale = np.sum(ddx * dy, axis=(n,h,w)) * inv_var + if (ddX) { + dscale_arr = (ddx_arr * dy_arr * inv_var_tile_data).rowwise().sum(); + } + } else { + // math: dscale = inv_var * (dy - np.mean(dy, axis=(n,h,w) - (x-mean) * + // inv_var.pow(2) * np.mean(dy * (x-mean), axis=(n,h,w)))) * + // ddx + if (ddX) { + Tensor first_grad; + first_grad.Resize({C, sample_size}); + EigenArrayMap first_grad_arr( + ctx.template Alloc(&first_grad), C, sample_size); + first_grad_arr.setZero(); + + first_grad_arr += + inv_var_tile_data * + (dy_arr - + dy_arr.rowwise().sum().replicate(1, sample_size) / sample_size - + x_sub_mean_mul_invstd_arr * + (dy_arr * x_sub_mean_mul_invstd_arr) + .rowwise() + .sum() + .replicate(1, sample_size) / + sample_size); + dscale_arr = (first_grad_arr * ddx_arr).rowwise().sum(); + } + } + } + + if (ddY) { + ctx.template Alloc(ddY); + EigenArrayMap ddy_arr( + ctx.template Alloc(&transformed_ddy), C, sample_size); + ddy_arr.setZero(); + if (use_global_stats) { + // math: ddy = r * ddx * inv_var + ddbias + + // ddscale * (x - mean) * inv_var + if (ddX) { + ddy_arr = scale_tile_data * ddx_arr * inv_var_tile_data; + } + } else { + // math: ddy = (x - mean) * inv_var * ddscale + ddbias + + // scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) * + // np.mean(ddx * (x - mean), axis=(n,h,w))) + if (ddX) { + ddy_arr += + scale_tile_data * inv_var_tile_data * + (ddx_arr - + ddx_arr.rowwise().sum().replicate(1, sample_size) / sample_size - + x_sub_mean_mul_invstd_arr * + (ddx_arr * x_sub_mean_mul_invstd_arr) + .rowwise() + .sum() + .replicate(1, sample_size) / + sample_size); + } + } + if (ddScale) { + ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); + Tensor ddscale_tile; + ddscale_tile.Resize({C, sample_size}); + EigenArrayMap ddscale_tile_data( + ctx.template Alloc(&ddscale_tile), C, sample_size); + ddscale_tile_data = ddscale_arr.replicate(1, sample_size); + + ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data; + } + + if (ddBias) { + ConstEigenVectorArrayMap ddbias_arr(ddBias->data(), C); + Tensor ddbias_tile; + 
ddbias_tile.Resize({C, sample_size}); + EigenArrayMap ddbias_tile_data( + ctx.template Alloc(&ddbias_tile), C, sample_size); + ddbias_tile_data = ddbias_arr.replicate(1, sample_size); + + ddy_arr += ddbias_tile_data; + } + + if (data_layout == DataLayout::kNCHW) { + VLOG(3) << "Transform batchnorm output from NHWC to NCHW"; + TransToChannelFirst(ctx, &transformed_ddy, ddY); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + batch_norm_grad, CPU, ALL_LAYOUT, phi::BatchNormGradKernel, float, double) { +} + +PD_REGISTER_KERNEL(batch_norm_grad_raw, + CPU, + ALL_LAYOUT, + phi::BatchNormGradRawKernel, + float, + double) {} + +PD_REGISTER_KERNEL(batch_norm_grad_grad, + CPU, + ALL_LAYOUT, + phi::BatchNormDoubleGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/batch_norm_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..743128e8dea99296def25f79c47bbcfda8c65f40 --- /dev/null +++ b/paddle/phi/kernels/cpu/batch_norm_kernel.cc @@ -0,0 +1,204 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/batch_norm_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +#include "paddle/fluid/framework/tensor_util.h" + +namespace phi { + +template +using EigenArrayMap = + Eigen::Map>; +template +using ConstEigenArrayMap = + Eigen::Map>; +template +using EigenVectorArrayMap = Eigen::Map>; +template +using ConstEigenVectorArrayMap = + Eigen::Map>; + +template +void BatchNormKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& mean, + const DenseTensor& variance, + float momentum, + float epsilon, + const std::string& data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor* y, + DenseTensor* mean_out, + DenseTensor* variance_out, + DenseTensor* saved_mean, + DenseTensor* saved_variance, + DenseTensor* reserve_space) { + bool test_mode = is_test && (!trainable_statistics); + + bool global_stats = test_mode || use_global_stats; + + auto data_layout = paddle::framework::StringToDataLayout(data_layout_str); + + const auto& x_dims = x.dims(); + PADDLE_ENFORCE_GE( + x_dims.size(), + 2, + phi::errors::InvalidArgument( + "The size of input X's dimensions should be larger than 1." + "But received: the size of input X's dimensions is [%d]", + x_dims.size())); + PADDLE_ENFORCE_LE( + x_dims.size(), + 5, + phi::errors::InvalidArgument( + "The size of input X's dimensions should be less than 6." + "But received: the size of input X's dimensionss is [%d]", + x_dims.size())); + const int N = x_dims[0]; + const int C = (data_layout == DataLayout::kNCHW ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); + const int sample_size = x.numel() / N / C; + + // alloc memory + ctx.template Alloc(y); + ctx.template Alloc(mean_out); + ctx.template Alloc(variance_out); + ctx.template Alloc(saved_mean); + ctx.template Alloc(saved_variance); + + // input dimension is 2 and the format is NCHW. The input can be regarded + // as NHWC format + if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { + data_layout = DataLayout::kNHWC; + } + + if (!global_stats) { + // saved_xx is use just in this batch of data + EigenVectorArrayMap saved_mean_e(ctx.template Alloc(saved_mean), C); + EigenVectorArrayMap saved_variance_e( + ctx.template Alloc(saved_variance), C); + saved_mean_e.setZero(); + saved_variance_e.setZero(); + + EigenVectorArrayMap running_mean_arr(ctx.template Alloc(mean_out), C); + EigenVectorArrayMap running_var_arr(ctx.template Alloc(variance_out), + C); + + if ((N * sample_size) == 1) { + // Only 1 element in normalization dimension, + // we skip the batch norm calculation, let y = x. + paddle::framework::TensorCopy(x, ctx.GetPlace(), y); + return; + } + + switch (data_layout) { + case DataLayout::kNCHW: { + ConstEigenArrayMap x_arr(x.data(), sample_size, N * C); + for (int nc = 0; nc < N * C; ++nc) { + saved_mean_e(nc % C) += x_arr.col(nc).sum(); + } + saved_mean_e /= N * sample_size; + for (int nc = 0; nc < N * C; ++nc) { + saved_variance_e(nc % C) += + (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm(); + } + saved_variance_e /= N * sample_size; + break; + } + case DataLayout::kNHWC: { + ConstEigenArrayMap x_arr(x.data(), C, N * sample_size); + for (int i = 0; i < N * sample_size; ++i) { + saved_mean_e += x_arr.col(i); + } + saved_mean_e /= N * sample_size; + for (int i = 0; i < N * sample_size; ++i) { + saved_variance_e += + (x_arr.col(i) - saved_mean_e) * (x_arr.col(i) - saved_mean_e); + } + saved_variance_e /= N * sample_size; + break; + } + default: + PADDLE_THROW(phi::errors::InvalidArgument("Unknown storage order: %s", + data_layout_str)); + } + + // if MomentumTensor is set, use MomentumTensor value, momentum + // is only used in this training branch + + running_mean_arr = + running_mean_arr * momentum + saved_mean_e * (1. - momentum); + running_var_arr = + running_var_arr * momentum + saved_variance_e * (1. - momentum); + } + + // use SavedMean and SavedVariance to do normalize + Eigen::Array inv_std(C); + if (global_stats) { + ConstEigenVectorArrayMap var_arr(variance.data(), C); + inv_std = (var_arr + epsilon).sqrt().inverse(); + } else { + EigenVectorArrayMap saved_inv_std(saved_variance->data(), C); + // inverse SavedVariance first, gradient will use it too. + saved_inv_std = (saved_inv_std + epsilon).inverse().sqrt(); + inv_std = saved_inv_std; + } + ConstEigenVectorArrayMap mean_arr( + global_stats ? 
mean.data() : saved_mean->data(), C); + + // ((x - est_mean) * (inv_var) * scale + bias + // formula transform ====> + // (x * inv_var * scale) + (bias - est_mean * inv_var * scale) + ConstEigenVectorArrayMap scale_arr(scale.data(), C); + ConstEigenVectorArrayMap bias_arr(bias.data(), C); + Eigen::Array new_scale = inv_std * scale_arr; + Eigen::Array new_bias = + bias_arr - mean_arr * inv_std * scale_arr; + + switch (data_layout) { + case DataLayout::kNCHW: { + EigenArrayMap y_arr(ctx.template Alloc(y), sample_size, N * C); + ConstEigenArrayMap x_arr(x.data(), sample_size, N * C); + for (int nc = 0; nc < N * C; ++nc) { + y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C); + } + break; + } + case DataLayout::kNHWC: { + EigenArrayMap(ctx.template Alloc(y), C, N * sample_size) = + (ConstEigenArrayMap(x.data(), C, N * sample_size).colwise() * + new_scale) + .colwise() + + new_bias; + break; + } + default: + PADDLE_THROW(phi::errors::InvalidArgument("Unknown storage order: %d", + data_layout)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + batch_norm, CPU, ALL_LAYOUT, phi::BatchNormKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/gaussian_random_kernel.cc b/paddle/phi/kernels/cpu/gaussian_random_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..7e336f18bf80a36d1c954533aa7dc2534c4f7f2c --- /dev/null +++ b/paddle/phi/kernels/cpu/gaussian_random_kernel.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gaussian_random_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/fluid/framework/generator.h" + +namespace phi { + +template +void GaussianRandomKernel(const Context& dev_ctx, + const ScalarArray& shape, + float mean, + float std, + int seed, + DataType dtype, + DenseTensor* out) { + auto tensor = out; + + std::normal_distribution dist(mean, std); + + tensor->Resize(phi::make_ddim(shape.GetData())); + int64_t size = tensor->numel(); + T* data = dev_ctx.template Alloc(tensor); + auto engine = paddle::framework::GetCPURandomEngine(seed); + + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(*engine); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gaussian_random, + CPU, + ALL_LAYOUT, + phi::GaussianRandomKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/pad_grad_kernel.cc b/paddle/phi/kernels/cpu/pad_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..67e6da7d0e06a572d6d1b5f5353f3fdecc122eaa --- /dev/null +++ b/paddle/phi/kernels/cpu/pad_grad_kernel.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pad_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/pad_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(pad_grad, + CPU, + ALL_LAYOUT, + phi::PadGradKernel, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/pad_kernel.cc b/paddle/phi/kernels/cpu/pad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..f4a0acdcca267050b04f8063147c140e5631e27b --- /dev/null +++ b/paddle/phi/kernels/cpu/pad_kernel.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/pad_kernel_impl.h" + +PD_REGISTER_KERNEL(pad, + CPU, + ALL_LAYOUT, + phi::PadKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/fluid/operators/math/padding.h b/paddle/phi/kernels/funcs/padding.h similarity index 67% rename from paddle/fluid/operators/math/padding.h rename to paddle/phi/kernels/funcs/padding.h index 529d39c9ba50f016434b0b14c4d85c84483bad7f..6d10ff2dfcf39c6b57084e99eb31fc1d888f5f75 100644 --- a/paddle/fluid/operators/math/padding.h +++ b/paddle/phi/kernels/funcs/padding.h @@ -15,21 +15,26 @@ limitations under the License. 
*/ #pragma once #include #include -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { -template -using EigenTensor = framework::EigenTensor; +using EigenTensor = EigenTensor; template -void PadFunction(const framework::ExecutionContext& context, - const std::vector& pads, const framework::Tensor& src, - T pad_value, framework::Tensor* out) { +void PadFunction(const DeviceContext& context, + const std::vector& pads, + const DenseTensor& src, + T pad_value, + DenseTensor* out) { std::array, D> paddings; for (size_t i = 0; i < paddings.size(); ++i) { @@ -40,16 +45,16 @@ void PadFunction(const framework::ExecutionContext& context, auto src_tensor = EigenTensor::From(src); auto out_tensor = EigenTensor::From(*out); - auto& place = - *context.template device_context().eigen_device(); + auto& place = *(context.eigen_device()); EigenPad, T, D>::Eval( place, out_tensor, src_tensor, paddings, pad_value); } template -void PadGradFunction(const framework::ExecutionContext& context, - const std::vector& pads, const framework::Tensor& src, - framework::Tensor* d_out) { +void PadGradFunction(const DeviceContext& context, + const std::vector& pads, + const DenseTensor& src, + DenseTensor* d_out) { std::array, D> paddings; for (size_t i = 0; i < paddings.size(); ++i) { paddings[i].first = -pads[i * 2]; @@ -58,16 +63,18 @@ void PadGradFunction(const framework::ExecutionContext& context, auto d_out_tensor = EigenTensor::From(*d_out); auto src_tensor = EigenTensor::From(src); - auto& place = - *context.template device_context().eigen_device(); + auto& place = *(context.eigen_device()); EigenPad, T, D>::Eval( place, d_out_tensor, src_tensor, paddings, static_cast(0)); } template -void PaddingFunctor(int rank, const framework::ExecutionContext& context, - const std::vector& pads, T pad_value, - const framework::Tensor& src, framework::Tensor* out) { +void PaddingFunctor(int rank, + const DeviceContext& context, + const std::vector& pads, + T pad_value, + const DenseTensor& src, + DenseTensor* out) { switch (rank) { case 1: PadFunction(context, pads, src, pad_value, out); @@ -88,16 +95,18 @@ void PaddingFunctor(int rank, const framework::ExecutionContext& context, PadFunction(context, pads, src, pad_value, out); break; default: - PADDLE_THROW(platform::errors::Unimplemented( - "PadOp only support tensors with no more" - " than 6 dimensions currently.")); + PADDLE_THROW( + phi::errors::Unimplemented("PadOp only support tensors with no more" + " than 6 dimensions currently.")); } } template -void PaddingGradFunctor(int rank, const framework::ExecutionContext& context, +void PaddingGradFunctor(int rank, + const DeviceContext& context, const std::vector& pads, - const framework::Tensor& src, framework::Tensor* out) { + const DenseTensor& src, + DenseTensor* out) { switch (rank) { case 1: PadGradFunction(context, pads, src, out); @@ -118,9 +127,9 @@ void PaddingGradFunctor(int rank, const framework::ExecutionContext& context, PadGradFunction(context, pads, src, out); break; default: - PADDLE_THROW(platform::errors::Unimplemented( - "PadOp only support tensors with no more" - " than 6 dimensions currently.")); + PADDLE_THROW( + phi::errors::Unimplemented("PadOp only support tensors 
with no more" + " than 6 dimensions currently.")); } } @@ -137,6 +146,5 @@ inline bool IsSymmetricPadding(const std::vector& pads, } return is_sys_pad; } -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gaussian_random_kernel.h b/paddle/phi/kernels/gaussian_random_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..2903d80d22d46bcdc492009afdac4e6e6572929e --- /dev/null +++ b/paddle/phi/kernels/gaussian_random_kernel.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" + +namespace phi { + +template +void GaussianRandomKernel(const Context& ctx, + const ScalarArray& shape, + float mean, + float std, + int seed, + DataType dtype, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..2c9ee5ede010367697bb9477a536f807625fd02b --- /dev/null +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -0,0 +1,1038 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/batch_norm_kernel.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" + +#include "paddle/fluid/operators/norm_utils.cu.h" +#include "paddle/fluid/operators/norm_utils.h" + +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/operators/layout_utils.h" +#include "paddle/fluid/platform/enforce.h" + +#include "paddle/fluid/platform/flags.h" +#include "paddle/phi/kernels/gpu/batch_norm_utils.h" + +#ifdef __HIPCC__ +#define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim) +#else +#define LAUNCH_BOUNDS(BlockDim) +#endif + +DECLARE_bool(cudnn_batchnorm_spatial_persistent); +namespace phi { + +template +using CudnnDataType = paddle::platform::CudnnDataType; +template +using BatchNormParamType = typename CudnnDataType::BatchNormParamType; + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void KeBNBackwardScaleBias( + const T *dy, + const T *x, + const BatchNormParamType *mean, + const BatchNormParamType *variance, + const double epsilon, + const int N, + const int C, + const int HxW, + BatchNormParamType *dscale, + BatchNormParamType *dbias) { + const int outer_size = C; + const int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage ds_storage; + __shared__ typename BlockReduce::TempStorage db_storage; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType ds_sum = static_cast>(0); + BatchNormParamType db_sum = static_cast>(0); + + BatchNormParamType inv_var_i = 1.0 / sqrt(variance[i] + epsilon); + BatchNormParamType mean_i = mean[i]; + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + ds_sum += static_cast>(dy[index]) * + (static_cast>(x[index]) - mean_i); + db_sum += static_cast>(dy[index]); + } + ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); + db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); + if (threadIdx.x == 0) { + dscale[i] = ds_sum * inv_var_i; + dbias[i] = db_sum; + } + __syncthreads(); + } +} + +template +static __global__ void KeBNBackwardData(const T *dy, + const BatchNormParamType *scale, + const BatchNormParamType *variance, + const double epsilon, + const int C, + const int HxW, + const int num, + T *dx) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = gid; i < num; i += stride) { + const int c = layout == phi::DataLayout::kNCHW ? i / HxW % C : i % C; + BatchNormParamType inv_var = 1.0 / sqrt(variance[c] + epsilon); + dx[i] = static_cast(static_cast>(dy[i]) * + scale[c] * inv_var); + } +} + +template +static __global__ void KeBNRestoreData(const phi::DataLayout layout, + T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const BatchNormParamType *mean, + const BatchNormParamType *variance, + double epsilon, + int C, + int M, + const int num, + const T *y) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = gid; i < num; i += stride) { + const int c = layout == phi::DataLayout::kNCHW ? 
(i / M) % C : i % C; + auto y_i = static_cast>(y[i]); + auto x_i = (y_i - bias[c]) / scale[c] / variance[c] + mean[c]; + x[i] = static_cast(x_i); + } +} + +template +class InplaceHelper { + public: + void operator()(const phi::DataLayout layout, + T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const BatchNormParamType *mean, + const BatchNormParamType *variance, + double epsilon, + int C, + int M, + const int num, + const T *y, + int grid2, + const int block, + const gpuStream_t &stream) { + PADDLE_ENFORCE_EQ(x, + y, + phi::errors::InvalidArgument( + "X and Y should be inplaced in inplace mode")); + KeBNRestoreData<<>>( + layout, x, scale, bias, mean, variance, epsilon, C, M, num, y); + } +}; + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackward( + const T *dy, + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *saved_mean, + const BatchNormParamType *saved_inv_variance, + const int C, + const int N, + const int HxW, + const double epsilon, + T *dx, + BatchNormParamType *dscale, + BatchNormParamType *dbias) { + const int outer_size = C; + const int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage ds_storage; + __shared__ typename BlockReduce::TempStorage db_storage; + __shared__ typename BlockReduce::TempStorage mean_storage; + __shared__ typename BlockReduce::TempStorage variance_storeage; + __shared__ BatchNormParamType inv_var_val; + __shared__ BatchNormParamType mean_val; + __shared__ BatchNormParamType dscale_val; + __shared__ BatchNormParamType dbias_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType ds_sum = static_cast>(0); + BatchNormParamType db_sum = static_cast>(0); + + if (saved_mean && saved_inv_variance) { + if (threadIdx.x == 0) { + inv_var_val = saved_inv_variance[i]; + mean_val = saved_mean[i]; + } + } else { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = + static_cast>(0); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = + static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); + x_square_sum = + BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); + if (threadIdx.x == 0) { + mean_val = x_sum / inner_size; + inv_var_val = + 1 / sqrt(x_square_sum / inner_size - mean_val * mean_val + epsilon); + } + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType dy_i = + static_cast>(dy[index]); + ds_sum += + dy_i * (static_cast>(x[index]) - mean_val); + db_sum += dy_i; + } + ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); + db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); + if (threadIdx.x == 0) { + dscale_val = ds_sum * inv_var_val; + dbias_val = db_sum; + dscale[i] = dscale_val; + dbias[i] = dbias_val; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? 
(j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + dx[index] = scale[i] * inv_var_val * + (static_cast>(dy[index]) - + dbias_val / static_cast>(inner_size) - + (static_cast>(x[index]) - mean_val) * + inv_var_val * dscale_val / inner_size); + } + } +} + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackwardData( + const T *dy, + const BatchNormParamType *scale, + const BatchNormParamType *mean, + const T *x, + const BatchNormParamType *variance, + const int C, + const int N, + const int HxW, + T *dx) { + const int outer_size = C; + const int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage; + __shared__ BatchNormParamType dy_sum_val; + __shared__ BatchNormParamType dy_x_sub_mean_sum_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType inv_var_i = variance[i]; + BatchNormParamType mean_i = mean[i]; + BatchNormParamType dy_sum = static_cast>(0); + BatchNormParamType dy_x_sub_mean_sum = + static_cast>(0); + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType dy_i = + static_cast>(dy[index]); + dy_sum += dy_i; + dy_x_sub_mean_sum += + dy_i * (static_cast>(x[index]) - mean_i); + } + + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + dy_x_sub_mean_sum = BlockReduce(dy_x_sub_mean_storage) + .Reduce(dy_x_sub_mean_sum, cub::Sum()); + + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; + } + __syncthreads(); + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + dx[index] = + (static_cast>(dy[index]) - + dy_sum_val / static_cast>(inner_size) - + (static_cast>(x[index]) - mean_i) * + dy_x_sub_mean_sum_val * inv_var_i * inv_var_i / inner_size) * + scale[i] * inv_var_i; + } + } +} + +template +void BatchNormGradRawKernel(const Context &ctx, + const DenseTensor &y_grad, + const DenseTensor &x, + const DenseTensor &scale, + const DenseTensor &bias, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + paddle::optional reserve_space, + paddle::optional mean, + paddle::optional variance, + float momentum, + float epsilon_f, + const std::string &data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + bool is_inplace, + DenseTensor *x_grad, + DenseTensor *scale_grad, + DenseTensor *bias_grad) { + double epsilon = static_cast(epsilon_f); + + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + + const auto *d_y = &y_grad; + + auto *d_x = x_grad; + auto *d_scale = scale_grad; + auto *d_bias = bias_grad; + + use_global_stats = is_test || use_global_stats; + + const auto &x_dims = x.dims(); + + PADDLE_ENFORCE_EQ( + x_dims.size() >= 2 && x_dims.size() <= 5, + true, + phi::errors::InvalidArgument( + "The size of input's dimensions should be between 2 and 5." 
+ "But received: the size of input's dimensions is [%d]," + "the dimensions of input is [%s]", + x_dims.size(), + x_dims)); + int N, C, H, W, D; + paddle::operators::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + + // init output + if (d_x) { + ctx.template Alloc(d_x); + } + + if (d_scale && d_bias) { + d_scale->mutable_data>(ctx.GetPlace()); + d_bias->mutable_data>(ctx.GetPlace()); + } + + PADDLE_ENFORCE_EQ( + scale.dims().size(), + 1UL, + phi::errors::InvalidArgument( + "The size of scale's dimensions must equal to 1. But received: " + "the size of scale's dimensions is [%d], the dimensions of scale " + "is [%s].", + scale.dims().size(), + scale.dims())); + PADDLE_ENFORCE_EQ( + scale.dims()[0], + C, + phi::errors::InvalidArgument( + "The first dimension of scale must equal to Channels[%d]. But " + "received: the first dimension of scale is [%d]", + C, + scale.dims()[0])); + + auto dtype = paddle::platform::CudnnDataType::type; +#ifdef PADDLE_WITH_HIP + auto compute_format = + data_layout == DataLayout::kNHWC ? DataLayout::kNHWC : DataLayout::kNCHW; + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// HIP do not support compute format of NHWC +// auto compute_format = DataLayout::kNCHW; +#else + const bool fast_nhwc_batch_norm = dtype == CUDNN_DATA_HALF && + FLAGS_cudnn_batchnorm_spatial_persistent && + (reserve_space.get_ptr() != nullptr); + auto compute_format = fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC + ? DataLayout::kNHWC + : DataLayout::kNCHW; +#endif + + DenseTensor transformed_x(x.type()); + DenseTensor transformed_d_y(d_y->type()); + DenseTensor transformed_d_x; + if (data_layout == DataLayout::kNHWC && compute_format == DataLayout::kNCHW && + x_dims.size() > 2) { + VLOG(3) << "Transform input tensor from NHWC to NCHW."; + ResizeToChannelFirst(ctx, &x, &transformed_x); + TransToChannelFirst(ctx, &x, &transformed_x); + ResizeToChannelFirst(ctx, d_y, &transformed_d_y); + TransToChannelFirst(ctx, d_y, &transformed_d_y); + if (d_x) { + ResizeToChannelFirst(ctx, d_x, &transformed_d_x); + } + } else { + transformed_x.ShareDataWith(x); + transformed_d_y.ShareDataWith(*d_y); + if (d_x) { + transformed_d_x.ShareDataWith(*d_x); + } + } + + std::vector dims; + std::vector strides; + if (compute_format == DataLayout::kNCHW) { + dims = {N, C, H, W, D}; + strides = {C * H * W * D, H * W * D, W * D, D, 1}; + } else { + dims = {N, C, H, W, D}; + strides = {H * W * C * D, 1, W * D * C, D * C, C}; + } + + const int num = transformed_x.numel(); +#ifdef HIPCC + const int block = 256; +#else + const int block = 512; +#endif + int max_threads = ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + int grid1 = (num + block - 1) / block; + int grid2 = std::min(C, max_blocks); + auto stream = ctx.stream(); + InplaceHelper inplace_functor; + + if (!use_global_stats) { + if ((N * H * W * D) == 1) { + if (d_x) { + paddle::framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); + } + phi::funcs::SetConstant> functor; + functor(ctx, d_scale, static_cast>(0)); + functor(ctx, d_bias, static_cast>(0)); + return; + } + +// ------------------- cudnn descriptors --------------------- +#ifdef PADDLE_WITH_HIP +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// miopenTensorDescriptor_t data_desc_; +// miopenTensorDescriptor_t bn_param_desc_; +// miopenBatchNormMode_t mode_; + +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); +// PADDLE_ENFORCE_GPU_SUCCESS( +// 
platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); +#else + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t bn_param_desc_; + cudnnBatchNormMode_t mode_; + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnCreateTensorDescriptor( + &bn_param_desc_)); +#endif + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); +#ifdef PADDLE_WITH_HIP +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// mode_ = miopenBNSpatial; +#elif CUDNN_VERSION_MIN(7, 0, 1) + if (FLAGS_cudnn_batchnorm_spatial_persistent) { + mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; + } else if (H == 1 && W == 1) { + mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; + } else { + mode_ = CUDNN_BATCHNORM_SPATIAL; + } +#else + if (H == 1 && W == 1) { + mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; + } else { + mode_ = CUDNN_BATCHNORM_SPATIAL; + } +#endif // CUDNN_VERSION_MIN(7, 0, 1) + +#ifdef PADDLE_WITH_HIP +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( +// data_desc_, CudnnDataType::type, +// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), +// const_cast(strides.data()))); +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, +// data_desc_, mode_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? 
x_dims.size() : 4, + dims.data(), + strides.data())); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); +#endif + + const auto *saved_mean_data = + saved_mean.template data>(); + const auto *saved_var_data = + saved_variance.template data>(); + + if (is_inplace) { + inplace_functor(compute_format, + transformed_x.data(), + scale.template data>(), + bias.template data>(), + saved_mean_data, + saved_var_data, + epsilon, + C, + H * W * D, + num, + transformed_x.data(), + grid2, + block, + stream); + } + + // This branch calls CUDNN APIs + if (d_x && d_scale && d_bias) { + bool called = false; +#if CUDNN_VERSION_MIN(7, 4, 1) + called = true; + size_t workspace_size = 0; + void *workspace_ptr = nullptr; + DenseTensor workspace_tensor; + auto reserve_space_size = reserve_space->memory_size(); + // --------------- cudnn batchnorm workspace --------------- + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload:: + cudnnGetBatchNormalizationBackwardExWorkspaceSize( + /*handle=*/ctx.cudnn_handle(), + /*mode=*/mode_, + /*bnIps=*/CUDNN_BATCHNORM_OPS_BN, + /*xDesc=*/data_desc_, + /*yDesc=*/data_desc_, + /*dyDesc=*/data_desc_, + /*dzDesc=*/nullptr, + /*dxDesc=*/data_desc_, + /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, + /*activationDesc=*/nullptr, + /*sizeInBytes=*/&workspace_size)); + + workspace_ptr = workspace_tensor.mutable_data( + ctx.GetPlace(), transformed_x.type(), workspace_size); + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationBackwardEx( + /*handle=*/ctx.cudnn_handle(), + /*mode=*/mode_, + /*bnOps=*/CUDNN_BATCHNORM_OPS_BN, + /*alphaDataDiff=*/CudnnDataType::kOne(), + /*betaDataDiff=*/CudnnDataType::kZero(), + /*alphaParamDiff=*/CudnnDataType::kOne(), + /*betaParamDiff=*/CudnnDataType::kZero(), + /*xDesc=*/data_desc_, + /*xData=*/transformed_x.template data(), + /*yDesc=*/nullptr, + /*yData=*/nullptr, + /*dyDesc=*/data_desc_, + /*dyData=*/transformed_d_y.template data(), + /*dzDesc=*/nullptr, + /*dzData=*/nullptr, + /*dxDesc=*/data_desc_, + /*dxData=*/ctx.template Alloc(&transformed_d_x), + /*dBnScaleBiasDesc=*/bn_param_desc_, + /*bnScaleData=*/scale.template data>(), + /*bnBiasData=*/nullptr, + /*dBnScaleData=*/d_scale + ->template mutable_data>( + ctx.GetPlace()), + /*dBnBiasData=*/d_bias + ->template mutable_data>( + ctx.GetPlace()), + /*epsilon=*/epsilon, + /*savedMean=*/saved_mean_data, + /*savedInvVariance=*/saved_var_data, + /*activationDesc=*/nullptr, + /*workspace=*/workspace_ptr, + /*workSpaceSizeInBytes=*/workspace_size, + /*reserveSpace=*/const_cast( + reserve_space->template data()), + /*reserveSpaceSizeInBytes=*/reserve_space_size)); +#endif // CUDNN_VERSION_MIN(7, 4, 1) + if (!called) { +#ifdef PADDLE_WITH_HIP + if (compute_format == DataLayout::kNCHW) { + BNBackward<<>>( + transformed_d_y.template data(), + transformed_x.template data(), + scale.template data>(), + saved_mean_data, + saved_var_data, + C, + N, + H * W * D, + epsilon, + transformed_d_x.template data(), + d_scale->template mutable_data>( + ctx.GetPlace()), + d_bias->template mutable_data>( + ctx.GetPlace())); + } else { + BNBackward<<>>( + transformed_d_y.template data(), + transformed_x.template data(), + scale.template data>(), + saved_mean_data, + saved_var_data, + C, + N, + H * W * D, + epsilon, + transformed_d_x.template data(), + d_scale->template mutable_data>( + ctx.GetPlace()), + d_bias->template mutable_data>( + ctx.GetPlace())); + } + +// TODO(wangran16): wait for MIOpen to improve the 
performance of BN +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenBatchNormalizationBackward( +// dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), +// CudnnDataType::kZero(), CudnnDataType::kOne(), +// CudnnDataType::kZero(), data_desc_, +// transformed_x.template data(), data_desc_, +// transformed_d_y.template data(), data_desc_, +// transformed_d_x.template mutable_data(ctx.GetPlace()), +// bn_param_desc_, scale->template data>(), +// d_scale->template mutable_data>( +// ctx.GetPlace()), +// d_bias->template mutable_data>( +// ctx.GetPlace()), +// epsilon, saved_mean_data, saved_var_data)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationBackward( + ctx.cudnn_handle(), + mode_, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + transformed_x.template data(), + data_desc_, + transformed_d_y.template data(), + data_desc_, + ctx.template Alloc(&transformed_d_x), + bn_param_desc_, + scale.template data>(), + d_scale->template mutable_data>( + ctx.GetPlace()), + d_bias->template mutable_data>( + ctx.GetPlace()), + epsilon, + saved_mean_data, + saved_var_data)); +#endif + } + + if (data_layout == DataLayout::kNHWC && + compute_format == DataLayout::kNCHW) { + VLOG(3) << "Transform batchnorm output from NCHW to NHWC"; + TransToChannelLast(ctx, &transformed_d_x, d_x); + } + } else { + // This branch call CUDA kernels + if (compute_format == DataLayout::kNCHW) { + if (d_x) { + BNBackwardData< + T, + block, + phi::DataLayout::kNCHW><<>>( + d_y->data(), + scale.data>(), + saved_mean_data, + x.data(), + saved_var_data, + C, + N, + H * W * D, + d_x->data()); + } + if (d_scale && d_bias) { + KeBNBackwardScaleBias< + T, + block, + phi::DataLayout::kNCHW><<>>( + d_y->data(), + x.data(), + saved_mean_data, + saved_var_data, + epsilon, + N, + C, + H * W * D, + d_scale->data>(), + d_bias->data>()); + } + } else { + if (d_x) { + BNBackwardData< + T, + block, + phi::DataLayout::kNHWC><<>>( + d_y->data(), + scale.data>(), + saved_mean_data, + x.data(), + saved_var_data, + C, + N, + H * W * D, + d_x->data()); + } + if (d_scale && d_bias) { + KeBNBackwardScaleBias< + T, + block, + phi::DataLayout::kNHWC><<>>( + d_y->data(), + x.data(), + saved_mean_data, + saved_var_data, + epsilon, + N, + C, + H * W * D, + d_scale->data>(), + d_bias->data>()); + } + } + } + +#ifdef PADDLE_WITH_HIP +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// clean when exit. +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); +#else + // clean when exit. 
+ PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDestroyTensorDescriptor( + bn_param_desc_)); +#endif + } else { + const auto *running_mean = mean.get_ptr(); + const auto *running_var = variance.get_ptr(); + + const auto *running_mean_data = + running_mean->template data>(); + const auto *running_var_data = + running_var->template data>(); + + if (is_inplace) { + auto px = x; + inplace_functor(data_layout, + ctx.template Alloc(&px), + scale.template data>(), + bias.template data>(), + running_mean_data, + running_var_data, + epsilon, + C, + H * W * D, + num, + x.data(), + grid2, + block, + stream); + } + + if (compute_format == DataLayout::kNCHW) { + if (d_x) { + KeBNBackwardData<<>>( + d_y->data(), + scale.data>(), + running_var_data, + epsilon, + C, + H * W, + num, + d_x->data()); + } + if (d_scale && d_bias) { + KeBNBackwardScaleBias< + T, + block, + phi::DataLayout::kNCHW><<>>( + d_y->data(), + x.data(), + running_mean_data, + running_var_data, + epsilon, + N, + C, + H * W * D, + d_scale->data>(), + d_bias->data>()); + } + } else { + if (d_x) { + KeBNBackwardData<<>>( + d_y->data(), + scale.data>(), + running_var_data, + epsilon, + C, + H * W, + num, + d_x->data()); + } + if (d_scale && d_bias) { + KeBNBackwardScaleBias< + T, + block, + phi::DataLayout::kNHWC><<>>( + d_y->data(), + x.data(), + running_mean_data, + running_var_data, + epsilon, + N, + C, + H * W * D, + d_scale->data>(), + d_bias->data>()); + } + } + } +} + +template +void BatchNormGradKernel(const Context &dev_ctx, + const DenseTensor &y_grad, + const DenseTensor &x, + const DenseTensor &scale, + const DenseTensor &bias, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + paddle::optional reserve_space, + paddle::optional mean, + paddle::optional variance, + float momentum, + float epsilon, + const std::string &data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor *x_grad, + DenseTensor *scale_grad, + DenseTensor *bias_grad) { + BatchNormGradRawKernel(dev_ctx, + y_grad, + x, + scale, + bias, + saved_mean, + saved_variance, + reserve_space, + mean, + variance, + momentum, + epsilon, + data_layout, + is_test, + use_global_stats, + trainable_statistics, + fuse_with_relu, + false, + x_grad, + scale_grad, + bias_grad); +} + +template +void BatchNormDoubleGradKernel(const Context &ctx, + const DenseTensor &x_grad_grad, + const DenseTensor &scale_grad_grad, + const DenseTensor &bias_grad_grad, + const DenseTensor &y_grad, + const DenseTensor &x, + const DenseTensor &scale, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + paddle::optional mean, + paddle::optional variance, + float momentum, + float epsilon, + const std::string &data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor *x_grad, + DenseTensor *scale_grad, + DenseTensor *y_grad_grad) { + PADDLE_ENFORCE_EQ(is_test, + false, + phi::errors::InvalidArgument( + "`is_test = True` CANNOT be used in train program. 
If " + "you want to use global status in pre_train model, " + "please set `use_global_stats = True`")); + + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + + const DenseTensor *running_mean = nullptr; + const DenseTensor *running_variance = nullptr; + if (use_global_stats) { + running_mean = mean.get_ptr(); + running_variance = variance.get_ptr(); + } + paddle::operators::NormDoubleGradFunctor(ctx, + data_layout, + &x, + &scale, + &y_grad, + &saved_mean, + &saved_variance, + running_mean, + running_variance, + epsilon, + use_global_stats, + &x_grad_grad, + &scale_grad_grad, + &bias_grad_grad, + x_grad, + scale_grad, + y_grad_grad); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(batch_norm_grad, + GPU, + ALL_LAYOUT, + phi::BatchNormGradKernel, + float, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(batch_norm_grad_raw, + GPU, + ALL_LAYOUT, + phi::BatchNormGradRawKernel, + float, + phi::dtype::float16) {} +#else +PD_REGISTER_KERNEL(batch_norm_grad, + GPU, + ALL_LAYOUT, + phi::BatchNormGradKernel, + float, + double, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + } +} + +PD_REGISTER_KERNEL(batch_norm_grad_raw, + GPU, + ALL_LAYOUT, + phi::BatchNormGradRawKernel, + float, + double, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + } +} + +#endif + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(batch_norm_grad_grad, + GPU, + ALL_LAYOUT, + phi::BatchNormDoubleGradKernel, + float, + double) {} + +#else +PD_REGISTER_KERNEL(batch_norm_grad_grad, + GPU, + ALL_LAYOUT, + phi::BatchNormDoubleGradKernel, + float, + double) {} +#endif diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..6ad12245d2a45ad91148837bdf2617f83e597f6c --- /dev/null +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -0,0 +1,680 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/batch_norm_kernel.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +#include "paddle/fluid/operators/norm_utils.cu.h" +#include "paddle/fluid/operators/norm_utils.h" + +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/operators/layout_utils.h" +#include "paddle/fluid/platform/enforce.h" + +#include "paddle/fluid/platform/flags.h" +#include "paddle/phi/kernels/gpu/batch_norm_utils.h" + +#ifdef __HIPCC__ +#define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim) +#else +#define LAUNCH_BOUNDS(BlockDim) +#endif + +DECLARE_bool(cudnn_batchnorm_spatial_persistent); + +namespace phi { + +template +using CudnnDataType = paddle::platform::CudnnDataType; +template +using BatchNormParamType = typename CudnnDataType::BatchNormParamType; + +template +static __global__ void BNForwardInference(const T *x, + const BatchNormParamType *mean, + const BatchNormParamType *variance, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + const double epsilon, + T *y) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + int num = N * C * HxW; + for (int i = gid; i < num; i += stride) { + const int c = layout == phi::DataLayout::kNCHW ? i / HxW % C : i % C; + BatchNormParamType x_sub_mean = + static_cast>(x[i]) - mean[c]; + BatchNormParamType inv_var = 1 / sqrt(variance[c] + epsilon); + y[i] = static_cast(scale[c] * x_sub_mean * inv_var + bias[c]); + } +} + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + const double epsilon, + double exponentialAverageFactor, + T *y, + BatchNormParamType *mean, + BatchNormParamType *variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance) { + int outer_size = C; + int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage mean_storage; + __shared__ typename BlockReduce::TempStorage variance_storeage; + __shared__ BatchNormParamType mean_val; + __shared__ BatchNormParamType variance_val; + __shared__ BatchNormParamType inv_var_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = static_cast>(0); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? 
(j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); + x_square_sum = + BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); + if (threadIdx.x == 0) { + mean_val = x_sum / inner_size; + variance_val = x_square_sum / inner_size - mean_val * mean_val; + inv_var_val = 1 / sqrt(variance_val + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = mean_val; + save_inv_variance[i] = inv_var_val; + } + mean[i] = (1 - exponentialAverageFactor) * mean_val + + exponentialAverageFactor * mean[i]; + variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * variance[i]; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_sub_mean = + static_cast>(x[index]) - mean_val; + y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; + } + } +} + +template +void BatchNormKernel(const Context &ctx, + const DenseTensor &x, + const DenseTensor &scale, + const DenseTensor &bias, + const DenseTensor &mean, + const DenseTensor &variance, + float momentum, + float epsilon_f, + const std::string &data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor *y, + DenseTensor *mean_out, + DenseTensor *variance_out, + DenseTensor *saved_mean, + DenseTensor *saved_variance, + DenseTensor *reserve_space) { + double epsilon = epsilon_f; + const bool trainable_stats = trainable_statistics; + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + bool test_mode = is_test && (!trainable_stats); + + // Get the size for each dimension. + // NCHW [batch_size, in_channels, in_height, in_width] + const auto &x_dims = x.dims(); + PADDLE_ENFORCE_EQ( + x_dims.size() >= 2 && x_dims.size() <= 5, + true, + phi::errors::InvalidArgument( + "The size of input's dimensions should be between 2 and 5" + "But received: the size of input's dimensions is [%d]", + x_dims.size())); + + ctx.template Alloc(y); + int N, C, H, W, D; + paddle::operators::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + + auto dtype = paddle::platform::CudnnDataType::type; + +#ifdef PADDLE_WITH_HIP + auto compute_format = + data_layout == DataLayout::kNHWC ? DataLayout::kNHWC : DataLayout::kNCHW; + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// HIP do not support compute format of NHWC +// auto compute_format = DataLayout::kNCHW; +#else + const bool fast_nhwc_batch_norm = + test_mode || + (dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent); + + auto compute_format = fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC + ? 
DataLayout::kNHWC + : DataLayout::kNCHW; +#endif + + DenseTensor transformed_x(x.type()); + DenseTensor transformed_y(y->type()); + + if (data_layout == DataLayout::kNHWC && compute_format == DataLayout::kNCHW && + x_dims.size() > 2) { + VLOG(3) << "Transform input tensor from NHWC to NCHW."; + ResizeToChannelFirst(ctx, &x, &transformed_x); + TransToChannelFirst(ctx, &x, &transformed_x); + ResizeToChannelFirst(ctx, y, &transformed_y); + } else { + transformed_x.ShareDataWith(x); + transformed_y.ShareDataWith(*y); + } + +// ------------------- cudnn descriptors --------------------- +#ifdef PADDLE_WITH_HIP +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// miopenTensorDescriptor_t data_desc_; +// miopenTensorDescriptor_t bn_param_desc_; +// miopenBatchNormMode_t mode_; + +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); +#else + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t bn_param_desc_; + cudnnBatchNormMode_t mode_; + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); +#endif + + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); + +#ifdef PADDLE_WITH_HIP +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// mode_ = miopenBNSpatial; +#elif CUDNN_VERSION_MIN(7, 0, 1) + if (FLAGS_cudnn_batchnorm_spatial_persistent) { + mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; + } else if (H == 1 && W == 1) { + mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; + } else { + mode_ = CUDNN_BATCHNORM_SPATIAL; + } +#else + if (H == 1 && W == 1) { + mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; + } else { + mode_ = CUDNN_BATCHNORM_SPATIAL; + } +#endif // CUDNN_VERSION_MIN(7, 0, 1) + + VLOG(3) << "Setting descriptors."; + std::vector dims; + std::vector strides; + if (compute_format == DataLayout::kNCHW) { + dims = {N, C, H, W, D}; + strides = {C * H * W * D, H * W * D, W * D, D, 1}; + } else { + dims = {N, C, H, W, D}; + strides = {H * W * D * C, 1, W * D * C, D * C, C}; + } + +#ifdef PADDLE_WITH_HIP +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( +// data_desc_, CudnnDataType::type, +// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), +// const_cast(strides.data()))); +// Note: PERSISTENT not implemented for inference +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenDeriveBNTensorDescriptor( +// bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, + dims.data(), + strides.data())); + // Note: PERSISTENT not implemented for inference + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, + data_desc_, + test_mode ? CUDNN_BATCHNORM_SPATIAL : mode_)); +#endif + + auto handle = ctx.cudnn_handle(); + + // Now, depending on whether we are running test or not, we have two paths. 
+ // It is training mode when it's not reference AND not using pre-trained + // model. + bool training = !test_mode && !use_global_stats; + if (!training) { + // only when test we use input to do computation. + const auto *est_mean = &mean; + const auto *est_var = &variance; + // Run inference mode. + PADDLE_ENFORCE_EQ( + est_mean->dims().size(), + 1UL, + phi::errors::InvalidArgument( + "The size of mean's dimensions must equal to 1." + "But received: the size of mean's dimensions mean is [%d]," + "the dimensions of mean is [%s].", + est_mean->dims().size(), + est_mean->dims())); + PADDLE_ENFORCE_EQ( + est_var->dims().size(), + 1UL, + phi::errors::InvalidArgument( + "The size of variance's dimensions must equal to 1." + "But received: the size of variance's dimensions is [%d]," + "the dimensions of variance is [%s].", + est_var->dims().size(), + est_var->dims())); + PADDLE_ENFORCE_EQ( + est_mean->dims()[0], + C, + phi::errors::InvalidArgument( + "The first dimension of mean must equal to the number of " + "Channels, which is [%d]. But received: the first dimension" + "of mean is [%d], the dimensions of mean is [%s].", + C, + est_mean->dims()[0], + est_mean->dims())); + PADDLE_ENFORCE_EQ( + est_var->dims()[0], + C, + phi::errors::InvalidArgument( + "The first dimension of variance must equal to the number" + "of Channels, which is [%d]. But received: the first dimension of" + "variance is [%d], the dimensions of variance is [%s].", + C, + est_var->dims()[0], + est_var->dims())); + +#ifdef PADDLE_WITH_HIP + const int block_size = 256; + const int grid_size = (N * C * H * W * D + block_size - 1) / block_size; + if (compute_format == DataLayout::kNCHW) { + BNForwardInference< + T, + DataLayout::kNCHW><<>>( + transformed_x.template data(), + est_mean->template data>(), + est_var->template data>(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + epsilon, + transformed_y.template data()); + } else { + BNForwardInference< + T, + DataLayout::kNHWC><<>>( + transformed_x.template data(), + est_mean->template data>(), + est_var->template data>(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + epsilon, + transformed_y.template data()); + } +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenBatchNormalizationForwardInference( +// handle, miopenBNSpatial, +// const_cast( +// static_cast(CudnnDataType::kOne())), +// const_cast( +// static_cast(CudnnDataType::kZero())), +// data_desc_, +// static_cast(transformed_x.template data()), +// data_desc_, +// static_cast( +// transformed_y.template mutable_data(ctx.GetPlace())), +// bn_param_desc_, +// const_cast(static_cast( +// scale->template data>())), +// const_cast(static_cast( +// bias->template data>())), +// const_cast(static_cast( +// est_mean->template data>())), +// const_cast(static_cast( +// est_var->template data>())), +// epsilon)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationForwardInference( + handle, + // Note: PERSISTENT not implemented for inference + CUDNN_BATCHNORM_SPATIAL, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + transformed_x.template data(), + data_desc_, + ctx.template Alloc(&transformed_y), + bn_param_desc_, + scale.template data>(), + bias.template data>(), + est_mean->template data>(), + est_var->template data>(), + epsilon)); +#endif + } else { + // if MomentumTensor is set, use MomentumTensor value, momentum + // is only used 
in this training branch + + // need to solve here + // if (ctx.HasInput("MomentumTensor")) { + // const auto *mom_tensor = MomentumTensor; + // DenseTensor mom_cpu; + // paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(), + // &mom_cpu); + // momentum = mom_cpu.data()[0]; + // } + + // Run training mode. + // obtain running mean and running inv var, and there is no need + // to initialize them. + mean_out->mutable_data>(ctx.GetPlace()); + variance_out->mutable_data>(ctx.GetPlace()); + + saved_mean->mutable_data>(ctx.GetPlace()); + saved_variance->mutable_data>(ctx.GetPlace()); + + if ((N * H * W * D) == 1) { + // Only 1 element in normalization dimension, + // skip the batch norm calculation, let y = x. + paddle::framework::TensorCopy(x, ctx.GetPlace(), y); + } else { + double this_factor = 1. - momentum; + + bool called = false; +#if CUDNN_VERSION_MIN(7, 4, 1) + called = true; + size_t workspace_size = 0; + size_t reserve_space_size = 0; + void *reserve_space_ptr = nullptr; + void *workspace_ptr = nullptr; + DenseTensor workspace_tensor; + // Create reserve space and workspace for batch norm. + // Create tensor for each batchnorm op, it will be used in the + // backward. Thus this tensor shouldn't be temp. + // auto *reserve_space = ctx.Output("ReserveSpace"); + PADDLE_ENFORCE_NOT_NULL( + reserve_space, + phi::errors::NotFound( + "The argument ReserveSpace of batch_norm op is not found.")); + // --------------- cudnn batchnorm workspace --------------- + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload:: + cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( + /*handle=*/handle, + /*mode=*/mode_, + /*bnIps=*/CUDNN_BATCHNORM_OPS_BN, + /*xDesc=*/data_desc_, + /*zDesc=*/nullptr, + /*yDesc=*/data_desc_, + /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, + /*activationDesc=*/nullptr, + /*sizeInBytes=*/&workspace_size)); + + // -------------- cudnn batchnorm reserve space -------------- + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload:: + cudnnGetBatchNormalizationTrainingExReserveSpaceSize( + /*handle=*/handle, + /*mode=*/mode_, + /*bnOps=*/CUDNN_BATCHNORM_OPS_BN, + /*activationDesc=*/nullptr, + /*xDesc=*/data_desc_, + /*sizeInBytes=*/&reserve_space_size)); + + reserve_space_ptr = reserve_space->mutable_data( + ctx.GetPlace(), transformed_x.type(), reserve_space_size); + workspace_ptr = workspace_tensor.mutable_data( + ctx.GetPlace(), transformed_x.type(), workspace_size); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationForwardTrainingEx( + handle, + mode_, + CUDNN_BATCHNORM_OPS_BN, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + transformed_x.template data(), + nullptr, + nullptr, + data_desc_, + transformed_y.template data(), + bn_param_desc_, + scale.template data>(), + bias.template data>(), + this_factor, + mean_out->template mutable_data>( + ctx.GetPlace()), + variance_out->template mutable_data>( + ctx.GetPlace()), + epsilon, + saved_mean->template mutable_data>( + ctx.GetPlace()), + saved_variance->template mutable_data>( + ctx.GetPlace()), + nullptr, + workspace_ptr, + workspace_size, + reserve_space_ptr, + reserve_space_size)); +#endif // CUDNN_VERSION_MIN(7, 4, 1) + if (!called) { +#ifdef PADDLE_WITH_HIP + const int num = transformed_x.numel(); + const int block = 256; + const int max_threads = ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = std::min(C, max_blocks); + if (compute_format == DataLayout::kNCHW) { + BNForwardTraining< + T, + 
block, + DataLayout::kNCHW><<>>( + transformed_x.template data(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + epsilon, + this_factor, + transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } else { + BNForwardTraining< + T, + block, + DataLayout::kNHWC><<>>( + transformed_x.template data(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + epsilon, + this_factor, + transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenBatchNormalizationForwardTraining( +// handle, mode_, const_cast(static_cast( +// CudnnDataType::kOne())), +// const_cast( +// static_cast(CudnnDataType::kZero())), +// data_desc_, +// static_cast(transformed_x.template data()), +// data_desc_, +// static_cast( +// transformed_y.template mutable_data(ctx.GetPlace())), +// bn_param_desc_, +// const_cast(static_cast( +// scale->template data>())), +// const_cast(static_cast( +// bias->template data>())), +// this_factor, +// static_cast( +// mean_out->template mutable_data>( +// ctx.GetPlace())), +// static_cast(variance_out->template mutable_data< +// BatchNormParamType>(ctx.GetPlace())), +// epsilon, +// static_cast( +// saved_mean->template mutable_data>( +// ctx.GetPlace())), +// static_cast(saved_variance->template mutable_data< +// BatchNormParamType>(ctx.GetPlace())))); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationForwardTraining( + handle, + mode_, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + transformed_x.template data(), + data_desc_, + ctx.template Alloc(&transformed_y), + bn_param_desc_, + scale.template data>(), + bias.template data>(), + this_factor, + mean_out->template mutable_data>( + ctx.GetPlace()), + variance_out->template mutable_data>( + ctx.GetPlace()), + epsilon, + saved_mean->template mutable_data>( + ctx.GetPlace()), + saved_variance->template mutable_data>( + ctx.GetPlace()))); +#endif + } + } + } + + if (data_layout == DataLayout::kNHWC && compute_format == DataLayout::kNCHW && + x_dims.size() > 2) { + VLOG(3) << "Transform batchnorm output from NCHW to NHWC"; + TransToChannelLast(ctx, &transformed_y, y); + } +#ifdef PADDLE_WITH_HIP +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// clean when exit. +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); +#else + // clean when exit. 
+ PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); +#endif +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(batch_norm, + GPU, + ALL_LAYOUT, + phi::BatchNormKernel, + float, + phi::dtype::float16) {} +#else +PD_REGISTER_KERNEL(batch_norm, + GPU, + ALL_LAYOUT, + phi::BatchNormKernel, + float, + double, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + } +} + +#endif diff --git a/paddle/phi/kernels/gpu/batch_norm_utils.h b/paddle/phi/kernels/gpu/batch_norm_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..c9c62026edfa7ad4cd3124cc5a612f8220ab00f5 --- /dev/null +++ b/paddle/phi/kernels/gpu/batch_norm_utils.h @@ -0,0 +1,142 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +using Tensor = DenseTensor; + +template +inline void ResizeToChannelFirst(const DeviceContext& context, + const Tensor* input, + Tensor* transformed_input) { + int dim = input->dims().size() - 2; + if (dim == 3) { + // input + transformed_input->Resize(input->dims()); + + auto in_dims_vec = phi::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[4]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + in_dims_vec[4] = input->dims()[3]; + transformed_input->Resize(phi::make_ddim(in_dims_vec)); + context.template Alloc(transformed_input); + + } else if (dim == 2) { + // input + transformed_input->Resize(input->dims()); + + auto in_dims_vec = phi::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[3]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + transformed_input->Resize(phi::make_ddim(in_dims_vec)); + context.template Alloc(transformed_input); + } else if (dim == 1) { + transformed_input->Resize(input->dims()); + + auto in_dims_vec = phi::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[1]; + transformed_input->Resize(phi::make_ddim(in_dims_vec)); + context.template Alloc(transformed_input); + } +} + +template +inline void ResizeToChannelLast(const DeviceContext& context, + const Tensor* input, + Tensor* transformed_input) { + int dim = input->dims().size() - 2; + if (dim == 3) { + // input + transformed_input->Resize(input->dims()); + + auto in_dims_vec = phi::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[3]; + in_dims_vec[3] = input->dims()[4]; + in_dims_vec[4] = input->dims()[1]; + 
transformed_input->Resize(phi::make_ddim(in_dims_vec)); + context.template Alloc(transformed_input); + + } else if (dim == 2) { + // input + transformed_input->Resize(input->dims()); + + auto in_dims_vec = phi::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[3]; + in_dims_vec[3] = input->dims()[1]; + transformed_input->Resize(phi::make_ddim(in_dims_vec)); + context.template Alloc(transformed_input); + } else if (dim == 1) { + transformed_input->Resize(input->dims()); + + auto in_dims_vec = phi::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[1]; + transformed_input->Resize(phi::make_ddim(in_dims_vec)); + context.template Alloc(transformed_input); + } +} + +template +inline void TransToChannelFirst(const DeviceContext& context, + const Tensor* input, + Tensor* transformed_input) { + VLOG(5) << "Why am I called?"; + int dim = input->dims().size() - 2; + if (dim == 3) { + std::vector axis{0, 4, 1, 2, 3}; + funcs::Transpose trans5; + trans5(context, *input, transformed_input, axis); + + } else if (dim == 2) { + std::vector axis{0, 3, 1, 2}; + funcs::Transpose trans4; + trans4(context, *input, transformed_input, axis); + } else if (dim == 1) { + std::vector axis{0, 2, 1}; + funcs::Transpose trans3; + trans3(context, *input, transformed_input, axis); + } +} + +template +inline void TransToChannelLast(const DeviceContext& context, + const Tensor* input, + Tensor* transformed_input) { + int dim = input->dims().size() - 2; + if (dim == 3) { + std::vector axis{0, 2, 3, 4, 1}; + funcs::Transpose trans5; + trans5(context, *input, transformed_input, axis); + + } else if (dim == 2) { + std::vector axis{0, 2, 3, 1}; + funcs::Transpose trans4; + trans4(context, *input, transformed_input, axis); + } else if (dim == 1) { + std::vector axis{0, 2, 1}; + funcs::Transpose trans3; + trans3(context, *input, transformed_input, axis); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/gaussian_random_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..d5acc60a36097f02c579671860c0b480199ae09a --- /dev/null +++ b/paddle/phi/kernels/gpu/gaussian_random_kernel.cu @@ -0,0 +1,111 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
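// Illustration (not part of this patch): a minimal standalone sketch of the
// axis permutations that the batch_norm_utils.h helpers above hand to
// funcs::Transpose, e.g. {0, 4, 1, 2, 3} turns an NDHWC shape into NCDHW.
// All names below are hypothetical.
#include <cstdio>
#include <vector>

// Permute a shape the same way Transpose permutes the data dimensions.
static std::vector<int> PermuteShape(const std::vector<int>& dims,
                                     const std::vector<int>& axis) {
  std::vector<int> out(axis.size());
  for (size_t i = 0; i < axis.size(); ++i) out[i] = dims[axis[i]];
  return out;
}

int main() {
  std::vector<int> ndhwc = {2, 4, 8, 8, 16};            // N, D, H, W, C
  std::vector<int> to_channel_first = {0, 4, 1, 2, 3};  // used when dim == 3
  std::vector<int> ncdhw = PermuteShape(ndhwc, to_channel_first);
  for (int d : ncdhw) std::printf("%d ", d);  // prints: 2 16 4 8 8
  std::printf("\n");
  return 0;
}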
+ +#include "paddle/phi/kernels/gaussian_random_kernel.h" + +#include +#include +#include +#include + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" +#include "paddle/phi/kernels/funcs/index_impl.cu.h" + +#include "paddle/fluid/framework/generator.h" + +DECLARE_bool(use_curand); + +namespace phi { + +template +struct GaussianGenerator { + T mean_, std_; + unsigned int seed_; + unsigned int offset_ = 0; + + __host__ __device__ GaussianGenerator(T mean, T std, int seed) + : mean_(mean), std_(std), seed_(seed) {} + + __host__ __device__ GaussianGenerator(T mean, T std, int seed, int offset) + : mean_(mean), std_(std), seed_(seed), offset_(offset) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + using MT = typename phi::kps::details::MPTypeTrait::Type; + thrust::normal_distribution dist(mean_, std_); + unsigned int new_n = n + offset_; + rng.discard(new_n); + MT out = dist(rng); + return static_cast(out); + } +}; + +template +void GaussianRandomKernel(const Context& dev_ctx, + const ScalarArray& shape, + float mean, + float std, + int seed, + DataType dtype, + DenseTensor* out) { + auto tensor = out; + + bool seed_flag = false; + if (seed == 0) { + std::random_device rd; + seed = rd(); + seed_flag = true; + } + + tensor->Resize(phi::make_ddim(shape.GetData())); + + T* data = dev_ctx.template Alloc(tensor); + + int64_t size = tensor->numel(); + + int device_id = dev_ctx.GetPlace().GetDeviceId(); + auto gen_cuda = paddle::framework::GetDefaultCUDAGenerator(device_id); + + using MT = typename phi::kps::details::MPTypeTrait::Type; + if (gen_cuda->GetIsInitPy() && seed_flag) { + if (FLAGS_use_curand) { + funcs::normal_distribution dist; + funcs::normal_transform trans(mean, std); + funcs::distribution_and_transform(dev_ctx, tensor, dist, trans); + } else { + auto seed_offset = gen_cuda->IncrementOffset(1); + int64_t gen_offset = size * seed_offset.second; + auto func = + GaussianGenerator(mean, std, seed_offset.first, gen_offset); + IndexKernel>(dev_ctx, tensor, func); + } + } else { + auto func = GaussianGenerator(mean, std, seed); + IndexKernel>(dev_ctx, tensor, func); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gaussian_random, + GPU, + ALL_LAYOUT, + phi::GaussianRandomKernel, + phi::dtype::float16, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/pad_grad_kernel.cu b/paddle/phi/kernels/gpu/pad_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..a25472d122b837fcc3928af15e0678f0362abf0c --- /dev/null +++ b/paddle/phi/kernels/gpu/pad_grad_kernel.cu @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/pad_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/pad_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(pad_grad, + GPU, + ALL_LAYOUT, + phi::PadGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/pad_kernel.cu b/paddle/phi/kernels/gpu/pad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..2b77a5f1aeb6cb3f24f274beb6939c480022fe49 --- /dev/null +++ b/paddle/phi/kernels/gpu/pad_kernel.cu @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/complex.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/pad_kernel_impl.h" +#include "paddle/phi/kernels/pad_kernel.h" + +PD_REGISTER_KERNEL(pad, + GPU, + ALL_LAYOUT, + phi::PadKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/reduce.h b/paddle/phi/kernels/gpu/reduce.h index 9223a94c12aeb0912f634d1d5ca8b2c03653e8b9..94c2e980e36a1c6e1f7af3a92da6a7c0f0ed291c 100644 --- a/paddle/phi/kernels/gpu/reduce.h +++ b/paddle/phi/kernels/gpu/reduce.h @@ -178,6 +178,8 @@ struct IndexCalculator { : dim(dim) { dims = details::VectorToArray(cal_dims); strides = details::VectorToArray(full_strides); + reduce_strides = details::VectorToArray(cal_strides); +#ifndef PADDLE_WITH_XPU_KP std::vector cal_divmoders; // fast divmod for (auto i : cal_strides) { @@ -185,9 +187,22 @@ struct IndexCalculator { } divmoders = details::VectorToArray( cal_divmoders); +#endif } __device__ inline int operator()(int offset) const { +#ifdef PADDLE_WITH_XPU_KP + int index = 0; +#pragma unroll + for (int i = 0; i < kMaxRank; ++i) { + if (i == dim) { + break; + } + index += (offset / reduce_strides[i]) * strides[dims[i]]; + offset = offset % reduce_strides[i]; + } + return index; +#else int index = 0; #pragma unroll for (int i = 0; i < kMaxRank; ++i) { @@ -199,12 +214,16 @@ struct IndexCalculator { offset = divmod.val[1]; } return index; +#endif } int dim; phi::Array dims; phi::Array strides; + phi::Array reduce_strides; +#ifndef PADDLE_WITH_XPU2 phi::Array divmoders; +#endif }; template @@ -247,7 +266,7 @@ struct ReduceIndexMapping { __device__ __forceinline__ int BlockDimY() { #ifdef PADDLE_WITH_XPU2 - return dim.deal_size_y; + return 1; #else return blockDim.y; #endif @@ -454,10 +473,14 @@ struct ReduceConfig { bool is_last_dim = (rank == 2) && (reduce_rank == 1) && (reduce_dim[0] == 1); if (rank == reduce_rank || is_last_dim) { +#ifdef PADDLE_WITH_XPU_KP + reduce_type = static_cast(ReduceType::kReduceAny); +#else reduce_type = static_cast(ReduceType::kReduceLastDim); +#endif } else if (reduce_rank == 1) { // ReduceFirstDim and reduceSecondDim -#ifdef 
PADDLE_WITH_XPU2 +#ifdef PADDLE_WITH_XPU_KP if (reduce_dim[0] == 0) { reduce_type = static_cast(ReduceType::kReduceHigherDim); } else { @@ -471,6 +494,7 @@ struct ReduceConfig { } } +#ifndef PADDLE_WITH_XPU_KP void SetBlockDimForReduceAny(dim3* block_dim, dim3* grid_dim) { constexpr int min_reduce_num_per_thread = 16; constexpr int max_reduce_num_per_thread = 256; @@ -569,6 +593,7 @@ struct ReduceConfig { grid_dim->y = details::AlignUp(reduce_num, blocking_size); } } +#endif void SetBlockDim() { // init @@ -577,14 +602,14 @@ struct ReduceConfig { dim3 block_dim(block_num, 1, 1); dim3 grid_dim(left_num, 1, 1); blocking_size = reduce_num; -#ifdef PADDLE_WITH_XPU2 +#ifdef PADDLE_WITH_XPU_KP if (reduce_last_dim) { - block_dim.x = 128; + block_dim.x = 64; block_dim.y = reduce_num; - grid_dim.x = 8; - grid_dim.y = 1; + grid_dim.x = 1; + grid_dim.y = 8; } else { - block_dim.x = 128; + block_dim.x = 64; block_dim.y = left_num; grid_dim.x = 8; grid_dim.y = 1; @@ -661,7 +686,7 @@ __global__ void ReduceAnyKernel(const Tx* x, store_offset = block.BlockIdY() * left_num + left_idx; loop_left = min(block.GetLoopSize(), left_num - left_idx); stride_left = 1; - tid = threadIdx.x; + tid = THREAD_ID_X; } else { auto block = ReduceIndexMapping(dim); input_idx = block.BlockIdY() * block.BlockDimY(); @@ -672,18 +697,20 @@ __global__ void ReduceAnyKernel(const Tx* x, loop_left = min(block.GetLoopSize(), left_num - left_idx); stride_left = block.BlockDimX() * block.GridDimX(); store_offset = block.BlockIdY() * left_num + left_idx; - tid = threadIdx.y; + tid = THREAD_ID_Y; } // calculate the offset, means the addr where each thread really start. // 1. reduce for each thread MPType input_compute[REDUCE_VEC_SIZE]; Tx input_reg[REDUCE_VEC_SIZE]; + int input_idx_tmp = input_idx; for (int i = 0; i < loop_left; i += stride_left) { int input_offset = left_index_calculator(left_idx + i); - const Tx* input = x + input_offset; + const _ptr_ Tx* input = x + input_offset; MPType reduce_var = init; // load REDUCE_VEC_SIZE data once, and then compute int bound = reduce_num - (REDUCE_VEC_SIZE - 1) * stride; + input_idx = input_idx_tmp; for (; input_idx + block_size < bound; input_idx += REDUCE_VEC_SIZE * stride) { kps::ReadDataReduce config) { if (config.reduce_type == kReduceLastDim) { int stride_reduce = 1; @@ -855,23 +882,24 @@ static void LaunchReduceKernel(const Tx* x_data, 0); dim.SetRem(config.reduce_num % config.block.x, 0, 0); -#ifdef PADDLE_WITH_XPU2 +#ifdef PADDLE_WITH_XPU_KP ReduceAnyKernel<<<8, 128, stream>>>(x_data, - config.output_data, - reducer, - transform, - init, - config.reduce_num, - config.left_num, - config.reduce_last_dim, - reduce_index_calculator, - left_index_calculator, - dim); + OneDimIndexCal><<<8, 64, 0, stream>>>( + x_data, + config.output_data, + reducer, + transform, + init, + config.reduce_num, + config.left_num, + config.reduce_last_dim, + reduce_index_calculator, + left_index_calculator, + dim); #else ReduceAnyKernel<<<8, 128, stream>>>( + IndexCalculator><<<8, 64, 0, stream>>>( x_data, config.output_data, reducer, @@ -965,12 +993,13 @@ static void LaunchReduceKernel(const Tx* x_data, kps::DimConfig dim = kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0); dim.SetRem(config.left_num % block.x, 0, 0); -#ifdef PADDLE_WITH_XPU2 - ReduceHigherDimKernel><<<8, 128, stream>>>( +#ifdef PADDLE_WITH_XPU_KP + ReduceHigherDimKernel< + Ty, + Ty, + MPType, + ReduceOp, + kps::IdentityFunctor><<<8, 64, 0, stream>>>( config.output_data, y_data, reducer, @@ -1011,7 +1040,7 @@ 
CubTensorReduceImpl(const Tx* x_data, const TransformOp& transform, int reduce_num, const paddle::platform::Place& place, - gpuStream_t stream) { + KPStream stream) { auto reducer = ReduceOp(); cub::TransformInputIterator trans_x(x_data, transform); @@ -1054,7 +1083,7 @@ CubTensorReduceImpl(const Tx* x_data, const TransformOp& transform, int reduce_num, const paddle::platform::Place& place, - gpuStream_t stream) { + KPStream stream) { PADDLE_THROW(phi::errors::InvalidArgument( "Tx should not be float16 when using cub::DeviceReduce::Reduce().")); } @@ -1068,7 +1097,7 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx, phi::DenseTensor* y, const TransformOp& transform, const std::vector& origin_reduce_dims, - gpuStream_t stream) { + KPStream stream) { y->mutable_data(x.place()); auto x_dim = phi::vectorize(x.dims()); @@ -1098,11 +1127,13 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx, config.SetOutputData(y_data, x.place(), &tmp); constexpr bool kIsTxFP16 = std::is_same::value; bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16; +#ifndef PADDLE_WITH_XPU_KP if (use_cub_reduce) { CubTensorReduceImpl( x_data, y_data, transform, config.reduce_num, x.place(), stream); return; } +#endif using MPType = typename kps::details::MPTypeTrait::Type; auto reducer = ReduceOp(); @@ -1124,20 +1155,21 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx, config.reduce_num % config.blocking_size, 0); -#ifdef PADDLE_WITH_XPU2 +#ifdef PADDLE_WITH_XPU_KP ReduceHigherDimKernel, - TransformOp><<<8, 128, stream>>>(x_data, - config.output_data, - reducer, - transform, - reducer.initial(), - config.reduce_num, - config.left_num, - config.blocking_size, - dim); + TransformOp><<<8, 64, 0, stream>>>( + x_data, + config.output_data, + reducer, + transform, + reducer.initial(), + config.reduce_num, + config.left_num, + config.blocking_size, + dim); #else ReduceHigherDimKernel< Tx, @@ -1163,13 +1195,13 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx, kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0); dim2.SetRem(config.left_num % config.block.x, 0, 0); -#ifdef PADDLE_WITH_XPU2 +#ifdef PADDLE_WITH_XPU_KP ReduceHigherDimKernel< Ty, Ty, MPType, ReduceOp, - kps::IdentityFunctor><<<8, 128, stream>>>( + kps::IdentityFunctor><<<8, 64, 0, stream>>>( config.output_data, y_data, reducer, @@ -1212,7 +1244,7 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx, template class ReduceOp, template class TransformOp> -void Reduce(const GPUContext& dev_ctx, +void Reduce(const KPDevice& dev_ctx, const DenseTensor& x, bool reduce_all, const std::vector& dims, @@ -1227,7 +1259,7 @@ void Reduce(const GPUContext& dev_ctx, reduce_num *= (x.dims())[i]; } - gpuStream_t stream = dev_ctx.stream(); + KPStream stream = dev_ctx.stream(); if (out_dtype != phi::DataType::UNDEFINED && out_dtype != x.dtype()) { auto tmp_tensor = phi::Cast(dev_ctx, x, out_dtype); diff --git a/paddle/phi/kernels/impl/pad_grad_kernel_impl.h b/paddle/phi/kernels/impl/pad_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..91f198f9fb681e4fabf7029fcc22343bb81953fd --- /dev/null +++ b/paddle/phi/kernels/impl/pad_grad_kernel_impl.h @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/padding.h" +namespace phi { +template +void PadGradKernel(const Context& dev_ctx, + const DenseTensor& d_out, + const std::vector& paddings, + float pad_value, + DenseTensor* d_x) { + if (d_x == nullptr) { + return; + } + dev_ctx.template Alloc(d_x); + int rank = d_out.dims().size(); + phi::funcs::PaddingGradFunctor( + rank, dev_ctx, paddings, d_out, d_x); +} +} // namespace phi diff --git a/paddle/phi/kernels/impl/pad_kernel_impl.h b/paddle/phi/kernels/impl/pad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..8e3ebb0dfe03b2f13e2a321bb813f7d10e306b7a --- /dev/null +++ b/paddle/phi/kernels/impl/pad_kernel_impl.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/padding.h" +namespace phi { +template +void PadKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& paddings, + float pad_value, + DenseTensor* out) { + dev_ctx.template Alloc(out); + int rank = x.dims().size(); + funcs::PaddingFunctor( + rank, dev_ctx, paddings, static_cast(pad_value), x, out); +} +} // namespace phi diff --git a/paddle/phi/kernels/pad_grad_kernel.h b/paddle/phi/kernels/pad_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..f39d87e5c0ef6503d772e5f9ee95e307a13eda13 --- /dev/null +++ b/paddle/phi/kernels/pad_grad_kernel.h @@ -0,0 +1,28 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
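// Illustration (not part of this patch): the shape arithmetic behind the
// PadKernel/PadGradKernel implementations above, assuming the usual fluid
// layout of the "paddings" attribute, {before_0, after_0, before_1, after_1,
// ...}; the backward pass simply slices the padded region back off. Names are
// hypothetical.
#include <cstdio>
#include <vector>

static std::vector<long long> PaddedShape(const std::vector<long long>& dims,
                                          const std::vector<int>& paddings) {
  std::vector<long long> out(dims.size());
  for (size_t i = 0; i < dims.size(); ++i) {
    out[i] = dims[i] + paddings[2 * i] + paddings[2 * i + 1];
  }
  return out;
}

int main() {
  // A 2 x 3 tensor padded by one element on every side becomes 4 x 5.
  std::vector<long long> out = PaddedShape({2, 3}, {1, 1, 1, 1});
  std::printf("%lld x %lld\n", out[0], out[1]);
  return 0;
}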
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void PadGradKernel(const Context& dev_ctx, + const DenseTensor& d_out, + const std::vector& paddings, + float pad_value, + DenseTensor* d_x); +} // namespace phi diff --git a/paddle/phi/kernels/pad_kernel.h b/paddle/phi/kernels/pad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..511e8cf73df97ffb250b1106aa98155de33a97d1 --- /dev/null +++ b/paddle/phi/kernels/pad_kernel.h @@ -0,0 +1,28 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void PadKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& paddings, + float pad_value, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/kernels/sparse/CMakeLists.txt b/paddle/phi/kernels/sparse/CMakeLists.txt index 3e4a968b7a8a569fc518366c321a57e2738bf12a..a319e9a13c3f7e62d7670cc3a34d837cba11b080 100644 --- a/paddle/phi/kernels/sparse/CMakeLists.txt +++ b/paddle/phi/kernels/sparse/CMakeLists.txt @@ -1,3 +1,3 @@ -set(SPARSE_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils) +set(SPARSE_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils math_function) register_kernels(DEPS ${SPARSE_KERNEL_DEPS} SUB_DIR "sparse_kernel") diff --git a/paddle/phi/kernels/sparse/cpu/convolution.h b/paddle/phi/kernels/sparse/cpu/convolution.h index ab2fef5320f716b6bc780ad14b8e2adef44427dd..1031f76917920adba26ec75d166f18d85435be70 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution.h +++ b/paddle/phi/kernels/sparse/cpu/convolution.h @@ -107,7 +107,9 @@ void ProductRuleBook(const Context& dev_ctx, f_calc_rulebook(nullptr); // alloc the rulebook - rulebook->ResizeAndAllocate({3, rulebook_len}); + DenseTensorMeta rulebook_meta( + DataType::INT32, {3, rulebook_len}, DataLayout::NCHW); + rulebook->set_meta(rulebook_meta); dev_ctx.Alloc(rulebook, rulebook->dtype(), rulebook->numel() * sizeof(int)); int* rulebook_ptr = rulebook->data(); f_calc_rulebook(rulebook_ptr); diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc index fdf255bd542e66245b44b2ec906dc207ee51a422..93397d4c9310007e99500bdb77a6d14a2fb0d4c2 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc @@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/phi/kernels/sparse/convolution_kernel.h" +#include "paddle/phi/kernels/sparse/cpu/convolution.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/sparse/cpu/convolution.h" namespace phi { namespace sparse { @@ -55,7 +54,6 @@ void Conv3dKernel(const Context& dev_ctx, // 1. product rulebook DenseTensorMeta counter_meta( DataType::INT32, {kernel_size}, DataLayout::NCHW); - // DenseTensor rulebook = phi::Empty(dev_ctx); DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); ProductRuleBook(dev_ctx, diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..aeb9409c417ba926deffa8e5af1310ce5205418e --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -0,0 +1,612 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include + +#include "glog/logging.h" +#include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/primitive/compute_primitives.h" +#include "paddle/phi/kernels/sparse/convolution_kernel.h" + +namespace phi { +namespace sparse { + +// TODO(zhangkaihuo) replace this kernel with KP::InitWithDataIndex +__global__ void InitByIndexKernel(const int n, int* out1, int* out2) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + for (int i = tid; i < n; i += gridDim.x * blockDim.x) { + out1[i] = i; + out2[i] = i; + } +} + +/** + * @brief: update the out index and indices + * unique_keys: save the index of the output feature list + * unique_values: indiates the index of key before deduplication + * out_indexs: indicates the position of the output index in the rulebook + * rulebook_len: indicates the length of rulebook + * out_dims: indicates the output dims + * out_indices: the indices of output, out_indices = IndexToPoint(unique_keys) + * rulebook_out_indexs: the output index in rulebook +**/ +__global__ void UpdateIndexKernel(const int* unique_keys, + const int* unique_values, + const int* out_indexs, + const int non_zero_num, + const int rulebook_len, + const Dims4D out_dims, + int* out_indices, + int* rulebook_out_indexs) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { + const int index = unique_keys[i]; + int batch, x, y, z; + IndexToPoint(index, out_dims, &batch, &x, &y, &z); + // get 
out indices + out_indices[i] = batch; + out_indices[i + non_zero_num] = z; + out_indices[i + non_zero_num * 2] = y; + out_indices[i + non_zero_num * 3] = x; + + // update rulebook + int start = unique_values[i]; + int end = i == non_zero_num - 1 ? rulebook_len : unique_values[i + 1]; + // max(end-start) = kernel_size + for (int j = start; j < end; j++) { + rulebook_out_indexs[out_indexs[j]] = i; + } + } +} + +/** + * @brief product rulebook + * for input_i in x_indices: + * if input_i participate in the convolution calculation: + * infer the output_i by input_i and kernel_i + * save output_i + * + * x_indices: the indices of input features + * x_dims: the input dims + * kernel_dims: the kernel dims + * out_dims: the output dims + * non_zero_num: the number of input features + * rulebook: the rulebook to save the kernel index, input index and output index + * counter: save the number of times each location in the kernel participates in + *the caculation +**/ +__global__ void ProductRuleBookKernel(const int* x_indices, + const Dims4D x_dims, + const Dims4D kernel_dims, + const Dims4D out_dims, + const int64_t non_zero_num, + const Dims4D paddings, + const Dims4D dilations, + const Dims4D strides, + int* rulebook, + int* counter) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + extern __shared__ int counter_buf[]; // kernel_size + const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1]; + const int offset = kernel_size * non_zero_num; + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + counter_buf[i] = 0; + } + __syncthreads(); + + for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { + int kernel_index = 0; + for (int kz = 0; kz < kernel_dims[1]; kz++) { + for (int ky = 0; ky < kernel_dims[2]; ky++) { + for (int kx = 0; kx < kernel_dims[3]; kx++) { + int batch = x_indices[i]; + int in_z = x_indices[i + non_zero_num]; + int in_y = x_indices[i + 2 * non_zero_num]; + int in_x = x_indices[i + 3 * non_zero_num]; + int in_i = -1, out_index = -1; + if (Check(x_dims, + kernel_dims, + paddings, + dilations, + strides, + in_x, + in_y, + in_z, + kx, + ky, + kz)) { + int out_z = (in_z + paddings[1] - kz * dilations[1]) / strides[1]; + int out_y = (in_y + paddings[2] - ky * dilations[2]) / strides[2]; + int out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3]; + in_i = i; + out_index = + PointToIndex(batch, out_x, out_y, out_z, out_dims); + atomicAdd(&counter_buf[kernel_index], 1); + } + rulebook[kernel_index * non_zero_num + i] = in_i; + rulebook[kernel_index * non_zero_num + offset + i] = out_index; + ++kernel_index; + } + } + } + } + __syncthreads(); + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + atomicAdd(&counter[i], counter_buf[i]); + } +} + +// TODO(zhangkaihuo): After the GatherCUDAKernel is migrated to phi, replace +// this kernel with phi::GatherCUDAKernel; +// Vectorization can be used to improve read and write bandwidth +/** + * brief: gather data from params according to indices + * params: the inputs + * indices: the indices you want to gather + * output: the outputs + * index_size: the size of indices + * slice_size: slice size corresponding to each index, here is the channel size +**/ +template +__global__ void GatherKernel(const T* params, + const IndexT* indices, + T* output, + size_t index_size, + size_t slice_size) { + CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { + int64_t indices_i = i / slice_size; + int64_t slice_i = i - indices_i * slice_size; // offset inside the slice + IndexT gather_i 
= indices[indices_i]; + int64_t params_i = gather_i * slice_size + slice_i; + *(output + i) = *(params + params_i); + } +} + +/** + * brief: scatter add + * input: the inputs + * unique_value: refer to UpdateIndexKernel notes + * out_index: the output feature index + * non_zero_num: the number of output features + * rulebook_len: the length of rulebook + * channels: the output channel size + * out: the outputs +**/ +template +__global__ void ScatterKernel(const T* input, + const int* unique_value, + const int* out_index, + const int non_zero_num, + const int rulebook_len, + const int channels, + T* out) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + for (int i = tid; i < non_zero_num * channels; i += gridDim.x * blockDim.x) { + int indices_i = i / channels; + int channels_i = i - indices_i * channels; + + int start = unique_value[indices_i]; + int end = indices_i == non_zero_num - 1 ? rulebook_len + : unique_value[indices_i + 1]; + // max(end-start) = kernel_size + T sum = static_cast(0); + for (int j = start; j < end; j++) { + const int out_feature_i = out_index[j]; + sum += input[out_feature_i * channels + channels_i]; + } + out[indices_i * channels + channels_i] = sum; + } +} + +// brief: calculation the distance between start and end +__global__ void DistanceKernel(const int* start, + const int* end, + int* distance) { + if (threadIdx.x == 0) { + *distance = end - start; + } +} + +// the basic algorithm can refer to convolution_kernel.cc or +// the second paper +// example: +// 1. the rulebook: +// the kernel_index: 0, 0, 0, 1, 1, 1, 2, 2, .... +// the out_index(key): 20, 30, 33, 30, 33, 20, 25 +// 2. mark the index of out_index(value): 0, 1, 2, 3, 4, 5, 6, .... +// 3. sorted the (key, value) +// 4. unique the (key, value): +// unique_key: 20, 25, 30, 33 +// unique_values: 0, 2, 3, 5 +// the index of unique_values is: 0, 1, 2, 3 +// 5. 
update the out_index by unique_key, uniqe_value and the index of +// unique_value: +// the new out_index: 0, 2, 3, 2, 3, 0, 1 +template +int ProductRuleBook(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const DDim& out_dims, + DenseTensor* rulebook, + DenseTensor* counter_per_kernel, + DenseTensor* offsets_per_kernel, + DenseTensor* out_index, + DenseTensor* unique_key, + DenseTensor* unique_value, + SparseCooTensor* out, + std::vector* h_counter, + std::vector* h_offsets) { + const auto& kernel_dims = kernel.dims(); + const int64_t non_zero_num = x.nnz(); + const auto& non_zero_indices = x.non_zero_indices(); + const int* indices_ptr = non_zero_indices.data(); + dev_ctx.Alloc(counter_per_kernel, + counter_per_kernel->dtype(), + sizeof(int) * counter_per_kernel->numel()); + int* counter_ptr = counter_per_kernel->data(); + dev_ctx.Alloc(offsets_per_kernel, + offsets_per_kernel->dtype(), + sizeof(int) * offsets_per_kernel->numel()); + int* offsets_ptr = offsets_per_kernel->data(); + int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; + rulebook->ResizeAndAllocate({2, kernel_size * non_zero_num}); + dev_ctx.Alloc(rulebook, rulebook->dtype(), sizeof(int) * rulebook->numel()); + int* rulebook_ptr = rulebook->data(); + + const auto x_dims = x.dims(); + Dims4D d_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]); + Dims4D d_kernel_dims(1, kernel_dims[2], kernel_dims[1], kernel_dims[0]); + Dims4D d_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]); + Dims4D d_paddings(1, paddings[2], paddings[1], paddings[0]); + Dims4D d_strides(1, strides[2], strides[1], strides[0]); + Dims4D d_dilations(1, dilations[2], dilations[1], dilations[0]); + + // 1. product rule book + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, counter_per_kernel, 0); + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); + + ProductRuleBookKernel<<>>(indices_ptr, + d_x_dims, + d_kernel_dims, + d_out_dims, + non_zero_num, + d_paddings, + d_dilations, + d_strides, + rulebook_ptr, + counter_ptr); + +// 2. remove -1 +#ifdef PADDLE_WITH_HIP + int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), +#else + int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), +#endif + rulebook_ptr, + rulebook_ptr + 2 * kernel_size * non_zero_num, + -1); + +#ifdef PADDLE_WITH_HIP + thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), +#endif + counter_ptr, + counter_ptr + kernel_size, + offsets_ptr); + +#ifdef PADDLE_WITH_HIP + phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], + counter_ptr, + kernel_size * sizeof(int), + hipMemcpyDeviceToHost, + dev_ctx.stream()); + phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], + offsets_ptr, + kernel_size * sizeof(int), + hipMemcpyDeviceToHost, + dev_ctx.stream()); +#else + phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], + counter_ptr, + kernel_size * sizeof(int), + cudaMemcpyDeviceToHost, + dev_ctx.stream()); + phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], + offsets_ptr, + kernel_size * sizeof(int), + cudaMemcpyDeviceToHost, + dev_ctx.stream()); +#endif + dev_ctx.Wait(); + int rulebook_len = + (*h_counter)[kernel_size - 1] + (*h_offsets)[kernel_size - 1]; + + // 3. 
sorted or merge the out index + out_index->ResizeAndAllocate({rulebook_len}); + unique_value->ResizeAndAllocate({rulebook_len}); + unique_key->ResizeAndAllocate({rulebook_len}); + dev_ctx.Alloc( + out_index, out_index->dtype(), sizeof(int) * out_index->numel()); + int* out_index_ptr = out_index->data(); + dev_ctx.Alloc( + unique_value, unique_value->dtype(), sizeof(int) * unique_value->numel()); + int* unique_value_ptr = unique_value->data(); + dev_ctx.Alloc( + unique_key, unique_key->dtype(), sizeof(int) * unique_key->numel()); + int* unique_key_ptr = unique_key->data(); + + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); + InitByIndexKernel<<>>( + rulebook_len, out_index_ptr, unique_value_ptr); + +#ifdef PADDLE_WITH_HIP + phi::backends::gpu::GpuMemcpyAsync(unique_key_ptr, + rulebook_ptr + rulebook_len, + rulebook_len * sizeof(int), + hipMemcpyDeviceToDevice, + dev_ctx.stream()); +#else + phi::backends::gpu::GpuMemcpyAsync(unique_key_ptr, + rulebook_ptr + rulebook_len, + rulebook_len * sizeof(int), + cudaMemcpyDeviceToDevice, + dev_ctx.stream()); +#endif + +// compared with thrust::sort_by_key, thrust::merge_by_key may achieved higher +// performance, but thrust::merge_by_key limited by data size +#ifdef PADDLE_WITH_HIP + thrust::sort_by_key(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::sort_by_key(thrust::cuda::par.on(dev_ctx.stream()), +#endif + unique_key_ptr, + unique_key_ptr + rulebook_len, + out_index_ptr); + + // 4. unique + thrust::pair new_end = +#ifdef PADDLE_WITH_HIP + thrust::unique_by_key(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::unique_by_key(thrust::cuda::par.on(dev_ctx.stream()), +#endif + unique_key_ptr, + unique_key_ptr + rulebook_len, + unique_value_ptr); + // thrust::distance doesn't support stream parameters + // const int out_non_zero_num = thrust::distance(unique_key_ptr, + // new_end.first); + DistanceKernel<<<1, 1>>>(unique_key_ptr, + new_end.first, + rulebook_ptr + 2 * kernel_size * non_zero_num - 1); + int out_non_zero_num = 0; +#ifdef PADDLE_WITH_HIP + phi::backends::gpu::GpuMemcpyAsync( + &out_non_zero_num, + rulebook_ptr + 2 * kernel_size * non_zero_num - 1, + sizeof(int), + hipMemcpyDeviceToHost, + dev_ctx.stream()); +#else + phi::backends::gpu::GpuMemcpyAsync( + &out_non_zero_num, + rulebook_ptr + 2 * kernel_size * non_zero_num - 1, + sizeof(int), + cudaMemcpyDeviceToHost, + dev_ctx.stream()); +#endif + dev_ctx.Wait(); + + // 5. 
update out_indices and rulebook by unique_value_ptr + const int64_t sparse_dim = 4; + DenseTensorMeta indices_meta( + DataType::INT32, {sparse_dim, out_non_zero_num}, DataLayout::NCHW); + DenseTensorMeta values_meta( + x.dtype(), {out_non_zero_num, kernel_dims[4]}, x.layout()); + phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta)); + phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta)); + + dev_ctx.Alloc( + &out_indices, out_indices.dtype(), sizeof(int) * out_indices.numel()); + int* out_indices_ptr = out_indices.data(); + + config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1); + UpdateIndexKernel<<>>(unique_key_ptr, + unique_value_ptr, + out_index_ptr, + out_non_zero_num, + rulebook_len, + d_out_dims, + out_indices_ptr, + rulebook_ptr + rulebook_len); + out->SetMember(out_indices, out_values, out_dims, true); + return rulebook_len; +} + +/** + * x: (N, D, H, W, C) + * kernel: (D, H, W, C, OC) + * out: (N, D, H, W, OC) +**/ +template +void Conv3dKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + SparseCooTensor* out, + DenseTensor* rulebook) { + // update padding and dilation + // Currently, only support x.layout is NDHWC, groups = 1 + // if x.layout != NDHWC then transpose(x), transpose(weight) + + const auto& x_dims = x.dims(); + const auto& kernel_dims = kernel.dims(); + int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; + DDim out_dims = {1, 1, 1, 1, 1}; + GetOutShape(x_dims, kernel_dims, paddings, dilations, strides, &out_dims); + out->set_dims(out_dims); + const int in_channels = kernel_dims[3]; + const int out_channels = kernel_dims[4]; + std::vector offsets(kernel_size + 1), h_counter(kernel_size); + + // Second algorithm: + // https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf + // 1. product rulebook + DenseTensorMeta counter_meta( + DataType::INT32, {kernel_size}, DataLayout::NCHW); + DenseTensorMeta offsets_meta( + DataType::INT32, {kernel_size}, DataLayout::NCHW); + DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); + DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(offsets_meta)); + DenseTensor out_index = phi::Empty(dev_ctx); + DenseTensor unique_key = phi::Empty(dev_ctx); + DenseTensor unique_value = phi::Empty(dev_ctx); + + int n = ProductRuleBook(dev_ctx, + x, + kernel, + paddings, + dilations, + strides, + out_dims, + rulebook, + &counter_per_kernel, + &offsets_per_kernel, + &out_index, + &unique_key, + &unique_value, + out, + &h_counter, + &offsets); + + const int* counter_ptr = counter_per_kernel.data(); + const int* offsets_ptr = counter_per_kernel.data(); + + // 2. 
gather + DenseTensorMeta in_features_meta( + x.dtype(), {n, in_channels}, DataLayout::NCHW); + DenseTensorMeta out_features_meta( + x.dtype(), {n, out_channels}, DataLayout::NCHW); + phi::DenseTensor in_features = + phi::Empty(dev_ctx, std::move(in_features_meta)); + phi::DenseTensor out_features = + phi::Empty(dev_ctx, std::move(out_features_meta)); + dev_ctx.Alloc( + &in_features, in_features.dtype(), sizeof(T) * in_features.numel()); + T* in_features_ptr = in_features.data(); + dev_ctx.Alloc( + &out_features, out_features.dtype(), sizeof(T) * out_features.numel()); + T* out_features_ptr = out_features.data(); + + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1); + GatherKernel<<>>(x.non_zero_elements().data(), + rulebook->data(), + in_features_ptr, + n, + in_channels); + + // 3. call gemm for every werght + auto blas = phi::funcs::GetBlas(dev_ctx); + auto* out_values = out->mutable_non_zero_elements(); + dev_ctx.Alloc( + out_values, out_values->dtype(), sizeof(T) * out_values->numel()); + T* out_values_ptr = out_values->data(); + + const T* kernel_ptr = kernel.data(); + for (int i = 0; i < kernel_size; i++) { + if (h_counter[i] <= 0) { + continue; + } + + // call gemm: (n, in_channels) * (in_channels, out_channels) + const int M = h_counter[i]; + const int K = in_channels; + const int N = out_channels; + T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels; + const T* tmp_kernel_ptr = kernel_ptr + i * K * N; + T* tmp_out_ptr = out_features_ptr + offsets[i] * out_channels; + + blas.GEMM(CblasNoTrans, + CblasNoTrans, + M, + N, + K, + static_cast(1), + tmp_in_ptr, + tmp_kernel_ptr, + static_cast(0), + tmp_out_ptr); + } + + // 4. scatter + config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, out->nnz() * out_channels, 1); + ScatterKernel<<>>(out_features_ptr, + unique_value.data(), + out_index.data(), + out->nnz(), + n, + out_channels, + out_values_ptr); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_conv3d, + GPU, + ALL_LAYOUT, + phi::sparse::Conv3dKernel, + float, + double, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/ops/compat/batch_norm_sig.cc b/paddle/phi/ops/compat/batch_norm_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..011d4c12ecefc5b69eec4bf15425aaa648666159 --- /dev/null +++ b/paddle/phi/ops/compat/batch_norm_sig.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
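// Illustration (not part of this patch): a scalar CPU sketch of the
// gather -> GEMM -> scatter-add pattern that the sparse Conv3d kernel above
// implements with GatherKernel, blas.GEMM and ScatterKernel. The real kernel
// runs one GEMM per kernel offset over contiguous rulebook segments; the names
// below are hypothetical.
#include <cstdio>
#include <vector>

// features: [nnz, c_in] row-major, weight: [c_in, c_out] row-major,
// out: [n_out, c_out] row-major, pre-zeroed by the caller.
static void GatherGemmScatter(const std::vector<float>& features,
                              const std::vector<int>& in_index,
                              const std::vector<int>& out_index,
                              const std::vector<float>& weight,
                              int c_in, int c_out, std::vector<float>* out) {
  for (size_t r = 0; r < in_index.size(); ++r) {
    const float* x = &features[in_index[r] * c_in];  // gather one input row
    float* y = &(*out)[out_index[r] * c_out];
    for (int j = 0; j < c_out; ++j) {
      float acc = 0.0f;
      for (int k = 0; k < c_in; ++k) acc += x[k] * weight[k * c_out + j];
      y[j] += acc;  // scatter-add: several rulebook rows can hit one output
    }
  }
}

int main() {
  std::vector<float> features = {1.0f, 2.0f};  // nnz = 2, c_in = 1
  std::vector<float> weight = {0.5f};          // c_in = c_out = 1
  std::vector<float> out(1, 0.0f);             // one output site
  GatherGemmScatter(features, {0, 1}, {0, 0}, weight, 1, 1, &out);
  std::printf("%f\n", out[0]);  // 1.500000
  return 0;
}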
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature BatchNormOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("batch_norm", + {"X", "Scale", "Bias", "Mean", "Variance"}, + {"momentum", + "epsilon", + "data_layout", + "is_test", + "use_global_stats", + "trainable_statistics", + "fuse_with_relu"}, + {"Y", + "MeanOut", + "VarianceOut", + "SavedMean", + "SavedVariance", + "ReserveSpace"}); +} + +KernelSignature BatchNormGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "batch_norm_grad", + {GradVarName("Y"), + "X", + "Scale", + "Bias", + "SavedMean", + "SavedVariance", + "ReserveSpace", + "Mean", + "Variance"}, + {"momentum", + "epsilon", + "data_layout", + "is_test", + "use_global_stats", + "trainable_statistics", + "fuse_with_relu"}, + {GradVarName("X"), GradVarName("Scale"), GradVarName("Bias")}); +} + +KernelSignature BatchNormGradGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("batch_norm_grad_grad", + {"DDX", + "DDScale", + "DDBias", + "DY", + "X", + "Scale", + "SavedMean", + "SavedVariance", + "Mean", + "Variance"}, + {"momentum", + "epsilon", + "data_layout", + "is_test", + "use_global_stats", + "trainable_statistics", + "fuse_with_relu"}, + {"DX", "DScale", "DDY"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(batch_norm, phi::BatchNormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(batch_norm_grad, + phi::BatchNormGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(batch_norm_grad_grad, + phi::BatchNormGradGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/gaussian_random_sig.cc b/paddle/phi/ops/compat/gaussian_random_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..cddcb80ebea3ddcae345789497ca8006301f7a6e --- /dev/null +++ b/paddle/phi/ops/compat/gaussian_random_sig.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature GaussianRandomOpArgumentMapping( + const ArgumentMappingContext& ctx) { + if (ctx.InputSize("ShapeTensorList") > 0) { + return KernelSignature("gaussian_random", + {}, + {"ShapeTensorList", "mean", "std", "seed", "dtype"}, + {"Out"}); + } + + const auto& shape = paddle::any_cast>(ctx.Attr("shape")); + if (ctx.HasInput("ShapeTensor") && shape.empty()) { + return KernelSignature("gaussian_random", + {}, + {"ShapeTensor", "mean", "std", "seed", "dtype"}, + {"Out"}); + } + + return KernelSignature("gaussian_random", + {}, + {"shape", "mean", "std", "seed", "dtype"}, + {"Out"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(gaussian_random, + phi::GaussianRandomOpArgumentMapping); diff --git a/paddle/phi/ops/compat/pad_sig.cc b/paddle/phi/ops/compat/pad_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..4eadbfa98beded121c4e6738384487a9ec10be42 --- /dev/null +++ b/paddle/phi/ops/compat/pad_sig.cc @@ -0,0 +1,28 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature PadGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("pad_grad", + {GradVarName("Out")}, + {"paddings", "pad_value"}, + {GradVarName("X")}); +} + +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(pad_grad, phi::PadGradOpArgumentMapping); diff --git a/paddle/phi/tests/api/CMakeLists.txt b/paddle/phi/tests/api/CMakeLists.txt index cde085423e482e62a280815700ead9a0b6c64262..be12960d1d675b46987996713a0631399d1f0652 100644 --- a/paddle/phi/tests/api/CMakeLists.txt +++ b/paddle/phi/tests/api/CMakeLists.txt @@ -25,3 +25,4 @@ cc_test(test_concat_api SRCS test_concat_api.cc DEPS phi_tensor phi_api phi_api_ cc_test(test_split_api SRCS test_split_api.cc DEPS phi_tensor phi_api phi_api_utils) cc_test(test_data_transform SRCS test_data_transform.cc DEPS phi_tensor phi_api phi_api_utils) cc_test(test_sparse_utils_api SRCS test_sparse_utils_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_sparse_conv_api SRCS test_sparse_conv_api.cc DEPS phi_tensor phi_api phi_api_utils) diff --git a/paddle/phi/tests/api/test_sparse_conv_api.cc b/paddle/phi/tests/api/test_sparse_conv_api.cc new file mode 100644 index 0000000000000000000000000000000000000000..16d7cb66f4cc5f14abf31bb0a16d58c266bc15fb --- /dev/null +++ b/paddle/phi/tests/api/test_sparse_conv_api.cc @@ -0,0 +1,174 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See +the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/phi/api/include/api.h" + +#include "paddle/phi/api/include/sparse_api.h" + +#include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/sparse_coo_tensor.h" + +template +void TestConv3dBase(const std::vector& indices, + const std::vector& features, + const phi::DDim& x_dims, + const std::vector& kernel, + const phi::DDim& kernel_dims, + const std::vector& correct_out_indices, + const std::vector& correct_out_features, + const phi::DDim& correct_out_dims, + const int non_zero_num, + const std::vector& paddings, + const std::vector& strides, + const std::vector& dilations, + const float diff = 1e-3) { + const auto alloc = std::make_unique( + paddle::platform::CPUPlace()); + + const int in_channels = kernel_dims[3]; + const int out_channels = kernel_dims[4]; + + phi::DenseTensor indices_tensor( + alloc.get(), + phi::DenseTensorMeta( + phi::DataType::INT32, {4, non_zero_num}, phi::DataLayout::NCHW)); + memcpy( + indices_tensor.data(), indices.data(), indices.size() * sizeof(int)); + + phi::DenseTensor features_tensor( + alloc.get(), + phi::DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), + {non_zero_num, in_channels}, + phi::DataLayout::NHWC)); + memcpy( + features_tensor.data(), features.data(), features.size() * sizeof(T)); + + auto x_tensor = std::make_shared( + indices_tensor, features_tensor, x_dims); + paddle::experimental::Tensor x(x_tensor); + + auto kernel_tensor = std::make_shared( + alloc.get(), + phi::DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), + kernel_dims, + phi::DataLayout::NHWC)); + paddle::experimental::Tensor weight(kernel_tensor); + + memcpy(kernel_tensor->mutable_data(paddle::platform::CPUPlace()), + kernel.data(), + kernel.size() * sizeof(T)); + + if (!std::is_same::value) { + auto outs = paddle::experimental::sparse::conv3d( + x, weight, paddings, dilations, strides, 1); + + auto out = std::dynamic_pointer_cast( + std::get<0>(outs).impl()); + ASSERT_EQ(correct_out_dims.size(), out->dims().size()); + for (int i = 0; i < correct_out_dims.size(); i++) { + ASSERT_EQ(correct_out_dims[i], out->dims()[i]); + } + ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, out->nnz()); + + int cmp_indices = memcmp(correct_out_indices.data(), + out->non_zero_indices().data(), + correct_out_indices.size() * sizeof(int)); + ASSERT_EQ(cmp_indices, 0); + + for (uint64_t i = 0; i < correct_out_features.size(); i++) { + float tmp = std::fabs(static_cast( + correct_out_features[i] - out->non_zero_elements().data()[i])); + ASSERT_LT(tmp, diff); + } + } +} + +void TestConv3d(const std::vector& indices, + const std::vector& features, + const phi::DDim& x_dims, + const std::vector& kernel, + const phi::DDim& kernel_dims, + const std::vector& correct_out_indices, + const std::vector& correct_out_features, + const phi::DDim& correct_out_dims, + const int non_zero_num, + const std::vector& paddings, + const std::vector& strides, + const std::vector& dilations) { + // test float + TestConv3dBase(indices, + features, 
+ x_dims, + kernel, + kernel_dims, + correct_out_indices, + correct_out_features, + correct_out_dims, + non_zero_num, + paddings, + strides, + dilations); +} + +TEST(API, sparse_conv2d) { + const auto alloc = std::make_shared( + paddle::platform::CPUPlace()); + const int in_channels = 1; + const int out_channels = 1; + phi::DDim x_dims = {1, 1, 5, 5, in_channels}; + phi::DDim kernel_dims = {1, 3, 3, in_channels, out_channels}; + phi::DDim out_dims = {1, 1, 3, 3, out_channels}; + std::vector paddings = {0, 0, 0}; + std::vector strides = {1, 1, 1}; + std::vector dilations = {1, 1, 1}; + + const int non_zero_num = 3; + std::vector indices_flatten = {0, 0, 0, 0, 0, 0, 0, 4, 0, 3, 2, 4}; + + std::vector features = {-0.79394531, -0.3125, -0.55029297}; + // 3*3*3=27 + std::vector kernel = {0.65820312, + 0.75048828, + 0.21411133, + 0.17370605, + 0.85546875, + 0.53076172, + 0.28833008, + 0.71044922, + 0.00659943}; + + std::vector out_indices_flatten = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 2, 2, 1, 2, 0, 1, 2}; + + std::vector out_features = { + -0.17004, -0.71338, -0.00206, -0.22205, -0.09009}; + + TestConv3d(indices_flatten, + features, + x_dims, + kernel, + kernel_dims, + out_indices_flatten, + out_features, + out_dims, + non_zero_num, + paddings, + strides, + dilations); +} diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index 00b2a256a9504595dd8ac4ffd492564557f2d783..ace95b55055a1e704630ed246b90811dac277421 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/place.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" @@ -151,6 +152,107 @@ void TestConv3dBase(const std::vector& indices, f_verify(grads[1].data(), kernel_grad); } } + +// test gpu +#if defined(PADDLE_WITH_CUDA) + phi::GPUContext dev_ctx_gpu; + dev_ctx_gpu.PartialInitWithoutAllocator(); + dev_ctx_gpu.SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(dev_ctx_gpu.GetPlace(), dev_ctx_gpu.stream()) + .get()); + dev_ctx_gpu.SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get()); + dev_ctx_gpu.PartialInitWithAllocator(); + + DenseTensor d_indices_tensor = phi::Empty( + dev_ctx_gpu, + DenseTensorMeta(DataType::INT32, {4, non_zero_num}, DataLayout::NCHW)); + dev_ctx_gpu.Alloc(&d_indices_tensor, + d_indices_tensor.dtype(), + sizeof(int) * d_indices_tensor.numel()); + phi::Copy( + dev_ctx_gpu, indices_tensor, phi::GPUPlace(), true, &d_indices_tensor); + + DenseTensor d_features_tensor = phi::Empty( + dev_ctx_gpu, + DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), + {non_zero_num, in_channels}, + DataLayout::NHWC)); + dev_ctx_gpu.Alloc(&d_features_tensor, + d_features_tensor.dtype(), + sizeof(T) * d_features_tensor.numel()); + phi::Copy( + dev_ctx_gpu, features_tensor, phi::GPUPlace(), true, &d_features_tensor); + + SparseCooTensor d_x_tensor(d_indices_tensor, d_features_tensor, x_dims); + + DenseTensor d_kernel_tensor = phi::Empty( + dev_ctx_gpu, + DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), + kernel_dims, + DataLayout::NHWC)); + dev_ctx_gpu.Alloc(&d_kernel_tensor, + d_kernel_tensor.dtype(), + sizeof(T) * d_kernel_tensor.numel()); + phi::Copy( + 
dev_ctx_gpu, kernel_tensor, phi::GPUPlace(), true, &d_kernel_tensor); + + DenseTensor d_rulebook = phi::Empty(dev_ctx_gpu); + SparseCooTensor d_out = sparse::Conv3d(dev_ctx_gpu, + d_x_tensor, + d_kernel_tensor, + paddings, + dilations, + strides, + 1, + &d_rulebook); + + ASSERT_EQ(correct_out_dims.size(), d_out.dims().size()); + ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, d_out.nnz()); + for (int i = 0; i < correct_out_dims.size(); i++) { + ASSERT_EQ(correct_out_dims[i], d_out.dims()[i]); + } + + DenseTensor h_indices_tensor = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta(DataType::INT32, {4, d_out.nnz()}, DataLayout::NCHW)); + dev_ctx_cpu.Alloc(&h_indices_tensor, + h_indices_tensor.dtype(), + sizeof(int) * h_indices_tensor.numel()); + phi::Copy(dev_ctx_gpu, + d_out.non_zero_indices(), + phi::CPUPlace(), + true, + &h_indices_tensor); + + int cmp_indices2 = memcmp(correct_out_indices.data(), + h_indices_tensor.data(), + correct_out_indices.size() * sizeof(int)); + ASSERT_EQ(cmp_indices2, 0); + + DenseTensor h_features_tensor = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), + {d_out.nnz()}, + d_out.layout())); + + dev_ctx_cpu.Alloc(&h_features_tensor, + h_features_tensor.dtype(), + sizeof(T) * h_features_tensor.numel()); + phi::Copy(dev_ctx_gpu, + d_out.non_zero_elements(), + phi::CPUPlace(), + true, + &h_features_tensor); + for (uint64_t i = 0; i < correct_out_features.size(); i++) { + float tmp = std::fabs(static_cast(correct_out_features[i] - + h_features_tensor.data()[i])); + ASSERT_LT(tmp, diff); + } +#endif } void TestConv3d(const std::vector& indices, diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index bc59b87e2ffa5c653e89c759f951de5f520773ba..236322ccfca6aad442e76af6f57c6c5f83ca59bb 100755 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -1430,6 +1430,22 @@ class Fleet(object): # cache original feed forward program self.origin_main_program = loss.block.program + # add distributed attr + if not hasattr(self.origin_main_program, "distributed_info_"): + setattr(self.origin_main_program, "distributed_info_", dict()) + self.origin_main_program.distributed_info_[ + "dp_degree"] = self._user_defined_strategy.sharding_configs[ + "dp_degree"] + self.origin_main_program.distributed_info_[ + "mp_degree"] = self._user_defined_strategy.sharding_configs[ + "mp_degree"] + self.origin_main_program.distributed_info_[ + "pp_degree"] = self._user_defined_strategy.sharding_configs[ + "pp_degree"] + self.origin_main_program.distributed_info_[ + "sharding_degree"] = self._user_defined_strategy.sharding_configs[ + "sharding_degree"] + context["origin_main_program"] = self.origin_main_program context["loss"] = loss if startup_program == None: diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index f75a0fa50a59c1dd570f3b35ff5b3c9108564e78..807f7c151964e39c8d14136b91b5ecb15b88a4ce 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -351,10 +351,10 @@ endif() set_tests_properties(test_graph PROPERTIES TIMEOUT 120) set_tests_properties(test_quantization_pass PROPERTIES TIMEOUT 120) -set_tests_properties(test_imperative_qat_channelwise PROPERTIES TIMEOUT 120) -set_tests_properties(test_user_defined_quantization PROPERTIES TIMEOUT 120) 
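As a quick sanity check outside the patch, the expected indices and features in the TEST(API, sparse_conv2d) case above can be reproduced with plain NumPy: scatter the three COO values into a dense 5x5 grid and run a valid 3x3 cross-correlation. Reading the four index rows as (batch, depth, height, width) is my interpretation of the {4, non_zero_num} layout; only the numeric constants come from the test.

import numpy as np

# Dense 5x5 single-channel slice reconstructed from the COO test data.
dense = np.zeros((5, 5), dtype=np.float32)
coords = [(0, 3), (4, 2), (0, 4)]                 # (h, w) of the 3 nonzeros
values = [-0.79394531, -0.3125, -0.55029297]
for (h, w), v in zip(coords, values):
    dense[h, w] = v

kernel = np.array([[0.65820312, 0.75048828, 0.21411133],
                   [0.17370605, 0.85546875, 0.53076172],
                   [0.28833008, 0.71044922, 0.00659943]], dtype=np.float32)

# Valid cross-correlation, stride 1, no padding -> 3x3 output.
out = np.zeros((3, 3), dtype=np.float32)
for oh in range(3):
    for ow in range(3):
        out[oh, ow] = np.sum(dense[oh:oh + 3, ow:ow + 3] * kernel)

# Nonzero outputs land at (h, w) = (0,1), (0,2), (2,0), (2,1), (2,2),
# matching out_indices_flatten, and agree with out_features within the
# test tolerance diff = 1e-3.
print(np.round(out, 5))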
-set_tests_properties(test_imperative_qat PROPERTIES TIMEOUT 120) -set_tests_properties(test_imperative_out_scale PROPERTIES TIMEOUT 120) +set_tests_properties(test_imperative_qat_channelwise PROPERTIES TIMEOUT 200) +set_tests_properties(test_user_defined_quantization PROPERTIES TIMEOUT 200) +set_tests_properties(test_imperative_qat PROPERTIES TIMEOUT 200) +set_tests_properties(test_imperative_out_scale PROPERTIES TIMEOUT 200) if(LINUX AND WITH_MKLDNN) set_tests_properties(test_quant2_int8_mobilenetv1_mkldnn PROPERTIES TIMEOUT 120) set_tests_properties(convert_model2dot_ernie PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py index c4318b8bf8ef629d4bcba1f43be350298beff7da..7b9cd7958b2d3d0704a3770156d86f9cae44592a 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py @@ -26,7 +26,7 @@ import paddle.fluid as fluid import paddle.fluid.layers as layers from paddle.fluid import core from paddle.fluid.optimizer import AdamOptimizer -from paddle.fluid.framework import IrGraph +from paddle.fluid.framework import IrGraph, _test_eager_guard from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware from paddle.fluid.dygraph.container import Sequential from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX @@ -122,7 +122,7 @@ class ImperativeLenet(fluid.dygraph.Layer): class TestImperativeOutSclae(unittest.TestCase): - def test_out_scale_acc(self): + def func_out_scale_acc(self): seed = 1000 lr = 0.001 @@ -166,9 +166,14 @@ class TestImperativeOutSclae(unittest.TestCase): loss_list[i] > loss_list[i + 1], msg='Failed to do the imperative qat.') + def test_out_scale_acc(self): + with _test_eager_guard(): + self.func_out_scale_acc() + self.func_out_scale_acc() + class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase): - def test_save_quantized_model(self): + def func_save_quantized_model(self): lr = 0.001 load_param_path = "test_save_quantized_model/lenet.pdparams" @@ -206,6 +211,11 @@ class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase): loss_list[i] > loss_list[i + 1], msg='Failed to do the imperative qat.') + def test_save_quantized_model(self): + with _test_eager_guard(): + self.func_save_quantized_model() + self.func_save_quantized_model() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_ptq.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_ptq.py index fb92b12cb0d870d185b4c31a7bcdb1bebfe5b38d..fad4c8f9d580b389b861c4c9a992af0c48cd892d 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_ptq.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_ptq.py @@ -29,6 +29,7 @@ import paddle.fluid as fluid from paddle.fluid.contrib.slim.quantization import * from paddle.fluid.log_helper import get_logger from paddle.dataset.common import download +from paddle.fluid.framework import _test_eager_guard from imperative_test_utils import fix_model_dict, ImperativeLenet, ImperativeLinearBn from imperative_test_utils import ImperativeLinearBn_hook @@ -194,7 +195,7 @@ class TestImperativePTQ(unittest.TestCase): break return top1_correct_num / total_num - def test_ptq(self): + def func_ptq(self): start_time = time.time() self.set_vars() @@ -244,9 +245,14 @@ class TestImperativePTQ(unittest.TestCase): end_time = time.time() print("total time: %ss \n" 
% (end_time - start_time)) + def test_ptq(self): + with _test_eager_guard(): + self.func_ptq() + self.func_ptq() + class TestImperativePTQfuse(TestImperativePTQ): - def test_ptq(self): + def func_ptq(self): start_time = time.time() self.set_vars() @@ -305,6 +311,11 @@ class TestImperativePTQfuse(TestImperativePTQ): end_time = time.time() print("total time: %ss \n" % (end_time - start_time)) + def test_ptq(self): + with _test_eager_guard(): + self.func_ptq() + self.func_ptq() + class TestImperativePTQHist(TestImperativePTQ): def set_vars(self): diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py index 677ccb52e242cf0c95a7b03acaefbf4d424ac401..5db720b028ffec8c1c02731a44c0a4466e827cc8 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py @@ -32,7 +32,7 @@ from paddle.nn import Linear, Conv2D, Softmax, Conv2DTranspose from paddle.fluid.log_helper import get_logger from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.nn.quant.quant_layers import QuantizedConv2D, QuantizedConv2DTranspose - +from paddle.fluid.framework import _test_eager_guard from imperative_test_utils import fix_model_dict, ImperativeLenet paddle.enable_static() @@ -55,7 +55,7 @@ class TestImperativeQat(unittest.TestCase): self.activation_quantize_type = 'moving_average_abs_max' print('weight_quantize_type', self.weight_quantize_type) - def test_qat(self): + def func_qat(self): self.set_vars() imperative_qat = ImperativeQuantAware( @@ -193,6 +193,11 @@ class TestImperativeQat(unittest.TestCase): np.allclose(after_save, before_save.numpy()), msg='Failed to save the inference quantized model.') + def test_qat(self): + with _test_eager_guard(): + self.func_qat() + self.func_qat() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py index d1bf76f472465f2803cd95ad1dc468bbc5289051..2dcf7a6f168e20393a3e1a3432ac75b652e2a063 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py @@ -27,7 +27,7 @@ import paddle.fluid as fluid from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware from paddle.fluid.log_helper import get_logger from paddle.dataset.common import download - +from paddle.fluid.framework import _test_eager_guard from imperative_test_utils import fix_model_dict, ImperativeLenet os.environ["CPU_NUM"] = "1" diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py index 270e8ee566ab57c4e244e1bbb52a8c3d9a41db52..0bc80694a12cb6a9ff2f857444a06e7764038e36 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py @@ -30,7 +30,7 @@ from paddle.fluid.dygraph import Pool2D from paddle.fluid.dygraph import Linear from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose from paddle.fluid.log_helper import get_logger - +from paddle.fluid.framework import _test_eager_guard os.environ["CPU_NUM"] = "1" _logger = get_logger( @@ -157,7 +157,7 @@ class TestUserDefinedActPreprocess(unittest.TestCase): _logger.info("test act_preprocess") self.imperative_qat = 
ImperativeQuantAware(act_preprocess_layer=PACT) - def test_quant_aware_training(self): + def func_quant_aware_training(self): imperative_qat = self.imperative_qat seed = 1 np.random.seed(seed) @@ -243,6 +243,11 @@ class TestUserDefinedActPreprocess(unittest.TestCase): train(lenet) test(lenet) + def test_quant_aware_training(self): + with _test_eager_guard(): + self.func_quant_aware_training() + self.func_quant_aware_training() + class TestUserDefinedWeightPreprocess(TestUserDefinedActPreprocess): def setUp(self): diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py index 8d2e0f753c0167295dd18c0266b613e963de5d8a..d77134d72a9596956f063612bc16e6a37f81037a 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py @@ -32,6 +32,7 @@ from paddle.fluid.dygraph.nn import Pool2D from paddle.fluid.log_helper import get_logger from imperative_test_utils import fix_model_dict, train_lenet, ImperativeLenetWithSkipQuant +from paddle.fluid.framework import _test_eager_guard os.environ["CPU_NUM"] = "1" if core.is_compiled_with_cuda(): @@ -42,7 +43,8 @@ _logger = get_logger( class TestImperativeOutSclae(unittest.TestCase): - def test_out_scale_acc(self): + def func_out_scale_acc(self): + paddle.disable_static() seed = 1000 lr = 0.1 @@ -125,6 +127,11 @@ class TestImperativeOutSclae(unittest.TestCase): if find_matmul: self.assertTrue(matmul_skip_count == 1) + def test_out_scale_acc(self): + with _test_eager_guard(): + self.func_out_scale_acc() + self.func_out_scale_acc() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/contrib/sparsity/asp.py b/python/paddle/fluid/contrib/sparsity/asp.py index 937fcdf0463beed7d9116be1a4800a0d02238e7d..ffa12ac70460084fd49a14d0193be6e913495b9a 100644 --- a/python/paddle/fluid/contrib/sparsity/asp.py +++ b/python/paddle/fluid/contrib/sparsity/asp.py @@ -155,8 +155,7 @@ def prune_model(main_program=None, n=2, m=4, mask_algo='mask_1d', - with_mask=True, - sharding=False): + with_mask=True): r""" Pruning parameters of supported layers in :attr:`main_program` via specified mask generation function given by :attr:`mask_algo`. This @@ -179,7 +178,6 @@ def prune_model(main_program=None, mask_algo (string, optional): The function name to generate spase mask. Default is `mask_1d`. The vaild inputs should be one of 'mask_1d', 'mask_2d_greedy' and 'mask_2d_best'. with_mask (bool, optional): To prune mask Variables related to parameters or not. Ture is purning also, False is not. Defalut is True. - sharding (bool, optional): Whether to turn on sharding (model parallel) during training. Please consider turning it ON when encountering OOM using sharding. Default is False. Returns: dictionary: A dictionary with key: `parameter name` (string) and value: its corresponding mask Variable. 
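The recurring refactor in the quantization tests above (rename each test_* method to func_* and add a thin test_* wrapper) is what runs every case twice: once under eager mode and once under legacy dygraph. A minimal sketch of that pattern, assuming only the _test_eager_guard helper already imported in these files:

import unittest
from paddle.fluid.framework import _test_eager_guard


class MyCase(unittest.TestCase):
    def func_something(self):
        # the original test body, unchanged
        self.assertTrue(True)

    def test_something(self):
        # first pass: eager (final-state dygraph) mode
        with _test_eager_guard():
            self.func_something()
        # second pass: legacy VarBase-based dygraph
        self.func_something()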
Examples: @@ -221,7 +219,10 @@ def prune_model(main_program=None, # Must call `exe.run(startup_program)` first before calling `sparsity.prune_model` sparsity.prune_model(main_program, mask_algo='mask_2d_best') """ - if sharding: + if main_program is not None and hasattr( + main_program, + "distributed_info_") and main_program.distributed_info_[ + "sharding_degree"] > 1 and paddle.fluid.is_compiled_with_cuda(): gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) place = paddle.CUDAPlace(gpu_id) else: diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 8c2ff140ea4d5531a0ab6e284b1661573d9a2670..8149d69d36a27fadcefa8dc6b6ff1dd89792e29e 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -99,18 +99,19 @@ def param_guard(parameters): yield -def _convert_into_variable(var_base): +def _convert_into_variable(tensor): """ Convert Varbase into Variable. """ - if isinstance(var_base, core.VarBase): + if isinstance(tensor, (core.eager.Tensor, core.VarBase)): # Check whether has been created before. - new_var = var_base.block._find_var_recursive(var_base.name) + new_var = tensor.block._find_var_recursive(tensor.name) if new_var is not None: assert isinstance(new_var, framework.Variable) # Convert ParamBase into Parameter with same attributes in dy2stat. - elif isinstance(var_base, framework.ParamBase): - new_var = var_base._to_static_var(to_parameter=True) + elif isinstance(tensor, + (framework.EagerParamBase, framework.ParamBase)): + new_var = tensor._to_static_var(to_parameter=True) else: # Note(Aurelius84): Convert VarBase in self._buffers into Variable with # same attributes and set persistable=True to allow saving this var. @@ -120,13 +121,13 @@ def _convert_into_variable(var_base): # But if its shape is empty while created from `create_variable()`, we consider this buffer # non-persistable. See case of `drop_state` in lstm api. 
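With the prune_model change above, callers no longer pass sharding=True; the place is inferred from the distributed_info_ dict that Fleet.minimize now attaches to the main program (see the fleet_base.py hunk earlier in this patch). A rough caller-side sketch under those assumptions; the network and optimizer are placeholders, not from the patch:

import os
import paddle
import paddle.distributed.fleet as fleet
from paddle.fluid.contrib import sparsity

paddle.enable_static()
fleet.init(is_collective=True)

# Placeholder static-graph model.
x = paddle.static.data(name="x", shape=[None, 32], dtype="float32")
y = paddle.static.data(name="y", shape=[None, 1], dtype="float32")
out = paddle.static.nn.fc(x, size=1)
loss = paddle.mean(paddle.nn.functional.square_error_cost(out, y))

strategy = fleet.DistributedStrategy()
strategy.sharding = True
strategy.sharding_configs = {"dp_degree": 1, "mp_degree": 1,
                             "pp_degree": 1, "sharding_degree": 2}

# (A real ASP run would also wrap the optimizer with sparsity.decorate and be
# launched with fleetrun on two ranks; this only shows the changed call site.)
opt = fleet.distributed_optimizer(paddle.optimizer.Momentum(0.01),
                                  strategy=strategy)
opt.minimize(loss)  # Fleet.minimize now records distributed_info_ on the program

gpu_id = int(os.environ.get("FLAGS_selected_gpus", 0))
exe = paddle.static.Executor(paddle.CUDAPlace(gpu_id))
exe.run(paddle.static.default_startup_program())

# No sharding flag anymore: prune_model checks
# main_program.distributed_info_["sharding_degree"] > 1 to pick the CUDA place.
sparsity.prune_model(paddle.static.default_main_program())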
- is_persistable = len(var_base.shape) > 0 + is_persistable = len(tensor.shape) > 0 - new_var = var_base._to_static_var( + new_var = tensor._to_static_var( to_parameter=False, persistable=is_persistable) return new_var else: - return var_base + return tensor def enabled(): diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py index 94fc5558ab162636e59a5569904d770970f812d1..a442a8b92b6f7cf6c5c366e63ace110e9fb94e01 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py @@ -61,7 +61,8 @@ class NestSequence(object): def _get_var_ids(self): var_ids = [] for idx, var in enumerate(self.__input_list): - if isinstance(var, (framework.Variable, core.VarBase)): + if isinstance(var, (framework.Variable, core.VarBase, + core.eager.Tensor)): var_ids.append(idx) return var_ids @@ -73,7 +74,8 @@ class NestSequence(object): if need_check: warning_types = set() for var in self.__input_list: - if not isinstance(var, (framework.Variable, core.VarBase)): + if not isinstance(var, (framework.Variable, core.VarBase, + core.eager.Tensor)): warning_types.add(type(var)) if warning_types: logging_utils.warn( @@ -301,10 +303,17 @@ class PartialProgramLayer: for name in block.vars: if "@GRAD" in name: var_desc = block.vars[name].desc - var_base = core.VarBase(var_desc.dtype(), - var_desc.shape(), - var_desc.name(), - var_desc.type(), False) + var_base = None + if not core._in_eager_mode(): + var_base = core.VarBase(var_desc.dtype(), + var_desc.shape(), + var_desc.name(), + var_desc.type(), False) + else: + var_base = core.eager.Tensor(var_desc.dtype(), + var_desc.shape(), + var_desc.name(), + var_desc.type(), False) double_grads.append(var_base) return self._valid_vars(double_grads) @@ -386,13 +395,22 @@ class PartialProgramLayer: expected_place = framework._current_expected_place() for i, value in enumerate(flatten_inputs): if isinstance(value, np.ndarray): - var = core.VarBase( - value=value, - name=self._inputs[i].desc.name(), - persistable=False, - place=expected_place, - zero_copy=True) - elif isinstance(value, core.VarBase): + var = None + if not core._in_eager_mode(): + var = core.VarBase( + value=value, + name=self._inputs[i].desc.name(), + persistable=False, + place=expected_place, + zero_copy=True) + else: + var = core.eager.Tensor( + value=value, + name=self._inputs[i].desc.name(), + persistable=False, + place=expected_place, + zero_copy=True) + elif isinstance(value, (core.VarBase, core.eager.Tensor)): # NOTE(Aurelius84): If var is on CPUPlace, it will be transformed multi times # into CUDAPlace when it's as input of multi Ops. so we move it in advance # to avoid this problem. @@ -411,9 +429,16 @@ class PartialProgramLayer: var = self._outputs[var_id] assert isinstance(var, framework.Variable) var_desc = var.desc - var_base = core.VarBase(var_desc.dtype(), - var_desc.shape(), - var_desc.name(), var_desc.type(), False) + varbase = None + if not core._in_eager_mode(): + var_base = core.VarBase(var_desc.dtype(), + var_desc.shape(), + var_desc.name(), var_desc.type(), False) + else: + var_base = core.eager.Tensor(var_desc.dtype(), + var_desc.shape(), + var_desc.name(), + var_desc.type(), False) return var_base # Create VarBase to receive output data. 
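Most of the base.py and partial_program.py edits above follow one mechanical pattern: wherever a core.VarBase used to be constructed or type-checked, the eager path now builds a core.eager.Tensor with the same positional arguments, selected by core._in_eager_mode(). Sketched in isolation below; the helper names are mine, not from the patch, and the constructor arguments mirror the calls shown in the diff:

from paddle.fluid import core, framework


def _make_empty_var(var_desc, persistable=False):
    """Build the dygraph tensor described by `var_desc`, eager or legacy."""
    cls = core.eager.Tensor if core._in_eager_mode() else core.VarBase
    return cls(var_desc.dtype(), var_desc.shape(), var_desc.name(),
               var_desc.type(), persistable)


def _is_dygraph_tensor(obj):
    """Type checks now accept both the legacy VarBase and the eager Tensor."""
    return isinstance(obj, (framework.Variable, core.VarBase, core.eager.Tensor))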
@@ -423,12 +448,19 @@ class PartialProgramLayer: def _create_scope_vec(self): # Hold forward variables - tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [], - "program_out_scope", - core.VarDesc.VarType.STEP_SCOPES, True) - - inner_scope = core.Scope() - tmp_scope_vec.value().set_scope(inner_scope) + tmp_scope_vec = None + if not core._in_eager_mode(): + tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [], + "program_out_scope", + core.VarDesc.VarType.STEP_SCOPES, True) + # TODO(jiabin): Support this later. + # else: + # tmp_scope_vec = core.eager.Tensor(core.VarDesc.VarType.FP32, [], + # "program_out_scope", + # core.VarDesc.VarType.STEP_SCOPES, True) + + inner_scope = core.Scope() + tmp_scope_vec.value().set_scope(inner_scope) return tmp_scope_vec def _restore_out(self, out_vars): @@ -450,7 +482,8 @@ class PartialProgramLayer: return main_program.clone(for_test=True) def _is_no_value(self, var): - if isinstance(var, core.VarBase) and var.shape == [1]: + if isinstance(var, + (core.VarBase, core.eager.Tensor)) and var.shape == [1]: # NOTE: .numpy() will insert MemcpySync operation, it hits performance. if var.numpy()[0] == RETURN_NO_VALUE_MAGIC_NUM: return True @@ -460,7 +493,7 @@ class PartialProgramLayer: """ Removes invalid value for various-length return statement """ - if isinstance(out_vars, core.VarBase): + if isinstance(out_vars, (core.VarBase, core.eager.Tensor)): if self._is_no_value(out_vars): return None return out_vars @@ -527,7 +560,7 @@ class PartialProgramLayer: param_and_buffer_names_set = set() for i, var in enumerate(self._params): # self._params constains parameters and buffers with persistable=True. - if not isinstance(var, core.VarBase): + if not isinstance(var, (core.VarBase, core.eager.Tensor)): raise TypeError( 'Type of self._params[{}] in PartialProgramLayer should be Parameter or Variable, but received {}.'. format(i, type(var))) @@ -559,10 +592,18 @@ def _create_fake_var(): """ Create a fake_var (force on CPU) to handle empty input or output """ - return [ - core.VarBase(core.VarDesc.VarType.FP32, [], "Fake_var", - core.VarDesc.VarType.RAW, False) - ] + if not core._in_eager_mode(): + return [ + core.VarBase(core.VarDesc.VarType.FP32, [], "Fake_var", + core.VarDesc.VarType.RAW, False) + ] + else: + return [] + # TODO(jiabin): Support this later + # return [ + # core.eager.Tensor(core.VarDesc.VarType.FP32, [], "Fake_var", + # core.VarDesc.VarType.RAW, False) + # ] def partial_program_from(concrete_program): diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index 4bfdc3c27fad628bba3fd16237c12d3ca43244d7..b1865691b2475c4f855f51244e627965047d7720 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -25,7 +25,7 @@ import threading import six import paddle -from paddle.fluid import core +from paddle.fluid import core, dygraph from paddle.fluid.compiler import BuildStrategy, CompiledProgram, ExecutionStrategy from paddle.fluid.data_feeder import check_type from paddle.fluid.layers.utils import flatten, pack_sequence_as @@ -898,30 +898,33 @@ def save(layer, path, input_spec=None, **configs): state_var_dict[var.name] = var # 3. 
share parameters from Layer to scope & record var info - for param_or_buffer in concrete_program.parameters: - # share to scope - if param_or_buffer.type == core.VarDesc.VarType.VOCAB: - scr_tensor = param_or_buffer.value().get_map_tensor() - tgt_var = scope.var(param_or_buffer.name) - tgt_var.set_vocab(scr_tensor) - else: - param_or_buffer_tensor = scope.var( - param_or_buffer.name).get_tensor() - #src_tensor = param_or_buffer.value().get_tensor() - src_tensor = state_var_dict[param_or_buffer.name].value( - ).get_tensor() - param_or_buffer_tensor._share_data_with(src_tensor) - # record var info - if param_or_buffer.name not in extra_var_info: - extra_info_dict = dict() - if param_or_buffer.name in state_names_dict: - extra_info_dict['structured_name'] = state_names_dict[ - param_or_buffer.name] - extra_info_dict[ - 'stop_gradient'] = param_or_buffer.stop_gradient - if isinstance(param_or_buffer, ParamBase): - extra_info_dict['trainable'] = param_or_buffer.trainable - extra_var_info[param_or_buffer.name] = extra_info_dict + with dygraph.guard(): + for param_or_buffer in concrete_program.parameters: + # share to scope + if param_or_buffer.type == core.VarDesc.VarType.VOCAB: + scr_tensor = param_or_buffer.value().get_map_tensor() + tgt_var = scope.var(param_or_buffer.name) + tgt_var.set_vocab(scr_tensor) + else: + param_or_buffer_tensor = scope.var( + param_or_buffer.name).get_tensor() + #src_tensor = param_or_buffer.value().get_tensor() + src_tensor = state_var_dict[param_or_buffer.name].value( + ).get_tensor() + param_or_buffer_tensor._share_data_with(src_tensor) + # record var info + if param_or_buffer.name not in extra_var_info: + extra_info_dict = dict() + if param_or_buffer.name in state_names_dict: + extra_info_dict[ + 'structured_name'] = state_names_dict[ + param_or_buffer.name] + extra_info_dict[ + 'stop_gradient'] = param_or_buffer.stop_gradient + if isinstance(param_or_buffer, ParamBase): + extra_info_dict[ + 'trainable'] = param_or_buffer.trainable + extra_var_info[param_or_buffer.name] = extra_info_dict # 4. build input & output of save_infernece_model # NOTE(chenweihang): [ Get input variables name ] diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 65bfba3f6c32e072a6db0e1d294a8c5fc07d9d74..6843c0e4c3fa85f20b408e7536cf1902dafe3f45 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -94,7 +94,7 @@ def monkey_patch_varbase(): # Note: getattr(self, attr, None) will call x.grad=x.gradient(), but gradient() only available in dygraph. # It will fail. So, for propery that different between dynamic and static graph, should not getattr(self, attr, None). attr_not_need_keys = ['grad', 'T'] - if isinstance(self, ParamBase): + if isinstance(self, (ParamBase, EagerParamBase)): attr_kwargs = self.__dict__.copy() else: attr_names = [] @@ -111,7 +111,7 @@ def monkey_patch_varbase(): attr_kwargs.update(kwargs) - if to_parameter or isinstance(self, ParamBase): + if to_parameter or isinstance(self, (ParamBase, EagerParamBase)): del attr_kwargs['persistable'] # NOTE(Aurelius84): All parameters should be placed into global block. 
attr_kwargs['block'] = attr_kwargs['block'].program.global_block() diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 4bbc0ba03c9342afc4a0d2edee6c2b963ad6e0f8..a48cfd9150c657784f46fc6316a797c767fd5dd4 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -1821,7 +1821,7 @@ def _pack_loaded_dict(load_obj): @static_only def _legacy_save(param_dict, model_path, protocol=2): def get_tensor(var): - if isinstance(var, core.VarBase): + if isinstance(var, (core.VarBase, core.eager.Tensor)): return var.numpy() elif isinstance(var, core.LoDTensor): return np.array(var) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index f022e1791daefb7cbe18434aae8ac1dbc63d39c5..fd7226c48661fdb2cd4dcf7227d0f8383c6c9439 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -10148,6 +10148,9 @@ def flatten(x, axis=1, name=None): check_variable_and_dtype( x, 'x', ['float32', 'float64', 'int8', 'int32', 'int64', 'uint8'], 'flatten') + if in_dygraph_mode(): + return _C_ops.flatten2(x, 'axis', axis)[0] + helper = LayerHelper('flatten', **locals()) if not (isinstance(x, Variable)): diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 76414ea942465d1e7a54084a2e4ee31b9ee41a2d..c63ad42288fd057d8456f31a675c9f1912bdc12f 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -663,7 +663,9 @@ def assign(input, output=None): }) if is_inplace and in_dygraph_mode(): - output._bump_inplace_version() + # TODO(jiabin): Remove this when we support inplace + if not core._in_eager_mode(): + output._bump_inplace_version() return output diff --git a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_sharding.py b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_sharding.py index 26170015ae8c249fb3a36d13285f5b34491acb3a..d9ddd6c88d727a4cca5e94cf19b122355f3ea6c5 100644 --- a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_sharding.py +++ b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_sharding.py @@ -98,7 +98,7 @@ class TestFleetWithASPSharding(unittest.TestCase): feeder = fluid.DataFeeder(feed_list=[input_x, input_y], place=place) exe.run(startup_prog) - sparsity.prune_model(train_prog, sharding=True) + sparsity.prune_model(train_prog) data = (np.random.randn(64, 32), np.random.randint(2, size=(64, 1))) exe.run(train_prog, feed=feeder.feed([data])) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py index 30c1955adcf9f652e77b53eca8375955fb26eb35..c6f491a5484d9f1601993e1dd581e46008f0e27a 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py @@ -520,6 +520,7 @@ def predict_static(args, data): paddle.enable_static() exe = fluid.Executor(args.place) # load inference model + [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model( args.model_save_dir, diff --git a/python/paddle/fluid/tests/unittests/process_group_gloo.py b/python/paddle/fluid/tests/unittests/process_group_gloo.py new file mode 100644 index 0000000000000000000000000000000000000000..5420e1d36b369b1dcd5763ed1ba4d5bdbef8005b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/process_group_gloo.py @@ -0,0 +1,119 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import random +import numpy as np +import os +import shutil + +import paddle +from paddle.fluid import core +import datetime +from datetime import timedelta +import paddle.fluid.core as core +from paddle.fluid.framework import _test_eager_guard +from paddle.fluid.dygraph.parallel import ParallelEnv + + +class TestProcessGroupFp32(unittest.TestCase): + def setUp(self): + paddle.seed(2022) + random.seed(2022) + np.random.seed(2022) + self.config() + + def config(self): + self.dtype = "float32" + self.shape = (2, 10, 5) + + def test_create_process_group_gloo(self): + with _test_eager_guard(): + nranks = ParallelEnv().nranks + rank = ParallelEnv().local_rank + is_master = True if rank == 0 else False + store = paddle.fluid.core.TCPStore("127.0.0.1", 6172, is_master, + nranks, datetime.timedelta(0)) + gloo_store = paddle.fluid.core.GlooStore(store) + opt = paddle.fluid.core.GlooOptions() + pg = paddle.fluid.core.ProcessGroupGloo(gloo_store, rank, nranks) + + # test allreduce sum + # rank 0 + paddle.device.set_device('cpu') + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + sum_result = x + y + if rank == 0: + task = pg.allreduce(tensor_x) + task.wait() + assert np.array_equal(tensor_x, sum_result) + else: + task = pg.allreduce(tensor_y) + task.wait() + assert np.array_equal(tensor_y, sum_result) + + print("test allreduce sum api ok") + + # test allreduce max + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + max_result = paddle.maximum(tensor_x, tensor_y) + + if rank == 0: + task = pg.allreduce(tensor_x, core.ReduceOp.MAX) + task.wait() + assert np.array_equal(tensor_x, max_result) + else: + task = pg.allreduce(tensor_y, core.ReduceOp.MAX) + task.wait() + assert np.array_equal(tensor_y, max_result) + + print("test allreduce max api ok") + + # test broadcast + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + broadcast_result = paddle.assign(tensor_x) + if rank == 0: + task = pg.broadcast(tensor_x, 0) + task.synchronize() + assert task.is_completed() + assert np.array_equal(broadcast_result, tensor_x) + else: + task = pg.broadcast(tensor_y, 0) + task.synchronize() + assert task.is_completed() + assert np.array_equal(broadcast_result, tensor_y) + print("test broadcast api ok") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/process_group_nccl.py b/python/paddle/fluid/tests/unittests/process_group_nccl.py index 8ec5d13c569fe85fab0f67a7961e183a2084dcf7..4833cea9a8d1ab7aafa82e4bb12f0c52902fa634 100644 --- 
a/python/paddle/fluid/tests/unittests/process_group_nccl.py +++ b/python/paddle/fluid/tests/unittests/process_group_nccl.py @@ -144,23 +144,109 @@ class TestProcessGroupFp32(unittest.TestCase): print("test barrier api ok\n") - # test send/recv + # test allgather # rank 0 x = np.random.random(self.shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + out_shape = list(self.shape) + out_shape[0] *= 2 + out = np.random.random(out_shape).astype(self.dtype) + tensor_out = paddle.to_tensor(out) + if pg.rank() == 0: + task = pg.all_gather(tensor_x, tensor_out) + task.wait() + paddle.device.cuda.synchronize() + # rank 1 + else: + task = pg.all_gather(tensor_y, tensor_out) + task.wait() + paddle.device.cuda.synchronize() + out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2]) + out_2 = paddle.slice(tensor_out, [0], [out_shape[0] // 2], + [out_shape[0]]) + assert np.array_equal(tensor_x, out_1) + assert np.array_equal(tensor_y, out_2) + print("test allgather api ok\n") + + # test alltoall + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + out1 = np.random.random(self.shape).astype(self.dtype) + out2 = np.random.random(self.shape).astype(self.dtype) tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + tensor_out1 = paddle.to_tensor(out1) + tensor_out2 = paddle.to_tensor(out2) + raw_tensor_x_2 = paddle.slice(tensor_x, [0], [self.shape[0] // 2], + [self.shape[0]]) + raw_tensor_y_1 = paddle.slice(tensor_y, [0], [0], + [self.shape[0] // 2]) if pg.rank() == 0: - task = pg.send(tensor_x, dst=1) + task = pg.alltoall(tensor_x, tensor_out1) task.wait() paddle.device.cuda.synchronize() # rank 1 else: - y = np.random.random(self.shape).astype(self.dtype) - tensor_y = paddle.to_tensor(y) - task = pg.recv(tensor_y, src=0) + task = pg.alltoall(tensor_y, tensor_out2) task.wait() paddle.device.cuda.synchronize() - assert np.array_equal(tensor_x, tensor_y) - print("test send/recv api ok\n") + out1_2 = paddle.slice(tensor_out1, [0], [self.shape[0] // 2], + [self.shape[0]]) + out2_1 = paddle.slice(tensor_out2, [0], [0], [self.shape[0] // 2]) + if pg.rank() == 0: + assert np.array_equal(out1_2.numpy(), raw_tensor_y_1.numpy()) + else: + assert np.array_equal(out2_1, raw_tensor_x_2) + print("test alltoall api ok\n") + + # test Reduce + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + sum_result = tensor_x + tensor_y + if pg.rank() == 0: + task = pg.reduce(tensor_x, 0) + task.wait() + paddle.device.cuda.synchronize() + # rank 1 + else: + task = pg.reduce(tensor_y, 0) + task.wait() + paddle.device.cuda.synchronize() + if pg.rank() == 0: + assert np.array_equal(tensor_x, sum_result) + print("test reduce sum api ok\n") + + # test Scatter + # rank 0 + in_shape = list(self.shape) + in_shape[0] *= 2 + x = np.random.random(in_shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + if pg.rank() == 0: + task = pg.scatter(tensor_x, tensor_y, 0) + task.wait() + paddle.device.cuda.synchronize() + # rank 1 + else: + task = pg.scatter(tensor_x, tensor_y, 0) + task.wait() + paddle.device.cuda.synchronize() + out1 = paddle.slice(tensor_x, [0], [0], [self.shape[0]]) + out2 = paddle.slice(tensor_x, [0], [self.shape[0]], + 
[self.shape[0] * 2]) + if pg.rank() == 0: + assert np.array_equal(tensor_y, out1) + else: + assert np.array_equal(tensor_y, out2) + print("test scatter api ok\n") class TestProcessGroupFp16(TestProcessGroupFp32): diff --git a/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py b/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py index 4552d600bafd748633b45b9d25293026c5b0cf2e..2b281d7d6f7c5f186fd73ab3152a2e7652ae6ce1 100644 --- a/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py +++ b/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py @@ -162,6 +162,7 @@ class TestIRPassBase(unittest.TestCase): for k, v in self.get_strategy().items(): setattr(build_strategy, k, v) self.check_before_applied(main2, startup2) + apply_build_strategy(main2, startup2, build_strategy, {"use_cuda": self.use_cuda}) self.check_after_applied(main2, startup2) diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py index cce13a8bf3b74a7641710e853ff6c48e86ccba63..b02df024518a86fd152b89ae890b92f0f6df3b32 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py @@ -320,7 +320,7 @@ class TestBatchNormOpInference(unittest.TestCase): def test_check_output(self): places = [core.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): places.append(core.CUDAPlace(0)) for place in places: @@ -342,13 +342,13 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): def test_check_output(self): places = [] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) if core.is_float16_supported(place): places.append(place) - for place in places: - for data_format in ["NCHW", "NHWC"]: + #for data_format in ["NCHW", "NHWC"]: + for data_format in ["NCHW"]: self.check_with_place(place, data_format, self.dtype, [2, 3, 4, 5]) self.check_with_place(place, data_format, self.dtype, [2, 3]) @@ -517,7 +517,7 @@ class TestBatchNormOpTraining(unittest.TestCase): places = [core.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): places.append(core.CUDAPlace(0)) for place in places: @@ -657,7 +657,7 @@ class TestDygraphBatchNormAPIError(unittest.TestCase): class TestDygraphBatchNormTrainableStats(unittest.TestCase): def test_dygraph(self): places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for p in places: shape = [4, 10, 4, 4] @@ -678,7 +678,7 @@ class TestDygraphBatchNormTrainableStats(unittest.TestCase): def test_static(self): places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for p in places: exe = fluid.Executor(p) @@ -716,4 +716,6 @@ class TestDygraphBatchNormOpenReserveSpace(unittest.TestCase): if __name__ == '__main__': + import paddle + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py index 6a6f85a48320681b430ab9d6a9363c28cf5c912e..c9abac8fb7946d51987645e11673f5d64a06c4ce 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py +++ 
b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py @@ -28,7 +28,7 @@ import paddle class TestBatchNorm(unittest.TestCase): def test_name(self): places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for p in places: with fluid.dygraph.guard(p): @@ -36,7 +36,7 @@ class TestBatchNorm(unittest.TestCase): def test_error(self): places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for p in places: #paddle.disable_static() @@ -83,7 +83,7 @@ class TestBatchNorm(unittest.TestCase): def test_dygraph(self): places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for p in places: shape = [4, 10, 4, 4] @@ -135,7 +135,7 @@ class TestBatchNorm(unittest.TestCase): def test_static(self): places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for p in places: exe = fluid.Executor(p) @@ -177,7 +177,7 @@ class TestBatchNormChannelLast(unittest.TestCase): else: paddle.set_default_dtype("float64") self.places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): self.places.append(fluid.CUDAPlace(0)) def tearDown(self): @@ -247,7 +247,7 @@ class TestBatchNormChannelLast(unittest.TestCase): class TestBatchNormUseGlobalStats(unittest.TestCase): def setUp(self): self.places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): self.places.append(fluid.CUDAPlace(0)) self.init_test() @@ -300,4 +300,6 @@ class TestBatchNormUseGlobalStatsCase3(TestBatchNormUseGlobalStats): if __name__ == '__main__': + import paddle + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_collective_process_group.py b/python/paddle/fluid/tests/unittests/test_collective_process_group.py index 6ae5424a882daea54145a31612f61909871fe05c..58baa0a2fa9443289f24a7e2f23e18fae4877f95 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_process_group.py +++ b/python/paddle/fluid/tests/unittests/test_collective_process_group.py @@ -22,6 +22,9 @@ class TestProcessGroup(TestMultipleGpus): def test_process_group_nccl(self): self.run_mnist_2gpu('process_group_nccl.py') + def test_process_group_gloo(self): + self.run_mnist_2gpu('process_group_gloo.py') + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index 8ea4e369d323619db35ebf41d4dd053fc5ffe4d9..826f886dab1725e9e26a8826a2277a2d99f93fb6 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -16,6 +16,7 @@ from __future__ import print_function import unittest import numpy as np +import paddle import paddle import paddle.fluid.core as core @@ -1001,4 +1002,5 @@ create_test_cudnn_channel_last_fp16_class( TestWithDilation_AsyPadding, grad_check=False) if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index 
252482fa6d270edbc1bec3a0d6023933521d7f7e..156fdcb9b0abe1ea2dcca0e15bbcfec87b8ebf7a 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -771,13 +771,13 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue(np.array_equal(egr_tensor.numpy(), ori_arr)) ori_place = egr_tensor.place - new_arr = np.random.rand(4, 4, 16, 32).astype('float32') + new_arr = np.random.rand(4, 16, 16, 32).astype('float32') self.assertFalse(np.array_equal(egr_tensor.numpy(), new_arr)) - egr_tensor._set_value(new_arr) + egr_tensor.set_value(new_arr) self.assertEqual(egr_tensor.stop_gradient, True) self.assertTrue(egr_tensor.place._equals(ori_place)) - self.assertEqual(egr_tensor.shape, [4, 4, 16, 32]) + self.assertEqual(egr_tensor.shape, [4, 16, 16, 32]) self.assertTrue(np.array_equal(egr_tensor.numpy(), new_arr)) @@ -880,7 +880,7 @@ class EagerParamBaseUsageTestCase(unittest.TestCase): new_weight = np.ones([1, 3]).astype('float32') self.assertFalse(np.array_equal(linear.weight.numpy(), new_weight)) - linear.weight._set_value(new_weight) + linear.weight.set_value(new_weight) self.assertTrue(np.array_equal(linear.weight.numpy(), new_weight)) self.assertTrue(linear.weight.place._equals(ori_place)) diff --git a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py index aee6ca249f535b9c06c00a6806ac491be16cd4b3..a204c26c1b823fa228dba63dc8351db0adf31708 100644 --- a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py @@ -231,4 +231,5 @@ class TestExpandV2API(unittest.TestCase): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py b/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py index 077496200d988fafc67bb6f85892adc99c170daf..67f6b91021472ec3f678010f22bafb615cd8eb2c 100644 --- a/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py +++ b/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py @@ -23,6 +23,7 @@ import paddle.fluid as fluid from paddle.fluid.layer_helper import LayerHelper from paddle.fluid import compiler import paddle.fluid.unique_name as unique_name +import paddle class TestInplaceANBOpTraining(unittest.TestCase): @@ -138,14 +139,14 @@ class TestInplaceANBOpTraining(unittest.TestCase): outs[0].name if not only_forward else None, build_strategy=build_strategy, exec_strategy=exec_strategy) - bn_fetches = exe.run(program=comp_prog1, + bn_fetches = exe.run(program=main, feed={'input': data}, fetch_list=fetch_name) fetch_outs.append(bn_fetches) fetch_names.append(fetch_name) - for bn_val, inplace_abn_val, name1, name2 in zip(*(fetch_outs + - fetch_names)): + for bn_val, inplace_abn_val, name1, name2 in zip(*( + fetch_outs + fetch_names)): self.assertTrue( np.allclose( bn_val, inplace_abn_val, atol=1e-2), @@ -156,6 +157,7 @@ class TestInplaceANBOpTraining(unittest.TestCase): def test_op(self): use_cudas = [False, True] if core.is_compiled_with_cuda() else [False] + #use_cudas = [False] for use_cuda in use_cudas: place = core.CUDAPlace(0) if use_cuda else core.CPUPlace() layouts = ["NCHW", "NHWC"] @@ -186,4 +188,5 @@ class TestInplaceANBOpTraining(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py 
index fe8c181b7904989d75d85520aaf6f658ac673d29..49fe397644dc6e79d1b0d436b31c706d77906b6a 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py @@ -21,6 +21,7 @@ import paddle.fluid as fluid import paddle.fluid.layers as layers import paddle.fluid.core as core import gradient_checker +import paddle from decorator_helper import prog_scope @@ -167,4 +168,5 @@ class TestBatchNormDoubleGradCheckCase6(TestBatchNormDoubleGradCheckCase5): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_program_prune_backward.py b/python/paddle/fluid/tests/unittests/test_program_prune_backward.py index b01c7cf179955d89746555e3d085361784193b8c..a1a3b31a9766e093973ed927f90eebb989e1263b 100755 --- a/python/paddle/fluid/tests/unittests/test_program_prune_backward.py +++ b/python/paddle/fluid/tests/unittests/test_program_prune_backward.py @@ -24,6 +24,7 @@ from simple_nets import init_data, simple_fc_net, fc_with_batchnorm import seresnext_net from test_parallel_executor_transformer import transformer, get_feed_data_reader, DeviceType from fake_reader import fake_imdb_reader +import paddle def lstm_net(use_feed): @@ -309,4 +310,5 @@ class TestProgramPruneBackward(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py index c860d6972fb762f5c9cde1fb17c36bfb415a3f8a..40481b097827cbc7b66d72403d24fd41853af0a7 100755 --- a/python/paddle/fluid/tests/unittests/test_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py @@ -507,4 +507,5 @@ class TestReshapeZeroTensor(unittest.TestCase): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py index 3238876b89414b89d09a8b4161ef9e5ba2450261..aac8b6a99b649176d29224e22c3d3258d96c194e 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py @@ -533,12 +533,8 @@ class TestTensorRegisterHook(unittest.TestCase): size=[self.batch_size, self.in_size]).astype('float32') data_t = paddle.to_tensor(data) - if _in_eager_mode(): - with self.assertRaises(TypeError): - out = jit_net(data_t) - else: - with self.assertRaises(AssertionError): - out = jit_net(data_t) + with self.assertRaises(AssertionError): + out = jit_net(data_t) def test_register_hook_in_dy2static_mode(self): with _test_eager_guard(): diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index fbd6197c1b92ee8481a1ce6f4a2cec8482eaefb0..32ccecbc6d9f0282b86f100e1b910667fab41cb2 100755 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -1402,7 +1402,8 @@ def gather(x, index, axis=None, name=None): return _C_ops.gather(x, index, None, "axis", axis, "overwrite", False) check_variable_and_dtype( - x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'], + x, 'x', + ['float16', 'float32', 'float64', 'int16', 'int32', 'int64', 'uint8'], 'gather') check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather') diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index 
cfd817c24c7367f69673353a8aaceeedec506e15..6c07cdec2ee19c9e689f354d4a5314049235402c 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -43,7 +43,9 @@ class BaseAPI(object): self.is_base_api = False self.invoke = api_item_yaml['invoke'] else: - self.infer_meta = self.parse_infer_meta(api_item_yaml['infer_meta']) + if 'infer_meta' in api_item_yaml: + self.infer_meta = self.parse_infer_meta(api_item_yaml[ + 'infer_meta']) self.kernel = self.parse_kernel(api_item_yaml['kernel']) self.support_selected_rows_kernel = False if len(self.kernel[ 'func']) == 1 else True @@ -182,9 +184,9 @@ class BaseAPI(object): 'Tensor': 'Tensor', 'Tensor[]': 'std::vector' } - if re.search(r'\(\w*\)', output_item): + if re.search(r'\([a-zA-Z0-9_@]*\)', output_item): result = re.search( - r"(?P[a-zA-Z0-9_[\]]+)\s*\((?P\w+)\)", + r"(?P[a-zA-Z0-9_[\]]+)\s*\((?P[a-zA-Z0-9_@]+)\)", output_item) out_type = result.group('out_type') assert out_type in output_type_map, \ @@ -499,11 +501,8 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. def get_kernel_args(self, code_indent): input_trans_map = { 'const Tensor&': 'const phi::DenseTensor&', - 'const Tensor &': 'const phi::DenseTensor&', 'const std::vector&': 'const std::vector&', - 'const std::vector &': - 'const std::vector&', 'const paddle::optional&': 'paddle::optional', 'const paddle::optional>&': @@ -592,7 +591,6 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. def get_selected_rows_kernel_args(self, code_indent): input_trans_map = { 'const Tensor&': 'const phi::SelectedRows&', - 'const Tensor &': 'const phi::SelectedRows&', 'const paddle::optional&': 'paddle::optional' } diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index a26630ad04100fbebdb7c270b83912bb722040d4..1bdfa8b66972eb0d4ff45509ada066ce92ae5f78 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -105,7 +105,7 @@ def source_include(header_file_path): #include "paddle/phi/api/lib/api_custom_impl.h" #include "paddle/phi/api/lib/api_registry.h" -#include "paddle/phi/api/lib/api_utils.h" +#include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/utils/storage.h" diff --git a/python/paddle/utils/code_gen/backward_api_gen.py b/python/paddle/utils/code_gen/backward_api_gen.py index 2d33cd5b1812ada8fca118c0e0f616cfbe511dd1..b9f991f9b0f88daa3ae07cba33b439c073d8fbe0 100644 --- a/python/paddle/utils/code_gen/backward_api_gen.py +++ b/python/paddle/utils/code_gen/backward_api_gen.py @@ -56,8 +56,9 @@ class BackwardAPI(BaseAPI): # check the attributes of backward for attr in self.attrs['names']: - assert attr in fw_attrs['names'] and self.attrs['attr_info'][attr][0] == fw_attrs['attr_info'][attr][0], \ - f"{self.api} : Attribute error: The attribute({attr}) of backward isn't consistent with forward api. \ + assert (attr in fw_attrs['names'] and self.attrs['attr_info'][attr][0] == fw_attrs['attr_info'][attr][0]) or \ + self.attrs['attr_info'][attr][1] is not None, \ + f"{self.api} : Attribute error: The attribute({attr}) of backward isn't consistent with forward api or doesn't have default value. \ Please check the args of {self.api} in yaml." 
# check the output of backward @@ -145,7 +146,7 @@ def source_include(header_file_path): #include "paddle/phi/api/lib/api_custom_impl.h" #include "paddle/phi/api/lib/api_registry.h" -#include "paddle/phi/api/lib/api_utils.h" +#include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/utils/storage.h" diff --git a/python/paddle/utils/code_gen/sparse_api.yaml b/python/paddle/utils/code_gen/sparse_api.yaml new file mode 100644 index 0000000000000000000000000000000000000000..135989121cca695b0e629192774af0eb3e41c812 --- /dev/null +++ b/python/paddle/utils/code_gen/sparse_api.yaml @@ -0,0 +1,21 @@ +- sparse_api : conv3d + args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups) + output : Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) + kernel : + func : sparse_conv3d + layout : x + +- sparse_api : to_dense + args : (Tensor x, Backend backend) + output : Tensor(out@DenseTensor) + invoke : to_dense_impl(x, backend) + +- sparse_api : to_sparse_coo + args : (Tensor x, Backend backend, int64_t sparse_dim) + output : Tensor(out@SparseCooTensor) + invoke : to_sparse_coo_impl(x, backend, sparse_dim) + +- sparse_api : to_sparse_csr + args : (Tensor x, Backend backend) + output : Tensor(out@SparseCsrTensor) + invoke : to_sparse_csr_impl(x, backend) diff --git a/python/paddle/utils/code_gen/sparse_api_gen.py b/python/paddle/utils/code_gen/sparse_api_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..99c5a4f49f8c41920135953ca02a17148164eb45 --- /dev/null +++ b/python/paddle/utils/code_gen/sparse_api_gen.py @@ -0,0 +1,282 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import yaml +import argparse +import re + +from api_base import BaseAPI + + +class SparseAPI(BaseAPI): + def __init__(self, api_item_yaml): + super(SparseAPI, self).__init__(api_item_yaml) + + def get_api_name(self, api_item_yaml): + return api_item_yaml['sparse_api'] + + def get_api_func_name(self): + return self.api + + def get_return_type(self, out_type_list): + return out_type_list[0] if len( + out_type_list) == 1 else "std::tuple<" + ",".join( + out_type_list) + ">" + + def gene_api_declaration(self): + return f""" +// {", ".join(self.outputs['names'])} +PADDLE_API {self.outputs['return_type']} {self.get_api_func_name()}({self.args_str['args_declare']}); +""" + + def get_kernel_tensor_out_type(self, output_name): + sparse_type = 'TensorType::DENSE_TENSOR' + if output_name.endswith('@SparseCooTensor'): + sparse_type = 'TensorType::SPARSE_COO' + elif output_name.endswith('@SparseCsrTensor'): + sparse_type = 'TensorType::SPARSE_CSR' + return sparse_type + + def gene_output(self, + output_type_list, + set_out_func, + code_indent, + inplace_flag=False): + kernel_output = "" + output_names = [] + output_create = "" + + if len(output_type_list) == 1: + kernel_output = 'kernel_out' + output_names.append('kernel_out') + inplace_assign = " = " + self.inplace_map[self.outputs['names'][ + 0]] if inplace_flag and self.inplace_map is not None and self.outputs[ + 'names'][0] in self.inplace_map else "" + output_create = f""" + {self.outputs['return_type']} out{inplace_assign}; + auto* kernel_out = {set_out_func}(&out, {self.get_kernel_tensor_out_type(self.outputs['names'][0])});""" + + elif len(output_type_list) > 1: + output_create = f""" + {self.outputs['return_type']} out;""" + + for i in range(len(output_type_list)): + kernel_output = kernel_output + f'kernel_out_{i}, ' + output_names.append(f'kernel_out_{i}') + if inplace_flag and self.inplace_map is not None and self.outputs[ + 'names'][i] in self.inplace_map: + output_create = output_create + f""" + std::get<{i}>(out) = {self.inplace_map[self.outputs['names'][i]]};""" + + output_create = output_create + f""" + auto* kernel_out_{i} = {set_out_func}(&std::get<{i}>(out), {self.get_kernel_tensor_out_type(self.outputs['names'][i])});""" + + kernel_output = kernel_output[:-2] + else: + raise ValueError( + "{} : Output error: the output should not be empty.".format( + self.api)) + + return kernel_output, output_names, output_create + + def gen_sparse_kernel_context(self, kernel_output_names): + input_trans_map = { + 'const Tensor&': 'const phi::TensorBase&', + 'const std::vector<Tensor>&': 'const std::vector<phi::TensorBase>&', + 'const paddle::optional<Tensor>&': + 'paddle::optional<const phi::TensorBase&>' + } + out_trans_map = { + 'Tensor': 'phi::TensorBase*', + 'std::vector<Tensor>': 'std::vector<phi::TensorBase*>' + } + input_names = self.inputs['names'] + input_infos = self.inputs['input_info'] + + attr_names = self.attrs['names'] + kernel_param = self.kernel['param'] + if kernel_param is None: + kernel_param = input_names + attr_names + + kernel_context_code = "" + for param in kernel_param: + if param in input_names: + if param in self.optional_vars: + raise ValueError( + f"{self.api} : Unsupported optional input({param}) for sparse api."
+ ) + else: + kernel_context_code = kernel_context_code + f""" + kernel_context.EmplaceBackInput({param}.impl().get());""" + + continue + if param in attr_names: + # set attr for kernel_context + if 'ScalarArray' in self.attrs['attr_info'][param][0]: + param = 'phi::ScalarArray(' + param + ')' + elif 'Scalar' in self.attrs['attr_info'][param][0]: + param = 'phi::Scalar(' + param + ')' + elif isinstance(param, bool): + param = str(param).lower() + else: + param = str(param) + kernel_context_code = kernel_context_code + f""" + kernel_context.EmplaceBackAttr({param});""" + + for out_name in kernel_output_names: + kernel_context_code = kernel_context_code + f""" + kernel_context.EmplaceBackOutput({out_name});""" + + return kernel_context_code + + def gen_sparse_kernel_code(self, inplace_flag=False): + _, kernel_output_names, output_create = self.gene_output( + self.outputs['types'], 'SetSparseKernelOutput', '', inplace_flag) + + kernel_context_code = self.gen_sparse_kernel_context( + kernel_output_names) + + return f""" + auto phi_kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "{self.kernel['func'][0]}", {{kernel_backend, kernel_layout, kernel_data_type}}); + VLOG(6) << "{self.api} api sparse kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]"; + VLOG(6) << "{self.api} api sparse kernel: " << phi_kernel; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); + auto kernel_context = phi::KernelContext(dev_ctx); +{output_create} +{kernel_context_code} + phi_kernel(&kernel_context); + + return out;""" + + def gene_base_api_code(self, inplace_flag=False): + api_func_name = self.get_api_func_name() + return f""" +PADDLE_API {self.outputs['return_type']} {api_func_name}({self.args_str["args_define"]}) {{ +{self.gene_kernel_select()} +{self.gen_sparse_kernel_code(inplace_flag)} +}} +""" + + +def header_include(): + return """ +#include <tuple> + +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/common/scalar_array.h" +#include "paddle/utils/optional.h" +""" + + +def source_include(header_file_path): + return f""" +#include "{header_file_path}" +#include <memory> + +#include "glog/logging.h" + +#include "paddle/phi/api/lib/api_registry.h" +#include "paddle/phi/api/lib/api_gen_utils.h" +#include "paddle/phi/api/lib/data_transform.h" +#include "paddle/phi/api/lib/kernel_dispatch.h" +#include "paddle/phi/api/lib/sparse_api_custom_impl.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/declarations.h" +""" + + +def api_register(): + return """ +PD_REGISTER_API(Test); +""" + + +def api_namespace(): + return (""" +namespace paddle { +namespace experimental { +namespace sparse { + +""", """ + +} // namespace sparse +} // namespace experimental +} // namespace paddle +""") + + +def generate_api(api_yaml_path, header_file_path, source_file_path): + + with open(api_yaml_path, 'r') as f: + apis = yaml.load(f, Loader=yaml.FullLoader) + header_file = open(header_file_path, 'w') + source_file = open(source_file_path, 'w') + + namespace = api_namespace() + + header_file.write("#pragma once\n") + header_file.write(header_include()) + header_file.write(namespace[0]) + + include_header_file = "paddle/phi/api/include/sparse_api.h" + source_file.write(source_include(include_header_file)) + source_file.write(namespace[0]) + + for api in apis: + sparse_api = SparseAPI(api) + header_file.write(sparse_api.gene_api_declaration()) + source_file.write(sparse_api.gene_api_code()) + + 
header_file.write(namespace[1]) + source_file.write(namespace[1]) + + source_file.write(api_register()) + + header_file.close() + source_file.close() + + +def main(): + parser = argparse.ArgumentParser( + description='Generate PaddlePaddle C++ Sparse API files') + parser.add_argument( + '--api_yaml_path', + help='path to sparse api yaml file', + default='python/paddle/utils/code_gen/sparse_api.yaml') + + parser.add_argument( + '--api_header_path', + help='output of generated api header code file', + default='paddle/phi/api/include/sparse_api.h') + + parser.add_argument( + '--api_source_path', + help='output of generated api source code file', + default='paddle/phi/api/lib/sparse_api.cc') + + options = parser.parse_args() + + api_yaml_path = options.api_yaml_path + header_file_path = options.api_header_path + source_file_path = options.api_source_path + + generate_api(api_yaml_path, header_file_path, source_file_path) + + +if __name__ == '__main__': + main()
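
For reference, the sparse API generator added above is normally invoked by the build system, but it can also be driven by hand. The following is a minimal sketch, not part of the patch itself, that calls generate_api() with the same paths used as argparse defaults in sparse_api_gen.py; it assumes the repository root as the working directory and python/paddle/utils/code_gen on the Python import path.

# Minimal sketch: drive the new sparse API code generator directly.
# Assumes it is run from the PaddlePaddle repository root with
# python/paddle/utils/code_gen importable.
from sparse_api_gen import generate_api

generate_api(
    api_yaml_path='python/paddle/utils/code_gen/sparse_api.yaml',
    header_file_path='paddle/phi/api/include/sparse_api.h',
    source_file_path='paddle/phi/api/lib/sparse_api.cc')

This regenerates the sparse_api.h header and sparse_api.cc source from sparse_api.yaml, emitting one PADDLE_API declaration and its definition per sparse_api entry.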