Commit 9e00395a authored by: phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into move_yolo_box_to_phi

......@@ -7,9 +7,11 @@ paddle/fluid/op_use_default_grad_maker_DEV.spec
paddle/fluid/op_use_default_grad_maker_PR.spec
paddle/phi/api/backward/backward_api.h
paddle/phi/api/include/api.h
paddle/phi/api/include/sparse_api.h
paddle/phi/api/lib/api.cc
paddle/phi/api/lib/dygraph_api.*
paddle/phi/api/lib/backward_api.cc
paddle/phi/api/lib/sparse_api.cc
paddle/phi/extension.h
paddle/phi/include/*
paddle/phi/infermeta/generated.*
......
cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api)
if (WITH_DISTRIBUTE)
cc_library(processgroup_gloo SRCS ProcessGroupGloo.cc DEPS phi phi_api eager_api gloo_wrapper)
endif()
cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup)
if(WITH_NCCL)
......
......@@ -117,6 +117,35 @@ class ProcessGroup {
"ProcessGroup%s does not support receive", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> AllGather(
std::vector<Tensor>& in_tensors /* tensors */, // NOLINT
std::vector<Tensor>& out_tensors /* tensors */) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support AllGather", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> AllToAll(
std::vector<Tensor>& in /* tensors */, // NOLINT
std::vector<Tensor>& out /* tensors */) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support AllToAll", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Reduce(
std::vector<Tensor>& tensors /* tensors */, // NOLINT
const ReduceOptions& opts) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support Reduce", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Scatter(
std::vector<Tensor>& in_tensors /* tensors */, // NOLINT
std::vector<Tensor>& out_tensors /* tensors */, // NOLINT
const ScatterOptions&) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support Scatter", GetBackendName()));
}
protected:
const int rank_;
const int size_;
......
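The new AllGather / AllToAll / Reduce / Scatter entries above follow the existing ProcessGroup convention: the base class declares a virtual collective that throws an "unsupported" error, and each concrete backend overrides only the collectives it actually implements. A minimal standalone sketch of that pattern (illustrative names, not Paddle's real classes):

#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

struct DemoTensor { std::vector<float> data; };  // stand-in for a paddle Tensor

class DemoProcessGroup {
 public:
  virtual ~DemoProcessGroup() = default;
  virtual std::string GetBackendName() const = 0;
  // Default: this backend does not support the collective.
  virtual void AllGather(std::vector<DemoTensor>& in, std::vector<DemoTensor>& out) {
    throw std::runtime_error("ProcessGroup" + GetBackendName() + " does not support AllGather");
  }
};

class DemoCpuGroup : public DemoProcessGroup {
 public:
  std::string GetBackendName() const override { return "CPU"; }
  // Override only what this backend supports (trivial single-rank copy here).
  void AllGather(std::vector<DemoTensor>& in, std::vector<DemoTensor>& out) override {
    out = in;
  }
};

int main() {
  DemoCpuGroup group;
  std::vector<DemoTensor> in{{{1.f, 2.f}}}, out;
  group.AllGather(in, out);
  std::cout << "gathered " << out.size() << " tensor(s)" << std::endl;
  return 0;
}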
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#ifdef _WIN32
#include <gloo/common/win.h>
#include <winsock2.h>
#include <ws2tcpip.h>
#else
#include <netdb.h>
#include <sys/socket.h>
#include <unistd.h>
#endif
#include <gloo/broadcast.h>
#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace distributed {
#ifdef _WIN32
#define GENERATE_FUNC(type, func, ...) \
switch (type) { \
case experimental::DataType::FLOAT32: \
func<float>(__VA_ARGS__); \
break; \
case experimental::DataType::FLOAT64: \
func<double>(__VA_ARGS__); \
break; \
case experimental::DataType::FLOAT16: \
func<gloo::float16>(__VA_ARGS__); \
break; \
case experimental::DataType::INT32: \
func<int32_t>(__VA_ARGS__); \
break; \
case experimental::DataType::INT64: \
func<int64_t>(__VA_ARGS__); \
break; \
default: \
VLOG(0) << "Error: Unknown DataType."; \
exit(-1); \
}
#define HOST_NAME_MAX 256
#else
#define GENERATE_FUNC(type, func, args...) \
switch (type) { \
case experimental::DataType::FLOAT32: \
func<float>(args); \
break; \
case experimental::DataType::FLOAT64: \
func<double>(args); \
break; \
case experimental::DataType::FLOAT16: \
func<gloo::float16>(args); \
break; \
case experimental::DataType::INT32: \
func<int32_t>(args); \
break; \
case experimental::DataType::INT64: \
func<int64_t>(args); \
break; \
default: \
VLOG(0) << "Error: Unknown DataType."; \
exit(-1); \
}
#endif
typedef void (*reduce_func)(void*, const void*, const void*, size_t);
template <typename T>
reduce_func get_function(const ReduceOp& r) {
switch (r) {
case ReduceOp::SUM:
return reduce_func(&::gloo::sum<T>);
case ReduceOp::PRODUCT:
return reduce_func(&::gloo::product<T>);
case ReduceOp::MIN:
return reduce_func(&::gloo::min<T>);
case ReduceOp::MAX:
return reduce_func(&::gloo::max<T>);
case ReduceOp::AVG:
VLOG(0) << "Error: Unsupported ReduceOp::AVG.";
exit(-1);
}
VLOG(0) << "Error: Unknown ReduceOp.";
exit(-1);
}
bool CheckTensorsInCPUPlace(const std::vector<Tensor>& tensors) {
return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) {
return t.place() == PlaceType::kCPU;
});
}
template <typename T>
T* get_data(const Tensor& tensor) {
auto raw_tensor = std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
return static_cast<T*>(raw_tensor->data());
}
template <typename T>
std::vector<T*> get_multi_data(const std::vector<Tensor>& tensors) {
std::vector<T*> ret(tensors.size());
for (size_t i = 0; i < tensors.size(); i++) {
ret[i] = get_data<T>(tensors[i]);
}
return ret;
}
template <typename T, typename P>
void set_output(P& opts, const Tensor& tensor) { // NOLINT
opts.setOutput(get_data<T>(tensor), tensor.numel());
}
template <typename T, typename P>
void set_input(P& opts, const Tensor& tensor) { // NOLINT
opts.setInput(get_data<T>(tensor), tensor.numel());
}
template <typename T, typename P>
void set_outputs(P& opts, const std::vector<Tensor>& tensors) { // NOLINT
opts.setOutputs(get_multi_data<T>(tensors), tensors[0].numel());
}
template <typename T, typename P>
void set_inputs(P& opts, const std::vector<Tensor>& tensors) { // NOLINT
opts.setInputs(get_multi_data<T>(tensors), tensors[0].numel());
}
ProcessGroupGloo::GlooTask::GlooTask(int rank,
const std::vector<Tensor>& inputs,
CommType comm_type)
: ProcessGroup::Task(rank, inputs, comm_type) {
PADDLE_ENFORCE_EQ(CheckTensorsInCPUPlace(inputs), true,
platform::errors::Fatal(
"Only CPU place is supported for ProcessGroupGloo."));
}
ProcessGroupGloo::ProcessGroupGloo(const std::shared_ptr<GlooStore>& store,
int rank, int world_size,
const std::shared_ptr<GlooOptions> options)
: ProcessGroup(rank, world_size), _tag(0), _store(store) {
_context = std::make_shared<gloo::rendezvous::Context>(rank, world_size);
auto prefix_store =
::gloo::rendezvous::PrefixStore(std::to_string(0), *_store);
_context->connectFullMesh(prefix_store, options->device);
}
class BroadcastGlooTask : public ProcessGroupGloo::GlooTask {
public:
BroadcastGlooTask(const std::shared_ptr<gloo::Context>& context,
const std::vector<Tensor>& inputs, int rank, int root,
uint32_t tag)
: ProcessGroupGloo::GlooTask(rank, inputs, CommType::BROADCAST),
_context(context),
_root(root),
_inputs(inputs),
_tag(tag) {}
void Run() override { _do_broadcast(_inputs[0]); }
private:
std::shared_ptr<gloo::Context> _context;
const int _root;
std::vector<Tensor> _inputs{};
const uint32_t _tag;
void _do_broadcast(const Tensor& tensor) {
gloo::BroadcastOptions opts(_context);
const auto& dtype = tensor.type();
GENERATE_FUNC(dtype, set_output, opts, tensor);
opts.setRoot(_root);
opts.setTag(_tag);
gloo::broadcast(opts);
}
};
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Broadcast(
std::vector<Tensor>& inputs, const BroadcastOptions& opts) {
auto root = opts.source_rank;
std::unique_ptr<BroadcastGlooTask> task;
auto tag = next_tag();
auto context = get_context();
task = std::make_unique<BroadcastGlooTask>(context, inputs, rank_, root, tag);
task->Run();
return task;
}
class AllreduceGlooTask : public ProcessGroupGloo::GlooTask {
public:
AllreduceGlooTask(int rank, const std::shared_ptr<gloo::Context>& context,
std::vector<Tensor>& inputs, ReduceOp reduce_op, // NOLINT
uint32_t tag)
: ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLREDUCE),
_context(context),
_inputs(inputs),
_reduce_op(reduce_op),
_tag(tag) {}
void Run() override { _do_allreduce(_inputs); }
private:
std::shared_ptr<gloo::Context> _context;
std::vector<Tensor> _inputs;
const ReduceOp _reduce_op;
uint32_t _tag;
gloo::AllreduceOptions::Func _get_function(const experimental::DataType type,
const ReduceOp op) {
gloo::AllreduceOptions::Func fn;
GENERATE_FUNC(type, _get_function_impl, fn, op);
return fn;
}
template <typename T>
void _get_function_impl(gloo::AllreduceOptions::Func& fn, // NOLINT
const ReduceOp op) {
fn = get_function<T>(op);
}
void _do_allreduce(std::vector<Tensor>& tensors) { // NOLINT
const auto& dtype = tensors[0].type();
gloo::AllreduceOptions opts(_context);
GENERATE_FUNC(dtype, set_inputs, opts, tensors);
GENERATE_FUNC(dtype, set_outputs, opts, tensors);
opts.setReduceFunction(_get_function(dtype, _reduce_op));
opts.setTag(_tag);
gloo::allreduce(opts);
}
};
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::AllReduce(
std::vector<Tensor>& inputs, const AllreduceOptions& opts) {
auto tag = next_tag();
std::shared_ptr<GlooTask> task;
auto context = get_context();
task = std::make_shared<AllreduceGlooTask>(rank_, context, inputs,
opts.reduce_op, tag);
task->Run();
return task;
}
std::shared_ptr<::gloo::transport::Device>
ProcessGroupGloo::createDeviceForInterface(const std::string& ifname) {
::gloo::transport::tcp::attr attr;
attr.iface = ifname;
return ::gloo::transport::tcp::CreateDevice(attr);
}
std::shared_ptr<::gloo::transport::Device>
ProcessGroupGloo::createDeviceForHostname(const std::string& hostname) {
::gloo::transport::tcp::attr attr;
attr.hostname = hostname;
return ::gloo::transport::tcp::CreateDevice(attr);
}
std::shared_ptr<::gloo::transport::Device>
ProcessGroupGloo::createDefaultDevice() {
std::array<char, HOST_NAME_MAX> hostname{};
auto ret = ::gethostname(hostname.data(), HOST_NAME_MAX);
PADDLE_ENFORCE_EQ(ret, 0, platform::errors::Fatal(
"Get hostname error for createDefaultDevice."));
::addrinfo* result;
result = tcputils::get_addr_info(hostname.data(), "", 0, AF_UNSPEC);
::addrinfo* cur;
for (cur = result; cur != nullptr; cur = cur->ai_next) {
SocketType socket =
::socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol);
if (socket == -1) {
continue;
}
ret = ::bind(socket, cur->ai_addr, cur->ai_addrlen);
#ifdef _WIN32
closesocket(socket);
#else
close(socket);
#endif
if (ret == -1) {
continue;
}
break;
}
freeaddrinfo(result);
if (cur != nullptr) {
return createDeviceForHostname(hostname.data());
}
return createDeviceForHostname("127.0.0.1");
}
} // namespace distributed
} // namespace paddle
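GENERATE_FUNC above is a runtime-dtype-to-template dispatch: a DataType value known only at runtime selects which instantiation of a function template gets called. A simplified standalone version of the same idea (illustrative enum and names, not Paddle's):

#include <cstdint>
#include <iostream>

enum class DemoDataType { FLOAT32, FLOAT64, INT32, INT64 };

// Switch on the runtime dtype and forward to the matching template instantiation.
#define DEMO_GENERATE_FUNC(type, func, ...)                           \
  switch (type) {                                                     \
    case DemoDataType::FLOAT32: func<float>(__VA_ARGS__); break;      \
    case DemoDataType::FLOAT64: func<double>(__VA_ARGS__); break;     \
    case DemoDataType::INT32:   func<int32_t>(__VA_ARGS__); break;    \
    case DemoDataType::INT64:   func<int64_t>(__VA_ARGS__); break;    \
  }

template <typename T>
void print_elem_size(const char* tag) {
  std::cout << tag << ": element size = " << sizeof(T) << " bytes" << std::endl;
}

int main() {
  DemoDataType dtype = DemoDataType::FLOAT64;          // only known at runtime
  DEMO_GENERATE_FUNC(dtype, print_elem_size, "demo");  // calls print_elem_size<double>
  return 0;
}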
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <future>
#include <mutex>
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#ifdef PADDLE_WITH_GLOO
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
#include "paddle/fluid/distributed/store/store.h"
#include "paddle/fluid/distributed/store/tcp_store.h"
constexpr const char* GLOO_BACKEND_NAME = "GLOO";
namespace paddle {
namespace distributed {
class ProcessGroupGloo : public ProcessGroup {
public:
class GlooTask : public ProcessGroup::Task,
public std::enable_shared_from_this<GlooTask> {
public:
explicit GlooTask(int rank, const std::vector<Tensor>& input_tensors,
CommType comm_type);
~GlooTask() = default;
virtual void Run() = 0;
bool Wait(std::chrono::milliseconds timeout) override { return true; }
bool IsCompleted() override { return true; }
void Synchronize() override {}
protected:
friend class ProcessGroupGloo;
};
class GlooStore : public ::gloo::rendezvous::Store {
public:
explicit GlooStore(
const std::shared_ptr<paddle::distributed::TCPStore>& store)
: _store(store) {}
~GlooStore() = default;
std::vector<char> get(const std::string& key) override {
VLOG(3) << "GlooStore::get";
auto value = _store->get(key);
return std::vector<char>(value.begin(), value.end());
}
void wait(const std::vector<std::string>& keys) override {
VLOG(3) << "GlooStore::wait";
for (auto& key : keys) {
_store->wait(key);
}
}
void set(const std::string& key, const std::vector<char>& value) override {
VLOG(3) << "GlooStore::set";
std::vector<uint8_t> tmp(value.begin(), value.end());
_store->set(key, tmp);
}
void wait(const std::vector<std::string>& keys,
const std::chrono::milliseconds& timeout) override {
VLOG(3) << "GlooStore::wait";
for (auto& key : keys) {
_store->wait(key);
}
// wait(keys);
}
protected:
std::shared_ptr<paddle::distributed::TCPStore> _store;
};
class GlooOptions {
public:
GlooOptions() = default;
~GlooOptions() = default;
static std::shared_ptr<GlooOptions> create() {
return std::make_shared<GlooOptions>();
}
std::shared_ptr<::gloo::transport::Device> device;
};
explicit ProcessGroupGloo(const std::shared_ptr<GlooStore>& store, int rank,
int world_size,
std::shared_ptr<GlooOptions> options);
~ProcessGroupGloo() = default;
std::shared_ptr<ProcessGroup::Task> Broadcast(
std::vector<Tensor>& inputs,
const BroadcastOptions& = BroadcastOptions()) override;
std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<Tensor>& inputs,
const AllreduceOptions& opts = AllreduceOptions()) override;
std::shared_ptr<::gloo::Context> get_context() { return _context; }
uint64_t next_tag() { return _tag++; }
const std::string GetBackendName() const override {
return GLOO_BACKEND_NAME;
}
// Helper functions for Gloo.
static std::shared_ptr<::gloo::transport::Device> createDeviceForHostname(
const std::string& hostname);
static std::shared_ptr<::gloo::transport::Device> createDeviceForInterface(
const std::string& ifname);
static std::shared_ptr<::gloo::transport::Device> createDefaultDevice();
protected:
uint32_t _tag;
std::shared_ptr<gloo::rendezvous::Context> _context;
std::shared_ptr<GlooStore> _store;
};
} // namespace distributed
} // namespace paddle
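GlooStore above is a thin adapter: it implements gloo's rendezvous Store interface (get / set / wait) by delegating every call to the shared TCPStore and converting between char and uint8_t buffers. A standalone sketch of that adapter shape, with a plain in-memory map standing in for the TCP-backed store (illustrative only):

#include <cstdint>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>

// Stand-in for the TCP-backed key/value store used for rendezvous.
class DemoKVStore {
 public:
  void set(const std::string& key, const std::vector<uint8_t>& value) { data_[key] = value; }
  std::vector<uint8_t> get(const std::string& key) { return data_[key]; }
 private:
  std::map<std::string, std::vector<uint8_t>> data_;
};

// Interface the rendezvous layer expects (mirrors gloo's Store in spirit).
class DemoRendezvousStore {
 public:
  virtual ~DemoRendezvousStore() = default;
  virtual void set(const std::string& key, const std::vector<char>& value) = 0;
  virtual std::vector<char> get(const std::string& key) = 0;
};

// Adapter: forwards each call to the underlying store, converting char <-> uint8_t.
class DemoStoreAdapter : public DemoRendezvousStore {
 public:
  explicit DemoStoreAdapter(std::shared_ptr<DemoKVStore> store) : store_(std::move(store)) {}
  void set(const std::string& key, const std::vector<char>& value) override {
    store_->set(key, std::vector<uint8_t>(value.begin(), value.end()));
  }
  std::vector<char> get(const std::string& key) override {
    auto v = store_->get(key);
    return std::vector<char>(v.begin(), v.end());
  }
 private:
  std::shared_ptr<DemoKVStore> store_;
};

int main() {
  auto kv = std::make_shared<DemoKVStore>();
  DemoStoreAdapter adapter(kv);
  adapter.set("rank0/addr", {'1', '2', '7'});
  std::cout << adapter.get("rank0/addr").size() << " bytes" << std::endl;
  return 0;
}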
......@@ -473,5 +473,148 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv(
return task;
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllGather(
std::vector<Tensor>& in_tensors, std::vector<Tensor>& out_tensors) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors), true,
platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
return Collective(
in_tensors, out_tensors,
[&](const Tensor& input, Tensor& output, ncclComm_t comm,
const gpuStream_t& stream) {
auto input_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
auto output_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
return platform::dynload::ncclAllGather(
input_tensor->data(), output_tensor->data(), input_tensor->numel(),
platform::ToNCCLDataType(input.type()), comm, stream);
},
CommType::ALLGATHER);
}
void* GetPointerByOffset(void* raw_pointer, size_t offset,
experimental::DataType type) {
if (type == experimental::DataType::FLOAT32) {
return reinterpret_cast<void*>(reinterpret_cast<float*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::FLOAT64) {
return reinterpret_cast<void*>(reinterpret_cast<double*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::INT32) {
return reinterpret_cast<void*>(reinterpret_cast<int32_t*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::INT64) {
return reinterpret_cast<void*>(reinterpret_cast<int64_t*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::FLOAT16) {
return reinterpret_cast<void*>(reinterpret_cast<int16_t*>(raw_pointer) +
offset);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"This datatype in nccl is not supported."));
}
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllToAll(
std::vector<Tensor>& in_tensors, std::vector<Tensor>& out_tensors) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
return Collective(
in_tensors, out_tensors,
[&](const Tensor& input, Tensor& output, ncclComm_t comm,
const gpuStream_t& stream) {
auto input_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
auto output_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
size_t offset = 0;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
for (auto i = 0; i < size_; i++) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
GetPointerByOffset(input_tensor->data(), offset, input.type()),
input_tensor->numel() / size_,
platform::ToNCCLDataType(input.type()), i, comm, stream));
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
GetPointerByOffset(output_tensor->data(), offset, input.type()),
input_tensor->numel() / size_,
platform::ToNCCLDataType(input.type()), i, comm, stream));
offset += input_tensor->numel() / size_;
}
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
},
CommType::ALLTOALL);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Reduce(
std::vector<Tensor>& tensors, const ReduceOptions& opts) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
return Collective(
tensors, tensors,
[&](const Tensor& input, Tensor& output, ncclComm_t comm,
const gpuStream_t& stream) {
auto input_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
auto output_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce(
input_tensor->data(), output_tensor->data(), input.numel(),
platform::ToNCCLDataType(input.type()),
ToNCCLRedType(opts.reduce_op), opts.root_rank, comm, stream));
},
CommType::REDUCE);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Scatter(
std::vector<Tensor>& in_tensors, std::vector<Tensor>& out_tensors,
const ScatterOptions& opts) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
return Collective(
in_tensors, out_tensors,
[&](const Tensor& input, Tensor& output, ncclComm_t comm,
const gpuStream_t& stream) {
auto input_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
auto output_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
size_t offset = 0;
if (rank_ == opts.root_rank) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
for (auto i = 0; i < size_; i++) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
GetPointerByOffset(input_tensor->data(), offset, input.type()),
input_tensor->numel() / size_,
platform::ToNCCLDataType(input.type()), i, comm, stream));
offset += input_tensor->numel() / size_;
}
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
output_tensor->data(), input_tensor->numel() / size_,
platform::ToNCCLDataType(input.type()), opts.root_rank, comm,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
} else {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
output_tensor->data(), input_tensor->numel() / size_,
platform::ToNCCLDataType(input.type()), opts.root_rank, comm,
stream));
}
},
CommType::SCATTER);
}
} // namespace distributed
} // namespace paddle
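The NCCL AllToAll above pairs one ncclSend and one ncclRecv per peer inside an ncclGroupStart()/ncclGroupEnd() section, advancing the buffer offset by numel / size_ elements per peer. The chunking arithmetic in isolation, as a standalone single-process sketch (no NCCL involved):

#include <cstddef>
#include <iostream>
#include <vector>

// Split a flat buffer of `numel` elements evenly across `world_size` peers and
// report which slice goes to which peer (mirrors the offset bookkeeping above).
int main() {
  const int world_size = 4;
  std::vector<float> input(8, 1.0f);                  // numel = 8
  const size_t chunk = input.size() / world_size;     // elements per peer

  size_t offset = 0;
  for (int peer = 0; peer < world_size; ++peer) {
    // In the real collective this slice is handed to ncclSend for `peer`,
    // and the matching slice of the output buffer is handed to ncclRecv.
    std::cout << "peer " << peer << ": elements [" << offset << ", "
              << offset + chunk << ")" << std::endl;
    offset += chunk;
  }
  return 0;
}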
......@@ -98,6 +98,20 @@ class ProcessGroupNCCL : public ProcessGroup {
std::shared_ptr<ProcessGroup::Task> Recv(std::vector<Tensor>& tensors,
int src_rank) override;
std::shared_ptr<ProcessGroup::Task> AllGather(
std::vector<Tensor>& in_tensors,
std::vector<Tensor>& out_tensors) override;
std::shared_ptr<ProcessGroup::Task> AllToAll(
std::vector<Tensor>& in, std::vector<Tensor>& out) override;
std::shared_ptr<ProcessGroup::Task> Reduce(
std::vector<Tensor>& tensors, const ReduceOptions& opts) override;
std::shared_ptr<ProcessGroup::Task> Scatter(std::vector<Tensor>& in_tensors,
std::vector<Tensor>& out_tensors,
const ScatterOptions&) override;
protected:
virtual std::shared_ptr<ProcessGroupNCCL::NCCLTask> CreateTask(
std::vector<Place> places, int rank, CommType opType,
......
......@@ -36,5 +36,14 @@ struct BarrierOptions {
std::vector<int> place_ids;
};
struct ReduceOptions {
ReduceOp reduce_op = ReduceOp::SUM;
int root_rank = 0;
};
struct ScatterOptions {
int root_rank = 0;
};
} // namespace distributed
} // namespace paddle
......@@ -32,6 +32,8 @@ class Store {
virtual int64_t add(const std::string& key, int64_t value) = 0;
virtual std::vector<uint8_t> get(const std::string& key) = 0;
virtual void wait(const std::string& key) = 0;
virtual void set(const std::string& key,
const std::vector<uint8_t>& value) = 0;
virtual const std::chrono::seconds& timeout() const { return _timeout; }
......
......@@ -27,11 +27,13 @@ namespace detail {
constexpr int INFTIME = -1;
std::unique_ptr<MasterDaemon> MasterDaemon::start(SocketType socket) {
return std::make_unique<MasterDaemon>(socket);
std::unique_ptr<MasterDaemon> MasterDaemon::start(SocketType socket,
int nranks) {
return std::make_unique<MasterDaemon>(socket, nranks);
}
MasterDaemon::MasterDaemon(SocketType socket) : _listen_socket(socket) {
MasterDaemon::MasterDaemon(SocketType socket, int nranks)
: _listen_socket(socket), _nranks(nranks) {
_background_thread = std::thread{&MasterDaemon::run, this};
}
......@@ -64,6 +66,13 @@ void MasterDaemon::_do_add(SocketType socket) {
tcputils::send_value<int64_t>(socket, new_value);
}
void MasterDaemon::_do_set(SocketType socket) {
VLOG(3) << "MasterDaemon::_do_set";
std::string key = tcputils::receive_string(socket);
auto value = tcputils::receive_vector<uint8_t>(socket);
_store[key] = value;
}
void MasterDaemon::_do_get(SocketType socket) {
std::string key = tcputils::receive_string(socket);
auto iter = _store.find(key);
......@@ -71,16 +80,15 @@ void MasterDaemon::_do_get(SocketType socket) {
iter, _store.end(),
platform::errors::InvalidArgument("Key %s not found in TCPStore.", key));
std::vector<uint8_t> value = iter->second;
VLOG(3) << "TCPStore: value ("
<< std::stoll(std::string(reinterpret_cast<char*>(value.data()),
value.size()))
<< ") for key (" << key << ").";
tcputils::send_vector<uint8_t>(socket, value);
}
void MasterDaemon::_do_stop(SocketType socket) {
VLOG(3) << "MasterDaemon::_do_stop";
ReplyType value = ReplyType::STOP_WAIT;
if (--_nranks == 0) {
_stop = true;
}
tcputils::send_value<ReplyType>(socket, value);
}
......@@ -140,21 +148,27 @@ void MasterDaemon::run() {
case Command::GET:
_do_get(fds[i].fd);
break;
case Command::SET:
_do_set(fds[i].fd);
break;
case Command::WAIT:
_do_wait(fds[i].fd);
break;
case Command::STOP:
_do_stop(fds[i].fd);
break;
default:
VLOG(0) << "Unknow command: " << static_cast<int>(command);
exit(-1);
}
}
}
}
std::unique_ptr<TCPServer> TCPServer::create(uint16_t port) {
std::unique_ptr<TCPServer> TCPServer::create(uint16_t port, int nranks) {
int socket = tcputils::tcp_listen("", std::to_string(port), AF_INET);
auto server = std::make_unique<TCPServer>();
server->_master_daemon = MasterDaemon::start(socket);
server->_master_daemon = MasterDaemon::start(socket, nranks);
return server;
}
......@@ -200,7 +214,7 @@ TCPStore::TCPStore(std::string host, uint16_t port, bool is_master,
size_t num_workers, std::chrono::seconds timeout)
: Store(timeout), _is_master(is_master), _num_workers(num_workers) {
if (_is_master) {
_server = detail::TCPServer::create(port);
_server = detail::TCPServer::create(port, num_workers);
}
_client = detail::TCPClient::connect(host, port);
......@@ -213,7 +227,6 @@ void TCPStore::waitWorkers() {
}
add(_init_key, 1);
if (_server) {
auto begin = std::chrono::steady_clock::now();
do {
auto value = get(_init_key);
......@@ -233,16 +246,22 @@ void TCPStore::waitWorkers() {
"TCPStore timeouted and not all workers got ready."));
}
} while (true);
}
VLOG(3) << "TCPStore initialized.";
}
int64_t TCPStore::add(const std::string& key, int64_t value) {
VLOG(3) << "TCPStore add.";
_client->send_command_for_key(Command::ADD, _key_prefix + key);
_client->send_value<std::int64_t>(value);
return _client->receive_value<std::int64_t>();
}
void TCPStore::set(const std::string& key, const std::vector<uint8_t>& value) {
VLOG(3) << "TCPStore set.";
_client->send_command_for_key(Command::SET, _key_prefix + key);
_client->send_vector<std::uint8_t>(value);
}
std::vector<uint8_t> TCPStore::get(const std::string& key) {
wait(key);
_client->send_command_for_key(Command::GET, _key_prefix + key);
......@@ -252,6 +271,7 @@ std::vector<uint8_t> TCPStore::get(const std::string& key) {
void TCPStore::wait(const std::string& key) {
ReplyType reply;
VLOG(3) << "TCPStore wait.";
do {
_client->send_command_for_key(Command::WAIT, _key_prefix + key);
......@@ -262,6 +282,7 @@ void TCPStore::wait(const std::string& key) {
TCPStore::~TCPStore() {
_client->send_command_for_key(Command::STOP, "");
VLOG(3) << "~TCPStore";
ReplyType ret = _client->receive_value<ReplyType>();
PADDLE_ENFORCE_EQ(ret, ReplyType::STOP_WAIT,
platform::errors::InvalidArgument(
......
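The tcp_store changes above add a SET command and make shutdown rank-aware: MasterDaemon is now constructed with nranks, and _do_stop only flips _stop once every rank has sent its STOP. The countdown bookkeeping in isolation (standalone sketch, not the real daemon):

#include <iostream>

// Minimal sketch of the shutdown logic above: the daemon is created with the
// number of participating ranks and only stops once every rank has sent STOP.
class DemoDaemon {
 public:
  explicit DemoDaemon(int nranks) : nranks_(nranks) {}
  // Returns true when this STOP was the last one and the daemon may exit.
  bool HandleStop() {
    if (--nranks_ == 0) stop_ = true;
    return stop_;
  }
  bool stopped() const { return stop_; }
 private:
  int nranks_;
  bool stop_ = false;
};

int main() {
  DemoDaemon daemon(3);
  daemon.HandleStop();                              // rank 0 exits
  daemon.HandleStop();                              // rank 1 exits
  std::cout << daemon.stopped() << std::endl;       // 0: still serving rank 2
  daemon.HandleStop();                              // last rank exits
  std::cout << daemon.stopped() << std::endl;       // 1: daemon may shut down
  return 0;
}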
......@@ -27,15 +27,16 @@ namespace paddle {
namespace distributed {
enum class ReplyType { WAITING, STOP_WAIT };
enum class Command { ADD, GET, WAIT, STOP };
enum class Command { ADD, GET, SET, WAIT, STOP };
namespace detail {
class MasterDaemon {
public:
static std::unique_ptr<MasterDaemon> start(SocketType listen_socket);
static std::unique_ptr<MasterDaemon> start(SocketType listen_socket,
int nranks);
MasterDaemon() = delete;
explicit MasterDaemon(SocketType listen_socket);
explicit MasterDaemon(SocketType listen_socket, int nranks);
~MasterDaemon();
private:
......@@ -43,18 +44,20 @@ class MasterDaemon {
void _do_add(SocketType socket);
void _do_wait(SocketType socket);
void _do_get(SocketType socket);
void _do_set(SocketType socket);
void _do_stop(SocketType socket);
SocketType _listen_socket;
std::vector<SocketType> _sockets;
std::unordered_map<std::string, std::vector<uint8_t>> _store;
std::thread _background_thread{};
int _nranks;
bool _stop = false;
};
class TCPServer {
public:
TCPServer() = default;
static std::unique_ptr<TCPServer> create(std::uint16_t port);
static std::unique_ptr<TCPServer> create(std::uint16_t port, int nranks);
private:
std::unique_ptr<MasterDaemon> _master_daemon;
......@@ -97,6 +100,7 @@ class TCPStore : public Store {
int64_t add(const std::string& key, int64_t value) override;
std::vector<uint8_t> get(const std::string& key) override;
void wait(const std::string& key) override;
void set(const std::string& key, const std::vector<uint8_t>& value) override;
private:
void waitWorkers();
......
......@@ -46,9 +46,10 @@ void close_socket(SocketType socket) {
hints.ai_socktype = SOCK_STREAM;
const char* node = host.empty() ? nullptr : host.c_str();
const char* port_cstr = port.empty() ? nullptr : port.c_str();
int n;
n = ::getaddrinfo(node, port.c_str(), &hints, &res);
n = ::getaddrinfo(node, port_cstr, &hints, &res);
const char* gai_err = ::gai_strerror(n);
const char* proto =
(family == AF_INET ? "IPv4" : family == AF_INET6 ? "IPv6" : "");
......
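The tcp_utils fix above forwards nullptr to ::getaddrinfo() when the port string is empty instead of passing port.c_str(): a null service pointer means "no service requested", whereas an empty string may be rejected as an unknown service on some platforms. A standalone POSIX example of the same call shape (illustrative host name):

#include <netdb.h>
#include <sys/socket.h>
#include <cstdio>
#include <cstring>
#include <string>

int main() {
  const std::string host = "localhost";
  const std::string port = "";  // no port known yet

  addrinfo hints;
  std::memset(&hints, 0, sizeof(hints));
  hints.ai_family = AF_UNSPEC;
  hints.ai_socktype = SOCK_STREAM;

  // Empty strings become nullptr, matching the fix above.
  const char* node = host.empty() ? nullptr : host.c_str();
  const char* service = port.empty() ? nullptr : port.c_str();

  addrinfo* res = nullptr;
  int n = ::getaddrinfo(node, service, &hints, &res);
  if (n != 0) {
    std::fprintf(stderr, "getaddrinfo: %s\n", ::gai_strerror(n));
    return 1;
  }
  std::printf("resolved %zu-byte sockaddr for %s\n",
              static_cast<size_t>(res->ai_addrlen), host.c_str());
  ::freeaddrinfo(res);
  return 0;
}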
......@@ -24,11 +24,14 @@ class GradNodeAccumulation : public GradNodeBase {
public:
// Constructor: configure fwd input tensors to grad node
explicit GradNodeAccumulation(AutogradMeta* meta) : GradNodeBase(1, 1) {
VLOG(6) << "Construct GradNodeAccumulation";
weak_grad_ = meta->WeakGrad();
SetDefaultGradInOutMeta();
}
~GradNodeAccumulation() override = default;
~GradNodeAccumulation() override {
VLOG(6) << "Destruct GradNodeAccumulation";
}
// Functor: perform backward computations
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
......
......@@ -46,7 +46,7 @@ class GradNodeScale : public GradNodeBase {
const std::vector<paddle::experimental::Tensor>& tensors);
void SetAttributes_scale(float scale);
std::string name() override { return ""; }
// Members: define fwd input tensors
// For Scale there is no fwd input tensor needed
private:
......
......@@ -996,6 +996,29 @@ static std::string GenerateGradNodeCreationContent(
// then generate: "egr::AutogradMeta* p_autograd_out =
// egr::EagerUtils::autograd_meta("op_proto->outputs()[0].name()")"
std::string get_autograd_meta_str = " // Prepare Autograd Meta \n";
// If single output slotname and not duplicable,
// then generate: "egr::AutogradMeta* p_autograd_out =
// egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")"
for (const proto::OpProto::Var& output : out_vars) {
const std::string& output_name = output.name();
const std::string& output_autograd_name = "p_autograd_" + output_name;
if (output.duplicable()) {
const char* GET_MULTI_AUTOGRAD_META_TEMPLATE =
" std::vector<egr::AutogradMeta*> %s = "
"egr::EagerUtils::autograd_meta(&%s);\n";
get_autograd_meta_str += paddle::string::Sprintf(
GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
} else {
const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
" egr::AutogradMeta* %s = "
"egr::EagerUtils::autograd_meta(&%s);\n";
get_autograd_meta_str += paddle::string::Sprintf(
GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
}
}
VLOG(6) << "Generated outputs autograd_meta";
for (const proto::OpProto::Var& input : in_vars) {
const std::string& input_name = input.name();
const std::string& input_autograd_name = "p_autograd_" + input_name;
......@@ -1024,31 +1047,6 @@ static std::string GenerateGradNodeCreationContent(
}
VLOG(6) << "Generated inputs autograd_meta";
// If single output slotname and not duplicable,
// then generate: "egr::AutogradMeta* p_autograd_out =
// egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")"
for (const proto::OpProto::Var& output : out_vars) {
const std::string& output_name = output.name();
const std::string& output_autograd_name = "p_autograd_" + output_name;
// Skip Intermediate Tensor
if (output.duplicable()) {
const char* GET_MULTI_AUTOGRAD_META_TEMPLATE =
" std::vector<egr::AutogradMeta*> %s = "
"egr::EagerUtils::autograd_meta(&%s);\n";
get_autograd_meta_str += paddle::string::Sprintf(
GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
} else {
const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
" egr::AutogradMeta* %s = "
"egr::EagerUtils::autograd_meta(&%s);\n";
get_autograd_meta_str += paddle::string::Sprintf(
GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
}
}
VLOG(6) << "Generated outputs autograd_meta";
std::string prepare_autograd_meta_str = "";
prepare_autograd_meta_str += get_autograd_meta_str;
prepare_autograd_meta_str += "\n";
......@@ -1204,11 +1202,12 @@ static std::string GenerateGradNodeCreationContent(
" %s"
" bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(%s);\n"
" if(require_any_grad) {\n"
" VLOG(6) << \" Construct Grad for %s \"; \n"
" egr::EagerUtils::PassStopGradient(%s);\n"
"%s\n }";
std::string grad_node_creation_body_str = paddle::string::Sprintf(
GRAD_NODE_CREATION_TEMPLATE, prepare_autograd_meta_str,
compute_require_grad_args, pass_stop_gradient_args,
compute_require_grad_args, op_type, pass_stop_gradient_args,
grad_node_creation_str);
return grad_node_creation_body_str;
......@@ -2083,22 +2082,24 @@ static std::string GenerateGradNodeHeaderContents(
const char* GRAD_NODE_TEMPLATE =
"class GradNode%s : public egr::GradNodeBase {\n"
" public:\n"
" GradNode%s() : egr::GradNodeBase() {}\n"
" GradNode%s() : egr::GradNodeBase() { VLOG(7) << \" Construct "
"GradNode%s \"; }\n"
" GradNode%s(size_t bwd_in_slot_num, size_t bwd_out_slot_num) : "
"egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {}\n"
" ~GradNode%s() override = default;\n"
"egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) { VLOG(7) << \" "
"Construct GradNode%s \"; }\n"
" ~GradNode%s() override { VLOG(6) << \" Destruct GradNode%s \"; }\n"
"\n"
" virtual std::vector<std::vector<paddle::experimental::Tensor>> "
"operator()(const "
"std::vector<std::vector<paddle::experimental::Tensor>>& grads) "
"override;\n"
"\n"
" std::string name() override { return \" GradNode%s \"; } \n "
"\n"
" // SetX, SetY, ...\n"
"%s\n"
" // SetAttrMap\n"
"%s\n"
" std::string name() { return \"GradNode%s\"; }\n"
"\n"
" private:\n"
" // TensorWrappers\n"
"%s\n"
......@@ -2195,8 +2196,8 @@ static std::string GenerateGradNodeHeaderContents(
VLOG(6) << "Generated TensorWrapper";
std::string grad_node_str = paddle::string::Sprintf(
GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type,
set_tensor_wrappers_str, set_attr_map_str, op_type,
GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, op_type, op_type,
op_type, op_type, set_tensor_wrappers_str, set_attr_map_str,
tensor_wrapper_members_str, attr_members_str);
return grad_node_str;
......
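The codegen change above adds construction/destruction VLOGs to the generated GradNode classes, which introduces extra %s placeholders into GRAD_NODE_TEMPLATE; the Sprintf call is updated in lockstep so the argument list matches the placeholder count. A standalone illustration of that fill-the-template style (plain snprintf here, not paddle::string::Sprintf):

#include <cstdio>

// Fill a class-skeleton template where every %s is the op type. The number of
// arguments passed must match the number of placeholders in the template.
int main() {
  const char* kTemplate =
      "class GradNode%s : public GradNodeBase {\n"
      " public:\n"
      "  GradNode%s() { /* VLOG: Construct GradNode%s */ }\n"
      "  ~GradNode%s() { /* VLOG: Destruct GradNode%s */ }\n"
      "};\n";
  const char* op_type = "matmul";

  char buf[512];
  std::snprintf(buf, sizeof(buf), kTemplate,
                op_type, op_type, op_type, op_type, op_type);  // 5 placeholders, 5 args
  std::printf("%s", buf);
  return 0;
}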
......@@ -213,8 +213,12 @@ def ParseYamlReturns(string):
returns = [x.strip() for x in string.strip().split(",")]
for i in range(len(returns)):
ret = returns[i]
returns_list.append(["", ret, i])
ret_type = returns[i]
assert ret_type in yaml_types_mapping.keys()
ret_type = yaml_types_mapping[ret_type]
returns_list.append(["", ret_type, i])
return returns_list
......@@ -534,7 +538,7 @@ class {} : public egr::GradNodeBase {{
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads) override;
std::string name() override {{ return \" {} \"; }}
// SetTensorWrapperX, SetTensorWrapperY, ...
{}
// SetAttributes
......@@ -549,8 +553,9 @@ class {} : public egr::GradNodeBase {{
"""
node_declaration_str = NODE_DECLARATION_TEMPLATE.format(
grad_node_name, grad_node_name, grad_node_name, grad_node_name,
set_tensor_wrapper_methods_str, set_attribute_methods_str,
tensor_wrapper_members_str, attribute_members_str)
grad_node_name, set_tensor_wrapper_methods_str,
set_attribute_methods_str, tensor_wrapper_members_str,
attribute_members_str)
return node_declaration_str
......
......@@ -48,12 +48,16 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
}
visited.insert(node);
PADDLE_ENFORCE_NOT_NULL(
node,
paddle::platform::errors::Fatal(
"We got null node when we traverse the backward graph, and this "
"should not happened please check your code and contact us."));
// Find and append next nodes
const std::vector<std::vector<Edge>>& edges = node->GetEdges();
for (const auto& edge_list : edges) {
for (const Edge& edge : edge_list) {
GradNodeBase* next_node = edge.GetMutableGradNode().get();
// Next node could be nullptr if it is leaf tensor with no
// AccumulationNode attached
// Or it could also originate from dispensable inputs
......@@ -67,7 +71,6 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
}
}
}
return node_in_degree_map;
}
......
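getInDegreeMap above walks the backward graph from the starting nodes and counts, for every reachable grad node, how many edges point into it; that in-degree map later drives the topological execution order. The counting step on a plain adjacency-list graph (standalone sketch, not the eager engine's types):

#include <iostream>
#include <queue>
#include <unordered_map>
#include <unordered_set>
#include <vector>

int main() {
  // Tiny DAG: 0 -> {1, 2}, 1 -> {2}, 2 -> {}
  std::unordered_map<int, std::vector<int>> edges{{0, {1, 2}}, {1, {2}}, {2, {}}};

  std::unordered_map<int, int> in_degree;
  std::unordered_set<int> visited;
  std::queue<int> pending;
  pending.push(0);           // start from the "output" node(s)
  in_degree[0] = 0;          // roots have no counted predecessors

  while (!pending.empty()) {
    int node = pending.front();
    pending.pop();
    if (!visited.insert(node).second) continue;     // already processed
    for (int next : edges[node]) {
      ++in_degree[next];                            // one more edge into `next`
      if (!visited.count(next)) pending.push(next);
    }
  }

  for (const auto& kv : in_degree)
    std::cout << "node " << kv.first << " in-degree " << kv.second << std::endl;
  return 0;
}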
......@@ -30,6 +30,7 @@
namespace egr {
GradNodeBase::GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num) {
VLOG(6) << "Construct GradNodeBase";
bwd_in_meta_.resize(bwd_in_slot_num);
bwd_out_meta_.resize(bwd_out_slot_num);
// adj_edges has the same num as backward outputs
......@@ -49,11 +50,15 @@ void GradNodeBase::AddEdges(std::vector<AutogradMeta*>* metas, size_t slot_id) {
// its pre-ops
if (meta && !meta->StopGradient()) {
auto node = meta->GetMutableGradNode();
if (node) {
if (node && node.get()) {
VLOG(6) << "Add Edges for slot: " << slot_id
<< " which is: " << meta->GetMutableGradNode()->name();
adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
meta->OutRankInfo());
} else {
meta->SetGradNode(std::make_shared<egr::GradNodeAccumulation>(meta));
VLOG(6) << "Add Edges for slot: " << slot_id
<< " which is: " << meta->GetMutableGradNode()->name();
adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
meta->OutRankInfo());
}
......@@ -70,7 +75,7 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) {
"inputs's slot num."));
if (meta && !meta->StopGradient()) {
auto node = meta->GetMutableGradNode();
if (node) {
if (node && node.get()) {
VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from "
<< this->name() << " to " << meta->GetMutableGradNode()->name();
adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
......
......@@ -76,10 +76,10 @@ class GradSlotMeta {
class GradNodeBase {
public:
GradNodeBase() = default;
GradNodeBase() { VLOG(6) << "Construct GradNodeBase"; }
GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num);
// TODO(jiabin): Should we have other constructor here?
virtual ~GradNodeBase() = default;
virtual ~GradNodeBase() { VLOG(6) << "Destruct GradNodeBase"; }
/**
* operator() is designed to contain the real backward execution logic, it should
......
......@@ -30,6 +30,7 @@ class GradTestNode : public egr::GradNodeBase {
GradTestNode(float val, int in_num, int out_num)
: GradNodeBase(in_num, out_num), val_(val) {}
GradTestNode() : GradNodeBase() { val_ = 1.0; }
std::string name() override { return "GradTestNode"; }
std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads)
override {
......
......@@ -122,9 +122,10 @@ paddle::experimental::Tensor* EagerUtils::mutable_grad(
void EagerUtils::SetHistory(std::vector<AutogradMeta*>* autograd_metas,
const std::shared_ptr<GradNodeBase>& grad_node) {
for (const auto& autograd_meta : *autograd_metas) {
if (dynamic_cast<GradNodeAccumulation*>(autograd_meta->GradNode())) {
VLOG(6) << "Warning: Reseting GradNodeAccumulation for leaf tensor is "
"detected";
if (autograd_meta->GradNode()) {
VLOG(7) << "Should not set grad node twice, original node is:"
<< autograd_meta->GradNode()->name()
<< "current is: " << grad_node->name();
}
autograd_meta->SetGradNode(grad_node);
}
......@@ -132,11 +133,11 @@ void EagerUtils::SetHistory(std::vector<AutogradMeta*>* autograd_metas,
void EagerUtils::SetHistory(AutogradMeta* autograd_meta,
const std::shared_ptr<GradNodeBase>& grad_node) {
if (dynamic_cast<GradNodeAccumulation*>(autograd_meta->GradNode())) {
VLOG(6)
<< "Warning: Reseting GradNodeAccumulation for leaf tensor is detected";
if (autograd_meta->GradNode()) {
VLOG(7) << "Should not set grad node twice, original node is:"
<< autograd_meta->GradNode()->name()
<< "current is: " << grad_node->name();
}
autograd_meta->SetGradNode(grad_node);
}
......
......@@ -12,12 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <random>
#include <string>
#include <unordered_set>
#include <gtest/gtest.h>
#include <boost/logic/tribool.hpp>
#include <random>
#include <unordered_set>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
#include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
......@@ -25,7 +26,7 @@
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/place.h"
USE_OP(batch_norm);
USE_OP_ITSELF(batch_norm);
USE_OP_DEVICE_KERNEL(batch_norm, MKLDNN);
USE_OP(conv2d_transpose);
USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN);
......
......@@ -409,7 +409,7 @@ class ThreadPoolTempl {
return false;
}
platform::RecordEvent("SleepWaitForWork",
platform::TracerEventType::UserDefined, 2);
platform::TracerEventType::UserDefined, 10);
ec_.CommitWait(waiter);
blocked_--;
return true;
......
......@@ -2106,6 +2106,8 @@ void OperatorWithKernel::BuildPhiKernelContext(
for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
phi::TensorBase* tensor_out = nullptr;
auto* var = outs_vector[offset];
if (var) {
if (var->template IsType<framework::LoDTensor>()) {
tensor_out = var->template GetMutable<framework::LoDTensor>();
} else if (var->template IsType<phi::SelectedRows>()) {
......@@ -2115,6 +2117,8 @@ void OperatorWithKernel::BuildPhiKernelContext(
"Unsupported output `%s` type when call pt kernel.",
framework::ToTypeName(var->Type())));
}
}
pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
}
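The change above wraps the output-variable handling in an if (var) check so optional (null) outputs are no longer dereferenced, while a nullptr entry is still pushed so the kernel context keeps one slot per declared output. A standalone sketch of that keep-the-slot pattern (illustrative types):

#include <iostream>
#include <vector>

struct DemoVar { int payload; };  // stand-in for framework::Variable

int main() {
  // Three declared output slots; the middle one is an optional output left unset.
  DemoVar a{1}, c{3};
  std::vector<DemoVar*> outs_vector = {&a, nullptr, &c};

  std::vector<int*> kernel_outputs;
  for (DemoVar* var : outs_vector) {
    int* tensor_out = nullptr;
    if (var) {                              // only touch real variables
      tensor_out = &var->payload;
    }
    kernel_outputs.push_back(tensor_out);   // slot is kept even when null
  }
  std::cout << "output slots: " << kernel_outputs.size() << std::endl;  // 3
  return 0;
}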
......@@ -2215,8 +2219,6 @@ void OperatorWithKernel::BuildPhiKernelContext(
vector_int_attr.end());
pt_kernel_context->EmplaceBackAttr(vector_int64_attr);
}
// TODO(YuanRisheng) Need support vector<int64_t> attr
} else if (attr_defs[i].type_index ==
std::type_index(typeid(std::vector<int32_t>))) {
const auto& vector_int_attr = BOOST_GET_CONST(std::vector<int>, attr);
......
......@@ -314,6 +314,7 @@ void BuildDygraphPhiKernelContext(
phi::TensorBase* tensor_out = nullptr;
auto* var = outs_vector[offset]->MutableVar();
if (var) {
if (var->template IsType<phi::DenseTensor>()) {
tensor_out = var->template GetMutable<phi::DenseTensor>();
} else if (var->template IsType<phi::SelectedRows>()) {
......@@ -323,6 +324,8 @@ void BuildDygraphPhiKernelContext(
"Unsupported output `%s` type when call pt kernel.",
framework::ToTypeName(var->Type())));
}
}
kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out);
}
kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i);
......
......@@ -1289,15 +1289,3 @@ REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp,
ops::BatchNormDoubleGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(batch_norm_grad_grad, ops::BatchNormDoubleGradOp,
ops::BatchNormDoubleGradOpInplaceInferer);
REGISTER_OP_CPU_KERNEL(
batch_norm, ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>,
ops::BatchNormKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
batch_norm_grad,
ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
batch_norm_grad_grad,
ops::BatchNormDoubleGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::BatchNormDoubleGradKernel<paddle::platform::CPUDeviceContext, double>);
......@@ -41,1327 +41,5 @@ using CudnnDataType = platform::CudnnDataType<T>;
template <typename T>
using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
template <typename T, framework::DataLayout layout>
static __global__ void BNForwardInference(
const T *x, const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance, const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias, const int C, const int N, const int HxW,
const double epsilon, T *y) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
int num = N * C * HxW;
for (int i = gid; i < num; i += stride) {
const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
BatchNormParamType<T> x_sub_mean =
static_cast<BatchNormParamType<T>>(x[i]) - mean[c];
BatchNormParamType<T> inv_var = 1 / sqrt(variance[c] + epsilon);
y[i] = static_cast<T>(scale[c] * x_sub_mean * inv_var + bias[c]);
}
}
template <typename T, int BlockDim, framework::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining(
const T *x, const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias, const int C, const int N, const int HxW,
const double epsilon, double exponentialAverageFactor, T *y,
BatchNormParamType<T> *mean, BatchNormParamType<T> *variance,
BatchNormParamType<T> *save_mean,
BatchNormParamType<T> *save_inv_variance) {
int outer_size = C;
int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage mean_storage;
__shared__ typename BlockReduce::TempStorage variance_storage;
__shared__ BatchNormParamType<T> mean_val;
__shared__ BatchNormParamType<T> variance_val;
__shared__ BatchNormParamType<T> inv_var_val;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> x_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> x_square_sum = static_cast<BatchNormParamType<T>>(0);
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> x_i = static_cast<BatchNormParamType<T>>(x[index]);
x_sum += x_i;
x_square_sum += x_i * x_i;
}
x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum());
x_square_sum =
BlockReduce(variance_storage).Reduce(x_square_sum, cub::Sum());
if (threadIdx.x == 0) {
mean_val = x_sum / inner_size;
variance_val = x_square_sum / inner_size - mean_val * mean_val;
inv_var_val = 1 / sqrt(variance_val + epsilon);
if (save_mean && save_inv_variance) {
save_mean[i] = mean_val;
save_inv_variance[i] = inv_var_val;
}
mean[i] = (1 - exponentialAverageFactor) * mean_val +
exponentialAverageFactor * mean[i];
variance[i] = (1 - exponentialAverageFactor) * variance_val +
exponentialAverageFactor * variance[i];
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> x_sub_mean =
static_cast<BatchNormParamType<T>>(x[index]) - mean_val;
y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i];
}
}
}
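// ---------------------------------------------------------------------------
// Illustrative host-side sketch (not part of the operator): the training kernel
// above computes, per channel,
//   mean     = sum(x) / n
//   variance = sum(x * x) / n - mean * mean
//   inv_std  = 1 / sqrt(variance + epsilon)
// and folds the batch statistics into the running statistics as
//   running = (1 - exponentialAverageFactor) * batch_stat
//             + exponentialAverageFactor * running.
// The same arithmetic on plain host floats (assumes <cmath> and <vector>):
inline void DemoBatchNormStats(const std::vector<float> &x, double epsilon,
                               double factor, double *running_mean,
                               double *running_var) {
  double sum = 0.0, sq_sum = 0.0;
  for (float v : x) {
    sum += v;
    sq_sum += static_cast<double>(v) * v;
  }
  const double n = static_cast<double>(x.size());
  const double mean = sum / n;
  const double var = sq_sum / n - mean * mean;
  const double inv_std = 1.0 / std::sqrt(var + epsilon);
  (void)inv_std;  // would be used as y = scale * (x - mean) * inv_std + bias
  *running_mean = (1.0 - factor) * mean + factor * (*running_mean);
  *running_var = (1.0 - factor) * var + factor * (*running_var);
}
// ---------------------------------------------------------------------------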
template <typename T>
class BatchNormKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::InvalidArgument("It must use CUDAPlace."));
double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test");
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const bool trainable_stats = ctx.Attr<bool>("trainable_statistics");
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
bool test_mode = is_test && (!trainable_stats);
// Get the size for each dimension.
// NCHW [batch_size, in_channels, in_height, in_width]
const auto *x = ctx.Input<Tensor>("X");
const auto &x_dims = x->dims();
PADDLE_ENFORCE_EQ(
x_dims.size() >= 2 && x_dims.size() <= 5, true,
platform::errors::InvalidArgument(
"The size of input's dimensions should be between 2 and 5"
"But received: the size of input's dimensions is [%d]",
x_dims.size()));
auto *y = ctx.Output<Tensor>("Y");
y->mutable_data<T>(ctx.GetPlace());
int N, C, H, W, D;
ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
auto dtype = platform::CudnnDataType<T>::type;
#ifdef PADDLE_WITH_HIP
auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC
: DataLayout::kNCHW;
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// HIP do not support compute format of NHWC
// auto compute_format = DataLayout::kNCHW;
#else
const bool fast_nhwc_batch_norm =
test_mode ||
(dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent);
auto compute_format =
fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC
? DataLayout::kNHWC
: DataLayout::kNCHW;
#endif
Tensor transformed_x(x->type());
Tensor transformed_y(y->type());
if (data_layout == DataLayout::kNHWC &&
compute_format == DataLayout::kNCHW && x_dims.size() > 2) {
VLOG(3) << "Transform input tensor from NHWC to NCHW.";
ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, x,
&transformed_x);
TransToChannelFirst<platform::CUDADeviceContext, T>(ctx, x,
&transformed_x);
ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, y,
&transformed_y);
} else {
transformed_x.ShareDataWith(*x);
transformed_y.ShareDataWith(*y);
}
// ------------------- cudnn descriptors ---------------------
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// miopenTensorDescriptor_t data_desc_;
// miopenTensorDescriptor_t bn_param_desc_;
// miopenBatchNormMode_t mode_;
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
#else
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t bn_param_desc_;
cudnnBatchNormMode_t mode_;
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
#endif
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// mode_ = miopenBNSpatial;
#elif CUDNN_VERSION_MIN(7, 0, 1)
if (FLAGS_cudnn_batchnorm_spatial_persistent) {
mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
} else if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#else
if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#endif // CUDNN_VERSION_MIN(7, 0, 1)
VLOG(3) << "Setting descriptors.";
std::vector<int> dims;
std::vector<int> strides;
if (compute_format == DataLayout::kNCHW) {
dims = {N, C, H, W, D};
strides = {C * H * W * D, H * W * D, W * D, D, 1};
} else {
dims = {N, C, H, W, D};
strides = {H * W * D * C, 1, W * D * C, D * C, C};
}
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
// data_desc_, CudnnDataType<T>::type,
// x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
// const_cast<int *>(strides.data())));
// Note: PERSISTENT not implemented for inference
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDeriveBNTensorDescriptor(
// bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
// Note: PERSISTENT not implemented for inference
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor(
bn_param_desc_, data_desc_,
test_mode ? CUDNN_BATCHNORM_SPATIAL : mode_));
#endif
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *bias = ctx.Input<Tensor>("Bias");
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
// Now, depending on whether we are running test or not, we have two paths.
// It is training mode when it's not reference AND not using pre-trained
// model.
bool training = !test_mode && !use_global_stats;
if (!training) {
// only when test we use input to do computation.
const auto *est_mean = ctx.Input<Tensor>("Mean");
const auto *est_var = ctx.Input<Tensor>("Variance");
// Run inference mode.
PADDLE_ENFORCE_EQ(
est_mean->dims().size(), 1UL,
platform::errors::InvalidArgument(
"The size of mean's dimensions must equal to 1."
"But received: the size of mean's dimensions mean is [%d],"
"the dimensions of mean is [%s].",
est_mean->dims().size(), est_mean->dims()));
PADDLE_ENFORCE_EQ(
est_var->dims().size(), 1UL,
platform::errors::InvalidArgument(
"The size of variance's dimensions must equal to 1."
"But received: the size of variance's dimensions is [%d],"
"the dimensions of variance is [%s].",
est_var->dims().size(), est_var->dims()));
PADDLE_ENFORCE_EQ(
est_mean->dims()[0], C,
platform::errors::InvalidArgument(
"The first dimension of mean must equal to the number of "
"Channels, which is [%d]. But received: the first dimension"
"of mean is [%d], the dimensions of mean is [%s].",
C, est_mean->dims()[0], est_mean->dims()));
PADDLE_ENFORCE_EQ(
est_var->dims()[0], C,
platform::errors::InvalidArgument(
"The first dimension of variance must equal to the number"
"of Channels, which is [%d]. But received: the first dimension of"
"variance is [%d], the dimensions of variance is [%s].",
C, est_var->dims()[0], est_var->dims()));
#ifdef PADDLE_WITH_HIP
const int block_size = 256;
const int grid_size = (N * C * H * W * D + block_size - 1) / block_size;
if (compute_format == DataLayout::kNCHW) {
BNForwardInference<
T,
DataLayout::kNCHW><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
transformed_x.template data<T>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(),
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(), C, N, H * W * D,
epsilon, transformed_y.template data<T>());
} else {
BNForwardInference<
T,
DataLayout::kNHWC><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
transformed_x.template data<T>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(),
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(), C, N, H * W * D,
epsilon, transformed_y.template data<T>());
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationForwardInference(
// handle, miopenBNSpatial,
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kOne())),
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kZero())),
// data_desc_,
// static_cast<const void *>(transformed_x.template data<T>()),
// data_desc_,
// static_cast<void *>(
// transformed_y.template mutable_data<T>(ctx.GetPlace())),
// bn_param_desc_,
// const_cast<void *>(static_cast<const void *>(
// scale->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// bias->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// est_mean->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// est_var->template data<BatchNormParamType<T>>())),
// epsilon));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnBatchNormalizationForwardInference(
handle,
// Note: PERSISTENT not implemented for inference
CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), data_desc_,
transformed_x.template data<T>(), data_desc_,
transformed_y.template mutable_data<T>(ctx.GetPlace()),
bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(), epsilon));
#endif
} else {
// if MomentumTensor is set, use MomentumTensor value, momentum
// is only used in this training branch
if (ctx.HasInput("MomentumTensor")) {
const auto *mom_tensor = ctx.Input<Tensor>("MomentumTensor");
Tensor mom_cpu;
paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(),
&mom_cpu);
momentum = mom_cpu.data<float>()[0];
}
// Run training mode.
// obtain running mean and running inv var, and there is no need
// to initialize them.
auto *mean_out = ctx.Output<Tensor>("MeanOut");
auto *variance_out = ctx.Output<Tensor>("VarianceOut");
mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
auto *saved_mean = ctx.Output<Tensor>("SavedMean");
auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
if ((N * H * W * D) == 1) {
// Only 1 element in normalization dimension,
// skip the batch norm calculation, let y = x.
framework::TensorCopy(*x, ctx.GetPlace(), y);
} else {
double this_factor = 1. - momentum;
bool called = false;
#if CUDNN_VERSION_MIN(7, 4, 1)
called = true;
size_t workspace_size = 0;
size_t reserve_space_size = 0;
void *reserve_space_ptr = nullptr;
void *workspace_ptr = nullptr;
Tensor workspace_tensor;
// Create reserve space and workspace for batch norm.
// Create tensor for each batchnorm op, it will be used in the
// backward. Thus this tensor shouldn't be temp.
auto *reserve_space = ctx.Output<Tensor>("ReserveSpace");
PADDLE_ENFORCE_NOT_NULL(
reserve_space,
platform::errors::NotFound(
"The argument ReserveSpace of batch_norm op is not found."));
// --------------- cudnn batchnorm workspace ---------------
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::
cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
/*handle=*/handle,
/*mode=*/mode_,
/*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
/*xDesc=*/data_desc_,
/*zDesc=*/nullptr,
/*yDesc=*/data_desc_,
/*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
/*activationDesc=*/nullptr,
/*sizeInBytes=*/&workspace_size));
// -------------- cudnn batchnorm reserve space --------------
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::
cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
/*handle=*/handle,
/*mode=*/mode_,
/*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
/*activationDesc=*/nullptr,
/*xDesc=*/data_desc_,
/*sizeInBytes=*/&reserve_space_size));
reserve_space_ptr = reserve_space->mutable_data(
ctx.GetPlace(), transformed_x.type(), reserve_space_size);
workspace_ptr = workspace_tensor.mutable_data(
ctx.GetPlace(), transformed_x.type(), workspace_size);
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnBatchNormalizationForwardTrainingEx(
handle, mode_, CUDNN_BATCHNORM_OPS_BN, CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), data_desc_,
transformed_x.template data<T>(), nullptr, nullptr, data_desc_,
transformed_y.template data<T>(), bn_param_desc_,
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(), this_factor,
mean_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
variance_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon,
saved_mean->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
saved_variance->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
nullptr, workspace_ptr, workspace_size, reserve_space_ptr,
reserve_space_size));
#endif // CUDNN_VERSION_MIN(7, 4, 1)
if (!called) {
#ifdef PADDLE_WITH_HIP
const int num = transformed_x.numel();
const int block = 256;
const int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
const int grid = std::min(C, max_blocks);
if (compute_format == DataLayout::kNCHW) {
BNForwardTraining<
T, block,
DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>(
transformed_x.template data<T>(),
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(), C, N, H * W * D,
epsilon, this_factor, transformed_y.template data<T>(),
mean_out->template data<BatchNormParamType<T>>(),
variance_out->template data<BatchNormParamType<T>>(),
saved_mean->template data<BatchNormParamType<T>>(),
saved_variance->template data<BatchNormParamType<T>>());
} else {
BNForwardTraining<
T, block,
DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>(
transformed_x.template data<T>(),
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(), C, N, H * W * D,
epsilon, this_factor, transformed_y.template data<T>(),
mean_out->template data<BatchNormParamType<T>>(),
variance_out->template data<BatchNormParamType<T>>(),
saved_mean->template data<BatchNormParamType<T>>(),
saved_variance->template data<BatchNormParamType<T>>());
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationForwardTraining(
// handle, mode_, const_cast<void *>(static_cast<const void *>(
// CudnnDataType<T>::kOne())),
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kZero())),
// data_desc_,
// static_cast<const void *>(transformed_x.template data<T>()),
// data_desc_,
// static_cast<void *>(
// transformed_y.template mutable_data<T>(ctx.GetPlace())),
// bn_param_desc_,
// const_cast<void *>(static_cast<const void *>(
// scale->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// bias->template data<BatchNormParamType<T>>())),
// this_factor,
// static_cast<void *>(
// mean_out->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace())),
// static_cast<void *>(variance_out->template mutable_data<
// BatchNormParamType<T>>(ctx.GetPlace())),
// epsilon,
// static_cast<void *>(
// saved_mean->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace())),
// static_cast<void *>(saved_variance->template mutable_data<
// BatchNormParamType<T>>(ctx.GetPlace()))));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnBatchNormalizationForwardTraining(
handle, mode_, CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), data_desc_,
transformed_x.template data<T>(), data_desc_,
transformed_y.template mutable_data<T>(ctx.GetPlace()),
bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(), this_factor,
mean_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
variance_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon,
saved_mean->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
saved_variance->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace())));
#endif
}
}
}
if (data_layout == DataLayout::kNHWC &&
compute_format == DataLayout::kNCHW && x_dims.size() > 2) {
VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
ctx, &transformed_y, y);
}
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// clean when exit.
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
#else
// clean when exit.
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
#endif
}
};
template <typename T, int BlockDim, framework::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void KeBNBackwardScaleBias(
const T *dy, const T *x, const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance, const double epsilon, const int N,
const int C, const int HxW, BatchNormParamType<T> *dscale,
BatchNormParamType<T> *dbias) {
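  // Per-channel reductions computed by the loop below:
  //   dbias[c]  = sum_{n,hw} dy
  //   dscale[c] = sum_{n,hw} dy * (x - mean[c]) * inv_std[c],
  // where inv_std[c] = 1 / sqrt(variance[c] + epsilon).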
const int outer_size = C;
const int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage ds_storage;
__shared__ typename BlockReduce::TempStorage db_storage;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> ds_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> db_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> inv_var_i = 1.0 / sqrt(variance[i] + epsilon);
BatchNormParamType<T> mean_i = mean[i];
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
ds_sum += static_cast<BatchNormParamType<T>>(dy[index]) *
(static_cast<BatchNormParamType<T>>(x[index]) - mean_i);
db_sum += static_cast<BatchNormParamType<T>>(dy[index]);
}
ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
if (threadIdx.x == 0) {
dscale[i] = ds_sum * inv_var_i;
dbias[i] = db_sum;
}
__syncthreads();
}
}
template <typename T, framework::DataLayout layout>
static __global__ void KeBNBackwardData(const T *dy,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *variance,
const double epsilon, const int C,
const int HxW, const int num, T *dx) {
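  // Data gradient when the statistics are treated as constants (e.g. with
  // use_global_stats): dx = dy * scale[c] / sqrt(variance[c] + epsilon).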
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = gid; i < num; i += stride) {
const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
BatchNormParamType<T> inv_var = 1.0 / sqrt(variance[c] + epsilon);
dx[i] = static_cast<T>(static_cast<BatchNormParamType<T>>(dy[i]) *
scale[c] * inv_var);
}
}
template <typename T>
static __global__ void KeBNRestoreData(const framework::DataLayout layout, T *x,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance,
double epsilon, int C, int M,
const int num, const T *y) {
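  // Restores x from y for the inplace path by inverting the affine transform
  //   y = scale[c] * (x - mean[c]) * variance[c] + bias[c],
  // where the `variance` argument is expected to hold 1 / sqrt(var + eps).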
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = gid; i < num; i += stride) {
const int c = layout == framework::DataLayout::kNCHW ? (i / M) % C : i % C;
auto y_i = static_cast<BatchNormParamType<T>>(y[i]);
auto x_i = (y_i - bias[c]) / scale[c] / variance[c] + mean[c];
x[i] = static_cast<T>(x_i);
}
}
template <typename T>
class InplaceHelper {
public:
void operator()(const framework::DataLayout layout, T *x,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance, double epsilon, int C,
int M, const int num, const T *y, int grid2, const int block,
const gpuStream_t &stream) {
PADDLE_ENFORCE_EQ(x, y, platform::errors::InvalidArgument(
"X and Y should be inplaced in inplace mode"));
KeBNRestoreData<<<grid2, block, 0, stream>>>(
layout, x, scale, bias, mean, variance, epsilon, C, M, num, y);
}
};
template <typename T, int BlockDim, framework::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackward(
const T *dy, const T *x, const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *saved_mean,
const BatchNormParamType<T> *saved_inv_variance, const int C, const int N,
const int HxW, const double epsilon, T *dx, BatchNormParamType<T> *dscale,
BatchNormParamType<T> *dbias) {
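  // Full batch norm backward for one channel i (with m = N * HxW):
  //   dscale = sum(dy * (x - mean)) * inv_std
  //   dbias  = sum(dy)
  //   dx     = scale * inv_std *
  //            (dy - dbias / m - (x - mean) * inv_std * dscale / m).
  // If saved statistics are not provided, mean and inv_std are first
  // recomputed from x.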
const int outer_size = C;
const int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage ds_storage;
__shared__ typename BlockReduce::TempStorage db_storage;
__shared__ typename BlockReduce::TempStorage mean_storage;
  __shared__ typename BlockReduce::TempStorage variance_storage;
__shared__ BatchNormParamType<T> inv_var_val;
__shared__ BatchNormParamType<T> mean_val;
__shared__ BatchNormParamType<T> dscale_val;
__shared__ BatchNormParamType<T> dbias_val;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> ds_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> db_sum = static_cast<BatchNormParamType<T>>(0);
if (saved_mean && saved_inv_variance) {
if (threadIdx.x == 0) {
inv_var_val = saved_inv_variance[i];
mean_val = saved_mean[i];
}
} else {
BatchNormParamType<T> x_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> x_square_sum =
static_cast<BatchNormParamType<T>>(0);
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> x_i =
static_cast<BatchNormParamType<T>>(x[index]);
x_sum += x_i;
x_square_sum += x_i * x_i;
}
x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum());
x_square_sum =
          BlockReduce(variance_storage).Reduce(x_square_sum, cub::Sum());
if (threadIdx.x == 0) {
mean_val = x_sum / inner_size;
inv_var_val =
1 / sqrt(x_square_sum / inner_size - mean_val * mean_val + epsilon);
}
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> dy_i =
static_cast<BatchNormParamType<T>>(dy[index]);
ds_sum +=
dy_i * (static_cast<BatchNormParamType<T>>(x[index]) - mean_val);
db_sum += dy_i;
}
ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
if (threadIdx.x == 0) {
dscale_val = ds_sum * inv_var_val;
dbias_val = db_sum;
dscale[i] = dscale_val;
dbias[i] = dbias_val;
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
dx[index] = scale[i] * inv_var_val *
(static_cast<BatchNormParamType<T>>(dy[index]) -
dbias_val / static_cast<BatchNormParamType<T>>(inner_size) -
(static_cast<BatchNormParamType<T>>(x[index]) - mean_val) *
inv_var_val * dscale_val / inner_size);
}
}
}
template <typename T, int BlockDim, framework::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackwardData(
const T *dy, const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *mean, const T *x,
const BatchNormParamType<T> *variance, const int C, const int N,
const int HxW, T *dx) {
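  // Same dx formula as BNBackward, but only the data gradient is produced and
  // the `variance` argument already holds the saved inverse standard deviation.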
const int outer_size = C;
const int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage dy_storage;
__shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage;
__shared__ BatchNormParamType<T> dy_sum_val;
__shared__ BatchNormParamType<T> dy_x_sub_mean_sum_val;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> inv_var_i = variance[i];
BatchNormParamType<T> mean_i = mean[i];
BatchNormParamType<T> dy_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> dy_x_sub_mean_sum =
static_cast<BatchNormParamType<T>>(0);
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> dy_i =
static_cast<BatchNormParamType<T>>(dy[index]);
dy_sum += dy_i;
dy_x_sub_mean_sum +=
dy_i * (static_cast<BatchNormParamType<T>>(x[index]) - mean_i);
}
dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
dy_x_sub_mean_sum = BlockReduce(dy_x_sub_mean_storage)
.Reduce(dy_x_sub_mean_sum, cub::Sum());
if (threadIdx.x == 0) {
dy_sum_val = dy_sum;
dy_x_sub_mean_sum_val = dy_x_sub_mean_sum;
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
dx[index] =
(static_cast<BatchNormParamType<T>>(dy[index]) -
dy_sum_val / static_cast<BatchNormParamType<T>>(inner_size) -
(static_cast<BatchNormParamType<T>>(x[index]) - mean_i) *
dy_x_sub_mean_sum_val * inv_var_i * inv_var_i / inner_size) *
scale[i] * inv_var_i;
}
}
}
template <typename T>
class BatchNormGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::InvalidArgument("It must use CUDAPlace."));
double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *bias = ctx.Input<Tensor>("Bias");
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
    // When inplace is false, batch_norm takes X as the gradient input, which
    // matches the cuDNN batch_norm backward calculation. When inplace is
    // true, only Y is available, so X has to be reconstructed by inverting
    // the batch_norm transformation on Y.
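    // That is, x = (y - bias) / scale * sqrt(var + eps) + mean
    // (see KeBNRestoreData above).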
const Tensor *x;
bool is_inplace;
if (ctx.HasInput("Y")) {
x = ctx.Input<Tensor>("Y");
is_inplace = true;
if (d_x) {
PADDLE_ENFORCE_EQ(d_x, d_y,
platform::errors::InvalidArgument(
"X@GRAD and Y@GRAD not inplace in inplace mode"));
}
} else {
x = ctx.Input<Tensor>("X");
is_inplace = false;
if (d_x) {
PADDLE_ENFORCE_NE(
d_x, d_y, platform::errors::InvalidArgument(
"X@GRAD and Y@GRAD inplaced in non-inplace mode"));
}
}
const bool is_test = ctx.Attr<bool>("is_test");
use_global_stats = is_test || use_global_stats;
const auto &x_dims = x->dims();
PADDLE_ENFORCE_EQ(
x_dims.size() >= 2 && x_dims.size() <= 5, true,
platform::errors::InvalidArgument(
"The size of input's dimensions should be between 2 and 5."
"But received: the size of input's dimensions is [%d],"
"the dimensions of input is [%s]",
x_dims.size(), x_dims));
int N, C, H, W, D;
ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
// init output
if (d_x) {
d_x->mutable_data<T>(ctx.GetPlace());
}
if (d_scale && d_bias) {
d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
}
PADDLE_ENFORCE_EQ(
scale->dims().size(), 1UL,
platform::errors::InvalidArgument(
"The size of scale's dimensions must equal to 1. But received: "
"the size of scale's dimensions is [%d], the dimensions of scale "
"is [%s].",
scale->dims().size(), scale->dims()));
PADDLE_ENFORCE_EQ(
scale->dims()[0], C,
platform::errors::InvalidArgument(
"The first dimension of scale must equal to Channels[%d]. But "
"received: the first dimension of scale is [%d]",
C, scale->dims()[0]));
auto dtype = platform::CudnnDataType<T>::type;
const auto *reserve_space = ctx.Input<Tensor>("ReserveSpace");
#ifdef PADDLE_WITH_HIP
auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC
: DataLayout::kNCHW;
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// HIP do not support compute format of NHWC
// auto compute_format = DataLayout::kNCHW;
#else
const bool fast_nhwc_batch_norm =
dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent &&
reserve_space != nullptr;
auto compute_format =
fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC
? DataLayout::kNHWC
: DataLayout::kNCHW;
#endif
Tensor transformed_x(x->type());
Tensor transformed_d_y(d_y->type());
Tensor transformed_d_x;
if (data_layout == DataLayout::kNHWC &&
compute_format == DataLayout::kNCHW && x_dims.size() > 2) {
VLOG(3) << "Transform input tensor from NHWC to NCHW.";
ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, x,
&transformed_x);
TransToChannelFirst<platform::CUDADeviceContext, T>(ctx, x,
&transformed_x);
ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, d_y,
&transformed_d_y);
TransToChannelFirst<platform::CUDADeviceContext, T>(ctx, d_y,
&transformed_d_y);
if (d_x) {
ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, d_x,
&transformed_d_x);
}
} else {
transformed_x.ShareDataWith(*x);
transformed_d_y.ShareDataWith(*d_y);
if (d_x) {
transformed_d_x.ShareDataWith(*d_x);
}
}
std::vector<int> dims;
std::vector<int> strides;
if (compute_format == DataLayout::kNCHW) {
dims = {N, C, H, W, D};
strides = {C * H * W * D, H * W * D, W * D, D, 1};
} else {
dims = {N, C, H, W, D};
strides = {H * W * C * D, 1, W * D * C, D * C, C};
}
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
const int num = transformed_x.numel();
#ifdef HIPCC
const int block = 256;
#else
const int block = 512;
#endif
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
int grid1 = (num + block - 1) / block;
int grid2 = std::min(C, max_blocks);
auto stream = dev_ctx.stream();
InplaceHelper<T> inplace_functor;
if (!use_global_stats) {
if ((N * H * W * D) == 1) {
if (d_x) {
framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
}
phi::funcs::SetConstant<platform::CUDADeviceContext,
BatchNormParamType<T>>
functor;
functor(dev_ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
functor(dev_ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
return;
}
// ------------------- cudnn descriptors ---------------------
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// miopenTensorDescriptor_t data_desc_;
// miopenTensorDescriptor_t bn_param_desc_;
// miopenBatchNormMode_t mode_;
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
#else
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t bn_param_desc_;
cudnnBatchNormMode_t mode_;
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
#endif
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// mode_ = miopenBNSpatial;
#elif CUDNN_VERSION_MIN(7, 0, 1)
if (FLAGS_cudnn_batchnorm_spatial_persistent) {
mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
} else if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#else
if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#endif // CUDNN_VERSION_MIN(7, 0, 1)
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
// data_desc_, CudnnDataType<T>::type,
// x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
// const_cast<int *>(strides.data())));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_,
// data_desc_, mode_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
data_desc_, mode_));
#endif
const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
const auto *saved_mean_data =
saved_mean->template data<BatchNormParamType<T>>();
const auto *saved_var_data =
saved_var->template data<BatchNormParamType<T>>();
if (is_inplace) {
inplace_functor(compute_format, transformed_x.data<T>(),
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(),
saved_mean_data, saved_var_data, epsilon, C, H * W * D,
num, transformed_x.data<T>(), grid2, block, stream);
}
// This branch calls CUDNN APIs
if (d_x && d_scale && d_bias) {
bool called = false;
#if CUDNN_VERSION_MIN(7, 4, 1)
called = true;
size_t workspace_size = 0;
void *workspace_ptr = nullptr;
Tensor workspace_tensor;
auto reserve_space_size = reserve_space->memory_size();
// --------------- cudnn batchnorm workspace ---------------
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::
cudnnGetBatchNormalizationBackwardExWorkspaceSize(
/*handle=*/dev_ctx.cudnn_handle(),
/*mode=*/mode_,
                    /*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
/*xDesc=*/data_desc_,
/*yDesc=*/data_desc_,
/*dyDesc=*/data_desc_,
/*dzDesc=*/nullptr,
/*dxDesc=*/data_desc_,
/*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
/*activationDesc=*/nullptr,
/*sizeInBytes=*/&workspace_size));
workspace_ptr = workspace_tensor.mutable_data(
ctx.GetPlace(), transformed_x.type(), workspace_size);
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnBatchNormalizationBackwardEx(
/*handle=*/dev_ctx.cudnn_handle(),
/*mode=*/mode_,
/*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
/*alphaDataDiff=*/CudnnDataType<T>::kOne(),
/*betaDataDiff=*/CudnnDataType<T>::kZero(),
/*alphaParamDiff=*/CudnnDataType<T>::kOne(),
/*betaParamDiff=*/CudnnDataType<T>::kZero(),
/*xDesc=*/data_desc_,
/*xData=*/transformed_x.template data<T>(),
/*yDesc=*/nullptr,
/*yData=*/nullptr,
/*dyDesc=*/data_desc_,
/*dyData=*/transformed_d_y.template data<T>(),
/*dzDesc=*/nullptr,
/*dzData=*/nullptr,
/*dxDesc=*/data_desc_,
/*dxData=*/transformed_d_x.template mutable_data<T>(
ctx.GetPlace()),
/*dBnScaleBiasDesc=*/bn_param_desc_,
/*bnScaleData=*/scale->template data<BatchNormParamType<T>>(),
/*bnBiasData=*/nullptr,
/*dBnScaleData=*/d_scale
->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
/*dBnBiasData=*/d_bias
->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
/*epsilon=*/epsilon,
/*savedMean=*/saved_mean_data,
/*savedInvVariance=*/saved_var_data,
/*activationDesc=*/nullptr,
/*workspace=*/workspace_ptr,
/*workSpaceSizeInBytes=*/workspace_size,
/*reserveSpace=*/const_cast<T *>(
reserve_space->template data<T>()),
/*reserveSpaceSizeInBytes=*/reserve_space_size));
#endif // CUDNN_VERSION_MIN(7, 4, 1)
if (!called) {
#ifdef PADDLE_WITH_HIP
if (compute_format == DataLayout::kNCHW) {
BNBackward<
T, block,
DataLayout::kNCHW><<<grid2, block, 0, dev_ctx.stream()>>>(
transformed_d_y.template data<T>(),
transformed_x.template data<T>(),
scale->template data<BatchNormParamType<T>>(), saved_mean_data,
saved_var_data, C, N, H * W * D, epsilon,
transformed_d_x.template data<T>(),
d_scale->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()));
} else {
BNBackward<
T, block,
DataLayout::kNHWC><<<grid2, block, 0, dev_ctx.stream()>>>(
transformed_d_y.template data<T>(),
transformed_x.template data<T>(),
scale->template data<BatchNormParamType<T>>(), saved_mean_data,
saved_var_data, C, N, H * W * D, epsilon,
transformed_d_x.template data<T>(),
d_scale->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()));
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationBackward(
// dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
// CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
// CudnnDataType<T>::kZero(), data_desc_,
// transformed_x.template data<T>(), data_desc_,
// transformed_d_y.template data<T>(), data_desc_,
// transformed_d_x.template mutable_data<T>(ctx.GetPlace()),
// bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
// d_scale->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace()),
// d_bias->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace()),
// epsilon, saved_mean_data, saved_var_data));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnBatchNormalizationBackward(
dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), data_desc_,
transformed_x.template data<T>(), data_desc_,
transformed_d_y.template data<T>(), data_desc_,
transformed_d_x.template mutable_data<T>(ctx.GetPlace()),
bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
d_scale->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon, saved_mean_data, saved_var_data));
#endif
}
if (data_layout == DataLayout::kNHWC &&
compute_format == DataLayout::kNCHW) {
VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
ctx, &transformed_d_x, d_x);
}
} else {
      // This branch calls CUDA kernels
if (compute_format == DataLayout::kNCHW) {
if (d_x) {
BNBackwardData<T, block, framework::DataLayout::kNCHW><<<
grid2, block, 0, dev_ctx.stream()>>>(
d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
saved_mean_data, x->data<T>(), saved_var_data, C, N, H * W * D,
d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T, block,
framework::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
d_y->data<T>(), x->data<T>(), saved_mean_data, saved_var_data,
epsilon, N, C, H * W * D,
d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
} else {
if (d_x) {
BNBackwardData<T, block, framework::DataLayout::kNHWC><<<
grid2, block, 0, dev_ctx.stream()>>>(
d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
saved_mean_data, x->data<T>(), saved_var_data, C, N, H * W * D,
d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T, block,
framework::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
d_y->data<T>(), x->data<T>(), saved_mean_data, saved_var_data,
epsilon, N, C, H * W * D,
d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
}
}
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// clean when exit.
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
#else
// clean when exit.
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
#endif
} else {
const auto *running_mean = ctx.Input<Tensor>("Mean");
const auto *running_var = ctx.Input<Tensor>("Variance");
const auto *running_mean_data =
running_mean->template data<BatchNormParamType<T>>();
const auto *running_var_data =
running_var->template data<BatchNormParamType<T>>();
if (is_inplace) {
auto px = *x;
inplace_functor(data_layout, px.mutable_data<T>(ctx.GetPlace()),
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(),
running_mean_data, running_var_data, epsilon, C,
H * W * D, num, x->data<T>(), grid2, block, stream);
}
if (compute_format == DataLayout::kNCHW) {
if (d_x) {
KeBNBackwardData<
T, framework::DataLayout::kNCHW><<<grid1, block, 0, stream>>>(
d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
running_var_data, epsilon, C, H * W, num, d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T, block,
framework::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data,
epsilon, N, C, H * W * D, d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
} else {
if (d_x) {
KeBNBackwardData<
T, framework::DataLayout::kNHWC><<<grid1, block, 0, stream>>>(
d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
running_var_data, epsilon, C, H * W, num, d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T, block,
framework::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data,
epsilon, N, C, H * W * D, d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
}
}
}
};
template <typename T>
class BatchNormDoubleGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *X = ctx.Input<Tensor>("X");
const auto *Scale = ctx.Input<Tensor>("Scale");
const auto *dY = ctx.Input<Tensor>("DY");
const auto *Saved_mean = ctx.Input<Tensor>("SavedMean");
const auto *Saved_variance = ctx.Input<Tensor>("SavedVariance");
const double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const bool is_test = ctx.Attr<bool>("is_test");
PADDLE_ENFORCE_EQ(
is_test, false,
platform::errors::InvalidArgument(
"`is_test = True` CANNOT be used in train program. If "
"you want to use global status in pre_train model, "
"please set `use_global_stats = True`"));
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const auto *ddX = ctx.Input<Tensor>("DDX");
const auto *ddScale = ctx.Input<Tensor>("DDScale");
const auto *ddBias = ctx.Input<Tensor>("DDBias");
auto *dX = ctx.Output<Tensor>("DX");
auto *dScale = ctx.Output<Tensor>("DScale");
auto *ddY = ctx.Output<Tensor>("DDY");
NormDoubleGradFunctor<platform::CUDADeviceContext, T>(
ctx, data_layout, X, Scale, dY, Saved_mean, Saved_variance, epsilon,
use_global_stats, ddX, ddScale, ddBias, dX, dScale, ddY);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
#ifdef PADDLE_WITH_HIP
// MIOPEN do not support double
REGISTER_OP_CUDA_KERNEL(
batch_norm, ops::BatchNormKernel<plat::CUDADeviceContext, float>,
ops::BatchNormKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>,
ops::BatchNormGradKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
batch_norm_grad_grad,
ops::BatchNormDoubleGradKernel<plat::CUDADeviceContext, float>);
#else
REGISTER_OP_CUDA_KERNEL(
batch_norm, ops::BatchNormKernel<plat::CUDADeviceContext, float>,
ops::BatchNormKernel<plat::CUDADeviceContext, double>,
ops::BatchNormKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>,
ops::BatchNormGradKernel<plat::CUDADeviceContext, double>,
ops::BatchNormGradKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
batch_norm_grad_grad,
ops::BatchNormDoubleGradKernel<plat::CUDADeviceContext, float>,
ops::BatchNormDoubleGradKernel<plat::CUDADeviceContext, double>);
#endif
......@@ -25,10 +25,10 @@ limitations under the License. */
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#endif
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/operators/math/padding.h"
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/kernels/funcs/padding.h"
DECLARE_bool(cudnn_deterministic);
DECLARE_uint64(conv_workspace_size_limit);
......@@ -148,7 +148,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim);
bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_input;
std::vector<int> padding_common(data_dim, 0);
......@@ -196,13 +196,13 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
T pad_value(0.0);
switch (rank) {
case 4: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_input_channel, pad_value,
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input);
} break;
case 5: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, transformed_input_channel, pad_value,
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input);
} break;
default:
......@@ -488,7 +488,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
    // cuDNN only supports padding by the same amount on both sides of each
    // spatial dimension, so we create an explicitly padded input tensor when
    // the requested padding is asymmetric.
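    // phi::funcs::IsSymmetricPadding checks whether every spatial dimension
    // is padded by the same amount on both sides; only the asymmetric case
    // needs the explicit PadFunction call below.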
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim);
bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_input(input->type());
Tensor transformed_input_grad(input->type());
std::vector<int> padding_common(data_dim, 0);
......@@ -544,13 +544,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
T pad_value(0.0);
switch (rank) {
case 4: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_input_channel, pad_value,
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input);
} break;
case 5: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, transformed_input_channel, pad_value,
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input);
} break;
default:
......@@ -956,7 +956,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim);
bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_X(X->type());
Tensor transformed_ddX(X->type());
......@@ -1004,20 +1004,22 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
T pad_value(0.0);
switch (rank) {
case 4: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_X_channel, pad_value, &transformed_X);
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, transformed_X_channel, pad_value,
&transformed_X);
if (ddX) {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_ddX_channel, pad_value,
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, transformed_ddX_channel, pad_value,
&transformed_ddX);
}
} break;
case 5: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, transformed_X_channel, pad_value, &transformed_X);
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, transformed_X_channel, pad_value,
&transformed_X);
if (ddX) {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, transformed_ddX_channel, pad_value,
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, transformed_ddX_channel, pad_value,
&transformed_ddX);
}
} break;
......
......@@ -21,8 +21,8 @@ limitations under the License. */
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#endif
#include "paddle/fluid/operators/conv_transpose_op.h"
#include "paddle/fluid/operators/math/padding.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/padding.h"
namespace paddle {
namespace operators {
......@@ -108,7 +108,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim);
bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
std::vector<int> input_pad(input_transpose.dims().size() * 2, 0);
Tensor transformed_input;
......@@ -139,12 +139,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
T pad_value(0.0);
switch (rank) {
case 4: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, input_transpose, pad_value, &transformed_input);
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, input_transpose, pad_value,
&transformed_input);
} break;
case 5: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, input_transpose, pad_value, &transformed_input);
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, input_transpose, pad_value,
&transformed_input);
} break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
......@@ -375,7 +377,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim);
bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
std::vector<int> input_pad(input_transpose.dims().size() * 2, 0);
Tensor transformed_output_grad;
......@@ -407,13 +409,13 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
T pad_value(0.0);
switch (rank) {
case 4: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, output_grad_transpose, pad_value,
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, output_grad_transpose, pad_value,
&transformed_output_grad);
} break;
case 5: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, output_grad_transpose, pad_value,
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, output_grad_transpose, pad_value,
&transformed_output_grad);
} break;
default:
......@@ -735,7 +737,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim);
bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_X(X->type());
Tensor transformed_ddX(X->type());
......@@ -794,26 +796,28 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
T pad_value(0.0);
switch (rank) {
case 4: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_X_channel, pad_value, &transformed_X);
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, transformed_X_channel, pad_value,
&transformed_X);
if (dO) {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_dO_channel, pad_value,
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, transformed_dO_channel, pad_value,
&transformed_dO);
}
if (ddX) {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_ddX_channel, pad_value,
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, transformed_ddX_channel, pad_value,
&transformed_ddX);
}
} break;
case 5: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, transformed_X_channel, pad_value, &transformed_X);
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, transformed_X_channel, pad_value,
&transformed_X);
if (ddX) {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, transformed_ddX_channel, pad_value,
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, transformed_ddX_channel, pad_value,
&transformed_ddX);
}
} break;
......
......@@ -184,15 +184,15 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
bool is_fix_seed, int seed_val, const Tensor& x,
const Tensor* seed, Tensor* mask, Tensor* y) {
auto& place = *dev_ctx.eigen_device();
if (!is_test) {
int64_t x_numel = x.numel();
auto stream = dev_ctx.stream();
auto* x_data = x.data<T>();
auto* y_data = y->data<T>();
if (!is_test) {
auto* mask_data = mask->data<uint8_t>();
size_t size = phi::product(mask->dims());
auto* x_data = x.data<T>();
auto* y_data = y->data<T>();
if (dropout_prob == 1.0f) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
......@@ -254,12 +254,24 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
}
#endif
} else {
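    // Inference path: with upscale_in_train the output is an identity copy of
    // the input (scaling already happened during training); otherwise the
    // activations are scaled by (1 - dropout_prob) to keep the expectation.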
auto X = EigenMatrix<T>::Reshape(x, 1);
auto Y = EigenMatrix<T>::Reshape(*y, 1);
if (upscale_in_train) {
Y.device(place) = X;
      // TODO: can y share data with x directly?
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
hipMemcpyAsync(y_data, x_data, sizeof(T) * x_numel,
hipMemcpyDeviceToDevice, stream));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemcpyAsync(y_data, x_data, sizeof(T) * x_numel,
cudaMemcpyDeviceToDevice, stream));
#endif
} else {
Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
T factor = static_cast<T>(1.0f - dropout_prob);
std::vector<const framework::Tensor*> ins = {&x};
std::vector<framework::Tensor*> outs = {y};
auto functor = phi::funcs::ScaleFunctor<T>(factor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
&outs, functor);
}
}
}
......
......@@ -17,8 +17,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/operators/math/padding.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/kernels/funcs/padding.h"
DECLARE_int64(cudnn_exhaustive_search_times);
......@@ -86,7 +86,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim);
bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_input;
std::vector<int> padding_common(data_dim, 0);
......@@ -118,13 +118,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
T pad_value(0.0);
switch (rank) {
case 4: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_input_channel, pad_value,
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input);
} break;
case 5: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, transformed_input_channel, pad_value,
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input);
} break;
default:
......
......@@ -32,7 +32,7 @@ namespace platform = paddle::platform;
namespace op = paddle::operators;
using Tensor = paddle::framework::Tensor;
USE_OP(batch_norm);
USE_OP_ITSELF(batch_norm);
USE_CUDA_ONLY_OP(fused_bn_add_activation);
USE_CUDA_ONLY_OP(fused_bn_add_activation_grad);
......
......@@ -45,6 +45,8 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
axis = static_cast<int>(cpu_axis.data<int32_t>()[0]);
} else if (axis_type == framework::proto::VarType::INT64) {
axis = static_cast<int>(cpu_axis.data<int64_t>()[0]);
} else if (axis_type == framework::proto::VarType::INT16) {
axis = static_cast<int>(cpu_axis.data<int16_t>()[0]);
}
}
const auto &place = ctx.GetPlace();
......@@ -57,6 +59,9 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
} else if (index_type == framework::proto::VarType::INT64) {
phi::funcs::GatherV2CUDAFunction<T, int64_t>(x, index, axis, output,
dev_ctx);
} else if (index_type == framework::proto::VarType::INT16) {
phi::funcs::GatherV2CUDAFunction<T, int16_t>(x, index, axis, output,
dev_ctx);
}
return;
}
......@@ -67,6 +72,8 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
phi::funcs::GPUGather<T, int>(dev_ctx, *x, *index, output);
} else if (index_type == framework::proto::VarType::INT64) {
phi::funcs::GPUGather<T, int64_t>(dev_ctx, *x, *index, output);
} else if (index_type == framework::proto::VarType::INT16) {
phi::funcs::GPUGather<T, int16_t>(dev_ctx, *x, *index, output);
}
}
};
......@@ -134,6 +141,7 @@ REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>,
ops::GatherOpCUDAKernel<double>,
ops::GatherOpCUDAKernel<int64_t>,
ops::GatherOpCUDAKernel<int>,
ops::GatherOpCUDAKernel<int16_t>,
ops::GatherOpCUDAKernel<plat::float16>,
ops::GatherOpCUDAKernel<plat::bfloat16>);
REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>,
......
......@@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle {
namespace operators {
......@@ -21,20 +24,6 @@ class GatherTreeOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Ids"), "Input", "Ids", "GatherTree");
OP_INOUT_CHECK(ctx->HasInput("Parents"), "Input", "Parents", "GatherTree");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GatherTree");
auto ids_dims = ctx->GetInputDim("Ids");
auto parents_dims = ctx->GetInputDim("Parents");
PADDLE_ENFORCE_EQ(ids_dims == parents_dims, true,
platform::errors::InvalidArgument(
"The shape of Input(Parents) must be same with the "
"shape of Input(Ids)."));
ctx->SetOutputDim("Out", ids_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
......@@ -72,4 +61,8 @@ selected ids.
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(gather_tree, ops::GatherTreeOp, ops::GatherTreeOpMaker);
DELCARE_INFER_SHAPE_FUNCTOR(gather_tree, GatherTreeInferShapeFunctor,
PT_INFER_META(phi::GatherTreeMeta));
REGISTER_OPERATOR(gather_tree, ops::GatherTreeOp, ops::GatherTreeOpMaker,
GatherTreeInferShapeFunctor);
......@@ -26,27 +26,6 @@ namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
class CPUGaussianRandomKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
float mean = context.Attr<float>("mean");
float std = context.Attr<float>("std");
auto* tensor = context.Output<framework::Tensor>("Out");
std::normal_distribution<T> dist(mean, std);
auto shape = GetShape(context);
tensor->Resize(shape);
int64_t size = tensor->numel();
T* data = tensor->mutable_data<T>(context.GetPlace());
unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
auto engine = framework::GetCPURandomEngine(seed);
for (int64_t i = 0; i < size; ++i) {
data[i] = dist(*engine);
}
}
}; // namespace operators
template <typename T>
class CPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
......@@ -194,8 +173,6 @@ Used to initialize tensors with gaussian random generator.
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp,
ops::GaussianRandomOpMaker);
REGISTER_OP_CPU_KERNEL(gaussian_random, ops::CPUGaussianRandomKernel<float>,
ops::CPUGaussianRandomKernel<double>);
REGISTER_OP_CPU_KERNEL(gaussian_random_batch_size_like,
ops::CPUGaussianRandomBatchSizeLikeKernel<float>,
ops::CPUGaussianRandomBatchSizeLikeKernel<double>);
......
......@@ -52,53 +52,6 @@ struct GaussianGenerator {
}
};
template <typename T>
class GPUGaussianRandomKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* tensor = context.Output<framework::Tensor>("Out");
unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
bool seed_flag = false;
if (seed == 0) {
std::random_device rd;
seed = rd();
seed_flag = true;
}
T mean = static_cast<T>(context.Attr<float>("mean"));
T std = static_cast<T>(context.Attr<float>("std"));
auto shape = GetShape(context);
tensor->Resize(shape);
auto& dev_cxt =
context.template device_context<platform::CUDADeviceContext>();
T* data = tensor->mutable_data<T>(dev_cxt.GetPlace());
int64_t size = tensor->numel();
int device_id = context.GetPlace().GetDeviceId();
auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
if (gen_cuda->GetIsInitPy() && seed_flag) {
if (FLAGS_use_curand) {
using MT = typename details::MPTypeTrait<T>::Type;
distribution::normal_distribution<MT> dist;
distribution::normal_transform<MT> trans(mean, std);
distribution::distribution_and_transform<T>(dev_cxt, tensor, dist,
trans);
} else {
auto seed_offset = gen_cuda->IncrementOffset(1);
int64_t gen_offset = size * seed_offset.second;
auto func =
GaussianGenerator<T>(mean, std, seed_offset.first, gen_offset);
IndexKernel<T, GaussianGenerator<T>>(dev_cxt, tensor, func);
}
} else {
auto func = GaussianGenerator<T>(mean, std, seed);
IndexKernel<T, GaussianGenerator<T>>(dev_cxt, tensor, func);
}
}
};
template <typename T>
class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
public:
......@@ -136,11 +89,6 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(
gaussian_random,
paddle::operators::GPUGaussianRandomKernel<paddle::platform::float16>,
paddle::operators::GPUGaussianRandomKernel<float>,
paddle::operators::GPUGaussianRandomKernel<double>);
REGISTER_OP_CUDA_KERNEL(
gaussian_random_batch_size_like,
paddle::operators::GPUGaussianRandomBatchSizeLikeKernel<
......
......@@ -17,6 +17,8 @@
#include <string>
#include <vector>
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/phi/kernels/batch_norm_grad_kernel.h"
#include "paddle/phi/kernels/batch_norm_kernel.h"
namespace paddle {
namespace operators {
......@@ -202,8 +204,7 @@ class InplaceABNOpGradMaker : public framework::SingleGradOpMaker<T> {
};
template <typename DeviceContext, typename T>
class InplaceABNKernel
: public paddle::operators::BatchNormKernel<DeviceContext, T> {
class InplaceABNKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<Tensor>("X");
......@@ -213,7 +214,33 @@ class InplaceABNKernel
auto activation =
GetInplaceABNActivationType(ctx.Attr<std::string>("activation"));
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
BatchNormKernel<DeviceContext, T>::Compute(ctx);
auto* scale = ctx.Input<Tensor>("Scale");
auto* bias = ctx.Input<Tensor>("Bias");
auto* mean = ctx.Input<Tensor>("Mean");
auto* variance = ctx.Input<Tensor>("Variance");
auto momentum = ctx.Attr<float>("momentum");
auto epsilon = ctx.Attr<float>("epsilon");
auto data_layout = ctx.Attr<std::string>("data_layout");
auto is_test = ctx.Attr<bool>("is_test");
auto use_global_stats = ctx.Attr<bool>("use_global_stats");
auto trainable_statistics = ctx.Attr<bool>("trainable_statistics");
auto fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
auto* mean_out = ctx.Output<Tensor>("MeanOut");
auto* variance_out = ctx.Output<Tensor>("VarianceOut");
auto* saved_mean = ctx.Output<Tensor>("SavedMean");
auto* saved_variance = ctx.Output<Tensor>("SavedVariance");
auto* reserve_space = ctx.Output<Tensor>("ReserveSpace");
auto& dev_ctx = ctx.device_context<DeviceContext>();
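    // The fluid device context is cast to its phi counterpart (via
    // ConvertToPhiContext) so the phi batch_norm kernel can be called directly.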
phi::BatchNormKernel<T>(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*x, *scale, *bias, *mean, *variance, momentum, epsilon, data_layout,
is_test, use_global_stats, trainable_statistics, fuse_with_relu, y,
mean_out, variance_out, saved_mean, saved_variance, reserve_space);
auto cur_y = EigenVector<T>::Flatten(*y);
InplaceABNActivation<DeviceContext, T> functor;
......@@ -222,8 +249,7 @@ class InplaceABNKernel
};
template <typename DeviceContext, typename T>
class InplaceABNGradKernel
: public paddle::operators::BatchNormGradKernel<DeviceContext, T> {
class InplaceABNGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* y = ctx.Input<Tensor>("Y");
......@@ -244,7 +270,52 @@ class InplaceABNGradKernel
InplaceABNActivation<DeviceContext, T> functor;
functor.GradCompute(ctx, activation, place, cur_y, cur_y, cur_dy, cur_dy);
BatchNormGradKernel<DeviceContext, T>::Compute(ctx);
// BatchNormGradKernel<DeviceContext, T>::Compute(ctx);
auto* scale = ctx.Input<Tensor>("Scale");
auto* bias = ctx.Input<Tensor>("Bias");
auto* saved_mean = ctx.Input<Tensor>("SavedMean");
auto* saved_variance = ctx.Input<Tensor>("SavedVariance");
auto momentum = ctx.Attr<float>("momentum");
auto epsilon = ctx.Attr<float>("epsilon");
auto data_layout = ctx.Attr<std::string>("data_layout");
auto is_test = ctx.Attr<bool>("is_test");
auto use_global_stats = ctx.Attr<bool>("use_global_stats");
auto trainable_statistics = ctx.Attr<bool>("trainable_statistics");
auto fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
auto* scale_grad = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto* bias_grad = ctx.Output<Tensor>(framework::GradVarName("Bias"));
auto* reserve_space = ctx.Input<Tensor>("ReserveSpace");
auto* mean = ctx.Input<Tensor>("ReserveSpace");
auto* variance = ctx.Input<Tensor>("ReserveSpace");
paddle::optional<const Tensor&> space_opt = paddle::none;
paddle::optional<const Tensor&> mean_opt = paddle::none;
paddle::optional<const Tensor&> variance_opt = paddle::none;
if (reserve_space != nullptr) {
space_opt = *reserve_space;
}
if (mean != nullptr) {
mean_opt = *mean;
}
if (variance != nullptr) {
variance_opt = *variance;
}
auto& dev_ctx = ctx.device_context<DeviceContext>();
phi::BatchNormGradRawKernel<T>(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*d_y, *y, *scale, *bias, *saved_mean, *saved_variance, space_opt,
mean_opt, variance_opt, momentum, epsilon, data_layout, is_test,
use_global_stats, trainable_statistics, fuse_with_relu, true, d_x,
scale_grad, bias_grad);
}
};
......
......@@ -15,14 +15,15 @@ limitations under the License. */
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/operators/inplace_abn_op.h"
#include "paddle/fluid/operators/sync_batch_norm_op.cu.h"
#include "paddle/phi/kernels/batch_norm_grad_kernel.h"
#include "paddle/phi/kernels/batch_norm_kernel.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class InplaceABNKernel
: public paddle::operators::SyncBatchNormKernel<DeviceContext, T>,
public paddle::operators::BatchNormKernel<DeviceContext, T> {
: public paddle::operators::SyncBatchNormKernel<DeviceContext, T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* y = ctx.Output<Tensor>("Y");
......@@ -36,7 +37,33 @@ class InplaceABNKernel
if (ctx.Attr<bool>("use_sync_bn")) {
SyncBatchNormKernel<DeviceContext, T>::Compute(ctx);
} else {
BatchNormKernel<DeviceContext, T>::Compute(ctx);
// BatchNormKernel<DeviceContext, T>::Compute(ctx);
auto* scale = ctx.Input<Tensor>("Scale");
auto* bias = ctx.Input<Tensor>("Bias");
auto* mean = ctx.Input<Tensor>("Mean");
auto* variance = ctx.Input<Tensor>("Variance");
auto momentum = ctx.Attr<float>("momentum");
auto epsilon = ctx.Attr<float>("epsilon");
auto data_layout = ctx.Attr<std::string>("data_layout");
auto is_test = ctx.Attr<bool>("is_test");
auto use_global_stats = ctx.Attr<bool>("use_global_stats");
auto trainable_statistics = ctx.Attr<bool>("trainable_statistics");
auto fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
auto* mean_out = ctx.Output<Tensor>("MeanOut");
auto* variance_out = ctx.Output<Tensor>("VarianceOut");
auto* saved_mean = ctx.Output<Tensor>("SavedMean");
auto* saved_variance = ctx.Output<Tensor>("SavedVariance");
auto* reserve_space = ctx.Output<Tensor>("ReserveSpace");
auto& dev_ctx = ctx.device_context<DeviceContext>();
phi::BatchNormKernel<T>(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*x, *scale, *bias, *mean, *variance, momentum, epsilon, data_layout,
is_test, use_global_stats, trainable_statistics, fuse_with_relu, y,
mean_out, variance_out, saved_mean, saved_variance, reserve_space);
}
auto cur_y = EigenVector<T>::Flatten(*y);
......@@ -49,8 +76,7 @@ class InplaceABNKernel
// https://kevinzakka.github.io/2016/09/14/batch_normalization/
template <typename DeviceContext, typename T>
class InplaceABNGradKernel
: public paddle::operators::SyncBatchNormGradKernel<DeviceContext, T>,
public paddle::operators::BatchNormGradKernel<DeviceContext, T> {
: public paddle::operators::SyncBatchNormGradKernel<DeviceContext, T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto* y = ctx.Input<Tensor>("Y");
......@@ -74,7 +100,50 @@ class InplaceABNGradKernel
if (ctx.Attr<bool>("use_sync_bn")) {
SyncBatchNormGradKernel<DeviceContext, T>::Compute(ctx);
} else {
BatchNormGradKernel<DeviceContext, T>::Compute(ctx);
auto* scale = ctx.Input<Tensor>("Scale");
auto* bias = ctx.Input<Tensor>("Bias");
auto* saved_mean = ctx.Input<Tensor>("SavedMean");
auto* saved_variance = ctx.Input<Tensor>("SavedVariance");
auto momentum = ctx.Attr<float>("momentum");
auto epsilon = ctx.Attr<float>("epsilon");
auto data_layout = ctx.Attr<std::string>("data_layout");
auto is_test = ctx.Attr<bool>("is_test");
auto use_global_stats = ctx.Attr<bool>("use_global_stats");
auto trainable_statistics = ctx.Attr<bool>("trainable_statistics");
auto fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
auto* scale_grad = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto* bias_grad = ctx.Output<Tensor>(framework::GradVarName("Bias"));
auto* reserve_space = ctx.Input<Tensor>("ReserveSpace");
auto* mean = ctx.Input<Tensor>("ReserveSpace");
auto* variance = ctx.Input<Tensor>("ReserveSpace");
paddle::optional<const Tensor&> space_opt = paddle::none;
paddle::optional<const Tensor&> mean_opt = paddle::none;
paddle::optional<const Tensor&> variance_opt = paddle::none;
if (reserve_space != nullptr) {
space_opt = *reserve_space;
}
if (mean != nullptr) {
mean_opt = *mean;
}
if (variance != nullptr) {
variance_opt = *variance;
}
auto& dev_ctx = ctx.device_context<DeviceContext>();
phi::BatchNormGradRawKernel<T>(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*d_y, *y, *scale, *bias, *saved_mean, *saved_variance, space_opt,
mean_opt, variance_opt, momentum, epsilon, data_layout, is_test,
use_global_stats, trainable_statistics, fuse_with_relu, true, d_x,
scale_grad, bias_grad);
}
}
};
......
......@@ -389,11 +389,12 @@ __global__ void DoubleGradComputeDDYWithGlobal(
}
template <typename DeviceContext, typename T>
void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
void NormDoubleGradFunctor(const DeviceContext &ctx,
const DataLayout data_layout, const Tensor *X,
const Tensor *Scale, const Tensor *dY,
const Tensor *Saved_mean,
const Tensor *Saved_variance, const double epsilon,
const Tensor *Saved_variance, const Tensor *Mean,
const Tensor *Variance, const double epsilon,
const bool use_global_stats, const Tensor *ddX,
const Tensor *ddScale, const Tensor *ddBias,
Tensor *dX, Tensor *dScale, Tensor *ddY) {
......@@ -404,8 +405,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data<T>());
const T *ddbias_data = (ddBias == nullptr ? nullptr : ddBias->data<T>());
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_constant;
phi::funcs::SetConstant<DeviceContext, T> set_constant;
auto &x_dims = X->dims();
const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
......@@ -416,7 +416,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
Tensor scale_tmp;
if (!Scale) {
scale_tmp.mutable_data<T>({C}, ctx.GetPlace());
set_constant(dev_ctx, &scale_tmp, static_cast<T>(1));
set_constant(ctx, &scale_tmp, static_cast<T>(1));
}
const T *scale_data = Scale ? Scale->data<T>() : scale_tmp.data<T>();
#ifdef __HIPCC__
......@@ -424,15 +424,15 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
#else
const int block = 512;
#endif
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
int max_threads = ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
int grid = std::min(C, max_blocks);
int grid1 = (num + block - 1) / block;
const T *mean_data, *variance_data;
if (use_global_stats) {
const auto *running_mean = ctx.Input<Tensor>("Mean");
const auto *running_var = ctx.Input<Tensor>("Variance");
const auto *running_mean = Mean;
const auto *running_var = Variance;
const auto *running_mean_data = running_mean->template data<T>();
const auto *running_var_data = running_var->template data<T>();
mean_data = running_mean_data;
......@@ -440,34 +440,35 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
} else {
const T *smean_data = Saved_mean->data<T>();
const T *svariance_data = Saved_variance->data<T>();
mean_data = smean_data;
variance_data = svariance_data;
}
if (dX) {
T *dx_data = dX->mutable_data<T>(ctx.GetPlace());
set_constant(dev_ctx, dX, static_cast<T>(0));
set_constant(ctx, dX, static_cast<T>(0));
if (use_global_stats) {
if (data_layout == DataLayout::kNHWC) {
DoubleGradComputeDXWithGlobal<
T, DataLayout::kNHWC><<<grid1, block, 0, dev_ctx.stream()>>>(
T, DataLayout::kNHWC><<<grid1, block, 0, ctx.stream()>>>(
dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num,
dx_data);
} else {
DoubleGradComputeDXWithGlobal<
T, DataLayout::kNCHW><<<grid1, block, 0, dev_ctx.stream()>>>(
T, DataLayout::kNCHW><<<grid1, block, 0, ctx.stream()>>>(
dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num,
dx_data);
}
} else {
if (data_layout == DataLayout::kNHWC) {
DoubleGradComputeDX<
T, block, DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>(
T, block, DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
x_data, mean_data, variance_data, ddx_data, dy_data, scale_data,
ddscale_data, N, C, sample_size, epsilon, dx_data);
} else {
DoubleGradComputeDX<
T, block, DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>(
T, block, DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
x_data, mean_data, variance_data, ddx_data, dy_data, scale_data,
ddscale_data, N, C, sample_size, epsilon, dx_data);
}
......@@ -475,28 +476,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
}
if (dScale) {
T *dscale_data = dScale->mutable_data<T>(ctx.GetPlace());
set_constant(dev_ctx, dScale, static_cast<T>(0));
set_constant(ctx, dScale, static_cast<T>(0));
if (use_global_stats) {
if (data_layout == DataLayout::kNHWC) {
DoubleGradComputeDScaleWithGlobal<
T, block, DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>(
T, block, DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
ddx_data, variance_data, dy_data, epsilon, N, C, sample_size,
dscale_data);
} else {
DoubleGradComputeDScaleWithGlobal<
T, block, DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>(
T, block, DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
ddx_data, variance_data, dy_data, epsilon, N, C, sample_size,
dscale_data);
}
} else {
if (data_layout == DataLayout::kNHWC) {
DoubleGradComputeDScale<
T, block, DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>(
T, block, DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
x_data, mean_data, variance_data, ddx_data, dy_data, N, C,
sample_size, epsilon, dscale_data);
} else {
DoubleGradComputeDScale<
T, block, DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>(
T, block, DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
x_data, mean_data, variance_data, ddx_data, dy_data, N, C,
sample_size, epsilon, dscale_data);
}
......@@ -504,28 +505,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
}
if (ddY) {
T *ddy_data = ddY->mutable_data<T>(ctx.GetPlace());
set_constant(dev_ctx, ddY, static_cast<T>(0));
set_constant(ctx, ddY, static_cast<T>(0));
if (use_global_stats) {
if (data_layout == DataLayout::kNHWC) {
DoubleGradComputeDDYWithGlobal<
T, DataLayout::kNHWC><<<grid1, block, 0, dev_ctx.stream()>>>(
T, DataLayout::kNHWC><<<grid1, block, 0, ctx.stream()>>>(
ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data,
ddscale_data, epsilon, C, sample_size, num, ddy_data);
} else {
DoubleGradComputeDDYWithGlobal<
T, DataLayout::kNCHW><<<grid1, block, 0, dev_ctx.stream()>>>(
T, DataLayout::kNCHW><<<grid1, block, 0, ctx.stream()>>>(
ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data,
ddscale_data, epsilon, C, sample_size, num, ddy_data);
}
} else {
if (data_layout == DataLayout::kNHWC) {
DoubleGradComputeDDY<
T, block, DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>(
T, block, DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
x_data, mean_data, variance_data, ddscale_data, ddbias_data,
ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data);
} else {
DoubleGradComputeDDY<
T, block, DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>(
T, block, DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
x_data, mean_data, variance_data, ddscale_data, ddbias_data,
ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data);
}
......
......@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/padding.h"
#include "paddle/phi/kernels/funcs/padding.h"
namespace paddle {
namespace operators {
......@@ -50,7 +50,8 @@ class PadConstantLikeKernel : public framework::OpKernel<T> {
pads[j * 2 + 1] = static_cast<int>(in_x->dims()[j] - in_y->dims()[j]);
}
math::PaddingFunctor<DeviceContext, T>(rank, context, pads, pad_value,
phi::funcs::PaddingFunctor<DeviceContext, T>(
rank, context.template device_context<DeviceContext>(), pads, pad_value,
*in_y, out);
}
};
......@@ -82,7 +83,8 @@ class PadConstantLikeGradKernel : public framework::OpKernel<T> {
pads[j * 2 + 1] = static_cast<int>(in_dout->dims()[j] - in_y->dims()[j]);
}
math::PaddingGradFunctor<DeviceContext, T>(rank, context, pads, *in_dout,
phi::funcs::PaddingGradFunctor<DeviceContext, T>(
rank, context.template device_context<DeviceContext>(), pads, *in_dout,
d_y);
}
};
......
......@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/pad_op.h"
#include <memory>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/complex.h"
namespace paddle {
......@@ -167,40 +167,3 @@ REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker,
REGISTER_OPERATOR(pad_grad, ops::PadOpGrad,
ops::PadOpDoubleGradMaker<paddle::framework::OpDesc>,
ops::PadOpDoubleGradMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
pad, ops::PadKernel<paddle::platform::CPUDeviceContext, float>,
ops::PadKernel<paddle::platform::CPUDeviceContext, double>,
ops::PadKernel<paddle::platform::CPUDeviceContext, int>,
ops::PadKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::PadKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::PadKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CPU_KERNEL(
pad_grad, ops::PadGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::PadGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::PadGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::PadGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CUDA_KERNEL(
pad, ops::PadKernel<paddle::platform::CUDADeviceContext, double>,
ops::PadKernel<paddle::platform::CUDADeviceContext, float>,
ops::PadKernel<paddle::platform::CUDADeviceContext, int>,
ops::PadKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::PadKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::PadKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::PadKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CUDA_KERNEL(
pad_grad, ops::PadGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::PadGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::PadGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::PadGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::PadGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/padding.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class PadKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto pads = context.Attr<std::vector<int>>("paddings");
float pad_value = context.Attr<float>("pad_value");
auto* x = context.Input<Tensor>("X");
auto* out = context.Output<Tensor>("Out");
out->mutable_data<T>(context.GetPlace());
int rank = x->dims().size();
math::PaddingFunctor<DeviceContext, T>(rank, context, pads,
static_cast<T>(pad_value), *x, out);
}
};
template <typename DeviceContext, typename T>
class PadGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto pads = context.Attr<std::vector<int>>("paddings");
auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
if (d_x == nullptr) {
return;
}
d_x->mutable_data<T>(context.GetPlace());
int rank = d_out->dims().size();
math::PaddingGradFunctor<DeviceContext, T>(rank, context, pads, *d_out,
d_x);
}
};
} // namespace operators
} // namespace paddle
......@@ -20,9 +20,11 @@ namespace cub = hipcub;
#endif
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/math.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
namespace paddle {
namespace operators {
......@@ -42,71 +44,86 @@ static inline int NumBlocks(const int N) {
}
template <typename T>
__global__ void GPUSigmoidForward(const T *x_data, const T *label_data,
const int ignore_index, const int limit,
T *out_data, T *counts) {
CUDA_KERNEL_LOOP(i, limit) {
T x = x_data[i];
T label = label_data[i];
struct NonzeroFunctor {
HOSTDEVICE explicit inline NonzeroFunctor() {}
HOSTDEVICE inline T operator()(const T x) const {
return static_cast<T>(static_cast<double>(x) != 0);
}
};
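// NonzeroFunctor maps an element to 1 when it is non-zero and to 0 otherwise;
// together with kps::AddFunctor in TensorReduceImpl below it counts the
// entries that were not skipped via ignore_index.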
template <typename T>
struct SigmoidFwdFunctor {
T ignore_index_;
T eps = static_cast<T>(1e-5);
T diff = label - static_cast<T>(ignore_index);
HOSTDEVICE inline SigmoidFwdFunctor(const T ignore_index)
: ignore_index_(ignore_index) {}
HOSTDEVICE inline phi::Array<T, 2> operator()(const T x, const T label) {
T counts;
T out_data;
T diff = label - static_cast<T>(ignore_index_);
if ((diff > -eps) && (diff < eps)) {
out_data[i] = static_cast<T>(0.);
counts[i] = 0;
out_data = static_cast<T>(0.);
counts = 0;
} else {
T term1 = (x > 0) ? x : 0;
T term2 = x * label;
T term3 = real_log(static_cast<T>(1) + real_exp(static_cast<T>(-abs(x))));
out_data[i] = term1 - term2 + term3;
counts[i] = 1;
}
}
}
template <typename T, int BlockDim>
__global__ void Sum(const T *counts, int num, const T eps, T *sum) {
typedef cub::BlockReduce<double, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
T in = 0;
for (int i = threadIdx.x; i < num; i += BlockDim) {
in += counts[i];
}
__syncthreads();
auto out =
BlockReduce(temp_storage).Reduce(static_cast<double>(in), cub::Sum());
__syncthreads();
if (threadIdx.x == 0) {
T a = out > eps ? out : eps;
sum[0] = a;
out_data = term1 - term2 + term3;
counts = 1;
}
}
phi::Array<T, 2> outs;
template <typename T>
__global__ void Div(T *loss, const int num, const T *norm) {
CUDA_KERNEL_LOOP(i, num) { loss[i] /= norm[0]; }
}
outs[0] = out_data;
outs[1] = counts;
return outs;
}
};
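// Returning phi::Array<T, 2> lets phi::funcs::ElementwiseKernel write two
// outputs per element in a single pass: the loss value and a 0/1 mask marking
// samples whose label differs from ignore_index.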
template <typename T>
__global__ void GPUSigmoidBackward(const T *x_data, const T *label_data,
const int ignore_index, const T *dout_data,
const int limit, T *dx_data, T *counts) {
CUDA_KERNEL_LOOP(i, limit) {
T x = x_data[i];
T label = label_data[i];
T dout = dout_data[i];
struct SigmoidBwdFunctor {
T ignore_index_;
T eps = static_cast<T>(1e-5);
T diff = label - static_cast<T>(ignore_index);
HOSTDEVICE inline SigmoidBwdFunctor(const T ignore_index)
: ignore_index_(ignore_index) {}
HOSTDEVICE inline phi::Array<T, 2> operator()(const T x, const T label,
const T dout) {
T counts;
T dx_data;
T diff = label - static_cast<T>(ignore_index_);
if ((diff > -eps) && (diff < eps)) {
dx_data[i] = static_cast<T>(0.);
counts[i] = 0;
dx_data = static_cast<T>(0.);
counts = 0;
} else {
T sigmoid_x = static_cast<T>(1) / (static_cast<T>(1) + real_exp(-x));
T diff = sigmoid_x - label;
dx_data[i] = dout * diff;
counts[i] = 1;
dx_data = dout * diff;
counts = 1;
}
phi::Array<T, 2> outs;
outs[0] = dx_data;
outs[1] = counts;
return outs;
}
}
};
template <typename T>
struct DivFunctor {
const T norm_;
HOSTDEVICE inline DivFunctor(const T norm) : norm_(norm) {}
HOSTDEVICE inline T operator()(T loss) {
loss /= norm_;
return loss;
}
};
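// DivFunctor scales each loss element by a norm computed on the host: the 0/1
// counts are reduced on the device, copied back, clamped below by eps, and the
// division is then applied through another ElementwiseKernel launch (see the
// normalize branch below).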
// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X)))
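// This is the numerically stable form of the sigmoid cross-entropy loss
//   -Labels * log(sigmoid(X)) - (1 - Labels) * log(1 - sigmoid(X))
//     = X - X * Labels + log(1 + exp(-X)),
// rewritten with max(X, 0) and abs(X) so that exp() is never evaluated on a
// large positive argument.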
template <typename DeviceContext, typename T>
......@@ -123,20 +140,48 @@ class GPUSigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
bool normalize = context.Attr<bool>("normalize");
// Temporary memory
auto cnt_ptr = memory::Alloc(dev_ctx, Labels->numel() * sizeof(T));
T *counts = reinterpret_cast<T *>(cnt_ptr->ptr());
Tensor *counts_tensor = new Tensor();
counts_tensor->mutable_data<T>(context.GetPlace(),
Labels->numel() * sizeof(T));
counts_tensor->Resize(Out->dims());
int limit = Out->numel();
int blocks = NumBlocks(limit);
int threads = kNumCUDAThreads;
GPUSigmoidForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
X->data<T>(), Labels->data<T>(), ignore_index, limit, out_data, counts);
std::vector<const framework::Tensor *> ins = {X, Labels};
std::vector<framework::Tensor *> outs = {Out, counts_tensor};
auto functor = SigmoidFwdFunctor<T>(ignore_index);
constexpr int Size = 2;
phi::funcs::ElementwiseKernel<T, decltype(functor), Size>(dev_ctx, ins,
&outs, functor);
if (normalize) {
auto norm_ptr = memory::Alloc(dev_ctx, sizeof(T));
T *norm = reinterpret_cast<T *>(norm_ptr->ptr());
Sum<T, kNumCUDAThreads><<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>(
counts, limit, static_cast<T>(1e-5), norm);
Div<T><<<blocks, threads, 0, dev_ctx.stream()>>>(out_data, limit, norm);
T *counts = counts_tensor->mutable_data<T>(context.GetPlace());
Tensor *norm_tensor = new Tensor();
norm_tensor->mutable_data<T>(context.GetPlace(), sizeof(T));
auto dims = phi::vectorize(counts_tensor->dims());
std::vector<int> reduce_dim = {};
for (int i = 0; i < dims.size(); i++) {
reduce_dim.push_back(i);
}
TensorReduceImpl<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
context.cuda_device_context(), *counts_tensor, norm_tensor,
NonzeroFunctor<T>(), reduce_dim, dev_ctx.stream());
T *norm = norm_tensor->mutable_data<T>(context.GetPlace());
auto norm_cpu_mem = memory::Alloc(platform::CPUPlace(), sizeof(T));
T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr());
memory::Copy(platform::CPUPlace(), norm_cpu_ptr, dev_ctx.GetPlace(), norm,
sizeof(T), dev_ctx.stream());
auto eps = static_cast<T>(1e-5);
*norm_cpu_ptr = *norm_cpu_ptr > eps ? *norm_cpu_ptr : eps;
std::vector<const framework::Tensor *> div_ins = {Out};
std::vector<framework::Tensor *> div_outs = {Out};
auto div_functor = DivFunctor<T>(*norm_cpu_ptr);
phi::funcs::ElementwiseKernel<T>(dev_ctx, div_ins, &div_outs,
div_functor);
delete norm_tensor;
}
// Release the temporary counts tensor on every path, not only when normalize is true.
delete counts_tensor;
}
};
......@@ -157,22 +202,48 @@ class GPUSigmoidCrossEntropyWithLogitsGradKernel
auto &dev_ctx = context.cuda_device_context();
// Temporary memory
auto cnt_ptr = memory::Alloc(dev_ctx, X->numel() * sizeof(T));
T *counts = reinterpret_cast<T *>(cnt_ptr->ptr());
Tensor *counts_tensor = new Tensor();
counts_tensor->mutable_data<T>(context.GetPlace(),
Labels->numel() * sizeof(T));
counts_tensor->Resize(dX->dims());
int limit = dX->numel();
int blocks = NumBlocks(limit);
int threads = kNumCUDAThreads;
GPUSigmoidBackward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
X->data<T>(), Labels->data<T>(), ignore_index, dOut->data<T>(), limit,
dx_data, counts);
std::vector<const framework::Tensor *> ins = {X, Labels, dOut};
std::vector<framework::Tensor *> outs = {dX, counts_tensor};
auto functor = SigmoidBwdFunctor<T>(ignore_index);
constexpr int Size = 2;
phi::funcs::ElementwiseKernel<T, decltype(functor), Size>(dev_ctx, ins,
&outs, functor);
bool normalize = context.Attr<bool>("normalize");
if (normalize) {
auto norm_ptr = memory::Alloc(dev_ctx, sizeof(T));
T *norm = reinterpret_cast<T *>(norm_ptr->ptr());
Sum<T, kNumCUDAThreads><<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>(
counts, limit, static_cast<T>(1e-5), norm);
Div<T><<<blocks, threads, 0, dev_ctx.stream()>>>(dx_data, limit, norm);
T *counts = counts_tensor->mutable_data<T>(context.GetPlace());
Tensor *norm_tensor = new Tensor();
norm_tensor->mutable_data<T>(context.GetPlace(), sizeof(T));
auto dims = phi::vectorize(counts_tensor->dims());
std::vector<int> reduce_dim = {};
for (int i = 0; i < dims.size(); i++) {
reduce_dim.push_back(i);
}
TensorReduceImpl<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
context.cuda_device_context(), *counts_tensor, norm_tensor,
NonzeroFunctor<T>(), reduce_dim, dev_ctx.stream());
T *norm = norm_tensor->mutable_data<T>(context.GetPlace());
auto norm_cpu_mem = memory::Alloc(platform::CPUPlace(), sizeof(T));
T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr());
memory::Copy(platform::CPUPlace(), norm_cpu_ptr, dev_ctx.GetPlace(), norm,
sizeof(T), dev_ctx.stream());
auto eps = static_cast<T>(1e-5);
*norm_cpu_ptr = *norm_cpu_ptr > eps ? *norm_cpu_ptr : eps;
std::vector<const framework::Tensor *> div_ins = {dX};
std::vector<framework::Tensor *> div_outs = {dX};
auto div_functor = DivFunctor<T>(*norm_cpu_ptr);
phi::funcs::ElementwiseKernel<T>(dev_ctx, div_ins, &div_outs,
div_functor);
delete norm_tensor;
}
// counts_tensor is allocated with new above; release it before returning.
delete counts_tensor;
}
};
......
......@@ -23,9 +23,9 @@
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/conj_op.h"
#include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/operators/math/padding.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/kernels/funcs/padding.h"
#if defined(__NVCC__) || defined(__HIPCC__)
#include "thrust/device_vector.h"
#endif
......@@ -389,8 +389,9 @@ class FFTR2CGradKernel : public framework::OpKernel<T> {
std::vector<int> pads(rank * 2, 0);
pads[axes.back() * 2 + 1] = zero_length;
paddle::operators::math::PaddingFunctor<DeviceContext, C>(
rank, ctx, pads, static_cast<C>(0), *dy, &full_dy);
phi::funcs::PaddingFunctor<DeviceContext, C>(
rank, ctx.template device_context<DeviceContext>(), pads,
static_cast<C>(0), *dy, &full_dy);
fft_c2c_func(dev_ctx, &full_dy, &complex_dx, axes, normalization,
!forward);
}
......
......@@ -23,12 +23,9 @@ namespace paddle {
namespace platform {
bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type) {
auto& ops = get_kl1_ops();
auto v = get_xpu_version(type.place_.device);
if (v == phi::backends::xpu::XPUVersion::XPU2) {
ops = get_kl2_ops();
}
auto& ops = (v == phi::backends::xpu::XPUVersion::XPU1) ? get_kl1_ops()
: get_kl2_ops();
if (ops.find(op_name) != ops.end() &&
ops[op_name].find(type) != ops[op_name].end()) {
return true;
......@@ -78,12 +75,9 @@ bool is_in_xpu_black_list(const std::string& op_name) {
#ifdef PADDLE_WITH_XPU_KP
bool is_xpu_kp_support_op(const std::string& op_name,
const pOpKernelType& type) {
auto& ops = get_kl1_ops();
auto v = get_xpu_version(type.place_.device);
if (v == phi::backends::xpu::XPUVersion::XPU2) {
ops = get_kp_ops();
}
auto& ops = (v == phi::backends::xpu::XPUVersion::XPU1) ? get_kl1_ops()
: get_kp_ops();
if (ops.find(op_name) != ops.end() &&
ops[op_name].find(type) != ops[op_name].end()) {
return true;
......
......@@ -28,6 +28,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_XPU
......@@ -161,6 +162,8 @@ void LoadCustomDevice(const std::string &library_dir) {
#endif
void InitDevices() {
// set name at the entry point of Paddle
platform::SetCurrentThreadName("MainThread");
// CUPTI attribute should be set before any CUDA context is created (see CUPTI
// documentation about CUpti_ActivityAttribute).
#ifdef PADDLE_WITH_CUDA
......
......@@ -30,8 +30,7 @@ TEST(ThreadInfo, TestThreadNameUtils) {
using paddle::platform::GetCurrentThreadName;
using paddle::platform::SetCurrentThreadName;
using paddle::platform::GetAllThreadNames;
EXPECT_EQ("unset", GetCurrentThreadName());
EXPECT_TRUE(SetCurrentThreadName("MainThread"));
SetCurrentThreadName("MainThread");
EXPECT_FALSE(SetCurrentThreadName("MainThread"));
auto names = GetAllThreadNames();
EXPECT_TRUE(names.find(GetCurrentThreadStdId()) != names.end());
......
......@@ -189,7 +189,10 @@ struct ThreadEventSection {
class ThreadEventRecorder {
public:
ThreadEventRecorder() { thread_id_ = GetCurrentThreadSysId(); }
ThreadEventRecorder() {
thread_id_ = GetCurrentThreadSysId();
thread_name_ = GetCurrentThreadName();
}
DISABLE_COPY_AND_ASSIGN(ThreadEventRecorder);
......@@ -202,7 +205,7 @@ class ThreadEventRecorder {
ThreadEventSection GatherEvents() {
ThreadEventSection thr_sec;
thr_sec.thread_name = GetCurrentThreadName();
thr_sec.thread_name = thread_name_;
thr_sec.thread_id = thread_id_;
thr_sec.events = std::move(base_evt_cntr_.Reduce());
return thr_sec;
......@@ -210,6 +213,7 @@ class ThreadEventRecorder {
private:
uint64_t thread_id_;
std::string thread_name_;
EventContainer<CommonEvent> base_evt_cntr_;
};
......
......@@ -85,6 +85,9 @@ if(NOT ON_INFER)
if (WITH_NCCL)
set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl)
endif()
if (WITH_GLOO)
set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_gloo)
endif()
set(PYBIND_SRCS ${PYBIND_SRCS} distributed_py.cc)
endif()
......
......@@ -31,9 +31,15 @@ namespace pybind {
using TCPStore = paddle::distributed::TCPStore;
void BindTCPStore(py::module* m) {
py::class_<TCPStore>(*m, "TCPStore")
.def(
py::init<std::string, uint16_t, bool, size_t, std::chrono::seconds>())
py::class_<TCPStore, std::shared_ptr<TCPStore>>(*m, "TCPStore")
.def(py::init([](std::string hostname, uint16_t port, bool is_master,
size_t world_size, std::chrono::seconds timeout) {
return std::make_shared<TCPStore>(hostname, port, is_master,
world_size, timeout);
}),
py::arg("hostname"), py::arg("port"), py::arg("is_master"),
py::arg("world_size"), py::arg("timeout"),
py::call_guard<py::gil_scoped_release>())
.def("add", &TCPStore::add)
.def("get", &TCPStore::get);
}
......
......@@ -35,6 +35,11 @@ limitations under the License. */
#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
#endif
#if defined(PADDLE_WITH_GLOO)
#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"
#include "paddle/fluid/distributed/store/tcp_store.h"
#endif
namespace py = pybind11;
namespace paddle {
......@@ -42,6 +47,14 @@ namespace pybind {
using Tensor = paddle::experimental::Tensor;
#if defined(PADDLE_WITH_GLOO)
using ProcessGroupGloo = paddle::distributed::ProcessGroupGloo;
using GlooStore = paddle::distributed::ProcessGroupGloo::GlooStore;
using GlooOptions = paddle::distributed::ProcessGroupGloo::GlooOptions;
#endif
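// GLOO_SOCKET_IFNAME optionally names the network interface Gloo binds to;
// when it is not set, the ProcessGroupGloo constructor below falls back to
// createDefaultDevice().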
static std::string GLOO_SOCKET_IFNAME_ENV = "GLOO_SOCKET_IFNAME"; // NOLINT
void BindDistributed(py::module *m) {
py::enum_<distributed::ReduceOp>(*m, "ReduceOp")
.value("SUM", distributed::ReduceOp::SUM)
......@@ -64,6 +77,11 @@ void BindDistributed(py::module *m) {
.def(py::init<>())
.def_readwrite("place_ids", &distributed::BarrierOptions::place_ids);
py::class_<distributed::ReduceOptions>(*m, "ReduceOptions")
.def(py::init<>())
.def_readwrite("reduce_op", &distributed::ReduceOptions::reduce_op)
.def_readwrite("source_root", &distributed::ReduceOptions::root_rank);
auto ProcessGroup =
py::class_<distributed::ProcessGroup,
std::shared_ptr<distributed::ProcessGroup>>(*m, "ProcessGroup")
......@@ -121,6 +139,58 @@ void BindDistributed(py::module *m) {
return self.Recv(tensors, src);
},
py::arg("tensor"), py::arg("src"),
py::call_guard<py::gil_scoped_release>())
.def("all_gather",
[](distributed::ProcessGroup &self, py::handle py_in_tensor,
py::handle py_out_tensor) {
auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
std::vector<Tensor> in_tensors = {in_tensor};
std::vector<Tensor> out_tensors = {out_tensor};
return self.AllGather(in_tensors, out_tensors);
},
py::arg("in"), py::arg("out"),
py::call_guard<py::gil_scoped_release>())
.def("alltoall",
[](distributed::ProcessGroup &self, py::handle py_in_tensor,
py::handle py_out_tensor) {
auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
std::vector<Tensor> in_tensors = {in_tensor};
std::vector<Tensor> out_tensors = {out_tensor};
return self.AllToAll(in_tensors, out_tensors);
},
py::arg("in"), py::arg("out"),
py::call_guard<py::gil_scoped_release>())
.def("reduce",
[](distributed::ProcessGroup &self, py::handle py_in_tensor,
int dst, distributed::ReduceOp op) {
auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
distributed::ReduceOptions opts;
opts.reduce_op = op;
opts.root_rank = dst;
std::vector<Tensor> tensors = {in_tensor};
return self.Reduce(tensors, opts);
},
py::arg("tensor"), py::arg("dst"),
py::arg("op") = distributed::ReduceOp::SUM,
py::call_guard<py::gil_scoped_release>())
.def("scatter",
[](distributed::ProcessGroup &self, py::handle py_in_tensor,
py::handle py_out_tensor, int src) {
auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
distributed::ScatterOptions opts;
opts.root_rank = src;
std::vector<Tensor> in_tensors = {in_tensor};
std::vector<Tensor> out_tensors = {out_tensor};
return self.Scatter(in_tensors, out_tensors, opts);
},
py::arg("in"), py::arg("out"), py::arg("src"),
py::call_guard<py::gil_scoped_release>());
#if defined(PADDLE_WITH_NCCL)
......@@ -129,6 +199,7 @@ void BindDistributed(py::module *m) {
*m, "ProcessGroupNCCL", ProcessGroup)
.def(py::init<const distributed::ProcessGroupStrategy &, int, int>(),
py::call_guard<py::gil_scoped_release>());
#endif
py::class_<distributed::ProcessGroup::Task,
std::shared_ptr<distributed::ProcessGroup::Task>>(*m, "task")
......@@ -138,7 +209,6 @@ void BindDistributed(py::module *m) {
py::call_guard<py::gil_scoped_release>())
.def("synchronize", &distributed::ProcessGroup::Task::Synchronize,
py::call_guard<py::gil_scoped_release>());
#endif
// define parallel strategy, it will be removed
py::class_<distributed::ProcessGroupStrategy> pg_strategy(
......@@ -178,6 +248,45 @@ void BindDistributed(py::module *m) {
self.nrings_ = nrings;
});
#if defined(PADDLE_WITH_GLOO)
py::class_<GlooOptions>(*m, "GlooOptions")
.def(py::init<>())
.def_readwrite("_device", &GlooOptions::device)
.def_static("create", &GlooOptions::create);
py::class_<GlooStore, std::shared_ptr<GlooStore>>(*m, "GlooStore")
.def(py::init(
[](const std::shared_ptr<paddle::distributed::TCPStore> &store) {
return std::make_shared<GlooStore>(store);
}),
py::call_guard<py::gil_scoped_release>());
py::class_<ProcessGroupGloo, std::shared_ptr<ProcessGroupGloo>>(
*m, "ProcessGroupGloo", ProcessGroup)
.def(py::init<const std::shared_ptr<GlooStore> &, int, int,
std::shared_ptr<GlooOptions> &>(),
py::call_guard<py::gil_scoped_release>())
.def(py::init([](const std::shared_ptr<GlooStore> &store, int rank,
int world_size) {
auto opts = GlooOptions::create();
char *ifname = getenv(GLOO_SOCKET_IFNAME_ENV.c_str());
if (ifname && strlen(ifname) > 1) {
opts->device = ProcessGroupGloo::createDeviceForInterface(
std::string(ifname));
} else {
opts->device = ProcessGroupGloo::createDefaultDevice();
}
return std::make_shared<ProcessGroupGloo>(store, rank, world_size,
opts);
}),
py::arg("store"), py::arg("rank"),
py::arg("world_size"), // py::arg("timeout") =
// kProcessGroupDefaultTimeout,
py::call_guard<py::gil_scoped_release>())
.def_static("create_default_device",
&ProcessGroupGloo::createDefaultDevice);
#endif
m->def("eager_assign_group_by_size",
[](py::handle py_tensors, std::vector<bool> is_sparse_gradient,
std::vector<size_t> group_size_limits,
......
......@@ -19,6 +19,7 @@ limitations under the License. */
#include "paddle/fluid/eager/accumulation/accumulation_node.h"
#include "paddle/fluid/eager/api/all.h"
#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h"
#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/grad_node_info.h"
#include "paddle/fluid/eager/hooks.h"
......@@ -30,10 +31,12 @@ limitations under the License. */
#include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/eager_utils.h"
#include "paddle/fluid/pybind/exception.h"
#include "paddle/fluid/pybind/slice_utils.h"
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h"
namespace paddle {
namespace pybind {
......@@ -119,6 +122,29 @@ extern void InitTensorWithNumpyValue(TensorObject* self,
extern PyTypeObject* p_tensor_type;
Py_ssize_t GetSliceIndexFromPyObject(PyObject* obj) {
if (PyObject_IsInstance(obj, reinterpret_cast<PyObject*>(p_tensor_type))) {
VLOG(6) << "Call GetSliceIndexFromTensor in Eager";
paddle::experimental::Tensor tensor = CastPyArg2Tensor(obj, 0);
PADDLE_ENFORCE_EQ(
tensor.initialized(), true,
paddle::platform::errors::InvalidArgument(
"We can only support initialized tensor in slice, however we got "
"uninitialized tensor %s, please check your code.",
tensor.name()));
return GetSliceIndexFromTensor((*static_cast<phi::DenseTensor*>(
CastPyArg2Tensor(obj, 0).impl().get())));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"We should only get paddle::experimental::Tensor or VarBase in this "
"method, when you reach this means we got another type index."));
}
}
bool PyCheckTensor(PyObject* obj) {
return PyObject_IsInstance(obj, reinterpret_cast<PyObject*>(p_tensor_type));
}
static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
......@@ -468,16 +494,111 @@ static PyObject* tensor_method_get_underline_tensor(TensorObject* self,
EAGER_CATCH_AND_THROW_RETURN_NULL
}
// NOTE(wuweilong): Set value and not change self's original place
static PyObject* tensor_method_set_value(TensorObject* self, PyObject* args,
static PyObject* tensor__getitem_index_not_tensor(TensorObject* self,
PyObject* args,
PyObject* kwargs) {
EAGER_TRY
VLOG(4) << "Value " << self->tensor.name();
pybind11::object numpy_value =
pybind11::object(pybind11::handle(PyTuple_GET_ITEM(args, 0)), true);
InitTensorWithNumpyValue(self, numpy_value, false);
Py_INCREF(Py_None);
return Py_None;
PyObject* _index = PyTuple_GET_ITEM(args, 0);
VLOG(4) << "Call _getitem_index_not_tensor";
std::vector<int> slice_axes, slice_starts, slice_ends, slice_strides,
decrease_axis, none_axes, infer_flags, list_select_idxs;
// if index is a list, list_select_flag will be true
bool list_select_flag = false;
PADDLE_ENFORCE_EQ(
self->tensor.is_initialized(), true,
platform::errors::InvalidArgument(
"tensor %s has not been initialized, we can only slice initialized "
"tensor please init it first with numpy or other tensor.",
self->tensor.name()));
auto tensor = static_cast<phi::DenseTensor*>(self->tensor.impl().get());
ParseIndexingSlice(tensor, _index, &slice_axes, &slice_starts, &slice_ends,
&slice_strides, &decrease_axis, &none_axes, &infer_flags,
&list_select_idxs, &list_select_flag);
auto out = slice_axes.empty() && !list_select_flag
? self->tensor
: paddle::experimental::Tensor(
egr::Controller::Instance().GenerateUniqueName());
if (!slice_axes.empty()) {
framework::AttributeMap attrs = {{"axes", slice_axes},
{"starts", slice_starts},
{"ends", slice_ends},
{"infer_flags", infer_flags},
{"decrease_axis", decrease_axis}};
std::string op_type = "slice";
for (auto stride : slice_strides) {
if (stride != 1) {
op_type = "strided_slice";
attrs.insert({"strides", slice_strides});
attrs.erase("decrease_axis");
break;
}
}
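// Any non-unit stride switches the dispatched op from slice to strided_slice,
// which additionally takes the strides attribute and drops decrease_axis.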
if (op_type == "slice") {
out = slice_dygraph_function(self->tensor, paddle::experimental::Tensor(),
paddle::experimental::Tensor(),
std::move(attrs));
} else if (op_type == "strided_slice") {
out = strided_slice_dygraph_function(self->tensor, attrs);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Slice is only support slice and strided_slice, but we got %s which "
"is impossible, please check your code first or contact us by "
"issue. ",
op_type));
}
}
if (!none_axes.empty()) {
// Deal with cases when all axes are decreased.
// After slice, the shape of out is [1], which should have been
// [], but Paddle doesn't support scalar.
// In order to ensure the correctness of the final shape of out,
// one dimension of out needs to be decreased.
// For example:
// # x.shape: (2,3,4)
// out = x[0, 1, 1, None] # out.shape : (1)
if (static_cast<int>(decrease_axis.size()) == tensor->dims().size()) {
none_axes.pop_back();
}
if (!none_axes.empty()) {
// Deal with cases that decrease_axes is not empty
// For example:
// # x.shape: (2,3,4)
// out = x[0, 0:2, None] # out.shape : (2, 1, 4)
for (auto& axis : none_axes) {
int len = 0;
for (int da : decrease_axis) {
if (da < axis) {
len++;
}
}
axis -= len;
}
paddle::experimental::Tensor new_out;
framework::AttributeMap attrs = {{"axes", none_axes}};
new_out = std::get<0>(unsqueeze2_dygraph_function(out, std::move(attrs)));
return ToPyObject(new_out);
}
}
// the index is a list
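// the selected positions are materialized into a DenseTensor and gathered via
// index_select along dim 0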
if (list_select_flag) {
auto select_index = paddle::experimental::Tensor(
egr::Controller::Instance().GenerateUniqueName());
auto idx_tensor = std::make_shared<phi::DenseTensor>();
auto* dev_ctx = platform::DeviceContextPool::Instance().Get(
egr::Controller::Instance().GetExpectedPlace());
paddle::framework::TensorFromVector(list_select_idxs, *dev_ctx,
idx_tensor.get());
// Attach the materialized indices to the tensor consumed by index_select.
select_index.set_impl(idx_tensor);
framework::AttributeMap attrs = {{"dim", 0}};
out = index_select_dygraph_function(self->tensor, select_index,
std::move(attrs));
}
return ToPyObject(out);
EAGER_CATCH_AND_THROW_RETURN_NULL
}
......@@ -602,7 +723,8 @@ PyMethodDef variable_methods[] = {
{"get_tensor",
(PyCFunction)(void (*)(void))tensor_method_get_underline_tensor,
METH_VARARGS | METH_KEYWORDS, NULL},
{"_set_value", (PyCFunction)(void (*)(void))tensor_method_set_value,
{"_getitem_index_not_tensor",
(PyCFunction)(void (*)(void))tensor__getitem_index_not_tensor,
METH_VARARGS | METH_KEYWORDS, NULL},
{"_register_grad_hook",
(PyCFunction)(void (*)(void))tensor_register_grad_hook,
......
......@@ -16,8 +16,11 @@ limitations under the License. */
#include "paddle/fluid/eager/api/all.h"
#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/scope_guard.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/eager_utils.h"
......@@ -184,6 +187,11 @@ paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos) {
}
}
std::shared_ptr<imperative::VarBase> CastPyArg2VarBase(PyObject* obj,
ssize_t arg_pos) {
return py::cast<std::shared_ptr<imperative::VarBase>>(obj);
}
std::vector<paddle::experimental::Tensor> CastPyArg2VectorOfTensor(
PyObject* obj, ssize_t arg_pos) {
std::vector<paddle::experimental::Tensor> result;
......@@ -737,5 +745,6 @@ std::vector<paddle::experimental::Tensor*> GetTensorPtrListFromArgs(
return result;
}
} // namespace pybind
} // namespace paddle
......@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/phi/core/dense_tensor.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
namespace paddle {
namespace pybind {
......@@ -33,6 +32,8 @@ int64_t CastPyArg2AttrLong(PyObject* obj, ssize_t arg_pos);
float CastPyArg2AttrFloat(PyObject* obj, ssize_t arg_pos);
std::string CastPyArg2AttrString(PyObject* obj, ssize_t arg_pos);
paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos);
std::shared_ptr<imperative::VarBase> CastPyArg2VarBase(PyObject* obj,
ssize_t arg_pos);
std::vector<paddle::experimental::Tensor> CastPyArg2VectorOfTensor(
PyObject* obj, ssize_t arg_pos);
platform::Place CastPyArg2Place(PyObject* obj, ssize_t arg_pos);
......@@ -112,5 +113,7 @@ std::vector<paddle::experimental::Tensor*> GetTensorPtrListFromArgs(
const std::string& op_type, const std::string& arg_name, PyObject* args,
ssize_t arg_idx, bool dispensable = false);
// end of Slice related methods
} // namespace pybind
} // namespace paddle
......@@ -54,6 +54,7 @@ limitations under the License. */
#include "paddle/fluid/operators/utils.h"
#include "paddle/fluid/pybind/op_function.h"
#include "paddle/fluid/pybind/pybind_boost_headers.h"
#include "paddle/fluid/pybind/slice_utils.h"
#include "paddle/fluid/pybind/tensor_py.h"
namespace paddle {
......@@ -319,6 +320,23 @@ static std::string GetTypeName(const imperative::VarBase &var) {
}
}
Py_ssize_t GetSliceIndexFromPyObject(PyObject *obj) {
if (py::isinstance<imperative::VarBase>(obj)) {
VLOG(6) << "Call GetSliceIndexFromTensor in Imperative";
return GetSliceIndexFromTensor(
py::cast<std::shared_ptr<imperative::VarBase>>(obj)
->Var()
.Get<framework::LoDTensor>());
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"We should only get paddle::experimental::Tensor or VarBase in this "
"method, when you reach this means we got another type index."));
}
}
bool PyCheckTensor(PyObject *obj) {
return py::isinstance<imperative::VarBase>(obj);
}
using PyNameVarBaseMap = std::unordered_map<std::string, py::handle>;
// NOTE(zjl): py::handle is a very light wrapper of PyObject *.
......@@ -360,18 +378,6 @@ GetVarBaseListFromPyHandle(const py::handle &handle) {
return result;
}
static bool IsNumpyType(PyObject *obj) {
// It is not a good way to judge the type of obj by its type'name. Maybe using
// `PyArray_IsScalar` will be better. However, this interface cannot be used
// by including pybind11, and it needs to compile with numpy.
auto type_name = std::string(Py_TYPE(obj)->tp_name);
return type_name == "numpy.int64" || type_name == "numpy.longlong" ||
type_name == "numpy.int32" || type_name == "numpy.int16";
}
static bool PyCheckTensor(PyObject *obj) {
return py::isinstance<imperative::VarBase>(obj);
}
// cast numpy type form S to T, this may allocate new memory
template <class T, class S>
......@@ -429,260 +435,6 @@ static imperative::NameVarBaseMap ConvertToNameVarBaseMap(
return result;
}
static bool PyCheckInteger(PyObject *obj) {
#if PY_VERSION_HEX < 0x03000000
return (PyLong_Check(obj) || PyInt_Check(obj)) && !PyBool_Check(obj);
#else
return PyLong_Check(obj) && !PyBool_Check(obj);
#endif
}
static Py_ssize_t GetSliceIndexFromTensor(
const std::shared_ptr<imperative::VarBase> &tensor_index) {
const auto &tensor = tensor_index->Var().Get<framework::LoDTensor>();
if (tensor.numel() == 1) {
if (framework::TransToProtoVarType(tensor.dtype()) ==
framework::proto::VarType::INT32) {
return static_cast<Py_ssize_t>(operators::GetValue<int32_t>(&tensor));
} else if (framework::TransToProtoVarType(tensor.dtype()) ==
framework::proto::VarType::INT64) {
return static_cast<Py_ssize_t>(operators::GetValue<int64_t>(&tensor));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, the type of tensor in slice indices only allows "
"int32 and int64, please check the type of index tensor."));
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, tensor in slice indices only allows 1 element, "
"but received %d.",
tensor.numel()));
}
}
// NOTE(zhiqiu): Revised version of PySlice_GetIndices. From:
// https://github.com/python/cpython/blob/8d21aa21f2cbc6d50aab3f420bb23be1d081dac4/Objects/sliceobject.c#L103
// Original PySlice_GetIndices return wrong result when
// slice_item contains long int, such as arr[:180L].
// NOT sure why this happens !!!
// Besides, PySlice_GetIndices cannot raise error when float in slice item.
// So, I make a revised version of PySlice_GetIndices, named to
// _PySlice_GetIndices. Try to use _PySlice_Unpack which is more robust than
// PySlice_GetIndices in the future.
static int _PySlice_GetIndices(PySliceObject *r, Py_ssize_t length,
Py_ssize_t *start, Py_ssize_t *stop,
Py_ssize_t *step) {
/* XXX support long ints */
if (r->step == Py_None) {
*step = 1;
} else {
if (PyCheckInteger(r->step) || IsNumpyType(r->step)) {
*step = PyLong_AsLong(r->step);
} else if (PyCheckTensor(r->step)) {
*step = GetSliceIndexFromTensor(
py::cast<std::shared_ptr<imperative::VarBase>>(r->step));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, slice indices only allows None, integers, "
"tensor(int) and numpy(int) in slice item, but received %s.",
std::string(Py_TYPE(r->step)->tp_name)));
}
}
if (r->start == Py_None) {
*start = *step < 0 ? length - 1 : 0;
} else {
if (PyCheckInteger(r->start) || IsNumpyType(r->start)) {
*start = PyLong_AsLong(r->start);
} else if (PyCheckTensor(r->start)) {
*start = GetSliceIndexFromTensor(
py::cast<std::shared_ptr<imperative::VarBase>>(r->start));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, slice indices only allows None, integers, "
"tensor(int) and numpy(int) in slice item, but received %s.",
std::string(Py_TYPE(r->start)->tp_name)));
}
if (*start < 0) *start += length;
*start = std::max(*start, static_cast<Py_ssize_t>(0));
}
if (r->stop == Py_None) {
*stop = *step < 0 ? -1 : length;
} else {
if (PyCheckInteger(r->stop) || IsNumpyType(r->stop)) {
*stop = PyLong_AsLong(r->stop);
} else if (PyCheckTensor(r->stop)) {
*stop = GetSliceIndexFromTensor(
py::cast<std::shared_ptr<imperative::VarBase>>(r->stop));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, slice indices only allows None, integers, "
"tensor(int) and numpy(int) in slice item, but received %s.",
std::string(Py_TYPE(r->stop)->tp_name)));
}
if (0 < *step && *stop < 0) *stop += length;
*stop = std::min(*stop, length);
}
if (*stop > length) return -1;
if (*start >= length) return -1;
if (*step == 0) return -1;
return 0;
}
static void ParseIndexingSlice(
framework::LoDTensor *tensor, PyObject *_index,
std::vector<int> *slice_axes, std::vector<int> *slice_starts,
std::vector<int> *slice_ends, std::vector<int> *slice_strides,
std::vector<int> *decrease_axis, std::vector<int> *none_axes,
std::vector<int> *infer_flags, std::vector<int> *list_select_idxs,
bool *list_select_flag) {
// We allow indexing by Integers, Slices, Ellipsis, None, tuples of those
// types, and list of Bool and Integers.
// wrap to tuple
// NOTE(zhiqiu): PyTuple_Pack increases refcount.
PyObject *index = !PyTuple_Check(_index) ? PyTuple_Pack(1, _index) : _index;
DEFINE_PADDLE_SCOPE_GUARD([index, _index]() {
if (!PyTuple_Check(_index)) {
Py_DECREF(index);
VLOG(4) << "Call Py_DECREF";
}
});
PADDLE_ENFORCE_EQ(
tensor->IsInitialized(), true,
platform::errors::InvalidArgument("tensor has not been initialized"));
const auto &shape = tensor->dims();
const int rank = shape.size();
const int size = PyTuple_GET_SIZE(index);
// specified_dims is the number of dimensions which indexed by Interger,
// Slices.
int specified_dims = 0;
int ell_count = 0;
for (int dim = 0; dim < size; ++dim) {
PyObject *slice_item = PyTuple_GetItem(index, dim);
if (PyCheckInteger(slice_item) || PySlice_Check(slice_item)) {
specified_dims++;
} else if (slice_item == Py_Ellipsis) {
ell_count++;
}
}
PADDLE_ENFORCE_LE(ell_count, 1,
platform::errors::InvalidArgument(
"An index can only have a single ellipsis ('...')"));
int none_count = 0;
for (int i = 0, dim = 0; i < size; ++i) {
PyObject *slice_item = PyTuple_GetItem(index, i);
infer_flags->push_back(1);
int dim_len = shape[dim];
if (PyCheckInteger(slice_item) || IsNumpyType(slice_item)) {
// integer, PyLong_AsLong supports both int and long
int start = static_cast<int>(PyLong_AsLong(slice_item));
auto s_t = start;
start = start < 0 ? start + dim_len : start;
if (start >= dim_len || start < 0) {
std::string str_error_message =
"The starting index " + std::to_string(s_t) +
" of slice is out of bounds in tensor " + std::to_string(dim) +
"-th axis, it shound be in the range of [" +
std::to_string(-dim_len) + ", " + std::to_string(dim_len) + ")";
// py::index_error is corresponding to IndexError in Python
// Used to indicate out of bounds access in __getitem__, __setitem__
throw py::index_error(str_error_message);
}
slice_axes->push_back(dim);
slice_starts->push_back(start);
slice_ends->push_back(start + 1);
slice_strides->push_back(1);
decrease_axis->push_back(dim);
dim++;
} else if (PySlice_Check(slice_item)) {
// slice item
Py_ssize_t start, end, step;
PySliceObject *p = reinterpret_cast<PySliceObject *>(slice_item);
_PySlice_GetIndices(p, dim_len, &start, &end, &step);
// :: or : or 0:dim_len:1
if (start == 0 && end == dim_len && step == 1) {
dim++;
continue;
}
slice_axes->push_back(dim);
slice_starts->push_back(start);
slice_ends->push_back(end);
slice_strides->push_back(step);
dim++;
} else if (slice_item == Py_Ellipsis) {
dim += rank - specified_dims;
} else if (slice_item == Py_None) {
none_axes->push_back(dim + none_count);
none_count++;
} else if (PyList_Check(slice_item)) {
*list_select_flag = true;
PADDLE_ENFORCE_EQ(
size, 1,
platform::errors::InvalidArgument(
"When index contains a list, its length is excepted to 1, "
"but received %d",
size));
bool all_bool = true;
int list_size = PyList_GET_SIZE(slice_item);
for (int j = 0; j < list_size; ++j) {
PyObject *list_item = PyList_GetItem(slice_item, j);
if (PyCheckInteger(list_item)) {
all_bool = false;
} else if (!PyBool_Check(list_item)) {
PADDLE_THROW(platform::errors::InvalidArgument(
"Only support int or bool in index list."));
}
}
if (all_bool) {
PADDLE_ENFORCE_EQ(
list_size, shape[0],
platform::errors::InvalidArgument(
"The dimension of bool index doesn't match indexed array along "
"dimension 0, the target dimension is %d, but received %d.",
shape[0], list_size));
for (int j = 0; j < list_size; ++j) {
PyObject *list_item = PyList_GetItem(slice_item, j);
if (list_item == Py_True) {
list_select_idxs->push_back(j);
}
}
} else {
for (int j = 0; j < list_size; ++j) {
PyObject *list_item = PyList_GetItem(slice_item, j);
if (PyCheckInteger(list_item)) {
list_select_idxs->push_back(
static_cast<int>(PyLong_AsLong(list_item)));
} else if (list_item == Py_True) {
list_select_idxs->push_back(1);
} else {
list_select_idxs->push_back(0);
}
}
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, Tensor.__indices__() only allows indexing "
"by Integers, Slices, Ellipsis, None, tuples of these types "
"and list of Bool and Integers, but received "
"%s in %dth slice item",
std::string(Py_TYPE(slice_item)->tp_name), i + 1));
}
}
// valid_index is the number of dimensions exclude None index
const int valid_indexs = size - none_axes->size() - ell_count;
PADDLE_ENFORCE_EQ(valid_indexs <= rank, true,
platform::errors::InvalidArgument(
"Too many indices (%d) for tensor of dimension %d.",
valid_indexs, rank));
}
template <typename P>
static void VarBaseCopy(std::shared_ptr<imperative::VarBase> &src, // NOLINT
imperative::VarBase &dst, // NOLINT
......
......@@ -80,6 +80,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/cuda_streams_py.h"
#include "paddle/fluid/pybind/distributed_py.h"
#include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/io.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/lod_utils.h"
......@@ -101,7 +102,6 @@ limitations under the License. */
#include "paddle/fluid/pybind/gloo_context_py.h"
#include "paddle/fluid/pybind/gloo_wrapper_py.h"
#include "paddle/fluid/pybind/heter_wrapper_py.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/inference_api.h"
#include "paddle/fluid/pybind/ir.h"
#include "paddle/fluid/pybind/metrics_py.h"
......@@ -527,6 +527,7 @@ PYBIND11_MODULE(core_avx, m) {
PYBIND11_MODULE(core_noavx, m) {
#endif
BindImperative(&m);
BindEager(&m);
BindCudaStream(&m);
......@@ -741,8 +742,6 @@ PYBIND11_MODULE(core_noavx, m) {
m.def("_promote_types_if_complex_exists",
&paddle::framework::PromoteTypesIfComplexExists);
BindImperative(&m);
py::class_<framework::Tensor> framework_tensor(m, "Tensor",
py::buffer_protocol());
g_framework_tensor_pytype =
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <Python.h>
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/scope_guard.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
namespace py = pybind11;
namespace paddle {
namespace pybind {
static bool PyCheckTensor(PyObject* obj);
static Py_ssize_t GetSliceIndexFromPyObject(PyObject* obj);
// Slice related methods
static bool PyCheckInteger(PyObject* obj) {
#if PY_VERSION_HEX < 0x03000000
return (PyLong_Check(obj) || PyInt_Check(obj)) && !PyBool_Check(obj);
#else
return PyLong_Check(obj) && !PyBool_Check(obj);
#endif
}
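// PyBool_Check is excluded above because bool is a subtype of int in Python,
// so True/False must not be silently treated as the integer positions 1/0.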
static bool IsNumpyType(PyObject* obj) {
// Judging the type of obj by its type name is not ideal; `PyArray_IsScalar`
// would be better, but that interface is not available through pybind11 alone
// and requires compiling against numpy.
auto type_name = std::string(Py_TYPE(obj)->tp_name);
return type_name == "numpy.int64" || type_name == "numpy.longlong" ||
type_name == "numpy.int32" || type_name == "numpy.int16";
}
static Py_ssize_t GetSliceIndexFromTensor(const phi::DenseTensor& tensor) {
if (tensor.numel() == 1) {
if (framework::TransToProtoVarType(tensor.type()) ==
framework::proto::VarType::INT32) {
return static_cast<Py_ssize_t>(operators::GetValue<int32_t>(&tensor));
} else if (framework::TransToProtoVarType(tensor.type()) ==
framework::proto::VarType::INT64) {
return static_cast<Py_ssize_t>(operators::GetValue<int64_t>(&tensor));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, the type of tensor in slice indices only allows "
"int32 and int64, please check the type of index tensor."));
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, tensor in slice indices only allows 1 element, "
"but received %d.",
tensor.numel()));
}
}
// NOTE(zhiqiu): Revised version of PySlice_GetIndices. From:
// https://github.com/python/cpython/blob/8d21aa21f2cbc6d50aab3f420bb23be1d081dac4/Objects/sliceobject.c#L103
// The original PySlice_GetIndices returns a wrong result when
// slice_item contains a long int, such as arr[:180L].
// It is not clear why this happens.
// Besides, PySlice_GetIndices cannot raise an error when a float appears in a
// slice item. So this is a revised version, named _PySlice_GetIndices.
// Consider switching to _PySlice_Unpack, which is more robust than
// PySlice_GetIndices, in the future.
static int _PySlice_GetIndices(PySliceObject* r, Py_ssize_t length,
Py_ssize_t* start, Py_ssize_t* stop,
Py_ssize_t* step) {
/* XXX support long ints */
if (r->step == Py_None) {
*step = 1;
} else {
if (PyCheckInteger(r->step) || IsNumpyType(r->step)) {
*step = PyLong_AsLong(r->step);
} else if (PyCheckTensor(r->step)) {
*step = GetSliceIndexFromPyObject(r->step);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, slice indices only allows None, integers, "
"tensor(int) and numpy(int) in slice item, but received %s.",
std::string(Py_TYPE(r->step)->tp_name)));
}
}
if (r->start == Py_None) {
*start = *step < 0 ? length - 1 : 0;
} else {
if (PyCheckInteger(r->start) || IsNumpyType(r->start)) {
*start = PyLong_AsLong(r->start);
} else if (PyCheckTensor(r->start)) {
*start = GetSliceIndexFromPyObject(r->start);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, slice indices only allows None, integers, "
"tensor(int) and numpy(int) in slice item, but received %s.",
std::string(Py_TYPE(r->start)->tp_name)));
}
if (*start < 0) *start += length;
*start = std::max(*start, static_cast<Py_ssize_t>(0));
}
if (r->stop == Py_None) {
*stop = *step < 0 ? -1 : length;
} else {
if (PyCheckInteger(r->stop) || IsNumpyType(r->stop)) {
*stop = PyLong_AsLong(r->stop);
} else if (PyCheckTensor(r->stop)) {
*stop = GetSliceIndexFromPyObject(r->stop);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, slice indices only allows None, integers, "
"tensor(int) and numpy(int) in slice item, but received %s.",
std::string(Py_TYPE(r->stop)->tp_name)));
}
if (0 < *step && *stop < 0) *stop += length;
*stop = std::min(*stop, length);
}
if (*stop > length) return -1;
if (*start >= length) return -1;
if (*step == 0) return -1;
return 0;
}
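// Worked example of the normalization above (illustrative, assuming
// length == 5): a slice equivalent to arr[-3::2] resolves to start = 2,
// stop = 5, step = 2, selecting positions {2, 4}; a slice equivalent to
// arr[::-1] resolves to start = 4, stop = -1, step = -1, walking the axis
// backwards.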
static void ParseIndexingSlice(
framework::LoDTensor* tensor, PyObject* _index,
std::vector<int>* slice_axes, std::vector<int>* slice_starts,
std::vector<int>* slice_ends, std::vector<int>* slice_strides,
std::vector<int>* decrease_axis, std::vector<int>* none_axes,
std::vector<int>* infer_flags, std::vector<int>* list_select_idxs,
bool* list_select_flag) {
// We allow indexing by Integers, Slices, Ellipsis, None, tuples of those
// types, and lists of Bool and Integers.
// wrap to tuple
// NOTE(zhiqiu): PyTuple_Pack increases refcount.
PyObject* index = !PyTuple_Check(_index) ? PyTuple_Pack(1, _index) : _index;
DEFINE_PADDLE_SCOPE_GUARD([index, _index]() {
if (!PyTuple_Check(_index)) {
Py_DECREF(index);
VLOG(4) << "Call Py_DECREF";
}
});
PADDLE_ENFORCE_EQ(
tensor->IsInitialized(), true,
platform::errors::InvalidArgument("tensor has not been initialized"));
const auto& shape = tensor->dims();
const int rank = shape.size();
const int size = PyTuple_GET_SIZE(index);
// specified_dims is the number of dimensions indexed by Integers or Slices.
int specified_dims = 0;
int ell_count = 0;
for (int dim = 0; dim < size; ++dim) {
PyObject* slice_item = PyTuple_GetItem(index, dim);
if (PyCheckInteger(slice_item) || PySlice_Check(slice_item)) {
specified_dims++;
} else if (slice_item == Py_Ellipsis) {
ell_count++;
}
}
PADDLE_ENFORCE_LE(ell_count, 1,
platform::errors::InvalidArgument(
"An index can only have a single ellipsis ('...')"));
int none_count = 0;
for (int i = 0, dim = 0; i < size; ++i) {
PyObject* slice_item = PyTuple_GetItem(index, i);
infer_flags->push_back(1);
int dim_len = shape[dim];
if (PyCheckInteger(slice_item) || IsNumpyType(slice_item)) {
// integer, PyLong_AsLong supports both int and long
int start = static_cast<int>(PyLong_AsLong(slice_item));
auto s_t = start;
start = start < 0 ? start + dim_len : start;
if (start >= dim_len || start < 0) {
std::string str_error_message =
"The starting index " + std::to_string(s_t) +
" of slice is out of bounds in tensor " + std::to_string(dim) +
"-th axis, it shound be in the range of [" +
std::to_string(-dim_len) + ", " + std::to_string(dim_len) + ")";
// py::index_error corresponds to IndexError in Python
// Used to indicate out of bounds access in __getitem__, __setitem__
throw py::index_error(str_error_message);
}
slice_axes->push_back(dim);
slice_starts->push_back(start);
slice_ends->push_back(start + 1);
slice_strides->push_back(1);
decrease_axis->push_back(dim);
dim++;
} else if (PySlice_Check(slice_item)) {
// slice item
Py_ssize_t start, end, step;
PySliceObject* p = reinterpret_cast<PySliceObject*>(slice_item);
_PySlice_GetIndices(p, dim_len, &start, &end, &step);
// :: or : or 0:dim_len:1
if (start == 0 && end == dim_len && step == 1) {
dim++;
continue;
}
slice_axes->push_back(dim);
slice_starts->push_back(start);
slice_ends->push_back(end);
slice_strides->push_back(step);
dim++;
} else if (slice_item == Py_Ellipsis) {
dim += rank - specified_dims;
} else if (slice_item == Py_None) {
none_axes->push_back(dim + none_count);
none_count++;
} else if (PyList_Check(slice_item)) {
*list_select_flag = true;
PADDLE_ENFORCE_EQ(
size, 1,
platform::errors::InvalidArgument(
"When index contains a list, its length is excepted to 1, "
"but received %d",
size));
bool all_bool = true;
int list_size = PyList_GET_SIZE(slice_item);
for (int j = 0; j < list_size; ++j) {
PyObject* list_item = PyList_GetItem(slice_item, j);
if (PyCheckInteger(list_item)) {
all_bool = false;
} else if (!PyBool_Check(list_item)) {
PADDLE_THROW(platform::errors::InvalidArgument(
"Only support int or bool in index list."));
}
}
if (all_bool) {
PADDLE_ENFORCE_EQ(
list_size, shape[0],
platform::errors::InvalidArgument(
"The dimension of bool index doesn't match indexed array along "
"dimension 0, the target dimension is %d, but received %d.",
shape[0], list_size));
for (int j = 0; j < list_size; ++j) {
PyObject* list_item = PyList_GetItem(slice_item, j);
if (list_item == Py_True) {
list_select_idxs->push_back(j);
}
}
} else {
for (int j = 0; j < list_size; ++j) {
PyObject* list_item = PyList_GetItem(slice_item, j);
if (PyCheckInteger(list_item)) {
list_select_idxs->push_back(
static_cast<int>(PyLong_AsLong(list_item)));
} else if (list_item == Py_True) {
list_select_idxs->push_back(1);
} else {
list_select_idxs->push_back(0);
}
}
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, Tensor.__indices__() only allows indexing "
"by Integers, Slices, Ellipsis, None, tuples of these types "
"and list of Bool and Integers, but received "
"%s in %dth slice item",
std::string(Py_TYPE(slice_item)->tp_name), i + 1));
}
}
// valid_indices is the number of dimensions, excluding None and Ellipsis indices
const int valid_indices = size - none_axes->size() - ell_count;
PADDLE_ENFORCE_EQ(valid_indices <= rank, true,
platform::errors::InvalidArgument(
"Too many indices (%d) for tensor of dimension %d.",
valid_indices, rank));
}
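// Worked example of the parsing above (illustrative, for a tensor of shape
// [3, 4, 5]): an index tuple equivalent to the Python expression
// t[1, 0:4:2, None] yields slice_axes = {0, 1}, slice_starts = {1, 0},
// slice_ends = {2, 4}, slice_strides = {1, 2}, decrease_axis = {0} (the
// integer index removes that dimension) and none_axes = {2} (the None inserts
// a new axis); infer_flags receives one entry of 1 per index item.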
} // namespace pybind
} // namespace paddle
......@@ -32,6 +32,14 @@ set(bw_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/backward_api.cc)
set(bw_api_header_file_tmp ${bw_api_header_file}.tmp)
set(bw_api_source_file_tmp ${bw_api_source_file}.tmp)
# sparse api file
set(sparse_api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api_gen.py)
set(sparse_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml)
set(sparse_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/sparse_api.h)
set(sparse_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/sparse_api.cc)
set(sparse_api_header_file_tmp ${sparse_api_header_file}.tmp)
set(sparse_api_source_file_tmp ${sparse_api_source_file}.tmp)
# wrapped infermeta file
set(wrapped_infermeta_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py)
set(api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml)
......@@ -73,6 +81,19 @@ add_custom_command(
DEPENDS ${bw_api_yaml_file} ${bw_api_gen_file} ${api_gen_base}
VERBATIM)
# generate sparse api
add_custom_command(
OUTPUT ${sparse_api_header_file} ${sparse_api_source_file}
COMMAND ${PYTHON_EXECUTABLE} ${sparse_api_gen_file}
--api_yaml_path ${sparse_api_yaml_file}
--api_header_path ${sparse_api_header_file_tmp}
--api_source_path ${sparse_api_source_file_tmp}
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_header_file_tmp} ${sparse_api_header_file}
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_source_file_tmp} ${sparse_api_source_file}
COMMENT "copy_if_different ${sparse_api_header_file} ${sparse_sparse_api_source_file}"
DEPENDS ${sparse_api_yaml_file} ${sparse_api_gen_file} ${api_gen_base}
VERBATIM)
# generate wrapped infermeta
add_custom_command(
OUTPUT ${wrapped_infermeta_header_file} ${wrapped_infermeta_source_file}
......@@ -87,12 +108,14 @@ cc_library(op_meta_info SRCS op_meta_info.cc DEPS phi_tensor_raw)
cc_library(wrapped_infermeta SRCS ${wrapped_infermeta_source_file} DEPS phi)
cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS phi_tensor_raw phi_context kernel_factory)
cc_library(api_gen_utils SRCS api_gen_utils.cc DEPS phi_tensor_raw selected_rows sparse_csr_tensor sparse_coo_tensor)
cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel data_device_transform)
cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform)
cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform)
cc_library(sparse_api_custom_impl SRCS sparse_api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform)
cc_library(sparse_api SRCS sparse_api.cc DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform)
cc_library(phi_function_api SRCS ${api_source_file} DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform api_custom_impl)
cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform)
cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch backward_infermeta phi_data_transform phi_function_api api_custom_impl)
cc_library(sparse_api SRCS sparse_api.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api_custom_impl)
cc_library(phi_function_api SRCS ${api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform api_custom_impl)
cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform)
cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta phi_data_transform phi_function_api api_custom_impl)
cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api)
......@@ -14,8 +14,8 @@ limitations under the License. */
#include "paddle/phi/api/lib/api_custom_impl.h"
#include "paddle/phi/api/lib/api_gen_utils.h"
#include "paddle/phi/api/lib/api_registry.h"
#include "paddle/phi/api/lib/api_utils.h"
#include "paddle/phi/api/lib/data_transform.h"
#include "paddle/phi/api/lib/kernel_dispatch.h"
#include "paddle/phi/api/lib/utils/storage.h"
......
......@@ -12,26 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/api/lib/utils/storage.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/phi/core/selected_rows.h"
#include "paddle/phi/api/lib/api_gen_utils.h"
namespace paddle {
namespace experimental {
/* ------------------ for input ----------------------- */
inline std::shared_ptr<phi::DenseTensor> TensorToDenseTensor(
const Tensor& tensor) {
std::shared_ptr<phi::DenseTensor> TensorToDenseTensor(const Tensor& tensor) {
return std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
}
inline std::shared_ptr<phi::DenseTensor> TensorToDenseTensor(
std::shared_ptr<phi::DenseTensor> TensorToDenseTensor(
const paddle::optional<Tensor>& tensor) {
if (tensor) {
return std::dynamic_pointer_cast<phi::DenseTensor>(tensor->impl());
......@@ -39,7 +31,7 @@ inline std::shared_ptr<phi::DenseTensor> TensorToDenseTensor(
return nullptr;
}
inline std::unique_ptr<std::vector<phi::DenseTensor>> TensorToDenseTensor(
std::unique_ptr<std::vector<phi::DenseTensor>> TensorToDenseTensor(
const std::vector<Tensor>& tensors) {
auto pt_tensors = std::make_unique<std::vector<phi::DenseTensor>>();
pt_tensors->reserve(tensors.size());
......@@ -52,12 +44,11 @@ inline std::unique_ptr<std::vector<phi::DenseTensor>> TensorToDenseTensor(
return std::move(pt_tensors);
}
inline std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(
const Tensor& tensor) {
std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(const Tensor& tensor) {
return std::dynamic_pointer_cast<phi::SelectedRows>(tensor.impl());
}
inline std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(
std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(
const paddle::optional<Tensor>& tensor) {
if (tensor) {
return std::dynamic_pointer_cast<phi::SelectedRows>(tensor->impl());
......@@ -67,11 +58,11 @@ inline std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(
/* ----------------- for infer_meta --------------------- */
inline phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor) {
phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor) {
return phi::MetaTensor(tensor);
}
inline paddle::optional<phi::MetaTensor> MakeMetaTensor(
paddle::optional<phi::MetaTensor> MakeMetaTensor(
const paddle::optional<const phi::DenseTensor&>& tensor) {
if (tensor) {
return {phi::MetaTensor(*tensor)};
......@@ -79,7 +70,7 @@ inline paddle::optional<phi::MetaTensor> MakeMetaTensor(
return {paddle::none};
}
inline std::vector<phi::MetaTensor> MakeMetaTensor(
std::vector<phi::MetaTensor> MakeMetaTensor(
const std::vector<phi::DenseTensor>& tensors) {
std::vector<phi::MetaTensor> meta_tensors;
meta_tensors.reserve(tensors.size());
......@@ -89,11 +80,11 @@ inline std::vector<phi::MetaTensor> MakeMetaTensor(
return meta_tensors;
}
inline phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor) {
phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor) {
return phi::MetaTensor(tensor);
}
inline paddle::optional<phi::MetaTensor> MakeMetaTensor(
paddle::optional<phi::MetaTensor> MakeMetaTensor(
const paddle::optional<const phi::SelectedRows&>& tensor) {
if (tensor) {
return {phi::MetaTensor(*tensor)};
......@@ -103,7 +94,7 @@ inline paddle::optional<phi::MetaTensor> MakeMetaTensor(
/* ------------------ for output ----------------------- */
inline phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) {
phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) {
if (!out->initialized()) {
auto dense_tensor = std::make_shared<phi::DenseTensor>(
phi::make_intrusive<SharedStorage>(phi::TransToPhiPlace(backend)),
......@@ -114,8 +105,9 @@ inline phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) {
return static_cast<phi::DenseTensor*>(out->impl().get());
}
inline std::vector<phi::DenseTensor*> SetKernelOutput(
size_t out_size, Backend backend, std::vector<Tensor>* out) {
std::vector<phi::DenseTensor*> SetKernelOutput(size_t out_size,
Backend backend,
std::vector<Tensor>* out) {
out->reserve(out_size);
std::vector<phi::DenseTensor*> results(out_size);
for (size_t i = 0; i < out_size; ++i) {
......@@ -129,8 +121,7 @@ inline std::vector<phi::DenseTensor*> SetKernelOutput(
return results;
}
inline phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend,
Tensor* out) {
phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend, Tensor* out) {
if (!out->initialized()) {
auto select_rows = std::make_shared<phi::SelectedRows>();
out->set_impl(select_rows);
......@@ -139,5 +130,29 @@ inline phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend,
return static_cast<phi::SelectedRows*>(out->impl().get());
}
phi::TensorBase* SetSparseKernelOutput(Tensor* out, TensorType type) {
if (!out->initialized()) {
if (type == TensorType::SPARSE_COO) {
auto sparse_tensor = std::make_shared<phi::SparseCooTensor>(
phi::DenseTensor(), phi::DenseTensor(), phi::DDim{-1});
out->set_impl(sparse_tensor);
return sparse_tensor.get();
} else if (type == TensorType::SPARSE_CSR) {
auto sparse_tensor =
std::make_shared<phi::SparseCsrTensor>(phi::DenseTensor(),
phi::DenseTensor(),
phi::DenseTensor(),
phi::DDim{-1});
out->set_impl(sparse_tensor);
return sparse_tensor.get();
} else {
auto dense_tensor = std::make_shared<phi::DenseTensor>();
out->set_impl(dense_tensor);
return dense_tensor.get();
}
}
return out->impl().get();
}
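// A minimal usage sketch (hypothetical caller, mirroring what the generated
// sparse API functions are expected to do): allocate the output holder first,
// then pass the raw TensorBase* to the selected kernel.
//
//   Tensor out;
//   phi::TensorBase* kernel_out =
//       SetSparseKernelOutput(&out, TensorType::SPARSE_COO);
//   // ... launch the sparse kernel, writing its result into kernel_out ...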
} // namespace experimental
} // namespace paddle
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/api/lib/utils/storage.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/phi/core/selected_rows.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
namespace paddle {
namespace experimental {
enum class TensorType { DENSE_TENSOR, SPARSE_CSR, SPARSE_COO };
/* ------------------ for input ----------------------- */
std::shared_ptr<phi::DenseTensor> TensorToDenseTensor(const Tensor& tensor);
std::shared_ptr<phi::DenseTensor> TensorToDenseTensor(
const paddle::optional<Tensor>& tensor);
std::unique_ptr<std::vector<phi::DenseTensor>> TensorToDenseTensor(
const std::vector<Tensor>& tensors);
std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(const Tensor& tensor);
std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(
const paddle::optional<Tensor>& tensor);
/* ----------------- for infer_meta --------------------- */
phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor);
paddle::optional<phi::MetaTensor> MakeMetaTensor(
const paddle::optional<const phi::DenseTensor&>& tensor);
std::vector<phi::MetaTensor> MakeMetaTensor(
const std::vector<phi::DenseTensor>& tensors);
phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor);
paddle::optional<phi::MetaTensor> MakeMetaTensor(
const paddle::optional<const phi::SelectedRows&>& tensor);
/* ------------------ for output ----------------------- */
phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out);
std::vector<phi::DenseTensor*> SetKernelOutput(size_t out_size,
Backend backend,
std::vector<Tensor>* out);
phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend, Tensor* out);
phi::TensorBase* SetSparseKernelOutput(Tensor* out, TensorType type);
} // namespace experimental
} // namespace paddle
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/api/include/sparse_api.h"
#include "paddle/phi/api/lib/sparse_api_custom_impl.h"
#include <memory>
#include "glog/logging.h"
......@@ -20,29 +20,12 @@ limitations under the License. */
#include "paddle/phi/api/lib/kernel_dispatch.h"
#include "paddle/phi/api/lib/utils/storage.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/infermeta/unary.h"
PD_DECLARE_KERNEL(dense_to_sparse_coo, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sparse_csr_to_coo, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(dense_to_sparse_csr, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sparse_coo_to_csr, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sparse_coo_to_dense, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sparse_csr_to_dense, CPU, ALL_LAYOUT);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_DECLARE_KERNEL(dense_to_sparse_coo, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sparse_csr_to_coo, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(dense_to_sparse_csr, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sparse_coo_to_csr, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sparse_coo_to_dense, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sparse_csr_to_dense, GPU, ALL_LAYOUT);
#endif
namespace paddle {
namespace experimental {
namespace sparse {
PADDLE_API Tensor to_sparse_coo(const Tensor& x,
Tensor to_sparse_coo_impl(const Tensor& x,
Backend backend,
const int64_t sparse_dim) {
if (x.layout() == phi::DataLayout::SPARSE_COO) {
......@@ -105,7 +88,7 @@ PADDLE_API Tensor to_sparse_coo(const Tensor& x,
return out;
}
PADDLE_API Tensor to_sparse_csr(const Tensor& x, Backend backend) {
Tensor to_sparse_csr_impl(const Tensor& x, Backend backend) {
if (x.layout() == phi::DataLayout::SPARSE_CSR) {
return x;
}
......@@ -171,7 +154,7 @@ PADDLE_API Tensor to_sparse_csr(const Tensor& x, Backend backend) {
return out;
}
PADDLE_API Tensor to_dense(const Tensor& x, Backend backend) {
Tensor to_dense_impl(const Tensor& x, Backend backend) {
if (x.layout() != phi::DataLayout::SPARSE_CSR &&
x.layout() != phi::DataLayout::SPARSE_COO) {
return x;
......
......@@ -21,13 +21,13 @@ namespace paddle {
namespace experimental {
namespace sparse {
PADDLE_API Tensor to_sparse_coo(const Tensor& x,
Tensor to_dense_impl(const Tensor& x, Backend backend);
Tensor to_sparse_coo_impl(const Tensor& x,
Backend backend,
const int64_t sparse_dim);
PADDLE_API Tensor to_sparse_csr(const Tensor& x, Backend backend);
PADDLE_API Tensor to_dense(const Tensor& x, Backend backend);
Tensor to_sparse_csr_impl(const Tensor& x, Backend backend);
} // namespace sparse
} // namespace experimental
......
......@@ -145,6 +145,7 @@ class SparseCooTensor : public TensorBase,
void* AllocateFrom(Allocator* allocator,
DataType dtype,
size_t requested_size = 0) override;
void set_dims(const DDim& dims) { this->dims_ = dims; }
private:
// save the indices of non zero elements in original dense tensor
......
......@@ -348,4 +348,17 @@ void BCELossInferMeta(const MetaTensor& input,
out->share_lod(input);
}
void GatherTreeMeta(const MetaTensor& ids,
const MetaTensor& parents,
MetaTensor* out) {
auto ids_dims = ids.dims();
auto parents_dims = parents.dims();
PADDLE_ENFORCE_EQ(ids_dims == parents_dims,
true,
phi::errors::InvalidArgument(
"The shape of Input(Parents) must be same with the "
"shape of Input(Ids)."));
out->set_dims(ids_dims);
}
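// Example of the shape inference above: for beam-search outputs where ids and
// parents both have shape [max_time, batch_size, beam_size], e.g. [10, 4, 8],
// the output is inferred to have the same shape [10, 4, 8]; mismatched shapes
// trigger the InvalidArgument error.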
} // namespace phi
......@@ -68,4 +68,8 @@ void BCELossInferMeta(const MetaTensor& input,
const MetaTensor& label,
MetaTensor* out,
MetaConfig config = MetaConfig());
void GatherTreeMeta(const MetaTensor& ids,
const MetaTensor& parents,
MetaTensor* out);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void BatchNormGradRawKernel(const Context& dev_ctx,
const DenseTensor& y_grad,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
paddle::optional<const DenseTensor&> reserve_space,
paddle::optional<const DenseTensor&> mean,
paddle::optional<const DenseTensor&> variance,
float momentum,
float epsilon,
const std::string& data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
bool is_inplace,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* bias_grad);
template <typename T, typename Context>
void BatchNormGradKernel(const Context& dev_ctx,
const DenseTensor& y_grad,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
paddle::optional<const DenseTensor&> reserve_space,
paddle::optional<const DenseTensor&> mean,
paddle::optional<const DenseTensor&> variance,
float momentum,
float epsilon,
const std::string& data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* bias_grad);
template <typename T, typename Context>
void BatchNormDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& x_grad_grad,
const DenseTensor& scale_grad_grad,
const DenseTensor& bias_grad_grad,
const DenseTensor& y_grad,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
paddle::optional<const DenseTensor&> mean,
paddle::optional<const DenseTensor&> variance,
float momentum,
float epsilon,
const std::string& data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* y_grad_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void BatchNormKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& mean,
const DenseTensor& variance,
float momentum,
float epsilon,
const std::string& data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor* y,
DenseTensor* mean_out,
DenseTensor* variance_out,
DenseTensor* saved_mean,
DenseTensor* saved_variance,
DenseTensor* reserve_space);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/batch_norm_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/gpu/batch_norm_utils.h"
namespace phi {
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T, typename Context>
void BatchNormGradRawKernel(const Context& ctx,
const DenseTensor& y_grad,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
paddle::optional<const DenseTensor&> reserve_space,
paddle::optional<const DenseTensor&> mean,
paddle::optional<const DenseTensor&> variance,
float momentum,
float epsilon,
const std::string& data_layout_str,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
bool is_inplace,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* bias_grad) {
const auto* d_y = &y_grad;
DataLayout data_layout =
paddle::framework::StringToDataLayout(data_layout_str);
auto* d_x = x_grad;
auto* d_scale = scale_grad;
auto* d_bias = bias_grad;
use_global_stats = is_test || use_global_stats;
// batch_norm with inplace == false takes X as the grad input, which is
// the same as the cuDNN batch_norm backward calculation; batch_norm with
// inplace == true only takes Y as input, and X has to be reconstructed
// by the inverse operation of batch_norm on Y
if (is_inplace) {
if (d_x) {
PADDLE_ENFORCE_EQ(d_x,
d_y,
phi::errors::InvalidArgument(
"X@GRAD and Y@GRAD inplaced in non-inplace mode"));
}
} else {
if (d_x) {
PADDLE_ENFORCE_NE(d_x,
d_y,
phi::errors::InvalidArgument(
"X@GRAD and Y@GRAD inplaced in non-inplace mode"));
}
}
// Get the size for each dimension.
// NCHW [batch_size, in_channels, in_height, in_width]
const auto& x_dims = x.dims();
PADDLE_ENFORCE_GE(
x_dims.size(),
2,
phi::errors::InvalidArgument(
"The size of input X's dimensions should be larger than 1."
"But received: the size of input X's dimensions is [%d]",
x_dims.size()));
PADDLE_ENFORCE_LE(
x_dims.size(),
5,
phi::errors::InvalidArgument(
"The size of input X's dimensions should be less than 6."
"But received: the size of input X's dimensions is [%d]",
x_dims.size()));
const int N = x_dims[0];
const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
const int sample_size = x.numel() / N / C;
// input dimension is 2 and the format is NCHW. The input can be regarded as
// NHWC format
if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) {
data_layout = DataLayout::kNHWC;
}
// init output
if (d_x) {
ctx.template Alloc<T>(d_x);
}
const T* mean_data = saved_mean.data<T>();
const T* inv_var_data = saved_variance.data<T>();
DenseTensor inv_var_tensor;
if (use_global_stats) {
const auto* running_mean = mean.get_ptr();
const auto* running_variance = variance.get_ptr();
mean_data = running_mean->data<T>();
inv_var_tensor.Resize({C});
T* running_inv_var_data = ctx.template Alloc<T>(&inv_var_tensor);
EigenVectorArrayMap<T> inv_var_tmp(running_inv_var_data, C);
ConstEigenVectorArrayMap<T> var_arr(running_variance->data<T>(), C);
inv_var_tmp = (var_arr + epsilon).sqrt().inverse();
inv_var_data = running_inv_var_data;
}
ConstEigenVectorArrayMap<T> scale_arr(scale.data<T>(), C);
ConstEigenVectorArrayMap<T> bias_arr(bias.data<T>(), C);
ConstEigenVectorArrayMap<T> mean_arr(mean_data, C);
ConstEigenVectorArrayMap<T> inv_var_arr(inv_var_data, C);
T* d_bias_data = nullptr;
T* d_scale_data = nullptr;
if (d_scale && d_bias) {
d_bias_data = ctx.template Alloc<T>(d_bias);
d_scale_data = ctx.template Alloc<T>(d_scale);
}
// d_bias = np.sum(d_y, axis=0)
// d_scale = np.sum((X - mean) * inv_var * d_y, axis=0)
// d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0)
// - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0))
EigenVectorArrayMap<T> d_bias_arr(d_bias_data, C);
EigenVectorArrayMap<T> d_scale_arr(d_scale_data, C);
if (d_scale && d_bias) {
d_bias_arr.setZero();
d_scale_arr.setZero();
}
if (d_x && (N * sample_size) == 1 && !use_global_stats) {
paddle::framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
return;
}
int scale_coefff = use_global_stats ? 1 : N * sample_size;
const auto scale_inv_var_nhw = scale_arr * inv_var_arr / scale_coefff;
DenseTensor dy_sum;
dy_sum.Resize({C});
auto dy_sum_data = ctx.template Alloc<T>(&dy_sum);
EigenVectorArrayMap<T> dy_sum_arr(dy_sum_data, C);
DenseTensor dy_mul_x_sub_mean_mul_invstd_sum;
dy_mul_x_sub_mean_mul_invstd_sum.Resize({C});
auto dy_mul_x_sub_mean_mul_invstd_sum_data =
ctx.template Alloc<T>(&dy_mul_x_sub_mean_mul_invstd_sum);
EigenVectorArrayMap<T> dy_mul_x_sub_mean_mul_invstd_sum_arr(
dy_mul_x_sub_mean_mul_invstd_sum_data, C);
dy_sum_arr.setZero();
dy_mul_x_sub_mean_mul_invstd_sum_arr.setZero();
// inplace calculation
// Y: (x - est_mean) * inv_var * scale + bias
// formula transform ====>
// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
// X: (y - bias) / scale / (inv_var) + est_mean
// formula transform ====>
// (y - bias) / (scale * inv_var) + est_mean
switch (data_layout) {
case DataLayout::kNCHW: {
if (is_inplace) {
auto px = x;
EigenArrayMap<T> x_data(ctx.template Alloc<T>(&px), sample_size, N * C);
ConstEigenArrayMap<T> y_data(x.data<T>(), sample_size, N * C);
for (int nc = 0; nc < N * C; ++nc) {
x_data.col(nc) = (y_data.col(nc) - bias_arr(nc % C)) /
scale_inv_var_nhw(nc % C) / scale_coefff +
mean_arr(nc % C);
}
}
ConstEigenArrayMap<T> x_arr(x.data<T>(), sample_size, N * C);
ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), sample_size, N * C);
for (int nc = 0; nc < N * C; ++nc) {
int c = nc % C;
dy_sum_arr(c) += d_y_arr.col(nc).sum();
dy_mul_x_sub_mean_mul_invstd_sum_arr(c) +=
((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc))
.sum();
}
if (d_scale && d_bias) {
d_bias_arr = dy_sum_arr;
d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr;
}
if (d_x) {
EigenArrayMap<T> d_x_arr(
ctx.template Alloc<T>(d_x), sample_size, N * C);
if (!use_global_stats) {
for (int nc = 0; nc < N * C; ++nc) {
int c = nc % C;
d_x_arr.col(nc) =
scale_inv_var_nhw(c) *
(d_y_arr.col(nc) * N * sample_size - dy_sum_arr(c) -
(x_arr.col(nc) - mean_arr[c]) *
dy_mul_x_sub_mean_mul_invstd_sum_arr(c) * inv_var_arr(c));
}
} else {
for (int nc = 0; nc < N * C; ++nc) {
int c = nc % C;
d_x_arr.col(nc) = scale_inv_var_nhw(c) * d_y_arr.col(nc);
}
}
}
break;
}
case DataLayout::kNHWC: {
if (is_inplace) {
auto px = x;
EigenArrayMap<T> x_data(ctx.template Alloc<T>(&px), C, N * sample_size);
ConstEigenArrayMap<T> y_data(x.data<T>(), C, N * sample_size);
for (int nhw = 0; nhw < N * sample_size; nhw++) {
x_data.col(nhw) =
(y_data.col(nhw) - bias_arr) / scale_inv_var_nhw / scale_coefff +
mean_arr;
}
}
ConstEigenArrayMap<T> x_arr(x.data<T>(), C, N * sample_size);
ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), C, N * sample_size);
for (int nhw = 0; nhw < N * sample_size; ++nhw) {
dy_sum_arr += d_y_arr.col(nhw);
dy_mul_x_sub_mean_mul_invstd_sum_arr +=
(x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw);
}
if (d_scale && d_bias) {
d_bias_arr = dy_sum_arr;
d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr;
}
if (d_x) {
EigenArrayMap<T> d_x_arr(
ctx.template Alloc<T>(d_x), C, N * sample_size);
if (!use_global_stats) {
for (int nhw = 0; nhw < N * sample_size; ++nhw) {
d_x_arr.col(nhw) =
scale_inv_var_nhw *
(d_y_arr.col(nhw) * N * sample_size - dy_sum_arr -
(x_arr.col(nhw) - mean_arr) *
dy_mul_x_sub_mean_mul_invstd_sum_arr * inv_var_arr);
}
} else {
for (int nhw = 0; nhw < N * sample_size; ++nhw) {
d_x_arr.col(nhw) = scale_inv_var_nhw * d_y_arr.col(nhw);
}
}
}
break;
}
default:
PADDLE_THROW(phi::errors::InvalidArgument("Unknown storage order: %s",
data_layout_str));
}
}
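// Summary of the backward pass above (with x_hat = (x - mean) * inv_var and
// m = N * sample_size, matching the per-channel reductions):
//   d_bias  = sum(d_y)
//   d_scale = sum(d_y * x_hat)
//   d_x     = scale * inv_var / m *
//             (m * d_y - sum(d_y) - x_hat * sum(d_y * x_hat))
// and, when use_global_stats is true, d_x reduces to scale * inv_var * d_y.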
template <typename T, typename Context>
void BatchNormGradKernel(const Context& dev_ctx,
const DenseTensor& y_grad,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
paddle::optional<const DenseTensor&> reserve_space,
paddle::optional<const DenseTensor&> mean,
paddle::optional<const DenseTensor&> variance,
float momentum,
float epsilon,
const std::string& data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* bias_grad) {
BatchNormGradRawKernel<T, Context>(dev_ctx,
y_grad,
x,
scale,
bias,
saved_mean,
saved_variance,
reserve_space,
mean,
variance,
momentum,
epsilon,
data_layout,
is_test,
use_global_stats,
trainable_statistics,
fuse_with_relu,
false,
x_grad,
scale_grad,
bias_grad);
}
template <typename T, typename Context>
void BatchNormDoubleGradKernel(const Context& ctx,
const DenseTensor& x_grad_grad,
const DenseTensor& scale_grad_grad,
const DenseTensor& bias_grad_grad,
const DenseTensor& y_grad,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
paddle::optional<const DenseTensor&> mean,
paddle::optional<const DenseTensor&> variance,
float momentum,
float epsilon,
const std::string& data_layout_str,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* y_grad_grad) {
const auto* X = &x;
const auto* Scale = &scale;
const auto* dY = &y_grad;
const auto* Saved_mean = &saved_mean;
const auto* Saved_variance = &saved_variance;
PADDLE_ENFORCE_EQ(is_test,
false,
phi::errors::InvalidArgument(
"`is_test = True` CANNOT be used in train program. If "
"you want to use global status in pre_train model, "
"please set `use_global_stats = True`"));
const auto data_layout =
paddle::framework::StringToDataLayout(data_layout_str);
const auto* ddX = &x_grad_grad;
const auto* ddScale = &scale_grad_grad;
const auto* ddBias = &bias_grad_grad;
auto* dX = x_grad;
auto* dScale = scale_grad;
auto* ddY = y_grad_grad;
ctx.template Alloc<T>(dX);
ctx.template Alloc<T>(ddY);
const auto& x_dims = X->dims();
const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
const int sample_size = X->numel() / C;
phi::funcs::SetConstant<Context, T> set_constant;
const T* mean_data = Saved_mean->data<T>();
const T* inv_var_data = Saved_variance->data<T>();
DenseTensor inv_var_tensor;
if (use_global_stats) {
const auto* running_mean = mean.get_ptr();
const auto* running_variance = variance.get_ptr();
mean_data = running_mean->data<T>();
inv_var_tensor.Resize({C});
T* running_inv_var_data = ctx.template Alloc<T>(&inv_var_tensor);
EigenVectorArrayMap<T> inv_var_tmp(running_inv_var_data, C);
ConstEigenVectorArrayMap<T> var_arr(running_variance->data<T>(), C);
inv_var_tmp = (var_arr + epsilon).sqrt().inverse();
inv_var_data = running_inv_var_data;
}
// transpose NCHW -> NHWC for easier calculation
DenseTensor transformed_x(X->type());
DenseTensor transformed_dy(dY->type());
DenseTensor transformed_ddx(ddX->type());
DenseTensor transformed_dx(dX->type());
DenseTensor transformed_ddy(ddY->type());
if (data_layout == DataLayout::kNCHW && x_dims.size() > 2) {
VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
// Input Tensor
ResizeToChannelLast<Context, T>(ctx, X, &transformed_x);
TransToChannelLast<Context, T>(ctx, X, &transformed_x);
ResizeToChannelLast<Context, T>(ctx, dY, &transformed_dy);
TransToChannelLast<Context, T>(ctx, dY, &transformed_dy);
ResizeToChannelLast<Context, T>(ctx, ddX, &transformed_ddx);
TransToChannelLast<Context, T>(ctx, ddX, &transformed_ddx);
// Output Tensor
ResizeToChannelLast<Context, T>(ctx, dX, &transformed_dx);
ResizeToChannelLast<Context, T>(ctx, ddY, &transformed_ddy);
} else {
transformed_x.ShareDataWith(*X);
transformed_dy.ShareDataWith(*dY);
transformed_ddx.ShareDataWith(*ddX);
transformed_dx.ShareDataWith(*dX);
transformed_ddy.ShareDataWith(*ddY);
}
ConstEigenArrayMap<T> x_arr(transformed_x.data<T>(), C, sample_size);
ConstEigenVectorArrayMap<T> mean_arr(mean_data, C);
ConstEigenVectorArrayMap<T> inv_var_arr(inv_var_data, C);
Tensor mean_tile;
mean_tile.Resize({C, sample_size});
EigenArrayMap<T> mean_tile_data(
ctx.template Alloc<T>(&mean_tile), C, sample_size);
DenseTensor inv_var_tile;
inv_var_tile.Resize({C, sample_size});
EigenArrayMap<T> inv_var_tile_data(
ctx.template Alloc<T>(&inv_var_tile), C, sample_size);
mean_tile_data = mean_arr.replicate(1, sample_size);
inv_var_tile_data = inv_var_arr.replicate(1, sample_size);
DenseTensor Scale_data;
if (!Scale) {
Scale_data.Resize({C});
ctx.template Alloc<T>(&Scale_data);
set_constant(ctx, &Scale_data, static_cast<T>(1));
}
ConstEigenVectorArrayMap<T> scale_arr(
Scale ? Scale->data<T>() : Scale_data.data<T>(), C);
Tensor scale_tile;
scale_tile.Resize({C, sample_size});
EigenArrayMap<T> scale_tile_data(
ctx.template Alloc<T>(&scale_tile), C, sample_size);
scale_tile_data = scale_arr.replicate(1, sample_size);
ConstEigenArrayMap<T> dy_arr(transformed_dy.data<T>(), C, sample_size);
ConstEigenArrayMap<T> ddx_arr(transformed_ddx.data<T>(), C, sample_size);
DenseTensor x_sub_mean_mul_invstd;
x_sub_mean_mul_invstd.Resize({C, sample_size});
EigenArrayMap<T> x_sub_mean_mul_invstd_arr(
ctx.template Alloc<T>(&x_sub_mean_mul_invstd), C, sample_size);
x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data;
if (dX) {
ctx.template Alloc<T>(dX);
EigenArrayMap<T> dx_arr(
ctx.template Alloc<T>(&transformed_dx), C, sample_size);
dx_arr.setZero();
if (use_global_stats) {
// math: dx = (ddscale * dy) * inv_var
if (ddScale) {
ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
Tensor ddscale_tile;
ddscale_tile.Resize({C, sample_size});
EigenArrayMap<T> ddscale_tile_data(
ctx.template Alloc<T>(&ddscale_tile), C, sample_size);
ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
dx_arr = dy_arr * ddscale_tile_data * inv_var_tile_data;
}
} else {
// math: dx = scale * ((x - mean) * inv_var / NxHxW *
//               (np.mean(ddx, axis=(n,h,w)) * np.sum(dy, axis=(n,h,w)) -
//                np.sum(dy * ddx, axis=(n,h,w)) +
//                3 * np.mean(dy * (x - mean), axis=(n,h,w)) * inv_var.pow(2) *
//                  np.sum(ddx * (x - mean), axis=(n,h,w))) +
//              inv_var.pow(3) / NxHxW * np.sum(ddx * (x - mean)) *
//                (np.mean(dy, axis=(n,h,w)) - dy) +
//              inv_var.pow(3) / NxHxW * np.sum(dy, axis=(n,h,w)) * (x - mean) *
//                (np.mean(ddx, axis=(n,h,w)) - ddx)) +
//             ddr * (dy * inv_var - inv_var * np.mean(dy, axis=(n,h,w)) -
//                    inv_var.pow(3) * (x - mean) *
//                      np.mean(dy * (x - mean), axis=(n,h,w)))
if (ddX) {
dx_arr +=
(x_sub_mean_mul_invstd_arr * inv_var_tile_data * inv_var_tile_data /
sample_size)
.colwise() *
(ddx_arr.rowwise().sum() * dy_arr.rowwise().sum() / sample_size -
(dy_arr * ddx_arr).rowwise().sum() +
3. * (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() *
(ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() /
sample_size);
dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() *
(ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() /
sample_size * (dy_arr.rowwise().sum() / sample_size - dy_arr);
dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() *
(dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() /
sample_size *
(ddx_arr.rowwise().sum() / sample_size - ddx_arr);
dx_arr = scale_tile_data * dx_arr;
}
if (ddScale) {
ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
Tensor ddscale_tile;
ddscale_tile.Resize({C, sample_size});
EigenArrayMap<T> ddscale_tile_data(
ctx.template Alloc<T>(&ddscale_tile), C, sample_size);
ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
dx_arr +=
(dy_arr * inv_var_tile_data -
(dy_arr.rowwise().sum().replicate(1, sample_size) / sample_size) *
inv_var_tile_data -
x_sub_mean_mul_invstd_arr * inv_var_tile_data *
(dy_arr * x_sub_mean_mul_invstd_arr)
.rowwise()
.sum()
.replicate(1, sample_size) /
sample_size) *
ddscale_tile_data;
}
}
if (data_layout == DataLayout::kNCHW) {
VLOG(3) << "Transform batchnorm output from NHWC to NCHW";
TransToChannelFirst<Context, T>(ctx, &transformed_dx, dX);
}
}
if (dScale) {
EigenVectorArrayMap<T> dscale_arr(ctx.template Alloc<T>(dScale), C);
dscale_arr.setZero();
if (use_global_stats) {
// math: dscale = np.sum(ddx * dy, axis=(n,h,w)) * inv_var
if (ddX) {
dscale_arr = (ddx_arr * dy_arr * inv_var_tile_data).rowwise().sum();
}
} else {
// math: dscale = inv_var * (dy - np.mean(dy, axis=(n,h,w)) - (x - mean) *
//                inv_var.pow(2) * np.mean(dy * (x - mean), axis=(n,h,w))) *
//                ddx
if (ddX) {
Tensor first_grad;
first_grad.Resize({C, sample_size});
EigenArrayMap<T> first_grad_arr(
ctx.template Alloc<T>(&first_grad), C, sample_size);
first_grad_arr.setZero();
first_grad_arr +=
inv_var_tile_data *
(dy_arr -
dy_arr.rowwise().sum().replicate(1, sample_size) / sample_size -
x_sub_mean_mul_invstd_arr *
(dy_arr * x_sub_mean_mul_invstd_arr)
.rowwise()
.sum()
.replicate(1, sample_size) /
sample_size);
dscale_arr = (first_grad_arr * ddx_arr).rowwise().sum();
}
}
}
if (ddY) {
ctx.template Alloc<T>(ddY);
EigenArrayMap<T> ddy_arr(
ctx.template Alloc<T>(&transformed_ddy), C, sample_size);
ddy_arr.setZero();
if (use_global_stats) {
// math: ddy = r * ddx * inv_var + ddbias +
// ddscale * (x - mean) * inv_var
if (ddX) {
ddy_arr = scale_tile_data * ddx_arr * inv_var_tile_data;
}
} else {
// math: ddy = (x - mean) * inv_var * ddscale + ddbias +
// scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) *
// np.mean(ddx * (x - mean), axis=(n,h,w)))
if (ddX) {
ddy_arr +=
scale_tile_data * inv_var_tile_data *
(ddx_arr -
ddx_arr.rowwise().sum().replicate(1, sample_size) / sample_size -
x_sub_mean_mul_invstd_arr *
(ddx_arr * x_sub_mean_mul_invstd_arr)
.rowwise()
.sum()
.replicate(1, sample_size) /
sample_size);
}
}
if (ddScale) {
ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
Tensor ddscale_tile;
ddscale_tile.Resize({C, sample_size});
EigenArrayMap<T> ddscale_tile_data(
ctx.template Alloc<T>(&ddscale_tile), C, sample_size);
ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data;
}
if (ddBias) {
ConstEigenVectorArrayMap<T> ddbias_arr(ddBias->data<T>(), C);
Tensor ddbias_tile;
ddbias_tile.Resize({C, sample_size});
EigenArrayMap<T> ddbias_tile_data(
ctx.template Alloc<T>(&ddbias_tile), C, sample_size);
ddbias_tile_data = ddbias_arr.replicate(1, sample_size);
ddy_arr += ddbias_tile_data;
}
if (data_layout == DataLayout::kNCHW) {
VLOG(3) << "Transform batchnorm output from NHWC to NCHW";
TransToChannelFirst<Context, T>(ctx, &transformed_ddy, ddY);
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(
batch_norm_grad, CPU, ALL_LAYOUT, phi::BatchNormGradKernel, float, double) {
}
PD_REGISTER_KERNEL(batch_norm_grad_raw,
CPU,
ALL_LAYOUT,
phi::BatchNormGradRawKernel,
float,
double) {}
PD_REGISTER_KERNEL(batch_norm_grad_grad,
CPU,
ALL_LAYOUT,
phi::BatchNormDoubleGradKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/batch_norm_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/fluid/framework/tensor_util.h"
namespace phi {
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T, typename Context>
void BatchNormKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& mean,
const DenseTensor& variance,
float momentum,
float epsilon,
const std::string& data_layout_str,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor* y,
DenseTensor* mean_out,
DenseTensor* variance_out,
DenseTensor* saved_mean,
DenseTensor* saved_variance,
DenseTensor* reserve_space) {
bool test_mode = is_test && (!trainable_statistics);
bool global_stats = test_mode || use_global_stats;
auto data_layout = paddle::framework::StringToDataLayout(data_layout_str);
const auto& x_dims = x.dims();
PADDLE_ENFORCE_GE(
x_dims.size(),
2,
phi::errors::InvalidArgument(
"The size of input X's dimensions should be larger than 1."
"But received: the size of input X's dimensions is [%d]",
x_dims.size()));
PADDLE_ENFORCE_LE(
x_dims.size(),
5,
phi::errors::InvalidArgument(
"The size of input X's dimensions should be less than 6."
"But received: the size of input X's dimensionss is [%d]",
x_dims.size()));
const int N = x_dims[0];
const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
const int sample_size = x.numel() / N / C;
// alloc memory
ctx.template Alloc<T>(y);
ctx.template Alloc<T>(mean_out);
ctx.template Alloc<T>(variance_out);
ctx.template Alloc<T>(saved_mean);
ctx.template Alloc<T>(saved_variance);
// input dimension is 2 and the format is NCHW. The input can be regarded
// as NHWC format
if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) {
data_layout = DataLayout::kNHWC;
}
if (!global_stats) {
// saved_mean and saved_variance are used only for this batch of data
EigenVectorArrayMap<T> saved_mean_e(ctx.template Alloc<T>(saved_mean), C);
EigenVectorArrayMap<T> saved_variance_e(
ctx.template Alloc<T>(saved_variance), C);
saved_mean_e.setZero();
saved_variance_e.setZero();
EigenVectorArrayMap<T> running_mean_arr(ctx.template Alloc<T>(mean_out), C);
EigenVectorArrayMap<T> running_var_arr(ctx.template Alloc<T>(variance_out),
C);
if ((N * sample_size) == 1) {
// Only 1 element in normalization dimension,
// we skip the batch norm calculation, let y = x.
paddle::framework::TensorCopy(x, ctx.GetPlace(), y);
return;
}
switch (data_layout) {
case DataLayout::kNCHW: {
ConstEigenArrayMap<T> x_arr(x.data<T>(), sample_size, N * C);
for (int nc = 0; nc < N * C; ++nc) {
saved_mean_e(nc % C) += x_arr.col(nc).sum();
}
saved_mean_e /= N * sample_size;
for (int nc = 0; nc < N * C; ++nc) {
saved_variance_e(nc % C) +=
(x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm();
}
saved_variance_e /= N * sample_size;
break;
}
case DataLayout::kNHWC: {
ConstEigenArrayMap<T> x_arr(x.data<T>(), C, N * sample_size);
for (int i = 0; i < N * sample_size; ++i) {
saved_mean_e += x_arr.col(i);
}
saved_mean_e /= N * sample_size;
for (int i = 0; i < N * sample_size; ++i) {
saved_variance_e +=
(x_arr.col(i) - saved_mean_e) * (x_arr.col(i) - saved_mean_e);
}
saved_variance_e /= N * sample_size;
break;
}
default:
PADDLE_THROW(phi::errors::InvalidArgument("Unknown storage order: %s",
data_layout_str));
}
// if MomentumTensor is set, use its value; momentum
// is only used in this training branch
running_mean_arr =
running_mean_arr * momentum + saved_mean_e * (1. - momentum);
running_var_arr =
running_var_arr * momentum + saved_variance_e * (1. - momentum);
}
// use SavedMean and SavedVariance to do normalize
Eigen::Array<T, Eigen::Dynamic, 1> inv_std(C);
if (global_stats) {
ConstEigenVectorArrayMap<T> var_arr(variance.data<T>(), C);
inv_std = (var_arr + epsilon).sqrt().inverse();
} else {
EigenVectorArrayMap<T> saved_inv_std(saved_variance->data<T>(), C);
// invert SavedVariance first; the gradient will use it too.
saved_inv_std = (saved_inv_std + epsilon).inverse().sqrt();
inv_std = saved_inv_std;
}
ConstEigenVectorArrayMap<T> mean_arr(
global_stats ? mean.data<T>() : saved_mean->data<T>(), C);
// y = (x - est_mean) * inv_var * scale + bias
// formula transform ====>
// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
ConstEigenVectorArrayMap<T> scale_arr(scale.data<T>(), C);
ConstEigenVectorArrayMap<T> bias_arr(bias.data<T>(), C);
Eigen::Array<T, Eigen::Dynamic, 1> new_scale = inv_std * scale_arr;
Eigen::Array<T, Eigen::Dynamic, 1> new_bias =
bias_arr - mean_arr * inv_std * scale_arr;
switch (data_layout) {
case DataLayout::kNCHW: {
EigenArrayMap<T> y_arr(ctx.template Alloc<T>(y), sample_size, N * C);
ConstEigenArrayMap<T> x_arr(x.data<T>(), sample_size, N * C);
for (int nc = 0; nc < N * C; ++nc) {
y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C);
}
break;
}
case DataLayout::kNHWC: {
EigenArrayMap<T>(ctx.template Alloc<T>(y), C, N * sample_size) =
(ConstEigenArrayMap<T>(x.data<T>(), C, N * sample_size).colwise() *
new_scale)
.colwise() +
new_bias;
break;
}
default:
PADDLE_THROW(phi::errors::InvalidArgument("Unknown storage order: %d",
data_layout));
}
}
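// The kernel above evaluates, per channel c,
//   y = (x - mean_c) * inv_std_c * scale_c + bias_c
// but folds it into a single fused multiply-add per element:
//   y = x * new_scale_c + new_bias_c, where
//   new_scale_c = inv_std_c * scale_c and
//   new_bias_c  = bias_c - mean_c * inv_std_c * scale_c.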
} // namespace phi
PD_REGISTER_KERNEL(
batch_norm, CPU, ALL_LAYOUT, phi::BatchNormKernel, float, double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/gaussian_random_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/framework/generator.h"
namespace phi {
template <typename T, typename Context>
void GaussianRandomKernel(const Context& dev_ctx,
const ScalarArray& shape,
float mean,
float std,
int seed,
DataType dtype,
DenseTensor* out) {
auto tensor = out;
std::normal_distribution<T> dist(mean, std);
tensor->Resize(phi::make_ddim(shape.GetData()));
int64_t size = tensor->numel();
T* data = dev_ctx.template Alloc<T>(tensor);
auto engine = paddle::framework::GetCPURandomEngine(seed);
for (int64_t i = 0; i < size; ++i) {
data[i] = dist(*engine);
}
}
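// For example, with shape = {2, 3}, mean = 0.0f and std = 1.0f, the loop above
// fills the 2x3 output with 6 independent draws from N(0, 1) using the CPU
// random engine obtained for `seed`.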
} // namespace phi
PD_REGISTER_KERNEL(gaussian_random,
CPU,
ALL_LAYOUT,
phi::GaussianRandomKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/pad_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/pad_grad_kernel_impl.h"
PD_REGISTER_KERNEL(pad_grad,
CPU,
ALL_LAYOUT,
phi::PadGradKernel,
float,
double,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/pad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/pad_kernel_impl.h"
PD_REGISTER_KERNEL(pad,
CPU,
ALL_LAYOUT,
phi::PadKernel,
float,
double,
int,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
......@@ -15,21 +15,26 @@ limitations under the License. */
#pragma once
#include <utility>
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
namespace phi {
namespace funcs {
template <typename T,
          size_t D,
          int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenTensor = EigenTensor<T, D, MajorType, IndexType>;
template <typename DeviceContext, typename T, size_t D>
void PadFunction(const DeviceContext& context,
                 const std::vector<int>& pads,
                 const DenseTensor& src,
                 T pad_value,
                 DenseTensor* out) {
std::array<std::pair<int64_t, int64_t>, D> paddings;
for (size_t i = 0; i < paddings.size(); ++i) {
......@@ -40,16 +45,16 @@ void PadFunction(const framework::ExecutionContext& context,
auto src_tensor = EigenTensor<T, D>::From(src);
auto out_tensor = EigenTensor<T, D>::From(*out);
  auto& place = *(context.eigen_device());
EigenPad<std::decay_t<decltype(place)>, T, D>::Eval(
place, out_tensor, src_tensor, paddings, pad_value);
}
template <typename DeviceContext, typename T, size_t D>
void PadGradFunction(const DeviceContext& context,
                     const std::vector<int>& pads,
                     const DenseTensor& src,
                     DenseTensor* d_out) {
std::array<std::pair<int64_t, int64_t>, D> paddings;
for (size_t i = 0; i < paddings.size(); ++i) {
paddings[i].first = -pads[i * 2];
......@@ -58,16 +63,18 @@ void PadGradFunction(const framework::ExecutionContext& context,
auto d_out_tensor = EigenTensor<T, D>::From(*d_out);
auto src_tensor = EigenTensor<T, D>::From(src);
  auto& place = *(context.eigen_device());
EigenPad<std::decay_t<decltype(place)>, T, D>::Eval(
place, d_out_tensor, src_tensor, paddings, static_cast<T>(0));
}
template <typename DeviceContext, typename T>
void PaddingFunctor(int rank,
                    const DeviceContext& context,
                    const std::vector<int>& pads,
                    T pad_value,
                    const DenseTensor& src,
                    DenseTensor* out) {
switch (rank) {
case 1:
PadFunction<DeviceContext, T, 1>(context, pads, src, pad_value, out);
......@@ -88,16 +95,18 @@ void PaddingFunctor(int rank, const framework::ExecutionContext& context,
PadFunction<DeviceContext, T, 6>(context, pads, src, pad_value, out);
break;
default:
      PADDLE_THROW(
          phi::errors::Unimplemented("PadOp only support tensors with no more"
                                     " than 6 dimensions currently."));
}
}
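// Illustrative note (assumption: pads holds a (before, after) pair per
// dimension, matching the pads[i * 2] indexing used in PadGradFunction).
// For a rank-2 DenseTensor of shape (H, W), pads = {1, 2, 3, 4} and
// pad_value = 0, PaddingFunctor dispatches to PadFunction<Context, T, 2> and
// produces an output of shape (H + 1 + 2, W + 3 + 4) = (H + 3, W + 7);
// PaddingGradFunctor applies the negated pads to crop the gradient back to
// the original shape.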
template <typename DeviceContext, typename T>
void PaddingGradFunctor(int rank,
                        const DeviceContext& context,
                        const std::vector<int>& pads,
                        const DenseTensor& src,
                        DenseTensor* out) {
switch (rank) {
case 1:
PadGradFunction<DeviceContext, T, 1>(context, pads, src, out);
......@@ -118,8 +127,8 @@ void PaddingGradFunctor(int rank, const framework::ExecutionContext& context,
PadGradFunction<DeviceContext, T, 6>(context, pads, src, out);
break;
default:
      PADDLE_THROW(
          phi::errors::Unimplemented("PadOp only support tensors with no more"
                                     " than 6 dimensions currently."));
}
}
......@@ -137,6 +146,5 @@ inline bool IsSymmetricPadding(const std::vector<int>& pads,
}
return is_sys_pad;
}
} // namespace funcs
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/common/scalar_array.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/device_context.h"
namespace phi {
template <typename T, typename Context>
void GaussianRandomKernel(const Context& ctx,
const ScalarArray& shape,
float mean,
float std,
int seed,
DataType dtype,
DenseTensor* out);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/batch_norm_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/fluid/operators/norm_utils.cu.h"
#include "paddle/fluid/operators/norm_utils.h"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/operators/layout_utils.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/flags.h"
#include "paddle/phi/kernels/gpu/batch_norm_utils.h"
#ifdef __HIPCC__
#define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim)
#else
#define LAUNCH_BOUNDS(BlockDim)
#endif
DECLARE_bool(cudnn_batchnorm_spatial_persistent);
namespace phi {
template <typename T>
using CudnnDataType = paddle::platform::CudnnDataType<T>;
template <typename T>
using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
template <typename T, int BlockDim, phi::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void KeBNBackwardScaleBias(
const T *dy,
const T *x,
const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance,
const double epsilon,
const int N,
const int C,
const int HxW,
BatchNormParamType<T> *dscale,
BatchNormParamType<T> *dbias) {
const int outer_size = C;
const int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage ds_storage;
__shared__ typename BlockReduce::TempStorage db_storage;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> ds_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> db_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> inv_var_i = 1.0 / sqrt(variance[i] + epsilon);
BatchNormParamType<T> mean_i = mean[i];
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
ds_sum += static_cast<BatchNormParamType<T>>(dy[index]) *
(static_cast<BatchNormParamType<T>>(x[index]) - mean_i);
db_sum += static_cast<BatchNormParamType<T>>(dy[index]);
}
ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
if (threadIdx.x == 0) {
dscale[i] = ds_sum * inv_var_i;
dbias[i] = db_sum;
}
__syncthreads();
}
}
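// KeBNBackwardScaleBias computes, per channel c
// (with inv_std = 1 / sqrt(variance[c] + epsilon)):
//   dscale[c] = sum_over_N_and_HxW(dy * (x - mean[c])) * inv_std
//   dbias[c]  = sum_over_N_and_HxW(dy)
// Each block handles one (or more) channels and accumulates the sums with
// cub::BlockReduce.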
template <typename T, phi::DataLayout layout>
static __global__ void KeBNBackwardData(const T *dy,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *variance,
const double epsilon,
const int C,
const int HxW,
const int num,
T *dx) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = gid; i < num; i += stride) {
const int c = layout == phi::DataLayout::kNCHW ? i / HxW % C : i % C;
BatchNormParamType<T> inv_var = 1.0 / sqrt(variance[c] + epsilon);
dx[i] = static_cast<T>(static_cast<BatchNormParamType<T>>(dy[i]) *
scale[c] * inv_var);
}
}
template <typename T>
static __global__ void KeBNRestoreData(const phi::DataLayout layout,
T *x,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance,
double epsilon,
int C,
int M,
const int num,
const T *y) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = gid; i < num; i += stride) {
const int c = layout == phi::DataLayout::kNCHW ? (i / M) % C : i % C;
auto y_i = static_cast<BatchNormParamType<T>>(y[i]);
auto x_i = (y_i - bias[c]) / scale[c] / variance[c] + mean[c];
x[i] = static_cast<T>(x_i);
}
}
template <typename T>
class InplaceHelper {
public:
void operator()(const phi::DataLayout layout,
T *x,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance,
double epsilon,
int C,
int M,
const int num,
const T *y,
int grid2,
const int block,
const gpuStream_t &stream) {
PADDLE_ENFORCE_EQ(x,
y,
phi::errors::InvalidArgument(
"X and Y should be inplaced in inplace mode"));
KeBNRestoreData<<<grid2, block, 0, stream>>>(
layout, x, scale, bias, mean, variance, epsilon, C, M, num, y);
}
};
template <typename T, int BlockDim, phi::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackward(
const T *dy,
const T *x,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *saved_mean,
const BatchNormParamType<T> *saved_inv_variance,
const int C,
const int N,
const int HxW,
const double epsilon,
T *dx,
BatchNormParamType<T> *dscale,
BatchNormParamType<T> *dbias) {
const int outer_size = C;
const int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage ds_storage;
__shared__ typename BlockReduce::TempStorage db_storage;
__shared__ typename BlockReduce::TempStorage mean_storage;
  __shared__ typename BlockReduce::TempStorage variance_storage;
__shared__ BatchNormParamType<T> inv_var_val;
__shared__ BatchNormParamType<T> mean_val;
__shared__ BatchNormParamType<T> dscale_val;
__shared__ BatchNormParamType<T> dbias_val;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> ds_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> db_sum = static_cast<BatchNormParamType<T>>(0);
if (saved_mean && saved_inv_variance) {
if (threadIdx.x == 0) {
inv_var_val = saved_inv_variance[i];
mean_val = saved_mean[i];
}
} else {
BatchNormParamType<T> x_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> x_square_sum =
static_cast<BatchNormParamType<T>>(0);
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> x_i =
static_cast<BatchNormParamType<T>>(x[index]);
x_sum += x_i;
x_square_sum += x_i * x_i;
}
x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum());
x_square_sum =
          BlockReduce(variance_storage).Reduce(x_square_sum, cub::Sum());
if (threadIdx.x == 0) {
mean_val = x_sum / inner_size;
inv_var_val =
1 / sqrt(x_square_sum / inner_size - mean_val * mean_val + epsilon);
}
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> dy_i =
static_cast<BatchNormParamType<T>>(dy[index]);
ds_sum +=
dy_i * (static_cast<BatchNormParamType<T>>(x[index]) - mean_val);
db_sum += dy_i;
}
ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
if (threadIdx.x == 0) {
dscale_val = ds_sum * inv_var_val;
dbias_val = db_sum;
dscale[i] = dscale_val;
dbias[i] = dbias_val;
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
dx[index] = scale[i] * inv_var_val *
(static_cast<BatchNormParamType<T>>(dy[index]) -
dbias_val / static_cast<BatchNormParamType<T>>(inner_size) -
(static_cast<BatchNormParamType<T>>(x[index]) - mean_val) *
inv_var_val * dscale_val / inner_size);
}
}
}
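// BNBackward fuses the three batch-norm gradients for the saved-statistics
// path. With m = N * HxW and inv_std = the saved (or recomputed) 1 / sqrt(var + eps):
//   dscale = sum(dy * (x - mean)) * inv_std
//   dbias  = sum(dy)
//   dx     = scale * inv_std * (dy - dbias / m - (x - mean) * inv_std * dscale / m)
// which matches the per-element expression written at the end of the loop above.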
template <typename T, int BlockDim, phi::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackwardData(
const T *dy,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *mean,
const T *x,
const BatchNormParamType<T> *variance,
const int C,
const int N,
const int HxW,
T *dx) {
const int outer_size = C;
const int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage dy_storage;
__shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage;
__shared__ BatchNormParamType<T> dy_sum_val;
__shared__ BatchNormParamType<T> dy_x_sub_mean_sum_val;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> inv_var_i = variance[i];
BatchNormParamType<T> mean_i = mean[i];
BatchNormParamType<T> dy_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> dy_x_sub_mean_sum =
static_cast<BatchNormParamType<T>>(0);
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> dy_i =
static_cast<BatchNormParamType<T>>(dy[index]);
dy_sum += dy_i;
dy_x_sub_mean_sum +=
dy_i * (static_cast<BatchNormParamType<T>>(x[index]) - mean_i);
}
dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
dy_x_sub_mean_sum = BlockReduce(dy_x_sub_mean_storage)
.Reduce(dy_x_sub_mean_sum, cub::Sum());
if (threadIdx.x == 0) {
dy_sum_val = dy_sum;
dy_x_sub_mean_sum_val = dy_x_sub_mean_sum;
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
dx[index] =
(static_cast<BatchNormParamType<T>>(dy[index]) -
dy_sum_val / static_cast<BatchNormParamType<T>>(inner_size) -
(static_cast<BatchNormParamType<T>>(x[index]) - mean_i) *
dy_x_sub_mean_sum_val * inv_var_i * inv_var_i / inner_size) *
scale[i] * inv_var_i;
}
}
}
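// BNBackwardData computes the same dx expression as BNBackward above, but it
// does not emit dscale/dbias, and it expects the per-channel inverse standard
// deviation to be passed in directly through the `variance` argument
// (note inv_var_i = variance[i]).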
template <typename T, typename Context>
void BatchNormGradRawKernel(const Context &ctx,
const DenseTensor &y_grad,
const DenseTensor &x,
const DenseTensor &scale,
const DenseTensor &bias,
const DenseTensor &saved_mean,
const DenseTensor &saved_variance,
paddle::optional<const DenseTensor &> reserve_space,
paddle::optional<const DenseTensor &> mean,
paddle::optional<const DenseTensor &> variance,
float momentum,
float epsilon_f,
const std::string &data_layout_str,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
bool is_inplace,
DenseTensor *x_grad,
DenseTensor *scale_grad,
DenseTensor *bias_grad) {
double epsilon = static_cast<double>(epsilon_f);
const DataLayout data_layout =
paddle::framework::StringToDataLayout(data_layout_str);
const auto *d_y = &y_grad;
auto *d_x = x_grad;
auto *d_scale = scale_grad;
auto *d_bias = bias_grad;
use_global_stats = is_test || use_global_stats;
const auto &x_dims = x.dims();
PADDLE_ENFORCE_EQ(
x_dims.size() >= 2 && x_dims.size() <= 5,
true,
phi::errors::InvalidArgument(
"The size of input's dimensions should be between 2 and 5."
"But received: the size of input's dimensions is [%d],"
"the dimensions of input is [%s]",
x_dims.size(),
x_dims));
int N, C, H, W, D;
paddle::operators::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
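  // ExtractNCWHD splits x_dims according to data_layout; e.g. (illustrative
  // shapes) an NHWC input with dims [8, 32, 32, 16] yields N = 8, C = 16,
  // H = 32, W = 32, D = 1, while the same dims read as NCHW would give C = 32.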
// init output
if (d_x) {
ctx.template Alloc<T>(d_x);
}
if (d_scale && d_bias) {
d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
}
PADDLE_ENFORCE_EQ(
scale.dims().size(),
1UL,
phi::errors::InvalidArgument(
"The size of scale's dimensions must equal to 1. But received: "
"the size of scale's dimensions is [%d], the dimensions of scale "
"is [%s].",
scale.dims().size(),
scale.dims()));
PADDLE_ENFORCE_EQ(
scale.dims()[0],
C,
phi::errors::InvalidArgument(
"The first dimension of scale must equal to Channels[%d]. But "
"received: the first dimension of scale is [%d]",
C,
scale.dims()[0]));
auto dtype = paddle::platform::CudnnDataType<T>::type;
#ifdef PADDLE_WITH_HIP
auto compute_format =
data_layout == DataLayout::kNHWC ? DataLayout::kNHWC : DataLayout::kNCHW;
// TODO(wangran16): wait for MIOpen to improve the performance of BN
  // HIP does not support the NHWC compute format
// auto compute_format = DataLayout::kNCHW;
#else
const bool fast_nhwc_batch_norm = dtype == CUDNN_DATA_HALF &&
FLAGS_cudnn_batchnorm_spatial_persistent &&
(reserve_space.get_ptr() != nullptr);
auto compute_format = fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC
? DataLayout::kNHWC
: DataLayout::kNCHW;
#endif
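  // On the CUDA path, the NHWC ("fast") layout is kept only for float16 data
  // with FLAGS_cudnn_batchnorm_spatial_persistent set and a reserve_space
  // tensor available; otherwise compute_format falls back to NCHW and NHWC
  // inputs are transposed below before calling into cuDNN.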
DenseTensor transformed_x(x.type());
DenseTensor transformed_d_y(d_y->type());
DenseTensor transformed_d_x;
if (data_layout == DataLayout::kNHWC && compute_format == DataLayout::kNCHW &&
x_dims.size() > 2) {
VLOG(3) << "Transform input tensor from NHWC to NCHW.";
ResizeToChannelFirst<Context, T>(ctx, &x, &transformed_x);
TransToChannelFirst<Context, T>(ctx, &x, &transformed_x);
ResizeToChannelFirst<Context, T>(ctx, d_y, &transformed_d_y);
TransToChannelFirst<Context, T>(ctx, d_y, &transformed_d_y);
if (d_x) {
ResizeToChannelFirst<Context, T>(ctx, d_x, &transformed_d_x);
}
} else {
transformed_x.ShareDataWith(x);
transformed_d_y.ShareDataWith(*d_y);
if (d_x) {
transformed_d_x.ShareDataWith(*d_x);
}
}
std::vector<int> dims;
std::vector<int> strides;
if (compute_format == DataLayout::kNCHW) {
dims = {N, C, H, W, D};
strides = {C * H * W * D, H * W * D, W * D, D, 1};
} else {
dims = {N, C, H, W, D};
strides = {H * W * C * D, 1, W * D * C, D * C, C};
}
const int num = transformed_x.numel();
#ifdef __HIPCC__
const int block = 256;
#else
const int block = 512;
#endif
int max_threads = ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
int grid1 = (num + block - 1) / block;
int grid2 = std::min(C, max_blocks);
auto stream = ctx.stream();
InplaceHelper<T> inplace_functor;
if (!use_global_stats) {
if ((N * H * W * D) == 1) {
if (d_x) {
paddle::framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
}
phi::funcs::SetConstant<Context, BatchNormParamType<T>> functor;
functor(ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
functor(ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
return;
}
// ------------------- cudnn descriptors ---------------------
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// miopenTensorDescriptor_t data_desc_;
// miopenTensorDescriptor_t bn_param_desc_;
// miopenBatchNormMode_t mode_;
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
#else
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t bn_param_desc_;
cudnnBatchNormMode_t mode_;
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(
&bn_param_desc_));
#endif
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// mode_ = miopenBNSpatial;
#elif CUDNN_VERSION_MIN(7, 0, 1)
if (FLAGS_cudnn_batchnorm_spatial_persistent) {
mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
} else if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#else
if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#endif // CUDNN_VERSION_MIN(7, 0, 1)
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
// data_desc_, CudnnDataType<T>::type,
// x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
// const_cast<int *>(strides.data())));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_,
// data_desc_, mode_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_,
CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4,
dims.data(),
strides.data()));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDeriveBNTensorDescriptor(
bn_param_desc_, data_desc_, mode_));
#endif
const auto *saved_mean_data =
saved_mean.template data<BatchNormParamType<T>>();
const auto *saved_var_data =
saved_variance.template data<BatchNormParamType<T>>();
if (is_inplace) {
inplace_functor(compute_format,
transformed_x.data<T>(),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
saved_mean_data,
saved_var_data,
epsilon,
C,
H * W * D,
num,
transformed_x.data<T>(),
grid2,
block,
stream);
}
// This branch calls CUDNN APIs
if (d_x && d_scale && d_bias) {
bool called = false;
#if CUDNN_VERSION_MIN(7, 4, 1)
called = true;
size_t workspace_size = 0;
void *workspace_ptr = nullptr;
DenseTensor workspace_tensor;
auto reserve_space_size = reserve_space->memory_size();
// --------------- cudnn batchnorm workspace ---------------
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::
cudnnGetBatchNormalizationBackwardExWorkspaceSize(
/*handle=*/ctx.cudnn_handle(),
/*mode=*/mode_,
/*bnIps=*/CUDNN_BATCHNORM_OPS_BN,
/*xDesc=*/data_desc_,
/*yDesc=*/data_desc_,
/*dyDesc=*/data_desc_,
/*dzDesc=*/nullptr,
/*dxDesc=*/data_desc_,
/*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
/*activationDesc=*/nullptr,
/*sizeInBytes=*/&workspace_size));
workspace_ptr = workspace_tensor.mutable_data(
ctx.GetPlace(), transformed_x.type(), workspace_size);
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationBackwardEx(
/*handle=*/ctx.cudnn_handle(),
/*mode=*/mode_,
/*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
/*alphaDataDiff=*/CudnnDataType<T>::kOne(),
/*betaDataDiff=*/CudnnDataType<T>::kZero(),
/*alphaParamDiff=*/CudnnDataType<T>::kOne(),
/*betaParamDiff=*/CudnnDataType<T>::kZero(),
/*xDesc=*/data_desc_,
/*xData=*/transformed_x.template data<T>(),
/*yDesc=*/nullptr,
/*yData=*/nullptr,
/*dyDesc=*/data_desc_,
/*dyData=*/transformed_d_y.template data<T>(),
/*dzDesc=*/nullptr,
/*dzData=*/nullptr,
/*dxDesc=*/data_desc_,
/*dxData=*/ctx.template Alloc<T>(&transformed_d_x),
/*dBnScaleBiasDesc=*/bn_param_desc_,
/*bnScaleData=*/scale.template data<BatchNormParamType<T>>(),
/*bnBiasData=*/nullptr,
/*dBnScaleData=*/d_scale
->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
/*dBnBiasData=*/d_bias
->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
/*epsilon=*/epsilon,
/*savedMean=*/saved_mean_data,
/*savedInvVariance=*/saved_var_data,
/*activationDesc=*/nullptr,
/*workspace=*/workspace_ptr,
/*workSpaceSizeInBytes=*/workspace_size,
/*reserveSpace=*/const_cast<T *>(
reserve_space->template data<T>()),
/*reserveSpaceSizeInBytes=*/reserve_space_size));
#endif // CUDNN_VERSION_MIN(7, 4, 1)
if (!called) {
#ifdef PADDLE_WITH_HIP
if (compute_format == DataLayout::kNCHW) {
BNBackward<T,
block,
DataLayout::kNCHW><<<grid2, block, 0, ctx.stream()>>>(
transformed_d_y.template data<T>(),
transformed_x.template data<T>(),
scale.template data<BatchNormParamType<T>>(),
saved_mean_data,
saved_var_data,
C,
N,
H * W * D,
epsilon,
transformed_d_x.template data<T>(),
d_scale->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()));
} else {
BNBackward<T,
block,
DataLayout::kNHWC><<<grid2, block, 0, ctx.stream()>>>(
transformed_d_y.template data<T>(),
transformed_x.template data<T>(),
scale.template data<BatchNormParamType<T>>(),
saved_mean_data,
saved_var_data,
C,
N,
H * W * D,
epsilon,
transformed_d_x.template data<T>(),
d_scale->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()));
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationBackward(
// dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
// CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
// CudnnDataType<T>::kZero(), data_desc_,
// transformed_x.template data<T>(), data_desc_,
// transformed_d_y.template data<T>(), data_desc_,
// transformed_d_x.template mutable_data<T>(ctx.GetPlace()),
// bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
// d_scale->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace()),
// d_bias->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace()),
// epsilon, saved_mean_data, saved_var_data));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationBackward(
ctx.cudnn_handle(),
mode_,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
transformed_x.template data<T>(),
data_desc_,
transformed_d_y.template data<T>(),
data_desc_,
ctx.template Alloc<T>(&transformed_d_x),
bn_param_desc_,
scale.template data<BatchNormParamType<T>>(),
d_scale->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon,
saved_mean_data,
saved_var_data));
#endif
}
if (data_layout == DataLayout::kNHWC &&
compute_format == DataLayout::kNCHW) {
VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
TransToChannelLast<Context, T>(ctx, &transformed_d_x, d_x);
}
} else {
    // This branch calls CUDA kernels
if (compute_format == DataLayout::kNCHW) {
if (d_x) {
BNBackwardData<
T,
block,
phi::DataLayout::kNCHW><<<grid2, block, 0, ctx.stream()>>>(
d_y->data<T>(),
scale.data<BatchNormParamType<T>>(),
saved_mean_data,
x.data<T>(),
saved_var_data,
C,
N,
H * W * D,
d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T,
block,
phi::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
d_y->data<T>(),
x.data<T>(),
saved_mean_data,
saved_var_data,
epsilon,
N,
C,
H * W * D,
d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
} else {
if (d_x) {
BNBackwardData<
T,
block,
phi::DataLayout::kNHWC><<<grid2, block, 0, ctx.stream()>>>(
d_y->data<T>(),
scale.data<BatchNormParamType<T>>(),
saved_mean_data,
x.data<T>(),
saved_var_data,
C,
N,
H * W * D,
d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T,
block,
phi::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
d_y->data<T>(),
x.data<T>(),
saved_mean_data,
saved_var_data,
epsilon,
N,
C,
H * W * D,
d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
}
}
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// clean when exit.
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
#else
// clean when exit.
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(
bn_param_desc_));
#endif
} else {
const auto *running_mean = mean.get_ptr();
const auto *running_var = variance.get_ptr();
const auto *running_mean_data =
running_mean->template data<BatchNormParamType<T>>();
const auto *running_var_data =
running_var->template data<BatchNormParamType<T>>();
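    // use_global_stats path: gradients are taken w.r.t. the running statistics,
    // so dx reduces to dy * scale / sqrt(running_var + epsilon), which is what
    // KeBNBackwardData computes below; dscale/dbias use the running mean.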
if (is_inplace) {
auto px = x;
inplace_functor(data_layout,
ctx.template Alloc<T>(&px),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
running_mean_data,
running_var_data,
epsilon,
C,
H * W * D,
num,
x.data<T>(),
grid2,
block,
stream);
}
if (compute_format == DataLayout::kNCHW) {
if (d_x) {
KeBNBackwardData<T,
phi::DataLayout::kNCHW><<<grid1, block, 0, stream>>>(
d_y->data<T>(),
scale.data<BatchNormParamType<T>>(),
running_var_data,
epsilon,
C,
H * W,
num,
d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T,
block,
phi::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
d_y->data<T>(),
x.data<T>(),
running_mean_data,
running_var_data,
epsilon,
N,
C,
H * W * D,
d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
} else {
if (d_x) {
KeBNBackwardData<T,
phi::DataLayout::kNHWC><<<grid1, block, 0, stream>>>(
d_y->data<T>(),
scale.data<BatchNormParamType<T>>(),
running_var_data,
epsilon,
C,
H * W,
num,
d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T,
block,
phi::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
d_y->data<T>(),
x.data<T>(),
running_mean_data,
running_var_data,
epsilon,
N,
C,
H * W * D,
d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
}
}
}
template <typename T, typename Context>
void BatchNormGradKernel(const Context &dev_ctx,
const DenseTensor &y_grad,
const DenseTensor &x,
const DenseTensor &scale,
const DenseTensor &bias,
const DenseTensor &saved_mean,
const DenseTensor &saved_variance,
paddle::optional<const DenseTensor &> reserve_space,
paddle::optional<const DenseTensor &> mean,
paddle::optional<const DenseTensor &> variance,
float momentum,
float epsilon,
const std::string &data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor *x_grad,
DenseTensor *scale_grad,
DenseTensor *bias_grad) {
BatchNormGradRawKernel<T, Context>(dev_ctx,
y_grad,
x,
scale,
bias,
saved_mean,
saved_variance,
reserve_space,
mean,
variance,
momentum,
epsilon,
data_layout,
is_test,
use_global_stats,
trainable_statistics,
fuse_with_relu,
false,
x_grad,
scale_grad,
bias_grad);
}
template <typename T, typename Context>
void BatchNormDoubleGradKernel(const Context &ctx,
const DenseTensor &x_grad_grad,
const DenseTensor &scale_grad_grad,
const DenseTensor &bias_grad_grad,
const DenseTensor &y_grad,
const DenseTensor &x,
const DenseTensor &scale,
const DenseTensor &saved_mean,
const DenseTensor &saved_variance,
paddle::optional<const DenseTensor &> mean,
paddle::optional<const DenseTensor &> variance,
float momentum,
float epsilon,
const std::string &data_layout_str,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor *x_grad,
DenseTensor *scale_grad,
DenseTensor *y_grad_grad) {
PADDLE_ENFORCE_EQ(is_test,
false,
phi::errors::InvalidArgument(
"`is_test = True` CANNOT be used in train program. If "
"you want to use global status in pre_train model, "
"please set `use_global_stats = True`"));
const DataLayout data_layout =
paddle::framework::StringToDataLayout(data_layout_str);
const DenseTensor *running_mean = nullptr;
const DenseTensor *running_variance = nullptr;
if (use_global_stats) {
running_mean = mean.get_ptr();
running_variance = variance.get_ptr();
}
paddle::operators::NormDoubleGradFunctor<Context, T>(ctx,
data_layout,
&x,
&scale,
&y_grad,
&saved_mean,
&saved_variance,
running_mean,
running_variance,
epsilon,
use_global_stats,
&x_grad_grad,
&scale_grad_grad,
&bias_grad_grad,
x_grad,
scale_grad,
y_grad_grad);
}
} // namespace phi
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(batch_norm_grad,
GPU,
ALL_LAYOUT,
phi::BatchNormGradKernel,
float,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(batch_norm_grad_raw,
GPU,
ALL_LAYOUT,
phi::BatchNormGradRawKernel,
float,
phi::dtype::float16) {}
#else
PD_REGISTER_KERNEL(batch_norm_grad,
GPU,
ALL_LAYOUT,
phi::BatchNormGradKernel,
float,
double,
phi::dtype::float16) {
if (kernel_key.dtype() == phi::DataType::FLOAT16) {
kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32);
}
}
PD_REGISTER_KERNEL(batch_norm_grad_raw,
GPU,
ALL_LAYOUT,
phi::BatchNormGradRawKernel,
float,
double,
phi::dtype::float16) {
if (kernel_key.dtype() == phi::DataType::FLOAT16) {
kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32);
}
}
#endif
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(batch_norm_grad_grad,
GPU,
ALL_LAYOUT,
phi::BatchNormDoubleGradKernel,
float,
double) {}
#else
PD_REGISTER_KERNEL(batch_norm_grad_grad,
GPU,
ALL_LAYOUT,
phi::BatchNormDoubleGradKernel,
float,
double) {}
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef __NVCC__
#include "cub/cub.cuh"
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/batch_norm_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/fluid/operators/norm_utils.cu.h"
#include "paddle/fluid/operators/norm_utils.h"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/operators/layout_utils.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/flags.h"
#include "paddle/phi/kernels/gpu/batch_norm_utils.h"
#ifdef __HIPCC__
#define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim)
#else
#define LAUNCH_BOUNDS(BlockDim)
#endif
DECLARE_bool(cudnn_batchnorm_spatial_persistent);
namespace phi {
template <typename T>
using CudnnDataType = paddle::platform::CudnnDataType<T>;
template <typename T>
using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
template <typename T, phi::DataLayout layout>
static __global__ void BNForwardInference(const T *x,
const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const int C,
const int N,
const int HxW,
const double epsilon,
T *y) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
int num = N * C * HxW;
for (int i = gid; i < num; i += stride) {
const int c = layout == phi::DataLayout::kNCHW ? i / HxW % C : i % C;
BatchNormParamType<T> x_sub_mean =
static_cast<BatchNormParamType<T>>(x[i]) - mean[c];
BatchNormParamType<T> inv_var = 1 / sqrt(variance[c] + epsilon);
y[i] = static_cast<T>(scale[c] * x_sub_mean * inv_var + bias[c]);
}
}
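// BNForwardInference applies the affine normalization per element:
//   y = scale[c] * (x - mean[c]) / sqrt(variance[c] + epsilon) + bias[c]
// The channel index c is recovered from the flat index i; e.g. for NCHW with
// C = 3 and HxW = 4, element i = 9 maps to c = (9 / 4) % 3 = 2.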
template <typename T, int BlockDim, phi::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining(
const T *x,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const int C,
const int N,
const int HxW,
const double epsilon,
double exponentialAverageFactor,
T *y,
BatchNormParamType<T> *mean,
BatchNormParamType<T> *variance,
BatchNormParamType<T> *save_mean,
BatchNormParamType<T> *save_inv_variance) {
int outer_size = C;
int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage mean_storage;
  __shared__ typename BlockReduce::TempStorage variance_storage;
__shared__ BatchNormParamType<T> mean_val;
__shared__ BatchNormParamType<T> variance_val;
__shared__ BatchNormParamType<T> inv_var_val;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> x_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> x_square_sum = static_cast<BatchNormParamType<T>>(0);
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> x_i = static_cast<BatchNormParamType<T>>(x[index]);
x_sum += x_i;
x_square_sum += x_i * x_i;
}
x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum());
x_square_sum =
        BlockReduce(variance_storage).Reduce(x_square_sum, cub::Sum());
if (threadIdx.x == 0) {
mean_val = x_sum / inner_size;
variance_val = x_square_sum / inner_size - mean_val * mean_val;
inv_var_val = 1 / sqrt(variance_val + epsilon);
if (save_mean && save_inv_variance) {
save_mean[i] = mean_val;
save_inv_variance[i] = inv_var_val;
}
mean[i] = (1 - exponentialAverageFactor) * mean_val +
exponentialAverageFactor * mean[i];
variance[i] = (1 - exponentialAverageFactor) * variance_val +
exponentialAverageFactor * variance[i];
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> x_sub_mean =
static_cast<BatchNormParamType<T>>(x[index]) - mean_val;
y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i];
}
}
}
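// BNForwardTraining computes per-channel batch statistics with block-wide
// reductions of sum(x) and sum(x * x):
//   mean = sum(x) / m,  var = sum(x * x) / m - mean * mean,  m = N * HxW
// save_mean / save_inv_variance keep the batch mean and 1 / sqrt(var + eps)
// for the backward pass, while the running statistics are blended as
//   running = (1 - exponentialAverageFactor) * batch + exponentialAverageFactor * running
// exactly as written in the thread-0 branch above.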
template <typename T, typename Context>
void BatchNormKernel(const Context &ctx,
const DenseTensor &x,
const DenseTensor &scale,
const DenseTensor &bias,
const DenseTensor &mean,
const DenseTensor &variance,
float momentum,
float epsilon_f,
const std::string &data_layout_str,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor *y,
DenseTensor *mean_out,
DenseTensor *variance_out,
DenseTensor *saved_mean,
DenseTensor *saved_variance,
DenseTensor *reserve_space) {
double epsilon = epsilon_f;
const bool trainable_stats = trainable_statistics;
const DataLayout data_layout =
paddle::framework::StringToDataLayout(data_layout_str);
bool test_mode = is_test && (!trainable_stats);
// Get the size for each dimension.
// NCHW [batch_size, in_channels, in_height, in_width]
const auto &x_dims = x.dims();
PADDLE_ENFORCE_EQ(
x_dims.size() >= 2 && x_dims.size() <= 5,
true,
phi::errors::InvalidArgument(
"The size of input's dimensions should be between 2 and 5"
"But received: the size of input's dimensions is [%d]",
x_dims.size()));
ctx.template Alloc<T>(y);
int N, C, H, W, D;
paddle::operators::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
auto dtype = paddle::platform::CudnnDataType<T>::type;
#ifdef PADDLE_WITH_HIP
auto compute_format =
data_layout == DataLayout::kNHWC ? DataLayout::kNHWC : DataLayout::kNCHW;
// TODO(wangran16): wait for MIOpen to improve the performance of BN
  // HIP does not support the NHWC compute format
// auto compute_format = DataLayout::kNCHW;
#else
const bool fast_nhwc_batch_norm =
test_mode ||
(dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent);
auto compute_format = fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC
? DataLayout::kNHWC
: DataLayout::kNCHW;
#endif
DenseTensor transformed_x(x.type());
DenseTensor transformed_y(y->type());
if (data_layout == DataLayout::kNHWC && compute_format == DataLayout::kNCHW &&
x_dims.size() > 2) {
VLOG(3) << "Transform input tensor from NHWC to NCHW.";
ResizeToChannelFirst<Context, T>(ctx, &x, &transformed_x);
TransToChannelFirst<Context, T>(ctx, &x, &transformed_x);
ResizeToChannelFirst<Context, T>(ctx, y, &transformed_y);
} else {
transformed_x.ShareDataWith(x);
transformed_y.ShareDataWith(*y);
}
// ------------------- cudnn descriptors ---------------------
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// miopenTensorDescriptor_t data_desc_;
// miopenTensorDescriptor_t bn_param_desc_;
// miopenBatchNormMode_t mode_;
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
#else
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t bn_param_desc_;
cudnnBatchNormMode_t mode_;
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
#endif
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// mode_ = miopenBNSpatial;
#elif CUDNN_VERSION_MIN(7, 0, 1)
if (FLAGS_cudnn_batchnorm_spatial_persistent) {
mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
} else if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#else
if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#endif // CUDNN_VERSION_MIN(7, 0, 1)
VLOG(3) << "Setting descriptors.";
std::vector<int> dims;
std::vector<int> strides;
if (compute_format == DataLayout::kNCHW) {
dims = {N, C, H, W, D};
strides = {C * H * W * D, H * W * D, W * D, D, 1};
} else {
dims = {N, C, H, W, D};
strides = {H * W * D * C, 1, W * D * C, D * C, C};
}
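  // dims is always ordered {N, C, H, W, D}; the strides select the memory
  // layout. For the channel-last branch, offset(n, c, h, w, d) =
  // n*H*W*D*C + h*W*D*C + w*D*C + d*C + c, i.e. data stored as [N, H, W, D, C],
  // while the NCHW strides describe the usual contiguous [N, C, H, W, D] order.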
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
// data_desc_, CudnnDataType<T>::type,
// x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
// const_cast<int *>(strides.data())));
// Note: PERSISTENT not implemented for inference
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDeriveBNTensorDescriptor(
// bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_,
CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4,
dims.data(),
strides.data()));
// Note: PERSISTENT not implemented for inference
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDeriveBNTensorDescriptor(
bn_param_desc_,
data_desc_,
test_mode ? CUDNN_BATCHNORM_SPATIAL : mode_));
#endif
auto handle = ctx.cudnn_handle();
  // Now, depending on whether we are running in test mode or not, there are
  // two paths. It is training mode only when we are neither in test mode nor
  // using pre-trained (global) statistics.
bool training = !test_mode && !use_global_stats;
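  // In boolean terms: test_mode = is_test && !trainable_statistics, and
  // training = !test_mode && !use_global_stats, so the inference branch below
  // is taken either in test mode or when use_global_stats is set.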
if (!training) {
    // Only in test mode are the input estimates (mean/variance) used directly.
const auto *est_mean = &mean;
const auto *est_var = &variance;
// Run inference mode.
PADDLE_ENFORCE_EQ(
est_mean->dims().size(),
1UL,
phi::errors::InvalidArgument(
"The size of mean's dimensions must equal to 1."
"But received: the size of mean's dimensions mean is [%d],"
"the dimensions of mean is [%s].",
est_mean->dims().size(),
est_mean->dims()));
PADDLE_ENFORCE_EQ(
est_var->dims().size(),
1UL,
phi::errors::InvalidArgument(
"The size of variance's dimensions must equal to 1."
"But received: the size of variance's dimensions is [%d],"
"the dimensions of variance is [%s].",
est_var->dims().size(),
est_var->dims()));
PADDLE_ENFORCE_EQ(
est_mean->dims()[0],
C,
phi::errors::InvalidArgument(
"The first dimension of mean must equal to the number of "
"Channels, which is [%d]. But received: the first dimension"
"of mean is [%d], the dimensions of mean is [%s].",
C,
est_mean->dims()[0],
est_mean->dims()));
PADDLE_ENFORCE_EQ(
est_var->dims()[0],
C,
phi::errors::InvalidArgument(
"The first dimension of variance must equal to the number"
"of Channels, which is [%d]. But received: the first dimension of"
"variance is [%d], the dimensions of variance is [%s].",
C,
est_var->dims()[0],
est_var->dims()));
#ifdef PADDLE_WITH_HIP
const int block_size = 256;
const int grid_size = (N * C * H * W * D + block_size - 1) / block_size;
if (compute_format == DataLayout::kNCHW) {
BNForwardInference<
T,
DataLayout::kNCHW><<<grid_size, block_size, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
C,
N,
H * W * D,
epsilon,
transformed_y.template data<T>());
} else {
BNForwardInference<
T,
DataLayout::kNHWC><<<grid_size, block_size, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
C,
N,
H * W * D,
epsilon,
transformed_y.template data<T>());
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationForwardInference(
// handle, miopenBNSpatial,
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kOne())),
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kZero())),
// data_desc_,
// static_cast<const void *>(transformed_x.template data<T>()),
// data_desc_,
// static_cast<void *>(
// transformed_y.template mutable_data<T>(ctx.GetPlace())),
// bn_param_desc_,
// const_cast<void *>(static_cast<const void *>(
// scale->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// bias->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// est_mean->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// est_var->template data<BatchNormParamType<T>>())),
// epsilon));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationForwardInference(
handle,
// Note: PERSISTENT not implemented for inference
CUDNN_BATCHNORM_SPATIAL,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
transformed_x.template data<T>(),
data_desc_,
ctx.template Alloc<T>(&transformed_y),
bn_param_desc_,
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(),
epsilon));
#endif
} else {
    // If a MomentumTensor input is provided, its value should override
    // momentum; momentum is only used in this training branch. This is not
    // wired up here yet:
// if (ctx.HasInput("MomentumTensor")) {
// const auto *mom_tensor = MomentumTensor;
// DenseTensor mom_cpu;
// paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(),
// &mom_cpu);
// momentum = mom_cpu.data<float>()[0];
// }
// Run training mode.
// obtain running mean and running inv var, and there is no need
// to initialize them.
mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
if ((N * H * W * D) == 1) {
// Only 1 element in normalization dimension,
// skip the batch norm calculation, let y = x.
paddle::framework::TensorCopy(x, ctx.GetPlace(), y);
} else {
double this_factor = 1. - momentum;
bool called = false;
#if CUDNN_VERSION_MIN(7, 4, 1)
called = true;
size_t workspace_size = 0;
size_t reserve_space_size = 0;
void *reserve_space_ptr = nullptr;
void *workspace_ptr = nullptr;
DenseTensor workspace_tensor;
      // Create reserve space and workspace for batch norm. The reserve-space
      // tensor is created per batch_norm op and is consumed by the backward
      // pass, so it must not be a temporary.
// auto *reserve_space = ctx.Output<Tensor>("ReserveSpace");
PADDLE_ENFORCE_NOT_NULL(
reserve_space,
phi::errors::NotFound(
"The argument ReserveSpace of batch_norm op is not found."));
// --------------- cudnn batchnorm workspace ---------------
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::
cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
/*handle=*/handle,
/*mode=*/mode_,
/*bnIps=*/CUDNN_BATCHNORM_OPS_BN,
/*xDesc=*/data_desc_,
/*zDesc=*/nullptr,
/*yDesc=*/data_desc_,
/*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
/*activationDesc=*/nullptr,
/*sizeInBytes=*/&workspace_size));
// -------------- cudnn batchnorm reserve space --------------
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::
cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
/*handle=*/handle,
/*mode=*/mode_,
/*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
/*activationDesc=*/nullptr,
/*xDesc=*/data_desc_,
/*sizeInBytes=*/&reserve_space_size));
reserve_space_ptr = reserve_space->mutable_data(
ctx.GetPlace(), transformed_x.type(), reserve_space_size);
workspace_ptr = workspace_tensor.mutable_data(
ctx.GetPlace(), transformed_x.type(), workspace_size);
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationForwardTrainingEx(
handle,
mode_,
CUDNN_BATCHNORM_OPS_BN,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
transformed_x.template data<T>(),
nullptr,
nullptr,
data_desc_,
transformed_y.template data<T>(),
bn_param_desc_,
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
this_factor,
mean_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
variance_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon,
saved_mean->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
saved_variance->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
nullptr,
workspace_ptr,
workspace_size,
reserve_space_ptr,
reserve_space_size));
#endif // CUDNN_VERSION_MIN(7, 4, 1)
if (!called) {
#ifdef PADDLE_WITH_HIP
const int num = transformed_x.numel();
const int block = 256;
const int max_threads = ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
const int grid = std::min(C, max_blocks);
if (compute_format == DataLayout::kNCHW) {
BNForwardTraining<
T,
block,
DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
C,
N,
H * W * D,
epsilon,
this_factor,
transformed_y.template data<T>(),
mean_out->template data<BatchNormParamType<T>>(),
variance_out->template data<BatchNormParamType<T>>(),
saved_mean->template data<BatchNormParamType<T>>(),
saved_variance->template data<BatchNormParamType<T>>());
} else {
BNForwardTraining<
T,
block,
DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
C,
N,
H * W * D,
epsilon,
this_factor,
transformed_y.template data<T>(),
mean_out->template data<BatchNormParamType<T>>(),
variance_out->template data<BatchNormParamType<T>>(),
saved_mean->template data<BatchNormParamType<T>>(),
saved_variance->template data<BatchNormParamType<T>>());
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationForwardTraining(
// handle, mode_, const_cast<void *>(static_cast<const void *>(
// CudnnDataType<T>::kOne())),
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kZero())),
// data_desc_,
// static_cast<const void *>(transformed_x.template data<T>()),
// data_desc_,
// static_cast<void *>(
// transformed_y.template mutable_data<T>(ctx.GetPlace())),
// bn_param_desc_,
// const_cast<void *>(static_cast<const void *>(
// scale->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// bias->template data<BatchNormParamType<T>>())),
// this_factor,
// static_cast<void *>(
// mean_out->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace())),
// static_cast<void *>(variance_out->template mutable_data<
// BatchNormParamType<T>>(ctx.GetPlace())),
// epsilon,
// static_cast<void *>(
// saved_mean->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace())),
// static_cast<void *>(saved_variance->template mutable_data<
// BatchNormParamType<T>>(ctx.GetPlace()))));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationForwardTraining(
handle,
mode_,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
transformed_x.template data<T>(),
data_desc_,
ctx.template Alloc<T>(&transformed_y),
bn_param_desc_,
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
this_factor,
mean_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
variance_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon,
saved_mean->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
saved_variance->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace())));
#endif
}
}
}
if (data_layout == DataLayout::kNHWC && compute_format == DataLayout::kNCHW &&
x_dims.size() > 2) {
VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
TransToChannelLast<Context, T>(ctx, &transformed_y, y);
}
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// clean when exit.
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
#else
// clean when exit.
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
#endif
}
} // namespace phi
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(batch_norm,
GPU,
ALL_LAYOUT,
phi::BatchNormKernel,
float,
phi::dtype::float16) {}
#else
PD_REGISTER_KERNEL(batch_norm,
GPU,
ALL_LAYOUT,
phi::BatchNormKernel,
float,
double,
phi::dtype::float16) {
if (kernel_key.dtype() == phi::DataType::FLOAT16) {
kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32);
}
}
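// For float16 inputs, outputs 1-4 (mean_out, variance_out, saved_mean,
// saved_variance) are registered as float32, matching BatchNormParamType and
// keeping the statistics in full precision, while y keeps the input dtype.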
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
using Tensor = DenseTensor;
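// The helpers below convert between channel-last (e.g. NDHWC) and
// channel-first (e.g. NCDHW) layouts: the Resize* functions only permute the
// dims and allocate the destination, while the Trans* functions actually
// transpose the data. For example, a 5-D input with dims (N, D, H, W, C) is
// resized to (N, C, D, H, W) by ResizeToChannelFirst.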
template <typename DeviceContext, typename T>
inline void ResizeToChannelFirst(const DeviceContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = phi::vectorize(input->dims());
in_dims_vec[1] = input->dims()[4];
in_dims_vec[2] = input->dims()[1];
in_dims_vec[3] = input->dims()[2];
in_dims_vec[4] = input->dims()[3];
transformed_input->Resize(phi::make_ddim(in_dims_vec));
context.template Alloc<T>(transformed_input);
} else if (dim == 2) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = phi::vectorize(input->dims());
in_dims_vec[1] = input->dims()[3];
in_dims_vec[2] = input->dims()[1];
in_dims_vec[3] = input->dims()[2];
transformed_input->Resize(phi::make_ddim(in_dims_vec));
context.template Alloc<T>(transformed_input);
} else if (dim == 1) {
transformed_input->Resize(input->dims());
auto in_dims_vec = phi::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[1];
transformed_input->Resize(phi::make_ddim(in_dims_vec));
context.template Alloc<T>(transformed_input);
}
}
template <typename DeviceContext, typename T>
inline void ResizeToChannelLast(const DeviceContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = phi::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[4];
in_dims_vec[4] = input->dims()[1];
transformed_input->Resize(phi::make_ddim(in_dims_vec));
context.template Alloc<T>(transformed_input);
} else if (dim == 2) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = phi::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[1];
transformed_input->Resize(phi::make_ddim(in_dims_vec));
context.template Alloc<T>(transformed_input);
} else if (dim == 1) {
transformed_input->Resize(input->dims());
auto in_dims_vec = phi::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[1];
transformed_input->Resize(phi::make_ddim(in_dims_vec));
context.template Alloc<T>(transformed_input);
}
}
template <typename DeviceContext, typename T>
inline void TransToChannelFirst(const DeviceContext& context,
const Tensor* input,
Tensor* transformed_input) {
VLOG(5) << "Why am I called?";
int dim = input->dims().size() - 2;
if (dim == 3) {
std::vector<int> axis{0, 4, 1, 2, 3};
funcs::Transpose<DeviceContext, T, 5> trans5;
trans5(context, *input, transformed_input, axis);
} else if (dim == 2) {
std::vector<int> axis{0, 3, 1, 2};
funcs::Transpose<DeviceContext, T, 4> trans4;
trans4(context, *input, transformed_input, axis);
} else if (dim == 1) {
std::vector<int> axis{0, 2, 1};
funcs::Transpose<DeviceContext, T, 3> trans3;
trans3(context, *input, transformed_input, axis);
}
}
template <typename DeviceContext, typename T>
inline void TransToChannelLast(const DeviceContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
std::vector<int> axis{0, 2, 3, 4, 1};
funcs::Transpose<DeviceContext, T, 5> trans5;
trans5(context, *input, transformed_input, axis);
} else if (dim == 2) {
std::vector<int> axis{0, 2, 3, 1};
funcs::Transpose<DeviceContext, T, 4> trans4;
trans4(context, *input, transformed_input, axis);
} else if (dim == 1) {
std::vector<int> axis{0, 2, 1};
funcs::Transpose<DeviceContext, T, 3> trans3;
trans3(context, *input, transformed_input, axis);
}
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/gaussian_random_kernel.h"
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/random.h>
#include <thrust/transform.h>
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/distribution_helper.h"
#include "paddle/phi/kernels/funcs/index_impl.cu.h"
#include "paddle/fluid/framework/generator.h"
DECLARE_bool(use_curand);
namespace phi {
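// GaussianGenerator produces one normally distributed value per element
// index: every call reseeds a minstd_rand with the same seed and skips it
// ahead by (n + offset_) via rng.discard, so element n depends only on
// (seed, offset, n) and the fill is deterministic regardless of how threads
// are scheduled.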
template <typename T>
struct GaussianGenerator {
T mean_, std_;
unsigned int seed_;
unsigned int offset_ = 0;
__host__ __device__ GaussianGenerator(T mean, T std, int seed)
: mean_(mean), std_(std), seed_(seed) {}
__host__ __device__ GaussianGenerator(T mean, T std, int seed, int offset)
: mean_(mean), std_(std), seed_(seed), offset_(offset) {}
__host__ __device__ T operator()(const unsigned int n) const {
thrust::minstd_rand rng;
rng.seed(seed_);
using MT = typename phi::kps::details::MPTypeTrait<T>::Type;
thrust::normal_distribution<MT> dist(mean_, std_);
unsigned int new_n = n + offset_;
rng.discard(new_n);
MT out = dist(rng);
return static_cast<T>(out);
}
};
template <typename T, typename Context>
void GaussianRandomKernel(const Context& dev_ctx,
const ScalarArray& shape,
float mean,
float std,
int seed,
DataType dtype,
DenseTensor* out) {
auto tensor = out;
bool seed_flag = false;
if (seed == 0) {
std::random_device rd;
seed = rd();
seed_flag = true;
}
tensor->Resize(phi::make_ddim(shape.GetData()));
T* data = dev_ctx.template Alloc<T>(tensor);
int64_t size = tensor->numel();
int device_id = dev_ctx.GetPlace().GetDeviceId();
auto gen_cuda = paddle::framework::GetDefaultCUDAGenerator(device_id);
using MT = typename phi::kps::details::MPTypeTrait<T>::Type;
if (gen_cuda->GetIsInitPy() && seed_flag) {
if (FLAGS_use_curand) {
funcs::normal_distribution<MT> dist;
funcs::normal_transform<MT> trans(mean, std);
funcs::distribution_and_transform<T>(dev_ctx, tensor, dist, trans);
} else {
auto seed_offset = gen_cuda->IncrementOffset(1);
int64_t gen_offset = size * seed_offset.second;
auto func =
GaussianGenerator<MT>(mean, std, seed_offset.first, gen_offset);
IndexKernel<T, GaussianGenerator<MT>>(dev_ctx, tensor, func);
}
} else {
auto func = GaussianGenerator<MT>(mean, std, seed);
IndexKernel<T, GaussianGenerator<MT>>(dev_ctx, tensor, func);
}
}
} // namespace phi
PD_REGISTER_KERNEL(gaussian_random,
GPU,
ALL_LAYOUT,
phi::GaussianRandomKernel,
phi::dtype::float16,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/pad_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/pad_grad_kernel_impl.h"
PD_REGISTER_KERNEL(pad_grad,
GPU,
ALL_LAYOUT,
phi::PadGradKernel,
float,
double,
phi::dtype::float16,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/complex.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/pad_kernel_impl.h"
#include "paddle/phi/kernels/pad_kernel.h"
PD_REGISTER_KERNEL(pad,
GPU,
ALL_LAYOUT,
phi::PadKernel,
float,
double,
int,
int64_t,
phi::dtype::float16,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
......@@ -178,6 +178,8 @@ struct IndexCalculator {
: dim(dim) {
dims = details::VectorToArray<int, kMaxRank>(cal_dims);
strides = details::VectorToArray<int, kMaxRank>(full_strides);
reduce_strides = details::VectorToArray<int, kMaxRank>(cal_strides);
#ifndef PADDLE_WITH_XPU_KP
std::vector<paddle::platform::FastDivMod> cal_divmoders;
// fast divmod
for (auto i : cal_strides) {
......@@ -185,9 +187,22 @@ struct IndexCalculator {
}
divmoders = details::VectorToArray<paddle::platform::FastDivMod, kMaxRank>(
cal_divmoders);
#endif
}
__device__ inline int operator()(int offset) const {
#ifdef PADDLE_WITH_XPU_KP
int index = 0;
#pragma unroll
for (int i = 0; i < kMaxRank; ++i) {
if (i == dim) {
break;
}
index += (offset / reduce_strides[i]) * strides[dims[i]];
offset = offset % reduce_strides[i];
}
return index;
#else
int index = 0;
#pragma unroll
for (int i = 0; i < kMaxRank; ++i) {
......@@ -199,12 +214,16 @@ struct IndexCalculator {
offset = divmod.val[1];
}
return index;
#endif
}
int dim;
phi::Array<int, kMaxRank> dims;
phi::Array<int, kMaxRank> strides;
phi::Array<int, kMaxRank> reduce_strides;
#ifndef PADDLE_WITH_XPU2
phi::Array<paddle::platform::FastDivMod, kMaxRank> divmoders;
#endif
};
template <bool ReduceLastDim = false>
......@@ -247,7 +266,7 @@ struct ReduceIndexMapping {
__device__ __forceinline__ int BlockDimY() {
#ifdef PADDLE_WITH_XPU2
return dim.deal_size_y;
return 1;
#else
return blockDim.y;
#endif
......@@ -454,10 +473,14 @@ struct ReduceConfig {
bool is_last_dim =
(rank == 2) && (reduce_rank == 1) && (reduce_dim[0] == 1);
if (rank == reduce_rank || is_last_dim) {
#ifdef PADDLE_WITH_XPU_KP
reduce_type = static_cast<int>(ReduceType::kReduceAny);
#else
reduce_type = static_cast<int>(ReduceType::kReduceLastDim);
#endif
} else if (reduce_rank == 1) {
// ReduceFirstDim and reduceSecondDim
#ifdef PADDLE_WITH_XPU2
#ifdef PADDLE_WITH_XPU_KP
if (reduce_dim[0] == 0) {
reduce_type = static_cast<int>(ReduceType::kReduceHigherDim);
} else {
......@@ -471,6 +494,7 @@ struct ReduceConfig {
}
}
#ifndef PADDLE_WITH_XPU_KP
void SetBlockDimForReduceAny(dim3* block_dim, dim3* grid_dim) {
constexpr int min_reduce_num_per_thread = 16;
constexpr int max_reduce_num_per_thread = 256;
......@@ -569,6 +593,7 @@ struct ReduceConfig {
grid_dim->y = details::AlignUp(reduce_num, blocking_size);
}
}
#endif
void SetBlockDim() {
// init
......@@ -577,14 +602,14 @@ struct ReduceConfig {
dim3 block_dim(block_num, 1, 1);
dim3 grid_dim(left_num, 1, 1);
blocking_size = reduce_num;
#ifdef PADDLE_WITH_XPU2
#ifdef PADDLE_WITH_XPU_KP
if (reduce_last_dim) {
block_dim.x = 128;
block_dim.x = 64;
block_dim.y = reduce_num;
grid_dim.x = 8;
grid_dim.y = 1;
grid_dim.x = 1;
grid_dim.y = 8;
} else {
block_dim.x = 128;
block_dim.x = 64;
block_dim.y = left_num;
grid_dim.x = 8;
grid_dim.y = 1;
......@@ -661,7 +686,7 @@ __global__ void ReduceAnyKernel(const Tx* x,
store_offset = block.BlockIdY() * left_num + left_idx;
loop_left = min(block.GetLoopSize(), left_num - left_idx);
stride_left = 1;
tid = threadIdx.x;
tid = THREAD_ID_X;
} else {
auto block = ReduceIndexMapping<false>(dim);
input_idx = block.BlockIdY() * block.BlockDimY();
......@@ -672,18 +697,20 @@ __global__ void ReduceAnyKernel(const Tx* x,
loop_left = min(block.GetLoopSize(), left_num - left_idx);
stride_left = block.BlockDimX() * block.GridDimX();
store_offset = block.BlockIdY() * left_num + left_idx;
tid = threadIdx.y;
tid = THREAD_ID_Y;
}
// calculate the offset, i.e. the address where each thread really starts.
// 1. reduce for each thread
MPType input_compute[REDUCE_VEC_SIZE];
Tx input_reg[REDUCE_VEC_SIZE];
int input_idx_tmp = input_idx;
for (int i = 0; i < loop_left; i += stride_left) {
int input_offset = left_index_calculator(left_idx + i);
const Tx* input = x + input_offset;
const _ptr_ Tx* input = x + input_offset;
MPType reduce_var = init;
// load REDUCE_VEC_SIZE data once, and then compute
int bound = reduce_num - (REDUCE_VEC_SIZE - 1) * stride;
input_idx = input_idx_tmp;
for (; input_idx + block_size < bound;
input_idx += REDUCE_VEC_SIZE * stride) {
kps::ReadDataReduce<Tx,
......@@ -775,7 +802,7 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
int loop_size = min(reduce_num - idy, blocking_size);
int store_offset = block.BlockIdY() * left_num + idz * block.GridDimY();
int block_offset = idy * left_num + idz * reduce_num;
const Tx* input = x + block_offset;
const _ptr_ Tx* input = x + block_offset;
Tx reduce_input;
for (; idx < size; idx += stride) {
MPType reduce_var = init;
......@@ -838,7 +865,7 @@ static void LaunchReduceKernel(const Tx* x_data,
const ReduceOp& reducer,
const TransformOp& transform,
MPType init,
gpuStream_t stream,
KPStream stream,
ReduceConfig<Ty> config) {
if (config.reduce_type == kReduceLastDim) {
int stride_reduce = 1;
......@@ -855,13 +882,14 @@ static void LaunchReduceKernel(const Tx* x_data,
0);
dim.SetRem(config.reduce_num % config.block.x, 0, 0);
#ifdef PADDLE_WITH_XPU2
#ifdef PADDLE_WITH_XPU_KP
ReduceAnyKernel<Tx,
Ty,
MPType,
ReduceOp,
TransformOp,
OneDimIndexCal><<<8, 128, stream>>>(x_data,
OneDimIndexCal><<<8, 64, 0, stream>>>(
x_data,
config.output_data,
reducer,
transform,
......@@ -910,13 +938,13 @@ static void LaunchReduceKernel(const Tx* x_data,
0);
dim.SetRem(config.reduce_num % config.block.x, 0, 0);
#ifdef PADDLE_WITH_XPU2
#ifdef PADDLE_WITH_XPU_KP
ReduceAnyKernel<Tx,
Ty,
MPType,
ReduceOp,
TransformOp,
IndexCalculator><<<8, 128, stream>>>(
IndexCalculator><<<8, 64, 0, stream>>>(
x_data,
config.output_data,
reducer,
......@@ -965,12 +993,13 @@ static void LaunchReduceKernel(const Tx* x_data,
kps::DimConfig dim =
kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
dim.SetRem(config.left_num % block.x, 0, 0);
#ifdef PADDLE_WITH_XPU2
ReduceHigherDimKernel<Ty,
#ifdef PADDLE_WITH_XPU_KP
ReduceHigherDimKernel<
Ty,
Ty,
MPType,
ReduceOp,
kps::IdentityFunctor<Ty, MPType>><<<8, 128, stream>>>(
kps::IdentityFunctor<Ty, MPType>><<<8, 64, 0, stream>>>(
config.output_data,
y_data,
reducer,
......@@ -1011,7 +1040,7 @@ CubTensorReduceImpl(const Tx* x_data,
const TransformOp& transform,
int reduce_num,
const paddle::platform::Place& place,
gpuStream_t stream) {
KPStream stream) {
auto reducer = ReduceOp<Ty>();
cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(x_data,
transform);
......@@ -1054,7 +1083,7 @@ CubTensorReduceImpl(const Tx* x_data,
const TransformOp& transform,
int reduce_num,
const paddle::platform::Place& place,
gpuStream_t stream) {
KPStream stream) {
PADDLE_THROW(phi::errors::InvalidArgument(
"Tx should not be float16 when using cub::DeviceReduce::Reduce()."));
}
......@@ -1068,7 +1097,7 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx,
phi::DenseTensor* y,
const TransformOp& transform,
const std::vector<int>& origin_reduce_dims,
gpuStream_t stream) {
KPStream stream) {
y->mutable_data<Ty>(x.place());
auto x_dim = phi::vectorize<int>(x.dims());
......@@ -1098,11 +1127,13 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx,
config.SetOutputData(y_data, x.place(), &tmp);
constexpr bool kIsTxFP16 = std::is_same<Tx, phi::dtype::float16>::value;
bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16;
#ifndef PADDLE_WITH_XPU_KP
if (use_cub_reduce) {
CubTensorReduceImpl<Tx, Ty, ReduceOp, TransformOp>(
x_data, y_data, transform, config.reduce_num, x.place(), stream);
return;
}
#endif
using MPType = typename kps::details::MPTypeTrait<Ty>::Type;
auto reducer = ReduceOp<MPType>();
......@@ -1124,12 +1155,13 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx,
config.reduce_num % config.blocking_size,
0);
#ifdef PADDLE_WITH_XPU2
#ifdef PADDLE_WITH_XPU_KP
ReduceHigherDimKernel<Tx,
Ty,
MPType,
ReduceOp<MPType>,
TransformOp><<<8, 128, stream>>>(x_data,
TransformOp><<<8, 64, 0, stream>>>(
x_data,
config.output_data,
reducer,
transform,
......@@ -1163,13 +1195,13 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx,
kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
dim2.SetRem(config.left_num % config.block.x, 0, 0);
#ifdef PADDLE_WITH_XPU2
#ifdef PADDLE_WITH_XPU_KP
ReduceHigherDimKernel<
Ty,
Ty,
MPType,
ReduceOp<MPType>,
kps::IdentityFunctor<Ty, MPType>><<<8, 128, stream>>>(
kps::IdentityFunctor<Ty, MPType>><<<8, 64, 0, stream>>>(
config.output_data,
y_data,
reducer,
......@@ -1212,7 +1244,7 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx,
template <typename T,
template <typename> class ReduceOp,
template <typename, typename> class TransformOp>
void Reduce(const GPUContext& dev_ctx,
void Reduce(const KPDevice& dev_ctx,
const DenseTensor& x,
bool reduce_all,
const std::vector<int64_t>& dims,
......@@ -1227,7 +1259,7 @@ void Reduce(const GPUContext& dev_ctx,
reduce_num *= (x.dims())[i];
}
gpuStream_t stream = dev_ctx.stream();
KPStream stream = dev_ctx.stream();
if (out_dtype != phi::DataType::UNDEFINED && out_dtype != x.dtype()) {
auto tmp_tensor = phi::Cast<T>(dev_ctx, x, out_dtype);
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/padding.h"
namespace phi {
template <typename T, typename Context>
void PadGradKernel(const Context& dev_ctx,
const DenseTensor& d_out,
const std::vector<int>& paddings,
float pad_value,
DenseTensor* d_x) {
if (d_x == nullptr) {
return;
}
dev_ctx.template Alloc<T>(d_x);
int rank = d_out.dims().size();
phi::funcs::PaddingGradFunctor<Context, T>(
rank, dev_ctx, paddings, d_out, d_x);
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <utility>
#include <vector>
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/padding.h"
namespace phi {
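// Note (an assumption about the convention, not stated in this file): the
// `paddings` vector is expected to hold 2 * rank entries, i.e. a (before,
// after) pair per dimension, which PaddingFunctor consumes together with the
// rank. For a 2-D tensor, paddings = {1, 0, 0, 2} would then add one row
// before dim 0 and two columns after dim 1.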
template <typename T, typename Context>
void PadKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int>& paddings,
float pad_value,
DenseTensor* out) {
dev_ctx.template Alloc<T>(out);
int rank = x.dims().size();
funcs::PaddingFunctor<Context, T>(
rank, dev_ctx, paddings, static_cast<T>(pad_value), x, out);
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void PadGradKernel(const Context& dev_ctx,
const DenseTensor& d_out,
const std::vector<int>& paddings,
float pad_value,
DenseTensor* d_x);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void PadKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int>& paddings,
float pad_value,
DenseTensor* out);
} // namespace phi
set(SPARSE_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils)
set(SPARSE_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils math_function)
register_kernels(DEPS ${SPARSE_KERNEL_DEPS} SUB_DIR "sparse_kernel")
......@@ -107,7 +107,9 @@ void ProductRuleBook(const Context& dev_ctx,
f_calc_rulebook(nullptr);
// alloc the rulebook
rulebook->ResizeAndAllocate({3, rulebook_len});
DenseTensorMeta rulebook_meta(
DataType::INT32, {3, rulebook_len}, DataLayout::NCHW);
rulebook->set_meta(rulebook_meta);
dev_ctx.Alloc(rulebook, rulebook->dtype(), rulebook->numel() * sizeof(int));
int* rulebook_ptr = rulebook->data<int>();
f_calc_rulebook(rulebook_ptr);
......
......@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/sparse/convolution_kernel.h"
#include "paddle/phi/kernels/sparse/cpu/convolution.h"
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/sparse/cpu/convolution.h"
namespace phi {
namespace sparse {
......@@ -55,7 +54,6 @@ void Conv3dKernel(const Context& dev_ctx,
// 1. product rulebook
DenseTensorMeta counter_meta(
DataType::INT32, {kernel_size}, DataLayout::NCHW);
// DenseTensor rulebook = phi::Empty<int, Context>(dev_ctx);
DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta));
ProductRuleBook<T, Context>(dev_ctx,
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <thrust/execution_policy.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include "glog/logging.h"
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/primitive/compute_primitives.h"
#include "paddle/phi/kernels/sparse/convolution_kernel.h"
namespace phi {
namespace sparse {
// TODO(zhangkaihuo) replace this kernel with KP::InitWithDataIndex
__global__ void InitByIndexKernel(const int n, int* out1, int* out2) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
for (int i = tid; i < n; i += gridDim.x * blockDim.x) {
out1[i] = i;
out2[i] = i;
}
}
/**
* @brief: update the out index and indices
* unique_keys: saves the index of the output feature list
* unique_values: indicates the index of each key before deduplication
* out_indexs: indicates the position of the output index in the rulebook
* rulebook_len: indicates the length of rulebook
* out_dims: indicates the output dims
* out_indices: the indices of output, out_indices = IndexToPoint(unique_keys)
* rulebook_out_indexs: the output index in rulebook
**/
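// Illustrative example (hypothetical values): with non_zero_num = 4 and
// unique_keys = {20, 25, 30, 33}, each key is decoded by IndexToPoint into
// (batch, x, y, z) and written column-wise, so row 0 of out_indices holds
// the four batch ids, row 1 the z's, row 2 the y's and row 3 the x's; for
// key i, every rulebook position j in [unique_values[i], next start) gets
// rulebook_out_indexs[out_indexs[j]] = i.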
__global__ void UpdateIndexKernel(const int* unique_keys,
const int* unique_values,
const int* out_indexs,
const int non_zero_num,
const int rulebook_len,
const Dims4D out_dims,
int* out_indices,
int* rulebook_out_indexs) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) {
const int index = unique_keys[i];
int batch, x, y, z;
IndexToPoint<Dims4D>(index, out_dims, &batch, &x, &y, &z);
// get out indices
out_indices[i] = batch;
out_indices[i + non_zero_num] = z;
out_indices[i + non_zero_num * 2] = y;
out_indices[i + non_zero_num * 3] = x;
// update rulebook
int start = unique_values[i];
int end = i == non_zero_num - 1 ? rulebook_len : unique_values[i + 1];
// max(end-start) = kernel_size
for (int j = start; j < end; j++) {
rulebook_out_indexs[out_indexs[j]] = i;
}
}
}
/**
* @brief product rulebook
* for input_i in x_indices:
* if input_i participates in the convolution calculation:
* infer the output_i by input_i and kernel_i
* save output_i
*
* x_indices: the indices of input features
* x_dims: the input dims
* kernel_dims: the kernel dims
* out_dims: the output dims
* non_zero_num: the number of input features
* rulebook: the rulebook to save the kernel index, input index and output index
* counter: saves the number of times each location in the kernel participates
* in the calculation
**/
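// Layout sketch (inferred from the writes below): the rulebook is treated as
// two blocks of kernel_size * non_zero_num ints. For input i and flattened
// kernel offset k, rulebook[k * non_zero_num + i] stores the input index and
// rulebook[kernel_size * non_zero_num + k * non_zero_num + i] stores the
// flattened output index; both slots hold -1 when (i, k) does not produce a
// valid output, and those entries are compacted away later with
// thrust::remove.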
__global__ void ProductRuleBookKernel(const int* x_indices,
const Dims4D x_dims,
const Dims4D kernel_dims,
const Dims4D out_dims,
const int64_t non_zero_num,
const Dims4D paddings,
const Dims4D dilations,
const Dims4D strides,
int* rulebook,
int* counter) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
extern __shared__ int counter_buf[]; // kernel_size
const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1];
const int offset = kernel_size * non_zero_num;
for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
counter_buf[i] = 0;
}
__syncthreads();
for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) {
int kernel_index = 0;
for (int kz = 0; kz < kernel_dims[1]; kz++) {
for (int ky = 0; ky < kernel_dims[2]; ky++) {
for (int kx = 0; kx < kernel_dims[3]; kx++) {
int batch = x_indices[i];
int in_z = x_indices[i + non_zero_num];
int in_y = x_indices[i + 2 * non_zero_num];
int in_x = x_indices[i + 3 * non_zero_num];
int in_i = -1, out_index = -1;
if (Check(x_dims,
kernel_dims,
paddings,
dilations,
strides,
in_x,
in_y,
in_z,
kx,
ky,
kz)) {
int out_z = (in_z + paddings[1] - kz * dilations[1]) / strides[1];
int out_y = (in_y + paddings[2] - ky * dilations[2]) / strides[2];
int out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3];
in_i = i;
out_index =
PointToIndex<Dims4D>(batch, out_x, out_y, out_z, out_dims);
atomicAdd(&counter_buf[kernel_index], 1);
}
rulebook[kernel_index * non_zero_num + i] = in_i;
rulebook[kernel_index * non_zero_num + offset + i] = out_index;
++kernel_index;
}
}
}
}
__syncthreads();
for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
atomicAdd(&counter[i], counter_buf[i]);
}
}
// TODO(zhangkaihuo): After the GatherCUDAKernel is migrated to phi, replace
// this kernel with phi::GatherCUDAKernel;
// Vectorization can be used to improve read and write bandwidth
/**
* brief: gather data from params according to indices
* params: the inputs
* indices: the indices you want to gather
* output: the outputs
* index_size: the size of indices
* slice_size: slice size corresponding to each index, here is the channel size
**/
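// Illustrative arithmetic (hypothetical values): with slice_size = 3 and
// indices = {4, 1}, thread i = 1 computes indices_i = 0, slice_i = 1,
// gather_i = indices[0] = 4 and params_i = 4 * 3 + 1 = 13, so output[1] =
// params[13], i.e. output row 0 is filled element by element from params
// row 4.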
template <typename T, typename IndexT = int>
__global__ void GatherKernel(const T* params,
const IndexT* indices,
T* output,
size_t index_size,
size_t slice_size) {
CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) {
int64_t indices_i = i / slice_size;
int64_t slice_i = i - indices_i * slice_size; // offset inside the slice
IndexT gather_i = indices[indices_i];
int64_t params_i = gather_i * slice_size + slice_i;
*(output + i) = *(params + params_i);
}
}
/**
* brief: scatter add
* input: the inputs
* unique_value: refer to UpdateIndexKernel notes
* out_index: the output feature index
* non_zero_num: the number of output features
* rulebook_len: the length of rulebook
* channels: the output channel size
* out: the outputs
**/
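// Illustrative example (hypothetical values): with channels = 2,
// non_zero_num = 2, rulebook_len = 3, unique_value = {0, 2} and
// out_index = {2, 0, 1}, output feature 0 accumulates input rows 2 and 0
// (j = 0, 1) and output feature 1 accumulates input row 1 (j = 2), one
// channel at a time.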
template <typename T>
__global__ void ScatterKernel(const T* input,
const int* unique_value,
const int* out_index,
const int non_zero_num,
const int rulebook_len,
const int channels,
T* out) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
for (int i = tid; i < non_zero_num * channels; i += gridDim.x * blockDim.x) {
int indices_i = i / channels;
int channels_i = i - indices_i * channels;
int start = unique_value[indices_i];
int end = indices_i == non_zero_num - 1 ? rulebook_len
: unique_value[indices_i + 1];
// max(end-start) = kernel_size
T sum = static_cast<T>(0);
for (int j = start; j < end; j++) {
const int out_feature_i = out_index[j];
sum += input[out_feature_i * channels + channels_i];
}
out[indices_i * channels + channels_i] = sum;
}
}
// brief: calculate the distance between start and end
__global__ void DistanceKernel(const int* start,
const int* end,
int* distance) {
if (threadIdx.x == 0) {
*distance = end - start;
}
}
// The basic algorithm is described in convolution_kernel.cc and in
// the second paper.
// example:
// 1. the rulebook:
// the kernel_index: 0, 0, 0, 1, 1, 1, 2, 2, ....
// the out_index(key): 20, 30, 33, 30, 33, 20, 25
// 2. mark the index of out_index(value): 0, 1, 2, 3, 4, 5, 6, ....
// 3. sort the (key, value) pairs
// 4. unique the (key, value):
// unique_key: 20, 25, 30, 33
// unique_values: 0, 2, 3, 5
// the index of unique_values is: 0, 1, 2, 3
// 5. update the out_index by unique_key, unique_value and the index of
// unique_value:
// the new out_index: 0, 2, 3, 2, 3, 0, 1
template <typename T, typename Context>
int ProductRuleBook(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const DDim& out_dims,
DenseTensor* rulebook,
DenseTensor* counter_per_kernel,
DenseTensor* offsets_per_kernel,
DenseTensor* out_index,
DenseTensor* unique_key,
DenseTensor* unique_value,
SparseCooTensor* out,
std::vector<int>* h_counter,
std::vector<int>* h_offsets) {
const auto& kernel_dims = kernel.dims();
const int64_t non_zero_num = x.nnz();
const auto& non_zero_indices = x.non_zero_indices();
const int* indices_ptr = non_zero_indices.data<int>();
dev_ctx.Alloc(counter_per_kernel,
counter_per_kernel->dtype(),
sizeof(int) * counter_per_kernel->numel());
int* counter_ptr = counter_per_kernel->data<int>();
dev_ctx.Alloc(offsets_per_kernel,
offsets_per_kernel->dtype(),
sizeof(int) * offsets_per_kernel->numel());
int* offsets_ptr = offsets_per_kernel->data<int>();
int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
rulebook->ResizeAndAllocate({2, kernel_size * non_zero_num});
dev_ctx.Alloc(rulebook, rulebook->dtype(), sizeof(int) * rulebook->numel());
int* rulebook_ptr = rulebook->data<int>();
const auto x_dims = x.dims();
Dims4D d_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]);
Dims4D d_kernel_dims(1, kernel_dims[2], kernel_dims[1], kernel_dims[0]);
Dims4D d_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]);
Dims4D d_paddings(1, paddings[2], paddings[1], paddings[0]);
Dims4D d_strides(1, strides[2], strides[1], strides[0]);
Dims4D d_dilations(1, dilations[2], dilations[1], dilations[0]);
// 1. product rule book
phi::funcs::SetConstant<Context, int> set_zero;
set_zero(dev_ctx, counter_per_kernel, 0);
auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1);
ProductRuleBookKernel<<<config.block_per_grid.x,
config.thread_per_block.x,
kernel_size * sizeof(int),
dev_ctx.stream()>>>(indices_ptr,
d_x_dims,
d_kernel_dims,
d_out_dims,
non_zero_num,
d_paddings,
d_dilations,
d_strides,
rulebook_ptr,
counter_ptr);
// 2. remove -1
#ifdef PADDLE_WITH_HIP
int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()),
#else
int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()),
#endif
rulebook_ptr,
rulebook_ptr + 2 * kernel_size * non_zero_num,
-1);
#ifdef PADDLE_WITH_HIP
thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()),
#else
thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()),
#endif
counter_ptr,
counter_ptr + kernel_size,
offsets_ptr);
#ifdef PADDLE_WITH_HIP
phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0],
counter_ptr,
kernel_size * sizeof(int),
hipMemcpyDeviceToHost,
dev_ctx.stream());
phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0],
offsets_ptr,
kernel_size * sizeof(int),
hipMemcpyDeviceToHost,
dev_ctx.stream());
#else
phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0],
counter_ptr,
kernel_size * sizeof(int),
cudaMemcpyDeviceToHost,
dev_ctx.stream());
phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0],
offsets_ptr,
kernel_size * sizeof(int),
cudaMemcpyDeviceToHost,
dev_ctx.stream());
#endif
dev_ctx.Wait();
int rulebook_len =
(*h_counter)[kernel_size - 1] + (*h_offsets)[kernel_size - 1];
// 3. sorted or merge the out index
out_index->ResizeAndAllocate({rulebook_len});
unique_value->ResizeAndAllocate({rulebook_len});
unique_key->ResizeAndAllocate({rulebook_len});
dev_ctx.Alloc(
out_index, out_index->dtype(), sizeof(int) * out_index->numel());
int* out_index_ptr = out_index->data<int>();
dev_ctx.Alloc(
unique_value, unique_value->dtype(), sizeof(int) * unique_value->numel());
int* unique_value_ptr = unique_value->data<int>();
dev_ctx.Alloc(
unique_key, unique_key->dtype(), sizeof(int) * unique_key->numel());
int* unique_key_ptr = unique_key->data<int>();
config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1);
InitByIndexKernel<<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(
rulebook_len, out_index_ptr, unique_value_ptr);
#ifdef PADDLE_WITH_HIP
phi::backends::gpu::GpuMemcpyAsync(unique_key_ptr,
rulebook_ptr + rulebook_len,
rulebook_len * sizeof(int),
hipMemcpyDeviceToDevice,
dev_ctx.stream());
#else
phi::backends::gpu::GpuMemcpyAsync(unique_key_ptr,
rulebook_ptr + rulebook_len,
rulebook_len * sizeof(int),
cudaMemcpyDeviceToDevice,
dev_ctx.stream());
#endif
// Compared with thrust::sort_by_key, thrust::merge_by_key may achieve higher
// performance, but thrust::merge_by_key is limited by the data size
#ifdef PADDLE_WITH_HIP
thrust::sort_by_key(thrust::hip::par.on(dev_ctx.stream()),
#else
thrust::sort_by_key(thrust::cuda::par.on(dev_ctx.stream()),
#endif
unique_key_ptr,
unique_key_ptr + rulebook_len,
out_index_ptr);
// 4. unique
thrust::pair<int*, int*> new_end =
#ifdef PADDLE_WITH_HIP
thrust::unique_by_key(thrust::hip::par.on(dev_ctx.stream()),
#else
thrust::unique_by_key(thrust::cuda::par.on(dev_ctx.stream()),
#endif
unique_key_ptr,
unique_key_ptr + rulebook_len,
unique_value_ptr);
// thrust::distance doesn't support stream parameters
// const int out_non_zero_num = thrust::distance(unique_key_ptr,
// new_end.first);
DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>(unique_key_ptr,
new_end.first,
rulebook_ptr + 2 * kernel_size * non_zero_num - 1);
int out_non_zero_num = 0;
#ifdef PADDLE_WITH_HIP
phi::backends::gpu::GpuMemcpyAsync(
&out_non_zero_num,
rulebook_ptr + 2 * kernel_size * non_zero_num - 1,
sizeof(int),
hipMemcpyDeviceToHost,
dev_ctx.stream());
#else
phi::backends::gpu::GpuMemcpyAsync(
&out_non_zero_num,
rulebook_ptr + 2 * kernel_size * non_zero_num - 1,
sizeof(int),
cudaMemcpyDeviceToHost,
dev_ctx.stream());
#endif
dev_ctx.Wait();
// 5. update out_indices and rulebook by unique_value_ptr
const int64_t sparse_dim = 4;
DenseTensorMeta indices_meta(
DataType::INT32, {sparse_dim, out_non_zero_num}, DataLayout::NCHW);
DenseTensorMeta values_meta(
x.dtype(), {out_non_zero_num, kernel_dims[4]}, x.layout());
phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta));
phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta));
dev_ctx.Alloc(
&out_indices, out_indices.dtype(), sizeof(int) * out_indices.numel());
int* out_indices_ptr = out_indices.data<int>();
config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1);
UpdateIndexKernel<<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(unique_key_ptr,
unique_value_ptr,
out_index_ptr,
out_non_zero_num,
rulebook_len,
d_out_dims,
out_indices_ptr,
rulebook_ptr + rulebook_len);
out->SetMember(out_indices, out_values, out_dims, true);
return rulebook_len;
}
/**
* x: (N, D, H, W, C)
* kernel: (D, H, W, C, OC)
* out: (N, D, H, W, OC)
**/
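// Shape example (mirroring the sparse conv unit tests in this change): x with
// dims (1, 1, 5, 5, 1), kernel (1, 3, 3, 1, 1), zero padding, unit stride and
// unit dilation over (D, H, W) give out dims (1, 1, 3, 3, 1).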
template <typename T, typename Context>
void Conv3dKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
SparseCooTensor* out,
DenseTensor* rulebook) {
// update padding and dilation
// Currently, only support x.layout is NDHWC, groups = 1
// if x.layout != NDHWC then transpose(x), transpose(weight)
const auto& x_dims = x.dims();
const auto& kernel_dims = kernel.dims();
int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
DDim out_dims = {1, 1, 1, 1, 1};
GetOutShape(x_dims, kernel_dims, paddings, dilations, strides, &out_dims);
out->set_dims(out_dims);
const int in_channels = kernel_dims[3];
const int out_channels = kernel_dims[4];
std::vector<int> offsets(kernel_size + 1), h_counter(kernel_size);
// Second algorithm:
// https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf
// 1. product rulebook
DenseTensorMeta counter_meta(
DataType::INT32, {kernel_size}, DataLayout::NCHW);
DenseTensorMeta offsets_meta(
DataType::INT32, {kernel_size}, DataLayout::NCHW);
DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta));
DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(offsets_meta));
DenseTensor out_index = phi::Empty<int, Context>(dev_ctx);
DenseTensor unique_key = phi::Empty<int, Context>(dev_ctx);
DenseTensor unique_value = phi::Empty<int, Context>(dev_ctx);
int n = ProductRuleBook<T, Context>(dev_ctx,
x,
kernel,
paddings,
dilations,
strides,
out_dims,
rulebook,
&counter_per_kernel,
&offsets_per_kernel,
&out_index,
&unique_key,
&unique_value,
out,
&h_counter,
&offsets);
const int* counter_ptr = counter_per_kernel.data<int>();
const int* offsets_ptr = offsets_per_kernel.data<int>();
// 2. gather
DenseTensorMeta in_features_meta(
x.dtype(), {n, in_channels}, DataLayout::NCHW);
DenseTensorMeta out_features_meta(
x.dtype(), {n, out_channels}, DataLayout::NCHW);
phi::DenseTensor in_features =
phi::Empty(dev_ctx, std::move(in_features_meta));
phi::DenseTensor out_features =
phi::Empty(dev_ctx, std::move(out_features_meta));
dev_ctx.Alloc(
&in_features, in_features.dtype(), sizeof(T) * in_features.numel());
T* in_features_ptr = in_features.data<T>();
dev_ctx.Alloc(
&out_features, out_features.dtype(), sizeof(T) * out_features.numel());
T* out_features_ptr = out_features.data<T>();
auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1);
GatherKernel<T, int><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(x.non_zero_elements().data<T>(),
rulebook->data<int>(),
in_features_ptr,
n,
in_channels);
// 3. call gemm for every weight
auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
auto* out_values = out->mutable_non_zero_elements();
dev_ctx.Alloc(
out_values, out_values->dtype(), sizeof(T) * out_values->numel());
T* out_values_ptr = out_values->data<T>();
const T* kernel_ptr = kernel.data<T>();
for (int i = 0; i < kernel_size; i++) {
if (h_counter[i] <= 0) {
continue;
}
// call gemm: (n, in_channels) * (in_channels, out_channels)
const int M = h_counter[i];
const int K = in_channels;
const int N = out_channels;
T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels;
const T* tmp_kernel_ptr = kernel_ptr + i * K * N;
T* tmp_out_ptr = out_features_ptr + offsets[i] * out_channels;
blas.GEMM(CblasNoTrans,
CblasNoTrans,
M,
N,
K,
static_cast<T>(1),
tmp_in_ptr,
tmp_kernel_ptr,
static_cast<T>(0),
tmp_out_ptr);
}
// 4. scatter
config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, out->nnz() * out_channels, 1);
ScatterKernel<T><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(out_features_ptr,
unique_value.data<int>(),
out_index.data<int>(),
out->nnz(),
n,
out_channels,
out_values_ptr);
}
} // namespace sparse
} // namespace phi
PD_REGISTER_KERNEL(sparse_conv3d,
GPU,
ALL_LAYOUT,
phi::sparse::Conv3dKernel,
float,
double,
phi::dtype::float16) {
kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature BatchNormOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("batch_norm",
{"X", "Scale", "Bias", "Mean", "Variance"},
{"momentum",
"epsilon",
"data_layout",
"is_test",
"use_global_stats",
"trainable_statistics",
"fuse_with_relu"},
{"Y",
"MeanOut",
"VarianceOut",
"SavedMean",
"SavedVariance",
"ReserveSpace"});
}
KernelSignature BatchNormGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"batch_norm_grad",
{GradVarName("Y"),
"X",
"Scale",
"Bias",
"SavedMean",
"SavedVariance",
"ReserveSpace",
"Mean",
"Variance"},
{"momentum",
"epsilon",
"data_layout",
"is_test",
"use_global_stats",
"trainable_statistics",
"fuse_with_relu"},
{GradVarName("X"), GradVarName("Scale"), GradVarName("Bias")});
}
KernelSignature BatchNormGradGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("batch_norm_grad_grad",
{"DDX",
"DDScale",
"DDBias",
"DY",
"X",
"Scale",
"SavedMean",
"SavedVariance",
"Mean",
"Variance"},
{"momentum",
"epsilon",
"data_layout",
"is_test",
"use_global_stats",
"trainable_statistics",
"fuse_with_relu"},
{"DX", "DScale", "DDY"});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(batch_norm, phi::BatchNormOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(batch_norm_grad,
phi::BatchNormGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(batch_norm_grad_grad,
phi::BatchNormGradGradOpArgumentMapping);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature GaussianRandomOpArgumentMapping(
const ArgumentMappingContext& ctx) {
if (ctx.InputSize("ShapeTensorList") > 0) {
return KernelSignature("gaussian_random",
{},
{"ShapeTensorList", "mean", "std", "seed", "dtype"},
{"Out"});
}
const auto& shape = paddle::any_cast<std::vector<int64_t>>(ctx.Attr("shape"));
if (ctx.HasInput("ShapeTensor") && shape.empty()) {
return KernelSignature("gaussian_random",
{},
{"ShapeTensor", "mean", "std", "seed", "dtype"},
{"Out"});
}
return KernelSignature("gaussian_random",
{},
{"shape", "mean", "std", "seed", "dtype"},
{"Out"});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(gaussian_random,
phi::GaussianRandomOpArgumentMapping);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature PadGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("pad_grad",
{GradVarName("Out")},
{"paddings", "pad_value"},
{GradVarName("X")});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(pad_grad, phi::PadGradOpArgumentMapping);
......@@ -25,3 +25,4 @@ cc_test(test_concat_api SRCS test_concat_api.cc DEPS phi_tensor phi_api phi_api_
cc_test(test_split_api SRCS test_split_api.cc DEPS phi_tensor phi_api phi_api_utils)
cc_test(test_data_transform SRCS test_data_transform.cc DEPS phi_tensor phi_api phi_api_utils)
cc_test(test_sparse_utils_api SRCS test_sparse_utils_api.cc DEPS phi_tensor phi_api phi_api_utils)
cc_test(test_sparse_conv_api SRCS test_sparse_conv_api.cc DEPS phi_tensor phi_api phi_api_utils)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See
the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <memory>
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/api/include/sparse_api.h"
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
template <typename T>
void TestConv3dBase(const std::vector<int>& indices,
const std::vector<T>& features,
const phi::DDim& x_dims,
const std::vector<T>& kernel,
const phi::DDim& kernel_dims,
const std::vector<int>& correct_out_indices,
const std::vector<T>& correct_out_features,
const phi::DDim& correct_out_dims,
const int non_zero_num,
const std::vector<int>& paddings,
const std::vector<int>& strides,
const std::vector<int>& dilations,
const float diff = 1e-3) {
const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace());
const int in_channels = kernel_dims[3];
const int out_channels = kernel_dims[4];
phi::DenseTensor indices_tensor(
alloc.get(),
phi::DenseTensorMeta(
phi::DataType::INT32, {4, non_zero_num}, phi::DataLayout::NCHW));
memcpy(
indices_tensor.data<int>(), indices.data(), indices.size() * sizeof(int));
phi::DenseTensor features_tensor(
alloc.get(),
phi::DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
{non_zero_num, in_channels},
phi::DataLayout::NHWC));
memcpy(
features_tensor.data<T>(), features.data(), features.size() * sizeof(T));
auto x_tensor = std::make_shared<phi::SparseCooTensor>(
indices_tensor, features_tensor, x_dims);
paddle::experimental::Tensor x(x_tensor);
auto kernel_tensor = std::make_shared<phi::DenseTensor>(
alloc.get(),
phi::DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
kernel_dims,
phi::DataLayout::NHWC));
paddle::experimental::Tensor weight(kernel_tensor);
memcpy(kernel_tensor->mutable_data<T>(paddle::platform::CPUPlace()),
kernel.data(),
kernel.size() * sizeof(T));
if (!std::is_same<T, phi::dtype::float16>::value) {
auto outs = paddle::experimental::sparse::conv3d(
x, weight, paddings, dilations, strides, 1);
auto out = std::dynamic_pointer_cast<phi::SparseCooTensor>(
std::get<0>(outs).impl());
ASSERT_EQ(correct_out_dims.size(), out->dims().size());
for (int i = 0; i < correct_out_dims.size(); i++) {
ASSERT_EQ(correct_out_dims[i], out->dims()[i]);
}
ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, out->nnz());
int cmp_indices = memcmp(correct_out_indices.data(),
out->non_zero_indices().data<int>(),
correct_out_indices.size() * sizeof(int));
ASSERT_EQ(cmp_indices, 0);
for (uint64_t i = 0; i < correct_out_features.size(); i++) {
float tmp = std::fabs(static_cast<float>(
correct_out_features[i] - out->non_zero_elements().data<T>()[i]));
ASSERT_LT(tmp, diff);
}
}
}
void TestConv3d(const std::vector<int>& indices,
const std::vector<float>& features,
const phi::DDim& x_dims,
const std::vector<float>& kernel,
const phi::DDim& kernel_dims,
const std::vector<int>& correct_out_indices,
const std::vector<float>& correct_out_features,
const phi::DDim& correct_out_dims,
const int non_zero_num,
const std::vector<int>& paddings,
const std::vector<int>& strides,
const std::vector<int>& dilations) {
// test float
TestConv3dBase<float>(indices,
features,
x_dims,
kernel,
kernel_dims,
correct_out_indices,
correct_out_features,
correct_out_dims,
non_zero_num,
paddings,
strides,
dilations);
}
TEST(API, sparse_conv2d) {
const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace());
const int in_channels = 1;
const int out_channels = 1;
phi::DDim x_dims = {1, 1, 5, 5, in_channels};
phi::DDim kernel_dims = {1, 3, 3, in_channels, out_channels};
phi::DDim out_dims = {1, 1, 3, 3, out_channels};
std::vector<int> paddings = {0, 0, 0};
std::vector<int> strides = {1, 1, 1};
std::vector<int> dilations = {1, 1, 1};
const int non_zero_num = 3;
std::vector<int> indices_flatten = {0, 0, 0, 0, 0, 0, 0, 4, 0, 3, 2, 4};
std::vector<float> features = {-0.79394531, -0.3125, -0.55029297};
// 1*3*3=9 kernel values
std::vector<float> kernel = {0.65820312,
0.75048828,
0.21411133,
0.17370605,
0.85546875,
0.53076172,
0.28833008,
0.71044922,
0.00659943};
std::vector<int> out_indices_flatten = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 2, 2, 2, 1, 2, 0, 1, 2};
std::vector<float> out_features = {
-0.17004, -0.71338, -0.00206, -0.22205, -0.09009};
TestConv3d(indices_flatten,
features,
x_dims,
kernel,
kernel_dims,
out_indices_flatten,
out_features,
out_dims,
non_zero_num,
paddings,
strides,
dilations);
}
......@@ -15,6 +15,7 @@ limitations under the License. */
#include <gtest/gtest.h>
#include <memory>
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h"
......@@ -151,6 +152,107 @@ void TestConv3dBase(const std::vector<int>& indices,
f_verify(grads[1].data<T>(), kernel_grad);
}
}
// test gpu
#if defined(PADDLE_WITH_CUDA)
phi::GPUContext dev_ctx_gpu;
dev_ctx_gpu.PartialInitWithoutAllocator();
dev_ctx_gpu.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(dev_ctx_gpu.GetPlace(), dev_ctx_gpu.stream())
.get());
dev_ctx_gpu.SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
.get());
dev_ctx_gpu.PartialInitWithAllocator();
DenseTensor d_indices_tensor = phi::Empty(
dev_ctx_gpu,
DenseTensorMeta(DataType::INT32, {4, non_zero_num}, DataLayout::NCHW));
dev_ctx_gpu.Alloc(&d_indices_tensor,
d_indices_tensor.dtype(),
sizeof(int) * d_indices_tensor.numel());
phi::Copy(
dev_ctx_gpu, indices_tensor, phi::GPUPlace(), true, &d_indices_tensor);
DenseTensor d_features_tensor = phi::Empty(
dev_ctx_gpu,
DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
{non_zero_num, in_channels},
DataLayout::NHWC));
dev_ctx_gpu.Alloc(&d_features_tensor,
d_features_tensor.dtype(),
sizeof(T) * d_features_tensor.numel());
phi::Copy(
dev_ctx_gpu, features_tensor, phi::GPUPlace(), true, &d_features_tensor);
SparseCooTensor d_x_tensor(d_indices_tensor, d_features_tensor, x_dims);
DenseTensor d_kernel_tensor = phi::Empty(
dev_ctx_gpu,
DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
kernel_dims,
DataLayout::NHWC));
dev_ctx_gpu.Alloc(&d_kernel_tensor,
d_kernel_tensor.dtype(),
sizeof(T) * d_kernel_tensor.numel());
phi::Copy(
dev_ctx_gpu, kernel_tensor, phi::GPUPlace(), true, &d_kernel_tensor);
DenseTensor d_rulebook = phi::Empty<int, phi::GPUContext>(dev_ctx_gpu);
SparseCooTensor d_out = sparse::Conv3d<T>(dev_ctx_gpu,
d_x_tensor,
d_kernel_tensor,
paddings,
dilations,
strides,
1,
&d_rulebook);
ASSERT_EQ(correct_out_dims.size(), d_out.dims().size());
ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, d_out.nnz());
for (int i = 0; i < correct_out_dims.size(); i++) {
ASSERT_EQ(correct_out_dims[i], d_out.dims()[i]);
}
DenseTensor h_indices_tensor = phi::Empty(
dev_ctx_cpu,
DenseTensorMeta(DataType::INT32, {4, d_out.nnz()}, DataLayout::NCHW));
dev_ctx_cpu.Alloc(&h_indices_tensor,
h_indices_tensor.dtype(),
sizeof(int) * h_indices_tensor.numel());
phi::Copy(dev_ctx_gpu,
d_out.non_zero_indices(),
phi::CPUPlace(),
true,
&h_indices_tensor);
int cmp_indices2 = memcmp(correct_out_indices.data(),
h_indices_tensor.data<int>(),
correct_out_indices.size() * sizeof(int));
ASSERT_EQ(cmp_indices2, 0);
DenseTensor h_features_tensor = phi::Empty(
dev_ctx_cpu,
DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
{d_out.nnz()},
d_out.layout()));
dev_ctx_cpu.Alloc(&h_features_tensor,
h_features_tensor.dtype(),
sizeof(T) * h_features_tensor.numel());
phi::Copy(dev_ctx_gpu,
d_out.non_zero_elements(),
phi::CPUPlace(),
true,
&h_features_tensor);
for (uint64_t i = 0; i < correct_out_features.size(); i++) {
float tmp = std::fabs(static_cast<float>(correct_out_features[i] -
h_features_tensor.data<T>()[i]));
ASSERT_LT(tmp, diff);
}
#endif
}
void TestConv3d(const std::vector<int>& indices,
......
......@@ -1430,6 +1430,22 @@ class Fleet(object):
# cache original feed forward program
self.origin_main_program = loss.block.program
# add distributed attr
if not hasattr(self.origin_main_program, "distributed_info_"):
setattr(self.origin_main_program, "distributed_info_", dict())
self.origin_main_program.distributed_info_[
"dp_degree"] = self._user_defined_strategy.sharding_configs[
"dp_degree"]
self.origin_main_program.distributed_info_[
"mp_degree"] = self._user_defined_strategy.sharding_configs[
"mp_degree"]
self.origin_main_program.distributed_info_[
"pp_degree"] = self._user_defined_strategy.sharding_configs[
"pp_degree"]
self.origin_main_program.distributed_info_[
"sharding_degree"] = self._user_defined_strategy.sharding_configs[
"sharding_degree"]
context["origin_main_program"] = self.origin_main_program
context["loss"] = loss
if startup_program == None:
......
......@@ -351,10 +351,10 @@ endif()
set_tests_properties(test_graph PROPERTIES TIMEOUT 120)
set_tests_properties(test_quantization_pass PROPERTIES TIMEOUT 120)
set_tests_properties(test_imperative_qat_channelwise PROPERTIES TIMEOUT 120)
set_tests_properties(test_user_defined_quantization PROPERTIES TIMEOUT 120)
set_tests_properties(test_imperative_qat PROPERTIES TIMEOUT 120)
set_tests_properties(test_imperative_out_scale PROPERTIES TIMEOUT 120)
set_tests_properties(test_imperative_qat_channelwise PROPERTIES TIMEOUT 200)
set_tests_properties(test_user_defined_quantization PROPERTIES TIMEOUT 200)
set_tests_properties(test_imperative_qat PROPERTIES TIMEOUT 200)
set_tests_properties(test_imperative_out_scale PROPERTIES TIMEOUT 200)
if(LINUX AND WITH_MKLDNN)
set_tests_properties(test_quant2_int8_mobilenetv1_mkldnn PROPERTIES TIMEOUT 120)
set_tests_properties(convert_model2dot_ernie PROPERTIES TIMEOUT 120)
......
......@@ -26,7 +26,7 @@ import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid import core
from paddle.fluid.optimizer import AdamOptimizer
from paddle.fluid.framework import IrGraph
from paddle.fluid.framework import IrGraph, _test_eager_guard
from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware
from paddle.fluid.dygraph.container import Sequential
from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
......@@ -122,7 +122,7 @@ class ImperativeLenet(fluid.dygraph.Layer):
class TestImperativeOutSclae(unittest.TestCase):
def test_out_scale_acc(self):
def func_out_scale_acc(self):
seed = 1000
lr = 0.001
......@@ -166,9 +166,14 @@ class TestImperativeOutSclae(unittest.TestCase):
loss_list[i] > loss_list[i + 1],
msg='Failed to do the imperative qat.')
def test_out_scale_acc(self):
with _test_eager_guard():
self.func_out_scale_acc()
self.func_out_scale_acc()
class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase):
def test_save_quantized_model(self):
def func_save_quantized_model(self):
lr = 0.001
load_param_path = "test_save_quantized_model/lenet.pdparams"
......@@ -206,6 +211,11 @@ class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase):
loss_list[i] > loss_list[i + 1],
msg='Failed to do the imperative qat.')
def test_save_quantized_model(self):
with _test_eager_guard():
self.func_save_quantized_model()
self.func_save_quantized_model()
if __name__ == '__main__':
unittest.main()
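Note: every quantization test touched in this commit follows the same recipe: the original body is renamed to func_*, and a new test_* wrapper runs it once under _test_eager_guard() (eager mode) and once without it (legacy dygraph). A generic helper capturing that recipe, as an illustration only; the helper name is ours:

from paddle.fluid.framework import _test_eager_guard

def run_in_both_modes(test_body):
    # Run a test body under the eager guard, then again in legacy dygraph mode.
    with _test_eager_guard():
        test_body()
    test_body()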
......@@ -29,6 +29,7 @@ import paddle.fluid as fluid
from paddle.fluid.contrib.slim.quantization import *
from paddle.fluid.log_helper import get_logger
from paddle.dataset.common import download
from paddle.fluid.framework import _test_eager_guard
from imperative_test_utils import fix_model_dict, ImperativeLenet, ImperativeLinearBn
from imperative_test_utils import ImperativeLinearBn_hook
......@@ -194,7 +195,7 @@ class TestImperativePTQ(unittest.TestCase):
break
return top1_correct_num / total_num
def test_ptq(self):
def func_ptq(self):
start_time = time.time()
self.set_vars()
......@@ -244,9 +245,14 @@ class TestImperativePTQ(unittest.TestCase):
end_time = time.time()
print("total time: %ss \n" % (end_time - start_time))
def test_ptq(self):
with _test_eager_guard():
self.func_ptq()
self.func_ptq()
class TestImperativePTQfuse(TestImperativePTQ):
def test_ptq(self):
def func_ptq(self):
start_time = time.time()
self.set_vars()
......@@ -305,6 +311,11 @@ class TestImperativePTQfuse(TestImperativePTQ):
end_time = time.time()
print("total time: %ss \n" % (end_time - start_time))
def test_ptq(self):
with _test_eager_guard():
self.func_ptq()
self.func_ptq()
class TestImperativePTQHist(TestImperativePTQ):
def set_vars(self):
......
......@@ -32,7 +32,7 @@ from paddle.nn import Linear, Conv2D, Softmax, Conv2DTranspose
from paddle.fluid.log_helper import get_logger
from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
from paddle.nn.quant.quant_layers import QuantizedConv2D, QuantizedConv2DTranspose
from paddle.fluid.framework import _test_eager_guard
from imperative_test_utils import fix_model_dict, ImperativeLenet
paddle.enable_static()
......@@ -55,7 +55,7 @@ class TestImperativeQat(unittest.TestCase):
self.activation_quantize_type = 'moving_average_abs_max'
print('weight_quantize_type', self.weight_quantize_type)
def test_qat(self):
def func_qat(self):
self.set_vars()
imperative_qat = ImperativeQuantAware(
......@@ -193,6 +193,11 @@ class TestImperativeQat(unittest.TestCase):
np.allclose(after_save, before_save.numpy()),
msg='Failed to save the inference quantized model.')
def test_qat(self):
with _test_eager_guard():
self.func_qat()
self.func_qat()
if __name__ == '__main__':
unittest.main()
......@@ -27,7 +27,7 @@ import paddle.fluid as fluid
from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware
from paddle.fluid.log_helper import get_logger
from paddle.dataset.common import download
from paddle.fluid.framework import _test_eager_guard
from imperative_test_utils import fix_model_dict, ImperativeLenet
os.environ["CPU_NUM"] = "1"
......
......@@ -30,7 +30,7 @@ from paddle.fluid.dygraph import Pool2D
from paddle.fluid.dygraph import Linear
from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose
from paddle.fluid.log_helper import get_logger
from paddle.fluid.framework import _test_eager_guard
os.environ["CPU_NUM"] = "1"
_logger = get_logger(
......@@ -157,7 +157,7 @@ class TestUserDefinedActPreprocess(unittest.TestCase):
_logger.info("test act_preprocess")
self.imperative_qat = ImperativeQuantAware(act_preprocess_layer=PACT)
def test_quant_aware_training(self):
def func_quant_aware_training(self):
imperative_qat = self.imperative_qat
seed = 1
np.random.seed(seed)
......@@ -243,6 +243,11 @@ class TestUserDefinedActPreprocess(unittest.TestCase):
train(lenet)
test(lenet)
def test_quant_aware_training(self):
with _test_eager_guard():
self.func_quant_aware_training()
self.func_quant_aware_training()
class TestUserDefinedWeightPreprocess(TestUserDefinedActPreprocess):
def setUp(self):
......
......@@ -32,6 +32,7 @@ from paddle.fluid.dygraph.nn import Pool2D
from paddle.fluid.log_helper import get_logger
from imperative_test_utils import fix_model_dict, train_lenet, ImperativeLenetWithSkipQuant
from paddle.fluid.framework import _test_eager_guard
os.environ["CPU_NUM"] = "1"
if core.is_compiled_with_cuda():
......@@ -42,7 +43,8 @@ _logger = get_logger(
class TestImperativeOutSclae(unittest.TestCase):
def test_out_scale_acc(self):
def func_out_scale_acc(self):
paddle.disable_static()
seed = 1000
lr = 0.1
......@@ -125,6 +127,11 @@ class TestImperativeOutSclae(unittest.TestCase):
if find_matmul:
self.assertTrue(matmul_skip_count == 1)
def test_out_scale_acc(self):
with _test_eager_guard():
self.func_out_scale_acc()
self.func_out_scale_acc()
if __name__ == '__main__':
unittest.main()
......@@ -155,8 +155,7 @@ def prune_model(main_program=None,
n=2,
m=4,
mask_algo='mask_1d',
with_mask=True,
sharding=False):
with_mask=True):
r"""
Pruning parameters of supported layers in :attr:`main_program` via
specified mask generation function given by :attr:`mask_algo`. This
......@@ -179,7 +178,6 @@ def prune_model(main_program=None,
mask_algo (string, optional): The function name to generate the sparse mask. Default is `mask_1d`.
The valid inputs should be one of 'mask_1d', 'mask_2d_greedy' and 'mask_2d_best'.
with_mask (bool, optional): To prune mask Variables related to parameters or not. True means pruning them as well, False means not. Default is True.
sharding (bool, optional): Whether to turn on sharding (model parallel) during training. Please consider turning it ON when encountering OOM using sharding. Default is False.
Returns:
dictionary: A dictionary with key: `parameter name` (string) and value: its corresponding mask Variable.
Examples:
......@@ -221,7 +219,10 @@ def prune_model(main_program=None,
# Must call `exe.run(startup_program)` first before calling `sparsity.prune_model`
sparsity.prune_model(main_program, mask_algo='mask_2d_best')
"""
if sharding:
if main_program is not None and hasattr(
main_program,
"distributed_info_") and main_program.distributed_info_[
"sharding_degree"] > 1 and paddle.fluid.is_compiled_with_cuda():
gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
place = paddle.CUDAPlace(gpu_id)
else:
......
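Note: with the sharding parameter removed, whether pruning is pinned to the selected GPU is now decided from main_program.distributed_info_ written by Fleet above. A minimal call sketch, assuming exe, startup_program and main_program come from the docstring's example:

exe.run(startup_program)   # initialize parameters first, as the docstring requires
masks = sparsity.prune_model(main_program, mask_algo='mask_1d', with_mask=True)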
......@@ -99,18 +99,19 @@ def param_guard(parameters):
yield
def _convert_into_variable(var_base):
def _convert_into_variable(tensor):
"""
Convert VarBase or eager Tensor into Variable.
"""
if isinstance(var_base, core.VarBase):
if isinstance(tensor, (core.eager.Tensor, core.VarBase)):
# Check whether has been created before.
new_var = var_base.block._find_var_recursive(var_base.name)
new_var = tensor.block._find_var_recursive(tensor.name)
if new_var is not None:
assert isinstance(new_var, framework.Variable)
# Convert ParamBase into Parameter with same attributes in dy2stat.
elif isinstance(var_base, framework.ParamBase):
new_var = var_base._to_static_var(to_parameter=True)
elif isinstance(tensor,
(framework.EagerParamBase, framework.ParamBase)):
new_var = tensor._to_static_var(to_parameter=True)
else:
# Note(Aurelius84): Convert VarBase in self._buffers into Variable with
# same attributes and set persistable=True to allow saving this var.
......@@ -120,13 +121,13 @@ def _convert_into_variable(var_base):
# But if its shape is empty while created from `create_variable()`, we consider this buffer
# non-persistable. See case of `drop_state` in lstm api.
is_persistable = len(var_base.shape) > 0
is_persistable = len(tensor.shape) > 0
new_var = var_base._to_static_var(
new_var = tensor._to_static_var(
to_parameter=False, persistable=is_persistable)
return new_var
else:
return var_base
return tensor
def enabled():
......
......@@ -61,7 +61,8 @@ class NestSequence(object):
def _get_var_ids(self):
var_ids = []
for idx, var in enumerate(self.__input_list):
if isinstance(var, (framework.Variable, core.VarBase)):
if isinstance(var, (framework.Variable, core.VarBase,
core.eager.Tensor)):
var_ids.append(idx)
return var_ids
......@@ -73,7 +74,8 @@ class NestSequence(object):
if need_check:
warning_types = set()
for var in self.__input_list:
if not isinstance(var, (framework.Variable, core.VarBase)):
if not isinstance(var, (framework.Variable, core.VarBase,
core.eager.Tensor)):
warning_types.add(type(var))
if warning_types:
logging_utils.warn(
......@@ -301,10 +303,17 @@ class PartialProgramLayer:
for name in block.vars:
if "@GRAD" in name:
var_desc = block.vars[name].desc
var_base = None
if not core._in_eager_mode():
var_base = core.VarBase(var_desc.dtype(),
var_desc.shape(),
var_desc.name(),
var_desc.type(), False)
else:
var_base = core.eager.Tensor(var_desc.dtype(),
var_desc.shape(),
var_desc.name(),
var_desc.type(), False)
double_grads.append(var_base)
return self._valid_vars(double_grads)
......@@ -386,13 +395,22 @@ class PartialProgramLayer:
expected_place = framework._current_expected_place()
for i, value in enumerate(flatten_inputs):
if isinstance(value, np.ndarray):
var = None
if not core._in_eager_mode():
var = core.VarBase(
value=value,
name=self._inputs[i].desc.name(),
persistable=False,
place=expected_place,
zero_copy=True)
elif isinstance(value, core.VarBase):
else:
var = core.eager.Tensor(
value=value,
name=self._inputs[i].desc.name(),
persistable=False,
place=expected_place,
zero_copy=True)
elif isinstance(value, (core.VarBase, core.eager.Tensor)):
# NOTE(Aurelius84): If var is on CPUPlace, it will be transformed multi times
# into CUDAPlace when it's as input of multi Ops. so we move it in advance
# to avoid this problem.
......@@ -411,9 +429,16 @@ class PartialProgramLayer:
var = self._outputs[var_id]
assert isinstance(var, framework.Variable)
var_desc = var.desc
varbase = None
if not core._in_eager_mode():
var_base = core.VarBase(var_desc.dtype(),
var_desc.shape(),
var_desc.name(), var_desc.type(), False)
else:
var_base = core.eager.Tensor(var_desc.dtype(),
var_desc.shape(),
var_desc.name(),
var_desc.type(), False)
return var_base
# Create VarBase to receive output data.
......@@ -423,9 +448,16 @@ class PartialProgramLayer:
def _create_scope_vec(self):
# Hold forward variables
tmp_scope_vec = None
if not core._in_eager_mode():
tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [],
"program_out_scope",
core.VarDesc.VarType.STEP_SCOPES, True)
# TODO(jiabin): Support this later.
# else:
# tmp_scope_vec = core.eager.Tensor(core.VarDesc.VarType.FP32, [],
# "program_out_scope",
# core.VarDesc.VarType.STEP_SCOPES, True)
inner_scope = core.Scope()
tmp_scope_vec.value().set_scope(inner_scope)
......@@ -450,7 +482,8 @@ class PartialProgramLayer:
return main_program.clone(for_test=True)
def _is_no_value(self, var):
if isinstance(var, core.VarBase) and var.shape == [1]:
if isinstance(var,
(core.VarBase, core.eager.Tensor)) and var.shape == [1]:
# NOTE: .numpy() will insert a MemcpySync operation, which hurts performance.
if var.numpy()[0] == RETURN_NO_VALUE_MAGIC_NUM:
return True
......@@ -460,7 +493,7 @@ class PartialProgramLayer:
"""
Removes invalid value for various-length return statement
"""
if isinstance(out_vars, core.VarBase):
if isinstance(out_vars, (core.VarBase, core.eager.Tensor)):
if self._is_no_value(out_vars):
return None
return out_vars
......@@ -527,7 +560,7 @@ class PartialProgramLayer:
param_and_buffer_names_set = set()
for i, var in enumerate(self._params):
# self._params contains parameters and buffers with persistable=True.
if not isinstance(var, core.VarBase):
if not isinstance(var, (core.VarBase, core.eager.Tensor)):
raise TypeError(
'Type of self._params[{}] in PartialProgramLayer should be Parameter or Variable, but received {}.'.
format(i, type(var)))
......@@ -559,10 +592,18 @@ def _create_fake_var():
"""
Create a fake_var (force on CPU) to handle empty input or output
"""
if not core._in_eager_mode():
return [
core.VarBase(core.VarDesc.VarType.FP32, [], "Fake_var",
core.VarDesc.VarType.RAW, False)
]
else:
return []
# TODO(jiabin): Support this later
# return [
# core.eager.Tensor(core.VarDesc.VarType.FP32, [], "Fake_var",
# core.VarDesc.VarType.RAW, False)
# ]
def partial_program_from(concrete_program):
......
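Note: the eager-mode branches added above repeat one idiom: build a core.VarBase when not in eager mode, otherwise a core.eager.Tensor, with identical constructor arguments. A standalone sketch of that idiom; the helper name is ours, not part of the commit:

from paddle.fluid import core

def _make_var_from_desc(var_desc):
    # Mirrors the mode-dependent construction used in PartialProgramLayer above.
    if not core._in_eager_mode():
        return core.VarBase(var_desc.dtype(), var_desc.shape(),
                            var_desc.name(), var_desc.type(), False)
    return core.eager.Tensor(var_desc.dtype(), var_desc.shape(),
                             var_desc.name(), var_desc.type(), False)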
......@@ -25,7 +25,7 @@ import threading
import six
import paddle
from paddle.fluid import core
from paddle.fluid import core, dygraph
from paddle.fluid.compiler import BuildStrategy, CompiledProgram, ExecutionStrategy
from paddle.fluid.data_feeder import check_type
from paddle.fluid.layers.utils import flatten, pack_sequence_as
......@@ -898,6 +898,7 @@ def save(layer, path, input_spec=None, **configs):
state_var_dict[var.name] = var
# 3. share parameters from Layer to scope & record var info
with dygraph.guard():
for param_or_buffer in concrete_program.parameters:
# share to scope
if param_or_buffer.type == core.VarDesc.VarType.VOCAB:
......@@ -915,12 +916,14 @@ def save(layer, path, input_spec=None, **configs):
if param_or_buffer.name not in extra_var_info:
extra_info_dict = dict()
if param_or_buffer.name in state_names_dict:
extra_info_dict['structured_name'] = state_names_dict[
extra_info_dict[
'structured_name'] = state_names_dict[
param_or_buffer.name]
extra_info_dict[
'stop_gradient'] = param_or_buffer.stop_gradient
if isinstance(param_or_buffer, ParamBase):
extra_info_dict['trainable'] = param_or_buffer.trainable
extra_info_dict[
'trainable'] = param_or_buffer.trainable
extra_var_info[param_or_buffer.name] = extra_info_dict
# 4. build input & output of save_inference_model
......
......@@ -94,7 +94,7 @@ def monkey_patch_varbase():
# Note: getattr(self, attr, None) will call x.grad=x.gradient(), but gradient() is only available in dygraph.
# It will fail. So, for properties that differ between dynamic and static graph, we should not call getattr(self, attr, None).
attr_not_need_keys = ['grad', 'T']
if isinstance(self, ParamBase):
if isinstance(self, (ParamBase, EagerParamBase)):
attr_kwargs = self.__dict__.copy()
else:
attr_names = []
......@@ -111,7 +111,7 @@ def monkey_patch_varbase():
attr_kwargs.update(kwargs)
if to_parameter or isinstance(self, ParamBase):
if to_parameter or isinstance(self, (ParamBase, EagerParamBase)):
del attr_kwargs['persistable']
# NOTE(Aurelius84): All parameters should be placed into global block.
attr_kwargs['block'] = attr_kwargs['block'].program.global_block()
......
......@@ -1821,7 +1821,7 @@ def _pack_loaded_dict(load_obj):
@static_only
def _legacy_save(param_dict, model_path, protocol=2):
def get_tensor(var):
if isinstance(var, core.VarBase):
if isinstance(var, (core.VarBase, core.eager.Tensor)):
return var.numpy()
elif isinstance(var, core.LoDTensor):
return np.array(var)
......
......@@ -10148,6 +10148,9 @@ def flatten(x, axis=1, name=None):
check_variable_and_dtype(
x, 'x', ['float32', 'float64', 'int8', 'int32', 'int64', 'uint8'],
'flatten')
if in_dygraph_mode():
return _C_ops.flatten2(x, 'axis', axis)[0]
helper = LayerHelper('flatten', **locals())
if not (isinstance(x, Variable)):
......
......@@ -663,6 +663,8 @@ def assign(input, output=None):
})
if is_inplace and in_dygraph_mode():
# TODO(jiabin): Remove this when we support inplace
if not core._in_eager_mode():
output._bump_inplace_version()
return output
......
......@@ -98,7 +98,7 @@ class TestFleetWithASPSharding(unittest.TestCase):
feeder = fluid.DataFeeder(feed_list=[input_x, input_y], place=place)
exe.run(startup_prog)
sparsity.prune_model(train_prog, sharding=True)
sparsity.prune_model(train_prog)
data = (np.random.randn(64, 32), np.random.randint(2, size=(64, 1)))
exe.run(train_prog, feed=feeder.feed([data]))
......
......@@ -520,6 +520,7 @@ def predict_static(args, data):
paddle.enable_static()
exe = fluid.Executor(args.place)
# load inference model
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(
args.model_save_dir,
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import random
import numpy as np
import os
import shutil
import paddle
from paddle.fluid import core
import datetime
from datetime import timedelta
import paddle.fluid.core as core
from paddle.fluid.framework import _test_eager_guard
from paddle.fluid.dygraph.parallel import ParallelEnv
class TestProcessGroupFp32(unittest.TestCase):
def setUp(self):
paddle.seed(2022)
random.seed(2022)
np.random.seed(2022)
self.config()
def config(self):
self.dtype = "float32"
self.shape = (2, 10, 5)
def test_create_process_group_gloo(self):
with _test_eager_guard():
nranks = ParallelEnv().nranks
rank = ParallelEnv().local_rank
is_master = True if rank == 0 else False
store = paddle.fluid.core.TCPStore("127.0.0.1", 6172, is_master,
nranks, datetime.timedelta(0))
gloo_store = paddle.fluid.core.GlooStore(store)
opt = paddle.fluid.core.GlooOptions()
pg = paddle.fluid.core.ProcessGroupGloo(gloo_store, rank, nranks)
# test allreduce sum
# rank 0
paddle.device.set_device('cpu')
x = np.random.random(self.shape).astype(self.dtype)
tensor_x = paddle.to_tensor(x)
# rank 1
y = np.random.random(self.shape).astype(self.dtype)
tensor_y = paddle.to_tensor(y)
sum_result = x + y
if rank == 0:
task = pg.allreduce(tensor_x)
task.wait()
assert np.array_equal(tensor_x, sum_result)
else:
task = pg.allreduce(tensor_y)
task.wait()
assert np.array_equal(tensor_y, sum_result)
print("test allreduce sum api ok")
# test allreduce max
# rank 0
x = np.random.random(self.shape).astype(self.dtype)
tensor_x = paddle.to_tensor(x)
# rank 1
y = np.random.random(self.shape).astype(self.dtype)
tensor_y = paddle.to_tensor(y)
max_result = paddle.maximum(tensor_x, tensor_y)
if rank == 0:
task = pg.allreduce(tensor_x, core.ReduceOp.MAX)
task.wait()
assert np.array_equal(tensor_x, max_result)
else:
task = pg.allreduce(tensor_y, core.ReduceOp.MAX)
task.wait()
assert np.array_equal(tensor_y, max_result)
print("test allreduce max api ok")
# test broadcast
# rank 0
x = np.random.random(self.shape).astype(self.dtype)
tensor_x = paddle.to_tensor(x)
# rank 1
y = np.random.random(self.shape).astype(self.dtype)
tensor_y = paddle.to_tensor(y)
broadcast_result = paddle.assign(tensor_x)
if rank == 0:
task = pg.broadcast(tensor_x, 0)
task.synchronize()
assert task.is_completed()
assert np.array_equal(broadcast_result, tensor_x)
else:
task = pg.broadcast(tensor_y, 0)
task.synchronize()
assert task.is_completed()
assert np.array_equal(broadcast_result, tensor_y)
print("test broadcast api ok")
if __name__ == "__main__":
unittest.main()
......@@ -144,23 +144,109 @@ class TestProcessGroupFp32(unittest.TestCase):
print("test barrier api ok\n")
# test send/recv
# test allgather
# rank 0
x = np.random.random(self.shape).astype(self.dtype)
y = np.random.random(self.shape).astype(self.dtype)
tensor_x = paddle.to_tensor(x)
tensor_y = paddle.to_tensor(y)
out_shape = list(self.shape)
out_shape[0] *= 2
out = np.random.random(out_shape).astype(self.dtype)
tensor_out = paddle.to_tensor(out)
if pg.rank() == 0:
task = pg.send(tensor_x, dst=1)
task = pg.all_gather(tensor_x, tensor_out)
task.wait()
paddle.device.cuda.synchronize()
# rank 1
else:
task = pg.all_gather(tensor_y, tensor_out)
task.wait()
paddle.device.cuda.synchronize()
out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2])
out_2 = paddle.slice(tensor_out, [0], [out_shape[0] // 2],
[out_shape[0]])
assert np.array_equal(tensor_x, out_1)
assert np.array_equal(tensor_y, out_2)
print("test allgather api ok\n")
# test alltoall
# rank 0
x = np.random.random(self.shape).astype(self.dtype)
y = np.random.random(self.shape).astype(self.dtype)
out1 = np.random.random(self.shape).astype(self.dtype)
out2 = np.random.random(self.shape).astype(self.dtype)
tensor_x = paddle.to_tensor(x)
tensor_y = paddle.to_tensor(y)
task = pg.recv(tensor_y, src=0)
tensor_out1 = paddle.to_tensor(out1)
tensor_out2 = paddle.to_tensor(out2)
raw_tensor_x_2 = paddle.slice(tensor_x, [0], [self.shape[0] // 2],
[self.shape[0]])
raw_tensor_y_1 = paddle.slice(tensor_y, [0], [0],
[self.shape[0] // 2])
if pg.rank() == 0:
task = pg.alltoall(tensor_x, tensor_out1)
task.wait()
paddle.device.cuda.synchronize()
assert np.array_equal(tensor_x, tensor_y)
print("test send/recv api ok\n")
# rank 1
else:
task = pg.alltoall(tensor_y, tensor_out2)
task.wait()
paddle.device.cuda.synchronize()
out1_2 = paddle.slice(tensor_out1, [0], [self.shape[0] // 2],
[self.shape[0]])
out2_1 = paddle.slice(tensor_out2, [0], [0], [self.shape[0] // 2])
if pg.rank() == 0:
assert np.array_equal(out1_2.numpy(), raw_tensor_y_1.numpy())
else:
assert np.array_equal(out2_1, raw_tensor_x_2)
print("test alltoall api ok\n")
# test Reduce
# rank 0
x = np.random.random(self.shape).astype(self.dtype)
y = np.random.random(self.shape).astype(self.dtype)
tensor_x = paddle.to_tensor(x)
tensor_y = paddle.to_tensor(y)
sum_result = tensor_x + tensor_y
if pg.rank() == 0:
task = pg.reduce(tensor_x, 0)
task.wait()
paddle.device.cuda.synchronize()
# rank 1
else:
task = pg.reduce(tensor_y, 0)
task.wait()
paddle.device.cuda.synchronize()
if pg.rank() == 0:
assert np.array_equal(tensor_x, sum_result)
print("test reduce sum api ok\n")
# test Scatter
# rank 0
in_shape = list(self.shape)
in_shape[0] *= 2
x = np.random.random(in_shape).astype(self.dtype)
y = np.random.random(self.shape).astype(self.dtype)
tensor_x = paddle.to_tensor(x)
tensor_y = paddle.to_tensor(y)
if pg.rank() == 0:
task = pg.scatter(tensor_x, tensor_y, 0)
task.wait()
paddle.device.cuda.synchronize()
# rank 1
else:
task = pg.scatter(tensor_x, tensor_y, 0)
task.wait()
paddle.device.cuda.synchronize()
out1 = paddle.slice(tensor_x, [0], [0], [self.shape[0]])
out2 = paddle.slice(tensor_x, [0], [self.shape[0]],
[self.shape[0] * 2])
if pg.rank() == 0:
assert np.array_equal(tensor_y, out1)
else:
assert np.array_equal(tensor_y, out2)
print("test scatter api ok\n")
class TestProcessGroupFp16(TestProcessGroupFp32):
......
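Note: the allgather assertion above depends only on the output layout, so it can be reproduced with plain numpy: rank 0's tensor is expected in the first half of the gathered buffer and rank 1's in the second half.

import numpy as np

shape = (2, 10, 5)                              # same as self.shape in the test
x = np.random.random(shape).astype('float32')   # rank 0 input
y = np.random.random(shape).astype('float32')   # rank 1 input
gathered = np.concatenate([x, y], axis=0)       # layout all_gather writes into tensor_out
assert np.array_equal(gathered[:shape[0]], x)
assert np.array_equal(gathered[shape[0]:], y)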
......@@ -162,6 +162,7 @@ class TestIRPassBase(unittest.TestCase):
for k, v in self.get_strategy().items():
setattr(build_strategy, k, v)
self.check_before_applied(main2, startup2)
apply_build_strategy(main2, startup2, build_strategy,
{"use_cuda": self.use_cuda})
self.check_after_applied(main2, startup2)
......
......@@ -320,7 +320,7 @@ class TestBatchNormOpInference(unittest.TestCase):
def test_check_output(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places:
......@@ -342,13 +342,13 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference):
def test_check_output(self):
places = []
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
places.append(place)
for place in places:
for data_format in ["NCHW", "NHWC"]:
#for data_format in ["NCHW", "NHWC"]:
for data_format in ["NCHW"]:
self.check_with_place(place, data_format, self.dtype,
[2, 3, 4, 5])
self.check_with_place(place, data_format, self.dtype, [2, 3])
......@@ -517,7 +517,7 @@ class TestBatchNormOpTraining(unittest.TestCase):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places:
......@@ -657,7 +657,7 @@ class TestDygraphBatchNormAPIError(unittest.TestCase):
class TestDygraphBatchNormTrainableStats(unittest.TestCase):
def test_dygraph(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for p in places:
shape = [4, 10, 4, 4]
......@@ -678,7 +678,7 @@ class TestDygraphBatchNormTrainableStats(unittest.TestCase):
def test_static(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for p in places:
exe = fluid.Executor(p)
......@@ -716,4 +716,6 @@ class TestDygraphBatchNormOpenReserveSpace(unittest.TestCase):
if __name__ == '__main__':
import paddle
paddle.enable_static()
unittest.main()
......@@ -28,7 +28,7 @@ import paddle
class TestBatchNorm(unittest.TestCase):
def test_name(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for p in places:
with fluid.dygraph.guard(p):
......@@ -36,7 +36,7 @@ class TestBatchNorm(unittest.TestCase):
def test_error(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for p in places:
#paddle.disable_static()
......@@ -83,7 +83,7 @@ class TestBatchNorm(unittest.TestCase):
def test_dygraph(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for p in places:
shape = [4, 10, 4, 4]
......@@ -135,7 +135,7 @@ class TestBatchNorm(unittest.TestCase):
def test_static(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for p in places:
exe = fluid.Executor(p)
......@@ -177,7 +177,7 @@ class TestBatchNormChannelLast(unittest.TestCase):
else:
paddle.set_default_dtype("float64")
self.places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
if core.is_compiled_with_cuda():
self.places.append(fluid.CUDAPlace(0))
def tearDown(self):
......@@ -247,7 +247,7 @@ class TestBatchNormChannelLast(unittest.TestCase):
class TestBatchNormUseGlobalStats(unittest.TestCase):
def setUp(self):
self.places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
if core.is_compiled_with_cuda():
self.places.append(fluid.CUDAPlace(0))
self.init_test()
......@@ -300,4 +300,6 @@ class TestBatchNormUseGlobalStatsCase3(TestBatchNormUseGlobalStats):
if __name__ == '__main__':
import paddle
paddle.enable_static()
unittest.main()
......@@ -22,6 +22,9 @@ class TestProcessGroup(TestMultipleGpus):
def test_process_group_nccl(self):
self.run_mnist_2gpu('process_group_nccl.py')
def test_process_group_gloo(self):
self.run_mnist_2gpu('process_group_gloo.py')
if __name__ == "__main__":
unittest.main()
......@@ -16,6 +16,7 @@ from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid.core as core
......@@ -1001,4 +1002,5 @@ create_test_cudnn_channel_last_fp16_class(
TestWithDilation_AsyPadding, grad_check=False)
if __name__ == '__main__':
paddle.enable_static()
unittest.main()
......@@ -771,13 +771,13 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase):
self.assertTrue(np.array_equal(egr_tensor.numpy(), ori_arr))
ori_place = egr_tensor.place
new_arr = np.random.rand(4, 4, 16, 32).astype('float32')
new_arr = np.random.rand(4, 16, 16, 32).astype('float32')
self.assertFalse(np.array_equal(egr_tensor.numpy(), new_arr))
egr_tensor._set_value(new_arr)
egr_tensor.set_value(new_arr)
self.assertEqual(egr_tensor.stop_gradient, True)
self.assertTrue(egr_tensor.place._equals(ori_place))
self.assertEqual(egr_tensor.shape, [4, 4, 16, 32])
self.assertEqual(egr_tensor.shape, [4, 16, 16, 32])
self.assertTrue(np.array_equal(egr_tensor.numpy(), new_arr))
......@@ -880,7 +880,7 @@ class EagerParamBaseUsageTestCase(unittest.TestCase):
new_weight = np.ones([1, 3]).astype('float32')
self.assertFalse(np.array_equal(linear.weight.numpy(), new_weight))
linear.weight._set_value(new_weight)
linear.weight.set_value(new_weight)
self.assertTrue(np.array_equal(linear.weight.numpy(), new_weight))
self.assertTrue(linear.weight.place._equals(ori_place))
......
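Note: these test edits track the rename of the private _set_value to the public set_value on eager tensors and parameters. A minimal dygraph usage sketch:

import numpy as np
import paddle

t = paddle.to_tensor(np.zeros([1, 3], dtype='float32'))
t.set_value(np.ones([1, 3], dtype='float32'))   # formerly t._set_value(...)
assert np.array_equal(t.numpy(), np.ones([1, 3], dtype='float32'))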
......@@ -231,4 +231,5 @@ class TestExpandV2API(unittest.TestCase):
if __name__ == "__main__":
paddle.enable_static()
unittest.main()
......@@ -23,6 +23,7 @@ import paddle.fluid as fluid
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid import compiler
import paddle.fluid.unique_name as unique_name
import paddle
class TestInplaceANBOpTraining(unittest.TestCase):
......@@ -138,14 +139,14 @@ class TestInplaceANBOpTraining(unittest.TestCase):
outs[0].name if not only_forward else None,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
bn_fetches = exe.run(program=comp_prog1,
bn_fetches = exe.run(program=main,
feed={'input': data},
fetch_list=fetch_name)
fetch_outs.append(bn_fetches)
fetch_names.append(fetch_name)
for bn_val, inplace_abn_val, name1, name2 in zip(*(fetch_outs +
fetch_names)):
for bn_val, inplace_abn_val, name1, name2 in zip(*(
fetch_outs + fetch_names)):
self.assertTrue(
np.allclose(
bn_val, inplace_abn_val, atol=1e-2),
......@@ -156,6 +157,7 @@ class TestInplaceANBOpTraining(unittest.TestCase):
def test_op(self):
use_cudas = [False, True] if core.is_compiled_with_cuda() else [False]
#use_cudas = [False]
for use_cuda in use_cudas:
place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
layouts = ["NCHW", "NHWC"]
......@@ -186,4 +188,5 @@ class TestInplaceANBOpTraining(unittest.TestCase):
if __name__ == '__main__':
paddle.enable_static()
unittest.main()
......@@ -21,6 +21,7 @@ import paddle.fluid as fluid
import paddle.fluid.layers as layers
import paddle.fluid.core as core
import gradient_checker
import paddle
from decorator_helper import prog_scope
......@@ -167,4 +168,5 @@ class TestBatchNormDoubleGradCheckCase6(TestBatchNormDoubleGradCheckCase5):
if __name__ == "__main__":
paddle.enable_static()
unittest.main()
......@@ -24,6 +24,7 @@ from simple_nets import init_data, simple_fc_net, fc_with_batchnorm
import seresnext_net
from test_parallel_executor_transformer import transformer, get_feed_data_reader, DeviceType
from fake_reader import fake_imdb_reader
import paddle
def lstm_net(use_feed):
......@@ -309,4 +310,5 @@ class TestProgramPruneBackward(unittest.TestCase):
if __name__ == '__main__':
paddle.enable_static()
unittest.main()
......@@ -507,4 +507,5 @@ class TestReshapeZeroTensor(unittest.TestCase):
if __name__ == "__main__":
paddle.enable_static()
unittest.main()
......@@ -533,10 +533,6 @@ class TestTensorRegisterHook(unittest.TestCase):
size=[self.batch_size, self.in_size]).astype('float32')
data_t = paddle.to_tensor(data)
if _in_eager_mode():
with self.assertRaises(TypeError):
out = jit_net(data_t)
else:
with self.assertRaises(AssertionError):
out = jit_net(data_t)
......
......@@ -1402,7 +1402,8 @@ def gather(x, index, axis=None, name=None):
return _C_ops.gather(x, index, None, "axis", axis, "overwrite", False)
check_variable_and_dtype(
x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'],
x, 'x',
['float16', 'float32', 'float64', 'int16', 'int32', 'int64', 'uint8'],
'gather')
check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather')
......
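Note: the functional change in this hunk is adding 'int16' to the dtypes accepted by the static-graph type check of gather. A small sketch exercising it; the variable names are illustrative and it assumes the int16 gather kernel is registered:

import paddle

paddle.enable_static()
x = paddle.static.data(name='x', shape=[4, 3], dtype='int16')
index = paddle.static.data(name='index', shape=[2], dtype='int32')
out = paddle.gather(x, index)   # passes check_variable_and_dtype with int16 after this change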
......@@ -43,7 +43,9 @@ class BaseAPI(object):
self.is_base_api = False
self.invoke = api_item_yaml['invoke']
else:
self.infer_meta = self.parse_infer_meta(api_item_yaml['infer_meta'])
if 'infer_meta' in api_item_yaml:
self.infer_meta = self.parse_infer_meta(api_item_yaml[
'infer_meta'])
self.kernel = self.parse_kernel(api_item_yaml['kernel'])
self.support_selected_rows_kernel = False if len(self.kernel[
'func']) == 1 else True
......@@ -182,9 +184,9 @@ class BaseAPI(object):
'Tensor': 'Tensor',
'Tensor[]': 'std::vector<Tensor>'
}
if re.search(r'\(\w*\)', output_item):
if re.search(r'\([a-zA-Z0-9_@]*\)', output_item):
result = re.search(
r"(?P<out_type>[a-zA-Z0-9_[\]]+)\s*\((?P<name>\w+)\)",
r"(?P<out_type>[a-zA-Z0-9_[\]]+)\s*\((?P<name>[a-zA-Z0-9_@]+)\)",
output_item)
out_type = result.group('out_type')
assert out_type in output_type_map, \
......@@ -499,11 +501,8 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self.
def get_kernel_args(self, code_indent):
input_trans_map = {
'const Tensor&': 'const phi::DenseTensor&',
'const Tensor &': 'const phi::DenseTensor&',
'const std::vector<Tensor>&':
'const std::vector<phi::DenseTensor>&',
'const std::vector<Tensor> &':
'const std::vector<phi::DenseTensor>&',
'const paddle::optional<Tensor>&':
'paddle::optional<const phi::DenseTensor&>',
'const paddle::optional<std::vector<Tensor>>&':
......@@ -592,7 +591,6 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self.
def get_selected_rows_kernel_args(self, code_indent):
input_trans_map = {
'const Tensor&': 'const phi::SelectedRows&',
'const Tensor &': 'const phi::SelectedRows&',
'const paddle::optional<Tensor>&':
'paddle::optional<const phi::SelectedRows&>'
}
......
......@@ -105,7 +105,7 @@ def source_include(header_file_path):
#include "paddle/phi/api/lib/api_custom_impl.h"
#include "paddle/phi/api/lib/api_registry.h"
#include "paddle/phi/api/lib/api_utils.h"
#include "paddle/phi/api/lib/api_gen_utils.h"
#include "paddle/phi/api/lib/data_transform.h"
#include "paddle/phi/api/lib/kernel_dispatch.h"
#include "paddle/phi/api/lib/utils/storage.h"
......
......@@ -56,8 +56,9 @@ class BackwardAPI(BaseAPI):
# check the attributes of backward
for attr in self.attrs['names']:
assert attr in fw_attrs['names'] and self.attrs['attr_info'][attr][0] == fw_attrs['attr_info'][attr][0], \
f"{self.api} : Attribute error: The attribute({attr}) of backward isn't consistent with forward api. \
assert (attr in fw_attrs['names'] and self.attrs['attr_info'][attr][0] == fw_attrs['attr_info'][attr][0]) or \
self.attrs['attr_info'][attr][1] is not None, \
f"{self.api} : Attribute error: The attribute({attr}) of backward isn't consistent with forward api or doesn't have default value. \
Please check the args of {self.api} in yaml."
# check the output of backward
......@@ -145,7 +146,7 @@ def source_include(header_file_path):
#include "paddle/phi/api/lib/api_custom_impl.h"
#include "paddle/phi/api/lib/api_registry.h"
#include "paddle/phi/api/lib/api_utils.h"
#include "paddle/phi/api/lib/api_gen_utils.h"
#include "paddle/phi/api/lib/data_transform.h"
#include "paddle/phi/api/lib/kernel_dispatch.h"
#include "paddle/phi/api/lib/utils/storage.h"
......
- sparse_api : conv3d
args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups)
output : Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor)
kernel :
func : sparse_conv3d
layout : x
- sparse_api : to_dense
args : (Tensor x, Backend backend)
output : Tensor(out@DenseTensor)
invoke : to_dense_impl(x, backend)
- sparse_api : to_sparse_coo
args : (Tensor x, Backend backend, int64_t sparse_dim)
output : Tensor(out@SparseCooTensor)
invoke : to_sparse_coo_impl(x, backend, sparse_dim)
- sparse_api : to_sparse_csr
args : (Tensor x, Backend backend)
output : Tensor(out@SparseCsrTensor)
invoke : to_sparse_csr_impl(x, backend)
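Note: each entry in sparse_api.yaml either points at a kernel (conv3d) or forwards to a hand-written implementation via invoke (the to_* conversions). A minimal sketch of loading the file the way the generator below does; the path is the generator's argparse default:

import yaml

with open('python/paddle/utils/code_gen/sparse_api.yaml') as f:
    apis = yaml.load(f, Loader=yaml.FullLoader)

conv3d = next(a for a in apis if a['sparse_api'] == 'conv3d')
print(conv3d['output'])           # Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor)
print(conv3d['kernel']['func'])   # sparse_conv3d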
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import yaml
import argparse
import re
from api_base import BaseAPI
class SparseAPI(BaseAPI):
def __init__(self, api_item_yaml):
super(SparseAPI, self).__init__(api_item_yaml)
def get_api_name(self, api_item_yaml):
return api_item_yaml['sparse_api']
def get_api_func_name(self):
return self.api
def get_return_type(self, out_type_list):
return out_type_list[0] if len(
out_type_list) == 1 else "std::tuple<" + ",".join(
out_type_list) + ">"
def gene_api_declaration(self):
return f"""
// {", ".join(self.outputs['names'])}
PADDLE_API {self.outputs['return_type']} {self.get_api_func_name()}({self.args_str['args_declare']});
"""
def get_kernel_tensor_out_type(self, output_name):
sparse_type = 'TensorType::DENSE_TENSOR'
if output_name.endswith('@SparseCooTensor'):
sparse_type = 'TensorType::SPARSE_COO'
elif output_name.endswith('@SparseCsrTensor'):
sparse_type = 'TensorType::SPARSE_CSR'
return sparse_type
def gene_output(self,
output_type_list,
set_out_func,
code_indent,
inplace_flag=False):
kernel_output = ""
output_names = []
output_create = ""
if len(output_type_list) == 1:
kernel_output = 'kernel_out'
output_names.append('kernel_out')
inplace_assign = " = " + self.inplace_map[self.outputs['names'][
0]] if inplace_flag and self.inplace_map is not None and self.outputs[
'names'][0] in self.inplace_map else ""
output_create = f"""
{self.outputs['return_type']} out{inplace_assign};
auto* kernel_out = {set_out_func}(&out, {self.get_kernel_tensor_out_type(self.outputs['names'][0])});"""
elif len(output_type_list) > 1:
output_create = f"""
{self.outputs['return_type']} out;"""
for i in range(len(output_type_list)):
kernel_output = kernel_output + f'kernel_out_{i}, '
output_names.append(f'kernel_out_{i}')
if inplace_flag and self.inplace_map is not None and self.outputs[
'names'][i] in self.inplace_map:
output_create = output_create + f"""
std::get<{i}>(out) = {self.inplace_map[self.outputs['names'][i]]};"""
output_create = output_create + f"""
auto* kernel_out_{i} = {set_out_func}(&std::get<{i}>(out), {self.get_kernel_tensor_out_type(self.outputs['names'][i])});"""
kernel_output = kernel_output[:-2]
else:
raise ValueError(
"{} : Output error: the output should not be empty.".format(
self.api))
return kernel_output, output_names, output_create
def gen_sparse_kernel_context(self, kernel_output_names):
input_trans_map = {
'const Tensor&': 'const phi::TenseBase&',
'const std::vector<Tensor>&': 'const std::vector<phi::TenseBase>&',
'const paddle::optional<Tensor>&':
'paddle::optional<const phi::TenseBase&>'
}
out_trans_map = {
'Tensor': 'phi::TenseBase*',
'std::vector<Tensor>': 'std::vector<phi::TenseBase*>'
}
input_names = self.inputs['names']
input_infos = self.inputs['input_info']
attr_names = self.attrs['names']
kernel_param = self.kernel['param']
if kernel_param is None:
kernel_param = input_names + attr_names
kernel_context_code = ""
for param in kernel_param:
if param in input_names:
if param in self.optional_vars:
raise ValueError(
f"{self.api} : Unsupport optional input({param}) for sparse api."
)
else:
kernel_context_code = kernel_context_code + f"""
kernel_context.EmplaceBackInput({param}.impl().get());"""
continue
if param in attr_names:
# set attr for kernel_context
if 'ScalarArray' in self.attrs['attr_info'][param][0]:
param = 'phi::ScalarArray(' + param + ')'
elif 'Scalar' in self.attrs['attr_info'][param][0]:
param = 'phi::Scalar(' + param + ')'
elif isinstance(param, bool):
param = str(param).lower()
else:
param + str(param) + ", "
kernel_context_code = kernel_context_code + f"""
kernel_context.EmplaceBackAttr({param});"""
for out_name in kernel_output_names:
kernel_context_code = kernel_context_code + f"""
kernel_context.EmplaceBackOutput({out_name});"""
return kernel_context_code
def gen_sparse_kernel_code(self, inplace_flag=False):
_, kernel_output_names, output_create = self.gene_output(
self.outputs['types'], 'SetSparseKernelOutput', '', inplace_flag)
kernel_context_code = self.gen_sparse_kernel_context(
kernel_output_names)
return f"""
auto phi_kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
"{self.kernel['func'][0]}", {{kernel_backend, kernel_layout, kernel_data_type}});
VLOG(6) << "{self.api} api sparse kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
VLOG(6) << "{self.api} api sparse kernel: " << phi_kernel;
auto* dev_ctx = GetDeviceContextByBackend(kernel_backend);
auto kernel_context = phi::KernelContext(dev_ctx);
{output_create}
{kernel_context_code}
phi_kernel(&kernel_context);
return out;"""
def gene_base_api_code(self, inplace_flag=False):
api_func_name = self.get_api_func_name()
return f"""
PADDLE_API {self.outputs['return_type']} {api_func_name}({self.args_str["args_define"]}) {{
{self.gene_kernel_select()}
{self.gen_sparse_kernel_code(inplace_flag)}
}}
"""
def header_include():
return """
#include <tuple>
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/common/scalar_array.h"
#include "paddle/utils/optional.h"
"""
def source_include(header_file_path):
return f"""
#include "{header_file_path}"
#include <memory>
#include "glog/logging.h"
#include "paddle/phi/api/lib/api_registry.h"
#include "paddle/phi/api/lib/api_gen_utils.h"
#include "paddle/phi/api/lib/data_transform.h"
#include "paddle/phi/api/lib/kernel_dispatch.h"
#include "paddle/phi/api/lib/sparse_api_custom_impl.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/declarations.h"
"""
def api_register():
return """
PD_REGISTER_API(Test);
"""
def api_namespace():
return ("""
namespace paddle {
namespace experimental {
namespace sparse {
""", """
} // namespace sparse
} // namespace experimental
} // namespace paddle
""")
def generate_api(api_yaml_path, header_file_path, source_file_path):
with open(api_yaml_path, 'r') as f:
apis = yaml.load(f, Loader=yaml.FullLoader)
header_file = open(header_file_path, 'w')
source_file = open(source_file_path, 'w')
namespace = api_namespace()
header_file.write("#pragma once\n")
header_file.write(header_include())
header_file.write(namespace[0])
include_header_file = "paddle/phi/api/include/sparse_api.h"
source_file.write(source_include(include_header_file))
source_file.write(namespace[0])
for api in apis:
sparse_api = SparseAPI(api)
header_file.write(sparse_api.gene_api_declaration())
source_file.write(sparse_api.gene_api_code())
header_file.write(namespace[1])
source_file.write(namespace[1])
source_file.write(api_register())
header_file.close()
source_file.close()
def main():
parser = argparse.ArgumentParser(
description='Generate PaddlePaddle C++ Sparse API files')
parser.add_argument(
'--api_yaml_path',
help='path to sparse api yaml file',
default='python/paddle/utils/code_gen/sparse_api.yaml')
parser.add_argument(
'--api_header_path',
help='output of generated api header code file',
default='paddle/phi/api/include/sparse_api.h')
parser.add_argument(
'--api_source_path',
help='output of generated api source code file',
default='paddle/phi/api/lib/sparse_api.cc')
options = parser.parse_args()
api_yaml_path = options.api_yaml_path
header_file_path = options.api_header_path
source_file_path = options.api_source_path
generate_api(api_yaml_path, header_file_path, source_file_path)
if __name__ == '__main__':
main()
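Note: a minimal programmatic invocation of the generator above, reusing the argparse defaults (paths relative to the repository root); equivalent to running the script with no command-line arguments:

generate_api('python/paddle/utils/code_gen/sparse_api.yaml',
             'paddle/phi/api/include/sparse_api.h',
             'paddle/phi/api/lib/sparse_api.cc')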