Commit 9e00395a authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into move_yolo_box_to_phi

@@ -7,9 +7,11 @@ paddle/fluid/op_use_default_grad_maker_DEV.spec
paddle/fluid/op_use_default_grad_maker_PR.spec
paddle/phi/api/backward/backward_api.h
paddle/phi/api/include/api.h
paddle/phi/api/include/sparse_api.h
paddle/phi/api/lib/api.cc
paddle/phi/api/lib/dygraph_api.*
paddle/phi/api/lib/backward_api.cc
paddle/phi/api/lib/sparse_api.cc
paddle/phi/extension.h
paddle/phi/include/*
paddle/phi/infermeta/generated.*
......
cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api)
if (WITH_DISTRIBUTE)
cc_library(processgroup_gloo SRCS ProcessGroupGloo.cc DEPS phi phi_api eager_api gloo_wrapper)
endif()
cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup)
if(WITH_NCCL)
......
@@ -117,6 +117,35 @@ class ProcessGroup {
        "ProcessGroup%s does not support receive", GetBackendName()));
  }
virtual std::shared_ptr<ProcessGroup::Task> AllGather(
std::vector<Tensor>& in_tensors /* tensors */, // NOLINT
std::vector<Tensor>& out_tensors /* tensors */) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support AllGather", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> AllToAll(
std::vector<Tensor>& in /* tensors */, // NOLINT
std::vector<Tensor>& out /* tensors */) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support AllToAll", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Reduce(
std::vector<Tensor>& tensors /* tensors */, // NOLINT
const ReduceOptions& opts) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support Reduce", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Scatter(
std::vector<Tensor>& in_tensors /* tensors */, // NOLINT
std::vector<Tensor>& out_tensors /* tensors */, // NOLINT
const ScatterOptions&) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support Scatter", GetBackendName()));
}
 protected:
  const int rank_;
  const int size_;
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#ifdef _WIN32
#include <gloo/common/win.h>
#include <winsock2.h>
#include <ws2tcpip.h>
#else
#include <netdb.h>
#include <sys/socket.h>
#include <unistd.h>
#endif
#include <gloo/broadcast.h>
#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace distributed {
#ifdef _WIN32
#define GENERATE_FUNC(type, func, ...) \
switch (type) { \
case experimental::DataType::FLOAT32: \
func<float>(__VA_ARGS__); \
break; \
case experimental::DataType::FLOAT64: \
func<double>(__VA_ARGS__); \
break; \
case experimental::DataType::FLOAT16: \
func<gloo::float16>(__VA_ARGS__); \
break; \
case experimental::DataType::INT32: \
func<int32_t>(__VA_ARGS__); \
break; \
case experimental::DataType::INT64: \
func<int64_t>(__VA_ARGS__); \
break; \
default: \
VLOG(0) << "Error: Unknown DataType."; \
exit(-1); \
}
#define HOST_NAME_MAX 256
#else
#define GENERATE_FUNC(type, func, args...) \
switch (type) { \
case experimental::DataType::FLOAT32: \
func<float>(args); \
break; \
case experimental::DataType::FLOAT64: \
func<double>(args); \
break; \
case experimental::DataType::FLOAT16: \
func<gloo::float16>(args); \
break; \
case experimental::DataType::INT32: \
func<int32_t>(args); \
break; \
case experimental::DataType::INT64: \
func<int64_t>(args); \
break; \
default: \
VLOG(0) << "Error: Unknown DataType."; \
exit(-1); \
}
#endif
typedef void (*reduce_func)(void*, const void*, const void*, size_t);
template <typename T>
reduce_func get_function(const ReduceOp& r) {
switch (r) {
case ReduceOp::SUM:
return reduce_func(&::gloo::sum<T>);
case ReduceOp::PRODUCT:
return reduce_func(&::gloo::product<T>);
case ReduceOp::MIN:
return reduce_func(&::gloo::min<T>);
case ReduceOp::MAX:
return reduce_func(&::gloo::max<T>);
case ReduceOp::AVG:
VLOG(0) << "Error: Unsupported ReduceOp::AVG.";
exit(-1);
}
VLOG(0) << "Error: Unknown ReduceOp.";
exit(-1);
}
bool CheckTensorsInCPUPlace(const std::vector<Tensor>& tensors) {
return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) {
return t.place() == PlaceType::kCPU;
});
}
template <typename T>
T* get_data(const Tensor& tensor) {
auto raw_tensor = std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
return static_cast<T*>(raw_tensor->data());
}
template <typename T>
std::vector<T*> get_multi_data(const std::vector<Tensor>& tensors) {
std::vector<T*> ret(tensors.size());
for (size_t i = 0; i < tensors.size(); i++) {
ret[i] = get_data<T>(tensors[i]);
}
return ret;
}
template <typename T, typename P>
void set_output(P& opts, const Tensor& tensor) { // NOLINT
opts.setOutput(get_data<T>(tensor), tensor.numel());
}
template <typename T, typename P>
void set_input(P& opts, const Tensor& tensor) { // NOLINT
opts.setInput(get_data<T>(tensor), tensor.numel());
}
template <typename T, typename P>
void set_outputs(P& opts, const std::vector<Tensor>& tensors) { // NOLINT
opts.setOutputs(get_multi_data<T>(tensors), tensors[0].numel());
}
template <typename T, typename P>
void set_inputs(P& opts, const std::vector<Tensor>& tensors) { // NOLINT
opts.setInputs(get_multi_data<T>(tensors), tensors[0].numel());
}
ProcessGroupGloo::GlooTask::GlooTask(int rank,
const std::vector<Tensor>& inputs,
CommType comm_type)
: ProcessGroup::Task(rank, inputs, comm_type) {
PADDLE_ENFORCE_EQ(CheckTensorsInCPUPlace(inputs), true,
platform::errors::Fatal(
"Only CPU place is supported for ProcessGroupGloo."));
}
ProcessGroupGloo::ProcessGroupGloo(const std::shared_ptr<GlooStore>& store,
int rank, int world_size,
const std::shared_ptr<GlooOptions> options)
: ProcessGroup(rank, world_size), _tag(0), _store(store) {
_context = std::make_shared<gloo::rendezvous::Context>(rank, world_size);
auto prefix_store =
::gloo::rendezvous::PrefixStore(std::to_string(0), *_store);
_context->connectFullMesh(prefix_store, options->device);
}
class BroadcastGlooTask : public ProcessGroupGloo::GlooTask {
public:
BroadcastGlooTask(const std::shared_ptr<gloo::Context>& context,
const std::vector<Tensor>& inputs, int rank, int root,
uint32_t tag)
: ProcessGroupGloo::GlooTask(rank, inputs, CommType::BROADCAST),
_context(context),
_root(root),
_inputs(inputs),
_tag(tag) {}
void Run() override { _do_broadcast(_inputs[0]); }
private:
std::shared_ptr<gloo::Context> _context;
const int _root;
std::vector<Tensor> _inputs{};
const uint32_t _tag;
void _do_broadcast(const Tensor& tensor) {
gloo::BroadcastOptions opts(_context);
const auto& dtype = tensor.type();
GENERATE_FUNC(dtype, set_output, opts, tensor);
opts.setRoot(_root);
opts.setTag(_tag);
gloo::broadcast(opts);
}
};
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Broadcast(
std::vector<Tensor>& inputs, const BroadcastOptions& opts) {
auto root = opts.source_rank;
std::unique_ptr<BroadcastGlooTask> task;
auto tag = next_tag();
auto context = get_context();
task = std::make_unique<BroadcastGlooTask>(context, inputs, rank_, root, tag);
task->Run();
return task;
}
class AllreduceGlooTask : public ProcessGroupGloo::GlooTask {
public:
AllreduceGlooTask(int rank, const std::shared_ptr<gloo::Context>& context,
std::vector<Tensor>& inputs, ReduceOp reduce_op, // NOLINT
uint32_t tag)
: ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLREDUCE),
_context(context),
_inputs(inputs),
_reduce_op(reduce_op),
_tag(tag) {}
void Run() override { _do_allreduce(_inputs); }
private:
std::shared_ptr<gloo::Context> _context;
std::vector<Tensor> _inputs;
const ReduceOp _reduce_op;
uint32_t _tag;
gloo::AllreduceOptions::Func _get_function(const experimental::DataType type,
const ReduceOp op) {
gloo::AllreduceOptions::Func fn;
GENERATE_FUNC(type, _get_function_impl, fn, op);
return fn;
}
template <typename T>
void _get_function_impl(gloo::AllreduceOptions::Func& fn, // NOLINT
const ReduceOp op) {
fn = get_function<T>(op);
}
void _do_allreduce(std::vector<Tensor>& tensors) { // NOLINT
const auto& dtype = tensors[0].type();
gloo::AllreduceOptions opts(_context);
GENERATE_FUNC(dtype, set_inputs, opts, tensors);
GENERATE_FUNC(dtype, set_outputs, opts, tensors);
opts.setReduceFunction(_get_function(dtype, _reduce_op));
opts.setTag(_tag);
gloo::allreduce(opts);
}
};
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::AllReduce(
std::vector<Tensor>& inputs, const AllreduceOptions& opts) {
auto tag = next_tag();
std::shared_ptr<GlooTask> task;
auto context = get_context();
task = std::make_shared<AllreduceGlooTask>(rank_, context, inputs,
opts.reduce_op, tag);
task->Run();
return task;
}
std::shared_ptr<::gloo::transport::Device>
ProcessGroupGloo::createDeviceForInterface(const std::string& ifname) {
::gloo::transport::tcp::attr attr;
attr.iface = ifname;
return ::gloo::transport::tcp::CreateDevice(attr);
}
std::shared_ptr<::gloo::transport::Device>
ProcessGroupGloo::createDeviceForHostname(const std::string& hostname) {
::gloo::transport::tcp::attr attr;
attr.hostname = hostname;
return ::gloo::transport::tcp::CreateDevice(attr);
}
std::shared_ptr<::gloo::transport::Device>
ProcessGroupGloo::createDefaultDevice() {
std::array<char, HOST_NAME_MAX> hostname{};
auto ret = ::gethostname(hostname.data(), HOST_NAME_MAX);
PADDLE_ENFORCE_EQ(ret, 0, platform::errors::Fatal(
"Get hostname error for createDefaultDevice."));
::addrinfo* result;
result = tcputils::get_addr_info(hostname.data(), "", 0, AF_UNSPEC);
::addrinfo* cur;
for (cur = result; cur != nullptr; cur = cur->ai_next) {
SocketType socket =
::socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol);
if (socket == -1) {
continue;
}
ret = ::bind(socket, cur->ai_addr, cur->ai_addrlen);
#ifdef _WIN32
closesocket(socket);
#else
close(socket);
#endif
if (ret == -1) {
continue;
}
break;
}
freeaddrinfo(result);
if (cur != nullptr) {
return createDeviceForHostname(hostname.data());
}
return createDeviceForHostname("127.0.0.1");
}
} // namespace distributed
} // namespace paddle
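The following is a minimal usage sketch, not part of this commit, of how the Gloo backend above can be driven end to end: a TCPStore rendezvous wrapped in a GlooStore, a default TCP device, and the Broadcast/AllReduce entry points. The helper name RunGlooCollectives, the port, and the way the CPU tensors are produced are illustrative assumptions only.

// Hypothetical usage sketch (not part of the commit). Assumes a Paddle build
// with PADDLE_WITH_GLOO and that `tensors` already holds CPU
// paddle::experimental::Tensor objects of identical shape on every rank.
#include <chrono>
#include <memory>
#include <vector>

#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"
#include "paddle/fluid/distributed/store/tcp_store.h"

void RunGlooCollectives(int rank, int world_size,
                        std::vector<paddle::experimental::Tensor>& tensors) {
  using namespace paddle::distributed;

  // Rank 0 hosts the TCP rendezvous; every rank connects as a client.
  auto tcp_store = std::make_shared<TCPStore>(
      "127.0.0.1", /*port=*/6170, /*is_master=*/rank == 0,
      /*num_workers=*/static_cast<size_t>(world_size),
      std::chrono::seconds(300));

  auto store = std::make_shared<ProcessGroupGloo::GlooStore>(tcp_store);
  auto options = ProcessGroupGloo::GlooOptions::create();
  options->device = ProcessGroupGloo::createDefaultDevice();
  auto pg =
      std::make_shared<ProcessGroupGloo>(store, rank, world_size, options);

  // Broadcast from rank 0, then sum-allreduce in place. Both tasks run
  // synchronously in this backend, so GlooTask::Wait() simply returns true.
  BroadcastOptions bcast_opts;
  bcast_opts.source_rank = 0;
  pg->Broadcast(tensors, bcast_opts)->Wait(std::chrono::seconds(60));

  AllreduceOptions ar_opts;
  ar_opts.reduce_op = ReduceOp::SUM;
  pg->AllReduce(tensors, ar_opts)->Wait(std::chrono::seconds(60));
}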
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <future>
#include <mutex>
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#ifdef PADDLE_WITH_GLOO
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
#include "paddle/fluid/distributed/store/store.h"
#include "paddle/fluid/distributed/store/tcp_store.h"
constexpr const char* GLOO_BACKEND_NAME = "GLOO";
namespace paddle {
namespace distributed {
class ProcessGroupGloo : public ProcessGroup {
public:
class GlooTask : public ProcessGroup::Task,
public std::enable_shared_from_this<GlooTask> {
public:
explicit GlooTask(int rank, const std::vector<Tensor>& input_tensors,
CommType comm_type);
~GlooTask() = default;
virtual void Run() = 0;
bool Wait(std::chrono::milliseconds timeout) override { return true; }
bool IsCompleted() override { return true; }
void Synchronize() override {}
protected:
friend class ProcessGroupGloo;
};
class GlooStore : public ::gloo::rendezvous::Store {
public:
explicit GlooStore(
const std::shared_ptr<paddle::distributed::TCPStore>& store)
: _store(store) {}
~GlooStore() = default;
std::vector<char> get(const std::string& key) override {
VLOG(3) << "GlooStore::get";
auto value = _store->get(key);
return std::vector<char>(value.begin(), value.end());
}
void wait(const std::vector<std::string>& keys) override {
VLOG(3) << "GlooStore::wait";
for (auto& key : keys) {
_store->wait(key);
}
}
void set(const std::string& key, const std::vector<char>& value) override {
VLOG(3) << "GlooStore::set";
std::vector<uint8_t> tmp(value.begin(), value.end());
_store->set(key, tmp);
}
void wait(const std::vector<std::string>& keys,
const std::chrono::milliseconds& timeout) override {
VLOG(3) << "GlooStore::wait";
for (auto& key : keys) {
_store->wait(key);
}
// wait(keys);
}
protected:
std::shared_ptr<paddle::distributed::TCPStore> _store;
};
class GlooOptions {
public:
GlooOptions() = default;
~GlooOptions() = default;
static std::shared_ptr<GlooOptions> create() {
return std::make_shared<GlooOptions>();
}
std::shared_ptr<::gloo::transport::Device> device;
};
explicit ProcessGroupGloo(const std::shared_ptr<GlooStore>& store, int rank,
int world_size,
std::shared_ptr<GlooOptions> options);
~ProcessGroupGloo() = default;
std::shared_ptr<ProcessGroup::Task> Broadcast(
std::vector<Tensor>& inputs,
const BroadcastOptions& = BroadcastOptions()) override;
std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<Tensor>& inputs,
const AllreduceOptions& opts = AllreduceOptions()) override;
std::shared_ptr<::gloo::Context> get_context() { return _context; }
uint64_t next_tag() { return _tag++; }
const std::string GetBackendName() const override {
return GLOO_BACKEND_NAME;
}
// Helper functions for Gloo.
static std::shared_ptr<::gloo::transport::Device> createDeviceForHostname(
const std::string& hostname);
static std::shared_ptr<::gloo::transport::Device> createDeviceForInterface(
const std::string& ifname);
static std::shared_ptr<::gloo::transport::Device> createDefaultDevice();
protected:
uint32_t _tag;
std::shared_ptr<gloo::rendezvous::Context> _context;
std::shared_ptr<GlooStore> _store;
};
} // namespace distributed
} // namespace paddle
@@ -473,5 +473,148 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv(
  return task;
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllGather(
std::vector<Tensor>& in_tensors, std::vector<Tensor>& out_tensors) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors), true,
platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
return Collective(
in_tensors, out_tensors,
[&](const Tensor& input, Tensor& output, ncclComm_t comm,
const gpuStream_t& stream) {
auto input_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
auto output_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
return platform::dynload::ncclAllGather(
input_tensor->data(), output_tensor->data(), input_tensor->numel(),
platform::ToNCCLDataType(input.type()), comm, stream);
},
CommType::ALLGATHER);
}
void* GetPointerByOffset(void* raw_pointer, size_t offset,
experimental::DataType type) {
if (type == experimental::DataType::FLOAT32) {
return reinterpret_cast<void*>(reinterpret_cast<float*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::FLOAT64) {
return reinterpret_cast<void*>(reinterpret_cast<double*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::INT32) {
return reinterpret_cast<void*>(reinterpret_cast<int32_t*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::INT64) {
return reinterpret_cast<void*>(reinterpret_cast<int64_t*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::FLOAT16) {
return reinterpret_cast<void*>(reinterpret_cast<int16_t*>(raw_pointer) +
offset);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"This datatype in nccl is not supported."));
}
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllToAll(
std::vector<Tensor>& in_tensors, std::vector<Tensor>& out_tensors) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
return Collective(
in_tensors, out_tensors,
[&](const Tensor& input, Tensor& output, ncclComm_t comm,
const gpuStream_t& stream) {
auto input_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
auto output_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
size_t offset = 0;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
for (auto i = 0; i < size_; i++) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
GetPointerByOffset(input_tensor->data(), offset, input.type()),
input_tensor->numel() / size_,
platform::ToNCCLDataType(input.type()), i, comm, stream));
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
GetPointerByOffset(output_tensor->data(), offset, input.type()),
input_tensor->numel() / size_,
platform::ToNCCLDataType(input.type()), i, comm, stream));
offset += input_tensor->numel() / size_;
}
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
},
CommType::ALLREDUCE);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Reduce(
std::vector<Tensor>& tensors, const ReduceOptions& opts) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
return Collective(
tensors, tensors,
[&](const Tensor& input, Tensor& output, ncclComm_t comm,
const gpuStream_t& stream) {
auto input_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
auto output_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce(
input_tensor->data(), output_tensor->data(), input.numel(),
platform::ToNCCLDataType(input.type()),
ToNCCLRedType(opts.reduce_op), opts.root_rank, comm, stream));
},
CommType::REDUCE);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Scatter(
std::vector<Tensor>& in_tensors, std::vector<Tensor>& out_tensors,
const ScatterOptions& opts) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
return Collective(
in_tensors, out_tensors,
[&](const Tensor& input, Tensor& output, ncclComm_t comm,
const gpuStream_t& stream) {
auto input_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
auto output_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
size_t offset = 0;
if (rank_ == opts.root_rank) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
for (auto i = 0; i < size_; i++) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
GetPointerByOffset(input_tensor->data(), offset, input.type()),
input_tensor->numel() / size_,
platform::ToNCCLDataType(input.type()), i, comm, stream));
offset += input_tensor->numel() / size_;
}
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
output_tensor->data(), input_tensor->numel() / size_,
platform::ToNCCLDataType(input.type()), opts.root_rank, comm,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
} else {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
output_tensor->data(), input_tensor->numel() / size_,
platform::ToNCCLDataType(input.type()), opts.root_rank, comm,
stream));
}
},
CommType::SCATTER);
}
}  // namespace distributed
}  // namespace paddle
@@ -98,6 +98,20 @@ class ProcessGroupNCCL : public ProcessGroup {
  std::shared_ptr<ProcessGroup::Task> Recv(std::vector<Tensor>& tensors,
                                           int src_rank) override;
std::shared_ptr<ProcessGroup::Task> AllGather(
std::vector<Tensor>& in_tensors,
std::vector<Tensor>& out_tensors) override;
std::shared_ptr<ProcessGroup::Task> AllToAll(
std::vector<Tensor>& in, std::vector<Tensor>& out) override;
std::shared_ptr<ProcessGroup::Task> Reduce(
std::vector<Tensor>& tensors, const ReduceOptions& opts) override;
std::shared_ptr<ProcessGroup::Task> Scatter(std::vector<Tensor>& in_tensors,
std::vector<Tensor>& out_tensors,
const ScatterOptions&) override;
 protected:
  virtual std::shared_ptr<ProcessGroupNCCL::NCCLTask> CreateTask(
      std::vector<Place> places, int rank, CommType opType,
......
@@ -36,5 +36,14 @@ struct BarrierOptions {
  std::vector<int> place_ids;
};
struct ReduceOptions {
ReduceOp reduce_op = ReduceOp::SUM;
int root_rank = 0;
};
struct ScatterOptions {
int root_rank = 0;
};
}  // namespace distributed
}  // namespace paddle
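As a hedged illustration (not from the commit), the ReduceOptions and ScatterOptions structs above are consumed by the matching ProcessGroup overrides, for example the NCCL ones added in this change. The helper name and the surrounding setup of the process group and GPU tensors are assumptions.

// Hypothetical call sites for the new Reduce/Scatter options; `pg`, the
// tensor vectors and their GPU placement are assumed to be prepared elsewhere.
#include <vector>

#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"

void ReduceThenScatter(paddle::distributed::ProcessGroupNCCL& pg,
                       std::vector<paddle::experimental::Tensor>& tensors,
                       std::vector<paddle::experimental::Tensor>& in_tensors,
                       std::vector<paddle::experimental::Tensor>& out_tensors) {
  paddle::distributed::ReduceOptions reduce_opts;
  reduce_opts.reduce_op = paddle::distributed::ReduceOp::MAX;  // default is SUM
  reduce_opts.root_rank = 0;
  pg.Reduce(tensors, reduce_opts);  // reduced result lands on root_rank only

  paddle::distributed::ScatterOptions scatter_opts;
  scatter_opts.root_rank = 0;
  // The root splits each input tensor into world-size equal chunks, one per rank.
  pg.Scatter(in_tensors, out_tensors, scatter_opts);
}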
@@ -32,6 +32,8 @@ class Store {
  virtual int64_t add(const std::string& key, int64_t value) = 0;
  virtual std::vector<uint8_t> get(const std::string& key) = 0;
  virtual void wait(const std::string& key) = 0;
virtual void set(const std::string& key,
const std::vector<uint8_t>& value) = 0;
  virtual const std::chrono::seconds& timeout() const { return _timeout; }
......
@@ -27,11 +27,13 @@ namespace detail {
constexpr int INFTIME = -1;
std::unique_ptr<MasterDaemon> MasterDaemon::start(SocketType socket,
                                                  int nranks) {
  return std::make_unique<MasterDaemon>(socket, nranks);
}
MasterDaemon::MasterDaemon(SocketType socket, int nranks)
    : _listen_socket(socket), _nranks(nranks) {
  _background_thread = std::thread{&MasterDaemon::run, this};
}
@@ -64,6 +66,13 @@ void MasterDaemon::_do_add(SocketType socket) {
  tcputils::send_value<int64_t>(socket, new_value);
}
void MasterDaemon::_do_set(SocketType socket) {
VLOG(3) << "MasterDaemon::_do_set";
std::string key = tcputils::receive_string(socket);
auto value = tcputils::receive_vector<uint8_t>(socket);
_store[key] = value;
}
void MasterDaemon::_do_get(SocketType socket) {
  std::string key = tcputils::receive_string(socket);
  auto iter = _store.find(key);
@@ -71,16 +80,15 @@ void MasterDaemon::_do_get(SocketType socket) {
      iter, _store.end(),
      platform::errors::InvalidArgument("Key %s not found in TCPStore.", key));
  std::vector<uint8_t> value = iter->second;
VLOG(3) << "TCPStore: value ("
<< std::stoll(std::string(reinterpret_cast<char*>(value.data()),
value.size()))
<< ") for key (" << key << ").";
  tcputils::send_vector<uint8_t>(socket, value);
}
void MasterDaemon::_do_stop(SocketType socket) {
  VLOG(3) << "MasterDaemon::_do_stop";
  ReplyType value = ReplyType::STOP_WAIT;
  if (--_nranks == 0) {
    _stop = true;
  }
  tcputils::send_value<ReplyType>(socket, value);
}
@@ -140,21 +148,27 @@ void MasterDaemon::run() {
      case Command::GET:
        _do_get(fds[i].fd);
        break;
case Command::SET:
_do_set(fds[i].fd);
break;
      case Command::WAIT:
        _do_wait(fds[i].fd);
        break;
      case Command::STOP:
        _do_stop(fds[i].fd);
        break;
      default:
        VLOG(0) << "Unknow command: " << static_cast<int>(command);
        exit(-1);
      }
    }
  }
}
std::unique_ptr<TCPServer> TCPServer::create(uint16_t port, int nranks) {
  int socket = tcputils::tcp_listen("", std::to_string(port), AF_INET);
  auto server = std::make_unique<TCPServer>();
  server->_master_daemon = MasterDaemon::start(socket, nranks);
  return server;
}
@@ -200,7 +214,7 @@ TCPStore::TCPStore(std::string host, uint16_t port, bool is_master,
                   size_t num_workers, std::chrono::seconds timeout)
    : Store(timeout), _is_master(is_master), _num_workers(num_workers) {
  if (_is_master) {
    _server = detail::TCPServer::create(port, num_workers);
  }
  _client = detail::TCPClient::connect(host, port);
@@ -213,7 +227,6 @@ void TCPStore::waitWorkers() {
  }
  add(_init_key, 1);
  if (_server) {
  auto begin = std::chrono::steady_clock::now();
  do {
    auto value = get(_init_key);
@@ -233,16 +246,22 @@ void TCPStore::waitWorkers() {
          "TCPStore timeouted and not all workers got ready."));
    }
  } while (true);
  }
  VLOG(3) << "TCPStore initialized.";
}
int64_t TCPStore::add(const std::string& key, int64_t value) {
  VLOG(3) << "TCPStore add.";
  _client->send_command_for_key(Command::ADD, _key_prefix + key);
  _client->send_value<std::int64_t>(value);
  return _client->receive_value<std::int64_t>();
}
void TCPStore::set(const std::string& key, const std::vector<uint8_t>& value) {
VLOG(3) << "TCPStore set.";
_client->send_command_for_key(Command::SET, _key_prefix + key);
_client->send_vector<std::uint8_t>(value);
}
std::vector<uint8_t> TCPStore::get(const std::string& key) {
  wait(key);
  _client->send_command_for_key(Command::GET, _key_prefix + key);
@@ -252,6 +271,7 @@ std::vector<uint8_t> TCPStore::get(const std::string& key) {
void TCPStore::wait(const std::string& key) {
  ReplyType reply;
  VLOG(3) << "TCPStore wait.";
  do {
    _client->send_command_for_key(Command::WAIT, _key_prefix + key);
@@ -262,6 +282,7 @@ void TCPStore::wait(const std::string& key) {
TCPStore::~TCPStore() {
  _client->send_command_for_key(Command::STOP, "");
  VLOG(3) << "~TCPStore";
  ReplyType ret = _client->receive_value<ReplyType>();
  PADDLE_ENFORCE_EQ(ret, ReplyType::STOP_WAIT,
                    platform::errors::InvalidArgument(
......
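A brief sketch, not part of the commit, of how the extended TCPStore behaves: rank 0 publishes a value with the new set(), every rank blocks in get() (which calls wait() internally) until the key exists, and the destructor's STOP handshake now only shuts the daemon down once all num_workers ranks have checked out. The port, key, and helper name are placeholders.

// Hypothetical TCPStore usage; one process per rank is assumed.
#include <chrono>
#include <cstdint>
#include <string>
#include <vector>

#include "paddle/fluid/distributed/store/tcp_store.h"

void PublishAndFetch(int rank, size_t num_workers) {
  paddle::distributed::TCPStore store("127.0.0.1", /*port=*/6170,
                                      /*is_master=*/rank == 0, num_workers,
                                      std::chrono::seconds(300));
  if (rank == 0) {
    std::string payload = "42";
    store.set("my_key", std::vector<uint8_t>(payload.begin(), payload.end()));
  }
  // Blocks until "my_key" has been set on the master daemon.
  std::vector<uint8_t> value = store.get("my_key");
  (void)value;  // consume the bytes as needed
}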
@@ -27,15 +27,16 @@ namespace paddle {
namespace distributed {
enum class ReplyType { WAITING, STOP_WAIT };
enum class Command { ADD, GET, SET, WAIT, STOP };
namespace detail {
class MasterDaemon {
 public:
  static std::unique_ptr<MasterDaemon> start(SocketType listen_socket,
                                             int nranks);
  MasterDaemon() = delete;
  explicit MasterDaemon(SocketType listen_socket, int nranks);
  ~MasterDaemon();
 private:
@@ -43,18 +44,20 @@ class MasterDaemon {
  void _do_add(SocketType socket);
  void _do_wait(SocketType socket);
  void _do_get(SocketType socket);
  void _do_set(SocketType socket);
  void _do_stop(SocketType socket);
  SocketType _listen_socket;
  std::vector<SocketType> _sockets;
  std::unordered_map<std::string, std::vector<uint8_t>> _store;
  std::thread _background_thread{};
  int _nranks;
  bool _stop = false;
};
class TCPServer {
 public:
  TCPServer() = default;
  static std::unique_ptr<TCPServer> create(std::uint16_t port, int nranks);
 private:
  std::unique_ptr<MasterDaemon> _master_daemon;
@@ -97,6 +100,7 @@ class TCPStore : public Store {
  int64_t add(const std::string& key, int64_t value) override;
  std::vector<uint8_t> get(const std::string& key) override;
  void wait(const std::string& key) override;
  void set(const std::string& key, const std::vector<uint8_t>& value) override;
 private:
  void waitWorkers();
......
@@ -46,9 +46,10 @@ void close_socket(SocketType socket) {
  hints.ai_socktype = SOCK_STREAM;
  const char* node = host.empty() ? nullptr : host.c_str();
  const char* port_cstr = port.empty() ? nullptr : port.c_str();
  int n;
  n = ::getaddrinfo(node, port_cstr, &hints, &res);
  const char* gai_err = ::gai_strerror(n);
  const char* proto =
      (family == AF_INET ? "IPv4" : family == AF_INET6 ? "IPv6" : "");
......
@@ -24,11 +24,14 @@ class GradNodeAccumulation : public GradNodeBase {
 public:
  // Constructor: configure fwd input tensors to grad node
  explicit GradNodeAccumulation(AutogradMeta* meta) : GradNodeBase(1, 1) {
    VLOG(6) << "Construct GradNodeAccumulation";
    weak_grad_ = meta->WeakGrad();
    SetDefaultGradInOutMeta();
  }
  ~GradNodeAccumulation() override {
    VLOG(6) << "Destruct GradNodeAccumulation";
  }
  // Functor: perform backward computations
  virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
......
@@ -46,7 +46,7 @@ class GradNodeScale : public GradNodeBase {
      const std::vector<paddle::experimental::Tensor>& tensors);
  void SetAttributes_scale(float scale);
  std::string name() override { return ""; }
  // Members: define fwd input tensors
  // For Scale there is no fwd input tensor needed
 private:
......
@@ -996,6 +996,29 @@ static std::string GenerateGradNodeCreationContent(
  // then generate: "egr::AutogradMeta* p_autograd_out =
  // egr::EagerUtils::autograd_meta("op_proto->outputs()[0].name()")"
  std::string get_autograd_meta_str = " // Prepare Autograd Meta \n";
// If single output slotname and not duplicable,
// then generate: "egr::AutogradMeta* p_autograd_out =
// egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")"
for (const proto::OpProto::Var& output : out_vars) {
const std::string& output_name = output.name();
const std::string& output_autograd_name = "p_autograd_" + output_name;
if (output.duplicable()) {
const char* GET_MULTI_AUTOGRAD_META_TEMPLATE =
" std::vector<egr::AutogradMeta*> %s = "
"egr::EagerUtils::autograd_meta(&%s);\n";
get_autograd_meta_str += paddle::string::Sprintf(
GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
} else {
const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
" egr::AutogradMeta* %s = "
"egr::EagerUtils::autograd_meta(&%s);\n";
get_autograd_meta_str += paddle::string::Sprintf(
GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
}
}
VLOG(6) << "Generated outputs autograd_meta";
  for (const proto::OpProto::Var& input : in_vars) {
    const std::string& input_name = input.name();
    const std::string& input_autograd_name = "p_autograd_" + input_name;
@@ -1024,31 +1047,6 @@ static std::string GenerateGradNodeCreationContent(
  }
  VLOG(6) << "Generated inputs autograd_meta";
// If single output slotname and not duplicable,
// then generate: "egr::AutogradMeta* p_autograd_out =
// egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")"
for (const proto::OpProto::Var& output : out_vars) {
const std::string& output_name = output.name();
const std::string& output_autograd_name = "p_autograd_" + output_name;
// Skip Intermediate Tensor
if (output.duplicable()) {
const char* GET_MULTI_AUTOGRAD_META_TEMPLATE =
" std::vector<egr::AutogradMeta*> %s = "
"egr::EagerUtils::autograd_meta(&%s);\n";
get_autograd_meta_str += paddle::string::Sprintf(
GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
} else {
const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
" egr::AutogradMeta* %s = "
"egr::EagerUtils::autograd_meta(&%s);\n";
get_autograd_meta_str += paddle::string::Sprintf(
GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
}
}
VLOG(6) << "Generated outputs autograd_meta";
std::string prepare_autograd_meta_str = ""; std::string prepare_autograd_meta_str = "";
prepare_autograd_meta_str += get_autograd_meta_str; prepare_autograd_meta_str += get_autograd_meta_str;
prepare_autograd_meta_str += "\n"; prepare_autograd_meta_str += "\n";
...@@ -1204,11 +1202,12 @@ static std::string GenerateGradNodeCreationContent( ...@@ -1204,11 +1202,12 @@ static std::string GenerateGradNodeCreationContent(
" %s" " %s"
" bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(%s);\n" " bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(%s);\n"
" if(require_any_grad) {\n" " if(require_any_grad) {\n"
" VLOG(6) << \" Construct Grad for %s \"; \n"
" egr::EagerUtils::PassStopGradient(%s);\n" " egr::EagerUtils::PassStopGradient(%s);\n"
"%s\n }"; "%s\n }";
std::string grad_node_creation_body_str = paddle::string::Sprintf( std::string grad_node_creation_body_str = paddle::string::Sprintf(
GRAD_NODE_CREATION_TEMPLATE, prepare_autograd_meta_str, GRAD_NODE_CREATION_TEMPLATE, prepare_autograd_meta_str,
compute_require_grad_args, pass_stop_gradient_args, compute_require_grad_args, op_type, pass_stop_gradient_args,
grad_node_creation_str); grad_node_creation_str);
return grad_node_creation_body_str; return grad_node_creation_body_str;
@@ -2083,22 +2082,24 @@ static std::string GenerateGradNodeHeaderContents(
  const char* GRAD_NODE_TEMPLATE =
      "class GradNode%s : public egr::GradNodeBase {\n"
      " public:\n"
      " GradNode%s() : egr::GradNodeBase() { VLOG(7) << \" Construct "
      "GradNode%s \"; }\n"
      " GradNode%s(size_t bwd_in_slot_num, size_t bwd_out_slot_num) : "
      "egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) { VLOG(7) << \" "
      "Construct GradNode%s \"; }\n"
      " ~GradNode%s() override { VLOG(6) << \" Destruct GradNode%s \"; }\n"
      "\n"
      " virtual std::vector<std::vector<paddle::experimental::Tensor>> "
      "operator()(const "
      "std::vector<std::vector<paddle::experimental::Tensor>>& grads) "
      "override;\n"
      "\n"
      " std::string name() override { return \" GradNode%s \"; } \n "
      "\n"
      " // SetX, SetY, ...\n"
      "%s\n"
      " // SetAttrMap\n"
      "%s\n"
      " std::string name() { return \"GradNode%s\"; }\n"
      "\n"
      " private:\n"
      " // TensorWrappers\n"
      "%s\n"
@@ -2195,8 +2196,8 @@ static std::string GenerateGradNodeHeaderContents(
  VLOG(6) << "Generated TensorWrapper";
  std::string grad_node_str = paddle::string::Sprintf(
      GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, op_type, op_type,
      op_type, op_type, set_tensor_wrappers_str, set_attr_map_str,
      tensor_wrapper_members_str, attr_members_str);
  return grad_node_str;
......
@@ -213,8 +213,12 @@ def ParseYamlReturns(string):
    returns = [x.strip() for x in string.strip().split(",")]
    for i in range(len(returns)):
        ret_type = returns[i]
        returns_list.append(["", ret, i])
        assert ret_type in yaml_types_mapping.keys()
        ret_type = yaml_types_mapping[ret_type]
        returns_list.append(["", ret_type, i])
    return returns_list
@@ -534,7 +538,7 @@ class {} : public egr::GradNodeBase {{
  virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
      const std::vector<std::vector<paddle::experimental::Tensor>>& grads) override;
  std::string name() override {{ return \" {} \"; }}
  // SetTensorWrapperX, SetTensorWrapperY, ...
  {}
  // SetAttributes
@@ -549,8 +553,9 @@ class {} : public egr::GradNodeBase {{
"""
    node_declaration_str = NODE_DECLARATION_TEMPLATE.format(
        grad_node_name, grad_node_name, grad_node_name, grad_node_name,
        grad_node_name, set_tensor_wrapper_methods_str,
        set_attribute_methods_str, tensor_wrapper_members_str,
        attribute_members_str)
    return node_declaration_str
......
@@ -48,12 +48,16 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
    }
    visited.insert(node);
PADDLE_ENFORCE_NOT_NULL(
node,
paddle::platform::errors::Fatal(
"We got null node when we traverse the backward graph, and this "
"should not happened please check your code and contact us."));
    // Find and append next nodes
    const std::vector<std::vector<Edge>>& edges = node->GetEdges();
    for (const auto& edge_list : edges) {
      for (const Edge& edge : edge_list) {
        GradNodeBase* next_node = edge.GetMutableGradNode().get();
        // Next node could be nullptr if it is leaf tensor with no
        // AccumulationNode attached
        // Or it could also originated from dispensable inputs
@@ -67,7 +71,6 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
      }
    }
  }
  return node_in_degree_map;
}
......
@@ -30,6 +30,7 @@
namespace egr {
GradNodeBase::GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num) {
  VLOG(6) << "Construct GradNodeBase";
  bwd_in_meta_.resize(bwd_in_slot_num);
  bwd_out_meta_.resize(bwd_out_slot_num);
  // adj_edges has the same num as backward outputs
@@ -49,11 +50,15 @@ void GradNodeBase::AddEdges(std::vector<AutogradMeta*>* metas, size_t slot_id) {
    // its pre-ops
    if (meta && !meta->StopGradient()) {
      auto node = meta->GetMutableGradNode();
      if (node && node.get()) {
        VLOG(6) << "Add Edges for slot: " << slot_id
                << " which is: " << meta->GetMutableGradNode()->name();
        adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
                                         meta->OutRankInfo());
      } else {
        meta->SetGradNode(std::make_shared<egr::GradNodeAccumulation>(meta));
        VLOG(6) << "Add Edges for slot: " << slot_id
                << " which is: " << meta->GetMutableGradNode()->name();
        adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
                                         meta->OutRankInfo());
      }
@@ -70,7 +75,7 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) {
                        "inputs's slot num."));
  if (meta && !meta->StopGradient()) {
    auto node = meta->GetMutableGradNode();
    if (node && node.get()) {
      VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from "
              << this->name() << " to " << meta->GetMutableGradNode()->name();
      adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
......
@@ -76,10 +76,10 @@ class GradSlotMeta {
class GradNodeBase {
 public:
  GradNodeBase() { VLOG(6) << "Construct GradNodeBase"; }
  GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num);
  // TODO(jiabin): Should we have other constructor here?
  virtual ~GradNodeBase() { VLOG(6) << "Destruct GradNodeBase"; }
  /**
   * operator() designed to contian the real backward execution logic, it should
......
@@ -30,6 +30,7 @@ class GradTestNode : public egr::GradNodeBase {
  GradTestNode(float val, int in_num, int out_num)
      : GradNodeBase(in_num, out_num), val_(val) {}
  GradTestNode() : GradNodeBase() { val_ = 1.0; }
  std::string name() override { return "GradTestNode"; }
  std::vector<std::vector<paddle::experimental::Tensor>> operator()(
      const std::vector<std::vector<paddle::experimental::Tensor>>& grads)
      override {
......
@@ -122,9 +122,10 @@ paddle::experimental::Tensor* EagerUtils::mutable_grad(
void EagerUtils::SetHistory(std::vector<AutogradMeta*>* autograd_metas,
                            const std::shared_ptr<GradNodeBase>& grad_node) {
  for (const auto& autograd_meta : *autograd_metas) {
    if (autograd_meta->GradNode()) {
      VLOG(7) << "Should not set grad node twice, original node is:"
              << autograd_meta->GradNode()->name()
              << "current is: " << grad_node->name();
    }
    autograd_meta->SetGradNode(grad_node);
  }
@@ -132,11 +133,11 @@ void EagerUtils::SetHistory(std::vector<AutogradMeta*>* autograd_metas,
void EagerUtils::SetHistory(AutogradMeta* autograd_meta,
                            const std::shared_ptr<GradNodeBase>& grad_node) {
  if (autograd_meta->GradNode()) {
    VLOG(7) << "Should not set grad node twice, original node is:"
            << autograd_meta->GradNode()->name()
            << "current is: " << grad_node->name();
  }
  autograd_meta->SetGradNode(grad_node);
}
......
@@ -12,12 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <random>
#include <string>
#include <unordered_set>
#include <gtest/gtest.h>
#include <boost/logic/tribool.hpp>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
#include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
@@ -25,7 +26,7 @@
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/place.h"
USE_OP_ITSELF(batch_norm);
USE_OP_DEVICE_KERNEL(batch_norm, MKLDNN);
USE_OP(conv2d_transpose);
USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN);
......
@@ -409,7 +409,7 @@ class ThreadPoolTempl {
      return false;
    }
    platform::RecordEvent("SleepWaitForWork",
                          platform::TracerEventType::UserDefined, 10);
    ec_.CommitWait(waiter);
    blocked_--;
    return true;
......
@@ -2106,6 +2106,8 @@ void OperatorWithKernel::BuildPhiKernelContext(
    for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
      phi::TensorBase* tensor_out = nullptr;
      auto* var = outs_vector[offset];
      if (var) {
        if (var->template IsType<framework::LoDTensor>()) {
          tensor_out = var->template GetMutable<framework::LoDTensor>();
        } else if (var->template IsType<phi::SelectedRows>()) {
@@ -2115,6 +2117,8 @@ void OperatorWithKernel::BuildPhiKernelContext(
              "Unsupported output `%s` type when call pt kernel.",
              framework::ToTypeName(var->Type())));
        }
      }
      pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
    }
@@ -2215,8 +2219,6 @@ void OperatorWithKernel::BuildPhiKernelContext(
                                  vector_int_attr.end());
        pt_kernel_context->EmplaceBackAttr(vector_int64_attr);
      }
      // TODO(YuanRisheng) Need support vector<int64_t> attr
    } else if (attr_defs[i].type_index ==
               std::type_index(typeid(std::vector<int32_t>))) {
      const auto& vector_int_attr = BOOST_GET_CONST(std::vector<int>, attr);
......
@@ -314,6 +314,7 @@ void BuildDygraphPhiKernelContext(
      phi::TensorBase* tensor_out = nullptr;
      auto* var = outs_vector[offset]->MutableVar();
      if (var) {
        if (var->template IsType<phi::DenseTensor>()) {
          tensor_out = var->template GetMutable<phi::DenseTensor>();
        } else if (var->template IsType<phi::SelectedRows>()) {
@@ -323,6 +324,8 @@ void BuildDygraphPhiKernelContext(
              "Unsupported output `%s` type when call pt kernel.",
              framework::ToTypeName(var->Type())));
        }
      }
      kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out);
    }
    kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i);
......
@@ -1289,15 +1289,3 @@ REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp,
                  ops::BatchNormDoubleGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(batch_norm_grad_grad, ops::BatchNormDoubleGradOp,
                  ops::BatchNormDoubleGradOpInplaceInferer);
REGISTER_OP_CPU_KERNEL(
batch_norm, ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>,
ops::BatchNormKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
batch_norm_grad,
ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
batch_norm_grad_grad,
ops::BatchNormDoubleGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::BatchNormDoubleGradKernel<paddle::platform::CPUDeviceContext, double>);
@@ -41,1327 +41,5 @@ using CudnnDataType = platform::CudnnDataType<T>;
template <typename T>
using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
template <typename T, framework::DataLayout layout>
static __global__ void BNForwardInference(
const T *x, const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance, const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias, const int C, const int N, const int HxW,
const double epsilon, T *y) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
int num = N * C * HxW;
for (int i = gid; i < num; i += stride) {
const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
BatchNormParamType<T> x_sub_mean =
static_cast<BatchNormParamType<T>>(x[i]) - mean[c];
BatchNormParamType<T> inv_var = 1 / sqrt(variance[c] + epsilon);
y[i] = static_cast<T>(scale[c] * x_sub_mean * inv_var + bias[c]);
}
}
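// For reference, BNForwardInference above evaluates, per element and per
// channel c (c derived from the flat index according to NCHW or NHWC layout):
//   y = scale[c] * (x - mean[c]) / sqrt(variance[c] + epsilon) + bias[c]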
template <typename T, int BlockDim, framework::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining(
const T *x, const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias, const int C, const int N, const int HxW,
const double epsilon, double exponentialAverageFactor, T *y,
BatchNormParamType<T> *mean, BatchNormParamType<T> *variance,
BatchNormParamType<T> *save_mean,
BatchNormParamType<T> *save_inv_variance) {
int outer_size = C;
int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage mean_storage;
__shared__ typename BlockReduce::TempStorage variance_storeage;
__shared__ BatchNormParamType<T> mean_val;
__shared__ BatchNormParamType<T> variance_val;
__shared__ BatchNormParamType<T> inv_var_val;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> x_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> x_square_sum = static_cast<BatchNormParamType<T>>(0);
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> x_i = static_cast<BatchNormParamType<T>>(x[index]);
x_sum += x_i;
x_square_sum += x_i * x_i;
}
x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum());
x_square_sum =
BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum());
if (threadIdx.x == 0) {
mean_val = x_sum / inner_size;
variance_val = x_square_sum / inner_size - mean_val * mean_val;
inv_var_val = 1 / sqrt(variance_val + epsilon);
if (save_mean && save_inv_variance) {
save_mean[i] = mean_val;
save_inv_variance[i] = inv_var_val;
}
mean[i] = (1 - exponentialAverageFactor) * mean_val +
exponentialAverageFactor * mean[i];
variance[i] = (1 - exponentialAverageFactor) * variance_val +
exponentialAverageFactor * variance[i];
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> x_sub_mean =
static_cast<BatchNormParamType<T>>(x[index]) - mean_val;
y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i];
}
}
}
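// For reference, BNForwardTraining above computes per-channel batch statistics
// with one CUDA block per channel and cub::BlockReduce:
//   mean_c     = sum(x) / (N * HxW)
//   variance_c = sum(x^2) / (N * HxW) - mean_c^2        (biased estimate)
//   inv_std_c  = 1 / sqrt(variance_c + epsilon)
// folds them into the running statistics as
//   running = (1 - exponentialAverageFactor) * batch
//             + exponentialAverageFactor * running
// and then writes y = scale[c] * (x - mean_c) * inv_std_c + bias[c].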
template <typename T>
class BatchNormKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::InvalidArgument("It must use CUDAPlace."));
double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test");
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const bool trainable_stats = ctx.Attr<bool>("trainable_statistics");
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
bool test_mode = is_test && (!trainable_stats);
// Get the size for each dimension.
// NCHW [batch_size, in_channels, in_height, in_width]
const auto *x = ctx.Input<Tensor>("X");
const auto &x_dims = x->dims();
PADDLE_ENFORCE_EQ(
x_dims.size() >= 2 && x_dims.size() <= 5, true,
platform::errors::InvalidArgument(
"The size of input's dimensions should be between 2 and 5"
"But received: the size of input's dimensions is [%d]",
x_dims.size()));
auto *y = ctx.Output<Tensor>("Y");
y->mutable_data<T>(ctx.GetPlace());
int N, C, H, W, D;
ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
auto dtype = platform::CudnnDataType<T>::type;
#ifdef PADDLE_WITH_HIP
auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC
: DataLayout::kNCHW;
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// HIP do not support compute format of NHWC
// auto compute_format = DataLayout::kNCHW;
#else
const bool fast_nhwc_batch_norm =
test_mode ||
(dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent);
auto compute_format =
fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC
? DataLayout::kNHWC
: DataLayout::kNCHW;
#endif
Tensor transformed_x(x->type());
Tensor transformed_y(y->type());
if (data_layout == DataLayout::kNHWC &&
compute_format == DataLayout::kNCHW && x_dims.size() > 2) {
VLOG(3) << "Transform input tensor from NHWC to NCHW.";
ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, x,
&transformed_x);
TransToChannelFirst<platform::CUDADeviceContext, T>(ctx, x,
&transformed_x);
ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, y,
&transformed_y);
} else {
transformed_x.ShareDataWith(*x);
transformed_y.ShareDataWith(*y);
}
// ------------------- cudnn descriptors ---------------------
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// miopenTensorDescriptor_t data_desc_;
// miopenTensorDescriptor_t bn_param_desc_;
// miopenBatchNormMode_t mode_;
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
#else
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t bn_param_desc_;
cudnnBatchNormMode_t mode_;
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
#endif
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// mode_ = miopenBNSpatial;
#elif CUDNN_VERSION_MIN(7, 0, 1)
if (FLAGS_cudnn_batchnorm_spatial_persistent) {
mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
} else if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#else
if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#endif // CUDNN_VERSION_MIN(7, 0, 1)
VLOG(3) << "Setting descriptors.";
std::vector<int> dims;
std::vector<int> strides;
if (compute_format == DataLayout::kNCHW) {
dims = {N, C, H, W, D};
strides = {C * H * W * D, H * W * D, W * D, D, 1};
} else {
dims = {N, C, H, W, D};
strides = {H * W * D * C, 1, W * D * C, D * C, C};
}
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
// data_desc_, CudnnDataType<T>::type,
// x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
// const_cast<int *>(strides.data())));
// Note: PERSISTENT not implemented for inference
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDeriveBNTensorDescriptor(
// bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
// Note: PERSISTENT not implemented for inference
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor(
bn_param_desc_, data_desc_,
test_mode ? CUDNN_BATCHNORM_SPATIAL : mode_));
#endif
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *bias = ctx.Input<Tensor>("Bias");
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
// Now, depending on whether we are running test or not, we have two paths.
// It is training mode when it's not reference AND not using pre-trained
// model.
bool training = !test_mode && !use_global_stats;
if (!training) {
// only when test we use input to do computation.
const auto *est_mean = ctx.Input<Tensor>("Mean");
const auto *est_var = ctx.Input<Tensor>("Variance");
// Run inference mode.
PADDLE_ENFORCE_EQ(
est_mean->dims().size(), 1UL,
platform::errors::InvalidArgument(
"The size of mean's dimensions must equal to 1."
"But received: the size of mean's dimensions mean is [%d],"
"the dimensions of mean is [%s].",
est_mean->dims().size(), est_mean->dims()));
PADDLE_ENFORCE_EQ(
est_var->dims().size(), 1UL,
platform::errors::InvalidArgument(
"The size of variance's dimensions must equal to 1."
"But received: the size of variance's dimensions is [%d],"
"the dimensions of variance is [%s].",
est_var->dims().size(), est_var->dims()));
PADDLE_ENFORCE_EQ(
est_mean->dims()[0], C,
platform::errors::InvalidArgument(
"The first dimension of mean must equal to the number of "
"Channels, which is [%d]. But received: the first dimension"
"of mean is [%d], the dimensions of mean is [%s].",
C, est_mean->dims()[0], est_mean->dims()));
PADDLE_ENFORCE_EQ(
est_var->dims()[0], C,
platform::errors::InvalidArgument(
"The first dimension of variance must equal to the number"
"of Channels, which is [%d]. But received: the first dimension of"
"variance is [%d], the dimensions of variance is [%s].",
C, est_var->dims()[0], est_var->dims()));
#ifdef PADDLE_WITH_HIP
const int block_size = 256;
const int grid_size = (N * C * H * W * D + block_size - 1) / block_size;
if (compute_format == DataLayout::kNCHW) {
BNForwardInference<
T,
DataLayout::kNCHW><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
transformed_x.template data<T>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(),
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(), C, N, H * W * D,
epsilon, transformed_y.template data<T>());
} else {
BNForwardInference<
T,
DataLayout::kNHWC><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
transformed_x.template data<T>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(),
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(), C, N, H * W * D,
epsilon, transformed_y.template data<T>());
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationForwardInference(
// handle, miopenBNSpatial,
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kOne())),
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kZero())),
// data_desc_,
// static_cast<const void *>(transformed_x.template data<T>()),
// data_desc_,
// static_cast<void *>(
// transformed_y.template mutable_data<T>(ctx.GetPlace())),
// bn_param_desc_,
// const_cast<void *>(static_cast<const void *>(
// scale->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// bias->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// est_mean->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// est_var->template data<BatchNormParamType<T>>())),
// epsilon));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnBatchNormalizationForwardInference(
handle,
// Note: PERSISTENT not implemented for inference
CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), data_desc_,
transformed_x.template data<T>(), data_desc_,
transformed_y.template mutable_data<T>(ctx.GetPlace()),
bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(), epsilon));
#endif
} else {
// if MomentumTensor is set, use MomentumTensor value, momentum
// is only used in this training branch
if (ctx.HasInput("MomentumTensor")) {
const auto *mom_tensor = ctx.Input<Tensor>("MomentumTensor");
Tensor mom_cpu;
paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(),
&mom_cpu);
momentum = mom_cpu.data<float>()[0];
}
// Run training mode.
// obtain running mean and running inv var, and there is no need
// to initialize them.
auto *mean_out = ctx.Output<Tensor>("MeanOut");
auto *variance_out = ctx.Output<Tensor>("VarianceOut");
mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
auto *saved_mean = ctx.Output<Tensor>("SavedMean");
auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
if ((N * H * W * D) == 1) {
// Only 1 element in normalization dimension,
// skip the batch norm calculation, let y = x.
framework::TensorCopy(*x, ctx.GetPlace(), y);
} else {
double this_factor = 1. - momentum;
bool called = false;
#if CUDNN_VERSION_MIN(7, 4, 1)
called = true;
size_t workspace_size = 0;
size_t reserve_space_size = 0;
void *reserve_space_ptr = nullptr;
void *workspace_ptr = nullptr;
Tensor workspace_tensor;
// Create reserve space and workspace for batch norm.
// Create tensor for each batchnorm op, it will be used in the
// backward. Thus this tensor shouldn't be temp.
auto *reserve_space = ctx.Output<Tensor>("ReserveSpace");
PADDLE_ENFORCE_NOT_NULL(
reserve_space,
platform::errors::NotFound(
"The argument ReserveSpace of batch_norm op is not found."));
// --------------- cudnn batchnorm workspace ---------------
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::
cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
/*handle=*/handle,
/*mode=*/mode_,
/*bnIps=*/CUDNN_BATCHNORM_OPS_BN,
/*xDesc=*/data_desc_,
/*zDesc=*/nullptr,
/*yDesc=*/data_desc_,
/*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
/*activationDesc=*/nullptr,
/*sizeInBytes=*/&workspace_size));
// -------------- cudnn batchnorm reserve space --------------
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::
cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
/*handle=*/handle,
/*mode=*/mode_,
/*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
/*activationDesc=*/nullptr,
/*xDesc=*/data_desc_,
/*sizeInBytes=*/&reserve_space_size));
reserve_space_ptr = reserve_space->mutable_data(
ctx.GetPlace(), transformed_x.type(), reserve_space_size);
workspace_ptr = workspace_tensor.mutable_data(
ctx.GetPlace(), transformed_x.type(), workspace_size);
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnBatchNormalizationForwardTrainingEx(
handle, mode_, CUDNN_BATCHNORM_OPS_BN, CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), data_desc_,
transformed_x.template data<T>(), nullptr, nullptr, data_desc_,
transformed_y.template data<T>(), bn_param_desc_,
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(), this_factor,
mean_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
variance_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon,
saved_mean->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
saved_variance->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
nullptr, workspace_ptr, workspace_size, reserve_space_ptr,
reserve_space_size));
#endif // CUDNN_VERSION_MIN(7, 4, 1)
if (!called) {
#ifdef PADDLE_WITH_HIP
const int num = transformed_x.numel();
const int block = 256;
const int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
const int grid = std::min(C, max_blocks);
if (compute_format == DataLayout::kNCHW) {
BNForwardTraining<
T, block,
DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>(
transformed_x.template data<T>(),
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(), C, N, H * W * D,
epsilon, this_factor, transformed_y.template data<T>(),
mean_out->template data<BatchNormParamType<T>>(),
variance_out->template data<BatchNormParamType<T>>(),
saved_mean->template data<BatchNormParamType<T>>(),
saved_variance->template data<BatchNormParamType<T>>());
} else {
BNForwardTraining<
T, block,
DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>(
transformed_x.template data<T>(),
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(), C, N, H * W * D,
epsilon, this_factor, transformed_y.template data<T>(),
mean_out->template data<BatchNormParamType<T>>(),
variance_out->template data<BatchNormParamType<T>>(),
saved_mean->template data<BatchNormParamType<T>>(),
saved_variance->template data<BatchNormParamType<T>>());
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationForwardTraining(
// handle, mode_, const_cast<void *>(static_cast<const void *>(
// CudnnDataType<T>::kOne())),
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kZero())),
// data_desc_,
// static_cast<const void *>(transformed_x.template data<T>()),
// data_desc_,
// static_cast<void *>(
// transformed_y.template mutable_data<T>(ctx.GetPlace())),
// bn_param_desc_,
// const_cast<void *>(static_cast<const void *>(
// scale->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// bias->template data<BatchNormParamType<T>>())),
// this_factor,
// static_cast<void *>(
// mean_out->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace())),
// static_cast<void *>(variance_out->template mutable_data<
// BatchNormParamType<T>>(ctx.GetPlace())),
// epsilon,
// static_cast<void *>(
// saved_mean->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace())),
// static_cast<void *>(saved_variance->template mutable_data<
// BatchNormParamType<T>>(ctx.GetPlace()))));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnBatchNormalizationForwardTraining(
handle, mode_, CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), data_desc_,
transformed_x.template data<T>(), data_desc_,
transformed_y.template mutable_data<T>(ctx.GetPlace()),
bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(), this_factor,
mean_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
variance_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon,
saved_mean->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
saved_variance->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace())));
#endif
}
}
}
if (data_layout == DataLayout::kNHWC &&
compute_format == DataLayout::kNCHW && x_dims.size() > 2) {
VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
ctx, &transformed_y, y);
}
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// clean when exit.
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
#else
// clean when exit.
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
#endif
}
};
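// Forward path summary for the kernel above:
//  * inference / use_global_stats: cudnnBatchNormalizationForwardInference,
//    or the hand-written BNForwardInference kernel on the HIP build;
//  * training with CUDNN_VERSION >= 7.4.1: cudnnBatchNormalizationForwardTrainingEx
//    with an explicit workspace and a ReserveSpace output;
//  * otherwise: cudnnBatchNormalizationForwardTraining, or BNForwardTraining on HIP.
// A single-element normalization dimension (N * H * W * D == 1) simply copies x to y.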
template <typename T, int BlockDim, framework::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void KeBNBackwardScaleBias(
const T *dy, const T *x, const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance, const double epsilon, const int N,
const int C, const int HxW, BatchNormParamType<T> *dscale,
BatchNormParamType<T> *dbias) {
const int outer_size = C;
const int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage ds_storage;
__shared__ typename BlockReduce::TempStorage db_storage;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> ds_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> db_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> inv_var_i = 1.0 / sqrt(variance[i] + epsilon);
BatchNormParamType<T> mean_i = mean[i];
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
ds_sum += static_cast<BatchNormParamType<T>>(dy[index]) *
(static_cast<BatchNormParamType<T>>(x[index]) - mean_i);
db_sum += static_cast<BatchNormParamType<T>>(dy[index]);
}
ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
if (threadIdx.x == 0) {
dscale[i] = ds_sum * inv_var_i;
dbias[i] = db_sum;
}
__syncthreads();
}
}
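// For reference, KeBNBackwardScaleBias above reduces, per channel c:
//   dscale[c] = sum_i dy_i * (x_i - mean[c]) / sqrt(variance[c] + epsilon)
//   dbias[c]  = sum_i dy_i
// with one CUDA block per channel and cub::BlockReduce for both sums.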
template <typename T, framework::DataLayout layout>
static __global__ void KeBNBackwardData(const T *dy,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *variance,
const double epsilon, const int C,
const int HxW, const int num, T *dx) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = gid; i < num; i += stride) {
const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
BatchNormParamType<T> inv_var = 1.0 / sqrt(variance[c] + epsilon);
dx[i] = static_cast<T>(static_cast<BatchNormParamType<T>>(dy[i]) *
scale[c] * inv_var);
}
}
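// For reference, KeBNBackwardData above is the use_global_stats form of the
// input gradient, where the statistics are treated as constants:
//   dx_i = dy_i * scale[c] / sqrt(variance[c] + epsilon)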
template <typename T>
static __global__ void KeBNRestoreData(const framework::DataLayout layout, T *x,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance,
double epsilon, int C, int M,
const int num, const T *y) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = gid; i < num; i += stride) {
const int c = layout == framework::DataLayout::kNCHW ? (i / M) % C : i % C;
auto y_i = static_cast<BatchNormParamType<T>>(y[i]);
auto x_i = (y_i - bias[c]) / scale[c] / variance[c] + mean[c];
x[i] = static_cast<T>(x_i);
}
}
template <typename T>
class InplaceHelper {
public:
void operator()(const framework::DataLayout layout, T *x,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance, double epsilon, int C,
int M, const int num, const T *y, int grid2, const int block,
const gpuStream_t &stream) {
PADDLE_ENFORCE_EQ(x, y, platform::errors::InvalidArgument(
"X and Y should be inplaced in inplace mode"));
KeBNRestoreData<<<grid2, block, 0, stream>>>(
layout, x, scale, bias, mean, variance, epsilon, C, M, num, y);
}
};
template <typename T, int BlockDim, framework::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackward(
const T *dy, const T *x, const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *saved_mean,
const BatchNormParamType<T> *saved_inv_variance, const int C, const int N,
const int HxW, const double epsilon, T *dx, BatchNormParamType<T> *dscale,
BatchNormParamType<T> *dbias) {
const int outer_size = C;
const int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage ds_storage;
__shared__ typename BlockReduce::TempStorage db_storage;
__shared__ typename BlockReduce::TempStorage mean_storage;
__shared__ typename BlockReduce::TempStorage variance_storeage;
__shared__ BatchNormParamType<T> inv_var_val;
__shared__ BatchNormParamType<T> mean_val;
__shared__ BatchNormParamType<T> dscale_val;
__shared__ BatchNormParamType<T> dbias_val;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> ds_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> db_sum = static_cast<BatchNormParamType<T>>(0);
if (saved_mean && saved_inv_variance) {
if (threadIdx.x == 0) {
inv_var_val = saved_inv_variance[i];
mean_val = saved_mean[i];
}
} else {
BatchNormParamType<T> x_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> x_square_sum =
static_cast<BatchNormParamType<T>>(0);
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> x_i =
static_cast<BatchNormParamType<T>>(x[index]);
x_sum += x_i;
x_square_sum += x_i * x_i;
}
x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum());
x_square_sum =
BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum());
if (threadIdx.x == 0) {
mean_val = x_sum / inner_size;
inv_var_val =
1 / sqrt(x_square_sum / inner_size - mean_val * mean_val + epsilon);
}
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> dy_i =
static_cast<BatchNormParamType<T>>(dy[index]);
ds_sum +=
dy_i * (static_cast<BatchNormParamType<T>>(x[index]) - mean_val);
db_sum += dy_i;
}
ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
if (threadIdx.x == 0) {
dscale_val = ds_sum * inv_var_val;
dbias_val = db_sum;
dscale[i] = dscale_val;
dbias[i] = dbias_val;
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
dx[index] = scale[i] * inv_var_val *
(static_cast<BatchNormParamType<T>>(dy[index]) -
dbias_val / static_cast<BatchNormParamType<T>>(inner_size) -
(static_cast<BatchNormParamType<T>>(x[index]) - mean_val) *
inv_var_val * dscale_val / inner_size);
}
}
}
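// For reference, BNBackward above implements the full training-mode input
// gradient. With m = N * HxW, mean/inv_std either read from the saved
// statistics or recomputed in-kernel, and dscale/dbias as stored above:
//   dx_i = scale[c] * inv_std_c *
//          (dy_i - dbias[c] / m - (x_i - mean_c) * inv_std_c * dscale[c] / m)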
template <typename T, int BlockDim, framework::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackwardData(
const T *dy, const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *mean, const T *x,
const BatchNormParamType<T> *variance, const int C, const int N,
const int HxW, T *dx) {
const int outer_size = C;
const int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage dy_storage;
__shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage;
__shared__ BatchNormParamType<T> dy_sum_val;
__shared__ BatchNormParamType<T> dy_x_sub_mean_sum_val;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> inv_var_i = variance[i];
BatchNormParamType<T> mean_i = mean[i];
BatchNormParamType<T> dy_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> dy_x_sub_mean_sum =
static_cast<BatchNormParamType<T>>(0);
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> dy_i =
static_cast<BatchNormParamType<T>>(dy[index]);
dy_sum += dy_i;
dy_x_sub_mean_sum +=
dy_i * (static_cast<BatchNormParamType<T>>(x[index]) - mean_i);
}
dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
dy_x_sub_mean_sum = BlockReduce(dy_x_sub_mean_storage)
.Reduce(dy_x_sub_mean_sum, cub::Sum());
if (threadIdx.x == 0) {
dy_sum_val = dy_sum;
dy_x_sub_mean_sum_val = dy_x_sub_mean_sum;
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
dx[index] =
(static_cast<BatchNormParamType<T>>(dy[index]) -
dy_sum_val / static_cast<BatchNormParamType<T>>(inner_size) -
(static_cast<BatchNormParamType<T>>(x[index]) - mean_i) *
dy_x_sub_mean_sum_val * inv_var_i * inv_var_i / inner_size) *
scale[i] * inv_var_i;
}
}
}
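// For reference, BNBackwardData above computes only dx (no parameter
// gradients); its `variance` argument is consumed directly as the saved
// inverse standard deviation (inv_var_i = variance[i]). With m = N * HxW:
//   dx_i = scale[c] * inv_var_c *
//          (dy_i - sum(dy) / m
//                - (x_i - mean_c) * inv_var_c^2 * sum(dy * (x - mean_c)) / m)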
template <typename T>
class BatchNormGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::InvalidArgument("It must use CUDAPlace."));
double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *bias = ctx.Input<Tensor>("Bias");
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
// batch_norm with inplace as false will take X as grad input, which
// is same as cuDNN batch_norm backward calculation, batch_norm
// with inplace as true only take Y as input and X should be calculate
// by inverse operation of batch_norm on Y
const Tensor *x;
bool is_inplace;
if (ctx.HasInput("Y")) {
x = ctx.Input<Tensor>("Y");
is_inplace = true;
if (d_x) {
PADDLE_ENFORCE_EQ(d_x, d_y,
platform::errors::InvalidArgument(
"X@GRAD and Y@GRAD not inplace in inplace mode"));
}
} else {
x = ctx.Input<Tensor>("X");
is_inplace = false;
if (d_x) {
PADDLE_ENFORCE_NE(
d_x, d_y, platform::errors::InvalidArgument(
"X@GRAD and Y@GRAD inplaced in non-inplace mode"));
}
}
const bool is_test = ctx.Attr<bool>("is_test");
use_global_stats = is_test || use_global_stats;
const auto &x_dims = x->dims();
PADDLE_ENFORCE_EQ(
x_dims.size() >= 2 && x_dims.size() <= 5, true,
platform::errors::InvalidArgument(
"The size of input's dimensions should be between 2 and 5."
"But received: the size of input's dimensions is [%d],"
"the dimensions of input is [%s]",
x_dims.size(), x_dims));
int N, C, H, W, D;
ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
// init output
if (d_x) {
d_x->mutable_data<T>(ctx.GetPlace());
}
if (d_scale && d_bias) {
d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
}
PADDLE_ENFORCE_EQ(
scale->dims().size(), 1UL,
platform::errors::InvalidArgument(
"The size of scale's dimensions must equal to 1. But received: "
"the size of scale's dimensions is [%d], the dimensions of scale "
"is [%s].",
scale->dims().size(), scale->dims()));
PADDLE_ENFORCE_EQ(
scale->dims()[0], C,
platform::errors::InvalidArgument(
"The first dimension of scale must equal to Channels[%d]. But "
"received: the first dimension of scale is [%d]",
C, scale->dims()[0]));
auto dtype = platform::CudnnDataType<T>::type;
const auto *reserve_space = ctx.Input<Tensor>("ReserveSpace");
#ifdef PADDLE_WITH_HIP
auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC
: DataLayout::kNCHW;
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// HIP do not support compute format of NHWC
// auto compute_format = DataLayout::kNCHW;
#else
const bool fast_nhwc_batch_norm =
dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent &&
reserve_space != nullptr;
auto compute_format =
fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC
? DataLayout::kNHWC
: DataLayout::kNCHW;
#endif
Tensor transformed_x(x->type());
Tensor transformed_d_y(d_y->type());
Tensor transformed_d_x;
if (data_layout == DataLayout::kNHWC &&
compute_format == DataLayout::kNCHW && x_dims.size() > 2) {
VLOG(3) << "Transform input tensor from NHWC to NCHW.";
ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, x,
&transformed_x);
TransToChannelFirst<platform::CUDADeviceContext, T>(ctx, x,
&transformed_x);
ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, d_y,
&transformed_d_y);
TransToChannelFirst<platform::CUDADeviceContext, T>(ctx, d_y,
&transformed_d_y);
if (d_x) {
ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, d_x,
&transformed_d_x);
}
} else {
transformed_x.ShareDataWith(*x);
transformed_d_y.ShareDataWith(*d_y);
if (d_x) {
transformed_d_x.ShareDataWith(*d_x);
}
}
std::vector<int> dims;
std::vector<int> strides;
if (compute_format == DataLayout::kNCHW) {
dims = {N, C, H, W, D};
strides = {C * H * W * D, H * W * D, W * D, D, 1};
} else {
dims = {N, C, H, W, D};
strides = {H * W * C * D, 1, W * D * C, D * C, C};
}
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
const int num = transformed_x.numel();
#ifdef HIPCC
const int block = 256;
#else
const int block = 512;
#endif
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
int grid1 = (num + block - 1) / block;
int grid2 = std::min(C, max_blocks);
auto stream = dev_ctx.stream();
InplaceHelper<T> inplace_functor;
if (!use_global_stats) {
if ((N * H * W * D) == 1) {
if (d_x) {
framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
}
phi::funcs::SetConstant<platform::CUDADeviceContext,
BatchNormParamType<T>>
functor;
functor(dev_ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
functor(dev_ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
return;
}
// ------------------- cudnn descriptors ---------------------
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// miopenTensorDescriptor_t data_desc_;
// miopenTensorDescriptor_t bn_param_desc_;
// miopenBatchNormMode_t mode_;
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
#else
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t bn_param_desc_;
cudnnBatchNormMode_t mode_;
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
#endif
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// mode_ = miopenBNSpatial;
#elif CUDNN_VERSION_MIN(7, 0, 1)
if (FLAGS_cudnn_batchnorm_spatial_persistent) {
mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
} else if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#else
if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#endif // CUDNN_VERSION_MIN(7, 0, 1)
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
// data_desc_, CudnnDataType<T>::type,
// x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
// const_cast<int *>(strides.data())));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_,
// data_desc_, mode_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
data_desc_, mode_));
#endif
const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
const auto *saved_mean_data =
saved_mean->template data<BatchNormParamType<T>>();
const auto *saved_var_data =
saved_var->template data<BatchNormParamType<T>>();
if (is_inplace) {
inplace_functor(compute_format, transformed_x.data<T>(),
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(),
saved_mean_data, saved_var_data, epsilon, C, H * W * D,
num, transformed_x.data<T>(), grid2, block, stream);
}
// This branch calls CUDNN APIs
if (d_x && d_scale && d_bias) {
bool called = false;
#if CUDNN_VERSION_MIN(7, 4, 1)
called = true;
size_t workspace_size = 0;
void *workspace_ptr = nullptr;
Tensor workspace_tensor;
auto reserve_space_size = reserve_space->memory_size();
// --------------- cudnn batchnorm workspace ---------------
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::
cudnnGetBatchNormalizationBackwardExWorkspaceSize(
/*handle=*/dev_ctx.cudnn_handle(),
/*mode=*/mode_,
/*bnIps=*/CUDNN_BATCHNORM_OPS_BN,
/*xDesc=*/data_desc_,
/*yDesc=*/data_desc_,
/*dyDesc=*/data_desc_,
/*dzDesc=*/nullptr,
/*dxDesc=*/data_desc_,
/*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
/*activationDesc=*/nullptr,
/*sizeInBytes=*/&workspace_size));
workspace_ptr = workspace_tensor.mutable_data(
ctx.GetPlace(), transformed_x.type(), workspace_size);
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnBatchNormalizationBackwardEx(
/*handle=*/dev_ctx.cudnn_handle(),
/*mode=*/mode_,
/*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
/*alphaDataDiff=*/CudnnDataType<T>::kOne(),
/*betaDataDiff=*/CudnnDataType<T>::kZero(),
/*alphaParamDiff=*/CudnnDataType<T>::kOne(),
/*betaParamDiff=*/CudnnDataType<T>::kZero(),
/*xDesc=*/data_desc_,
/*xData=*/transformed_x.template data<T>(),
/*yDesc=*/nullptr,
/*yData=*/nullptr,
/*dyDesc=*/data_desc_,
/*dyData=*/transformed_d_y.template data<T>(),
/*dzDesc=*/nullptr,
/*dzData=*/nullptr,
/*dxDesc=*/data_desc_,
/*dxData=*/transformed_d_x.template mutable_data<T>(
ctx.GetPlace()),
/*dBnScaleBiasDesc=*/bn_param_desc_,
/*bnScaleData=*/scale->template data<BatchNormParamType<T>>(),
/*bnBiasData=*/nullptr,
/*dBnScaleData=*/d_scale
->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
/*dBnBiasData=*/d_bias
->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
/*epsilon=*/epsilon,
/*savedMean=*/saved_mean_data,
/*savedInvVariance=*/saved_var_data,
/*activationDesc=*/nullptr,
/*workspace=*/workspace_ptr,
/*workSpaceSizeInBytes=*/workspace_size,
/*reserveSpace=*/const_cast<T *>(
reserve_space->template data<T>()),
/*reserveSpaceSizeInBytes=*/reserve_space_size));
#endif // CUDNN_VERSION_MIN(7, 4, 1)
if (!called) {
#ifdef PADDLE_WITH_HIP
if (compute_format == DataLayout::kNCHW) {
BNBackward<
T, block,
DataLayout::kNCHW><<<grid2, block, 0, dev_ctx.stream()>>>(
transformed_d_y.template data<T>(),
transformed_x.template data<T>(),
scale->template data<BatchNormParamType<T>>(), saved_mean_data,
saved_var_data, C, N, H * W * D, epsilon,
transformed_d_x.template data<T>(),
d_scale->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()));
} else {
BNBackward<
T, block,
DataLayout::kNHWC><<<grid2, block, 0, dev_ctx.stream()>>>(
transformed_d_y.template data<T>(),
transformed_x.template data<T>(),
scale->template data<BatchNormParamType<T>>(), saved_mean_data,
saved_var_data, C, N, H * W * D, epsilon,
transformed_d_x.template data<T>(),
d_scale->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()));
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationBackward(
// dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
// CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
// CudnnDataType<T>::kZero(), data_desc_,
// transformed_x.template data<T>(), data_desc_,
// transformed_d_y.template data<T>(), data_desc_,
// transformed_d_x.template mutable_data<T>(ctx.GetPlace()),
// bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
// d_scale->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace()),
// d_bias->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace()),
// epsilon, saved_mean_data, saved_var_data));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnBatchNormalizationBackward(
dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), data_desc_,
transformed_x.template data<T>(), data_desc_,
transformed_d_y.template data<T>(), data_desc_,
transformed_d_x.template mutable_data<T>(ctx.GetPlace()),
bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
d_scale->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon, saved_mean_data, saved_var_data));
#endif
}
if (data_layout == DataLayout::kNHWC &&
compute_format == DataLayout::kNCHW) {
VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
ctx, &transformed_d_x, d_x);
}
} else {
// This branch call CUDA kernels
if (compute_format == DataLayout::kNCHW) {
if (d_x) {
BNBackwardData<T, block, framework::DataLayout::kNCHW><<<
grid2, block, 0, dev_ctx.stream()>>>(
d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
saved_mean_data, x->data<T>(), saved_var_data, C, N, H * W * D,
d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T, block,
framework::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
d_y->data<T>(), x->data<T>(), saved_mean_data, saved_var_data,
epsilon, N, C, H * W * D,
d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
} else {
if (d_x) {
BNBackwardData<T, block, framework::DataLayout::kNHWC><<<
grid2, block, 0, dev_ctx.stream()>>>(
d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
saved_mean_data, x->data<T>(), saved_var_data, C, N, H * W * D,
d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T, block,
framework::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
d_y->data<T>(), x->data<T>(), saved_mean_data, saved_var_data,
epsilon, N, C, H * W * D,
d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
}
}
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// clean when exit.
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
#else
// clean when exit.
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
#endif
} else {
const auto *running_mean = ctx.Input<Tensor>("Mean");
const auto *running_var = ctx.Input<Tensor>("Variance");
const auto *running_mean_data =
running_mean->template data<BatchNormParamType<T>>();
const auto *running_var_data =
running_var->template data<BatchNormParamType<T>>();
if (is_inplace) {
auto px = *x;
inplace_functor(data_layout, px.mutable_data<T>(ctx.GetPlace()),
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(),
running_mean_data, running_var_data, epsilon, C,
H * W * D, num, x->data<T>(), grid2, block, stream);
}
if (compute_format == DataLayout::kNCHW) {
if (d_x) {
KeBNBackwardData<
T, framework::DataLayout::kNCHW><<<grid1, block, 0, stream>>>(
d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
running_var_data, epsilon, C, H * W, num, d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T, block,
framework::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data,
epsilon, N, C, H * W * D, d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
} else {
if (d_x) {
KeBNBackwardData<
T, framework::DataLayout::kNHWC><<<grid1, block, 0, stream>>>(
d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
running_var_data, epsilon, C, H * W, num, d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T, block,
framework::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data,
epsilon, N, C, H * W * D, d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
}
}
}
};
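// Backward path summary for the kernel above:
//  * batch statistics (!use_global_stats) with all of dX/dScale/dBias requested:
//    cudnnBatchNormalizationBackwardEx (CUDNN_VERSION >= 7.4.1), the plain
//    cudnnBatchNormalizationBackward otherwise, or the BNBackward kernel on HIP;
//  * batch statistics with only some gradients requested: the BNBackwardData
//    and/or KeBNBackwardScaleBias CUDA kernels driven by the saved statistics;
//  * use_global_stats (or is_test): KeBNBackwardData / KeBNBackwardScaleBias
//    driven by the running mean and variance.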
template <typename T>
class BatchNormDoubleGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *X = ctx.Input<Tensor>("X");
const auto *Scale = ctx.Input<Tensor>("Scale");
const auto *dY = ctx.Input<Tensor>("DY");
const auto *Saved_mean = ctx.Input<Tensor>("SavedMean");
const auto *Saved_variance = ctx.Input<Tensor>("SavedVariance");
const double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const bool is_test = ctx.Attr<bool>("is_test");
PADDLE_ENFORCE_EQ(
is_test, false,
platform::errors::InvalidArgument(
"`is_test = True` CANNOT be used in train program. If "
"you want to use global status in pre_train model, "
"please set `use_global_stats = True`"));
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const auto *ddX = ctx.Input<Tensor>("DDX");
const auto *ddScale = ctx.Input<Tensor>("DDScale");
const auto *ddBias = ctx.Input<Tensor>("DDBias");
auto *dX = ctx.Output<Tensor>("DX");
auto *dScale = ctx.Output<Tensor>("DScale");
auto *ddY = ctx.Output<Tensor>("DDY");
NormDoubleGradFunctor<platform::CUDADeviceContext, T>(
ctx, data_layout, X, Scale, dY, Saved_mean, Saved_variance, epsilon,
use_global_stats, ddX, ddScale, ddBias, dX, dScale, ddY);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
#ifdef PADDLE_WITH_HIP
// MIOPEN do not support double
REGISTER_OP_CUDA_KERNEL(
batch_norm, ops::BatchNormKernel<plat::CUDADeviceContext, float>,
ops::BatchNormKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>,
ops::BatchNormGradKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
batch_norm_grad_grad,
ops::BatchNormDoubleGradKernel<plat::CUDADeviceContext, float>);
#else
REGISTER_OP_CUDA_KERNEL(
batch_norm, ops::BatchNormKernel<plat::CUDADeviceContext, float>,
ops::BatchNormKernel<plat::CUDADeviceContext, double>,
ops::BatchNormKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>,
ops::BatchNormGradKernel<plat::CUDADeviceContext, double>,
ops::BatchNormGradKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
batch_norm_grad_grad,
ops::BatchNormDoubleGradKernel<plat::CUDADeviceContext, float>,
ops::BatchNormDoubleGradKernel<plat::CUDADeviceContext, double>);
#endif
...@@ -25,10 +25,10 @@ limitations under the License. */
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#endif
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/operators/math/padding.h"
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/kernels/funcs/padding.h"
DECLARE_bool(cudnn_deterministic);
DECLARE_uint64(conv_workspace_size_limit);
...@@ -148,7 +148,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim);
bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_input;
std::vector<int> padding_common(data_dim, 0);
...@@ -196,13 +196,13 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
T pad_value(0.0);
switch (rank) {
case 4: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_input_channel, pad_value,
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input);
} break;
case 5: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, transformed_input_channel, pad_value,
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input);
} break;
default:
...@@ -488,7 +488,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
// cuDNN only supports padding the same amount on every dimension.
// So we create a new padded input tensor.
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim);
bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_input(input->type());
Tensor transformed_input_grad(input->type());
std::vector<int> padding_common(data_dim, 0);
...@@ -544,13 +544,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
T pad_value(0.0);
switch (rank) {
case 4: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_input_channel, pad_value,
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input);
} break;
case 5: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, transformed_input_channel, pad_value,
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input);
} break;
default:
...@@ -956,7 +956,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim);
bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_X(X->type());
Tensor transformed_ddX(X->type());
...@@ -1004,20 +1004,22 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
T pad_value(0.0);
switch (rank) {
case 4: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_X_channel, pad_value, &transformed_X);
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, transformed_X_channel, pad_value,
&transformed_X);
if (ddX) {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_ddX_channel, pad_value,
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, transformed_ddX_channel, pad_value,
&transformed_ddX);
}
} break;
case 5: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, transformed_X_channel, pad_value, &transformed_X);
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, transformed_X_channel, pad_value,
&transformed_X);
if (ddX) {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, transformed_ddX_channel, pad_value,
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, transformed_ddX_channel, pad_value,
&transformed_ddX);
}
} break;
...
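Every padding hunk in this file follows the same mechanical substitution: `math::IsSymmetricPadding` / `math::PadFunction` from `paddle/fluid/operators/math/padding.h` are replaced by their `phi::funcs` counterparts from `paddle/phi/kernels/funcs/padding.h`, and the phi variants take the device context (`dev_ctx`) instead of the framework `ExecutionContext` (`ctx`). The fragment below assembles the new call shape from the hunks above; it is an illustrative excerpt, not a standalone compilable unit:

// New-style call site (identifiers as they appear in the hunks above).
// Note: dev_ctx is the CUDADeviceContext already available in these kernels.
bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
T pad_value(0.0);
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
    dev_ctx, input_pad, transformed_input_channel, pad_value,
    &transformed_input);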
...@@ -21,8 +21,8 @@ limitations under the License. */ ...@@ -21,8 +21,8 @@ limitations under the License. */
#include "paddle/fluid/operators/conv_cudnn_helper.h" #include "paddle/fluid/operators/conv_cudnn_helper.h"
#endif #endif
#include "paddle/fluid/operators/conv_transpose_op.h" #include "paddle/fluid/operators/conv_transpose_op.h"
#include "paddle/fluid/operators/math/padding.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/padding.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -108,7 +108,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> { ...@@ -108,7 +108,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
in_data_dims, strides, ksize); in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
std::vector<int> input_pad(input_transpose.dims().size() * 2, 0); std::vector<int> input_pad(input_transpose.dims().size() * 2, 0);
Tensor transformed_input; Tensor transformed_input;
...@@ -139,12 +139,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> { ...@@ -139,12 +139,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
T pad_value(0.0); T pad_value(0.0);
switch (rank) { switch (rank) {
case 4: { case 4: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, input_transpose, pad_value, &transformed_input); dev_ctx, input_pad, input_transpose, pad_value,
&transformed_input);
} break; } break;
case 5: { case 5: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, input_transpose, pad_value, &transformed_input); dev_ctx, input_pad, input_transpose, pad_value,
&transformed_input);
} break; } break;
default: default:
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
...@@ -375,7 +377,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -375,7 +377,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
in_data_dims, strides, ksize); in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
std::vector<int> input_pad(input_transpose.dims().size() * 2, 0); std::vector<int> input_pad(input_transpose.dims().size() * 2, 0);
Tensor transformed_output_grad; Tensor transformed_output_grad;
...@@ -407,13 +409,13 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -407,13 +409,13 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
T pad_value(0.0); T pad_value(0.0);
switch (rank) { switch (rank) {
case 4: { case 4: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, output_grad_transpose, pad_value, dev_ctx, input_pad, output_grad_transpose, pad_value,
&transformed_output_grad); &transformed_output_grad);
} break; } break;
case 5: { case 5: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, output_grad_transpose, pad_value, dev_ctx, input_pad, output_grad_transpose, pad_value,
&transformed_output_grad); &transformed_output_grad);
} break; } break;
default: default:
...@@ -735,7 +737,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> { ...@@ -735,7 +737,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
in_data_dims, strides, ksize); in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_X(X->type()); Tensor transformed_X(X->type());
Tensor transformed_ddX(X->type()); Tensor transformed_ddX(X->type());
...@@ -794,26 +796,28 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> { ...@@ -794,26 +796,28 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
T pad_value(0.0); T pad_value(0.0);
switch (rank) { switch (rank) {
case 4: { case 4: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); dev_ctx, input_pad, transformed_X_channel, pad_value,
&transformed_X);
if (dO) { if (dO) {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_dO_channel, pad_value, dev_ctx, input_pad, transformed_dO_channel, pad_value,
&transformed_dO); &transformed_dO);
} }
if (ddX) { if (ddX) {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_ddX_channel, pad_value, dev_ctx, input_pad, transformed_ddX_channel, pad_value,
&transformed_ddX); &transformed_ddX);
} }
} break; } break;
case 5: { case 5: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); dev_ctx, input_pad, transformed_X_channel, pad_value,
&transformed_X);
if (ddX) { if (ddX) {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, transformed_ddX_channel, pad_value, dev_ctx, input_pad, transformed_ddX_channel, pad_value,
&transformed_ddX); &transformed_ddX);
} }
} break; } break;
......
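The hunks above only swap the math:: helpers for their phi::funcs counterparts and pass dev_ctx instead of the execution context; the padding logic itself is unchanged: when the explicit paddings are not symmetric, the input is pre-padded before cuDNN sees it. As a rough standalone sketch of the symmetry check being relied on (assumed semantics only — the real phi::funcs::IsSymmetricPadding in paddle/phi/kernels/funcs/padding.h may differ in detail):

#include <cassert>
#include <vector>

// Paddings are stored as {before_0, after_0, before_1, after_1, ...}.
// They are "symmetric" when before_i == after_i for every spatial dim,
// in which case cuDNN can take them directly and no explicit pre-pad
// of the input tensor is needed.
static bool IsSymmetricPaddingSketch(const std::vector<int>& pads,
                                     int data_dim) {
  assert(static_cast<int>(pads.size()) == 2 * data_dim);
  for (int i = 0; i < data_dim; ++i) {
    if (pads[2 * i] != pads[2 * i + 1]) return false;
  }
  return true;
}

int main() {
  assert(IsSymmetricPaddingSketch({1, 1, 2, 2}, 2));   // symmetric
  assert(!IsSymmetricPaddingSketch({1, 0, 2, 2}, 2));  // asymmetric
  return 0;
}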
...@@ -184,15 +184,15 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, ...@@ -184,15 +184,15 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
bool is_fix_seed, int seed_val, const Tensor& x, bool is_fix_seed, int seed_val, const Tensor& x,
const Tensor* seed, Tensor* mask, Tensor* y) { const Tensor* seed, Tensor* mask, Tensor* y) {
auto& place = *dev_ctx.eigen_device(); auto& place = *dev_ctx.eigen_device();
if (!is_test) {
int64_t x_numel = x.numel(); int64_t x_numel = x.numel();
auto stream = dev_ctx.stream(); auto stream = dev_ctx.stream();
auto* x_data = x.data<T>();
auto* y_data = y->data<T>();
if (!is_test) {
auto* mask_data = mask->data<uint8_t>(); auto* mask_data = mask->data<uint8_t>();
size_t size = phi::product(mask->dims()); size_t size = phi::product(mask->dims());
auto* x_data = x.data<T>();
auto* y_data = y->data<T>();
if (dropout_prob == 1.0f) { if (dropout_prob == 1.0f) {
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
...@@ -254,12 +254,24 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, ...@@ -254,12 +254,24 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
} }
#endif #endif
} else { } else {
auto X = EigenMatrix<T>::Reshape(x, 1);
auto Y = EigenMatrix<T>::Reshape(*y, 1);
if (upscale_in_train) { if (upscale_in_train) {
Y.device(place) = X; // todo: can y share data with x directly?
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
hipMemcpyAsync(y_data, x_data, sizeof(T) * x_numel,
hipMemcpyDeviceToDevice, stream));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemcpyAsync(y_data, x_data, sizeof(T) * x_numel,
cudaMemcpyDeviceToDevice, stream));
#endif
} else { } else {
Y.device(place) = X * static_cast<T>(1.0f - dropout_prob); T factor = static_cast<T>(1.0f - dropout_prob);
std::vector<const framework::Tensor*> ins = {&x};
std::vector<framework::Tensor*> outs = {y};
auto functor = phi::funcs::ScaleFunctor<T>(factor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
&outs, functor);
} }
} }
} }
......
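This dropout change replaces the Eigen assignment Y = X with an explicit device-to-device memcpy for the is_test + upscale_in_train path, and the Eigen scale with a ScaleFunctor launched through LaunchSameDimsElementwiseCudaKernel. The inference-time semantics are unchanged; a minimal CPU-side sketch of just that math (not the CUDA path, names below are illustrative):

#include <cassert>
#include <vector>

// Inference-time dropout:
//  * upscale_in_train == true  -> y = x          (training already rescaled)
//  * upscale_in_train == false -> y = x * (1-p)  (downscale at inference)
std::vector<float> DropoutInfer(const std::vector<float>& x, float p,
                                bool upscale_in_train) {
  std::vector<float> y(x.size());
  const float factor = upscale_in_train ? 1.0f : (1.0f - p);
  for (size_t i = 0; i < x.size(); ++i) y[i] = x[i] * factor;
  return y;
}

int main() {
  std::vector<float> x = {2.f, 4.f};
  auto a = DropoutInfer(x, 0.5f, true);   // {2, 4}: plain copy, like the memcpy
  auto b = DropoutInfer(x, 0.5f, false);  // {1, 2}: scaled by (1 - p)
  assert(a[0] == 2.f && b[0] == 1.f);
  return 0;
}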
...@@ -17,8 +17,8 @@ limitations under the License. */ ...@@ -17,8 +17,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/operators/math/padding.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/kernels/funcs/padding.h"
DECLARE_int64(cudnn_exhaustive_search_times); DECLARE_int64(cudnn_exhaustive_search_times);
...@@ -86,7 +86,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -86,7 +86,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
in_data_dims, strides, ksize); in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_input; Tensor transformed_input;
std::vector<int> padding_common(data_dim, 0); std::vector<int> padding_common(data_dim, 0);
...@@ -118,13 +118,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -118,13 +118,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
T pad_value(0.0); T pad_value(0.0);
switch (rank) { switch (rank) {
case 4: { case 4: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_input_channel, pad_value, dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input); &transformed_input);
} break; } break;
case 5: { case 5: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, transformed_input_channel, pad_value, dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input); &transformed_input);
} break; } break;
default: default:
......
...@@ -32,7 +32,7 @@ namespace platform = paddle::platform; ...@@ -32,7 +32,7 @@ namespace platform = paddle::platform;
namespace op = paddle::operators; namespace op = paddle::operators;
using Tensor = paddle::framework::Tensor; using Tensor = paddle::framework::Tensor;
USE_OP(batch_norm); USE_OP_ITSELF(batch_norm);
USE_CUDA_ONLY_OP(fused_bn_add_activation); USE_CUDA_ONLY_OP(fused_bn_add_activation);
USE_CUDA_ONLY_OP(fused_bn_add_activation_grad); USE_CUDA_ONLY_OP(fused_bn_add_activation_grad);
......
...@@ -45,6 +45,8 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> { ...@@ -45,6 +45,8 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
axis = static_cast<int>(cpu_axis.data<int32_t>()[0]); axis = static_cast<int>(cpu_axis.data<int32_t>()[0]);
} else if (axis_type == framework::proto::VarType::INT64) { } else if (axis_type == framework::proto::VarType::INT64) {
axis = static_cast<int>(cpu_axis.data<int64_t>()[0]); axis = static_cast<int>(cpu_axis.data<int64_t>()[0]);
} else if (axis_type == framework::proto::VarType::INT16) {
axis = static_cast<int>(cpu_axis.data<int16_t>()[0]);
} }
} }
const auto &place = ctx.GetPlace(); const auto &place = ctx.GetPlace();
...@@ -57,6 +59,9 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> { ...@@ -57,6 +59,9 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
} else if (index_type == framework::proto::VarType::INT64) { } else if (index_type == framework::proto::VarType::INT64) {
phi::funcs::GatherV2CUDAFunction<T, int64_t>(x, index, axis, output, phi::funcs::GatherV2CUDAFunction<T, int64_t>(x, index, axis, output,
dev_ctx); dev_ctx);
} else if (index_type == framework::proto::VarType::INT16) {
phi::funcs::GatherV2CUDAFunction<T, int16_t>(x, index, axis, output,
dev_ctx);
} }
return; return;
} }
...@@ -67,6 +72,8 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> { ...@@ -67,6 +72,8 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
phi::funcs::GPUGather<T, int>(dev_ctx, *x, *index, output); phi::funcs::GPUGather<T, int>(dev_ctx, *x, *index, output);
} else if (index_type == framework::proto::VarType::INT64) { } else if (index_type == framework::proto::VarType::INT64) {
phi::funcs::GPUGather<T, int64_t>(dev_ctx, *x, *index, output); phi::funcs::GPUGather<T, int64_t>(dev_ctx, *x, *index, output);
} else if (index_type == framework::proto::VarType::INT16) {
phi::funcs::GPUGather<T, int16_t>(dev_ctx, *x, *index, output);
} }
} }
}; };
...@@ -134,6 +141,7 @@ REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>, ...@@ -134,6 +141,7 @@ REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>,
ops::GatherOpCUDAKernel<double>, ops::GatherOpCUDAKernel<double>,
ops::GatherOpCUDAKernel<int64_t>, ops::GatherOpCUDAKernel<int64_t>,
ops::GatherOpCUDAKernel<int>, ops::GatherOpCUDAKernel<int>,
ops::GatherOpCUDAKernel<int16_t>,
ops::GatherOpCUDAKernel<plat::float16>, ops::GatherOpCUDAKernel<plat::float16>,
ops::GatherOpCUDAKernel<plat::bfloat16>); ops::GatherOpCUDAKernel<plat::bfloat16>);
REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>, REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>,
......
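The gather hunks add an INT16 branch for both the axis tensor and the index tensor, plus an int16_t kernel registration. A standalone sketch of the flat gather that the index-type dispatch ultimately drives (assumption: 1-D case only; the real GPUGather / GatherV2CUDAFunction handle arbitrary ranks on the GPU):

#include <cassert>
#include <cstdint>
#include <vector>

// Gather elements of `src` at positions given by `index`.
// Templating on IndexT is what the operator's dispatch on
// INT16/INT32/INT64 boils down to.
template <typename T, typename IndexT>
std::vector<T> Gather(const std::vector<T>& src,
                      const std::vector<IndexT>& index) {
  std::vector<T> out(index.size());
  for (size_t i = 0; i < index.size(); ++i) {
    out[i] = src[static_cast<size_t>(index[i])];
  }
  return out;
}

int main() {
  std::vector<float> src = {10.f, 20.f, 30.f};
  std::vector<int16_t> idx = {2, 0};  // int16 indices now supported
  auto out = Gather(src, idx);
  assert(out[0] == 30.f && out[1] == 10.f);
  return 0;
}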
...@@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -21,20 +24,6 @@ class GatherTreeOp : public framework::OperatorWithKernel { ...@@ -21,20 +24,6 @@ class GatherTreeOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Ids"), "Input", "Ids", "GatherTree");
OP_INOUT_CHECK(ctx->HasInput("Parents"), "Input", "Parents", "GatherTree");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GatherTree");
auto ids_dims = ctx->GetInputDim("Ids");
auto parents_dims = ctx->GetInputDim("Parents");
PADDLE_ENFORCE_EQ(ids_dims == parents_dims, true,
platform::errors::InvalidArgument(
"The shape of Input(Parents) must be same with the "
"shape of Input(Ids)."));
ctx->SetOutputDim("Out", ids_dims);
}
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
...@@ -72,4 +61,8 @@ selected ids. ...@@ -72,4 +61,8 @@ selected ids.
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(gather_tree, ops::GatherTreeOp, ops::GatherTreeOpMaker); DELCARE_INFER_SHAPE_FUNCTOR(gather_tree, GatherTreeInferShapeFunctor,
PT_INFER_META(phi::GatherTreeMeta));
REGISTER_OPERATOR(gather_tree, ops::GatherTreeOp, ops::GatherTreeOpMaker,
GatherTreeInferShapeFunctor);
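The hand-written InferShape above is deleted and replaced by an infer-shape functor registered with the operator, so the shape check now lives in phi::GatherTreeMeta. A shape-only sketch of the logic that moves (assumption: it reproduces just the removed dimension check, not the phi InferMeta API):

#include <cassert>
#include <cstdint>
#include <stdexcept>
#include <vector>

// The removed InferShape enforced shape(Parents) == shape(Ids),
// and Out takes the shape of Ids.
std::vector<int64_t> GatherTreeInferShapeSketch(
    const std::vector<int64_t>& ids_dims,
    const std::vector<int64_t>& parents_dims) {
  if (ids_dims != parents_dims) {
    throw std::invalid_argument(
        "The shape of Input(Parents) must be same with the shape of "
        "Input(Ids).");
  }
  return ids_dims;  // shape of Out
}

int main() {
  auto out = GatherTreeInferShapeSketch({5, 2, 3}, {5, 2, 3});
  assert(out.size() == 3 && out[0] == 5);
  return 0;
}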
...@@ -26,27 +26,6 @@ namespace paddle { ...@@ -26,27 +26,6 @@ namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
template <typename T>
class CPUGaussianRandomKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
float mean = context.Attr<float>("mean");
float std = context.Attr<float>("std");
auto* tensor = context.Output<framework::Tensor>("Out");
std::normal_distribution<T> dist(mean, std);
auto shape = GetShape(context);
tensor->Resize(shape);
int64_t size = tensor->numel();
T* data = tensor->mutable_data<T>(context.GetPlace());
unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
auto engine = framework::GetCPURandomEngine(seed);
for (int64_t i = 0; i < size; ++i) {
data[i] = dist(*engine);
}
}
}; // namespace operators
template <typename T> template <typename T>
class CPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> { class CPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
...@@ -194,8 +173,6 @@ Used to initialize tensors with gaussian random generator. ...@@ -194,8 +173,6 @@ Used to initialize tensors with gaussian random generator.
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp, REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp,
ops::GaussianRandomOpMaker); ops::GaussianRandomOpMaker);
REGISTER_OP_CPU_KERNEL(gaussian_random, ops::CPUGaussianRandomKernel<float>,
ops::CPUGaussianRandomKernel<double>);
REGISTER_OP_CPU_KERNEL(gaussian_random_batch_size_like, REGISTER_OP_CPU_KERNEL(gaussian_random_batch_size_like,
ops::CPUGaussianRandomBatchSizeLikeKernel<float>, ops::CPUGaussianRandomBatchSizeLikeKernel<float>,
ops::CPUGaussianRandomBatchSizeLikeKernel<double>); ops::CPUGaussianRandomBatchSizeLikeKernel<double>);
......
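The CPU gaussian_random kernel and its registration are dropped here because the kernel has moved to phi; only the batch_size_like variant stays in fluid. For reference, the removed kernel's core was a seeded std::normal_distribution fill, roughly as in this standalone sketch (GetCPURandomEngine and the attribute plumbing are Paddle-specific and omitted):

#include <cassert>
#include <random>
#include <vector>

// Fill `n` values with N(mean, stddev) samples from a seeded engine,
// mirroring what the deleted CPUGaussianRandomKernel did per element.
std::vector<float> GaussianFill(size_t n, float mean, float stddev,
                                unsigned int seed) {
  std::mt19937_64 engine(seed);
  std::normal_distribution<float> dist(mean, stddev);
  std::vector<float> data(n);
  for (auto& v : data) v = dist(engine);
  return data;
}

int main() {
  auto a = GaussianFill(4, 0.f, 1.f, 42);
  auto b = GaussianFill(4, 0.f, 1.f, 42);
  assert(a == b);  // same seed -> same samples
  return 0;
}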
...@@ -52,53 +52,6 @@ struct GaussianGenerator { ...@@ -52,53 +52,6 @@ struct GaussianGenerator {
} }
}; };
template <typename T>
class GPUGaussianRandomKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* tensor = context.Output<framework::Tensor>("Out");
unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
bool seed_flag = false;
if (seed == 0) {
std::random_device rd;
seed = rd();
seed_flag = true;
}
T mean = static_cast<T>(context.Attr<float>("mean"));
T std = static_cast<T>(context.Attr<float>("std"));
auto shape = GetShape(context);
tensor->Resize(shape);
auto& dev_cxt =
context.template device_context<platform::CUDADeviceContext>();
T* data = tensor->mutable_data<T>(dev_cxt.GetPlace());
int64_t size = tensor->numel();
int device_id = context.GetPlace().GetDeviceId();
auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
if (gen_cuda->GetIsInitPy() && seed_flag) {
if (FLAGS_use_curand) {
using MT = typename details::MPTypeTrait<T>::Type;
distribution::normal_distribution<MT> dist;
distribution::normal_transform<MT> trans(mean, std);
distribution::distribution_and_transform<T>(dev_cxt, tensor, dist,
trans);
} else {
auto seed_offset = gen_cuda->IncrementOffset(1);
int64_t gen_offset = size * seed_offset.second;
auto func =
GaussianGenerator<T>(mean, std, seed_offset.first, gen_offset);
IndexKernel<T, GaussianGenerator<T>>(dev_cxt, tensor, func);
}
} else {
auto func = GaussianGenerator<T>(mean, std, seed);
IndexKernel<T, GaussianGenerator<T>>(dev_cxt, tensor, func);
}
}
};
template <typename T> template <typename T>
class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> { class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
public: public:
...@@ -136,11 +89,6 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> { ...@@ -136,11 +89,6 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OP_CUDA_KERNEL(
gaussian_random,
paddle::operators::GPUGaussianRandomKernel<paddle::platform::float16>,
paddle::operators::GPUGaussianRandomKernel<float>,
paddle::operators::GPUGaussianRandomKernel<double>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
gaussian_random_batch_size_like, gaussian_random_batch_size_like,
paddle::operators::GPUGaussianRandomBatchSizeLikeKernel< paddle::operators::GPUGaussianRandomBatchSizeLikeKernel<
......
...@@ -17,6 +17,8 @@ ...@@ -17,6 +17,8 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/phi/kernels/batch_norm_grad_kernel.h"
#include "paddle/phi/kernels/batch_norm_kernel.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -202,8 +204,7 @@ class InplaceABNOpGradMaker : public framework::SingleGradOpMaker<T> { ...@@ -202,8 +204,7 @@ class InplaceABNOpGradMaker : public framework::SingleGradOpMaker<T> {
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class InplaceABNKernel class InplaceABNKernel : public framework::OpKernel<T> {
: public paddle::operators::BatchNormKernel<DeviceContext, T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<Tensor>("X"); auto* x = ctx.Input<Tensor>("X");
...@@ -213,7 +214,33 @@ class InplaceABNKernel ...@@ -213,7 +214,33 @@ class InplaceABNKernel
auto activation = auto activation =
GetInplaceABNActivationType(ctx.Attr<std::string>("activation")); GetInplaceABNActivationType(ctx.Attr<std::string>("activation"));
auto& place = *ctx.template device_context<DeviceContext>().eigen_device(); auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
BatchNormKernel<DeviceContext, T>::Compute(ctx);
auto* scale = ctx.Input<Tensor>("Scale");
auto* bias = ctx.Input<Tensor>("Bias");
auto* mean = ctx.Input<Tensor>("Mean");
auto* variance = ctx.Input<Tensor>("Variance");
auto momentum = ctx.Attr<float>("momentum");
auto epsilon = ctx.Attr<float>("epsilon");
auto data_layout = ctx.Attr<std::string>("data_layout");
auto is_test = ctx.Attr<bool>("is_test");
auto use_global_stats = ctx.Attr<bool>("use_global_stats");
auto trainable_statistics = ctx.Attr<bool>("trainable_statistics");
auto fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
auto* mean_out = ctx.Output<Tensor>("MeanOut");
auto* variance_out = ctx.Output<Tensor>("VarianceOut");
auto* saved_mean = ctx.Output<Tensor>("SavedMean");
auto* saved_variance = ctx.Output<Tensor>("SavedVariance");
auto* reserve_space = ctx.Output<Tensor>("ReserveSpace");
auto& dev_ctx = ctx.device_context<DeviceContext>();
phi::BatchNormKernel<T>(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*x, *scale, *bias, *mean, *variance, momentum, epsilon, data_layout,
is_test, use_global_stats, trainable_statistics, fuse_with_relu, y,
mean_out, variance_out, saved_mean, saved_variance, reserve_space);
auto cur_y = EigenVector<T>::Flatten(*y); auto cur_y = EigenVector<T>::Flatten(*y);
InplaceABNActivation<DeviceContext, T> functor; InplaceABNActivation<DeviceContext, T> functor;
...@@ -222,8 +249,7 @@ class InplaceABNKernel ...@@ -222,8 +249,7 @@ class InplaceABNKernel
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class InplaceABNGradKernel class InplaceABNGradKernel : public framework::OpKernel<T> {
: public paddle::operators::BatchNormGradKernel<DeviceContext, T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto* y = ctx.Input<Tensor>("Y"); auto* y = ctx.Input<Tensor>("Y");
...@@ -244,7 +270,52 @@ class InplaceABNGradKernel ...@@ -244,7 +270,52 @@ class InplaceABNGradKernel
InplaceABNActivation<DeviceContext, T> functor; InplaceABNActivation<DeviceContext, T> functor;
functor.GradCompute(ctx, activation, place, cur_y, cur_y, cur_dy, cur_dy); functor.GradCompute(ctx, activation, place, cur_y, cur_y, cur_dy, cur_dy);
BatchNormGradKernel<DeviceContext, T>::Compute(ctx); // BatchNormGradKernel<DeviceContext, T>::Compute(ctx);
auto* scale = ctx.Input<Tensor>("Scale");
auto* bias = ctx.Input<Tensor>("Bias");
auto* saved_mean = ctx.Input<Tensor>("SavedMean");
auto* saved_variance = ctx.Input<Tensor>("SavedVariance");
auto momentum = ctx.Attr<float>("momentum");
auto epsilon = ctx.Attr<float>("epsilon");
auto data_layout = ctx.Attr<std::string>("data_layout");
auto is_test = ctx.Attr<bool>("is_test");
auto use_global_stats = ctx.Attr<bool>("use_global_stats");
auto trainable_statistics = ctx.Attr<bool>("trainable_statistics");
auto fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
auto* scale_grad = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto* bias_grad = ctx.Output<Tensor>(framework::GradVarName("Bias"));
auto* reserve_space = ctx.Input<Tensor>("ReserveSpace");
auto* mean = ctx.Input<Tensor>("ReserveSpace");
auto* variance = ctx.Input<Tensor>("ReserveSpace");
paddle::optional<const Tensor&> space_opt = paddle::none;
paddle::optional<const Tensor&> mean_opt = paddle::none;
paddle::optional<const Tensor&> variance_opt = paddle::none;
if (reserve_space != nullptr) {
space_opt = *reserve_space;
}
if (mean != nullptr) {
mean_opt = *mean;
}
if (variance != nullptr) {
variance_opt = *variance;
}
auto& dev_ctx = ctx.device_context<DeviceContext>();
phi::BatchNormGradRawKernel<T>(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*d_y, *y, *scale, *bias, *saved_mean, *saved_variance, space_opt,
mean_opt, variance_opt, momentum, epsilon, data_layout, is_test,
use_global_stats, trainable_statistics, fuse_with_relu, true, d_x,
scale_grad, bias_grad);
} }
}; };
......
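InplaceABNKernel and InplaceABNGradKernel no longer inherit the fluid BatchNorm kernels; they gather the inputs and attributes themselves and call phi::BatchNormKernel / phi::BatchNormGradRawKernel directly, then apply the activation on the same buffer. A loose standalone sketch of the in-place forward composition only (assumption: single channel, leaky-relu as the activation; the operator supports several activations and full NCHW/NHWC layouts):

#include <cassert>
#include <cmath>
#include <vector>

// In-place activated batch norm, forward only, for one channel:
// the normalized values are written back into the input buffer and the
// activation is applied to the same buffer, which is the memory-saving
// idea behind inplace_abn.
void InplaceABNForward(std::vector<float>* x, float epsilon,
                       float leaky_slope) {
  // 1) batch-norm style normalization over the vector.
  float mean = 0.f, var = 0.f;
  for (float v : *x) mean += v;
  mean /= x->size();
  for (float v : *x) var += (v - mean) * (v - mean);
  var /= x->size();
  const float inv_std = 1.f / std::sqrt(var + epsilon);
  // 2) write normalization and activation into the same buffer.
  for (float& v : *x) {
    v = (v - mean) * inv_std;           // normalize in place
    v = v > 0.f ? v : leaky_slope * v;  // leaky-relu in place
  }
}

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f, 4.f};
  InplaceABNForward(&x, 1e-5f, 0.01f);
  assert(x.size() == 4);
  return 0;
}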
...@@ -15,14 +15,15 @@ limitations under the License. */ ...@@ -15,14 +15,15 @@ limitations under the License. */
#include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/operators/inplace_abn_op.h" #include "paddle/fluid/operators/inplace_abn_op.h"
#include "paddle/fluid/operators/sync_batch_norm_op.cu.h" #include "paddle/fluid/operators/sync_batch_norm_op.cu.h"
#include "paddle/phi/kernels/batch_norm_grad_kernel.h"
#include "paddle/phi/kernels/batch_norm_kernel.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class InplaceABNKernel class InplaceABNKernel
: public paddle::operators::SyncBatchNormKernel<DeviceContext, T>, : public paddle::operators::SyncBatchNormKernel<DeviceContext, T> {
public paddle::operators::BatchNormKernel<DeviceContext, T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto* y = ctx.Output<Tensor>("Y"); auto* y = ctx.Output<Tensor>("Y");
...@@ -36,7 +37,33 @@ class InplaceABNKernel ...@@ -36,7 +37,33 @@ class InplaceABNKernel
if (ctx.Attr<bool>("use_sync_bn")) { if (ctx.Attr<bool>("use_sync_bn")) {
SyncBatchNormKernel<DeviceContext, T>::Compute(ctx); SyncBatchNormKernel<DeviceContext, T>::Compute(ctx);
} else { } else {
BatchNormKernel<DeviceContext, T>::Compute(ctx); // BatchNormKernel<DeviceContext, T>::Compute(ctx);
auto* scale = ctx.Input<Tensor>("Scale");
auto* bias = ctx.Input<Tensor>("Bias");
auto* mean = ctx.Input<Tensor>("Mean");
auto* variance = ctx.Input<Tensor>("Variance");
auto momentum = ctx.Attr<float>("momentum");
auto epsilon = ctx.Attr<float>("epsilon");
auto data_layout = ctx.Attr<std::string>("data_layout");
auto is_test = ctx.Attr<bool>("is_test");
auto use_global_stats = ctx.Attr<bool>("use_global_stats");
auto trainable_statistics = ctx.Attr<bool>("trainable_statistics");
auto fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
auto* mean_out = ctx.Output<Tensor>("MeanOut");
auto* variance_out = ctx.Output<Tensor>("VarianceOut");
auto* saved_mean = ctx.Output<Tensor>("SavedMean");
auto* saved_variance = ctx.Output<Tensor>("SavedVariance");
auto* reserve_space = ctx.Output<Tensor>("ReserveSpace");
auto& dev_ctx = ctx.device_context<DeviceContext>();
phi::BatchNormKernel<T>(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*x, *scale, *bias, *mean, *variance, momentum, epsilon, data_layout,
is_test, use_global_stats, trainable_statistics, fuse_with_relu, y,
mean_out, variance_out, saved_mean, saved_variance, reserve_space);
} }
auto cur_y = EigenVector<T>::Flatten(*y); auto cur_y = EigenVector<T>::Flatten(*y);
...@@ -49,8 +76,7 @@ class InplaceABNKernel ...@@ -49,8 +76,7 @@ class InplaceABNKernel
// https://kevinzakka.github.io/2016/09/14/batch_normalization/ // https://kevinzakka.github.io/2016/09/14/batch_normalization/
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class InplaceABNGradKernel class InplaceABNGradKernel
: public paddle::operators::SyncBatchNormGradKernel<DeviceContext, T>, : public paddle::operators::SyncBatchNormGradKernel<DeviceContext, T> {
public paddle::operators::BatchNormGradKernel<DeviceContext, T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const auto* y = ctx.Input<Tensor>("Y"); const auto* y = ctx.Input<Tensor>("Y");
...@@ -74,7 +100,50 @@ class InplaceABNGradKernel ...@@ -74,7 +100,50 @@ class InplaceABNGradKernel
if (ctx.Attr<bool>("use_sync_bn")) { if (ctx.Attr<bool>("use_sync_bn")) {
SyncBatchNormGradKernel<DeviceContext, T>::Compute(ctx); SyncBatchNormGradKernel<DeviceContext, T>::Compute(ctx);
} else { } else {
BatchNormGradKernel<DeviceContext, T>::Compute(ctx); auto* scale = ctx.Input<Tensor>("Scale");
auto* bias = ctx.Input<Tensor>("Bias");
auto* saved_mean = ctx.Input<Tensor>("SavedMean");
auto* saved_variance = ctx.Input<Tensor>("SavedVariance");
auto momentum = ctx.Attr<float>("momentum");
auto epsilon = ctx.Attr<float>("epsilon");
auto data_layout = ctx.Attr<std::string>("data_layout");
auto is_test = ctx.Attr<bool>("is_test");
auto use_global_stats = ctx.Attr<bool>("use_global_stats");
auto trainable_statistics = ctx.Attr<bool>("trainable_statistics");
auto fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
auto* scale_grad = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto* bias_grad = ctx.Output<Tensor>(framework::GradVarName("Bias"));
auto* reserve_space = ctx.Input<Tensor>("ReserveSpace");
auto* mean = ctx.Input<Tensor>("ReserveSpace");
auto* variance = ctx.Input<Tensor>("ReserveSpace");
paddle::optional<const Tensor&> space_opt = paddle::none;
paddle::optional<const Tensor&> mean_opt = paddle::none;
paddle::optional<const Tensor&> variance_opt = paddle::none;
if (reserve_space != nullptr) {
space_opt = *reserve_space;
}
if (mean != nullptr) {
mean_opt = *mean;
}
if (variance != nullptr) {
variance_opt = *variance;
}
auto& dev_ctx = ctx.device_context<DeviceContext>();
phi::BatchNormGradRawKernel<T>(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*d_y, *y, *scale, *bias, *saved_mean, *saved_variance, space_opt,
mean_opt, variance_opt, momentum, epsilon, data_layout, is_test,
use_global_stats, trainable_statistics, fuse_with_relu, true, d_x,
scale_grad, bias_grad);
} }
} }
}; };
......
...@@ -389,11 +389,12 @@ __global__ void DoubleGradComputeDDYWithGlobal( ...@@ -389,11 +389,12 @@ __global__ void DoubleGradComputeDDYWithGlobal(
} }
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, void NormDoubleGradFunctor(const DeviceContext &ctx,
const DataLayout data_layout, const Tensor *X, const DataLayout data_layout, const Tensor *X,
const Tensor *Scale, const Tensor *dY, const Tensor *Scale, const Tensor *dY,
const Tensor *Saved_mean, const Tensor *Saved_mean,
const Tensor *Saved_variance, const double epsilon, const Tensor *Saved_variance, const Tensor *Mean,
const Tensor *Variance, const double epsilon,
const bool use_global_stats, const Tensor *ddX, const bool use_global_stats, const Tensor *ddX,
const Tensor *ddScale, const Tensor *ddBias, const Tensor *ddScale, const Tensor *ddBias,
Tensor *dX, Tensor *dScale, Tensor *ddY) { Tensor *dX, Tensor *dScale, Tensor *ddY) {
...@@ -404,8 +405,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, ...@@ -404,8 +405,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data<T>()); const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data<T>());
const T *ddbias_data = (ddBias == nullptr ? nullptr : ddBias->data<T>()); const T *ddbias_data = (ddBias == nullptr ? nullptr : ddBias->data<T>());
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); phi::funcs::SetConstant<DeviceContext, T> set_constant;
phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_constant;
auto &x_dims = X->dims(); auto &x_dims = X->dims();
const int C = (data_layout == DataLayout::kNCHW ? x_dims[1] const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
...@@ -416,7 +416,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, ...@@ -416,7 +416,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
Tensor scale_tmp; Tensor scale_tmp;
if (!Scale) { if (!Scale) {
scale_tmp.mutable_data<T>({C}, ctx.GetPlace()); scale_tmp.mutable_data<T>({C}, ctx.GetPlace());
set_constant(dev_ctx, &scale_tmp, static_cast<T>(1)); set_constant(ctx, &scale_tmp, static_cast<T>(1));
} }
const T *scale_data = Scale ? Scale->data<T>() : scale_tmp.data<T>(); const T *scale_data = Scale ? Scale->data<T>() : scale_tmp.data<T>();
#ifdef __HIPCC__ #ifdef __HIPCC__
...@@ -424,15 +424,15 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, ...@@ -424,15 +424,15 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
#else #else
const int block = 512; const int block = 512;
#endif #endif
int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); int max_threads = ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1); const int max_blocks = std::max(max_threads / block, 1);
int grid = std::min(C, max_blocks); int grid = std::min(C, max_blocks);
int grid1 = (num + block - 1) / block; int grid1 = (num + block - 1) / block;
const T *mean_data, *variance_data; const T *mean_data, *variance_data;
if (use_global_stats) { if (use_global_stats) {
const auto *running_mean = ctx.Input<Tensor>("Mean"); const auto *running_mean = Mean;
const auto *running_var = ctx.Input<Tensor>("Variance"); const auto *running_var = Variance;
const auto *running_mean_data = running_mean->template data<T>(); const auto *running_mean_data = running_mean->template data<T>();
const auto *running_var_data = running_var->template data<T>(); const auto *running_var_data = running_var->template data<T>();
mean_data = running_mean_data; mean_data = running_mean_data;
...@@ -440,34 +440,35 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, ...@@ -440,34 +440,35 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
} else { } else {
const T *smean_data = Saved_mean->data<T>(); const T *smean_data = Saved_mean->data<T>();
const T *svariance_data = Saved_variance->data<T>(); const T *svariance_data = Saved_variance->data<T>();
mean_data = smean_data; mean_data = smean_data;
variance_data = svariance_data; variance_data = svariance_data;
} }
if (dX) { if (dX) {
T *dx_data = dX->mutable_data<T>(ctx.GetPlace()); T *dx_data = dX->mutable_data<T>(ctx.GetPlace());
set_constant(dev_ctx, dX, static_cast<T>(0)); set_constant(ctx, dX, static_cast<T>(0));
if (use_global_stats) { if (use_global_stats) {
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
DoubleGradComputeDXWithGlobal< DoubleGradComputeDXWithGlobal<
T, DataLayout::kNHWC><<<grid1, block, 0, dev_ctx.stream()>>>( T, DataLayout::kNHWC><<<grid1, block, 0, ctx.stream()>>>(
dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num,
dx_data); dx_data);
} else { } else {
DoubleGradComputeDXWithGlobal< DoubleGradComputeDXWithGlobal<
T, DataLayout::kNCHW><<<grid1, block, 0, dev_ctx.stream()>>>( T, DataLayout::kNCHW><<<grid1, block, 0, ctx.stream()>>>(
dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num,
dx_data); dx_data);
} }
} else { } else {
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
DoubleGradComputeDX< DoubleGradComputeDX<
T, block, DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>( T, block, DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, x_data, mean_data, variance_data, ddx_data, dy_data, scale_data,
ddscale_data, N, C, sample_size, epsilon, dx_data); ddscale_data, N, C, sample_size, epsilon, dx_data);
} else { } else {
DoubleGradComputeDX< DoubleGradComputeDX<
T, block, DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>( T, block, DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, x_data, mean_data, variance_data, ddx_data, dy_data, scale_data,
ddscale_data, N, C, sample_size, epsilon, dx_data); ddscale_data, N, C, sample_size, epsilon, dx_data);
} }
...@@ -475,28 +476,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, ...@@ -475,28 +476,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
} }
if (dScale) { if (dScale) {
T *dscale_data = dScale->mutable_data<T>(ctx.GetPlace()); T *dscale_data = dScale->mutable_data<T>(ctx.GetPlace());
set_constant(dev_ctx, dScale, static_cast<T>(0)); set_constant(ctx, dScale, static_cast<T>(0));
if (use_global_stats) { if (use_global_stats) {
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
DoubleGradComputeDScaleWithGlobal< DoubleGradComputeDScaleWithGlobal<
T, block, DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>( T, block, DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
ddx_data, variance_data, dy_data, epsilon, N, C, sample_size, ddx_data, variance_data, dy_data, epsilon, N, C, sample_size,
dscale_data); dscale_data);
} else { } else {
DoubleGradComputeDScaleWithGlobal< DoubleGradComputeDScaleWithGlobal<
T, block, DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>( T, block, DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
ddx_data, variance_data, dy_data, epsilon, N, C, sample_size, ddx_data, variance_data, dy_data, epsilon, N, C, sample_size,
dscale_data); dscale_data);
} }
} else { } else {
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
DoubleGradComputeDScale< DoubleGradComputeDScale<
T, block, DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>( T, block, DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
x_data, mean_data, variance_data, ddx_data, dy_data, N, C, x_data, mean_data, variance_data, ddx_data, dy_data, N, C,
sample_size, epsilon, dscale_data); sample_size, epsilon, dscale_data);
} else { } else {
DoubleGradComputeDScale< DoubleGradComputeDScale<
T, block, DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>( T, block, DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
x_data, mean_data, variance_data, ddx_data, dy_data, N, C, x_data, mean_data, variance_data, ddx_data, dy_data, N, C,
sample_size, epsilon, dscale_data); sample_size, epsilon, dscale_data);
} }
...@@ -504,28 +505,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, ...@@ -504,28 +505,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
} }
if (ddY) { if (ddY) {
T *ddy_data = ddY->mutable_data<T>(ctx.GetPlace()); T *ddy_data = ddY->mutable_data<T>(ctx.GetPlace());
set_constant(dev_ctx, ddY, static_cast<T>(0)); set_constant(ctx, ddY, static_cast<T>(0));
if (use_global_stats) { if (use_global_stats) {
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
DoubleGradComputeDDYWithGlobal< DoubleGradComputeDDYWithGlobal<
T, DataLayout::kNHWC><<<grid1, block, 0, dev_ctx.stream()>>>( T, DataLayout::kNHWC><<<grid1, block, 0, ctx.stream()>>>(
ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data, ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data,
ddscale_data, epsilon, C, sample_size, num, ddy_data); ddscale_data, epsilon, C, sample_size, num, ddy_data);
} else { } else {
DoubleGradComputeDDYWithGlobal< DoubleGradComputeDDYWithGlobal<
T, DataLayout::kNCHW><<<grid1, block, 0, dev_ctx.stream()>>>( T, DataLayout::kNCHW><<<grid1, block, 0, ctx.stream()>>>(
ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data, ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data,
ddscale_data, epsilon, C, sample_size, num, ddy_data); ddscale_data, epsilon, C, sample_size, num, ddy_data);
} }
} else { } else {
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
DoubleGradComputeDDY< DoubleGradComputeDDY<
T, block, DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>( T, block, DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
x_data, mean_data, variance_data, ddscale_data, ddbias_data, x_data, mean_data, variance_data, ddscale_data, ddbias_data,
ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data); ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data);
} else { } else {
DoubleGradComputeDDY< DoubleGradComputeDDY<
T, block, DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>( T, block, DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
x_data, mean_data, variance_data, ddscale_data, ddbias_data, x_data, mean_data, variance_data, ddscale_data, ddbias_data,
ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data); ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data);
} }
......
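NormDoubleGradFunctor now receives the device context (plus explicit Mean/Variance tensors) instead of the ExecutionContext, so set_constant and every kernel launch use ctx directly; the launch-shape arithmetic is untouched. A quick worked sketch of that arithmetic (standalone numbers, chosen for illustration):

#include <algorithm>
#include <cassert>

// Launch-shape arithmetic used in the double-grad functor:
//   grid  : capped by max_threads / block, at least 1, at most C
//   grid1 : ceiling division so every one of `num` elements gets a thread
int main() {
  const int block = 512;
  const int max_threads = 4096;
  const int C = 32;
  const int num = 1000;

  const int max_blocks = std::max(max_threads / block, 1);
  const int grid = std::min(C, max_blocks);
  const int grid1 = (num + block - 1) / block;

  assert(max_blocks == 8);
  assert(grid == 8);   // limited by max_blocks, not by C
  assert(grid1 == 2);  // ceil(1000 / 512)
  return 0;
}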
...@@ -20,7 +20,7 @@ limitations under the License. */ ...@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/padding.h" #include "paddle/phi/kernels/funcs/padding.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -50,7 +50,8 @@ class PadConstantLikeKernel : public framework::OpKernel<T> { ...@@ -50,7 +50,8 @@ class PadConstantLikeKernel : public framework::OpKernel<T> {
pads[j * 2 + 1] = static_cast<int>(in_x->dims()[j] - in_y->dims()[j]); pads[j * 2 + 1] = static_cast<int>(in_x->dims()[j] - in_y->dims()[j]);
} }
math::PaddingFunctor<DeviceContext, T>(rank, context, pads, pad_value, phi::funcs::PaddingFunctor<DeviceContext, T>(
rank, context.template device_context<DeviceContext>(), pads, pad_value,
*in_y, out); *in_y, out);
} }
}; };
...@@ -82,7 +83,8 @@ class PadConstantLikeGradKernel : public framework::OpKernel<T> { ...@@ -82,7 +83,8 @@ class PadConstantLikeGradKernel : public framework::OpKernel<T> {
pads[j * 2 + 1] = static_cast<int>(in_dout->dims()[j] - in_y->dims()[j]); pads[j * 2 + 1] = static_cast<int>(in_dout->dims()[j] - in_y->dims()[j]);
} }
math::PaddingGradFunctor<DeviceContext, T>(rank, context, pads, *in_dout, phi::funcs::PaddingGradFunctor<DeviceContext, T>(
rank, context.template device_context<DeviceContext>(), pads, *in_dout,
d_y); d_y);
} }
}; };
......
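pad_constant_like now calls phi::funcs::PaddingFunctor / PaddingGradFunctor with the device context pulled out of the execution context; the pad amounts are still the trailing differences between the two shapes. A standalone sketch of that computation in the 1-D case (assumption: back-padding only, as in pads[2*j+1]):

#include <cassert>
#include <vector>

// pad_constant_like pads `y` up to the shape of `x`:
// for every dim j, pads[2*j] = 0 and pads[2*j+1] = x_dim[j] - y_dim[j].
// Shown here for one dimension with a constant pad value.
std::vector<float> PadConstantLike1D(const std::vector<float>& x,
                                     const std::vector<float>& y,
                                     float pad_value) {
  assert(y.size() <= x.size());
  std::vector<float> out(y);
  out.resize(x.size(), pad_value);  // pad only at the back, like pads[2j+1]
  return out;
}

int main() {
  std::vector<float> x(5, 0.f);
  std::vector<float> y = {1.f, 2.f, 3.f};
  auto out = PadConstantLike1D(x, y, 9.f);
  assert(out.size() == 5 && out[3] == 9.f && out[4] == 9.f);
  return 0;
}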
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/pad_op.h"
#include <memory> #include <memory>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/complex.h"
namespace paddle { namespace paddle {
...@@ -167,40 +167,3 @@ REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker, ...@@ -167,40 +167,3 @@ REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker,
REGISTER_OPERATOR(pad_grad, ops::PadOpGrad, REGISTER_OPERATOR(pad_grad, ops::PadOpGrad,
ops::PadOpDoubleGradMaker<paddle::framework::OpDesc>, ops::PadOpDoubleGradMaker<paddle::framework::OpDesc>,
ops::PadOpDoubleGradMaker<paddle::imperative::OpBase>); ops::PadOpDoubleGradMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
pad, ops::PadKernel<paddle::platform::CPUDeviceContext, float>,
ops::PadKernel<paddle::platform::CPUDeviceContext, double>,
ops::PadKernel<paddle::platform::CPUDeviceContext, int>,
ops::PadKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::PadKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::PadKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CPU_KERNEL(
pad_grad, ops::PadGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::PadGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::PadGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::PadGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CUDA_KERNEL(
pad, ops::PadKernel<paddle::platform::CUDADeviceContext, double>,
ops::PadKernel<paddle::platform::CUDADeviceContext, float>,
ops::PadKernel<paddle::platform::CUDADeviceContext, int>,
ops::PadKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::PadKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::PadKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::PadKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CUDA_KERNEL(
pad_grad, ops::PadGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::PadGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::PadGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::PadGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::PadGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/padding.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class PadKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto pads = context.Attr<std::vector<int>>("paddings");
float pad_value = context.Attr<float>("pad_value");
auto* x = context.Input<Tensor>("X");
auto* out = context.Output<Tensor>("Out");
out->mutable_data<T>(context.GetPlace());
int rank = x->dims().size();
math::PaddingFunctor<DeviceContext, T>(rank, context, pads,
static_cast<T>(pad_value), *x, out);
}
};
template <typename DeviceContext, typename T>
class PadGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto pads = context.Attr<std::vector<int>>("paddings");
auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
if (d_x == nullptr) {
return;
}
d_x->mutable_data<T>(context.GetPlace());
int rank = d_out->dims().size();
math::PaddingGradFunctor<DeviceContext, T>(rank, context, pads, *d_out,
d_x);
}
};
} // namespace operators
} // namespace paddle
...@@ -20,9 +20,11 @@ namespace cub = hipcub; ...@@ -20,9 +20,11 @@ namespace cub = hipcub;
#endif #endif
#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/math.h" #include "paddle/fluid/operators/math.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/core/hostdevice.h" #include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -42,71 +44,86 @@ static inline int NumBlocks(const int N) { ...@@ -42,71 +44,86 @@ static inline int NumBlocks(const int N) {
} }
template <typename T> template <typename T>
__global__ void GPUSigmoidForward(const T *x_data, const T *label_data, struct NonzeroFunctor {
const int ignore_index, const int limit, HOSTDEVICE explicit inline NonzeroFunctor() {}
T *out_data, T *counts) { HOSTDEVICE inline T operator()(const T x) const {
CUDA_KERNEL_LOOP(i, limit) { return static_cast<T>(static_cast<double>(x) != 0);
T x = x_data[i]; }
T label = label_data[i]; };
template <typename T>
struct SigmoidFwdFunctor {
T ignore_index_;
T eps = static_cast<T>(1e-5); T eps = static_cast<T>(1e-5);
T diff = label - static_cast<T>(ignore_index);
HOSTDEVICE inline SigmoidFwdFunctor(const T ignore_index)
: ignore_index_(ignore_index) {}
HOSTDEVICE inline phi::Array<T, 2> operator()(const T x, const T label) {
T counts;
T out_data;
T diff = label - static_cast<T>(ignore_index_);
if ((diff > -eps) && (diff < eps)) { if ((diff > -eps) && (diff < eps)) {
out_data[i] = static_cast<T>(0.); out_data = static_cast<T>(0.);
counts[i] = 0; counts = 0;
} else { } else {
T term1 = (x > 0) ? x : 0; T term1 = (x > 0) ? x : 0;
T term2 = x * label; T term2 = x * label;
T term3 = real_log(static_cast<T>(1) + real_exp(static_cast<T>(-abs(x)))); T term3 = real_log(static_cast<T>(1) + real_exp(static_cast<T>(-abs(x))));
out_data[i] = term1 - term2 + term3;
counts[i] = 1;
}
}
}
template <typename T, int BlockDim> out_data = term1 - term2 + term3;
__global__ void Sum(const T *counts, int num, const T eps, T *sum) { counts = 1;
typedef cub::BlockReduce<double, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
T in = 0;
for (int i = threadIdx.x; i < num; i += BlockDim) {
in += counts[i];
}
__syncthreads();
auto out =
BlockReduce(temp_storage).Reduce(static_cast<double>(in), cub::Sum());
__syncthreads();
if (threadIdx.x == 0) {
T a = out > eps ? out : eps;
sum[0] = a;
} }
} phi::Array<T, 2> outs;
template <typename T> outs[0] = out_data;
__global__ void Div(T *loss, const int num, const T *norm) { outs[1] = counts;
CUDA_KERNEL_LOOP(i, num) { loss[i] /= norm[0]; } return outs;
} }
};
template <typename T> template <typename T>
__global__ void GPUSigmoidBackward(const T *x_data, const T *label_data, struct SigmoidBwdFunctor {
const int ignore_index, const T *dout_data, T ignore_index_;
const int limit, T *dx_data, T *counts) {
CUDA_KERNEL_LOOP(i, limit) {
T x = x_data[i];
T label = label_data[i];
T dout = dout_data[i];
T eps = static_cast<T>(1e-5); T eps = static_cast<T>(1e-5);
T diff = label - static_cast<T>(ignore_index);
HOSTDEVICE inline SigmoidBwdFunctor(const T ignore_index)
: ignore_index_(ignore_index) {}
HOSTDEVICE inline phi::Array<T, 2> operator()(const T x, const T label,
const T dout) {
T counts;
T dx_data;
T diff = label - static_cast<T>(ignore_index_);
if ((diff > -eps) && (diff < eps)) { if ((diff > -eps) && (diff < eps)) {
dx_data[i] = static_cast<T>(0.); dx_data = static_cast<T>(0.);
counts[i] = 0; counts = 0;
} else { } else {
T simoid_x = static_cast<T>(1) / (static_cast<T>(1) + real_exp(-x)); T simoid_x = static_cast<T>(1) / (static_cast<T>(1) + real_exp(-x));
T diff = simoid_x - label; T diff = simoid_x - label;
dx_data[i] = dout * diff; dx_data = dout * diff;
counts[i] = 1; counts = 1;
} }
phi::Array<T, 2> outs;
outs[0] = dx_data;
outs[1] = counts;
return outs;
} }
} };
template <typename T>
struct DivFunctor {
const T norm_;
HOSTDEVICE inline DivFunctor(const T norm) : norm_(norm) {}
HOSTDEVICE inline T operator()(T loss) {
loss /= norm_;
return loss;
}
};
// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) // Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X)))
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
...@@ -123,20 +140,48 @@ class GPUSigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> { ...@@ -123,20 +140,48 @@ class GPUSigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
bool normalize = context.Attr<bool>("normalize"); bool normalize = context.Attr<bool>("normalize");
// Temporary memory // Temporary memory
auto cnt_ptr = memory::Alloc(dev_ctx, Labels->numel() * sizeof(T)); Tensor *counts_tensor = new Tensor();
T *counts = reinterpret_cast<T *>(cnt_ptr->ptr()); counts_tensor->mutable_data<T>(context.GetPlace(),
Labels->numel() * sizeof(T));
counts_tensor->Resize(Out->dims());
int limit = Out->numel(); int limit = Out->numel();
int blocks = NumBlocks(limit); int blocks = NumBlocks(limit);
int threads = kNumCUDAThreads; int threads = kNumCUDAThreads;
GPUSigmoidForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>( std::vector<const framework::Tensor *> ins = {X, Labels};
X->data<T>(), Labels->data<T>(), ignore_index, limit, out_data, counts); std::vector<framework::Tensor *> outs = {Out, counts_tensor};
auto functor = SigmoidFwdFunctor<T>(ignore_index);
constexpr int Size = 2;
phi::funcs::ElementwiseKernel<T, decltype(functor), Size>(dev_ctx, ins,
&outs, functor);
if (normalize) { if (normalize) {
auto norm_ptr = memory::Alloc(dev_ctx, sizeof(T)); T *counts = counts_tensor->mutable_data<T>(context.GetPlace());
T *norm = reinterpret_cast<T *>(norm_ptr->ptr()); Tensor *norm_tensor = new Tensor();
Sum<T, kNumCUDAThreads><<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>( norm_tensor->mutable_data<T>(context.GetPlace(), sizeof(T));
counts, limit, static_cast<T>(1e-5), norm); auto dims = phi::vectorize(counts_tensor->dims());
Div<T><<<blocks, threads, 0, dev_ctx.stream()>>>(out_data, limit, norm); std::vector<int> reduce_dim = {};
for (int i = 0; i < dims.size(); i++) {
reduce_dim.push_back(i);
}
TensorReduceImpl<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
context.cuda_device_context(), *counts_tensor, norm_tensor,
NonzeroFunctor<T>(), reduce_dim, dev_ctx.stream());
T *norm = norm_tensor->mutable_data<T>(context.GetPlace());
auto norm_cpu_mem = memory::Alloc(platform::CPUPlace(), sizeof(T));
T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr());
memory::Copy(platform::CPUPlace(), norm_cpu_ptr, dev_ctx.GetPlace(), norm,
sizeof(T), dev_ctx.stream());
auto eps = static_cast<T>(1e-5);
*norm_cpu_ptr = *norm_cpu_ptr > eps ? *norm_cpu_ptr : eps;
std::vector<const framework::Tensor *> div_ins = {Out};
std::vector<framework::Tensor *> div_outs = {Out};
auto div_functor = DivFunctor<T>(*norm_cpu_ptr);
phi::funcs::ElementwiseKernel<T>(dev_ctx, div_ins, &div_outs,
div_functor);
delete norm_tensor;
delete counts_tensor;
} }
} }
}; };
...@@ -157,22 +202,48 @@ class GPUSigmoidCrossEntropyWithLogitsGradKernel ...@@ -157,22 +202,48 @@ class GPUSigmoidCrossEntropyWithLogitsGradKernel
auto &dev_ctx = context.cuda_device_context(); auto &dev_ctx = context.cuda_device_context();
// Temporary memory // Temporary memory
auto cnt_ptr = memory::Alloc(dev_ctx, X->numel() * sizeof(T)); Tensor *counts_tensor = new Tensor();
T *counts = reinterpret_cast<T *>(cnt_ptr->ptr()); counts_tensor->mutable_data<T>(context.GetPlace(),
Labels->numel() * sizeof(T));
counts_tensor->Resize(dX->dims());
int limit = dX->numel(); int limit = dX->numel();
int blocks = NumBlocks(limit); int blocks = NumBlocks(limit);
int threads = kNumCUDAThreads; int threads = kNumCUDAThreads;
GPUSigmoidBackward<T><<<blocks, threads, 0, dev_ctx.stream()>>>( std::vector<const framework::Tensor *> ins = {X, Labels, dOut};
X->data<T>(), Labels->data<T>(), ignore_index, dOut->data<T>(), limit, std::vector<framework::Tensor *> outs = {dX, counts_tensor};
dx_data, counts); auto functor = SigmoidBwdFunctor<T>(ignore_index);
constexpr int Size = 2;
phi::funcs::ElementwiseKernel<T, decltype(functor), Size>(dev_ctx, ins,
&outs, functor);
bool normalize = context.Attr<bool>("normalize"); bool normalize = context.Attr<bool>("normalize");
if (normalize) { if (normalize) {
auto norm_ptr = memory::Alloc(dev_ctx, sizeof(T)); T *counts = counts_tensor->mutable_data<T>(context.GetPlace());
T *norm = reinterpret_cast<T *>(norm_ptr->ptr()); Tensor *norm_tensor = new Tensor();
Sum<T, kNumCUDAThreads><<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>( norm_tensor->mutable_data<T>(context.GetPlace(), sizeof(T));
counts, limit, static_cast<T>(1e-5), norm); auto dims = phi::vectorize(counts_tensor->dims());
Div<T><<<blocks, threads, 0, dev_ctx.stream()>>>(dx_data, limit, norm); std::vector<int> reduce_dim = {};
for (int i = 0; i < dims.size(); i++) {
reduce_dim.push_back(i);
}
TensorReduceImpl<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
context.cuda_device_context(), *counts_tensor, norm_tensor,
NonzeroFunctor<T>(), reduce_dim, dev_ctx.stream());
T *norm = norm_tensor->mutable_data<T>(context.GetPlace());
auto norm_cpu_mem = memory::Alloc(platform::CPUPlace(), sizeof(T));
T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr());
memory::Copy(platform::CPUPlace(), norm_cpu_ptr, dev_ctx.GetPlace(), norm,
sizeof(T), dev_ctx.stream());
auto eps = static_cast<T>(1e-5);
*norm_cpu_ptr = *norm_cpu_ptr > eps ? *norm_cpu_ptr : eps;
std::vector<const framework::Tensor *> div_ins = {dX};
std::vector<framework::Tensor *> div_outs = {dX};
auto div_functor = DivFunctor<T>(*norm_cpu_ptr);
phi::funcs::ElementwiseKernel<T>(dev_ctx, div_ins, &div_outs,
div_functor);
delete norm_tensor;
} }
} }
}; };
......
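The dedicated GPUSigmoidForward/GPUSigmoidBackward, Sum and Div kernels are replaced by elementwise functors returning a phi::Array<T, 2> (loss or grad plus a counts mask), a TensorReduceImpl over the counts, and a DivFunctor for normalization. The per-element forward formula is unchanged; a scalar worked sketch of it (standalone, double precision, ignore_index handling omitted):

#include <algorithm>
#include <cassert>
#include <cmath>

// Numerically stable sigmoid cross-entropy with logits for one element,
// matching the comment above the kernel:
//   out = max(x, 0) - x * label + log(1 + exp(-|x|))
double SigmoidCrossEntropyWithLogits(double x, double label) {
  return std::max(x, 0.0) - x * label + std::log1p(std::exp(-std::fabs(x)));
}

int main() {
  // Reference value: -log(sigmoid(2)) = log(1 + exp(-2)) ~= 0.126928
  double loss = SigmoidCrossEntropyWithLogits(2.0, 1.0);
  assert(std::fabs(loss - 0.1269280110429725) < 1e-9);
  return 0;
}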
...@@ -23,9 +23,9 @@ ...@@ -23,9 +23,9 @@
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/conj_op.h" #include "paddle/fluid/operators/conj_op.h"
#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/operators/math/padding.h"
#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/kernels/funcs/padding.h"
#if defined(__NVCC__) || defined(__HIPCC__) #if defined(__NVCC__) || defined(__HIPCC__)
#include "thrust/device_vector.h" #include "thrust/device_vector.h"
#endif #endif
...@@ -389,8 +389,9 @@ class FFTR2CGradKernel : public framework::OpKernel<T> { ...@@ -389,8 +389,9 @@ class FFTR2CGradKernel : public framework::OpKernel<T> {
std::vector<int> pads(rank * 2, 0); std::vector<int> pads(rank * 2, 0);
pads[axes.back() * 2 + 1] = zero_length; pads[axes.back() * 2 + 1] = zero_length;
paddle::operators::math::PaddingFunctor<DeviceContext, C>( phi::funcs::PaddingFunctor<DeviceContext, C>(
rank, ctx, pads, static_cast<C>(0), *dy, &full_dy); rank, ctx.template device_context<DeviceContext>(), pads,
static_cast<C>(0), *dy, &full_dy);
fft_c2c_func(dev_ctx, &full_dy, &complex_dx, axes, normalization, fft_c2c_func(dev_ctx, &full_dy, &complex_dx, axes, normalization,
!forward); !forward);
} }
......
...@@ -23,12 +23,9 @@ namespace paddle { ...@@ -23,12 +23,9 @@ namespace paddle {
namespace platform { namespace platform {
bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type) { bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type) {
auto& ops = get_kl1_ops();
auto v = get_xpu_version(type.place_.device); auto v = get_xpu_version(type.place_.device);
if (v == phi::backends::xpu::XPUVersion::XPU2) { auto& ops = (v == phi::backends::xpu::XPUVersion::XPU1) ? get_kl1_ops()
ops = get_kl2_ops(); : get_kl2_ops();
}
if (ops.find(op_name) != ops.end() && if (ops.find(op_name) != ops.end() &&
ops[op_name].find(type) != ops[op_name].end()) { ops[op_name].find(type) != ops[op_name].end()) {
return true; return true;
...@@ -78,12 +75,9 @@ bool is_in_xpu_black_list(const std::string& op_name) { ...@@ -78,12 +75,9 @@ bool is_in_xpu_black_list(const std::string& op_name) {
#ifdef PADDLE_WITH_XPU_KP #ifdef PADDLE_WITH_XPU_KP
bool is_xpu_kp_support_op(const std::string& op_name, bool is_xpu_kp_support_op(const std::string& op_name,
const pOpKernelType& type) { const pOpKernelType& type) {
auto& ops = get_kl1_ops();
auto v = get_xpu_version(type.place_.device); auto v = get_xpu_version(type.place_.device);
if (v == phi::backends::xpu::XPUVersion::XPU2) { auto& ops = (v == phi::backends::xpu::XPUVersion::XPU1) ? get_kl1_ops()
ops = get_kp_ops(); : get_kp_ops();
}
if (ops.find(op_name) != ops.end() && if (ops.find(op_name) != ops.end() &&
ops[op_name].find(type) != ops[op_name].end()) { ops[op_name].find(type) != ops[op_name].end()) {
return true; return true;
......
...@@ -28,6 +28,7 @@ limitations under the License. */ ...@@ -28,6 +28,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
...@@ -161,6 +162,8 @@ void LoadCustomDevice(const std::string &library_dir) { ...@@ -161,6 +162,8 @@ void LoadCustomDevice(const std::string &library_dir) {
#endif #endif
void InitDevices() { void InitDevices() {
// set name at the entry point of Paddle
platform::SetCurrentThreadName("MainThread");
// CUPTI attribute should be set before any CUDA context is created (see CUPTI // CUPTI attribute should be set before any CUDA context is created (see CUPTI
// documentation about CUpti_ActivityAttribute). // documentation about CUpti_ActivityAttribute).
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
......
...@@ -30,8 +30,7 @@ TEST(ThreadInfo, TestThreadNameUtils) { ...@@ -30,8 +30,7 @@ TEST(ThreadInfo, TestThreadNameUtils) {
using paddle::platform::GetCurrentThreadName; using paddle::platform::GetCurrentThreadName;
using paddle::platform::SetCurrentThreadName; using paddle::platform::SetCurrentThreadName;
using paddle::platform::GetAllThreadNames; using paddle::platform::GetAllThreadNames;
EXPECT_EQ("unset", GetCurrentThreadName()); SetCurrentThreadName("MainThread");
EXPECT_TRUE(SetCurrentThreadName("MainThread"));
EXPECT_FALSE(SetCurrentThreadName("MainThread")); EXPECT_FALSE(SetCurrentThreadName("MainThread"));
auto names = GetAllThreadNames(); auto names = GetAllThreadNames();
EXPECT_TRUE(names.find(GetCurrentThreadStdId()) != names.end()); EXPECT_TRUE(names.find(GetCurrentThreadStdId()) != names.end());
......
...@@ -189,7 +189,10 @@ struct ThreadEventSection { ...@@ -189,7 +189,10 @@ struct ThreadEventSection {
class ThreadEventRecorder { class ThreadEventRecorder {
public: public:
ThreadEventRecorder() { thread_id_ = GetCurrentThreadSysId(); } ThreadEventRecorder() {
thread_id_ = GetCurrentThreadSysId();
thread_name_ = GetCurrentThreadName();
}
DISABLE_COPY_AND_ASSIGN(ThreadEventRecorder); DISABLE_COPY_AND_ASSIGN(ThreadEventRecorder);
...@@ -202,7 +205,7 @@ class ThreadEventRecorder { ...@@ -202,7 +205,7 @@ class ThreadEventRecorder {
ThreadEventSection GatherEvents() { ThreadEventSection GatherEvents() {
ThreadEventSection thr_sec; ThreadEventSection thr_sec;
thr_sec.thread_name = GetCurrentThreadName(); thr_sec.thread_name = thread_name_;
thr_sec.thread_id = thread_id_; thr_sec.thread_id = thread_id_;
thr_sec.events = std::move(base_evt_cntr_.Reduce()); thr_sec.events = std::move(base_evt_cntr_.Reduce());
return thr_sec; return thr_sec;
...@@ -210,6 +213,7 @@ class ThreadEventRecorder { ...@@ -210,6 +213,7 @@ class ThreadEventRecorder {
private: private:
uint64_t thread_id_; uint64_t thread_id_;
std::string thread_name_;
EventContainer<CommonEvent> base_evt_cntr_; EventContainer<CommonEvent> base_evt_cntr_;
}; };
......
...@@ -85,6 +85,9 @@ if(NOT ON_INFER) ...@@ -85,6 +85,9 @@ if(NOT ON_INFER)
if (WITH_NCCL) if (WITH_NCCL)
set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl) set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl)
endif() endif()
if (WITH_GLOO)
set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_gloo)
endif()
set(PYBIND_SRCS ${PYBIND_SRCS} distributed_py.cc) set(PYBIND_SRCS ${PYBIND_SRCS} distributed_py.cc)
endif() endif()
......
...@@ -31,9 +31,15 @@ namespace pybind { ...@@ -31,9 +31,15 @@ namespace pybind {
using TCPStore = paddle::distributed::TCPStore; using TCPStore = paddle::distributed::TCPStore;
void BindTCPStore(py::module* m) { void BindTCPStore(py::module* m) {
py::class_<TCPStore>(*m, "TCPStore") py::class_<TCPStore, std::shared_ptr<TCPStore>>(*m, "TCPStore")
.def( .def(py::init([](std::string hostname, uint16_t port, bool is_master,
py::init<std::string, uint16_t, bool, size_t, std::chrono::seconds>()) size_t world_size, std::chrono::seconds timeout) {
return std::make_shared<TCPStore>(hostname, port, is_master,
world_size, timeout);
}),
py::arg("hostname"), py::arg("port"), py::arg("is_master"),
py::arg("world_size"), py::arg("timeout"),
py::call_guard<py::gil_scoped_release>())
.def("add", &TCPStore::add) .def("add", &TCPStore::add)
.def("get", &TCPStore::get); .def("get", &TCPStore::get);
} }
......
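The binding above now constructs TCPStore through a factory lambda with named arguments and releases the GIL during construction. A hedged sketch of how the binding might be exercised from Python; the module path (paddle.fluid.core) and the concrete host/port values are assumptions, and the exact return type of get depends on the C++ signature:

import datetime
from paddle.fluid import core  # assumed import path for the extension module

# hostname, port, is_master, world_size, timeout (converted to std::chrono::seconds)
store = core.TCPStore("127.0.0.1", 6170, True, 2,
                      datetime.timedelta(seconds=900))
store.add("init_counter", 1)        # bound to TCPStore::add
value = store.get("init_counter")   # bound to TCPStore::get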
...@@ -35,6 +35,11 @@ limitations under the License. */ ...@@ -35,6 +35,11 @@ limitations under the License. */
#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
#endif #endif
#if defined(PADDLE_WITH_GLOO)
#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"
#include "paddle/fluid/distributed/store/tcp_store.h"
#endif
namespace py = pybind11; namespace py = pybind11;
namespace paddle { namespace paddle {
...@@ -42,6 +47,14 @@ namespace pybind { ...@@ -42,6 +47,14 @@ namespace pybind {
using Tensor = paddle::experimental::Tensor; using Tensor = paddle::experimental::Tensor;
#if defined(PADDLE_WITH_GLOO)
using ProcessGroupGloo = paddle::distributed::ProcessGroupGloo;
using GlooStore = paddle::distributed::ProcessGroupGloo::GlooStore;
using GlooOptions = paddle::distributed::ProcessGroupGloo::GlooOptions;
#endif
static std::string GLOO_SOCKET_IFNAME_ENV = "GLOO_SOCKET_IFNAME"; // NOLINT
void BindDistributed(py::module *m) { void BindDistributed(py::module *m) {
py::enum_<distributed::ReduceOp>(*m, "ReduceOp") py::enum_<distributed::ReduceOp>(*m, "ReduceOp")
.value("SUM", distributed::ReduceOp::SUM) .value("SUM", distributed::ReduceOp::SUM)
...@@ -64,6 +77,11 @@ void BindDistributed(py::module *m) { ...@@ -64,6 +77,11 @@ void BindDistributed(py::module *m) {
.def(py::init<>()) .def(py::init<>())
.def_readwrite("place_ids", &distributed::BarrierOptions::place_ids); .def_readwrite("place_ids", &distributed::BarrierOptions::place_ids);
py::class_<distributed::ReduceOptions>(*m, "ReduceOptions")
.def(py::init<>())
.def_readwrite("reduce_op", &distributed::ReduceOptions::reduce_op)
.def_readwrite("source_root", &distributed::ReduceOptions::root_rank);
auto ProcessGroup = auto ProcessGroup =
py::class_<distributed::ProcessGroup, py::class_<distributed::ProcessGroup,
std::shared_ptr<distributed::ProcessGroup>>(*m, "ProcessGroup") std::shared_ptr<distributed::ProcessGroup>>(*m, "ProcessGroup")
...@@ -121,6 +139,58 @@ void BindDistributed(py::module *m) { ...@@ -121,6 +139,58 @@ void BindDistributed(py::module *m) {
return self.Recv(tensors, src); return self.Recv(tensors, src);
}, },
py::arg("tensor"), py::arg("src"), py::arg("tensor"), py::arg("src"),
py::call_guard<py::gil_scoped_release>())
.def("all_gather",
[](distributed::ProcessGroup &self, py::handle py_in_tensor,
py::handle py_out_tensor) {
auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
std::vector<Tensor> in_tensors = {in_tensor};
std::vector<Tensor> out_tensors = {out_tensor};
return self.AllGather(in_tensors, out_tensors);
},
py::arg("in"), py::arg("out"),
py::call_guard<py::gil_scoped_release>())
.def("alltoall",
[](distributed::ProcessGroup &self, py::handle py_in_tensor,
py::handle py_out_tensor) {
auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
std::vector<Tensor> in_tensors = {in_tensor};
std::vector<Tensor> out_tensors = {out_tensor};
return self.AllToAll(in_tensors, out_tensors);
},
py::arg("in"), py::arg("out"),
py::call_guard<py::gil_scoped_release>())
.def("reduce",
[](distributed::ProcessGroup &self, py::handle py_in_tensor,
int dst, distributed::ReduceOp op) {
auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
distributed::ReduceOptions opts;
opts.reduce_op = op;
opts.root_rank = dst;
std::vector<Tensor> tensors = {in_tensor};
return self.Reduce(tensors, opts);
},
py::arg("tensor"), py::arg("dst"),
py::arg("op") = distributed::ReduceOp::SUM,
py::call_guard<py::gil_scoped_release>())
.def("scatter",
[](distributed::ProcessGroup &self, py::handle py_in_tensor,
py::handle py_out_tensor, int src) {
auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
distributed::ScatterOptions opts;
opts.root_rank = src;
std::vector<Tensor> in_tensors = {in_tensor};
std::vector<Tensor> out_tensors = {out_tensor};
return self.Scatter(in_tensors, out_tensors, opts);
},
py::arg("in"), py::arg("out"), py::arg("src"),
py::call_guard<py::gil_scoped_release>()); py::call_guard<py::gil_scoped_release>());
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL)
...@@ -129,6 +199,7 @@ void BindDistributed(py::module *m) { ...@@ -129,6 +199,7 @@ void BindDistributed(py::module *m) {
*m, "ProcessGroupNCCL", ProcessGroup) *m, "ProcessGroupNCCL", ProcessGroup)
.def(py::init<const distributed::ProcessGroupStrategy &, int, int>(), .def(py::init<const distributed::ProcessGroupStrategy &, int, int>(),
py::call_guard<py::gil_scoped_release>()); py::call_guard<py::gil_scoped_release>());
#endif
py::class_<distributed::ProcessGroup::Task, py::class_<distributed::ProcessGroup::Task,
std::shared_ptr<distributed::ProcessGroup::Task>>(*m, "task") std::shared_ptr<distributed::ProcessGroup::Task>>(*m, "task")
...@@ -138,7 +209,6 @@ void BindDistributed(py::module *m) { ...@@ -138,7 +209,6 @@ void BindDistributed(py::module *m) {
py::call_guard<py::gil_scoped_release>()) py::call_guard<py::gil_scoped_release>())
.def("synchronize", &distributed::ProcessGroup::Task::Synchronize, .def("synchronize", &distributed::ProcessGroup::Task::Synchronize,
py::call_guard<py::gil_scoped_release>()); py::call_guard<py::gil_scoped_release>());
#endif
// define parallel strategy, it will be removed // define parallel strategy, it will be removed
py::class_<distributed::ProcessGroupStrategy> pg_strategy( py::class_<distributed::ProcessGroupStrategy> pg_strategy(
...@@ -178,6 +248,45 @@ void BindDistributed(py::module *m) { ...@@ -178,6 +248,45 @@ void BindDistributed(py::module *m) {
self.nrings_ = nrings; self.nrings_ = nrings;
}); });
#if defined(PADDLE_WITH_GLOO)
py::class_<GlooOptions>(*m, "GlooOptions")
.def(py::init<>())
.def_readwrite("_device", &GlooOptions::device)
.def_static("create", &GlooOptions::create);
py::class_<GlooStore, std::shared_ptr<GlooStore>>(*m, "GlooStore")
.def(py::init(
[](const std::shared_ptr<paddle::distributed::TCPStore> &store) {
return std::make_shared<GlooStore>(store);
}),
py::call_guard<py::gil_scoped_release>());
py::class_<ProcessGroupGloo, std::shared_ptr<ProcessGroupGloo>>(
*m, "ProcessGroupGloo", ProcessGroup)
.def(py::init<const std::shared_ptr<GlooStore> &, int, int,
std::shared_ptr<GlooOptions> &>(),
py::call_guard<py::gil_scoped_release>())
.def(py::init([](const std::shared_ptr<GlooStore> &store, int rank,
int world_size) {
auto opts = GlooOptions::create();
char *ifname = getenv(GLOO_SOCKET_IFNAME_ENV.c_str());
if (ifname && strlen(ifname) > 1) {
opts->device = ProcessGroupGloo::createDeviceForInterface(
std::string(ifname));
} else {
opts->device = ProcessGroupGloo::createDefaultDevice();
}
return std::make_shared<ProcessGroupGloo>(store, rank, world_size,
opts);
}),
py::arg("store"), py::arg("rank"),
py::arg("world_size"), // py::arg("timeout") =
// kProcessGroupDefaultTimeout,
py::call_guard<py::gil_scoped_release>())
.def_static("create_default_device",
&ProcessGroupGloo::createDefaultDevice);
#endif
m->def("eager_assign_group_by_size", m->def("eager_assign_group_by_size",
[](py::handle py_tensors, std::vector<bool> is_sparse_gradient, [](py::handle py_tensors, std::vector<bool> is_sparse_gradient,
std::vector<size_t> group_size_limits, std::vector<size_t> group_size_limits,
......
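Taken together, the additions above expose all_gather, alltoall, reduce and scatter on ProcessGroup, plus a Gloo-backed group built from a TCPStore. A hedged usage sketch from Python; module paths, ranks, ports and the tensor setup are assumptions (eager-mode Tensors are assumed), and each collective only succeeds if the chosen backend actually implements it:

import datetime
import paddle
from paddle.fluid import core  # assumed import path

rank, world_size = 0, 1
store = core.TCPStore("127.0.0.1", 6170, rank == 0, world_size,
                      datetime.timedelta(seconds=900))
gloo_store = core.GlooStore(store)
# picks the device from GLOO_SOCKET_IFNAME when set, otherwise a default device
pg = core.ProcessGroupGloo(gloo_store, rank, world_size)

x = paddle.to_tensor([float(rank)])
out = paddle.zeros([world_size])

pg.all_gather(x, out).synchronize()               # in/out tensor pair
pg.alltoall(out, out).synchronize()
pg.reduce(x, 0, core.ReduceOp.SUM).synchronize()  # tensor, dst, op (op defaults to SUM)
pg.scatter(out, x, 0).synchronize()               # in, out, src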
...@@ -19,6 +19,7 @@ limitations under the License. */ ...@@ -19,6 +19,7 @@ limitations under the License. */
#include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/accumulation/accumulation_node.h"
#include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/api/all.h"
#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h"
#include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/grad_node_info.h"
#include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/hooks.h"
...@@ -30,10 +31,12 @@ limitations under the License. */ ...@@ -30,10 +31,12 @@ limitations under the License. */
#include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/eager_utils.h"
#include "paddle/fluid/pybind/exception.h" #include "paddle/fluid/pybind/exception.h"
#include "paddle/fluid/pybind/slice_utils.h"
#include "paddle/phi/api/include/api.h" #include "paddle/phi/api/include/api.h"
#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
...@@ -119,6 +122,29 @@ extern void InitTensorWithNumpyValue(TensorObject* self, ...@@ -119,6 +122,29 @@ extern void InitTensorWithNumpyValue(TensorObject* self,
extern PyTypeObject* p_tensor_type; extern PyTypeObject* p_tensor_type;
Py_ssize_t GetSliceIndexFromPyObject(PyObject* obj) {
if (PyObject_IsInstance(obj, reinterpret_cast<PyObject*>(p_tensor_type))) {
VLOG(6) << "Call GetSliceIndexFromTensor in Eager";
paddle::experimental::Tensor tensor = CastPyArg2Tensor(obj, 0);
PADDLE_ENFORCE_EQ(
tensor.initialized(), true,
paddle::platform::errors::InvalidArgument(
"We can only support initialized tensor in slice, however we got "
"uninitialized tensor %s, please check your code.",
tensor.name()));
return GetSliceIndexFromTensor((*static_cast<phi::DenseTensor*>(
CastPyArg2Tensor(obj, 0).impl().get())));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"We should only get paddle::experimental::Tensor or VarBase in this "
"method, when you reach this means we got another type index."));
}
}
bool PyCheckTensor(PyObject* obj) {
return PyObject_IsInstance(obj, reinterpret_cast<PyObject*>(p_tensor_type));
}
static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args, static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args,
PyObject* kwargs) { PyObject* kwargs) {
EAGER_TRY EAGER_TRY
...@@ -468,16 +494,111 @@ static PyObject* tensor_method_get_underline_tensor(TensorObject* self, ...@@ -468,16 +494,111 @@ static PyObject* tensor_method_get_underline_tensor(TensorObject* self,
EAGER_CATCH_AND_THROW_RETURN_NULL EAGER_CATCH_AND_THROW_RETURN_NULL
} }
// NOTE(wuweilong): Set value and not change self's original place static PyObject* tensor__getitem_index_not_tensor(TensorObject* self,
static PyObject* tensor_method_set_value(TensorObject* self, PyObject* args, PyObject* args,
PyObject* kwargs) { PyObject* kwargs) {
EAGER_TRY EAGER_TRY
VLOG(4) << "Value " << self->tensor.name(); PyObject* _index = PyTuple_GET_ITEM(args, 0);
pybind11::object numpy_value = VLOG(4) << "Call _getitem_index_not_tensor";
pybind11::object(pybind11::handle(PyTuple_GET_ITEM(args, 0)), true); std::vector<int> slice_axes, slice_starts, slice_ends, slice_strides,
InitTensorWithNumpyValue(self, numpy_value, false); decrease_axis, none_axes, infer_flags, list_select_idxs;
Py_INCREF(Py_None); // if index is a list, list_select_flag will be true
return Py_None; bool list_select_flag = false;
PADDLE_ENFORCE_EQ(
self->tensor.is_initialized(), true,
platform::errors::InvalidArgument(
"tensor %s has not been initialized, we can only slice initialized "
"tensor please init it first with numpy or other tensor.",
self->tensor.name()));
auto tensor = static_cast<phi::DenseTensor*>(self->tensor.impl().get());
ParseIndexingSlice(tensor, _index, &slice_axes, &slice_starts, &slice_ends,
&slice_strides, &decrease_axis, &none_axes, &infer_flags,
&list_select_idxs, &list_select_flag);
auto out = slice_axes.empty() && !list_select_flag
? self->tensor
: paddle::experimental::Tensor(
egr::Controller::Instance().GenerateUniqueName());
if (!slice_axes.empty()) {
framework::AttributeMap attrs = {{"axes", slice_axes},
{"starts", slice_starts},
{"ends", slice_ends},
{"infer_flags", infer_flags},
{"decrease_axis", decrease_axis}};
std::string op_type = "slice";
for (auto stride : slice_strides) {
if (stride != 1) {
op_type = "strided_slice";
attrs.insert({"strides", slice_strides});
attrs.erase("decrease_axis");
break;
}
}
if (op_type == "slice") {
out = slice_dygraph_function(self->tensor, paddle::experimental::Tensor(),
paddle::experimental::Tensor(),
std::move(attrs));
} else if (op_type == "strided_slice") {
out = strided_slice_dygraph_function(self->tensor, attrs);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Slice is only support slice and strided_slice, but we got %s which "
"is impossible, please check your code first or contact us by "
"issue. ",
op_type));
}
}
if (!none_axes.empty()) {
// Deal with cases when all axes are decreased.
// After slice, the shape of out is [1], which should have been
// [], but Paddle doesn't support scalar.
// In order to ensure the correctness of the final shape of out,
// one dimension of out needs to be decreased.
// For example:
// # x.shape: (2,3,4)
// out = x[0, 1, 1, None] # out.shape : (1)
if (static_cast<int>(decrease_axis.size()) == tensor->dims().size()) {
none_axes.pop_back();
}
if (!none_axes.empty()) {
// Deal with cases that decrease_axes is not empty
// For example:
// # x.shape: (2,3,4)
// out = x[0, 0:2, None] # out.shape : (2, 1, 4)
for (auto& axis : none_axes) {
int len = 0;
for (int da : decrease_axis) {
if (da < axis) {
len++;
}
}
axis -= len;
}
paddle::experimental::Tensor new_out;
framework::AttributeMap attrs = {{"axes", none_axes}};
new_out = std::get<0>(unsqueeze2_dygraph_function(out, std::move(attrs)));
return ToPyObject(new_out);
}
}
// the index is a list
if (list_select_flag) {
auto select_index = paddle::experimental::Tensor(
egr::Controller::Instance().GenerateUniqueName());
auto idx_tensor = std::make_shared<phi::DenseTensor>();
auto* dev_ctx = platform::DeviceContextPool::Instance().Get(
egr::Controller::Instance().GetExpectedPlace());
paddle::framework::TensorFromVector(list_select_idxs, *dev_ctx,
idx_tensor.get());
    // attach the gathered indices to select_index before index_select uses it
    select_index.set_impl(idx_tensor);
    framework::AttributeMap attrs = {{"dim", 0}};
out = index_select_dygraph_function(self->tensor, select_index,
std::move(attrs));
}
return ToPyObject(out);
EAGER_CATCH_AND_THROW_RETURN_NULL EAGER_CATCH_AND_THROW_RETURN_NULL
} }
...@@ -602,7 +723,8 @@ PyMethodDef variable_methods[] = { ...@@ -602,7 +723,8 @@ PyMethodDef variable_methods[] = {
{"get_tensor", {"get_tensor",
(PyCFunction)(void (*)(void))tensor_method_get_underline_tensor, (PyCFunction)(void (*)(void))tensor_method_get_underline_tensor,
METH_VARARGS | METH_KEYWORDS, NULL}, METH_VARARGS | METH_KEYWORDS, NULL},
{"_set_value", (PyCFunction)(void (*)(void))tensor_method_set_value, {"_getitem_index_not_tensor",
(PyCFunction)(void (*)(void))tensor__getitem_index_not_tensor,
METH_VARARGS | METH_KEYWORDS, NULL}, METH_VARARGS | METH_KEYWORDS, NULL},
{"_register_grad_hook", {"_register_grad_hook",
(PyCFunction)(void (*)(void))tensor_register_grad_hook, (PyCFunction)(void (*)(void))tensor_register_grad_hook,
......
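For reference, a sketch of the indexing behaviour the new _getitem_index_not_tensor path implements, written against Tensor.__getitem__ and assuming eager mode dispatches non-tensor indices to this method (the method name suggests it); the expected shapes follow the comments in the C++ above:

import paddle

x = paddle.rand([2, 3, 4])

y = x[0, 1:3]         # slice op; axis 0 decreased -> shape [2, 4]
z = x[0, 0:2, None]   # slice + unsqueeze2 for the None axis -> shape [2, 1, 4]
w = x[0, 1, 1, None]  # every real axis decreased -> shape [1]
s = x[::2]            # step != 1 -> strided_slice -> shape [1, 3, 4]
b = x[[True, False]]  # bool list -> index_select along dim 0 -> shape [1, 3, 4]
i = x[[0, 1]]         # int list  -> index_select along dim 0 -> shape [2, 3, 4]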
...@@ -16,8 +16,11 @@ limitations under the License. */ ...@@ -16,8 +16,11 @@ limitations under the License. */
#include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/api/all.h"
#include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/scope_guard.h"
#include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/operators/py_func_op.h" #include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/eager_utils.h"
...@@ -184,6 +187,11 @@ paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos) { ...@@ -184,6 +187,11 @@ paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos) {
} }
} }
std::shared_ptr<imperative::VarBase> CastPyArg2VarBase(PyObject* obj,
ssize_t arg_pos) {
return py::cast<std::shared_ptr<imperative::VarBase>>(obj);
}
std::vector<paddle::experimental::Tensor> CastPyArg2VectorOfTensor( std::vector<paddle::experimental::Tensor> CastPyArg2VectorOfTensor(
PyObject* obj, ssize_t arg_pos) { PyObject* obj, ssize_t arg_pos) {
std::vector<paddle::experimental::Tensor> result; std::vector<paddle::experimental::Tensor> result;
...@@ -737,5 +745,6 @@ std::vector<paddle::experimental::Tensor*> GetTensorPtrListFromArgs( ...@@ -737,5 +745,6 @@ std::vector<paddle::experimental::Tensor*> GetTensorPtrListFromArgs(
return result; return result;
} }
} // namespace pybind } // namespace pybind
} // namespace paddle } // namespace paddle
...@@ -14,7 +14,6 @@ limitations under the License. */ ...@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "pybind11/pybind11.h" #include "pybind11/pybind11.h"
#include "pybind11/stl.h" #include "pybind11/stl.h"
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
...@@ -33,6 +32,8 @@ int64_t CastPyArg2AttrLong(PyObject* obj, ssize_t arg_pos); ...@@ -33,6 +32,8 @@ int64_t CastPyArg2AttrLong(PyObject* obj, ssize_t arg_pos);
float CastPyArg2AttrFloat(PyObject* obj, ssize_t arg_pos); float CastPyArg2AttrFloat(PyObject* obj, ssize_t arg_pos);
std::string CastPyArg2AttrString(PyObject* obj, ssize_t arg_pos); std::string CastPyArg2AttrString(PyObject* obj, ssize_t arg_pos);
paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos); paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos);
std::shared_ptr<imperative::VarBase> CastPyArg2VarBase(PyObject* obj,
ssize_t arg_pos);
std::vector<paddle::experimental::Tensor> CastPyArg2VectorOfTensor( std::vector<paddle::experimental::Tensor> CastPyArg2VectorOfTensor(
PyObject* obj, ssize_t arg_pos); PyObject* obj, ssize_t arg_pos);
platform::Place CastPyArg2Place(PyObject* obj, ssize_t arg_pos); platform::Place CastPyArg2Place(PyObject* obj, ssize_t arg_pos);
...@@ -112,5 +113,7 @@ std::vector<paddle::experimental::Tensor*> GetTensorPtrListFromArgs( ...@@ -112,5 +113,7 @@ std::vector<paddle::experimental::Tensor*> GetTensorPtrListFromArgs(
const std::string& op_type, const std::string& arg_name, PyObject* args, const std::string& op_type, const std::string& arg_name, PyObject* args,
ssize_t arg_idx, bool dispensable = false); ssize_t arg_idx, bool dispensable = false);
// end of Slice related methods
} // namespace pybind } // namespace pybind
} // namespace paddle } // namespace paddle
...@@ -54,6 +54,7 @@ limitations under the License. */ ...@@ -54,6 +54,7 @@ limitations under the License. */
#include "paddle/fluid/operators/utils.h" #include "paddle/fluid/operators/utils.h"
#include "paddle/fluid/pybind/op_function.h" #include "paddle/fluid/pybind/op_function.h"
#include "paddle/fluid/pybind/pybind_boost_headers.h" #include "paddle/fluid/pybind/pybind_boost_headers.h"
#include "paddle/fluid/pybind/slice_utils.h"
#include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/pybind/tensor_py.h"
namespace paddle { namespace paddle {
...@@ -319,6 +320,23 @@ static std::string GetTypeName(const imperative::VarBase &var) { ...@@ -319,6 +320,23 @@ static std::string GetTypeName(const imperative::VarBase &var) {
} }
} }
Py_ssize_t GetSliceIndexFromPyObject(PyObject *obj) {
if (py::isinstance<imperative::VarBase>(obj)) {
VLOG(6) << "Call GetSliceIndexFromTensor in Imperative";
return GetSliceIndexFromTensor(
py::cast<std::shared_ptr<imperative::VarBase>>(obj)
->Var()
.Get<framework::LoDTensor>());
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"We should only get paddle::experimental::Tensor or VarBase in this "
"method, when you reach this means we got another type index."));
}
}
bool PyCheckTensor(PyObject *obj) {
return py::isinstance<imperative::VarBase>(obj);
}
using PyNameVarBaseMap = std::unordered_map<std::string, py::handle>; using PyNameVarBaseMap = std::unordered_map<std::string, py::handle>;
// NOTE(zjl): py::handle is a very light wrapper of PyObject *. // NOTE(zjl): py::handle is a very light wrapper of PyObject *.
...@@ -360,18 +378,6 @@ GetVarBaseListFromPyHandle(const py::handle &handle) { ...@@ -360,18 +378,6 @@ GetVarBaseListFromPyHandle(const py::handle &handle) {
return result; return result;
} }
static bool IsNumpyType(PyObject *obj) {
// It is not a good way to judge the type of obj by its type'name. Maybe using
// `PyArray_IsScalar` will be better. However, this interface cannot be used
// by including pybind11, and it needs to compile with numpy.
auto type_name = std::string(Py_TYPE(obj)->tp_name);
return type_name == "numpy.int64" || type_name == "numpy.longlong" ||
type_name == "numpy.int32" || type_name == "numpy.int16";
}
static bool PyCheckTensor(PyObject *obj) {
return py::isinstance<imperative::VarBase>(obj);
}
// cast numpy type form S to T, this may allocate new memory // cast numpy type form S to T, this may allocate new memory
template <class T, class S> template <class T, class S>
...@@ -429,260 +435,6 @@ static imperative::NameVarBaseMap ConvertToNameVarBaseMap( ...@@ -429,260 +435,6 @@ static imperative::NameVarBaseMap ConvertToNameVarBaseMap(
return result; return result;
} }
static bool PyCheckInteger(PyObject *obj) {
#if PY_VERSION_HEX < 0x03000000
return (PyLong_Check(obj) || PyInt_Check(obj)) && !PyBool_Check(obj);
#else
return PyLong_Check(obj) && !PyBool_Check(obj);
#endif
}
static Py_ssize_t GetSliceIndexFromTensor(
const std::shared_ptr<imperative::VarBase> &tensor_index) {
const auto &tensor = tensor_index->Var().Get<framework::LoDTensor>();
if (tensor.numel() == 1) {
if (framework::TransToProtoVarType(tensor.dtype()) ==
framework::proto::VarType::INT32) {
return static_cast<Py_ssize_t>(operators::GetValue<int32_t>(&tensor));
} else if (framework::TransToProtoVarType(tensor.dtype()) ==
framework::proto::VarType::INT64) {
return static_cast<Py_ssize_t>(operators::GetValue<int64_t>(&tensor));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, the type of tensor in slice indices only allows "
"int32 and int64, please check the type of index tensor."));
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, tensor in slice indices only allows 1 element, "
"but received %d.",
tensor.numel()));
}
}
// NOTE(zhiqiu): Revised version of PySlice_GetIndices. From:
// https://github.com/python/cpython/blob/8d21aa21f2cbc6d50aab3f420bb23be1d081dac4/Objects/sliceobject.c#L103
// Original PySlice_GetIndices return wrong result when
// slice_item contains long int, such as arr[:180L].
// NOT sure why this happens !!!
// Besides, PySlice_GetIndices cannot raise error when float in slice item.
// So, I make a revised version of PySlice_GetIndices, named to
// _PySlice_GetIndices. Try to use _PySlice_Unpack which is more robust than
// PySlice_GetIndices in the future.
static int _PySlice_GetIndices(PySliceObject *r, Py_ssize_t length,
Py_ssize_t *start, Py_ssize_t *stop,
Py_ssize_t *step) {
/* XXX support long ints */
if (r->step == Py_None) {
*step = 1;
} else {
if (PyCheckInteger(r->step) || IsNumpyType(r->step)) {
*step = PyLong_AsLong(r->step);
} else if (PyCheckTensor(r->step)) {
*step = GetSliceIndexFromTensor(
py::cast<std::shared_ptr<imperative::VarBase>>(r->step));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, slice indices only allows None, integers, "
"tensor(int) and numpy(int) in slice item, but received %s.",
std::string(Py_TYPE(r->step)->tp_name)));
}
}
if (r->start == Py_None) {
*start = *step < 0 ? length - 1 : 0;
} else {
if (PyCheckInteger(r->start) || IsNumpyType(r->start)) {
*start = PyLong_AsLong(r->start);
} else if (PyCheckTensor(r->start)) {
*start = GetSliceIndexFromTensor(
py::cast<std::shared_ptr<imperative::VarBase>>(r->start));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, slice indices only allows None, integers, "
"tensor(int) and numpy(int) in slice item, but received %s.",
std::string(Py_TYPE(r->start)->tp_name)));
}
if (*start < 0) *start += length;
*start = std::max(*start, static_cast<Py_ssize_t>(0));
}
if (r->stop == Py_None) {
*stop = *step < 0 ? -1 : length;
} else {
if (PyCheckInteger(r->stop) || IsNumpyType(r->stop)) {
*stop = PyLong_AsLong(r->stop);
} else if (PyCheckTensor(r->stop)) {
*stop = GetSliceIndexFromTensor(
py::cast<std::shared_ptr<imperative::VarBase>>(r->stop));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, slice indices only allows None, integers, "
"tensor(int) and numpy(int) in slice item, but received %s.",
std::string(Py_TYPE(r->stop)->tp_name)));
}
if (0 < *step && *stop < 0) *stop += length;
*stop = std::min(*stop, length);
}
if (*stop > length) return -1;
if (*start >= length) return -1;
if (*step == 0) return -1;
return 0;
}
static void ParseIndexingSlice(
framework::LoDTensor *tensor, PyObject *_index,
std::vector<int> *slice_axes, std::vector<int> *slice_starts,
std::vector<int> *slice_ends, std::vector<int> *slice_strides,
std::vector<int> *decrease_axis, std::vector<int> *none_axes,
std::vector<int> *infer_flags, std::vector<int> *list_select_idxs,
bool *list_select_flag) {
// We allow indexing by Integers, Slices, Ellipsis, None, tuples of those
// types, and list of Bool and Integers.
// wrap to tuple
// NOTE(zhiqiu): PyTuple_Pack increases refcount.
PyObject *index = !PyTuple_Check(_index) ? PyTuple_Pack(1, _index) : _index;
DEFINE_PADDLE_SCOPE_GUARD([index, _index]() {
if (!PyTuple_Check(_index)) {
Py_DECREF(index);
VLOG(4) << "Call Py_DECREF";
}
});
PADDLE_ENFORCE_EQ(
tensor->IsInitialized(), true,
platform::errors::InvalidArgument("tensor has not been initialized"));
const auto &shape = tensor->dims();
const int rank = shape.size();
const int size = PyTuple_GET_SIZE(index);
// specified_dims is the number of dimensions which indexed by Interger,
// Slices.
int specified_dims = 0;
int ell_count = 0;
for (int dim = 0; dim < size; ++dim) {
PyObject *slice_item = PyTuple_GetItem(index, dim);
if (PyCheckInteger(slice_item) || PySlice_Check(slice_item)) {
specified_dims++;
} else if (slice_item == Py_Ellipsis) {
ell_count++;
}
}
PADDLE_ENFORCE_LE(ell_count, 1,
platform::errors::InvalidArgument(
"An index can only have a single ellipsis ('...')"));
int none_count = 0;
for (int i = 0, dim = 0; i < size; ++i) {
PyObject *slice_item = PyTuple_GetItem(index, i);
infer_flags->push_back(1);
int dim_len = shape[dim];
if (PyCheckInteger(slice_item) || IsNumpyType(slice_item)) {
// integer, PyLong_AsLong supports both int and long
int start = static_cast<int>(PyLong_AsLong(slice_item));
auto s_t = start;
start = start < 0 ? start + dim_len : start;
if (start >= dim_len || start < 0) {
std::string str_error_message =
"The starting index " + std::to_string(s_t) +
" of slice is out of bounds in tensor " + std::to_string(dim) +
"-th axis, it shound be in the range of [" +
std::to_string(-dim_len) + ", " + std::to_string(dim_len) + ")";
// py::index_error is corresponding to IndexError in Python
// Used to indicate out of bounds access in __getitem__, __setitem__
throw py::index_error(str_error_message);
}
slice_axes->push_back(dim);
slice_starts->push_back(start);
slice_ends->push_back(start + 1);
slice_strides->push_back(1);
decrease_axis->push_back(dim);
dim++;
} else if (PySlice_Check(slice_item)) {
// slice item
Py_ssize_t start, end, step;
PySliceObject *p = reinterpret_cast<PySliceObject *>(slice_item);
_PySlice_GetIndices(p, dim_len, &start, &end, &step);
// :: or : or 0:dim_len:1
if (start == 0 && end == dim_len && step == 1) {
dim++;
continue;
}
slice_axes->push_back(dim);
slice_starts->push_back(start);
slice_ends->push_back(end);
slice_strides->push_back(step);
dim++;
} else if (slice_item == Py_Ellipsis) {
dim += rank - specified_dims;
} else if (slice_item == Py_None) {
none_axes->push_back(dim + none_count);
none_count++;
} else if (PyList_Check(slice_item)) {
*list_select_flag = true;
PADDLE_ENFORCE_EQ(
size, 1,
platform::errors::InvalidArgument(
"When index contains a list, its length is excepted to 1, "
"but received %d",
size));
bool all_bool = true;
int list_size = PyList_GET_SIZE(slice_item);
for (int j = 0; j < list_size; ++j) {
PyObject *list_item = PyList_GetItem(slice_item, j);
if (PyCheckInteger(list_item)) {
all_bool = false;
} else if (!PyBool_Check(list_item)) {
PADDLE_THROW(platform::errors::InvalidArgument(
"Only support int or bool in index list."));
}
}
if (all_bool) {
PADDLE_ENFORCE_EQ(
list_size, shape[0],
platform::errors::InvalidArgument(
"The dimension of bool index doesn't match indexed array along "
"dimension 0, the target dimension is %d, but received %d.",
shape[0], list_size));
for (int j = 0; j < list_size; ++j) {
PyObject *list_item = PyList_GetItem(slice_item, j);
if (list_item == Py_True) {
list_select_idxs->push_back(j);
}
}
} else {
for (int j = 0; j < list_size; ++j) {
PyObject *list_item = PyList_GetItem(slice_item, j);
if (PyCheckInteger(list_item)) {
list_select_idxs->push_back(
static_cast<int>(PyLong_AsLong(list_item)));
} else if (list_item == Py_True) {
list_select_idxs->push_back(1);
} else {
list_select_idxs->push_back(0);
}
}
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, Tensor.__indices__() only allows indexing "
"by Integers, Slices, Ellipsis, None, tuples of these types "
"and list of Bool and Integers, but received "
"%s in %dth slice item",
std::string(Py_TYPE(slice_item)->tp_name), i + 1));
}
}
// valid_index is the number of dimensions exclude None index
const int valid_indexs = size - none_axes->size() - ell_count;
PADDLE_ENFORCE_EQ(valid_indexs <= rank, true,
platform::errors::InvalidArgument(
"Too many indices (%d) for tensor of dimension %d.",
valid_indexs, rank));
}
template <typename P> template <typename P>
static void VarBaseCopy(std::shared_ptr<imperative::VarBase> &src, // NOLINT static void VarBaseCopy(std::shared_ptr<imperative::VarBase> &src, // NOLINT
imperative::VarBase &dst, // NOLINT imperative::VarBase &dst, // NOLINT
......
...@@ -80,6 +80,7 @@ limitations under the License. */ ...@@ -80,6 +80,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/cuda_streams_py.h" #include "paddle/fluid/pybind/cuda_streams_py.h"
#include "paddle/fluid/pybind/distributed_py.h" #include "paddle/fluid/pybind/distributed_py.h"
#include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/io.h" #include "paddle/fluid/pybind/io.h"
#include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/lod_utils.h" #include "paddle/phi/core/lod_utils.h"
...@@ -101,7 +102,6 @@ limitations under the License. */ ...@@ -101,7 +102,6 @@ limitations under the License. */
#include "paddle/fluid/pybind/gloo_context_py.h" #include "paddle/fluid/pybind/gloo_context_py.h"
#include "paddle/fluid/pybind/gloo_wrapper_py.h" #include "paddle/fluid/pybind/gloo_wrapper_py.h"
#include "paddle/fluid/pybind/heter_wrapper_py.h" #include "paddle/fluid/pybind/heter_wrapper_py.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/inference_api.h" #include "paddle/fluid/pybind/inference_api.h"
#include "paddle/fluid/pybind/ir.h" #include "paddle/fluid/pybind/ir.h"
#include "paddle/fluid/pybind/metrics_py.h" #include "paddle/fluid/pybind/metrics_py.h"
...@@ -527,6 +527,7 @@ PYBIND11_MODULE(core_avx, m) { ...@@ -527,6 +527,7 @@ PYBIND11_MODULE(core_avx, m) {
PYBIND11_MODULE(core_noavx, m) { PYBIND11_MODULE(core_noavx, m) {
#endif #endif
BindImperative(&m);
BindEager(&m); BindEager(&m);
BindCudaStream(&m); BindCudaStream(&m);
...@@ -741,8 +742,6 @@ PYBIND11_MODULE(core_noavx, m) { ...@@ -741,8 +742,6 @@ PYBIND11_MODULE(core_noavx, m) {
m.def("_promote_types_if_complex_exists", m.def("_promote_types_if_complex_exists",
&paddle::framework::PromoteTypesIfComplexExists); &paddle::framework::PromoteTypesIfComplexExists);
BindImperative(&m);
py::class_<framework::Tensor> framework_tensor(m, "Tensor", py::class_<framework::Tensor> framework_tensor(m, "Tensor",
py::buffer_protocol()); py::buffer_protocol());
g_framework_tensor_pytype = g_framework_tensor_pytype =
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <Python.h>
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/scope_guard.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
namespace py = pybind11;
namespace paddle {
namespace pybind {
static bool PyCheckTensor(PyObject* obj);
static Py_ssize_t GetSliceIndexFromPyObject(PyObject* obj);
// Slice related methods
static bool PyCheckInteger(PyObject* obj) {
#if PY_VERSION_HEX < 0x03000000
return (PyLong_Check(obj) || PyInt_Check(obj)) && !PyBool_Check(obj);
#else
return PyLong_Check(obj) && !PyBool_Check(obj);
#endif
}
static bool IsNumpyType(PyObject* obj) {
// It is not a good way to judge the type of obj by its type's name. Maybe using
// `PyArray_IsScalar` will be better. However, this interface cannot be used
// by including pybind11, and it needs to compile with numpy.
auto type_name = std::string(Py_TYPE(obj)->tp_name);
return type_name == "numpy.int64" || type_name == "numpy.longlong" ||
type_name == "numpy.int32" || type_name == "numpy.int16";
}
static Py_ssize_t GetSliceIndexFromTensor(const phi::DenseTensor& tensor) {
if (tensor.numel() == 1) {
if (framework::TransToProtoVarType(tensor.type()) ==
framework::proto::VarType::INT32) {
return static_cast<Py_ssize_t>(operators::GetValue<int32_t>(&tensor));
} else if (framework::TransToProtoVarType(tensor.type()) ==
framework::proto::VarType::INT64) {
return static_cast<Py_ssize_t>(operators::GetValue<int64_t>(&tensor));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, the type of tensor in slice indices only allows "
"int32 and int64, please check the type of index tensor."));
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, tensor in slice indices only allows 1 element, "
"but received %d.",
tensor.numel()));
}
}
// NOTE(zhiqiu): Revised version of PySlice_GetIndices. From:
// https://github.com/python/cpython/blob/8d21aa21f2cbc6d50aab3f420bb23be1d081dac4/Objects/sliceobject.c#L103
// Original PySlice_GetIndices return wrong result when
// slice_item contains long int, such as arr[:180L].
// NOT sure why this happens !!!
// Besides, PySlice_GetIndices cannot raise error when float in slice item.
// So, I make a revised version of PySlice_GetIndices, named to
// _PySlice_GetIndices. Try to use _PySlice_Unpack which is more robust than
// PySlice_GetIndices in the future.
static int _PySlice_GetIndices(PySliceObject* r, Py_ssize_t length,
Py_ssize_t* start, Py_ssize_t* stop,
Py_ssize_t* step) {
/* XXX support long ints */
if (r->step == Py_None) {
*step = 1;
} else {
if (PyCheckInteger(r->step) || IsNumpyType(r->step)) {
*step = PyLong_AsLong(r->step);
} else if (PyCheckTensor(r->step)) {
*step = GetSliceIndexFromPyObject(r->step);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, slice indices only allows None, integers, "
"tensor(int) and numpy(int) in slice item, but received %s.",
std::string(Py_TYPE(r->step)->tp_name)));
}
}
if (r->start == Py_None) {
*start = *step < 0 ? length - 1 : 0;
} else {
if (PyCheckInteger(r->start) || IsNumpyType(r->start)) {
*start = PyLong_AsLong(r->start);
} else if (PyCheckTensor(r->start)) {
*start = GetSliceIndexFromPyObject(r->start);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, slice indices only allows None, integers, "
"tensor(int) and numpy(int) in slice item, but received %s.",
std::string(Py_TYPE(r->start)->tp_name)));
}
if (*start < 0) *start += length;
*start = std::max(*start, static_cast<Py_ssize_t>(0));
}
if (r->stop == Py_None) {
*stop = *step < 0 ? -1 : length;
} else {
if (PyCheckInteger(r->stop) || IsNumpyType(r->stop)) {
*stop = PyLong_AsLong(r->stop);
} else if (PyCheckTensor(r->stop)) {
*stop = GetSliceIndexFromPyObject(r->stop);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, slice indices only allows None, integers, "
"tensor(int) and numpy(int) in slice item, but received %s.",
std::string(Py_TYPE(r->stop)->tp_name)));
}
if (0 < *step && *stop < 0) *stop += length;
*stop = std::min(*stop, length);
}
if (*stop > length) return -1;
if (*start >= length) return -1;
if (*step == 0) return -1;
return 0;
}
static void ParseIndexingSlice(
framework::LoDTensor* tensor, PyObject* _index,
std::vector<int>* slice_axes, std::vector<int>* slice_starts,
std::vector<int>* slice_ends, std::vector<int>* slice_strides,
std::vector<int>* decrease_axis, std::vector<int>* none_axes,
std::vector<int>* infer_flags, std::vector<int>* list_select_idxs,
bool* list_select_flag) {
// We allow indexing by Integers, Slices, Ellipsis, None, tuples of those
// types, and list of Bool and Integers.
// wrap to tuple
// NOTE(zhiqiu): PyTuple_Pack increases refcount.
PyObject* index = !PyTuple_Check(_index) ? PyTuple_Pack(1, _index) : _index;
DEFINE_PADDLE_SCOPE_GUARD([index, _index]() {
if (!PyTuple_Check(_index)) {
Py_DECREF(index);
VLOG(4) << "Call Py_DECREF";
}
});
PADDLE_ENFORCE_EQ(
tensor->IsInitialized(), true,
platform::errors::InvalidArgument("tensor has not been initialized"));
const auto& shape = tensor->dims();
const int rank = shape.size();
const int size = PyTuple_GET_SIZE(index);
  // specified_dims is the number of dimensions which are indexed by Integers and
// Slices.
int specified_dims = 0;
int ell_count = 0;
for (int dim = 0; dim < size; ++dim) {
PyObject* slice_item = PyTuple_GetItem(index, dim);
if (PyCheckInteger(slice_item) || PySlice_Check(slice_item)) {
specified_dims++;
} else if (slice_item == Py_Ellipsis) {
ell_count++;
}
}
PADDLE_ENFORCE_LE(ell_count, 1,
platform::errors::InvalidArgument(
"An index can only have a single ellipsis ('...')"));
int none_count = 0;
for (int i = 0, dim = 0; i < size; ++i) {
PyObject* slice_item = PyTuple_GetItem(index, i);
infer_flags->push_back(1);
int dim_len = shape[dim];
if (PyCheckInteger(slice_item) || IsNumpyType(slice_item)) {
// integer, PyLong_AsLong supports both int and long
int start = static_cast<int>(PyLong_AsLong(slice_item));
auto s_t = start;
start = start < 0 ? start + dim_len : start;
if (start >= dim_len || start < 0) {
std::string str_error_message =
"The starting index " + std::to_string(s_t) +
" of slice is out of bounds in tensor " + std::to_string(dim) +
"-th axis, it shound be in the range of [" +
std::to_string(-dim_len) + ", " + std::to_string(dim_len) + ")";
      // py::index_error corresponds to IndexError in Python
// Used to indicate out of bounds access in __getitem__, __setitem__
throw py::index_error(str_error_message);
}
slice_axes->push_back(dim);
slice_starts->push_back(start);
slice_ends->push_back(start + 1);
slice_strides->push_back(1);
decrease_axis->push_back(dim);
dim++;
} else if (PySlice_Check(slice_item)) {
// slice item
Py_ssize_t start, end, step;
PySliceObject* p = reinterpret_cast<PySliceObject*>(slice_item);
_PySlice_GetIndices(p, dim_len, &start, &end, &step);
// :: or : or 0:dim_len:1
if (start == 0 && end == dim_len && step == 1) {
dim++;
continue;
}
slice_axes->push_back(dim);
slice_starts->push_back(start);
slice_ends->push_back(end);
slice_strides->push_back(step);
dim++;
} else if (slice_item == Py_Ellipsis) {
dim += rank - specified_dims;
} else if (slice_item == Py_None) {
none_axes->push_back(dim + none_count);
none_count++;
} else if (PyList_Check(slice_item)) {
*list_select_flag = true;
PADDLE_ENFORCE_EQ(
size, 1,
platform::errors::InvalidArgument(
"When index contains a list, its length is excepted to 1, "
"but received %d",
size));
bool all_bool = true;
int list_size = PyList_GET_SIZE(slice_item);
for (int j = 0; j < list_size; ++j) {
PyObject* list_item = PyList_GetItem(slice_item, j);
if (PyCheckInteger(list_item)) {
all_bool = false;
} else if (!PyBool_Check(list_item)) {
PADDLE_THROW(platform::errors::InvalidArgument(
"Only support int or bool in index list."));
}
}
if (all_bool) {
PADDLE_ENFORCE_EQ(
list_size, shape[0],
platform::errors::InvalidArgument(
"The dimension of bool index doesn't match indexed array along "
"dimension 0, the target dimension is %d, but received %d.",
shape[0], list_size));
for (int j = 0; j < list_size; ++j) {
PyObject* list_item = PyList_GetItem(slice_item, j);
if (list_item == Py_True) {
list_select_idxs->push_back(j);
}
}
} else {
for (int j = 0; j < list_size; ++j) {
PyObject* list_item = PyList_GetItem(slice_item, j);
if (PyCheckInteger(list_item)) {
list_select_idxs->push_back(
static_cast<int>(PyLong_AsLong(list_item)));
} else if (list_item == Py_True) {
list_select_idxs->push_back(1);
} else {
list_select_idxs->push_back(0);
}
}
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, Tensor.__indices__() only allows indexing "
"by Integers, Slices, Ellipsis, None, tuples of these types "
"and list of Bool and Integers, but received "
"%s in %dth slice item",
std::string(Py_TYPE(slice_item)->tp_name), i + 1));
}
}
  // valid_indexs is the number of dimensions excluding None indices
const int valid_indexs = size - none_axes->size() - ell_count;
PADDLE_ENFORCE_EQ(valid_indexs <= rank, true,
platform::errors::InvalidArgument(
"Too many indices (%d) for tensor of dimension %d.",
valid_indexs, rank));
}
} // namespace pybind
} // namespace paddle
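The helpers above accept plain Python integers, numpy integer scalars, and 1-element int32/int64 tensors as slice items, allow a single Ellipsis, and treat None as a new axis; anything else, or an out-of-range start, raises. A small illustration against the public indexing API (a sketch, not an exhaustive test; it assumes these helpers back Tensor.__getitem__):

import numpy as np
import paddle

x = paddle.arange(12).reshape([3, 4])

a = x[1]               # plain Python int
b = x[np.int64(1)]     # numpy integer scalar, accepted by IsNumpyType
c = x[0:3:2, None]     # slice with a step plus a None (new axis)
d = x[..., 1]          # a single Ellipsis is allowed

# Rejected forms, per the error paths above:
#   x[5]            start outside [-3, 3) on axis 0 -> IndexError
#   x[1.5]          float slice items are not allowed
#   x[..., ..., 0]  more than one Ellipsis in one index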
...@@ -32,6 +32,14 @@ set(bw_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/backward_api.cc) ...@@ -32,6 +32,14 @@ set(bw_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/backward_api.cc)
set(bw_api_header_file_tmp ${bw_api_header_file}.tmp) set(bw_api_header_file_tmp ${bw_api_header_file}.tmp)
set(bw_api_source_file_tmp ${bw_api_source_file}.tmp) set(bw_api_source_file_tmp ${bw_api_source_file}.tmp)
# sparse api file
set(sparse_api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api_gen.py)
set(sparse_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml)
set(sparse_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/sparse_api.h)
set(sparse_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/sparse_api.cc)
set(sparse_api_header_file_tmp ${sparse_api_header_file}.tmp)
set(sparse_api_source_file_tmp ${sparse_api_source_file}.tmp)
# wrapped infermeta file # wrapped infermeta file
set(wrapped_infermeta_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py) set(wrapped_infermeta_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py)
set(api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml) set(api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml)
...@@ -73,6 +81,19 @@ add_custom_command( ...@@ -73,6 +81,19 @@ add_custom_command(
DEPENDS ${bw_api_yaml_file} ${bw_api_gen_file} ${api_gen_base} DEPENDS ${bw_api_yaml_file} ${bw_api_gen_file} ${api_gen_base}
VERBATIM) VERBATIM)
# generate sparse api
add_custom_command(
OUTPUT ${sparse_api_header_file} ${sparse_api_source_file}
COMMAND ${PYTHON_EXECUTABLE} ${sparse_api_gen_file}
--api_yaml_path ${sparse_api_yaml_file}
--api_header_path ${sparse_api_header_file_tmp}
--api_source_path ${sparse_api_source_file_tmp}
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_header_file_tmp} ${sparse_api_header_file}
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_source_file_tmp} ${sparse_api_source_file}
COMMENT "copy_if_different ${sparse_api_header_file} ${sparse_sparse_api_source_file}"
DEPENDS ${sparse_api_yaml_file} ${sparse_api_gen_file} ${api_gen_base}
VERBATIM)
# generate wrapped infermeta # generate wrapped infermeta
add_custom_command( add_custom_command(
OUTPUT ${wrapped_infermeta_header_file} ${wrapped_infermeta_source_file} OUTPUT ${wrapped_infermeta_header_file} ${wrapped_infermeta_source_file}
...@@ -87,12 +108,14 @@ cc_library(op_meta_info SRCS op_meta_info.cc DEPS phi_tensor_raw) ...@@ -87,12 +108,14 @@ cc_library(op_meta_info SRCS op_meta_info.cc DEPS phi_tensor_raw)
cc_library(wrapped_infermeta SRCS ${wrapped_infermeta_source_file} DEPS phi) cc_library(wrapped_infermeta SRCS ${wrapped_infermeta_source_file} DEPS phi)
cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS phi_tensor_raw phi_context kernel_factory) cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS phi_tensor_raw phi_context kernel_factory)
cc_library(api_gen_utils SRCS api_gen_utils.cc DEPS phi_tensor_raw selected_rows sparse_csr_tensor sparse_coo_tensor)
cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel data_device_transform) cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel data_device_transform)
cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform) cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform)
cc_library(sparse_api_custom_impl SRCS sparse_api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform)
cc_library(sparse_api SRCS sparse_api.cc DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform) cc_library(sparse_api SRCS sparse_api.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api_custom_impl)
cc_library(phi_function_api SRCS ${api_source_file} DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform api_custom_impl) cc_library(phi_function_api SRCS ${api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform api_custom_impl)
cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform) cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform)
cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch backward_infermeta phi_data_transform phi_function_api api_custom_impl) cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta phi_data_transform phi_function_api api_custom_impl)
cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api) cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api)
...@@ -14,8 +14,8 @@ limitations under the License. */ ...@@ -14,8 +14,8 @@ limitations under the License. */
#include "paddle/phi/api/lib/api_custom_impl.h" #include "paddle/phi/api/lib/api_custom_impl.h"
#include "paddle/phi/api/lib/api_gen_utils.h"
#include "paddle/phi/api/lib/api_registry.h" #include "paddle/phi/api/lib/api_registry.h"
#include "paddle/phi/api/lib/api_utils.h"
#include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/data_transform.h"
#include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/kernel_dispatch.h"
#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/api/lib/utils/storage.h"
......
...@@ -12,26 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,26 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/phi/api/lib/api_gen_utils.h"
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/api/lib/utils/storage.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/phi/core/selected_rows.h"
namespace paddle {
namespace experimental {
/* ------------------ for input ----------------------- */
std::shared_ptr<phi::DenseTensor> TensorToDenseTensor(const Tensor& tensor) {
  return std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
}
std::shared_ptr<phi::DenseTensor> TensorToDenseTensor(
    const paddle::optional<Tensor>& tensor) {
  if (tensor) {
    return std::dynamic_pointer_cast<phi::DenseTensor>(tensor->impl());
...@@ -39,7 +31,7 @@ inline std::shared_ptr<phi::DenseTensor> TensorToDenseTensor(
  return nullptr;
}
std::unique_ptr<std::vector<phi::DenseTensor>> TensorToDenseTensor(
    const std::vector<Tensor>& tensors) {
  auto pt_tensors = std::make_unique<std::vector<phi::DenseTensor>>();
  pt_tensors->reserve(tensors.size());
...@@ -52,12 +44,11 @@ inline std::unique_ptr<std::vector<phi::DenseTensor>> TensorToDenseTensor(
  return std::move(pt_tensors);
}
std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(const Tensor& tensor) {
  return std::dynamic_pointer_cast<phi::SelectedRows>(tensor.impl());
}
std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(
    const paddle::optional<Tensor>& tensor) {
  if (tensor) {
    return std::dynamic_pointer_cast<phi::SelectedRows>(tensor->impl());
...@@ -67,11 +58,11 @@ inline std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(
/* ----------------- for infer_meta --------------------- */
phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor) {
  return phi::MetaTensor(tensor);
}
paddle::optional<phi::MetaTensor> MakeMetaTensor(
    const paddle::optional<const phi::DenseTensor&>& tensor) {
  if (tensor) {
    return {phi::MetaTensor(*tensor)};
...@@ -79,7 +70,7 @@ inline paddle::optional<phi::MetaTensor> MakeMetaTensor(
  return {paddle::none};
}
std::vector<phi::MetaTensor> MakeMetaTensor(
    const std::vector<phi::DenseTensor>& tensors) {
  std::vector<phi::MetaTensor> meta_tensors;
  meta_tensors.reserve(tensors.size());
...@@ -89,11 +80,11 @@ inline std::vector<phi::MetaTensor> MakeMetaTensor(
  return meta_tensors;
}
phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor) {
  return phi::MetaTensor(tensor);
}
paddle::optional<phi::MetaTensor> MakeMetaTensor(
    const paddle::optional<const phi::SelectedRows&>& tensor) {
  if (tensor) {
    return {phi::MetaTensor(*tensor)};
...@@ -103,7 +94,7 @@ inline paddle::optional<phi::MetaTensor> MakeMetaTensor(
/* ------------------ for output ----------------------- */
phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) {
  if (!out->initialized()) {
    auto dense_tensor = std::make_shared<phi::DenseTensor>(
        phi::make_intrusive<SharedStorage>(phi::TransToPhiPlace(backend)),
...@@ -114,8 +105,9 @@ inline phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) {
  return static_cast<phi::DenseTensor*>(out->impl().get());
}
std::vector<phi::DenseTensor*> SetKernelOutput(size_t out_size,
                                               Backend backend,
                                               std::vector<Tensor>* out) {
  out->reserve(out_size);
  std::vector<phi::DenseTensor*> results(out_size);
  for (size_t i = 0; i < out_size; ++i) {
...@@ -129,8 +121,7 @@ inline std::vector<phi::DenseTensor*> SetKernelOutput(
  return results;
}
phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend, Tensor* out) {
  if (!out->initialized()) {
    auto select_rows = std::make_shared<phi::SelectedRows>();
    out->set_impl(select_rows);
...@@ -139,5 +130,29 @@ inline phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend,
  return static_cast<phi::SelectedRows*>(out->impl().get());
}
phi::TensorBase* SetSparseKernelOutput(Tensor* out, TensorType type) {
if (!out->initialized()) {
if (type == TensorType::SPARSE_COO) {
auto sparse_tensor = std::make_shared<phi::SparseCooTensor>(
phi::DenseTensor(), phi::DenseTensor(), phi::DDim{-1});
out->set_impl(sparse_tensor);
return sparse_tensor.get();
} else if (type == TensorType::SPARSE_CSR) {
auto sparse_tensor =
std::make_shared<phi::SparseCsrTensor>(phi::DenseTensor(),
phi::DenseTensor(),
phi::DenseTensor(),
phi::DDim{-1});
out->set_impl(sparse_tensor);
return sparse_tensor.get();
} else {
auto dense_tensor = std::make_shared<phi::DenseTensor>();
out->set_impl(dense_tensor);
return dense_tensor.get();
}
}
return out->impl().get();
}
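// A minimal usage sketch (hypothetical caller; `out` is a placeholder name):
// the sparse C++ API binds an output Tensor to a concrete sparse impl before
// launching a kernel, e.g.
//   Tensor out;
//   phi::TensorBase* kernel_out =
//       SetSparseKernelOutput(&out, TensorType::SPARSE_COO);
//   // kernel_out now aliases the empty SparseCooTensor held by `out`.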
} // namespace experimental
} // namespace paddle
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/api/lib/utils/storage.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/phi/core/selected_rows.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
namespace paddle {
namespace experimental {
enum class TensorType { DENSE_TENSOR, SPARSE_CSR, SPARSE_COO };
/* ------------------ for input ----------------------- */
std::shared_ptr<phi::DenseTensor> TensorToDenseTensor(const Tensor& tensor);
std::shared_ptr<phi::DenseTensor> TensorToDenseTensor(
const paddle::optional<Tensor>& tensor);
std::unique_ptr<std::vector<phi::DenseTensor>> TensorToDenseTensor(
const std::vector<Tensor>& tensors);
std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(const Tensor& tensor);
std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(
const paddle::optional<Tensor>& tensor);
/* ----------------- for infer_meta --------------------- */
phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor);
paddle::optional<phi::MetaTensor> MakeMetaTensor(
const paddle::optional<const phi::DenseTensor&>& tensor);
std::vector<phi::MetaTensor> MakeMetaTensor(
const std::vector<phi::DenseTensor>& tensors);
phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor);
paddle::optional<phi::MetaTensor> MakeMetaTensor(
const paddle::optional<const phi::SelectedRows&>& tensor);
/* ------------------ for output ----------------------- */
phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out);
std::vector<phi::DenseTensor*> SetKernelOutput(size_t out_size,
Backend backend,
std::vector<Tensor>* out);
phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend, Tensor* out);
phi::TensorBase* SetSparseKernelOutput(Tensor* out, TensorType type);
} // namespace experimental
} // namespace paddle
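// How these helpers are expected to compose in a generated dense API function
// (a rough sketch; `SomeInferMeta`, `x` and `backend` are placeholder names):
//   auto dense_x = TensorToDenseTensor(x);              // unwrap the input
//   Tensor out;
//   auto* dense_out = SetKernelOutput(backend, &out);   // bind the output
//   phi::MetaTensor meta_out(dense_out);
//   SomeInferMeta(MakeMetaTensor(*dense_x), &meta_out); // fill dims/dtype
//   // ...then fetch the matching phi kernel and run it on dense_x/dense_out.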
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/api/include/sparse_api.h" #include "paddle/phi/api/lib/sparse_api_custom_impl.h"
#include <memory> #include <memory>
#include "glog/logging.h" #include "glog/logging.h"
...@@ -20,29 +20,12 @@ limitations under the License. */ ...@@ -20,29 +20,12 @@ limitations under the License. */
#include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/kernel_dispatch.h"
#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/api/lib/utils/storage.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/infermeta/unary.h"
PD_DECLARE_KERNEL(dense_to_sparse_coo, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sparse_csr_to_coo, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(dense_to_sparse_csr, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sparse_coo_to_csr, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sparse_coo_to_dense, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sparse_csr_to_dense, CPU, ALL_LAYOUT);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_DECLARE_KERNEL(dense_to_sparse_coo, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sparse_csr_to_coo, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(dense_to_sparse_csr, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sparse_coo_to_csr, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sparse_coo_to_dense, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sparse_csr_to_dense, GPU, ALL_LAYOUT);
#endif
namespace paddle {
namespace experimental {
namespace sparse {
Tensor to_sparse_coo_impl(const Tensor& x,
                          Backend backend,
                          const int64_t sparse_dim) {
  if (x.layout() == phi::DataLayout::SPARSE_COO) {
...@@ -105,7 +88,7 @@ PADDLE_API Tensor to_sparse_coo(const Tensor& x,
  return out;
}
Tensor to_sparse_csr_impl(const Tensor& x, Backend backend) {
  if (x.layout() == phi::DataLayout::SPARSE_CSR) {
    return x;
  }
...@@ -171,7 +154,7 @@ PADDLE_API Tensor to_sparse_csr(const Tensor& x, Backend backend) {
  return out;
}
Tensor to_dense_impl(const Tensor& x, Backend backend) {
  if (x.layout() != phi::DataLayout::SPARSE_CSR &&
      x.layout() != phi::DataLayout::SPARSE_COO) {
    return x;
......
...@@ -21,13 +21,13 @@ namespace paddle {
namespace experimental {
namespace sparse {
Tensor to_dense_impl(const Tensor& x, Backend backend);
Tensor to_sparse_coo_impl(const Tensor& x,
                          Backend backend,
                          const int64_t sparse_dim);
Tensor to_sparse_csr_impl(const Tensor& x, Backend backend);
} // namespace sparse
} // namespace experimental
......
...@@ -145,6 +145,7 @@ class SparseCooTensor : public TensorBase,
  void* AllocateFrom(Allocator* allocator,
                     DataType dtype,
                     size_t requested_size = 0) override;
  void set_dims(const DDim& dims) { this->dims_ = dims; }
 private:
  // save the indices of non zero elements in original dense tensor
......
...@@ -348,4 +348,17 @@ void BCELossInferMeta(const MetaTensor& input,
  out->share_lod(input);
}
void GatherTreeMeta(const MetaTensor& ids,
const MetaTensor& parents,
MetaTensor* out) {
auto ids_dims = ids.dims();
auto parents_dims = parents.dims();
PADDLE_ENFORCE_EQ(ids_dims == parents_dims,
true,
phi::errors::InvalidArgument(
"The shape of Input(Parents) must be same with the "
"shape of Input(Ids)."));
out->set_dims(ids_dims);
}
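// Shape example for the check above: with ids of dims [max_time, batch_size,
// beam_size] = [10, 4, 3], parents must also have dims [10, 4, 3], and out is
// then given dims [10, 4, 3].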
} // namespace phi
...@@ -68,4 +68,8 @@ void BCELossInferMeta(const MetaTensor& input,
                      const MetaTensor& label,
                      MetaTensor* out,
                      MetaConfig config = MetaConfig());
void GatherTreeMeta(const MetaTensor& ids,
const MetaTensor& parents,
MetaTensor* out);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void BatchNormGradRawKernel(const Context& dev_ctx,
const DenseTensor& y_grad,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
paddle::optional<const DenseTensor&> reserve_space,
paddle::optional<const DenseTensor&> mean,
paddle::optional<const DenseTensor&> variance,
float momentum,
float epsilon,
const std::string& data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
bool is_inplace,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* bias_grad);
template <typename T, typename Context>
void BatchNormGradKernel(const Context& dev_ctx,
const DenseTensor& y_grad,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
paddle::optional<const DenseTensor&> reserve_space,
paddle::optional<const DenseTensor&> mean,
paddle::optional<const DenseTensor&> variance,
float momentum,
float epsilon,
const std::string& data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* bias_grad);
template <typename T, typename Context>
void BatchNormDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& x_grad_grad,
const DenseTensor& scale_grad_grad,
const DenseTensor& bias_grad_grad,
const DenseTensor& y_grad,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
paddle::optional<const DenseTensor&> mean,
paddle::optional<const DenseTensor&> variance,
float momentum,
float epsilon,
const std::string& data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* y_grad_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void BatchNormKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& mean,
const DenseTensor& variance,
float momentum,
float epsilon,
const std::string& data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor* y,
DenseTensor* mean_out,
DenseTensor* variance_out,
DenseTensor* saved_mean,
DenseTensor* saved_variance,
DenseTensor* reserve_space);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/batch_norm_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/gpu/batch_norm_utils.h"
namespace phi {
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T, typename Context>
void BatchNormGradRawKernel(const Context& ctx,
const DenseTensor& y_grad,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
paddle::optional<const DenseTensor&> reserve_space,
paddle::optional<const DenseTensor&> mean,
paddle::optional<const DenseTensor&> variance,
float momentum,
float epsilon,
const std::string& data_layout_str,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
bool is_inplace,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* bias_grad) {
const auto* d_y = &y_grad;
DataLayout data_layout =
paddle::framework::StringToDataLayout(data_layout_str);
auto* d_x = x_grad;
auto* d_scale = scale_grad;
auto* d_bias = bias_grad;
use_global_stats = is_test || use_global_stats;
// batch_norm with inplace as false will take X as grad input, which
// is same as cuDNN batch_norm backward calculation, batch_norm
// with inplace as true only take Y as input and X should be calculate
// by inverse operation of batch_norm on Y
if (is_inplace) {
if (d_x) {
PADDLE_ENFORCE_EQ(d_x,
d_y,
phi::errors::InvalidArgument(
"X@GRAD and Y@GRAD inplaced in non-inplace mode"));
}
} else {
if (d_x) {
PADDLE_ENFORCE_NE(d_x,
d_y,
phi::errors::InvalidArgument(
"X@GRAD and Y@GRAD inplaced in non-inplace mode"));
}
}
// Get the size for each dimension.
// NCHW [batch_size, in_channels, in_height, in_width]
const auto& x_dims = x.dims();
PADDLE_ENFORCE_GE(
x_dims.size(),
2,
phi::errors::InvalidArgument(
"The size of input X's dimensions should be larger than 1."
"But received: the size of input X's dimensions is [%d]",
x_dims.size()));
PADDLE_ENFORCE_LE(
x_dims.size(),
5,
phi::errors::InvalidArgument(
"The size of input X's dimensions should be less than 6."
"But received: the size of input X's dimensions is [%d]",
x_dims.size()));
const int N = x_dims[0];
const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
const int sample_size = x.numel() / N / C;
// input dimension is 2 and the format is NCHW. The input can be regarded as
// NHWC format
if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) {
data_layout = DataLayout::kNHWC;
}
// init output
if (d_x) {
ctx.template Alloc<T>(d_x);
}
const T* mean_data = saved_mean.data<T>();
const T* inv_var_data = saved_variance.data<T>();
DenseTensor inv_var_tensor;
if (use_global_stats) {
const auto* running_mean = mean.get_ptr();
const auto* running_variance = variance.get_ptr();
mean_data = running_mean->data<T>();
inv_var_tensor.Resize({C});
T* running_inv_var_data = ctx.template Alloc<T>(&inv_var_tensor);
EigenVectorArrayMap<T> inv_var_tmp(running_inv_var_data, C);
ConstEigenVectorArrayMap<T> var_arr(running_variance->data<T>(), C);
inv_var_tmp = (var_arr + epsilon).sqrt().inverse();
inv_var_data = running_inv_var_data;
}
ConstEigenVectorArrayMap<T> scale_arr(scale.data<T>(), C);
ConstEigenVectorArrayMap<T> bias_arr(bias.data<T>(), C);
ConstEigenVectorArrayMap<T> mean_arr(mean_data, C);
ConstEigenVectorArrayMap<T> inv_var_arr(inv_var_data, C);
T* d_bias_data = nullptr;
T* d_scale_data = nullptr;
if (d_scale && d_bias) {
d_bias_data = ctx.template Alloc<T>(d_bias);
d_scale_data = ctx.template Alloc<T>(d_scale);
}
// d_bias = np.sum(d_y, axis=0)
// d_scale = np.sum((X - mean) / inv_std * dy, axis=0)
// d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0)
// - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0))
EigenVectorArrayMap<T> d_bias_arr(d_bias_data, C);
EigenVectorArrayMap<T> d_scale_arr(d_scale_data, C);
if (d_scale && d_bias) {
d_bias_arr.setZero();
d_scale_arr.setZero();
}
if (d_x && (N * sample_size) == 1 && !use_global_stats) {
paddle::framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
return;
}
int scale_coefff = use_global_stats ? 1 : N * sample_size;
const auto scale_inv_var_nhw = scale_arr * inv_var_arr / scale_coefff;
DenseTensor dy_sum;
dy_sum.Resize({C});
auto dy_sum_data = ctx.template Alloc<T>(&dy_sum);
EigenVectorArrayMap<T> dy_sum_arr(dy_sum_data, C);
DenseTensor dy_mul_x_sub_mean_mul_invstd_sum;
dy_mul_x_sub_mean_mul_invstd_sum.Resize({C});
auto dy_mul_x_sub_mean_mul_invstd_sum_data =
ctx.template Alloc<T>(&dy_mul_x_sub_mean_mul_invstd_sum);
EigenVectorArrayMap<T> dy_mul_x_sub_mean_mul_invstd_sum_arr(
dy_mul_x_sub_mean_mul_invstd_sum_data, C);
dy_sum_arr.setZero();
dy_mul_x_sub_mean_mul_invstd_sum_arr.setZero();
// inplace calculation
// Y: ((x - est_mean) * (inv_var) * scale + bias
// formula transform ====>
// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
// X: (y - bias) / scale / (inv_var) + est_mean
// formula transform ====>
// (y - bias) / (scale * inv_var) + est_mean
switch (data_layout) {
case DataLayout::kNCHW: {
if (is_inplace) {
auto px = x;
EigenArrayMap<T> x_data(ctx.template Alloc<T>(&px), sample_size, N * C);
ConstEigenArrayMap<T> y_data(x.data<T>(), sample_size, N * C);
for (int nc = 0; nc < N * C; ++nc) {
x_data.col(nc) = (y_data.col(nc) - bias_arr(nc % C)) /
scale_inv_var_nhw(nc % C) / scale_coefff +
mean_arr(nc % C);
}
}
ConstEigenArrayMap<T> x_arr(x.data<T>(), sample_size, N * C);
ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), sample_size, N * C);
for (int nc = 0; nc < N * C; ++nc) {
int c = nc % C;
dy_sum_arr(c) += d_y_arr.col(nc).sum();
dy_mul_x_sub_mean_mul_invstd_sum_arr(c) +=
((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc))
.sum();
}
if (d_scale && d_bias) {
d_bias_arr = dy_sum_arr;
d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr;
}
if (d_x) {
EigenArrayMap<T> d_x_arr(
ctx.template Alloc<T>(d_x), sample_size, N * C);
if (!use_global_stats) {
for (int nc = 0; nc < N * C; ++nc) {
int c = nc % C;
d_x_arr.col(nc) =
scale_inv_var_nhw(c) *
(d_y_arr.col(nc) * N * sample_size - dy_sum_arr(c) -
(x_arr.col(nc) - mean_arr[c]) *
dy_mul_x_sub_mean_mul_invstd_sum_arr(c) * inv_var_arr(c));
}
} else {
for (int nc = 0; nc < N * C; ++nc) {
int c = nc % C;
d_x_arr.col(nc) = scale_inv_var_nhw(c) * d_y_arr.col(nc);
}
}
}
break;
}
case DataLayout::kNHWC: {
if (is_inplace) {
auto px = x;
EigenArrayMap<T> x_data(ctx.template Alloc<T>(&px), C, N * sample_size);
ConstEigenArrayMap<T> y_data(x.data<T>(), C, N * sample_size);
for (int nhw = 0; nhw < N * sample_size; nhw++) {
x_data.col(nhw) =
(y_data.col(nhw) - bias_arr) / scale_inv_var_nhw / scale_coefff +
mean_arr;
}
}
ConstEigenArrayMap<T> x_arr(x.data<T>(), C, N * sample_size);
ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), C, N * sample_size);
for (int nhw = 0; nhw < N * sample_size; ++nhw) {
dy_sum_arr += d_y_arr.col(nhw);
dy_mul_x_sub_mean_mul_invstd_sum_arr +=
(x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw);
}
if (d_scale && d_bias) {
d_bias_arr = dy_sum_arr;
d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr;
}
if (d_x) {
EigenArrayMap<T> d_x_arr(
ctx.template Alloc<T>(d_x), C, N * sample_size);
if (!use_global_stats) {
for (int nhw = 0; nhw < N * sample_size; ++nhw) {
d_x_arr.col(nhw) =
scale_inv_var_nhw *
(d_y_arr.col(nhw) * N * sample_size - dy_sum_arr -
(x_arr.col(nhw) - mean_arr) *
dy_mul_x_sub_mean_mul_invstd_sum_arr * inv_var_arr);
}
} else {
for (int nhw = 0; nhw < N * sample_size; ++nhw) {
d_x_arr.col(nhw) = scale_inv_var_nhw * d_y_arr.col(nhw);
}
}
}
break;
}
default:
PADDLE_THROW(phi::errors::InvalidArgument("Unknown storage order: %s",
data_layout_str));
}
}
template <typename T, typename Context>
void BatchNormGradKernel(const Context& dev_ctx,
const DenseTensor& y_grad,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
paddle::optional<const DenseTensor&> reserve_space,
paddle::optional<const DenseTensor&> mean,
paddle::optional<const DenseTensor&> variance,
float momentum,
float epsilon,
const std::string& data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* bias_grad) {
BatchNormGradRawKernel<T, Context>(dev_ctx,
y_grad,
x,
scale,
bias,
saved_mean,
saved_variance,
reserve_space,
mean,
variance,
momentum,
epsilon,
data_layout,
is_test,
use_global_stats,
trainable_statistics,
fuse_with_relu,
false,
x_grad,
scale_grad,
bias_grad);
}
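// Note: this wrapper differs from BatchNormGradRawKernel only in fixing
// is_inplace to false, i.e. the gradient is computed from the saved X rather
// than by reconstructing X from Y (see the inplace comment in the raw kernel).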
template <typename T, typename Context>
void BatchNormDoubleGradKernel(const Context& ctx,
const DenseTensor& x_grad_grad,
const DenseTensor& scale_grad_grad,
const DenseTensor& bias_grad_grad,
const DenseTensor& y_grad,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& saved_mean,
const DenseTensor& saved_variance,
paddle::optional<const DenseTensor&> mean,
paddle::optional<const DenseTensor&> variance,
float momentum,
float epsilon,
const std::string& data_layout_str,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor* x_grad,
DenseTensor* scale_grad,
DenseTensor* y_grad_grad) {
const auto* X = &x;
const auto* Scale = &scale;
const auto* dY = &y_grad;
const auto* Saved_mean = &saved_mean;
const auto* Saved_variance = &saved_variance;
PADDLE_ENFORCE_EQ(is_test,
false,
phi::errors::InvalidArgument(
"`is_test = True` CANNOT be used in train program. If "
"you want to use global status in pre_train model, "
"please set `use_global_stats = True`"));
const auto data_layout =
paddle::framework::StringToDataLayout(data_layout_str);
const auto* ddX = &x_grad_grad;
const auto* ddScale = &scale_grad_grad;
const auto* ddBias = &bias_grad_grad;
auto* dX = x_grad;
auto* dScale = scale_grad;
auto* ddY = y_grad_grad;
ctx.template Alloc<T>(dX);
ctx.template Alloc<T>(ddY);
const auto& x_dims = X->dims();
const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
const int sample_size = X->numel() / C;
phi::funcs::SetConstant<Context, T> set_constant;
const T* mean_data = Saved_mean->data<T>();
const T* inv_var_data = Saved_variance->data<T>();
DenseTensor inv_var_tensor;
if (use_global_stats) {
const auto* running_mean = mean.get_ptr();
const auto* running_variance = variance.get_ptr();
mean_data = running_mean->data<T>();
inv_var_tensor.Resize({C});
T* running_inv_var_data = ctx.template Alloc<T>(&inv_var_tensor);
EigenVectorArrayMap<T> inv_var_tmp(running_inv_var_data, C);
ConstEigenVectorArrayMap<T> var_arr(running_variance->data<T>(), C);
inv_var_tmp = (var_arr + epsilon).sqrt().inverse();
inv_var_data = running_inv_var_data;
}
// transpose NCHW -> NHWC for easy calculate
DenseTensor transformed_x(X->type());
DenseTensor transformed_dy(dY->type());
DenseTensor transformed_ddx(ddX->type());
DenseTensor transformed_dx(dX->type());
DenseTensor transformed_ddy(ddY->type());
if (data_layout == DataLayout::kNCHW && x_dims.size() > 2) {
VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
// Input Tensor
ResizeToChannelLast<Context, T>(ctx, X, &transformed_x);
TransToChannelLast<Context, T>(ctx, X, &transformed_x);
ResizeToChannelLast<Context, T>(ctx, dY, &transformed_dy);
TransToChannelLast<Context, T>(ctx, dY, &transformed_dy);
ResizeToChannelLast<Context, T>(ctx, ddX, &transformed_ddx);
TransToChannelLast<Context, T>(ctx, ddX, &transformed_ddx);
// Output Tensor
ResizeToChannelLast<Context, T>(ctx, dX, &transformed_dx);
ResizeToChannelLast<Context, T>(ctx, ddY, &transformed_ddy);
} else {
transformed_x.ShareDataWith(*X);
transformed_dy.ShareDataWith(*dY);
transformed_ddx.ShareDataWith(*ddX);
transformed_dx.ShareDataWith(*dX);
transformed_ddy.ShareDataWith(*ddY);
}
ConstEigenArrayMap<T> x_arr(transformed_x.data<T>(), C, sample_size);
ConstEigenVectorArrayMap<T> mean_arr(mean_data, C);
ConstEigenVectorArrayMap<T> inv_var_arr(inv_var_data, C);
Tensor mean_tile;
mean_tile.Resize({C, sample_size});
EigenArrayMap<T> mean_tile_data(
ctx.template Alloc<T>(&mean_tile), C, sample_size);
DenseTensor inv_var_tile;
inv_var_tile.Resize({C, sample_size});
EigenArrayMap<T> inv_var_tile_data(
ctx.template Alloc<T>(&inv_var_tile), C, sample_size);
mean_tile_data = mean_arr.replicate(1, sample_size);
inv_var_tile_data = inv_var_arr.replicate(1, sample_size);
DenseTensor Scale_data;
if (!Scale) {
Scale_data.Resize({C});
ctx.template Alloc<T>(&Scale_data);
set_constant(ctx, &Scale_data, static_cast<T>(1));
}
ConstEigenVectorArrayMap<T> scale_arr(
Scale ? Scale->data<T>() : Scale_data.data<T>(), C);
Tensor scale_tile;
scale_tile.Resize({C, sample_size});
EigenArrayMap<T> scale_tile_data(
ctx.template Alloc<T>(&scale_tile), C, sample_size);
scale_tile_data = scale_arr.replicate(1, sample_size);
ConstEigenArrayMap<T> dy_arr(transformed_dy.data<T>(), C, sample_size);
ConstEigenArrayMap<T> ddx_arr(transformed_ddx.data<T>(), C, sample_size);
DenseTensor x_sub_mean_mul_invstd;
x_sub_mean_mul_invstd.Resize({C, sample_size});
EigenArrayMap<T> x_sub_mean_mul_invstd_arr(
ctx.template Alloc<T>(&x_sub_mean_mul_invstd), C, sample_size);
x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data;
if (dX) {
ctx.template Alloc<T>(dX);
EigenArrayMap<T> dx_arr(
ctx.template Alloc<T>(&transformed_dx), C, sample_size);
dx_arr.setZero();
if (use_global_stats) {
// math: dx = (ddscale * dy) * inv_var
if (ddScale) {
ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
Tensor ddscale_tile;
ddscale_tile.Resize({C, sample_size});
EigenArrayMap<T> ddscale_tile_data(
ctx.template Alloc<T>(&ddscale_tile), C, sample_size);
ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
dx_arr = dy_arr * ddscale_tile_data * inv_var_tile_data;
}
} else {
// math: dx = scale * ((x - mean) * inv_var / NxHxW * (np.mean(ddx,
// axis=(n,h,w)) *
// np.sum(dy, axis=(n,h,w)) -
// np.sum(dy * ddx, axis=(n,h,w)) + 3 * np.mean(dy * (x -
// mean),
// axis=(n,h,w)) * inv_var.pow(2) *
// np.sum(ddx * (x - mean), axis=(n,h,w))) + inv_var.pow(3) /
// NxHxW *
// np.sum(ddx * (x - mean)) *
// (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW *
// np.sum(dy,
// axis=(n,h,w)) * (x - mean) *
// (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var -
// inv_var
// *
// np.mean(dy, axis=(n,h,w)) -
// inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
// axis=(n,h,w)))
if (ddX) {
dx_arr +=
(x_sub_mean_mul_invstd_arr * inv_var_tile_data * inv_var_tile_data /
sample_size)
.colwise() *
(ddx_arr.rowwise().sum() * dy_arr.rowwise().sum() / sample_size -
(dy_arr * ddx_arr).rowwise().sum() +
3. * (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() *
(ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() /
sample_size);
dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() *
(ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() /
sample_size * (dy_arr.rowwise().sum() / sample_size - dy_arr);
dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() *
(dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() /
sample_size *
(ddx_arr.rowwise().sum() / sample_size - ddx_arr);
dx_arr = scale_tile_data * dx_arr;
}
if (ddScale) {
ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
Tensor ddscale_tile;
ddscale_tile.Resize({C, sample_size});
EigenArrayMap<T> ddscale_tile_data(
ctx.template Alloc<T>(&ddscale_tile), C, sample_size);
ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
dx_arr +=
(dy_arr * inv_var_tile_data -
(dy_arr.rowwise().sum().replicate(1, sample_size) / sample_size) *
inv_var_tile_data -
x_sub_mean_mul_invstd_arr * inv_var_tile_data *
(dy_arr * x_sub_mean_mul_invstd_arr)
.rowwise()
.sum()
.replicate(1, sample_size) /
sample_size) *
ddscale_tile_data;
}
}
if (data_layout == DataLayout::kNCHW) {
VLOG(3) << "Transform batchnorm output from NHWC to NCHW";
TransToChannelFirst<Context, T>(ctx, &transformed_dx, dX);
}
}
if (dScale) {
EigenVectorArrayMap<T> dscale_arr(ctx.template Alloc<T>(dScale), C);
dscale_arr.setZero();
if (use_global_stats) {
// math: dscale = np.sum(ddx * dy, axis=(n,h,w)) * inv_var
if (ddX) {
dscale_arr = (ddx_arr * dy_arr * inv_var_tile_data).rowwise().sum();
}
} else {
// math: dscale = inv_var * (dy - np.mean(dy, axis=(n,h,w) - (x-mean) *
// inv_var.pow(2) * np.mean(dy * (x-mean), axis=(n,h,w)))) *
// ddx
if (ddX) {
Tensor first_grad;
first_grad.Resize({C, sample_size});
EigenArrayMap<T> first_grad_arr(
ctx.template Alloc<T>(&first_grad), C, sample_size);
first_grad_arr.setZero();
first_grad_arr +=
inv_var_tile_data *
(dy_arr -
dy_arr.rowwise().sum().replicate(1, sample_size) / sample_size -
x_sub_mean_mul_invstd_arr *
(dy_arr * x_sub_mean_mul_invstd_arr)
.rowwise()
.sum()
.replicate(1, sample_size) /
sample_size);
dscale_arr = (first_grad_arr * ddx_arr).rowwise().sum();
}
}
}
if (ddY) {
ctx.template Alloc<T>(ddY);
EigenArrayMap<T> ddy_arr(
ctx.template Alloc<T>(&transformed_ddy), C, sample_size);
ddy_arr.setZero();
if (use_global_stats) {
// math: ddy = r * ddx * inv_var + ddbias +
// ddscale * (x - mean) * inv_var
if (ddX) {
ddy_arr = scale_tile_data * ddx_arr * inv_var_tile_data;
}
} else {
// math: ddy = (x - mean) * inv_var * ddscale + ddbias +
// scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) *
// np.mean(ddx * (x - mean), axis=(n,h,w)))
if (ddX) {
ddy_arr +=
scale_tile_data * inv_var_tile_data *
(ddx_arr -
ddx_arr.rowwise().sum().replicate(1, sample_size) / sample_size -
x_sub_mean_mul_invstd_arr *
(ddx_arr * x_sub_mean_mul_invstd_arr)
.rowwise()
.sum()
.replicate(1, sample_size) /
sample_size);
}
}
if (ddScale) {
ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
Tensor ddscale_tile;
ddscale_tile.Resize({C, sample_size});
EigenArrayMap<T> ddscale_tile_data(
ctx.template Alloc<T>(&ddscale_tile), C, sample_size);
ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data;
}
if (ddBias) {
ConstEigenVectorArrayMap<T> ddbias_arr(ddBias->data<T>(), C);
Tensor ddbias_tile;
ddbias_tile.Resize({C, sample_size});
EigenArrayMap<T> ddbias_tile_data(
ctx.template Alloc<T>(&ddbias_tile), C, sample_size);
ddbias_tile_data = ddbias_arr.replicate(1, sample_size);
ddy_arr += ddbias_tile_data;
}
if (data_layout == DataLayout::kNCHW) {
VLOG(3) << "Transform batchnorm output from NHWC to NCHW";
TransToChannelFirst<Context, T>(ctx, &transformed_ddy, ddY);
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(
batch_norm_grad, CPU, ALL_LAYOUT, phi::BatchNormGradKernel, float, double) {
}
PD_REGISTER_KERNEL(batch_norm_grad_raw,
CPU,
ALL_LAYOUT,
phi::BatchNormGradRawKernel,
float,
double) {}
PD_REGISTER_KERNEL(batch_norm_grad_grad,
CPU,
ALL_LAYOUT,
phi::BatchNormDoubleGradKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/batch_norm_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/fluid/framework/tensor_util.h"
namespace phi {
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T, typename Context>
void BatchNormKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& mean,
const DenseTensor& variance,
float momentum,
float epsilon,
const std::string& data_layout_str,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor* y,
DenseTensor* mean_out,
DenseTensor* variance_out,
DenseTensor* saved_mean,
DenseTensor* saved_variance,
DenseTensor* reserve_space) {
bool test_mode = is_test && (!trainable_statistics);
bool global_stats = test_mode || use_global_stats;
auto data_layout = paddle::framework::StringToDataLayout(data_layout_str);
const auto& x_dims = x.dims();
PADDLE_ENFORCE_GE(
x_dims.size(),
2,
phi::errors::InvalidArgument(
"The size of input X's dimensions should be larger than 1."
"But received: the size of input X's dimensions is [%d]",
x_dims.size()));
PADDLE_ENFORCE_LE(
x_dims.size(),
5,
phi::errors::InvalidArgument(
"The size of input X's dimensions should be less than 6."
"But received: the size of input X's dimensionss is [%d]",
x_dims.size()));
const int N = x_dims[0];
const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
const int sample_size = x.numel() / N / C;
// alloc memory
ctx.template Alloc<T>(y);
ctx.template Alloc<T>(mean_out);
ctx.template Alloc<T>(variance_out);
ctx.template Alloc<T>(saved_mean);
ctx.template Alloc<T>(saved_variance);
// input dimension is 2 and the format is NCHW. The input can be regarded
// as NHWC format
if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) {
data_layout = DataLayout::kNHWC;
}
if (!global_stats) {
// saved_xx is use just in this batch of data
EigenVectorArrayMap<T> saved_mean_e(ctx.template Alloc<T>(saved_mean), C);
EigenVectorArrayMap<T> saved_variance_e(
ctx.template Alloc<T>(saved_variance), C);
saved_mean_e.setZero();
saved_variance_e.setZero();
EigenVectorArrayMap<T> running_mean_arr(ctx.template Alloc<T>(mean_out), C);
EigenVectorArrayMap<T> running_var_arr(ctx.template Alloc<T>(variance_out),
C);
if ((N * sample_size) == 1) {
// Only 1 element in normalization dimension,
// we skip the batch norm calculation, let y = x.
paddle::framework::TensorCopy(x, ctx.GetPlace(), y);
return;
}
switch (data_layout) {
case DataLayout::kNCHW: {
ConstEigenArrayMap<T> x_arr(x.data<T>(), sample_size, N * C);
for (int nc = 0; nc < N * C; ++nc) {
saved_mean_e(nc % C) += x_arr.col(nc).sum();
}
saved_mean_e /= N * sample_size;
for (int nc = 0; nc < N * C; ++nc) {
saved_variance_e(nc % C) +=
(x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm();
}
saved_variance_e /= N * sample_size;
break;
}
case DataLayout::kNHWC: {
ConstEigenArrayMap<T> x_arr(x.data<T>(), C, N * sample_size);
for (int i = 0; i < N * sample_size; ++i) {
saved_mean_e += x_arr.col(i);
}
saved_mean_e /= N * sample_size;
for (int i = 0; i < N * sample_size; ++i) {
saved_variance_e +=
(x_arr.col(i) - saved_mean_e) * (x_arr.col(i) - saved_mean_e);
}
saved_variance_e /= N * sample_size;
break;
}
default:
PADDLE_THROW(phi::errors::InvalidArgument("Unknown storage order: %s",
data_layout_str));
}
// if MomentumTensor is set, use MomentumTensor value, momentum
// is only used in this training branch
running_mean_arr =
running_mean_arr * momentum + saved_mean_e * (1. - momentum);
running_var_arr =
running_var_arr * momentum + saved_variance_e * (1. - momentum);
}
// use SavedMean and SavedVariance to do normalize
Eigen::Array<T, Eigen::Dynamic, 1> inv_std(C);
if (global_stats) {
ConstEigenVectorArrayMap<T> var_arr(variance.data<T>(), C);
inv_std = (var_arr + epsilon).sqrt().inverse();
} else {
EigenVectorArrayMap<T> saved_inv_std(saved_variance->data<T>(), C);
// inverse SavedVariance first, gradient will use it too.
saved_inv_std = (saved_inv_std + epsilon).inverse().sqrt();
inv_std = saved_inv_std;
}
ConstEigenVectorArrayMap<T> mean_arr(
global_stats ? mean.data<T>() : saved_mean->data<T>(), C);
// ((x - est_mean) * (inv_var) * scale + bias
// formula transform ====>
// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
ConstEigenVectorArrayMap<T> scale_arr(scale.data<T>(), C);
ConstEigenVectorArrayMap<T> bias_arr(bias.data<T>(), C);
Eigen::Array<T, Eigen::Dynamic, 1> new_scale = inv_std * scale_arr;
Eigen::Array<T, Eigen::Dynamic, 1> new_bias =
bias_arr - mean_arr * inv_std * scale_arr;
switch (data_layout) {
case DataLayout::kNCHW: {
EigenArrayMap<T> y_arr(ctx.template Alloc<T>(y), sample_size, N * C);
ConstEigenArrayMap<T> x_arr(x.data<T>(), sample_size, N * C);
for (int nc = 0; nc < N * C; ++nc) {
y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C);
}
break;
}
case DataLayout::kNHWC: {
EigenArrayMap<T>(ctx.template Alloc<T>(y), C, N * sample_size) =
(ConstEigenArrayMap<T>(x.data<T>(), C, N * sample_size).colwise() *
new_scale)
.colwise() +
new_bias;
break;
}
default:
PADDLE_THROW(phi::errors::InvalidArgument("Unknown storage order: %d",
data_layout));
}
}
} // namespace phi
PD_REGISTER_KERNEL(
batch_norm, CPU, ALL_LAYOUT, phi::BatchNormKernel, float, double) {}
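// Per-channel form of the transform applied above, using the precomputed
// new_scale / new_bias:
//   y = (x - mean[c]) * inv_std[c] * scale[c] + bias[c]
//     = x * new_scale[c] + new_bias[c]
// Illustrative numbers: mean = 1, var = 4, eps = 1e-5, scale = 2, bias = 0.5
// give new_scale ~ 1.0 and new_bias ~ -0.5, so x = 3 maps to y ~ 2.5.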
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/gaussian_random_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/framework/generator.h"
namespace phi {
template <typename T, typename Context>
void GaussianRandomKernel(const Context& dev_ctx,
const ScalarArray& shape,
float mean,
float std,
int seed,
DataType dtype,
DenseTensor* out) {
auto tensor = out;
std::normal_distribution<T> dist(mean, std);
tensor->Resize(phi::make_ddim(shape.GetData()));
int64_t size = tensor->numel();
T* data = dev_ctx.template Alloc<T>(tensor);
auto engine = paddle::framework::GetCPURandomEngine(seed);
for (int64_t i = 0; i < size; ++i) {
data[i] = dist(*engine);
}
}
} // namespace phi
PD_REGISTER_KERNEL(gaussian_random,
CPU,
ALL_LAYOUT,
phi::GaussianRandomKernel,
float,
double) {}
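// A direct-call sketch (hypothetical host code, assuming an already
// initialized phi::CPUContext `ctx`; real callers go through the kernel
// registry):
//   phi::DenseTensor out;
//   phi::GaussianRandomKernel<float, phi::CPUContext>(
//       ctx, phi::ScalarArray(std::vector<int64_t>{2, 3}), /*mean=*/0.f,
//       /*std=*/1.f, /*seed=*/0, phi::DataType::FLOAT32, &out);
//   // out is resized to {2, 3} and filled with draws from N(0, 1).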
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/pad_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/pad_grad_kernel_impl.h"
PD_REGISTER_KERNEL(pad_grad,
CPU,
ALL_LAYOUT,
phi::PadGradKernel,
float,
double,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/pad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/pad_kernel_impl.h"
PD_REGISTER_KERNEL(pad,
CPU,
ALL_LAYOUT,
phi::PadKernel,
float,
double,
int,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
...@@ -15,21 +15,26 @@ limitations under the License. */
#pragma once
#include <utility>
#include <vector>
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
namespace phi {
namespace funcs {
template <typename T,
          size_t D,
          int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenTensor = EigenTensor<T, D, MajorType, IndexType>;
template <typename DeviceContext, typename T, size_t D>
void PadFunction(const DeviceContext& context,
                 const std::vector<int>& pads,
                 const DenseTensor& src,
                 T pad_value,
                 DenseTensor* out) {
  std::array<std::pair<int64_t, int64_t>, D> paddings;
  for (size_t i = 0; i < paddings.size(); ++i) {
...@@ -40,16 +45,16 @@ void PadFunction(const framework::ExecutionContext& context,
  auto src_tensor = EigenTensor<T, D>::From(src);
  auto out_tensor = EigenTensor<T, D>::From(*out);
  auto& place = *(context.eigen_device());
  EigenPad<std::decay_t<decltype(place)>, T, D>::Eval(
      place, out_tensor, src_tensor, paddings, pad_value);
}
template <typename DeviceContext, typename T, size_t D>
void PadGradFunction(const DeviceContext& context,
                     const std::vector<int>& pads,
                     const DenseTensor& src,
                     DenseTensor* d_out) {
  std::array<std::pair<int64_t, int64_t>, D> paddings;
  for (size_t i = 0; i < paddings.size(); ++i) {
    paddings[i].first = -pads[i * 2];
...@@ -58,16 +63,18 @@ void PadGradFunction(const framework::ExecutionContext& context,
  auto d_out_tensor = EigenTensor<T, D>::From(*d_out);
  auto src_tensor = EigenTensor<T, D>::From(src);
  auto& place = *(context.eigen_device());
  EigenPad<std::decay_t<decltype(place)>, T, D>::Eval(
      place, d_out_tensor, src_tensor, paddings, static_cast<T>(0));
}
template <typename DeviceContext, typename T>
void PaddingFunctor(int rank,
                    const DeviceContext& context,
                    const std::vector<int>& pads,
                    T pad_value,
                    const DenseTensor& src,
                    DenseTensor* out) {
  switch (rank) {
    case 1:
      PadFunction<DeviceContext, T, 1>(context, pads, src, pad_value, out);
...@@ -88,16 +95,18 @@ void PaddingFunctor(int rank, const framework::ExecutionContext& context,
      PadFunction<DeviceContext, T, 6>(context, pads, src, pad_value, out);
      break;
    default:
      PADDLE_THROW(
          phi::errors::Unimplemented("PadOp only support tensors with no more"
                                     " than 6 dimensions currently."));
  }
}
template <typename DeviceContext, typename T>
void PaddingGradFunctor(int rank,
                        const DeviceContext& context,
                        const std::vector<int>& pads,
                        const DenseTensor& src,
                        DenseTensor* out) {
  switch (rank) {
    case 1:
      PadGradFunction<DeviceContext, T, 1>(context, pads, src, out);
...@@ -118,8 +127,8 @@ void PaddingGradFunctor(int rank, const framework::ExecutionContext& context,
      PadGradFunction<DeviceContext, T, 6>(context, pads, src, out);
      break;
    default:
      PADDLE_THROW(
          phi::errors::Unimplemented("PadOp only support tensors with no more"
                                     " than 6 dimensions currently."));
  }
}
...@@ -137,6 +146,5 @@ inline bool IsSymmetricPadding(const std::vector<int>& pads,
  }
  return is_sys_pad;
}
} // namespace funcs
} // namespace phi
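// Usage sketch (assumed caller, with `ctx` an initialized phi::CPUContext,
// `src` a 4-D float DenseTensor and `out` already resized/allocated to the
// padded shape): pads holds two ints per dimension in (low, high) order, so
// padding one element on each side of the last two dims is
//   std::vector<int> pads = {0, 0, 0, 0, 1, 1, 1, 1};
//   phi::funcs::PaddingFunctor<phi::CPUContext, float>(
//       /*rank=*/4, ctx, pads, /*pad_value=*/0.f, src, &out);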
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/common/scalar_array.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/device_context.h"
namespace phi {
template <typename T, typename Context>
void GaussianRandomKernel(const Context& ctx,
const ScalarArray& shape,
float mean,
float std,
int seed,
DataType dtype,
DenseTensor* out);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/batch_norm_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/fluid/operators/norm_utils.cu.h"
#include "paddle/fluid/operators/norm_utils.h"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/operators/layout_utils.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/flags.h"
#include "paddle/phi/kernels/gpu/batch_norm_utils.h"
#ifdef __HIPCC__
#define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim)
#else
#define LAUNCH_BOUNDS(BlockDim)
#endif
DECLARE_bool(cudnn_batchnorm_spatial_persistent);
namespace phi {
template <typename T>
using CudnnDataType = paddle::platform::CudnnDataType<T>;
template <typename T>
using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
template <typename T, int BlockDim, phi::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void KeBNBackwardScaleBias(
const T *dy,
const T *x,
const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance,
const double epsilon,
const int N,
const int C,
const int HxW,
BatchNormParamType<T> *dscale,
BatchNormParamType<T> *dbias) {
const int outer_size = C;
const int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage ds_storage;
__shared__ typename BlockReduce::TempStorage db_storage;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> ds_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> db_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> inv_var_i = 1.0 / sqrt(variance[i] + epsilon);
BatchNormParamType<T> mean_i = mean[i];
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
ds_sum += static_cast<BatchNormParamType<T>>(dy[index]) *
(static_cast<BatchNormParamType<T>>(x[index]) - mean_i);
db_sum += static_cast<BatchNormParamType<T>>(dy[index]);
}
ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
if (threadIdx.x == 0) {
dscale[i] = ds_sum * inv_var_i;
dbias[i] = db_sum;
}
__syncthreads();
}
}
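// Computes the input gradient when the statistics are treated as constants
// (e.g. the global/running statistics path):
//   dx = dy * scale[c] / sqrt(variance[c] + epsilon)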
template <typename T, phi::DataLayout layout>
static __global__ void KeBNBackwardData(const T *dy,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *variance,
const double epsilon,
const int C,
const int HxW,
const int num,
T *dx) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = gid; i < num; i += stride) {
const int c = layout == phi::DataLayout::kNCHW ? i / HxW % C : i % C;
BatchNormParamType<T> inv_var = 1.0 / sqrt(variance[c] + epsilon);
dx[i] = static_cast<T>(static_cast<BatchNormParamType<T>>(dy[i]) *
scale[c] * inv_var);
}
}
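// Recovers x from y for the inplace case by inverting
//   y = scale * (x - mean) * inv_std + bias.
// Note: the `variance` argument is used directly as the inverse standard
// deviation saved in the forward pass; `epsilon` does not enter the formula.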
template <typename T>
static __global__ void KeBNRestoreData(const phi::DataLayout layout,
T *x,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance,
double epsilon,
int C,
int M,
const int num,
const T *y) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = gid; i < num; i += stride) {
const int c = layout == phi::DataLayout::kNCHW ? (i / M) % C : i % C;
auto y_i = static_cast<BatchNormParamType<T>>(y[i]);
auto x_i = (y_i - bias[c]) / scale[c] / variance[c] + mean[c];
x[i] = static_cast<T>(x_i);
}
}
template <typename T>
class InplaceHelper {
public:
void operator()(const phi::DataLayout layout,
T *x,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance,
double epsilon,
int C,
int M,
const int num,
const T *y,
int grid2,
const int block,
const gpuStream_t &stream) {
PADDLE_ENFORCE_EQ(x,
y,
phi::errors::InvalidArgument(
"X and Y should be inplaced in inplace mode"));
KeBNRestoreData<<<grid2, block, 0, stream>>>(
layout, x, scale, bias, mean, variance, epsilon, C, M, num, y);
}
};
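// Fused batch norm backward for training: reuses the saved per-channel mean
// and inverse std when available (otherwise recomputes them), then applies
//   dscale[c] = sum(dy * (x - mean)) * inv_std
//   dbias[c]  = sum(dy)
//   dx = scale * inv_std * (dy - dbias / m - (x - mean) * inv_std * dscale / m)
// with m = N * HxW, one block per channel (grid-strided) and cub::BlockReduce.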
template <typename T, int BlockDim, phi::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackward(
const T *dy,
const T *x,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *saved_mean,
const BatchNormParamType<T> *saved_inv_variance,
const int C,
const int N,
const int HxW,
const double epsilon,
T *dx,
BatchNormParamType<T> *dscale,
BatchNormParamType<T> *dbias) {
const int outer_size = C;
const int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage ds_storage;
__shared__ typename BlockReduce::TempStorage db_storage;
__shared__ typename BlockReduce::TempStorage mean_storage;
  __shared__ typename BlockReduce::TempStorage variance_storage;
__shared__ BatchNormParamType<T> inv_var_val;
__shared__ BatchNormParamType<T> mean_val;
__shared__ BatchNormParamType<T> dscale_val;
__shared__ BatchNormParamType<T> dbias_val;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> ds_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> db_sum = static_cast<BatchNormParamType<T>>(0);
if (saved_mean && saved_inv_variance) {
if (threadIdx.x == 0) {
inv_var_val = saved_inv_variance[i];
mean_val = saved_mean[i];
}
} else {
BatchNormParamType<T> x_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> x_square_sum =
static_cast<BatchNormParamType<T>>(0);
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> x_i =
static_cast<BatchNormParamType<T>>(x[index]);
x_sum += x_i;
x_square_sum += x_i * x_i;
}
x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum());
x_square_sum =
          BlockReduce(variance_storage).Reduce(x_square_sum, cub::Sum());
if (threadIdx.x == 0) {
mean_val = x_sum / inner_size;
inv_var_val =
1 / sqrt(x_square_sum / inner_size - mean_val * mean_val + epsilon);
}
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> dy_i =
static_cast<BatchNormParamType<T>>(dy[index]);
ds_sum +=
dy_i * (static_cast<BatchNormParamType<T>>(x[index]) - mean_val);
db_sum += dy_i;
}
ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
if (threadIdx.x == 0) {
dscale_val = ds_sum * inv_var_val;
dbias_val = db_sum;
dscale[i] = dscale_val;
dbias[i] = dbias_val;
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
dx[index] = scale[i] * inv_var_val *
(static_cast<BatchNormParamType<T>>(dy[index]) -
dbias_val / static_cast<BatchNormParamType<T>>(inner_size) -
(static_cast<BatchNormParamType<T>>(x[index]) - mean_val) *
inv_var_val * dscale_val / inner_size);
}
}
}
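// Input-gradient-only backward using saved statistics; the `variance` argument
// is used as the saved inverse standard deviation:
//   dx = scale * inv_std *
//        (dy - mean(dy) - (x - mean) * inv_std^2 * mean(dy * (x - mean)))
// where the means are taken over the N * HxW elements of each channel.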
template <typename T, int BlockDim, phi::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackwardData(
const T *dy,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *mean,
const T *x,
const BatchNormParamType<T> *variance,
const int C,
const int N,
const int HxW,
T *dx) {
const int outer_size = C;
const int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage dy_storage;
__shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage;
__shared__ BatchNormParamType<T> dy_sum_val;
__shared__ BatchNormParamType<T> dy_x_sub_mean_sum_val;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> inv_var_i = variance[i];
BatchNormParamType<T> mean_i = mean[i];
BatchNormParamType<T> dy_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> dy_x_sub_mean_sum =
static_cast<BatchNormParamType<T>>(0);
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> dy_i =
static_cast<BatchNormParamType<T>>(dy[index]);
dy_sum += dy_i;
dy_x_sub_mean_sum +=
dy_i * (static_cast<BatchNormParamType<T>>(x[index]) - mean_i);
}
dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
dy_x_sub_mean_sum = BlockReduce(dy_x_sub_mean_storage)
.Reduce(dy_x_sub_mean_sum, cub::Sum());
if (threadIdx.x == 0) {
dy_sum_val = dy_sum;
dy_x_sub_mean_sum_val = dy_x_sub_mean_sum;
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
dx[index] =
(static_cast<BatchNormParamType<T>>(dy[index]) -
dy_sum_val / static_cast<BatchNormParamType<T>>(inner_size) -
(static_cast<BatchNormParamType<T>>(x[index]) - mean_i) *
dy_x_sub_mean_sum_val * inv_var_i * inv_var_i / inner_size) *
scale[i] * inv_var_i;
}
}
}
template <typename T, typename Context>
void BatchNormGradRawKernel(const Context &ctx,
const DenseTensor &y_grad,
const DenseTensor &x,
const DenseTensor &scale,
const DenseTensor &bias,
const DenseTensor &saved_mean,
const DenseTensor &saved_variance,
paddle::optional<const DenseTensor &> reserve_space,
paddle::optional<const DenseTensor &> mean,
paddle::optional<const DenseTensor &> variance,
float momentum,
float epsilon_f,
const std::string &data_layout_str,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
bool is_inplace,
DenseTensor *x_grad,
DenseTensor *scale_grad,
DenseTensor *bias_grad) {
double epsilon = static_cast<double>(epsilon_f);
const DataLayout data_layout =
paddle::framework::StringToDataLayout(data_layout_str);
const auto *d_y = &y_grad;
auto *d_x = x_grad;
auto *d_scale = scale_grad;
auto *d_bias = bias_grad;
use_global_stats = is_test || use_global_stats;
const auto &x_dims = x.dims();
PADDLE_ENFORCE_EQ(
x_dims.size() >= 2 && x_dims.size() <= 5,
true,
phi::errors::InvalidArgument(
"The size of input's dimensions should be between 2 and 5."
"But received: the size of input's dimensions is [%d],"
"the dimensions of input is [%s]",
x_dims.size(),
x_dims));
int N, C, H, W, D;
paddle::operators::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
// init output
if (d_x) {
ctx.template Alloc<T>(d_x);
}
if (d_scale && d_bias) {
d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
}
PADDLE_ENFORCE_EQ(
scale.dims().size(),
1UL,
phi::errors::InvalidArgument(
"The size of scale's dimensions must equal to 1. But received: "
"the size of scale's dimensions is [%d], the dimensions of scale "
"is [%s].",
scale.dims().size(),
scale.dims()));
PADDLE_ENFORCE_EQ(
scale.dims()[0],
C,
phi::errors::InvalidArgument(
"The first dimension of scale must equal to Channels[%d]. But "
"received: the first dimension of scale is [%d]",
C,
scale.dims()[0]));
auto dtype = paddle::platform::CudnnDataType<T>::type;
#ifdef PADDLE_WITH_HIP
auto compute_format =
data_layout == DataLayout::kNHWC ? DataLayout::kNHWC : DataLayout::kNCHW;
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// HIP do not support compute format of NHWC
// auto compute_format = DataLayout::kNCHW;
#else
const bool fast_nhwc_batch_norm = dtype == CUDNN_DATA_HALF &&
FLAGS_cudnn_batchnorm_spatial_persistent &&
(reserve_space.get_ptr() != nullptr);
auto compute_format = fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC
? DataLayout::kNHWC
: DataLayout::kNCHW;
#endif
DenseTensor transformed_x(x.type());
DenseTensor transformed_d_y(d_y->type());
DenseTensor transformed_d_x;
if (data_layout == DataLayout::kNHWC && compute_format == DataLayout::kNCHW &&
x_dims.size() > 2) {
VLOG(3) << "Transform input tensor from NHWC to NCHW.";
ResizeToChannelFirst<Context, T>(ctx, &x, &transformed_x);
TransToChannelFirst<Context, T>(ctx, &x, &transformed_x);
ResizeToChannelFirst<Context, T>(ctx, d_y, &transformed_d_y);
TransToChannelFirst<Context, T>(ctx, d_y, &transformed_d_y);
if (d_x) {
ResizeToChannelFirst<Context, T>(ctx, d_x, &transformed_d_x);
}
} else {
transformed_x.ShareDataWith(x);
transformed_d_y.ShareDataWith(*d_y);
if (d_x) {
transformed_d_x.ShareDataWith(*d_x);
}
}
std::vector<int> dims;
std::vector<int> strides;
if (compute_format == DataLayout::kNCHW) {
dims = {N, C, H, W, D};
strides = {C * H * W * D, H * W * D, W * D, D, 1};
} else {
dims = {N, C, H, W, D};
strides = {H * W * C * D, 1, W * D * C, D * C, C};
}
const int num = transformed_x.numel();
#ifdef PADDLE_WITH_HIP
const int block = 256;
#else
const int block = 512;
#endif
int max_threads = ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
int grid1 = (num + block - 1) / block;
int grid2 = std::min(C, max_blocks);
auto stream = ctx.stream();
InplaceHelper<T> inplace_functor;
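  // Two main paths: when batch statistics are used (!use_global_stats), the
  // gradients are computed with cuDNN (or the fused BNBackward kernel on HIP)
  // when dx, dscale and dbias are all requested, and with the custom
  // BNBackwardData / KeBNBackwardScaleBias kernels otherwise. When
  // use_global_stats is set, the running statistics are treated as constants
  // and the KeBNBackward* kernels in the else-branch below are used.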
if (!use_global_stats) {
if ((N * H * W * D) == 1) {
if (d_x) {
paddle::framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
}
phi::funcs::SetConstant<Context, BatchNormParamType<T>> functor;
functor(ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
functor(ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
return;
}
// ------------------- cudnn descriptors ---------------------
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// miopenTensorDescriptor_t data_desc_;
// miopenTensorDescriptor_t bn_param_desc_;
// miopenBatchNormMode_t mode_;
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
#else
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t bn_param_desc_;
cudnnBatchNormMode_t mode_;
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(
&bn_param_desc_));
#endif
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// mode_ = miopenBNSpatial;
#elif CUDNN_VERSION_MIN(7, 0, 1)
if (FLAGS_cudnn_batchnorm_spatial_persistent) {
mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
} else if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#else
if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#endif // CUDNN_VERSION_MIN(7, 0, 1)
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
// data_desc_, CudnnDataType<T>::type,
// x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
// const_cast<int *>(strides.data())));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_,
// data_desc_, mode_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_,
CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4,
dims.data(),
strides.data()));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDeriveBNTensorDescriptor(
bn_param_desc_, data_desc_, mode_));
#endif
const auto *saved_mean_data =
saved_mean.template data<BatchNormParamType<T>>();
const auto *saved_var_data =
saved_variance.template data<BatchNormParamType<T>>();
if (is_inplace) {
inplace_functor(compute_format,
transformed_x.data<T>(),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
saved_mean_data,
saved_var_data,
epsilon,
C,
H * W * D,
num,
transformed_x.data<T>(),
grid2,
block,
stream);
}
// This branch calls CUDNN APIs
if (d_x && d_scale && d_bias) {
bool called = false;
#if CUDNN_VERSION_MIN(7, 4, 1)
called = true;
size_t workspace_size = 0;
void *workspace_ptr = nullptr;
DenseTensor workspace_tensor;
auto reserve_space_size = reserve_space->memory_size();
// --------------- cudnn batchnorm workspace ---------------
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::
cudnnGetBatchNormalizationBackwardExWorkspaceSize(
/*handle=*/ctx.cudnn_handle(),
/*mode=*/mode_,
                  /*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
/*xDesc=*/data_desc_,
/*yDesc=*/data_desc_,
/*dyDesc=*/data_desc_,
/*dzDesc=*/nullptr,
/*dxDesc=*/data_desc_,
/*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
/*activationDesc=*/nullptr,
/*sizeInBytes=*/&workspace_size));
workspace_ptr = workspace_tensor.mutable_data(
ctx.GetPlace(), transformed_x.type(), workspace_size);
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationBackwardEx(
/*handle=*/ctx.cudnn_handle(),
/*mode=*/mode_,
/*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
/*alphaDataDiff=*/CudnnDataType<T>::kOne(),
/*betaDataDiff=*/CudnnDataType<T>::kZero(),
/*alphaParamDiff=*/CudnnDataType<T>::kOne(),
/*betaParamDiff=*/CudnnDataType<T>::kZero(),
/*xDesc=*/data_desc_,
/*xData=*/transformed_x.template data<T>(),
/*yDesc=*/nullptr,
/*yData=*/nullptr,
/*dyDesc=*/data_desc_,
/*dyData=*/transformed_d_y.template data<T>(),
/*dzDesc=*/nullptr,
/*dzData=*/nullptr,
/*dxDesc=*/data_desc_,
/*dxData=*/ctx.template Alloc<T>(&transformed_d_x),
/*dBnScaleBiasDesc=*/bn_param_desc_,
/*bnScaleData=*/scale.template data<BatchNormParamType<T>>(),
/*bnBiasData=*/nullptr,
/*dBnScaleData=*/d_scale
->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
/*dBnBiasData=*/d_bias
->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
/*epsilon=*/epsilon,
/*savedMean=*/saved_mean_data,
/*savedInvVariance=*/saved_var_data,
/*activationDesc=*/nullptr,
/*workspace=*/workspace_ptr,
/*workSpaceSizeInBytes=*/workspace_size,
/*reserveSpace=*/const_cast<T *>(
reserve_space->template data<T>()),
/*reserveSpaceSizeInBytes=*/reserve_space_size));
#endif // CUDNN_VERSION_MIN(7, 4, 1)
if (!called) {
#ifdef PADDLE_WITH_HIP
if (compute_format == DataLayout::kNCHW) {
BNBackward<T,
block,
DataLayout::kNCHW><<<grid2, block, 0, ctx.stream()>>>(
transformed_d_y.template data<T>(),
transformed_x.template data<T>(),
scale.template data<BatchNormParamType<T>>(),
saved_mean_data,
saved_var_data,
C,
N,
H * W * D,
epsilon,
transformed_d_x.template data<T>(),
d_scale->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()));
} else {
BNBackward<T,
block,
DataLayout::kNHWC><<<grid2, block, 0, ctx.stream()>>>(
transformed_d_y.template data<T>(),
transformed_x.template data<T>(),
scale.template data<BatchNormParamType<T>>(),
saved_mean_data,
saved_var_data,
C,
N,
H * W * D,
epsilon,
transformed_d_x.template data<T>(),
d_scale->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()));
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationBackward(
// dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
// CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
// CudnnDataType<T>::kZero(), data_desc_,
// transformed_x.template data<T>(), data_desc_,
// transformed_d_y.template data<T>(), data_desc_,
// transformed_d_x.template mutable_data<T>(ctx.GetPlace()),
// bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
// d_scale->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace()),
// d_bias->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace()),
// epsilon, saved_mean_data, saved_var_data));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationBackward(
ctx.cudnn_handle(),
mode_,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
transformed_x.template data<T>(),
data_desc_,
transformed_d_y.template data<T>(),
data_desc_,
ctx.template Alloc<T>(&transformed_d_x),
bn_param_desc_,
scale.template data<BatchNormParamType<T>>(),
d_scale->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
d_bias->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon,
saved_mean_data,
saved_var_data));
#endif
}
if (data_layout == DataLayout::kNHWC &&
compute_format == DataLayout::kNCHW) {
VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
TransToChannelLast<Context, T>(ctx, &transformed_d_x, d_x);
}
} else {
      // This branch calls custom CUDA kernels.
if (compute_format == DataLayout::kNCHW) {
if (d_x) {
BNBackwardData<
T,
block,
phi::DataLayout::kNCHW><<<grid2, block, 0, ctx.stream()>>>(
d_y->data<T>(),
scale.data<BatchNormParamType<T>>(),
saved_mean_data,
x.data<T>(),
saved_var_data,
C,
N,
H * W * D,
d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T,
block,
phi::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
d_y->data<T>(),
x.data<T>(),
saved_mean_data,
saved_var_data,
epsilon,
N,
C,
H * W * D,
d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
} else {
if (d_x) {
BNBackwardData<
T,
block,
phi::DataLayout::kNHWC><<<grid2, block, 0, ctx.stream()>>>(
d_y->data<T>(),
scale.data<BatchNormParamType<T>>(),
saved_mean_data,
x.data<T>(),
saved_var_data,
C,
N,
H * W * D,
d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T,
block,
phi::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
d_y->data<T>(),
x.data<T>(),
saved_mean_data,
saved_var_data,
epsilon,
N,
C,
H * W * D,
d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
}
}
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// clean when exit.
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
#else
// clean when exit.
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(
bn_param_desc_));
#endif
} else {
const auto *running_mean = mean.get_ptr();
const auto *running_var = variance.get_ptr();
const auto *running_mean_data =
running_mean->template data<BatchNormParamType<T>>();
const auto *running_var_data =
running_var->template data<BatchNormParamType<T>>();
if (is_inplace) {
auto px = x;
inplace_functor(data_layout,
ctx.template Alloc<T>(&px),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
running_mean_data,
running_var_data,
epsilon,
C,
H * W * D,
num,
x.data<T>(),
grid2,
block,
stream);
}
if (compute_format == DataLayout::kNCHW) {
if (d_x) {
KeBNBackwardData<T,
phi::DataLayout::kNCHW><<<grid1, block, 0, stream>>>(
d_y->data<T>(),
scale.data<BatchNormParamType<T>>(),
running_var_data,
epsilon,
C,
H * W,
num,
d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T,
block,
phi::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
d_y->data<T>(),
x.data<T>(),
running_mean_data,
running_var_data,
epsilon,
N,
C,
H * W * D,
d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
} else {
if (d_x) {
KeBNBackwardData<T,
phi::DataLayout::kNHWC><<<grid1, block, 0, stream>>>(
d_y->data<T>(),
scale.data<BatchNormParamType<T>>(),
running_var_data,
epsilon,
C,
H * W,
num,
d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<
T,
block,
phi::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
d_y->data<T>(),
x.data<T>(),
running_mean_data,
running_var_data,
epsilon,
N,
C,
H * W * D,
d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
}
}
}
template <typename T, typename Context>
void BatchNormGradKernel(const Context &dev_ctx,
const DenseTensor &y_grad,
const DenseTensor &x,
const DenseTensor &scale,
const DenseTensor &bias,
const DenseTensor &saved_mean,
const DenseTensor &saved_variance,
paddle::optional<const DenseTensor &> reserve_space,
paddle::optional<const DenseTensor &> mean,
paddle::optional<const DenseTensor &> variance,
float momentum,
float epsilon,
const std::string &data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor *x_grad,
DenseTensor *scale_grad,
DenseTensor *bias_grad) {
BatchNormGradRawKernel<T, Context>(dev_ctx,
y_grad,
x,
scale,
bias,
saved_mean,
saved_variance,
reserve_space,
mean,
variance,
momentum,
epsilon,
data_layout,
is_test,
use_global_stats,
trainable_statistics,
fuse_with_relu,
false,
x_grad,
scale_grad,
bias_grad);
}
template <typename T, typename Context>
void BatchNormDoubleGradKernel(const Context &ctx,
const DenseTensor &x_grad_grad,
const DenseTensor &scale_grad_grad,
const DenseTensor &bias_grad_grad,
const DenseTensor &y_grad,
const DenseTensor &x,
const DenseTensor &scale,
const DenseTensor &saved_mean,
const DenseTensor &saved_variance,
paddle::optional<const DenseTensor &> mean,
paddle::optional<const DenseTensor &> variance,
float momentum,
float epsilon,
const std::string &data_layout_str,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor *x_grad,
DenseTensor *scale_grad,
DenseTensor *y_grad_grad) {
PADDLE_ENFORCE_EQ(is_test,
false,
phi::errors::InvalidArgument(
"`is_test = True` CANNOT be used in train program. If "
"you want to use global status in pre_train model, "
"please set `use_global_stats = True`"));
const DataLayout data_layout =
paddle::framework::StringToDataLayout(data_layout_str);
const DenseTensor *running_mean = nullptr;
const DenseTensor *running_variance = nullptr;
if (use_global_stats) {
running_mean = mean.get_ptr();
running_variance = variance.get_ptr();
}
paddle::operators::NormDoubleGradFunctor<Context, T>(ctx,
data_layout,
&x,
&scale,
&y_grad,
&saved_mean,
&saved_variance,
running_mean,
running_variance,
epsilon,
use_global_stats,
&x_grad_grad,
&scale_grad_grad,
&bias_grad_grad,
x_grad,
scale_grad,
y_grad_grad);
}
} // namespace phi
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(batch_norm_grad,
GPU,
ALL_LAYOUT,
phi::BatchNormGradKernel,
float,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(batch_norm_grad_raw,
GPU,
ALL_LAYOUT,
phi::BatchNormGradRawKernel,
float,
phi::dtype::float16) {}
#else
PD_REGISTER_KERNEL(batch_norm_grad,
GPU,
ALL_LAYOUT,
phi::BatchNormGradKernel,
float,
double,
phi::dtype::float16) {
if (kernel_key.dtype() == phi::DataType::FLOAT16) {
kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32);
}
}
PD_REGISTER_KERNEL(batch_norm_grad_raw,
GPU,
ALL_LAYOUT,
phi::BatchNormGradRawKernel,
float,
double,
phi::dtype::float16) {
if (kernel_key.dtype() == phi::DataType::FLOAT16) {
kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32);
}
}
#endif
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(batch_norm_grad_grad,
GPU,
ALL_LAYOUT,
phi::BatchNormDoubleGradKernel,
float,
double) {}
#else
PD_REGISTER_KERNEL(batch_norm_grad_grad,
GPU,
ALL_LAYOUT,
phi::BatchNormDoubleGradKernel,
float,
double) {}
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef __NVCC__
#include "cub/cub.cuh"
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/batch_norm_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/fluid/operators/norm_utils.cu.h"
#include "paddle/fluid/operators/norm_utils.h"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/operators/layout_utils.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/flags.h"
#include "paddle/phi/kernels/gpu/batch_norm_utils.h"
#ifdef __HIPCC__
#define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim)
#else
#define LAUNCH_BOUNDS(BlockDim)
#endif
DECLARE_bool(cudnn_batchnorm_spatial_persistent);
namespace phi {
template <typename T>
using CudnnDataType = paddle::platform::CudnnDataType<T>;
template <typename T>
using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
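// Inference-time batch norm using the provided (running) statistics:
//   y = scale[c] * (x - mean[c]) / sqrt(variance[c] + epsilon) + bias[c]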
template <typename T, phi::DataLayout layout>
static __global__ void BNForwardInference(const T *x,
const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const int C,
const int N,
const int HxW,
const double epsilon,
T *y) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
int num = N * C * HxW;
for (int i = gid; i < num; i += stride) {
const int c = layout == phi::DataLayout::kNCHW ? i / HxW % C : i % C;
BatchNormParamType<T> x_sub_mean =
static_cast<BatchNormParamType<T>>(x[i]) - mean[c];
BatchNormParamType<T> inv_var = 1 / sqrt(variance[c] + epsilon);
y[i] = static_cast<T>(scale[c] * x_sub_mean * inv_var + bias[c]);
}
}
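// Training-time batch norm: each block (grid-striding over channels) computes
// the channel mean and variance with cub::BlockReduce, updates the running
// statistics with an exponential moving average, optionally stores the batch
// mean / inverse std for the backward pass, and then normalizes:
//   y = scale * (x - mean) * inv_std + bias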
template <typename T, int BlockDim, phi::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining(
const T *x,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const int C,
const int N,
const int HxW,
const double epsilon,
double exponentialAverageFactor,
T *y,
BatchNormParamType<T> *mean,
BatchNormParamType<T> *variance,
BatchNormParamType<T> *save_mean,
BatchNormParamType<T> *save_inv_variance) {
int outer_size = C;
int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage mean_storage;
  __shared__ typename BlockReduce::TempStorage variance_storage;
__shared__ BatchNormParamType<T> mean_val;
__shared__ BatchNormParamType<T> variance_val;
__shared__ BatchNormParamType<T> inv_var_val;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> x_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> x_square_sum = static_cast<BatchNormParamType<T>>(0);
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> x_i = static_cast<BatchNormParamType<T>>(x[index]);
x_sum += x_i;
x_square_sum += x_i * x_i;
}
x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum());
x_square_sum =
      BlockReduce(variance_storage).Reduce(x_square_sum, cub::Sum());
if (threadIdx.x == 0) {
mean_val = x_sum / inner_size;
variance_val = x_square_sum / inner_size - mean_val * mean_val;
inv_var_val = 1 / sqrt(variance_val + epsilon);
if (save_mean && save_inv_variance) {
save_mean[i] = mean_val;
save_inv_variance[i] = inv_var_val;
}
mean[i] = (1 - exponentialAverageFactor) * mean_val +
exponentialAverageFactor * mean[i];
variance[i] = (1 - exponentialAverageFactor) * variance_val +
exponentialAverageFactor * variance[i];
}
__syncthreads();
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == phi::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
BatchNormParamType<T> x_sub_mean =
static_cast<BatchNormParamType<T>>(x[index]) - mean_val;
y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i];
}
}
}
template <typename T, typename Context>
void BatchNormKernel(const Context &ctx,
const DenseTensor &x,
const DenseTensor &scale,
const DenseTensor &bias,
const DenseTensor &mean,
const DenseTensor &variance,
float momentum,
float epsilon_f,
const std::string &data_layout_str,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
DenseTensor *y,
DenseTensor *mean_out,
DenseTensor *variance_out,
DenseTensor *saved_mean,
DenseTensor *saved_variance,
DenseTensor *reserve_space) {
double epsilon = epsilon_f;
const bool trainable_stats = trainable_statistics;
const DataLayout data_layout =
paddle::framework::StringToDataLayout(data_layout_str);
bool test_mode = is_test && (!trainable_stats);
// Get the size for each dimension.
// NCHW [batch_size, in_channels, in_height, in_width]
const auto &x_dims = x.dims();
PADDLE_ENFORCE_EQ(
x_dims.size() >= 2 && x_dims.size() <= 5,
true,
phi::errors::InvalidArgument(
"The size of input's dimensions should be between 2 and 5"
"But received: the size of input's dimensions is [%d]",
x_dims.size()));
ctx.template Alloc<T>(y);
int N, C, H, W, D;
paddle::operators::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
auto dtype = paddle::platform::CudnnDataType<T>::type;
#ifdef PADDLE_WITH_HIP
auto compute_format =
data_layout == DataLayout::kNHWC ? DataLayout::kNHWC : DataLayout::kNCHW;
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// HIP do not support compute format of NHWC
// auto compute_format = DataLayout::kNCHW;
#else
const bool fast_nhwc_batch_norm =
test_mode ||
(dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent);
auto compute_format = fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC
? DataLayout::kNHWC
: DataLayout::kNCHW;
#endif
DenseTensor transformed_x(x.type());
DenseTensor transformed_y(y->type());
if (data_layout == DataLayout::kNHWC && compute_format == DataLayout::kNCHW &&
x_dims.size() > 2) {
VLOG(3) << "Transform input tensor from NHWC to NCHW.";
ResizeToChannelFirst<Context, T>(ctx, &x, &transformed_x);
TransToChannelFirst<Context, T>(ctx, &x, &transformed_x);
ResizeToChannelFirst<Context, T>(ctx, y, &transformed_y);
} else {
transformed_x.ShareDataWith(x);
transformed_y.ShareDataWith(*y);
}
// ------------------- cudnn descriptors ---------------------
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// miopenTensorDescriptor_t data_desc_;
// miopenTensorDescriptor_t bn_param_desc_;
// miopenBatchNormMode_t mode_;
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_));
#else
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t bn_param_desc_;
cudnnBatchNormMode_t mode_;
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
#endif
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// mode_ = miopenBNSpatial;
#elif CUDNN_VERSION_MIN(7, 0, 1)
if (FLAGS_cudnn_batchnorm_spatial_persistent) {
mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
} else if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#else
if (H == 1 && W == 1) {
mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#endif // CUDNN_VERSION_MIN(7, 0, 1)
VLOG(3) << "Setting descriptors.";
std::vector<int> dims;
std::vector<int> strides;
if (compute_format == DataLayout::kNCHW) {
dims = {N, C, H, W, D};
strides = {C * H * W * D, H * W * D, W * D, D, 1};
} else {
dims = {N, C, H, W, D};
strides = {H * W * D * C, 1, W * D * C, D * C, C};
}
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
// data_desc_, CudnnDataType<T>::type,
// x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
// const_cast<int *>(strides.data())));
// Note: PERSISTENT not implemented for inference
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDeriveBNTensorDescriptor(
// bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_,
CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4,
dims.data(),
strides.data()));
// Note: PERSISTENT not implemented for inference
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDeriveBNTensorDescriptor(
bn_param_desc_,
data_desc_,
test_mode ? CUDNN_BATCHNORM_SPATIAL : mode_));
#endif
auto handle = ctx.cudnn_handle();
// Now, depending on whether we are running test or not, we have two paths.
  // It is training mode when it's not inference AND not using pre-trained
// model.
bool training = !test_mode && !use_global_stats;
if (!training) {
    // Only in test mode do we use the input estimated mean and variance.
const auto *est_mean = &mean;
const auto *est_var = &variance;
// Run inference mode.
PADDLE_ENFORCE_EQ(
est_mean->dims().size(),
1UL,
phi::errors::InvalidArgument(
"The size of mean's dimensions must equal to 1."
"But received: the size of mean's dimensions mean is [%d],"
"the dimensions of mean is [%s].",
est_mean->dims().size(),
est_mean->dims()));
PADDLE_ENFORCE_EQ(
est_var->dims().size(),
1UL,
phi::errors::InvalidArgument(
"The size of variance's dimensions must equal to 1."
"But received: the size of variance's dimensions is [%d],"
"the dimensions of variance is [%s].",
est_var->dims().size(),
est_var->dims()));
PADDLE_ENFORCE_EQ(
est_mean->dims()[0],
C,
phi::errors::InvalidArgument(
"The first dimension of mean must equal to the number of "
"Channels, which is [%d]. But received: the first dimension"
"of mean is [%d], the dimensions of mean is [%s].",
C,
est_mean->dims()[0],
est_mean->dims()));
PADDLE_ENFORCE_EQ(
est_var->dims()[0],
C,
phi::errors::InvalidArgument(
"The first dimension of variance must equal to the number"
"of Channels, which is [%d]. But received: the first dimension of"
"variance is [%d], the dimensions of variance is [%s].",
C,
est_var->dims()[0],
est_var->dims()));
#ifdef PADDLE_WITH_HIP
const int block_size = 256;
const int grid_size = (N * C * H * W * D + block_size - 1) / block_size;
if (compute_format == DataLayout::kNCHW) {
BNForwardInference<
T,
DataLayout::kNCHW><<<grid_size, block_size, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
C,
N,
H * W * D,
epsilon,
transformed_y.template data<T>());
} else {
BNForwardInference<
T,
DataLayout::kNHWC><<<grid_size, block_size, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
C,
N,
H * W * D,
epsilon,
transformed_y.template data<T>());
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationForwardInference(
// handle, miopenBNSpatial,
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kOne())),
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kZero())),
// data_desc_,
// static_cast<const void *>(transformed_x.template data<T>()),
// data_desc_,
// static_cast<void *>(
// transformed_y.template mutable_data<T>(ctx.GetPlace())),
// bn_param_desc_,
// const_cast<void *>(static_cast<const void *>(
// scale->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// bias->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// est_mean->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// est_var->template data<BatchNormParamType<T>>())),
// epsilon));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationForwardInference(
handle,
// Note: PERSISTENT not implemented for inference
CUDNN_BATCHNORM_SPATIAL,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
transformed_x.template data<T>(),
data_desc_,
ctx.template Alloc<T>(&transformed_y),
bn_param_desc_,
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(),
epsilon));
#endif
} else {
    // If a MomentumTensor input is provided, its value should override
    // `momentum`; momentum is only used in this training branch.
    // TODO: port the MomentumTensor handling below to this phi kernel.
// if (ctx.HasInput("MomentumTensor")) {
// const auto *mom_tensor = MomentumTensor;
// DenseTensor mom_cpu;
// paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(),
// &mom_cpu);
// momentum = mom_cpu.data<float>()[0];
// }
// Run training mode.
// obtain running mean and running inv var, and there is no need
// to initialize them.
mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
if ((N * H * W * D) == 1) {
// Only 1 element in normalization dimension,
// skip the batch norm calculation, let y = x.
paddle::framework::TensorCopy(x, ctx.GetPlace(), y);
} else {
double this_factor = 1. - momentum;
bool called = false;
#if CUDNN_VERSION_MIN(7, 4, 1)
called = true;
size_t workspace_size = 0;
size_t reserve_space_size = 0;
void *reserve_space_ptr = nullptr;
void *workspace_ptr = nullptr;
DenseTensor workspace_tensor;
      // Create reserve space and workspace for batch norm.
      // The reserve space is created per batch_norm op and is consumed by the
      // backward pass, so it must not be a temporary tensor.
// auto *reserve_space = ctx.Output<Tensor>("ReserveSpace");
PADDLE_ENFORCE_NOT_NULL(
reserve_space,
phi::errors::NotFound(
"The argument ReserveSpace of batch_norm op is not found."));
// --------------- cudnn batchnorm workspace ---------------
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::
cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
/*handle=*/handle,
/*mode=*/mode_,
                    /*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
/*xDesc=*/data_desc_,
/*zDesc=*/nullptr,
/*yDesc=*/data_desc_,
/*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
/*activationDesc=*/nullptr,
/*sizeInBytes=*/&workspace_size));
// -------------- cudnn batchnorm reserve space --------------
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::
cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
/*handle=*/handle,
/*mode=*/mode_,
/*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
/*activationDesc=*/nullptr,
/*xDesc=*/data_desc_,
/*sizeInBytes=*/&reserve_space_size));
reserve_space_ptr = reserve_space->mutable_data(
ctx.GetPlace(), transformed_x.type(), reserve_space_size);
workspace_ptr = workspace_tensor.mutable_data(
ctx.GetPlace(), transformed_x.type(), workspace_size);
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationForwardTrainingEx(
handle,
mode_,
CUDNN_BATCHNORM_OPS_BN,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
transformed_x.template data<T>(),
nullptr,
nullptr,
data_desc_,
transformed_y.template data<T>(),
bn_param_desc_,
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
this_factor,
mean_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
variance_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon,
saved_mean->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
saved_variance->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
nullptr,
workspace_ptr,
workspace_size,
reserve_space_ptr,
reserve_space_size));
#endif // CUDNN_VERSION_MIN(7, 4, 1)
if (!called) {
#ifdef PADDLE_WITH_HIP
const int num = transformed_x.numel();
const int block = 256;
const int max_threads = ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
const int grid = std::min(C, max_blocks);
if (compute_format == DataLayout::kNCHW) {
BNForwardTraining<
T,
block,
DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
C,
N,
H * W * D,
epsilon,
this_factor,
transformed_y.template data<T>(),
mean_out->template data<BatchNormParamType<T>>(),
variance_out->template data<BatchNormParamType<T>>(),
saved_mean->template data<BatchNormParamType<T>>(),
saved_variance->template data<BatchNormParamType<T>>());
} else {
BNForwardTraining<
T,
block,
DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
C,
N,
H * W * D,
epsilon,
this_factor,
transformed_y.template data<T>(),
mean_out->template data<BatchNormParamType<T>>(),
variance_out->template data<BatchNormParamType<T>>(),
saved_mean->template data<BatchNormParamType<T>>(),
saved_variance->template data<BatchNormParamType<T>>());
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationForwardTraining(
// handle, mode_, const_cast<void *>(static_cast<const void *>(
// CudnnDataType<T>::kOne())),
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kZero())),
// data_desc_,
// static_cast<const void *>(transformed_x.template data<T>()),
// data_desc_,
// static_cast<void *>(
// transformed_y.template mutable_data<T>(ctx.GetPlace())),
// bn_param_desc_,
// const_cast<void *>(static_cast<const void *>(
// scale->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// bias->template data<BatchNormParamType<T>>())),
// this_factor,
// static_cast<void *>(
// mean_out->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace())),
// static_cast<void *>(variance_out->template mutable_data<
// BatchNormParamType<T>>(ctx.GetPlace())),
// epsilon,
// static_cast<void *>(
// saved_mean->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace())),
// static_cast<void *>(saved_variance->template mutable_data<
// BatchNormParamType<T>>(ctx.GetPlace()))));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationForwardTraining(
handle,
mode_,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
transformed_x.template data<T>(),
data_desc_,
ctx.template Alloc<T>(&transformed_y),
bn_param_desc_,
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
this_factor,
mean_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
variance_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon,
saved_mean->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
saved_variance->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace())));
#endif
}
}
}
if (data_layout == DataLayout::kNHWC && compute_format == DataLayout::kNCHW &&
x_dims.size() > 2) {
VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
TransToChannelLast<Context, T>(ctx, &transformed_y, y);
}
#ifdef PADDLE_WITH_HIP
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// clean when exit.
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_));
#else
// clean when exit.
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
#endif
}
} // namespace phi
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(batch_norm,
GPU,
ALL_LAYOUT,
phi::BatchNormKernel,
float,
phi::dtype::float16) {}
#else
PD_REGISTER_KERNEL(batch_norm,
GPU,
ALL_LAYOUT,
phi::BatchNormKernel,
float,
double,
phi::dtype::float16) {
if (kernel_key.dtype() == phi::DataType::FLOAT16) {
kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32);
}
}
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
using Tensor = DenseTensor;
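// Helpers used by the GPU batch norm kernels to move tensors between
// channel-last (NHWC / NDHWC) and channel-first (NCHW / NCDHW) layouts.
// The Resize* functions only set the destination shape and allocate memory;
// the Trans* functions perform the actual transpose.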
template <typename DeviceContext, typename T>
inline void ResizeToChannelFirst(const DeviceContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = phi::vectorize(input->dims());
in_dims_vec[1] = input->dims()[4];
in_dims_vec[2] = input->dims()[1];
in_dims_vec[3] = input->dims()[2];
in_dims_vec[4] = input->dims()[3];
transformed_input->Resize(phi::make_ddim(in_dims_vec));
context.template Alloc<T>(transformed_input);
} else if (dim == 2) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = phi::vectorize(input->dims());
in_dims_vec[1] = input->dims()[3];
in_dims_vec[2] = input->dims()[1];
in_dims_vec[3] = input->dims()[2];
transformed_input->Resize(phi::make_ddim(in_dims_vec));
context.template Alloc<T>(transformed_input);
} else if (dim == 1) {
transformed_input->Resize(input->dims());
auto in_dims_vec = phi::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[1];
transformed_input->Resize(phi::make_ddim(in_dims_vec));
context.template Alloc<T>(transformed_input);
}
}
template <typename DeviceContext, typename T>
inline void ResizeToChannelLast(const DeviceContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = phi::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[4];
in_dims_vec[4] = input->dims()[1];
transformed_input->Resize(phi::make_ddim(in_dims_vec));
context.template Alloc<T>(transformed_input);
} else if (dim == 2) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = phi::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[1];
transformed_input->Resize(phi::make_ddim(in_dims_vec));
context.template Alloc<T>(transformed_input);
} else if (dim == 1) {
transformed_input->Resize(input->dims());
auto in_dims_vec = phi::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[1];
transformed_input->Resize(phi::make_ddim(in_dims_vec));
context.template Alloc<T>(transformed_input);
}
}
template <typename DeviceContext, typename T>
inline void TransToChannelFirst(const DeviceContext& context,
const Tensor* input,
Tensor* transformed_input) {
VLOG(5) << "Why am I called?";
int dim = input->dims().size() - 2;
if (dim == 3) {
std::vector<int> axis{0, 4, 1, 2, 3};
funcs::Transpose<DeviceContext, T, 5> trans5;
trans5(context, *input, transformed_input, axis);
} else if (dim == 2) {
std::vector<int> axis{0, 3, 1, 2};
funcs::Transpose<DeviceContext, T, 4> trans4;
trans4(context, *input, transformed_input, axis);
} else if (dim == 1) {
std::vector<int> axis{0, 2, 1};
funcs::Transpose<DeviceContext, T, 3> trans3;
trans3(context, *input, transformed_input, axis);
}
}
template <typename DeviceContext, typename T>
inline void TransToChannelLast(const DeviceContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
std::vector<int> axis{0, 2, 3, 4, 1};
funcs::Transpose<DeviceContext, T, 5> trans5;
trans5(context, *input, transformed_input, axis);
} else if (dim == 2) {
std::vector<int> axis{0, 2, 3, 1};
funcs::Transpose<DeviceContext, T, 4> trans4;
trans4(context, *input, transformed_input, axis);
} else if (dim == 1) {
std::vector<int> axis{0, 2, 1};
funcs::Transpose<DeviceContext, T, 3> trans3;
trans3(context, *input, transformed_input, axis);
}
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/gaussian_random_kernel.h"
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/random.h>
#include <thrust/transform.h>
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/distribution_helper.h"
#include "paddle/phi/kernels/funcs/index_impl.cu.h"
#include "paddle/fluid/framework/generator.h"
DECLARE_bool(use_curand);
namespace phi {
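// Thrust functor mapping an element index n to a sample of N(mean, std):
// each call seeds a minstd_rand engine, discards n + offset values and then
// draws from a normal distribution, so the result depends only on
// (seed, offset, n) and is reproducible across launches.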
template <typename T>
struct GaussianGenerator {
T mean_, std_;
unsigned int seed_;
unsigned int offset_ = 0;
__host__ __device__ GaussianGenerator(T mean, T std, int seed)
: mean_(mean), std_(std), seed_(seed) {}
__host__ __device__ GaussianGenerator(T mean, T std, int seed, int offset)
: mean_(mean), std_(std), seed_(seed), offset_(offset) {}
__host__ __device__ T operator()(const unsigned int n) const {
thrust::minstd_rand rng;
rng.seed(seed_);
using MT = typename phi::kps::details::MPTypeTrait<T>::Type;
thrust::normal_distribution<MT> dist(mean_, std_);
unsigned int new_n = n + offset_;
rng.discard(new_n);
MT out = dist(rng);
return static_cast<T>(out);
}
};
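// GPU gaussian_random kernel: when seed == 0 a random seed is drawn from
// std::random_device. If no explicit seed was given and the default CUDA
// generator was initialized from Python, sampling goes through either the
// curand-based distribution helper (FLAGS_use_curand) or the thrust
// GaussianGenerator with an offset derived from the generator state;
// otherwise the plain seeded GaussianGenerator is used.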
template <typename T, typename Context>
void GaussianRandomKernel(const Context& dev_ctx,
const ScalarArray& shape,
float mean,
float std,
int seed,
DataType dtype,
DenseTensor* out) {
auto tensor = out;
bool seed_flag = false;
if (seed == 0) {
std::random_device rd;
seed = rd();
seed_flag = true;
}
tensor->Resize(phi::make_ddim(shape.GetData()));
T* data = dev_ctx.template Alloc<T>(tensor);
int64_t size = tensor->numel();
int device_id = dev_ctx.GetPlace().GetDeviceId();
auto gen_cuda = paddle::framework::GetDefaultCUDAGenerator(device_id);
using MT = typename phi::kps::details::MPTypeTrait<T>::Type;
if (gen_cuda->GetIsInitPy() && seed_flag) {
if (FLAGS_use_curand) {
funcs::normal_distribution<MT> dist;
funcs::normal_transform<MT> trans(mean, std);
funcs::distribution_and_transform<T>(dev_ctx, tensor, dist, trans);
} else {
auto seed_offset = gen_cuda->IncrementOffset(1);
int64_t gen_offset = size * seed_offset.second;
auto func =
GaussianGenerator<MT>(mean, std, seed_offset.first, gen_offset);
IndexKernel<T, GaussianGenerator<MT>>(dev_ctx, tensor, func);
}
} else {
auto func = GaussianGenerator<MT>(mean, std, seed);
IndexKernel<T, GaussianGenerator<MT>>(dev_ctx, tensor, func);
}
}
} // namespace phi
PD_REGISTER_KERNEL(gaussian_random,
GPU,
ALL_LAYOUT,
phi::GaussianRandomKernel,
phi::dtype::float16,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/pad_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/pad_grad_kernel_impl.h"
PD_REGISTER_KERNEL(pad_grad,
GPU,
ALL_LAYOUT,
phi::PadGradKernel,
float,
double,
phi::dtype::float16,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/complex.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/pad_kernel_impl.h"
#include "paddle/phi/kernels/pad_kernel.h"
PD_REGISTER_KERNEL(pad,
GPU,
ALL_LAYOUT,
phi::PadKernel,
float,
double,
int,
int64_t,
phi::dtype::float16,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
@@ -178,6 +178,8 @@ struct IndexCalculator {
       : dim(dim) {
     dims = details::VectorToArray<int, kMaxRank>(cal_dims);
     strides = details::VectorToArray<int, kMaxRank>(full_strides);
+    reduce_strides = details::VectorToArray<int, kMaxRank>(cal_strides);
+#ifndef PADDLE_WITH_XPU_KP
     std::vector<paddle::platform::FastDivMod> cal_divmoders;
     // fast divmod
     for (auto i : cal_strides) {
@@ -185,9 +187,22 @@ struct IndexCalculator {
     }
     divmoders = details::VectorToArray<paddle::platform::FastDivMod, kMaxRank>(
         cal_divmoders);
+#endif
   }
 
   __device__ inline int operator()(int offset) const {
+#ifdef PADDLE_WITH_XPU_KP
+    int index = 0;
+#pragma unroll
+    for (int i = 0; i < kMaxRank; ++i) {
+      if (i == dim) {
+        break;
+      }
+      index += (offset / reduce_strides[i]) * strides[dims[i]];
+      offset = offset % reduce_strides[i];
+    }
+    return index;
+#else
     int index = 0;
 #pragma unroll
     for (int i = 0; i < kMaxRank; ++i) {
@@ -199,12 +214,16 @@ struct IndexCalculator {
       offset = divmod.val[1];
     }
     return index;
+#endif
   }
 
   int dim;
   phi::Array<int, kMaxRank> dims;
   phi::Array<int, kMaxRank> strides;
+  phi::Array<int, kMaxRank> reduce_strides;
+#ifndef PADDLE_WITH_XPU2
   phi::Array<paddle::platform::FastDivMod, kMaxRank> divmoders;
+#endif
 };
 
 template <bool ReduceLastDim = false>
@@ -247,7 +266,7 @@ struct ReduceIndexMapping {
   __device__ __forceinline__ int BlockDimY() {
 #ifdef PADDLE_WITH_XPU2
-    return dim.deal_size_y;
+    return 1;
 #else
     return blockDim.y;
 #endif
@@ -454,10 +473,14 @@ struct ReduceConfig {
     bool is_last_dim =
         (rank == 2) && (reduce_rank == 1) && (reduce_dim[0] == 1);
     if (rank == reduce_rank || is_last_dim) {
+#ifdef PADDLE_WITH_XPU_KP
+      reduce_type = static_cast<int>(ReduceType::kReduceAny);
+#else
       reduce_type = static_cast<int>(ReduceType::kReduceLastDim);
+#endif
     } else if (reduce_rank == 1) {
 // ReduceFirstDim and reduceSecondDim
-#ifdef PADDLE_WITH_XPU2
+#ifdef PADDLE_WITH_XPU_KP
       if (reduce_dim[0] == 0) {
         reduce_type = static_cast<int>(ReduceType::kReduceHigherDim);
       } else {
@@ -471,6 +494,7 @@ struct ReduceConfig {
     }
   }
 
+#ifndef PADDLE_WITH_XPU_KP
   void SetBlockDimForReduceAny(dim3* block_dim, dim3* grid_dim) {
     constexpr int min_reduce_num_per_thread = 16;
     constexpr int max_reduce_num_per_thread = 256;
...@@ -569,6 +593,7 @@ struct ReduceConfig { ...@@ -569,6 +593,7 @@ struct ReduceConfig {
grid_dim->y = details::AlignUp(reduce_num, blocking_size); grid_dim->y = details::AlignUp(reduce_num, blocking_size);
} }
} }
#endif
void SetBlockDim() { void SetBlockDim() {
// init // init
...@@ -577,14 +602,14 @@ struct ReduceConfig { ...@@ -577,14 +602,14 @@ struct ReduceConfig {
dim3 block_dim(block_num, 1, 1); dim3 block_dim(block_num, 1, 1);
dim3 grid_dim(left_num, 1, 1); dim3 grid_dim(left_num, 1, 1);
blocking_size = reduce_num; blocking_size = reduce_num;
#ifdef PADDLE_WITH_XPU2 #ifdef PADDLE_WITH_XPU_KP
if (reduce_last_dim) { if (reduce_last_dim) {
block_dim.x = 128; block_dim.x = 64;
block_dim.y = reduce_num; block_dim.y = reduce_num;
grid_dim.x = 8; grid_dim.x = 1;
grid_dim.y = 1; grid_dim.y = 8;
} else { } else {
block_dim.x = 128; block_dim.x = 64;
block_dim.y = left_num; block_dim.y = left_num;
grid_dim.x = 8; grid_dim.x = 8;
grid_dim.y = 1; grid_dim.y = 1;
...@@ -661,7 +686,7 @@ __global__ void ReduceAnyKernel(const Tx* x, ...@@ -661,7 +686,7 @@ __global__ void ReduceAnyKernel(const Tx* x,
store_offset = block.BlockIdY() * left_num + left_idx; store_offset = block.BlockIdY() * left_num + left_idx;
loop_left = min(block.GetLoopSize(), left_num - left_idx); loop_left = min(block.GetLoopSize(), left_num - left_idx);
stride_left = 1; stride_left = 1;
tid = threadIdx.x; tid = THREAD_ID_X;
} else { } else {
auto block = ReduceIndexMapping<false>(dim); auto block = ReduceIndexMapping<false>(dim);
input_idx = block.BlockIdY() * block.BlockDimY(); input_idx = block.BlockIdY() * block.BlockDimY();
...@@ -672,18 +697,20 @@ __global__ void ReduceAnyKernel(const Tx* x, ...@@ -672,18 +697,20 @@ __global__ void ReduceAnyKernel(const Tx* x,
loop_left = min(block.GetLoopSize(), left_num - left_idx); loop_left = min(block.GetLoopSize(), left_num - left_idx);
stride_left = block.BlockDimX() * block.GridDimX(); stride_left = block.BlockDimX() * block.GridDimX();
store_offset = block.BlockIdY() * left_num + left_idx; store_offset = block.BlockIdY() * left_num + left_idx;
tid = threadIdx.y; tid = THREAD_ID_Y;
} }
    // calculate the offset, i.e. the address where each thread really starts.     // calculate the offset, i.e. the address where each thread really starts.
// 1. reduce for each thread // 1. reduce for each thread
MPType input_compute[REDUCE_VEC_SIZE]; MPType input_compute[REDUCE_VEC_SIZE];
Tx input_reg[REDUCE_VEC_SIZE]; Tx input_reg[REDUCE_VEC_SIZE];
int input_idx_tmp = input_idx;
for (int i = 0; i < loop_left; i += stride_left) { for (int i = 0; i < loop_left; i += stride_left) {
int input_offset = left_index_calculator(left_idx + i); int input_offset = left_index_calculator(left_idx + i);
const Tx* input = x + input_offset; const _ptr_ Tx* input = x + input_offset;
MPType reduce_var = init; MPType reduce_var = init;
// load REDUCE_VEC_SIZE data once, and then compute // load REDUCE_VEC_SIZE data once, and then compute
int bound = reduce_num - (REDUCE_VEC_SIZE - 1) * stride; int bound = reduce_num - (REDUCE_VEC_SIZE - 1) * stride;
input_idx = input_idx_tmp;
for (; input_idx + block_size < bound; for (; input_idx + block_size < bound;
input_idx += REDUCE_VEC_SIZE * stride) { input_idx += REDUCE_VEC_SIZE * stride) {
kps::ReadDataReduce<Tx, kps::ReadDataReduce<Tx,
...@@ -775,7 +802,7 @@ __global__ void ReduceHigherDimKernel(const Tx* x, ...@@ -775,7 +802,7 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
int loop_size = min(reduce_num - idy, blocking_size); int loop_size = min(reduce_num - idy, blocking_size);
int store_offset = block.BlockIdY() * left_num + idz * block.GridDimY(); int store_offset = block.BlockIdY() * left_num + idz * block.GridDimY();
int block_offset = idy * left_num + idz * reduce_num; int block_offset = idy * left_num + idz * reduce_num;
const Tx* input = x + block_offset; const _ptr_ Tx* input = x + block_offset;
Tx reduce_input; Tx reduce_input;
for (; idx < size; idx += stride) { for (; idx < size; idx += stride) {
MPType reduce_var = init; MPType reduce_var = init;
...@@ -838,7 +865,7 @@ static void LaunchReduceKernel(const Tx* x_data, ...@@ -838,7 +865,7 @@ static void LaunchReduceKernel(const Tx* x_data,
const ReduceOp& reducer, const ReduceOp& reducer,
const TransformOp& transform, const TransformOp& transform,
MPType init, MPType init,
gpuStream_t stream, KPStream stream,
ReduceConfig<Ty> config) { ReduceConfig<Ty> config) {
if (config.reduce_type == kReduceLastDim) { if (config.reduce_type == kReduceLastDim) {
int stride_reduce = 1; int stride_reduce = 1;
...@@ -855,13 +882,14 @@ static void LaunchReduceKernel(const Tx* x_data, ...@@ -855,13 +882,14 @@ static void LaunchReduceKernel(const Tx* x_data,
0); 0);
dim.SetRem(config.reduce_num % config.block.x, 0, 0); dim.SetRem(config.reduce_num % config.block.x, 0, 0);
#ifdef PADDLE_WITH_XPU2 #ifdef PADDLE_WITH_XPU_KP
ReduceAnyKernel<Tx, ReduceAnyKernel<Tx,
Ty, Ty,
MPType, MPType,
ReduceOp, ReduceOp,
TransformOp, TransformOp,
OneDimIndexCal><<<8, 128, stream>>>(x_data, OneDimIndexCal><<<8, 64, 0, stream>>>(
x_data,
config.output_data, config.output_data,
reducer, reducer,
transform, transform,
...@@ -910,13 +938,13 @@ static void LaunchReduceKernel(const Tx* x_data, ...@@ -910,13 +938,13 @@ static void LaunchReduceKernel(const Tx* x_data,
0); 0);
dim.SetRem(config.reduce_num % config.block.x, 0, 0); dim.SetRem(config.reduce_num % config.block.x, 0, 0);
#ifdef PADDLE_WITH_XPU2 #ifdef PADDLE_WITH_XPU_KP
ReduceAnyKernel<Tx, ReduceAnyKernel<Tx,
Ty, Ty,
MPType, MPType,
ReduceOp, ReduceOp,
TransformOp, TransformOp,
IndexCalculator><<<8, 128, stream>>>( IndexCalculator><<<8, 64, 0, stream>>>(
x_data, x_data,
config.output_data, config.output_data,
reducer, reducer,
...@@ -965,12 +993,13 @@ static void LaunchReduceKernel(const Tx* x_data, ...@@ -965,12 +993,13 @@ static void LaunchReduceKernel(const Tx* x_data,
kps::DimConfig dim = kps::DimConfig dim =
kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0); kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
dim.SetRem(config.left_num % block.x, 0, 0); dim.SetRem(config.left_num % block.x, 0, 0);
#ifdef PADDLE_WITH_XPU2 #ifdef PADDLE_WITH_XPU_KP
ReduceHigherDimKernel<Ty, ReduceHigherDimKernel<
Ty,
Ty, Ty,
MPType, MPType,
ReduceOp, ReduceOp,
kps::IdentityFunctor<Ty, MPType>><<<8, 128, stream>>>( kps::IdentityFunctor<Ty, MPType>><<<8, 64, 0, stream>>>(
config.output_data, config.output_data,
y_data, y_data,
reducer, reducer,
...@@ -1011,7 +1040,7 @@ CubTensorReduceImpl(const Tx* x_data, ...@@ -1011,7 +1040,7 @@ CubTensorReduceImpl(const Tx* x_data,
const TransformOp& transform, const TransformOp& transform,
int reduce_num, int reduce_num,
const paddle::platform::Place& place, const paddle::platform::Place& place,
gpuStream_t stream) { KPStream stream) {
auto reducer = ReduceOp<Ty>(); auto reducer = ReduceOp<Ty>();
cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(x_data, cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(x_data,
transform); transform);
...@@ -1054,7 +1083,7 @@ CubTensorReduceImpl(const Tx* x_data, ...@@ -1054,7 +1083,7 @@ CubTensorReduceImpl(const Tx* x_data,
const TransformOp& transform, const TransformOp& transform,
int reduce_num, int reduce_num,
const paddle::platform::Place& place, const paddle::platform::Place& place,
gpuStream_t stream) { KPStream stream) {
PADDLE_THROW(phi::errors::InvalidArgument( PADDLE_THROW(phi::errors::InvalidArgument(
"Tx should not be float16 when using cub::DeviceReduce::Reduce().")); "Tx should not be float16 when using cub::DeviceReduce::Reduce()."));
} }
...@@ -1068,7 +1097,7 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx, ...@@ -1068,7 +1097,7 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx,
phi::DenseTensor* y, phi::DenseTensor* y,
const TransformOp& transform, const TransformOp& transform,
const std::vector<int>& origin_reduce_dims, const std::vector<int>& origin_reduce_dims,
gpuStream_t stream) { KPStream stream) {
y->mutable_data<Ty>(x.place()); y->mutable_data<Ty>(x.place());
auto x_dim = phi::vectorize<int>(x.dims()); auto x_dim = phi::vectorize<int>(x.dims());
...@@ -1098,11 +1127,13 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx, ...@@ -1098,11 +1127,13 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx,
config.SetOutputData(y_data, x.place(), &tmp); config.SetOutputData(y_data, x.place(), &tmp);
constexpr bool kIsTxFP16 = std::is_same<Tx, phi::dtype::float16>::value; constexpr bool kIsTxFP16 = std::is_same<Tx, phi::dtype::float16>::value;
bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16; bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16;
#ifndef PADDLE_WITH_XPU_KP
if (use_cub_reduce) { if (use_cub_reduce) {
CubTensorReduceImpl<Tx, Ty, ReduceOp, TransformOp>( CubTensorReduceImpl<Tx, Ty, ReduceOp, TransformOp>(
x_data, y_data, transform, config.reduce_num, x.place(), stream); x_data, y_data, transform, config.reduce_num, x.place(), stream);
return; return;
} }
#endif
using MPType = typename kps::details::MPTypeTrait<Ty>::Type; using MPType = typename kps::details::MPTypeTrait<Ty>::Type;
auto reducer = ReduceOp<MPType>(); auto reducer = ReduceOp<MPType>();
...@@ -1124,12 +1155,13 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx, ...@@ -1124,12 +1155,13 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx,
config.reduce_num % config.blocking_size, config.reduce_num % config.blocking_size,
0); 0);
#ifdef PADDLE_WITH_XPU2 #ifdef PADDLE_WITH_XPU_KP
ReduceHigherDimKernel<Tx, ReduceHigherDimKernel<Tx,
Ty, Ty,
MPType, MPType,
ReduceOp<MPType>, ReduceOp<MPType>,
TransformOp><<<8, 128, stream>>>(x_data, TransformOp><<<8, 64, 0, stream>>>(
x_data,
config.output_data, config.output_data,
reducer, reducer,
transform, transform,
...@@ -1163,13 +1195,13 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx, ...@@ -1163,13 +1195,13 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx,
kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0); kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
dim2.SetRem(config.left_num % config.block.x, 0, 0); dim2.SetRem(config.left_num % config.block.x, 0, 0);
#ifdef PADDLE_WITH_XPU2 #ifdef PADDLE_WITH_XPU_KP
ReduceHigherDimKernel< ReduceHigherDimKernel<
Ty, Ty,
Ty, Ty,
MPType, MPType,
ReduceOp<MPType>, ReduceOp<MPType>,
kps::IdentityFunctor<Ty, MPType>><<<8, 128, stream>>>( kps::IdentityFunctor<Ty, MPType>><<<8, 64, 0, stream>>>(
config.output_data, config.output_data,
y_data, y_data,
reducer, reducer,
...@@ -1212,7 +1244,7 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx, ...@@ -1212,7 +1244,7 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx,
template <typename T, template <typename T,
template <typename> class ReduceOp, template <typename> class ReduceOp,
template <typename, typename> class TransformOp> template <typename, typename> class TransformOp>
void Reduce(const GPUContext& dev_ctx, void Reduce(const KPDevice& dev_ctx,
const DenseTensor& x, const DenseTensor& x,
bool reduce_all, bool reduce_all,
const std::vector<int64_t>& dims, const std::vector<int64_t>& dims,
...@@ -1227,7 +1259,7 @@ void Reduce(const GPUContext& dev_ctx, ...@@ -1227,7 +1259,7 @@ void Reduce(const GPUContext& dev_ctx,
reduce_num *= (x.dims())[i]; reduce_num *= (x.dims())[i];
} }
gpuStream_t stream = dev_ctx.stream(); KPStream stream = dev_ctx.stream();
if (out_dtype != phi::DataType::UNDEFINED && out_dtype != x.dtype()) { if (out_dtype != phi::DataType::UNDEFINED && out_dtype != x.dtype()) {
auto tmp_tensor = phi::Cast<T>(dev_ctx, x, out_dtype); auto tmp_tensor = phi::Cast<T>(dev_ctx, x, out_dtype);
......
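The PADDLE_WITH_XPU_KP branch added to IndexCalculator above maps a flat offset in reduce space back to an offset in the full tensor: it decomposes the offset digit by digit with reduce_strides and re-projects each digit through the full-tensor strides. A standalone host-side sketch of that stride walk, with illustrative names (not a Paddle API):

// Host-side sketch of the XPU_KP index mapping in IndexCalculator:
// decompose `offset` with the reduce-dim strides, then re-project each
// digit through the full-tensor strides of the corresponding dimension.
#include <vector>

int MapReduceOffset(int offset, int dim,
                    const std::vector<int>& dims,            // reduce dims
                    const std::vector<int>& reduce_strides,  // strides in reduce space
                    const std::vector<int>& strides) {       // strides in the full tensor
  int index = 0;
  for (int i = 0; i < dim; ++i) {
    index += (offset / reduce_strides[i]) * strides[dims[i]];
    offset %= reduce_strides[i];
  }
  return index;
}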
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/padding.h"
namespace phi {
template <typename T, typename Context>
void PadGradKernel(const Context& dev_ctx,
const DenseTensor& d_out,
const std::vector<int>& paddings,
float pad_value,
DenseTensor* d_x) {
if (d_x == nullptr) {
return;
}
dev_ctx.template Alloc<T>(d_x);
int rank = d_out.dims().size();
phi::funcs::PaddingGradFunctor<Context, T>(
rank, dev_ctx, paddings, d_out, d_x);
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <utility>
#include <vector>
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/padding.h"
namespace phi {
template <typename T, typename Context>
void PadKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int>& paddings,
float pad_value,
DenseTensor* out) {
dev_ctx.template Alloc<T>(out);
int rank = x.dims().size();
funcs::PaddingFunctor<Context, T>(
rank, dev_ctx, paddings, static_cast<T>(pad_value), x, out);
}
} // namespace phi
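PadKernel above only allocates the output and forwards to funcs::PaddingFunctor, so the shape logic lives entirely in the paddings vector. Assuming the usual fluid pad convention of one (before, after) pair per dimension, the output shape follows directly; the helper below is a hedged illustration of that convention, not part of phi:

// Illustrative helper (not part of phi), assuming `paddings` holds
// (before, after) pairs per dimension:
// out_dim[i] = x_dim[i] + paddings[2 * i] + paddings[2 * i + 1].
#include <cstdint>
#include <vector>

std::vector<int64_t> PaddedDims(const std::vector<int64_t>& x_dims,
                                const std::vector<int>& paddings) {
  std::vector<int64_t> out_dims(x_dims.size());
  for (size_t i = 0; i < x_dims.size(); ++i) {
    out_dims[i] = x_dims[i] + paddings[2 * i] + paddings[2 * i + 1];
  }
  return out_dims;  // e.g. x_dims = {2, 3}, paddings = {0, 1, 2, 2} -> {3, 7}
}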
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void PadGradKernel(const Context& dev_ctx,
const DenseTensor& d_out,
const std::vector<int>& paddings,
float pad_value,
DenseTensor* d_x);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void PadKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int>& paddings,
float pad_value,
DenseTensor* out);
} // namespace phi
set(SPARSE_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils) set(SPARSE_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils math_function)
register_kernels(DEPS ${SPARSE_KERNEL_DEPS} SUB_DIR "sparse_kernel") register_kernels(DEPS ${SPARSE_KERNEL_DEPS} SUB_DIR "sparse_kernel")
...@@ -107,7 +107,9 @@ void ProductRuleBook(const Context& dev_ctx, ...@@ -107,7 +107,9 @@ void ProductRuleBook(const Context& dev_ctx,
f_calc_rulebook(nullptr); f_calc_rulebook(nullptr);
// alloc the rulebook // alloc the rulebook
rulebook->ResizeAndAllocate({3, rulebook_len}); DenseTensorMeta rulebook_meta(
DataType::INT32, {3, rulebook_len}, DataLayout::NCHW);
rulebook->set_meta(rulebook_meta);
dev_ctx.Alloc(rulebook, rulebook->dtype(), rulebook->numel() * sizeof(int)); dev_ctx.Alloc(rulebook, rulebook->dtype(), rulebook->numel() * sizeof(int));
int* rulebook_ptr = rulebook->data<int>(); int* rulebook_ptr = rulebook->data<int>();
f_calc_rulebook(rulebook_ptr); f_calc_rulebook(rulebook_ptr);
......
...@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/phi/kernels/sparse/convolution_kernel.h" #include "paddle/phi/kernels/sparse/cpu/convolution.h"
#include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/sparse/cpu/convolution.h"
namespace phi { namespace phi {
namespace sparse { namespace sparse {
...@@ -55,7 +54,6 @@ void Conv3dKernel(const Context& dev_ctx, ...@@ -55,7 +54,6 @@ void Conv3dKernel(const Context& dev_ctx,
// 1. product rulebook // 1. product rulebook
DenseTensorMeta counter_meta( DenseTensorMeta counter_meta(
DataType::INT32, {kernel_size}, DataLayout::NCHW); DataType::INT32, {kernel_size}, DataLayout::NCHW);
// DenseTensor rulebook = phi::Empty<int, Context>(dev_ctx);
DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta));
ProductRuleBook<T, Context>(dev_ctx, ProductRuleBook<T, Context>(dev_ctx,
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <thrust/execution_policy.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include "glog/logging.h"
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/primitive/compute_primitives.h"
#include "paddle/phi/kernels/sparse/convolution_kernel.h"
namespace phi {
namespace sparse {
// TODO(zhangkaihuo) replace this kernel with KP::InitWithDataIndex
__global__ void InitByIndexKernel(const int n, int* out1, int* out2) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
for (int i = tid; i < n; i += gridDim.x * blockDim.x) {
out1[i] = i;
out2[i] = i;
}
}
/**
 * @brief: update the out index and indices
 * unique_keys: saves the index of the output feature list
 * unique_values: indicates the index of each key before deduplication
 * out_indexs: indicates the position of the output index in the rulebook
 * rulebook_len: indicates the length of the rulebook
 * out_dims: indicates the output dims
 * out_indices: the indices of the output, out_indices = IndexToPoint(unique_keys)
 * rulebook_out_indexs: the output index in the rulebook
**/
__global__ void UpdateIndexKernel(const int* unique_keys,
const int* unique_values,
const int* out_indexs,
const int non_zero_num,
const int rulebook_len,
const Dims4D out_dims,
int* out_indices,
int* rulebook_out_indexs) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) {
const int index = unique_keys[i];
int batch, x, y, z;
IndexToPoint<Dims4D>(index, out_dims, &batch, &x, &y, &z);
// get out indices
out_indices[i] = batch;
out_indices[i + non_zero_num] = z;
out_indices[i + non_zero_num * 2] = y;
out_indices[i + non_zero_num * 3] = x;
// update rulebook
int start = unique_values[i];
int end = i == non_zero_num - 1 ? rulebook_len : unique_values[i + 1];
// max(end-start) = kernel_size
for (int j = start; j < end; j++) {
rulebook_out_indexs[out_indexs[j]] = i;
}
}
}
/**
 * @brief product rulebook
 * for input_i in x_indices:
 *   if input_i participates in the convolution calculation:
 *       infer the output_i by input_i and kernel_i
 *       save output_i
 *
 * x_indices: the indices of input features
 * x_dims: the input dims
 * kernel_dims: the kernel dims
 * out_dims: the output dims
 * non_zero_num: the number of input features
 * rulebook: the rulebook to save the kernel index, input index and output index
 * counter: saves the number of times each location in the kernel participates in
 * the calculation
**/
__global__ void ProductRuleBookKernel(const int* x_indices,
const Dims4D x_dims,
const Dims4D kernel_dims,
const Dims4D out_dims,
const int64_t non_zero_num,
const Dims4D paddings,
const Dims4D dilations,
const Dims4D strides,
int* rulebook,
int* counter) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
extern __shared__ int counter_buf[]; // kernel_size
const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1];
const int offset = kernel_size * non_zero_num;
for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
counter_buf[i] = 0;
}
__syncthreads();
for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) {
int kernel_index = 0;
for (int kz = 0; kz < kernel_dims[1]; kz++) {
for (int ky = 0; ky < kernel_dims[2]; ky++) {
for (int kx = 0; kx < kernel_dims[3]; kx++) {
int batch = x_indices[i];
int in_z = x_indices[i + non_zero_num];
int in_y = x_indices[i + 2 * non_zero_num];
int in_x = x_indices[i + 3 * non_zero_num];
int in_i = -1, out_index = -1;
if (Check(x_dims,
kernel_dims,
paddings,
dilations,
strides,
in_x,
in_y,
in_z,
kx,
ky,
kz)) {
int out_z = (in_z + paddings[1] - kz * dilations[1]) / strides[1];
int out_y = (in_y + paddings[2] - ky * dilations[2]) / strides[2];
int out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3];
in_i = i;
out_index =
PointToIndex<Dims4D>(batch, out_x, out_y, out_z, out_dims);
atomicAdd(&counter_buf[kernel_index], 1);
}
rulebook[kernel_index * non_zero_num + i] = in_i;
rulebook[kernel_index * non_zero_num + offset + i] = out_index;
++kernel_index;
}
}
}
}
__syncthreads();
for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
atomicAdd(&counter[i], counter_buf[i]);
}
}
// TODO(zhangkaihuo): After the GatherCUDAKernel is migrated to phi, replace
// this kernel with phi::GatherCUDAKernel;
// Vectorization can be used to improve read and write bandwidth
/**
* brief: gather data from params according to indices
* params: the inputs
* indices: the indices you want to gather
* output: the outputs
* index_size: the size of indices
* slice_size: slice size corresponding to each index, here is the channel size
**/
template <typename T, typename IndexT = int>
__global__ void GatherKernel(const T* params,
const IndexT* indices,
T* output,
size_t index_size,
size_t slice_size) {
CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) {
int64_t indices_i = i / slice_size;
int64_t slice_i = i - indices_i * slice_size; // offset inside the slice
IndexT gather_i = indices[indices_i];
int64_t params_i = gather_i * slice_size + slice_i;
*(output + i) = *(params + params_i);
}
}
/**
* brief: scatter add
* input: the inputs
* unique_value: refer to UpdateIndexKernel notes
* out_index: the output feature index
* non_zero_num: the number of output features
* rulebook_len: the length of rulebook
* channels: the output channel size
* out: the outputs
**/
template <typename T>
__global__ void ScatterKernel(const T* input,
const int* unique_value,
const int* out_index,
const int non_zero_num,
const int rulebook_len,
const int channels,
T* out) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
for (int i = tid; i < non_zero_num * channels; i += gridDim.x * blockDim.x) {
int indices_i = i / channels;
int channels_i = i - indices_i * channels;
int start = unique_value[indices_i];
int end = indices_i == non_zero_num - 1 ? rulebook_len
: unique_value[indices_i + 1];
// max(end-start) = kernel_size
T sum = static_cast<T>(0);
for (int j = start; j < end; j++) {
const int out_feature_i = out_index[j];
sum += input[out_feature_i * channels + channels_i];
}
out[indices_i * channels + channels_i] = sum;
}
}
// brief: calculate the distance between start and end
__global__ void DistanceKernel(const int* start,
const int* end,
int* distance) {
if (threadIdx.x == 0) {
*distance = end - start;
}
}
// For the basic algorithm, refer to convolution_kernel.cc or
// the second paper.
// example:
// 1. the rulebook:
// the kernel_index: 0, 0, 0, 1, 1, 1, 2, 2, ....
// the out_index(key): 20, 30, 33, 30, 33, 20, 25
// 2. mark the index of out_index(value): 0, 1, 2, 3, 4, 5, 6, ....
// 3. sort the (key, value)
// 4. unique the (key, value):
// unique_key: 20, 25, 30, 33
// unique_values: 0, 2, 3, 5
// the index of unique_values is: 0, 1, 2, 3
// 5. update the out_index by unique_key, unique_value and the index of
// unique_value:
// the new out_index: 0, 2, 3, 2, 3, 0, 1
template <typename T, typename Context>
int ProductRuleBook(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const DDim& out_dims,
DenseTensor* rulebook,
DenseTensor* counter_per_kernel,
DenseTensor* offsets_per_kernel,
DenseTensor* out_index,
DenseTensor* unique_key,
DenseTensor* unique_value,
SparseCooTensor* out,
std::vector<int>* h_counter,
std::vector<int>* h_offsets) {
const auto& kernel_dims = kernel.dims();
const int64_t non_zero_num = x.nnz();
const auto& non_zero_indices = x.non_zero_indices();
const int* indices_ptr = non_zero_indices.data<int>();
dev_ctx.Alloc(counter_per_kernel,
counter_per_kernel->dtype(),
sizeof(int) * counter_per_kernel->numel());
int* counter_ptr = counter_per_kernel->data<int>();
dev_ctx.Alloc(offsets_per_kernel,
offsets_per_kernel->dtype(),
sizeof(int) * offsets_per_kernel->numel());
int* offsets_ptr = offsets_per_kernel->data<int>();
int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
rulebook->ResizeAndAllocate({2, kernel_size * non_zero_num});
dev_ctx.Alloc(rulebook, rulebook->dtype(), sizeof(int) * rulebook->numel());
int* rulebook_ptr = rulebook->data<int>();
const auto x_dims = x.dims();
Dims4D d_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]);
Dims4D d_kernel_dims(1, kernel_dims[2], kernel_dims[1], kernel_dims[0]);
Dims4D d_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]);
Dims4D d_paddings(1, paddings[2], paddings[1], paddings[0]);
Dims4D d_strides(1, strides[2], strides[1], strides[0]);
Dims4D d_dilations(1, dilations[2], dilations[1], dilations[0]);
// 1. product rule book
phi::funcs::SetConstant<Context, int> set_zero;
set_zero(dev_ctx, counter_per_kernel, 0);
auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1);
ProductRuleBookKernel<<<config.block_per_grid.x,
config.thread_per_block.x,
kernel_size * sizeof(int),
dev_ctx.stream()>>>(indices_ptr,
d_x_dims,
d_kernel_dims,
d_out_dims,
non_zero_num,
d_paddings,
d_dilations,
d_strides,
rulebook_ptr,
counter_ptr);
// 2. remove -1
#ifdef PADDLE_WITH_HIP
int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()),
#else
int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()),
#endif
rulebook_ptr,
rulebook_ptr + 2 * kernel_size * non_zero_num,
-1);
#ifdef PADDLE_WITH_HIP
thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()),
#else
thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()),
#endif
counter_ptr,
counter_ptr + kernel_size,
offsets_ptr);
#ifdef PADDLE_WITH_HIP
phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0],
counter_ptr,
kernel_size * sizeof(int),
hipMemcpyDeviceToHost,
dev_ctx.stream());
phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0],
offsets_ptr,
kernel_size * sizeof(int),
hipMemcpyDeviceToHost,
dev_ctx.stream());
#else
phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0],
counter_ptr,
kernel_size * sizeof(int),
cudaMemcpyDeviceToHost,
dev_ctx.stream());
phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0],
offsets_ptr,
kernel_size * sizeof(int),
cudaMemcpyDeviceToHost,
dev_ctx.stream());
#endif
dev_ctx.Wait();
int rulebook_len =
(*h_counter)[kernel_size - 1] + (*h_offsets)[kernel_size - 1];
  // 3. sort or merge the out index
out_index->ResizeAndAllocate({rulebook_len});
unique_value->ResizeAndAllocate({rulebook_len});
unique_key->ResizeAndAllocate({rulebook_len});
dev_ctx.Alloc(
out_index, out_index->dtype(), sizeof(int) * out_index->numel());
int* out_index_ptr = out_index->data<int>();
dev_ctx.Alloc(
unique_value, unique_value->dtype(), sizeof(int) * unique_value->numel());
int* unique_value_ptr = unique_value->data<int>();
dev_ctx.Alloc(
unique_key, unique_key->dtype(), sizeof(int) * unique_key->numel());
int* unique_key_ptr = unique_key->data<int>();
config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1);
InitByIndexKernel<<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(
rulebook_len, out_index_ptr, unique_value_ptr);
#ifdef PADDLE_WITH_HIP
phi::backends::gpu::GpuMemcpyAsync(unique_key_ptr,
rulebook_ptr + rulebook_len,
rulebook_len * sizeof(int),
hipMemcpyDeviceToDevice,
dev_ctx.stream());
#else
phi::backends::gpu::GpuMemcpyAsync(unique_key_ptr,
rulebook_ptr + rulebook_len,
rulebook_len * sizeof(int),
cudaMemcpyDeviceToDevice,
dev_ctx.stream());
#endif
  // Compared with thrust::sort_by_key, thrust::merge_by_key may achieve higher
  // performance, but thrust::merge_by_key is limited by the data size
#ifdef PADDLE_WITH_HIP
thrust::sort_by_key(thrust::hip::par.on(dev_ctx.stream()),
#else
thrust::sort_by_key(thrust::cuda::par.on(dev_ctx.stream()),
#endif
unique_key_ptr,
unique_key_ptr + rulebook_len,
out_index_ptr);
// 4. unique
thrust::pair<int*, int*> new_end =
#ifdef PADDLE_WITH_HIP
thrust::unique_by_key(thrust::hip::par.on(dev_ctx.stream()),
#else
thrust::unique_by_key(thrust::cuda::par.on(dev_ctx.stream()),
#endif
unique_key_ptr,
unique_key_ptr + rulebook_len,
unique_value_ptr);
// thrust::distance doesn't support stream parameters
// const int out_non_zero_num = thrust::distance(unique_key_ptr,
// new_end.first);
  // launch on the compute stream so it is ordered after unique_by_key above
  DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>(unique_key_ptr,
new_end.first,
rulebook_ptr + 2 * kernel_size * non_zero_num - 1);
int out_non_zero_num = 0;
#ifdef PADDLE_WITH_HIP
phi::backends::gpu::GpuMemcpyAsync(
&out_non_zero_num,
rulebook_ptr + 2 * kernel_size * non_zero_num - 1,
sizeof(int),
hipMemcpyDeviceToHost,
dev_ctx.stream());
#else
phi::backends::gpu::GpuMemcpyAsync(
&out_non_zero_num,
rulebook_ptr + 2 * kernel_size * non_zero_num - 1,
sizeof(int),
cudaMemcpyDeviceToHost,
dev_ctx.stream());
#endif
dev_ctx.Wait();
// 5. update out_indices and rulebook by unique_value_ptr
const int64_t sparse_dim = 4;
DenseTensorMeta indices_meta(
DataType::INT32, {sparse_dim, out_non_zero_num}, DataLayout::NCHW);
DenseTensorMeta values_meta(
x.dtype(), {out_non_zero_num, kernel_dims[4]}, x.layout());
phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta));
phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta));
dev_ctx.Alloc(
&out_indices, out_indices.dtype(), sizeof(int) * out_indices.numel());
int* out_indices_ptr = out_indices.data<int>();
config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1);
UpdateIndexKernel<<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(unique_key_ptr,
unique_value_ptr,
out_index_ptr,
out_non_zero_num,
rulebook_len,
d_out_dims,
out_indices_ptr,
rulebook_ptr + rulebook_len);
out->SetMember(out_indices, out_values, out_dims, true);
return rulebook_len;
}
/**
* x: (N, D, H, W, C)
* kernel: (D, H, W, C, OC)
* out: (N, D, H, W, OC)
**/
template <typename T, typename Context>
void Conv3dKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
SparseCooTensor* out,
DenseTensor* rulebook) {
// update padding and dilation
  // Currently, only x.layout == NDHWC and groups == 1 are supported
// if x.layout != NDHWC then transpose(x), transpose(weight)
const auto& x_dims = x.dims();
const auto& kernel_dims = kernel.dims();
int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
DDim out_dims = {1, 1, 1, 1, 1};
GetOutShape(x_dims, kernel_dims, paddings, dilations, strides, &out_dims);
out->set_dims(out_dims);
const int in_channels = kernel_dims[3];
const int out_channels = kernel_dims[4];
std::vector<int> offsets(kernel_size + 1), h_counter(kernel_size);
// Second algorithm:
// https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf
// 1. product rulebook
DenseTensorMeta counter_meta(
DataType::INT32, {kernel_size}, DataLayout::NCHW);
DenseTensorMeta offsets_meta(
DataType::INT32, {kernel_size}, DataLayout::NCHW);
DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta));
DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(offsets_meta));
DenseTensor out_index = phi::Empty<int, Context>(dev_ctx);
DenseTensor unique_key = phi::Empty<int, Context>(dev_ctx);
DenseTensor unique_value = phi::Empty<int, Context>(dev_ctx);
int n = ProductRuleBook<T, Context>(dev_ctx,
x,
kernel,
paddings,
dilations,
strides,
out_dims,
rulebook,
&counter_per_kernel,
&offsets_per_kernel,
&out_index,
&unique_key,
&unique_value,
out,
&h_counter,
&offsets);
const int* counter_ptr = counter_per_kernel.data<int>();
  const int* offsets_ptr = offsets_per_kernel.data<int>();
// 2. gather
DenseTensorMeta in_features_meta(
x.dtype(), {n, in_channels}, DataLayout::NCHW);
DenseTensorMeta out_features_meta(
x.dtype(), {n, out_channels}, DataLayout::NCHW);
phi::DenseTensor in_features =
phi::Empty(dev_ctx, std::move(in_features_meta));
phi::DenseTensor out_features =
phi::Empty(dev_ctx, std::move(out_features_meta));
dev_ctx.Alloc(
&in_features, in_features.dtype(), sizeof(T) * in_features.numel());
T* in_features_ptr = in_features.data<T>();
dev_ctx.Alloc(
&out_features, out_features.dtype(), sizeof(T) * out_features.numel());
T* out_features_ptr = out_features.data<T>();
auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1);
GatherKernel<T, int><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(x.non_zero_elements().data<T>(),
rulebook->data<int>(),
in_features_ptr,
n,
in_channels);
  // 3. call gemm for every weight
auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
auto* out_values = out->mutable_non_zero_elements();
dev_ctx.Alloc(
out_values, out_values->dtype(), sizeof(T) * out_values->numel());
T* out_values_ptr = out_values->data<T>();
const T* kernel_ptr = kernel.data<T>();
for (int i = 0; i < kernel_size; i++) {
if (h_counter[i] <= 0) {
continue;
}
// call gemm: (n, in_channels) * (in_channels, out_channels)
const int M = h_counter[i];
const int K = in_channels;
const int N = out_channels;
T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels;
const T* tmp_kernel_ptr = kernel_ptr + i * K * N;
T* tmp_out_ptr = out_features_ptr + offsets[i] * out_channels;
blas.GEMM(CblasNoTrans,
CblasNoTrans,
M,
N,
K,
static_cast<T>(1),
tmp_in_ptr,
tmp_kernel_ptr,
static_cast<T>(0),
tmp_out_ptr);
}
// 4. scatter
config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, out->nnz() * out_channels, 1);
ScatterKernel<T><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(out_features_ptr,
unique_value.data<int>(),
out_index.data<int>(),
out->nnz(),
n,
out_channels,
out_values_ptr);
}
} // namespace sparse
} // namespace phi
PD_REGISTER_KERNEL(sparse_conv3d,
GPU,
ALL_LAYOUT,
phi::sparse::Conv3dKernel,
float,
double,
phi::dtype::float16) {
kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
}
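Steps 3 to 5 of ProductRuleBook above (sort_by_key, unique_by_key, UpdateIndexKernel) deduplicate the flattened output coordinates and rewrite the rulebook so each entry points at a compacted output row. A host-side sketch of that remapping, using std:: algorithms instead of thrust:: and illustrative names; with the keys from the comment above, {20, 30, 33, 30, 33, 20, 25}, it reproduces the remapped out_index {0, 2, 3, 2, 3, 0, 1}:

// Host-side sketch of the sort / unique / update remapping (illustrative
// only). Keys are the flattened output coordinates, assumed non-negative.
#include <algorithm>
#include <numeric>
#include <vector>

std::vector<int> RemapOutIndex(const std::vector<int>& keys) {
  const int n = static_cast<int>(keys.size());
  std::vector<int> sorted_pos(n);  // positions into the rulebook
  std::iota(sorted_pos.begin(), sorted_pos.end(), 0);
  std::sort(sorted_pos.begin(), sorted_pos.end(),
            [&](int a, int b) { return keys[a] < keys[b]; });
  std::vector<int> remapped(n);
  int unique_id = -1;
  int prev_key = -1;  // sentinel; keys are non-negative
  for (int i = 0; i < n; ++i) {           // walk the keys in sorted order
    if (keys[sorted_pos[i]] != prev_key) {
      ++unique_id;                        // first occurrence of a new key
      prev_key = keys[sorted_pos[i]];
    }
    remapped[sorted_pos[i]] = unique_id;  // rulebook slot -> output row
  }
  return remapped;
}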
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature BatchNormOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("batch_norm",
{"X", "Scale", "Bias", "Mean", "Variance"},
{"momentum",
"epsilon",
"data_layout",
"is_test",
"use_global_stats",
"trainable_statistics",
"fuse_with_relu"},
{"Y",
"MeanOut",
"VarianceOut",
"SavedMean",
"SavedVariance",
"ReserveSpace"});
}
KernelSignature BatchNormGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"batch_norm_grad",
{GradVarName("Y"),
"X",
"Scale",
"Bias",
"SavedMean",
"SavedVariance",
"ReserveSpace",
"Mean",
"Variance"},
{"momentum",
"epsilon",
"data_layout",
"is_test",
"use_global_stats",
"trainable_statistics",
"fuse_with_relu"},
{GradVarName("X"), GradVarName("Scale"), GradVarName("Bias")});
}
KernelSignature BatchNormGradGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("batch_norm_grad_grad",
{"DDX",
"DDScale",
"DDBias",
"DY",
"X",
"Scale",
"SavedMean",
"SavedVariance",
"Mean",
"Variance"},
{"momentum",
"epsilon",
"data_layout",
"is_test",
"use_global_stats",
"trainable_statistics",
"fuse_with_relu"},
{"DX", "DScale", "DDY"});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(batch_norm, phi::BatchNormOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(batch_norm_grad,
phi::BatchNormGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(batch_norm_grad_grad,
phi::BatchNormGradGradOpArgumentMapping);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature GaussianRandomOpArgumentMapping(
const ArgumentMappingContext& ctx) {
if (ctx.InputSize("ShapeTensorList") > 0) {
return KernelSignature("gaussian_random",
{},
{"ShapeTensorList", "mean", "std", "seed", "dtype"},
{"Out"});
}
const auto& shape = paddle::any_cast<std::vector<int64_t>>(ctx.Attr("shape"));
if (ctx.HasInput("ShapeTensor") && shape.empty()) {
return KernelSignature("gaussian_random",
{},
{"ShapeTensor", "mean", "std", "seed", "dtype"},
{"Out"});
}
return KernelSignature("gaussian_random",
{},
{"shape", "mean", "std", "seed", "dtype"},
{"Out"});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(gaussian_random,
phi::GaussianRandomOpArgumentMapping);
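The mapping above prefers runtime shape inputs over the static attribute. A condensed sketch of that dispatch order (illustrative only, not a Paddle API):

// A non-empty ShapeTensorList wins, then ShapeTensor when the static
// "shape" attribute is empty, otherwise the attribute itself.
const char* ShapeArgName(bool has_shape_tensor_list,
                         bool has_shape_tensor,
                         bool shape_attr_empty) {
  if (has_shape_tensor_list) return "ShapeTensorList";
  if (has_shape_tensor && shape_attr_empty) return "ShapeTensor";
  return "shape";
}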
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature PadGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("pad_grad",
{GradVarName("Out")},
{"paddings", "pad_value"},
{GradVarName("X")});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(pad_grad, phi::PadGradOpArgumentMapping);
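The *_sig.cc files above all follow the same pattern: list the fluid op's inputs, attributes, and outputs in exactly the order the phi kernel expects, then register the mapping function. A minimal sketch for a hypothetical op "my_scale" (not a real Paddle op):

// Minimal argument-mapping sketch for a hypothetical "my_scale" op.
#include "paddle/phi/core/compat/op_utils.h"

namespace phi {

KernelSignature MyScaleOpArgumentMapping(const ArgumentMappingContext& ctx) {
  return KernelSignature("my_scale", {"X"}, {"scale", "bias"}, {"Out"});
}

}  // namespace phi

PD_REGISTER_ARG_MAPPING_FN(my_scale, MyScaleOpArgumentMapping);

For grad ops, GradVarName("Out") resolves to the gradient variable paired with "Out" (conventionally the name with an "@GRAD" suffix), which is how pad_grad above maps the incoming gradient of "Out" to the outgoing gradient of "X".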
...@@ -25,3 +25,4 @@ cc_test(test_concat_api SRCS test_concat_api.cc DEPS phi_tensor phi_api phi_api_ ...@@ -25,3 +25,4 @@ cc_test(test_concat_api SRCS test_concat_api.cc DEPS phi_tensor phi_api phi_api_
cc_test(test_split_api SRCS test_split_api.cc DEPS phi_tensor phi_api phi_api_utils) cc_test(test_split_api SRCS test_split_api.cc DEPS phi_tensor phi_api phi_api_utils)
cc_test(test_data_transform SRCS test_data_transform.cc DEPS phi_tensor phi_api phi_api_utils) cc_test(test_data_transform SRCS test_data_transform.cc DEPS phi_tensor phi_api phi_api_utils)
cc_test(test_sparse_utils_api SRCS test_sparse_utils_api.cc DEPS phi_tensor phi_api phi_api_utils) cc_test(test_sparse_utils_api SRCS test_sparse_utils_api.cc DEPS phi_tensor phi_api phi_api_utils)
cc_test(test_sparse_conv_api SRCS test_sparse_conv_api.cc DEPS phi_tensor phi_api phi_api_utils)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See
the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <memory>
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/api/include/sparse_api.h"
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
template <typename T>
void TestConv3dBase(const std::vector<int>& indices,
const std::vector<T>& features,
const phi::DDim& x_dims,
const std::vector<T>& kernel,
const phi::DDim& kernel_dims,
const std::vector<int>& correct_out_indices,
const std::vector<T>& correct_out_features,
const phi::DDim& correct_out_dims,
const int non_zero_num,
const std::vector<int>& paddings,
const std::vector<int>& strides,
const std::vector<int>& dilations,
const float diff = 1e-3) {
const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace());
const int in_channels = kernel_dims[3];
const int out_channels = kernel_dims[4];
phi::DenseTensor indices_tensor(
alloc.get(),
phi::DenseTensorMeta(
phi::DataType::INT32, {4, non_zero_num}, phi::DataLayout::NCHW));
memcpy(
indices_tensor.data<int>(), indices.data(), indices.size() * sizeof(int));
phi::DenseTensor features_tensor(
alloc.get(),
phi::DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
{non_zero_num, in_channels},
phi::DataLayout::NHWC));
memcpy(
features_tensor.data<T>(), features.data(), features.size() * sizeof(T));
auto x_tensor = std::make_shared<phi::SparseCooTensor>(
indices_tensor, features_tensor, x_dims);
paddle::experimental::Tensor x(x_tensor);
auto kernel_tensor = std::make_shared<phi::DenseTensor>(
alloc.get(),
phi::DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
kernel_dims,
phi::DataLayout::NHWC));
paddle::experimental::Tensor weight(kernel_tensor);
memcpy(kernel_tensor->mutable_data<T>(paddle::platform::CPUPlace()),
kernel.data(),
kernel.size() * sizeof(T));
if (!std::is_same<T, phi::dtype::float16>::value) {
auto outs = paddle::experimental::sparse::conv3d(
x, weight, paddings, dilations, strides, 1);
auto out = std::dynamic_pointer_cast<phi::SparseCooTensor>(
std::get<0>(outs).impl());
ASSERT_EQ(correct_out_dims.size(), out->dims().size());
for (int i = 0; i < correct_out_dims.size(); i++) {
ASSERT_EQ(correct_out_dims[i], out->dims()[i]);
}
ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, out->nnz());
int cmp_indices = memcmp(correct_out_indices.data(),
out->non_zero_indices().data<int>(),
correct_out_indices.size() * sizeof(int));
ASSERT_EQ(cmp_indices, 0);
for (uint64_t i = 0; i < correct_out_features.size(); i++) {
float tmp = std::fabs(static_cast<float>(
correct_out_features[i] - out->non_zero_elements().data<T>()[i]));
ASSERT_LT(tmp, diff);
}
}
}
void TestConv3d(const std::vector<int>& indices,
const std::vector<float>& features,
const phi::DDim& x_dims,
const std::vector<float>& kernel,
const phi::DDim& kernel_dims,
const std::vector<int>& correct_out_indices,
const std::vector<float>& correct_out_features,
const phi::DDim& correct_out_dims,
const int non_zero_num,
const std::vector<int>& paddings,
const std::vector<int>& strides,
const std::vector<int>& dilations) {
// test float
TestConv3dBase<float>(indices,
features,
x_dims,
kernel,
kernel_dims,
correct_out_indices,
correct_out_features,
correct_out_dims,
non_zero_num,
paddings,
strides,
dilations);
}
TEST(API, sparse_conv2d) {
const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace());
const int in_channels = 1;
const int out_channels = 1;
phi::DDim x_dims = {1, 1, 5, 5, in_channels};
phi::DDim kernel_dims = {1, 3, 3, in_channels, out_channels};
phi::DDim out_dims = {1, 1, 3, 3, out_channels};
std::vector<int> paddings = {0, 0, 0};
std::vector<int> strides = {1, 1, 1};
std::vector<int> dilations = {1, 1, 1};
const int non_zero_num = 3;
std::vector<int> indices_flatten = {0, 0, 0, 0, 0, 0, 0, 4, 0, 3, 2, 4};
std::vector<float> features = {-0.79394531, -0.3125, -0.55029297};
  // 1 * 3 * 3 = 9 kernel weights
std::vector<float> kernel = {0.65820312,
0.75048828,
0.21411133,
0.17370605,
0.85546875,
0.53076172,
0.28833008,
0.71044922,
0.00659943};
std::vector<int> out_indices_flatten = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 2, 2, 2, 1, 2, 0, 1, 2};
std::vector<float> out_features = {
-0.17004, -0.71338, -0.00206, -0.22205, -0.09009};
TestConv3d(indices_flatten,
features,
x_dims,
kernel,
kernel_dims,
out_indices_flatten,
out_features,
out_dims,
non_zero_num,
paddings,
strides,
dilations);
}
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <memory> #include <memory>
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/place.h" #include "paddle/phi/common/place.h"
#include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" #include "paddle/phi/kernels/sparse/convolution_grad_kernel.h"
...@@ -151,6 +152,107 @@ void TestConv3dBase(const std::vector<int>& indices, ...@@ -151,6 +152,107 @@ void TestConv3dBase(const std::vector<int>& indices,
f_verify(grads[1].data<T>(), kernel_grad); f_verify(grads[1].data<T>(), kernel_grad);
} }
} }
// test gpu
#if defined(PADDLE_WITH_CUDA)
phi::GPUContext dev_ctx_gpu;
dev_ctx_gpu.PartialInitWithoutAllocator();
dev_ctx_gpu.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(dev_ctx_gpu.GetPlace(), dev_ctx_gpu.stream())
.get());
dev_ctx_gpu.SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
.get());
dev_ctx_gpu.PartialInitWithAllocator();
DenseTensor d_indices_tensor = phi::Empty(
dev_ctx_gpu,
DenseTensorMeta(DataType::INT32, {4, non_zero_num}, DataLayout::NCHW));
dev_ctx_gpu.Alloc(&d_indices_tensor,
d_indices_tensor.dtype(),
sizeof(int) * d_indices_tensor.numel());
phi::Copy(
dev_ctx_gpu, indices_tensor, phi::GPUPlace(), true, &d_indices_tensor);
DenseTensor d_features_tensor = phi::Empty(
dev_ctx_gpu,
DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
{non_zero_num, in_channels},
DataLayout::NHWC));
dev_ctx_gpu.Alloc(&d_features_tensor,
d_features_tensor.dtype(),
sizeof(T) * d_features_tensor.numel());
phi::Copy(
dev_ctx_gpu, features_tensor, phi::GPUPlace(), true, &d_features_tensor);
SparseCooTensor d_x_tensor(d_indices_tensor, d_features_tensor, x_dims);
DenseTensor d_kernel_tensor = phi::Empty(
dev_ctx_gpu,
DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
kernel_dims,
DataLayout::NHWC));
dev_ctx_gpu.Alloc(&d_kernel_tensor,
d_kernel_tensor.dtype(),
sizeof(T) * d_kernel_tensor.numel());
phi::Copy(
dev_ctx_gpu, kernel_tensor, phi::GPUPlace(), true, &d_kernel_tensor);
DenseTensor d_rulebook = phi::Empty<int, phi::GPUContext>(dev_ctx_gpu);
SparseCooTensor d_out = sparse::Conv3d<T>(dev_ctx_gpu,
d_x_tensor,
d_kernel_tensor,
paddings,
dilations,
strides,
1,
&d_rulebook);
ASSERT_EQ(correct_out_dims.size(), d_out.dims().size());
ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, d_out.nnz());
for (int i = 0; i < correct_out_dims.size(); i++) {
ASSERT_EQ(correct_out_dims[i], d_out.dims()[i]);
}
DenseTensor h_indices_tensor = phi::Empty(
dev_ctx_cpu,
DenseTensorMeta(DataType::INT32, {4, d_out.nnz()}, DataLayout::NCHW));
dev_ctx_cpu.Alloc(&h_indices_tensor,
h_indices_tensor.dtype(),
sizeof(int) * h_indices_tensor.numel());
phi::Copy(dev_ctx_gpu,
d_out.non_zero_indices(),
phi::CPUPlace(),
true,
&h_indices_tensor);
int cmp_indices2 = memcmp(correct_out_indices.data(),
h_indices_tensor.data<int>(),
correct_out_indices.size() * sizeof(int));
ASSERT_EQ(cmp_indices2, 0);
DenseTensor h_features_tensor = phi::Empty(
dev_ctx_cpu,
DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
{d_out.nnz()},
d_out.layout()));
dev_ctx_cpu.Alloc(&h_features_tensor,
h_features_tensor.dtype(),
sizeof(T) * h_features_tensor.numel());
phi::Copy(dev_ctx_gpu,
d_out.non_zero_elements(),
phi::CPUPlace(),
true,
&h_features_tensor);
for (uint64_t i = 0; i < correct_out_features.size(); i++) {
float tmp = std::fabs(static_cast<float>(correct_out_features[i] -
h_features_tensor.data<T>()[i]));
ASSERT_LT(tmp, diff);
}
#endif
} }
void TestConv3d(const std::vector<int>& indices, void TestConv3d(const std::vector<int>& indices,
......
...@@ -1430,6 +1430,22 @@ class Fleet(object): ...@@ -1430,6 +1430,22 @@ class Fleet(object):
# cache original feed forward program # cache original feed forward program
self.origin_main_program = loss.block.program self.origin_main_program = loss.block.program
# add distributed attr
if not hasattr(self.origin_main_program, "distributed_info_"):
setattr(self.origin_main_program, "distributed_info_", dict())
self.origin_main_program.distributed_info_[
"dp_degree"] = self._user_defined_strategy.sharding_configs[
"dp_degree"]
self.origin_main_program.distributed_info_[
"mp_degree"] = self._user_defined_strategy.sharding_configs[
"mp_degree"]
self.origin_main_program.distributed_info_[
"pp_degree"] = self._user_defined_strategy.sharding_configs[
"pp_degree"]
self.origin_main_program.distributed_info_[
"sharding_degree"] = self._user_defined_strategy.sharding_configs[
"sharding_degree"]
context["origin_main_program"] = self.origin_main_program context["origin_main_program"] = self.origin_main_program
context["loss"] = loss context["loss"] = loss
if startup_program == None: if startup_program == None:
......
...@@ -351,10 +351,10 @@ endif() ...@@ -351,10 +351,10 @@ endif()
set_tests_properties(test_graph PROPERTIES TIMEOUT 120) set_tests_properties(test_graph PROPERTIES TIMEOUT 120)
set_tests_properties(test_quantization_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_quantization_pass PROPERTIES TIMEOUT 120)
set_tests_properties(test_imperative_qat_channelwise PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_qat_channelwise PROPERTIES TIMEOUT 200)
set_tests_properties(test_user_defined_quantization PROPERTIES TIMEOUT 120) set_tests_properties(test_user_defined_quantization PROPERTIES TIMEOUT 200)
set_tests_properties(test_imperative_qat PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_qat PROPERTIES TIMEOUT 200)
set_tests_properties(test_imperative_out_scale PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_out_scale PROPERTIES TIMEOUT 200)
if(LINUX AND WITH_MKLDNN) if(LINUX AND WITH_MKLDNN)
set_tests_properties(test_quant2_int8_mobilenetv1_mkldnn PROPERTIES TIMEOUT 120) set_tests_properties(test_quant2_int8_mobilenetv1_mkldnn PROPERTIES TIMEOUT 120)
set_tests_properties(convert_model2dot_ernie PROPERTIES TIMEOUT 120) set_tests_properties(convert_model2dot_ernie PROPERTIES TIMEOUT 120)
......
...@@ -26,7 +26,7 @@ import paddle.fluid as fluid ...@@ -26,7 +26,7 @@ import paddle.fluid as fluid
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.optimizer import AdamOptimizer from paddle.fluid.optimizer import AdamOptimizer
from paddle.fluid.framework import IrGraph from paddle.fluid.framework import IrGraph, _test_eager_guard
from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware
from paddle.fluid.dygraph.container import Sequential from paddle.fluid.dygraph.container import Sequential
from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
...@@ -122,7 +122,7 @@ class ImperativeLenet(fluid.dygraph.Layer): ...@@ -122,7 +122,7 @@ class ImperativeLenet(fluid.dygraph.Layer):
class TestImperativeOutSclae(unittest.TestCase): class TestImperativeOutSclae(unittest.TestCase):
def test_out_scale_acc(self): def func_out_scale_acc(self):
seed = 1000 seed = 1000
lr = 0.001 lr = 0.001
...@@ -166,9 +166,14 @@ class TestImperativeOutSclae(unittest.TestCase): ...@@ -166,9 +166,14 @@ class TestImperativeOutSclae(unittest.TestCase):
loss_list[i] > loss_list[i + 1], loss_list[i] > loss_list[i + 1],
msg='Failed to do the imperative qat.') msg='Failed to do the imperative qat.')
def test_out_scale_acc(self):
with _test_eager_guard():
self.func_out_scale_acc()
self.func_out_scale_acc()
class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase): class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase):
def test_save_quantized_model(self): def func_save_quantized_model(self):
lr = 0.001 lr = 0.001
load_param_path = "test_save_quantized_model/lenet.pdparams" load_param_path = "test_save_quantized_model/lenet.pdparams"
...@@ -206,6 +211,11 @@ class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase): ...@@ -206,6 +211,11 @@ class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase):
loss_list[i] > loss_list[i + 1], loss_list[i] > loss_list[i + 1],
msg='Failed to do the imperative qat.') msg='Failed to do the imperative qat.')
def test_save_quantized_model(self):
with _test_eager_guard():
self.func_save_quantized_model()
self.func_save_quantized_model()
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
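The func_* / test_* split introduced here (and repeated in the quantization tests below) runs every case twice: once inside the eager guard and once in the default dygraph mode. A minimal standalone sketch of the pattern, assuming only _test_eager_guard:

import unittest
from paddle.fluid.framework import _test_eager_guard

class ExamplePattern(unittest.TestCase):
    def func_case(self):
        self.assertTrue(True)       # the real assertions live here

    def test_case(self):
        with _test_eager_guard():   # first pass: eager mode
            self.func_case()
        self.func_case()            # second pass: legacy dygraph mode

if __name__ == '__main__':
    unittest.main()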
...@@ -29,6 +29,7 @@ import paddle.fluid as fluid ...@@ -29,6 +29,7 @@ import paddle.fluid as fluid
from paddle.fluid.contrib.slim.quantization import * from paddle.fluid.contrib.slim.quantization import *
from paddle.fluid.log_helper import get_logger from paddle.fluid.log_helper import get_logger
from paddle.dataset.common import download from paddle.dataset.common import download
from paddle.fluid.framework import _test_eager_guard
from imperative_test_utils import fix_model_dict, ImperativeLenet, ImperativeLinearBn from imperative_test_utils import fix_model_dict, ImperativeLenet, ImperativeLinearBn
from imperative_test_utils import ImperativeLinearBn_hook from imperative_test_utils import ImperativeLinearBn_hook
...@@ -194,7 +195,7 @@ class TestImperativePTQ(unittest.TestCase): ...@@ -194,7 +195,7 @@ class TestImperativePTQ(unittest.TestCase):
break break
return top1_correct_num / total_num return top1_correct_num / total_num
def test_ptq(self): def func_ptq(self):
start_time = time.time() start_time = time.time()
self.set_vars() self.set_vars()
...@@ -244,9 +245,14 @@ class TestImperativePTQ(unittest.TestCase): ...@@ -244,9 +245,14 @@ class TestImperativePTQ(unittest.TestCase):
end_time = time.time() end_time = time.time()
print("total time: %ss \n" % (end_time - start_time)) print("total time: %ss \n" % (end_time - start_time))
def test_ptq(self):
with _test_eager_guard():
self.func_ptq()
self.func_ptq()
class TestImperativePTQfuse(TestImperativePTQ): class TestImperativePTQfuse(TestImperativePTQ):
def test_ptq(self): def func_ptq(self):
start_time = time.time() start_time = time.time()
self.set_vars() self.set_vars()
...@@ -305,6 +311,11 @@ class TestImperativePTQfuse(TestImperativePTQ): ...@@ -305,6 +311,11 @@ class TestImperativePTQfuse(TestImperativePTQ):
end_time = time.time() end_time = time.time()
print("total time: %ss \n" % (end_time - start_time)) print("total time: %ss \n" % (end_time - start_time))
def test_ptq(self):
with _test_eager_guard():
self.func_ptq()
self.func_ptq()
class TestImperativePTQHist(TestImperativePTQ): class TestImperativePTQHist(TestImperativePTQ):
def set_vars(self): def set_vars(self):
......
...@@ -32,7 +32,7 @@ from paddle.nn import Linear, Conv2D, Softmax, Conv2DTranspose ...@@ -32,7 +32,7 @@ from paddle.nn import Linear, Conv2D, Softmax, Conv2DTranspose
from paddle.fluid.log_helper import get_logger from paddle.fluid.log_helper import get_logger
from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
from paddle.nn.quant.quant_layers import QuantizedConv2D, QuantizedConv2DTranspose from paddle.nn.quant.quant_layers import QuantizedConv2D, QuantizedConv2DTranspose
from paddle.fluid.framework import _test_eager_guard
from imperative_test_utils import fix_model_dict, ImperativeLenet from imperative_test_utils import fix_model_dict, ImperativeLenet
paddle.enable_static() paddle.enable_static()
...@@ -55,7 +55,7 @@ class TestImperativeQat(unittest.TestCase): ...@@ -55,7 +55,7 @@ class TestImperativeQat(unittest.TestCase):
self.activation_quantize_type = 'moving_average_abs_max' self.activation_quantize_type = 'moving_average_abs_max'
print('weight_quantize_type', self.weight_quantize_type) print('weight_quantize_type', self.weight_quantize_type)
def test_qat(self): def func_qat(self):
self.set_vars() self.set_vars()
imperative_qat = ImperativeQuantAware( imperative_qat = ImperativeQuantAware(
...@@ -193,6 +193,11 @@ class TestImperativeQat(unittest.TestCase): ...@@ -193,6 +193,11 @@ class TestImperativeQat(unittest.TestCase):
np.allclose(after_save, before_save.numpy()), np.allclose(after_save, before_save.numpy()),
msg='Failed to save the inference quantized model.') msg='Failed to save the inference quantized model.')
def test_qat(self):
with _test_eager_guard():
self.func_qat()
self.func_qat()
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -27,7 +27,7 @@ import paddle.fluid as fluid ...@@ -27,7 +27,7 @@ import paddle.fluid as fluid
from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware
from paddle.fluid.log_helper import get_logger from paddle.fluid.log_helper import get_logger
from paddle.dataset.common import download from paddle.dataset.common import download
from paddle.fluid.framework import _test_eager_guard
from imperative_test_utils import fix_model_dict, ImperativeLenet from imperative_test_utils import fix_model_dict, ImperativeLenet
os.environ["CPU_NUM"] = "1" os.environ["CPU_NUM"] = "1"
......
...@@ -30,7 +30,7 @@ from paddle.fluid.dygraph import Pool2D ...@@ -30,7 +30,7 @@ from paddle.fluid.dygraph import Pool2D
from paddle.fluid.dygraph import Linear from paddle.fluid.dygraph import Linear
from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose
from paddle.fluid.log_helper import get_logger from paddle.fluid.log_helper import get_logger
from paddle.fluid.framework import _test_eager_guard
os.environ["CPU_NUM"] = "1" os.environ["CPU_NUM"] = "1"
_logger = get_logger( _logger = get_logger(
...@@ -157,7 +157,7 @@ class TestUserDefinedActPreprocess(unittest.TestCase): ...@@ -157,7 +157,7 @@ class TestUserDefinedActPreprocess(unittest.TestCase):
_logger.info("test act_preprocess") _logger.info("test act_preprocess")
self.imperative_qat = ImperativeQuantAware(act_preprocess_layer=PACT) self.imperative_qat = ImperativeQuantAware(act_preprocess_layer=PACT)
def test_quant_aware_training(self): def func_quant_aware_training(self):
imperative_qat = self.imperative_qat imperative_qat = self.imperative_qat
seed = 1 seed = 1
np.random.seed(seed) np.random.seed(seed)
...@@ -243,6 +243,11 @@ class TestUserDefinedActPreprocess(unittest.TestCase): ...@@ -243,6 +243,11 @@ class TestUserDefinedActPreprocess(unittest.TestCase):
train(lenet) train(lenet)
test(lenet) test(lenet)
def test_quant_aware_training(self):
with _test_eager_guard():
self.func_quant_aware_training()
self.func_quant_aware_training()
class TestUserDefinedWeightPreprocess(TestUserDefinedActPreprocess): class TestUserDefinedWeightPreprocess(TestUserDefinedActPreprocess):
def setUp(self): def setUp(self):
......
...@@ -32,6 +32,7 @@ from paddle.fluid.dygraph.nn import Pool2D ...@@ -32,6 +32,7 @@ from paddle.fluid.dygraph.nn import Pool2D
from paddle.fluid.log_helper import get_logger from paddle.fluid.log_helper import get_logger
from imperative_test_utils import fix_model_dict, train_lenet, ImperativeLenetWithSkipQuant from imperative_test_utils import fix_model_dict, train_lenet, ImperativeLenetWithSkipQuant
from paddle.fluid.framework import _test_eager_guard
os.environ["CPU_NUM"] = "1" os.environ["CPU_NUM"] = "1"
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
...@@ -42,7 +43,8 @@ _logger = get_logger( ...@@ -42,7 +43,8 @@ _logger = get_logger(
class TestImperativeOutSclae(unittest.TestCase): class TestImperativeOutSclae(unittest.TestCase):
def test_out_scale_acc(self): def func_out_scale_acc(self):
paddle.disable_static()
seed = 1000 seed = 1000
lr = 0.1 lr = 0.1
...@@ -125,6 +127,11 @@ class TestImperativeOutSclae(unittest.TestCase): ...@@ -125,6 +127,11 @@ class TestImperativeOutSclae(unittest.TestCase):
if find_matmul: if find_matmul:
self.assertTrue(matmul_skip_count == 1) self.assertTrue(matmul_skip_count == 1)
def test_out_scale_acc(self):
with _test_eager_guard():
self.func_out_scale_acc()
self.func_out_scale_acc()
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -155,8 +155,7 @@ def prune_model(main_program=None, ...@@ -155,8 +155,7 @@ def prune_model(main_program=None,
n=2, n=2,
m=4, m=4,
mask_algo='mask_1d', mask_algo='mask_1d',
with_mask=True, with_mask=True):
sharding=False):
r""" r"""
Pruning parameters of supported layers in :attr:`main_program` via Pruning parameters of supported layers in :attr:`main_program` via
specified mask generation function given by :attr:`mask_algo`. This specified mask generation function given by :attr:`mask_algo`. This
...@@ -179,7 +178,6 @@ def prune_model(main_program=None, ...@@ -179,7 +178,6 @@ def prune_model(main_program=None,
mask_algo (string, optional): The function name to generate sparse mask. Default is `mask_1d`. mask_algo (string, optional): The function name to generate sparse mask. Default is `mask_1d`.
The valid inputs should be one of 'mask_1d', 'mask_2d_greedy' and 'mask_2d_best'. The valid inputs should be one of 'mask_1d', 'mask_2d_greedy' and 'mask_2d_best'.
with_mask (bool, optional): Whether to also prune the mask Variables related to the parameters. True means they are pruned as well, False means they are not. Default is True. with_mask (bool, optional): Whether to also prune the mask Variables related to the parameters. True means they are pruned as well, False means they are not. Default is True.
sharding (bool, optional): Whether to turn on sharding (model parallel) during training. Please consider turning it ON when encountering OOM using sharding. Default is False.
Returns: Returns:
dictionary: A dictionary with key: `parameter name` (string) and value: its corresponding mask Variable. dictionary: A dictionary with key: `parameter name` (string) and value: its corresponding mask Variable.
Examples: Examples:
...@@ -221,7 +219,10 @@ def prune_model(main_program=None, ...@@ -221,7 +219,10 @@ def prune_model(main_program=None,
# Must call `exe.run(startup_program)` first before calling `sparsity.prune_model` # Must call `exe.run(startup_program)` first before calling `sparsity.prune_model`
sparsity.prune_model(main_program, mask_algo='mask_2d_best') sparsity.prune_model(main_program, mask_algo='mask_2d_best')
""" """
if sharding: if main_program is not None and hasattr(
main_program,
"distributed_info_") and main_program.distributed_info_[
"sharding_degree"] > 1 and paddle.fluid.is_compiled_with_cuda():
gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
place = paddle.CUDAPlace(gpu_id) place = paddle.CUDAPlace(gpu_id)
else: else:
......
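prune_model no longer takes a `sharding` argument; whether to pick the sharded GPU place is derived from the metadata recorded on the program. A hedged sketch of the equivalent predicate (the non-sharding branch is elided in the hunk above, so only the condition is mirrored here):

import os
import paddle

def use_sharded_gpu_place(main_program):
    # mirrors the new check: sharding degree recorded on the program and a CUDA build
    info = getattr(main_program, "distributed_info_", None)
    return (info is not None and info.get("sharding_degree", 0) > 1
            and paddle.fluid.is_compiled_with_cuda())

# callers simply drop the old flag, e.g. sparsity.prune_model(train_prog);
# when the predicate holds, pruning runs on
# paddle.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0)))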
...@@ -99,18 +99,19 @@ def param_guard(parameters): ...@@ -99,18 +99,19 @@ def param_guard(parameters):
yield yield
def _convert_into_variable(var_base): def _convert_into_variable(tensor):
""" """
Convert Varbase into Variable. Convert Varbase into Variable.
""" """
if isinstance(var_base, core.VarBase): if isinstance(tensor, (core.eager.Tensor, core.VarBase)):
# Check whether has been created before. # Check whether has been created before.
new_var = var_base.block._find_var_recursive(var_base.name) new_var = tensor.block._find_var_recursive(tensor.name)
if new_var is not None: if new_var is not None:
assert isinstance(new_var, framework.Variable) assert isinstance(new_var, framework.Variable)
# Convert ParamBase into Parameter with same attributes in dy2stat. # Convert ParamBase into Parameter with same attributes in dy2stat.
elif isinstance(var_base, framework.ParamBase): elif isinstance(tensor,
new_var = var_base._to_static_var(to_parameter=True) (framework.EagerParamBase, framework.ParamBase)):
new_var = tensor._to_static_var(to_parameter=True)
else: else:
# Note(Aurelius84): Convert VarBase in self._buffers into Variable with # Note(Aurelius84): Convert VarBase in self._buffers into Variable with
# same attributes and set persistable=True to allow saving this var. # same attributes and set persistable=True to allow saving this var.
...@@ -120,13 +121,13 @@ def _convert_into_variable(var_base): ...@@ -120,13 +121,13 @@ def _convert_into_variable(var_base):
# But if its shape is empty while created from `create_variable()`, we consider this buffer # But if its shape is empty while created from `create_variable()`, we consider this buffer
# non-persistable. See case of `drop_state` in lstm api. # non-persistable. See case of `drop_state` in lstm api.
is_persistable = len(var_base.shape) > 0 is_persistable = len(tensor.shape) > 0
new_var = var_base._to_static_var( new_var = tensor._to_static_var(
to_parameter=False, persistable=is_persistable) to_parameter=False, persistable=is_persistable)
return new_var return new_var
else: else:
return var_base return tensor
def enabled(): def enabled():
......
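The same widening of isinstance checks recurs throughout this commit: wherever the legacy VarBase or ParamBase was accepted, the eager Tensor and EagerParamBase are now accepted as well. A hedged sketch of the pattern (the eager types are only present in builds that ship the eager mode):

from paddle.fluid import core, framework

# accept both the legacy dygraph types and their eager counterparts
TENSOR_TYPES = (core.VarBase, core.eager.Tensor)
PARAM_TYPES = (framework.ParamBase, framework.EagerParamBase)

def is_dygraph_tensor(obj):
    return isinstance(obj, TENSOR_TYPES)

def is_dygraph_param(obj):
    return isinstance(obj, PARAM_TYPES)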
...@@ -61,7 +61,8 @@ class NestSequence(object): ...@@ -61,7 +61,8 @@ class NestSequence(object):
def _get_var_ids(self): def _get_var_ids(self):
var_ids = [] var_ids = []
for idx, var in enumerate(self.__input_list): for idx, var in enumerate(self.__input_list):
if isinstance(var, (framework.Variable, core.VarBase)): if isinstance(var, (framework.Variable, core.VarBase,
core.eager.Tensor)):
var_ids.append(idx) var_ids.append(idx)
return var_ids return var_ids
...@@ -73,7 +74,8 @@ class NestSequence(object): ...@@ -73,7 +74,8 @@ class NestSequence(object):
if need_check: if need_check:
warning_types = set() warning_types = set()
for var in self.__input_list: for var in self.__input_list:
if not isinstance(var, (framework.Variable, core.VarBase)): if not isinstance(var, (framework.Variable, core.VarBase,
core.eager.Tensor)):
warning_types.add(type(var)) warning_types.add(type(var))
if warning_types: if warning_types:
logging_utils.warn( logging_utils.warn(
...@@ -301,10 +303,17 @@ class PartialProgramLayer: ...@@ -301,10 +303,17 @@ class PartialProgramLayer:
for name in block.vars: for name in block.vars:
if "@GRAD" in name: if "@GRAD" in name:
var_desc = block.vars[name].desc var_desc = block.vars[name].desc
var_base = None
if not core._in_eager_mode():
var_base = core.VarBase(var_desc.dtype(), var_base = core.VarBase(var_desc.dtype(),
var_desc.shape(), var_desc.shape(),
var_desc.name(), var_desc.name(),
var_desc.type(), False) var_desc.type(), False)
else:
var_base = core.eager.Tensor(var_desc.dtype(),
var_desc.shape(),
var_desc.name(),
var_desc.type(), False)
double_grads.append(var_base) double_grads.append(var_base)
return self._valid_vars(double_grads) return self._valid_vars(double_grads)
...@@ -386,13 +395,22 @@ class PartialProgramLayer: ...@@ -386,13 +395,22 @@ class PartialProgramLayer:
expected_place = framework._current_expected_place() expected_place = framework._current_expected_place()
for i, value in enumerate(flatten_inputs): for i, value in enumerate(flatten_inputs):
if isinstance(value, np.ndarray): if isinstance(value, np.ndarray):
var = None
if not core._in_eager_mode():
var = core.VarBase( var = core.VarBase(
value=value, value=value,
name=self._inputs[i].desc.name(), name=self._inputs[i].desc.name(),
persistable=False, persistable=False,
place=expected_place, place=expected_place,
zero_copy=True) zero_copy=True)
elif isinstance(value, core.VarBase): else:
var = core.eager.Tensor(
value=value,
name=self._inputs[i].desc.name(),
persistable=False,
place=expected_place,
zero_copy=True)
elif isinstance(value, (core.VarBase, core.eager.Tensor)):
# NOTE(Aurelius84): If var is on CPUPlace, it will be transformed multi times # NOTE(Aurelius84): If var is on CPUPlace, it will be transformed multi times
# into CUDAPlace when it's as input of multi Ops. so we move it in advance # into CUDAPlace when it's as input of multi Ops. so we move it in advance
# to avoid this problem. # to avoid this problem.
...@@ -411,9 +429,16 @@ class PartialProgramLayer: ...@@ -411,9 +429,16 @@ class PartialProgramLayer:
var = self._outputs[var_id] var = self._outputs[var_id]
assert isinstance(var, framework.Variable) assert isinstance(var, framework.Variable)
var_desc = var.desc var_desc = var.desc
var_base = None
if not core._in_eager_mode():
var_base = core.VarBase(var_desc.dtype(), var_base = core.VarBase(var_desc.dtype(),
var_desc.shape(), var_desc.shape(),
var_desc.name(), var_desc.type(), False) var_desc.name(), var_desc.type(), False)
else:
var_base = core.eager.Tensor(var_desc.dtype(),
var_desc.shape(),
var_desc.name(),
var_desc.type(), False)
return var_base return var_base
# Create VarBase to receive output data. # Create VarBase to receive output data.
...@@ -423,9 +448,16 @@ class PartialProgramLayer: ...@@ -423,9 +448,16 @@ class PartialProgramLayer:
def _create_scope_vec(self): def _create_scope_vec(self):
# Hold forward variables # Hold forward variables
tmp_scope_vec = None
if not core._in_eager_mode():
tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [], tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [],
"program_out_scope", "program_out_scope",
core.VarDesc.VarType.STEP_SCOPES, True) core.VarDesc.VarType.STEP_SCOPES, True)
# TODO(jiabin): Support this later.
# else:
# tmp_scope_vec = core.eager.Tensor(core.VarDesc.VarType.FP32, [],
# "program_out_scope",
# core.VarDesc.VarType.STEP_SCOPES, True)
inner_scope = core.Scope() inner_scope = core.Scope()
tmp_scope_vec.value().set_scope(inner_scope) tmp_scope_vec.value().set_scope(inner_scope)
...@@ -450,7 +482,8 @@ class PartialProgramLayer: ...@@ -450,7 +482,8 @@ class PartialProgramLayer:
return main_program.clone(for_test=True) return main_program.clone(for_test=True)
def _is_no_value(self, var): def _is_no_value(self, var):
if isinstance(var, core.VarBase) and var.shape == [1]: if isinstance(var,
(core.VarBase, core.eager.Tensor)) and var.shape == [1]:
# NOTE: .numpy() will insert MemcpySync operation, it hits performance. # NOTE: .numpy() will insert MemcpySync operation, it hits performance.
if var.numpy()[0] == RETURN_NO_VALUE_MAGIC_NUM: if var.numpy()[0] == RETURN_NO_VALUE_MAGIC_NUM:
return True return True
...@@ -460,7 +493,7 @@ class PartialProgramLayer: ...@@ -460,7 +493,7 @@ class PartialProgramLayer:
""" """
Removes invalid value for various-length return statement Removes invalid value for various-length return statement
""" """
if isinstance(out_vars, core.VarBase): if isinstance(out_vars, (core.VarBase, core.eager.Tensor)):
if self._is_no_value(out_vars): if self._is_no_value(out_vars):
return None return None
return out_vars return out_vars
...@@ -527,7 +560,7 @@ class PartialProgramLayer: ...@@ -527,7 +560,7 @@ class PartialProgramLayer:
param_and_buffer_names_set = set() param_and_buffer_names_set = set()
for i, var in enumerate(self._params): for i, var in enumerate(self._params):
# self._params contains parameters and buffers with persistable=True. # self._params contains parameters and buffers with persistable=True.
if not isinstance(var, core.VarBase): if not isinstance(var, (core.VarBase, core.eager.Tensor)):
raise TypeError( raise TypeError(
'Type of self._params[{}] in PartialProgramLayer should be Parameter or Variable, but received {}.'. 'Type of self._params[{}] in PartialProgramLayer should be Parameter or Variable, but received {}.'.
format(i, type(var))) format(i, type(var)))
...@@ -559,10 +592,18 @@ def _create_fake_var(): ...@@ -559,10 +592,18 @@ def _create_fake_var():
""" """
Create a fake_var (force on CPU) to handle empty input or output Create a fake_var (force on CPU) to handle empty input or output
""" """
if not core._in_eager_mode():
return [ return [
core.VarBase(core.VarDesc.VarType.FP32, [], "Fake_var", core.VarBase(core.VarDesc.VarType.FP32, [], "Fake_var",
core.VarDesc.VarType.RAW, False) core.VarDesc.VarType.RAW, False)
] ]
else:
return []
# TODO(jiabin): Support this later
# return [
# core.eager.Tensor(core.VarDesc.VarType.FP32, [], "Fake_var",
# core.VarDesc.VarType.RAW, False)
# ]
def partial_program_from(concrete_program): def partial_program_from(concrete_program):
......
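The branches above all construct the same object from a var_desc and differ only in which constructor they call; a compact sketch of that choice as a helper (it mirrors the code shown, it is not part of the commit):

from paddle.fluid import core

def make_var_from_desc(var_desc, persistable=False):
    # pick the constructor matching the current mode
    ctor = core.eager.Tensor if core._in_eager_mode() else core.VarBase
    return ctor(var_desc.dtype(), var_desc.shape(), var_desc.name(),
                var_desc.type(), persistable)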
...@@ -25,7 +25,7 @@ import threading ...@@ -25,7 +25,7 @@ import threading
import six import six
import paddle import paddle
from paddle.fluid import core from paddle.fluid import core, dygraph
from paddle.fluid.compiler import BuildStrategy, CompiledProgram, ExecutionStrategy from paddle.fluid.compiler import BuildStrategy, CompiledProgram, ExecutionStrategy
from paddle.fluid.data_feeder import check_type from paddle.fluid.data_feeder import check_type
from paddle.fluid.layers.utils import flatten, pack_sequence_as from paddle.fluid.layers.utils import flatten, pack_sequence_as
...@@ -898,6 +898,7 @@ def save(layer, path, input_spec=None, **configs): ...@@ -898,6 +898,7 @@ def save(layer, path, input_spec=None, **configs):
state_var_dict[var.name] = var state_var_dict[var.name] = var
# 3. share parameters from Layer to scope & record var info # 3. share parameters from Layer to scope & record var info
with dygraph.guard():
for param_or_buffer in concrete_program.parameters: for param_or_buffer in concrete_program.parameters:
# share to scope # share to scope
if param_or_buffer.type == core.VarDesc.VarType.VOCAB: if param_or_buffer.type == core.VarDesc.VarType.VOCAB:
...@@ -915,12 +916,14 @@ def save(layer, path, input_spec=None, **configs): ...@@ -915,12 +916,14 @@ def save(layer, path, input_spec=None, **configs):
if param_or_buffer.name not in extra_var_info: if param_or_buffer.name not in extra_var_info:
extra_info_dict = dict() extra_info_dict = dict()
if param_or_buffer.name in state_names_dict: if param_or_buffer.name in state_names_dict:
extra_info_dict['structured_name'] = state_names_dict[ extra_info_dict[
'structured_name'] = state_names_dict[
param_or_buffer.name] param_or_buffer.name]
extra_info_dict[ extra_info_dict[
'stop_gradient'] = param_or_buffer.stop_gradient 'stop_gradient'] = param_or_buffer.stop_gradient
if isinstance(param_or_buffer, ParamBase): if isinstance(param_or_buffer, ParamBase):
extra_info_dict['trainable'] = param_or_buffer.trainable extra_info_dict[
'trainable'] = param_or_buffer.trainable
extra_var_info[param_or_buffer.name] = extra_info_dict extra_var_info[param_or_buffer.name] = extra_info_dict
# 4. build input & output of save_infernece_model # 4. build input & output of save_infernece_model
......
...@@ -94,7 +94,7 @@ def monkey_patch_varbase(): ...@@ -94,7 +94,7 @@ def monkey_patch_varbase():
# Note: getattr(self, attr, None) will call x.grad=x.gradient(), but gradient() is only available in dygraph. # Note: getattr(self, attr, None) will call x.grad=x.gradient(), but gradient() is only available in dygraph.
# It will fail. So, for properties that differ between dynamic and static graph, we should not use getattr(self, attr, None). # It will fail. So, for properties that differ between dynamic and static graph, we should not use getattr(self, attr, None).
attr_not_need_keys = ['grad', 'T'] attr_not_need_keys = ['grad', 'T']
if isinstance(self, ParamBase): if isinstance(self, (ParamBase, EagerParamBase)):
attr_kwargs = self.__dict__.copy() attr_kwargs = self.__dict__.copy()
else: else:
attr_names = [] attr_names = []
...@@ -111,7 +111,7 @@ def monkey_patch_varbase(): ...@@ -111,7 +111,7 @@ def monkey_patch_varbase():
attr_kwargs.update(kwargs) attr_kwargs.update(kwargs)
if to_parameter or isinstance(self, ParamBase): if to_parameter or isinstance(self, (ParamBase, EagerParamBase)):
del attr_kwargs['persistable'] del attr_kwargs['persistable']
# NOTE(Aurelius84): All parameters should be placed into global block. # NOTE(Aurelius84): All parameters should be placed into global block.
attr_kwargs['block'] = attr_kwargs['block'].program.global_block() attr_kwargs['block'] = attr_kwargs['block'].program.global_block()
......
...@@ -1821,7 +1821,7 @@ def _pack_loaded_dict(load_obj): ...@@ -1821,7 +1821,7 @@ def _pack_loaded_dict(load_obj):
@static_only @static_only
def _legacy_save(param_dict, model_path, protocol=2): def _legacy_save(param_dict, model_path, protocol=2):
def get_tensor(var): def get_tensor(var):
if isinstance(var, core.VarBase): if isinstance(var, (core.VarBase, core.eager.Tensor)):
return var.numpy() return var.numpy()
elif isinstance(var, core.LoDTensor): elif isinstance(var, core.LoDTensor):
return np.array(var) return np.array(var)
......
...@@ -10148,6 +10148,9 @@ def flatten(x, axis=1, name=None): ...@@ -10148,6 +10148,9 @@ def flatten(x, axis=1, name=None):
check_variable_and_dtype( check_variable_and_dtype(
x, 'x', ['float32', 'float64', 'int8', 'int32', 'int64', 'uint8'], x, 'x', ['float32', 'float64', 'int8', 'int32', 'int64', 'uint8'],
'flatten') 'flatten')
if in_dygraph_mode():
return _C_ops.flatten2(x, 'axis', axis)[0]
helper = LayerHelper('flatten', **locals()) helper = LayerHelper('flatten', **locals())
if not (isinstance(x, Variable)): if not (isinstance(x, Variable)):
......
...@@ -663,6 +663,8 @@ def assign(input, output=None): ...@@ -663,6 +663,8 @@ def assign(input, output=None):
}) })
if is_inplace and in_dygraph_mode(): if is_inplace and in_dygraph_mode():
# TODO(jiabin): Remove this when we support inplace
if not core._in_eager_mode():
output._bump_inplace_version() output._bump_inplace_version()
return output return output
......
...@@ -98,7 +98,7 @@ class TestFleetWithASPSharding(unittest.TestCase): ...@@ -98,7 +98,7 @@ class TestFleetWithASPSharding(unittest.TestCase):
feeder = fluid.DataFeeder(feed_list=[input_x, input_y], place=place) feeder = fluid.DataFeeder(feed_list=[input_x, input_y], place=place)
exe.run(startup_prog) exe.run(startup_prog)
sparsity.prune_model(train_prog, sharding=True) sparsity.prune_model(train_prog)
data = (np.random.randn(64, 32), np.random.randint(2, size=(64, 1))) data = (np.random.randn(64, 32), np.random.randint(2, size=(64, 1)))
exe.run(train_prog, feed=feeder.feed([data])) exe.run(train_prog, feed=feeder.feed([data]))
......
...@@ -520,6 +520,7 @@ def predict_static(args, data): ...@@ -520,6 +520,7 @@ def predict_static(args, data):
paddle.enable_static() paddle.enable_static()
exe = fluid.Executor(args.place) exe = fluid.Executor(args.place)
# load inference model # load inference model
[inference_program, feed_target_names, [inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model( fetch_targets] = fluid.io.load_inference_model(
args.model_save_dir, args.model_save_dir,
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import random
import numpy as np
import os
import shutil
import paddle
from paddle.fluid import core
import datetime
from datetime import timedelta
import paddle.fluid.core as core
from paddle.fluid.framework import _test_eager_guard
from paddle.fluid.dygraph.parallel import ParallelEnv
class TestProcessGroupFp32(unittest.TestCase):
def setUp(self):
paddle.seed(2022)
random.seed(2022)
np.random.seed(2022)
self.config()
def config(self):
self.dtype = "float32"
self.shape = (2, 10, 5)
def test_create_process_group_gloo(self):
with _test_eager_guard():
nranks = ParallelEnv().nranks
rank = ParallelEnv().local_rank
is_master = True if rank == 0 else False
store = paddle.fluid.core.TCPStore("127.0.0.1", 6172, is_master,
nranks, datetime.timedelta(0))
gloo_store = paddle.fluid.core.GlooStore(store)
opt = paddle.fluid.core.GlooOptions()
pg = paddle.fluid.core.ProcessGroupGloo(gloo_store, rank, nranks)
# test allreduce sum
# rank 0
paddle.device.set_device('cpu')
x = np.random.random(self.shape).astype(self.dtype)
tensor_x = paddle.to_tensor(x)
# rank 1
y = np.random.random(self.shape).astype(self.dtype)
tensor_y = paddle.to_tensor(y)
sum_result = x + y
if rank == 0:
task = pg.allreduce(tensor_x)
task.wait()
assert np.array_equal(tensor_x, sum_result)
else:
task = pg.allreduce(tensor_y)
task.wait()
assert np.array_equal(tensor_y, sum_result)
print("test allreduce sum api ok")
# test allreduce max
# rank 0
x = np.random.random(self.shape).astype(self.dtype)
tensor_x = paddle.to_tensor(x)
# rank 1
y = np.random.random(self.shape).astype(self.dtype)
tensor_y = paddle.to_tensor(y)
max_result = paddle.maximum(tensor_x, tensor_y)
if rank == 0:
task = pg.allreduce(tensor_x, core.ReduceOp.MAX)
task.wait()
assert np.array_equal(tensor_x, max_result)
else:
task = pg.allreduce(tensor_y, core.ReduceOp.MAX)
task.wait()
assert np.array_equal(tensor_y, max_result)
print("test allreduce max api ok")
# test broadcast
# rank 0
x = np.random.random(self.shape).astype(self.dtype)
tensor_x = paddle.to_tensor(x)
# rank 1
y = np.random.random(self.shape).astype(self.dtype)
tensor_y = paddle.to_tensor(y)
broadcast_result = paddle.assign(tensor_x)
if rank == 0:
task = pg.broadcast(tensor_x, 0)
task.synchronize()
assert task.is_completed()
assert np.array_equal(broadcast_result, tensor_x)
else:
task = pg.broadcast(tensor_y, 0)
task.synchronize()
assert task.is_completed()
assert np.array_equal(broadcast_result, tensor_y)
print("test broadcast api ok")
if __name__ == "__main__":
unittest.main()
...@@ -144,23 +144,109 @@ class TestProcessGroupFp32(unittest.TestCase): ...@@ -144,23 +144,109 @@ class TestProcessGroupFp32(unittest.TestCase):
print("test barrier api ok\n") print("test barrier api ok\n")
# test send/recv # test allgather
# rank 0 # rank 0
x = np.random.random(self.shape).astype(self.dtype) x = np.random.random(self.shape).astype(self.dtype)
y = np.random.random(self.shape).astype(self.dtype)
tensor_x = paddle.to_tensor(x) tensor_x = paddle.to_tensor(x)
tensor_y = paddle.to_tensor(y)
out_shape = list(self.shape)
out_shape[0] *= 2
out = np.random.random(out_shape).astype(self.dtype)
tensor_out = paddle.to_tensor(out)
if pg.rank() == 0: if pg.rank() == 0:
task = pg.send(tensor_x, dst=1) task = pg.all_gather(tensor_x, tensor_out)
task.wait() task.wait()
paddle.device.cuda.synchronize() paddle.device.cuda.synchronize()
# rank 1 # rank 1
else: else:
task = pg.all_gather(tensor_y, tensor_out)
task.wait()
paddle.device.cuda.synchronize()
out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2])
out_2 = paddle.slice(tensor_out, [0], [out_shape[0] // 2],
[out_shape[0]])
assert np.array_equal(tensor_x, out_1)
assert np.array_equal(tensor_y, out_2)
print("test allgather api ok\n")
# test alltoall
# rank 0
x = np.random.random(self.shape).astype(self.dtype)
y = np.random.random(self.shape).astype(self.dtype) y = np.random.random(self.shape).astype(self.dtype)
out1 = np.random.random(self.shape).astype(self.dtype)
out2 = np.random.random(self.shape).astype(self.dtype)
tensor_x = paddle.to_tensor(x)
tensor_y = paddle.to_tensor(y) tensor_y = paddle.to_tensor(y)
task = pg.recv(tensor_y, src=0) tensor_out1 = paddle.to_tensor(out1)
tensor_out2 = paddle.to_tensor(out2)
raw_tensor_x_2 = paddle.slice(tensor_x, [0], [self.shape[0] // 2],
[self.shape[0]])
raw_tensor_y_1 = paddle.slice(tensor_y, [0], [0],
[self.shape[0] // 2])
if pg.rank() == 0:
task = pg.alltoall(tensor_x, tensor_out1)
task.wait() task.wait()
paddle.device.cuda.synchronize() paddle.device.cuda.synchronize()
assert np.array_equal(tensor_x, tensor_y) # rank 1
print("test send/recv api ok\n") else:
task = pg.alltoall(tensor_y, tensor_out2)
task.wait()
paddle.device.cuda.synchronize()
out1_2 = paddle.slice(tensor_out1, [0], [self.shape[0] // 2],
[self.shape[0]])
out2_1 = paddle.slice(tensor_out2, [0], [0], [self.shape[0] // 2])
if pg.rank() == 0:
assert np.array_equal(out1_2.numpy(), raw_tensor_y_1.numpy())
else:
assert np.array_equal(out2_1, raw_tensor_x_2)
print("test alltoall api ok\n")
# test Reduce
# rank 0
x = np.random.random(self.shape).astype(self.dtype)
y = np.random.random(self.shape).astype(self.dtype)
tensor_x = paddle.to_tensor(x)
tensor_y = paddle.to_tensor(y)
sum_result = tensor_x + tensor_y
if pg.rank() == 0:
task = pg.reduce(tensor_x, 0)
task.wait()
paddle.device.cuda.synchronize()
# rank 1
else:
task = pg.reduce(tensor_y, 0)
task.wait()
paddle.device.cuda.synchronize()
if pg.rank() == 0:
assert np.array_equal(tensor_x, sum_result)
print("test reduce sum api ok\n")
# test Scatter
# rank 0
in_shape = list(self.shape)
in_shape[0] *= 2
x = np.random.random(in_shape).astype(self.dtype)
y = np.random.random(self.shape).astype(self.dtype)
tensor_x = paddle.to_tensor(x)
tensor_y = paddle.to_tensor(y)
if pg.rank() == 0:
task = pg.scatter(tensor_x, tensor_y, 0)
task.wait()
paddle.device.cuda.synchronize()
# rank 1
else:
task = pg.scatter(tensor_x, tensor_y, 0)
task.wait()
paddle.device.cuda.synchronize()
out1 = paddle.slice(tensor_x, [0], [0], [self.shape[0]])
out2 = paddle.slice(tensor_x, [0], [self.shape[0]],
[self.shape[0] * 2])
if pg.rank() == 0:
assert np.array_equal(tensor_y, out1)
else:
assert np.array_equal(tensor_y, out2)
print("test scatter api ok\n")
class TestProcessGroupFp16(TestProcessGroupFp32): class TestProcessGroupFp16(TestProcessGroupFp32):
......
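For two ranks, the allgather check above expects the output tensor to stack both ranks' inputs along axis 0, which is why out_shape[0] is doubled before the call. A small NumPy sketch of the expected layout:

import numpy as np

def expected_allgather(x_rank0, x_rank1):
    # gathered result: rank-0 slice first, then rank-1 slice, so dim 0 doubles
    return np.concatenate([x_rank0, x_rank1], axis=0)

x = np.ones((2, 10, 5), dtype='float32')
y = np.zeros((2, 10, 5), dtype='float32')
out = expected_allgather(x, y)
assert out.shape[0] == 2 * x.shape[0]
assert np.array_equal(out[:2], x) and np.array_equal(out[2:], y)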
...@@ -162,6 +162,7 @@ class TestIRPassBase(unittest.TestCase): ...@@ -162,6 +162,7 @@ class TestIRPassBase(unittest.TestCase):
for k, v in self.get_strategy().items(): for k, v in self.get_strategy().items():
setattr(build_strategy, k, v) setattr(build_strategy, k, v)
self.check_before_applied(main2, startup2) self.check_before_applied(main2, startup2)
apply_build_strategy(main2, startup2, build_strategy, apply_build_strategy(main2, startup2, build_strategy,
{"use_cuda": self.use_cuda}) {"use_cuda": self.use_cuda})
self.check_after_applied(main2, startup2) self.check_after_applied(main2, startup2)
......
...@@ -320,7 +320,7 @@ class TestBatchNormOpInference(unittest.TestCase): ...@@ -320,7 +320,7 @@ class TestBatchNormOpInference(unittest.TestCase):
def test_check_output(self): def test_check_output(self):
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
for place in places: for place in places:
...@@ -342,13 +342,13 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): ...@@ -342,13 +342,13 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference):
def test_check_output(self): def test_check_output(self):
places = [] places = []
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
place = core.CUDAPlace(0) place = core.CUDAPlace(0)
if core.is_float16_supported(place): if core.is_float16_supported(place):
places.append(place) places.append(place)
for place in places: for place in places:
for data_format in ["NCHW", "NHWC"]: #for data_format in ["NCHW", "NHWC"]:
for data_format in ["NCHW"]:
self.check_with_place(place, data_format, self.dtype, self.check_with_place(place, data_format, self.dtype,
[2, 3, 4, 5]) [2, 3, 4, 5])
self.check_with_place(place, data_format, self.dtype, [2, 3]) self.check_with_place(place, data_format, self.dtype, [2, 3])
...@@ -517,7 +517,7 @@ class TestBatchNormOpTraining(unittest.TestCase): ...@@ -517,7 +517,7 @@ class TestBatchNormOpTraining(unittest.TestCase):
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
for place in places: for place in places:
...@@ -657,7 +657,7 @@ class TestDygraphBatchNormAPIError(unittest.TestCase): ...@@ -657,7 +657,7 @@ class TestDygraphBatchNormAPIError(unittest.TestCase):
class TestDygraphBatchNormTrainableStats(unittest.TestCase): class TestDygraphBatchNormTrainableStats(unittest.TestCase):
def test_dygraph(self): def test_dygraph(self):
places = [fluid.CPUPlace()] places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0)) places.append(fluid.CUDAPlace(0))
for p in places: for p in places:
shape = [4, 10, 4, 4] shape = [4, 10, 4, 4]
...@@ -678,7 +678,7 @@ class TestDygraphBatchNormTrainableStats(unittest.TestCase): ...@@ -678,7 +678,7 @@ class TestDygraphBatchNormTrainableStats(unittest.TestCase):
def test_static(self): def test_static(self):
places = [fluid.CPUPlace()] places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0)) places.append(fluid.CUDAPlace(0))
for p in places: for p in places:
exe = fluid.Executor(p) exe = fluid.Executor(p)
...@@ -716,4 +716,6 @@ class TestDygraphBatchNormOpenReserveSpace(unittest.TestCase): ...@@ -716,4 +716,6 @@ class TestDygraphBatchNormOpenReserveSpace(unittest.TestCase):
if __name__ == '__main__': if __name__ == '__main__':
import paddle
paddle.enable_static()
unittest.main() unittest.main()
...@@ -28,7 +28,7 @@ import paddle ...@@ -28,7 +28,7 @@ import paddle
class TestBatchNorm(unittest.TestCase): class TestBatchNorm(unittest.TestCase):
def test_name(self): def test_name(self):
places = [fluid.CPUPlace()] places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0)) places.append(fluid.CUDAPlace(0))
for p in places: for p in places:
with fluid.dygraph.guard(p): with fluid.dygraph.guard(p):
...@@ -36,7 +36,7 @@ class TestBatchNorm(unittest.TestCase): ...@@ -36,7 +36,7 @@ class TestBatchNorm(unittest.TestCase):
def test_error(self): def test_error(self):
places = [fluid.CPUPlace()] places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0)) places.append(fluid.CUDAPlace(0))
for p in places: for p in places:
#paddle.disable_static() #paddle.disable_static()
...@@ -83,7 +83,7 @@ class TestBatchNorm(unittest.TestCase): ...@@ -83,7 +83,7 @@ class TestBatchNorm(unittest.TestCase):
def test_dygraph(self): def test_dygraph(self):
places = [fluid.CPUPlace()] places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0)) places.append(fluid.CUDAPlace(0))
for p in places: for p in places:
shape = [4, 10, 4, 4] shape = [4, 10, 4, 4]
...@@ -135,7 +135,7 @@ class TestBatchNorm(unittest.TestCase): ...@@ -135,7 +135,7 @@ class TestBatchNorm(unittest.TestCase):
def test_static(self): def test_static(self):
places = [fluid.CPUPlace()] places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0)) places.append(fluid.CUDAPlace(0))
for p in places: for p in places:
exe = fluid.Executor(p) exe = fluid.Executor(p)
...@@ -177,7 +177,7 @@ class TestBatchNormChannelLast(unittest.TestCase): ...@@ -177,7 +177,7 @@ class TestBatchNormChannelLast(unittest.TestCase):
else: else:
paddle.set_default_dtype("float64") paddle.set_default_dtype("float64")
self.places = [fluid.CPUPlace()] self.places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
self.places.append(fluid.CUDAPlace(0)) self.places.append(fluid.CUDAPlace(0))
def tearDown(self): def tearDown(self):
...@@ -247,7 +247,7 @@ class TestBatchNormChannelLast(unittest.TestCase): ...@@ -247,7 +247,7 @@ class TestBatchNormChannelLast(unittest.TestCase):
class TestBatchNormUseGlobalStats(unittest.TestCase): class TestBatchNormUseGlobalStats(unittest.TestCase):
def setUp(self): def setUp(self):
self.places = [fluid.CPUPlace()] self.places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda():
self.places.append(fluid.CUDAPlace(0)) self.places.append(fluid.CUDAPlace(0))
self.init_test() self.init_test()
...@@ -300,4 +300,6 @@ class TestBatchNormUseGlobalStatsCase3(TestBatchNormUseGlobalStats): ...@@ -300,4 +300,6 @@ class TestBatchNormUseGlobalStatsCase3(TestBatchNormUseGlobalStats):
if __name__ == '__main__': if __name__ == '__main__':
import paddle
paddle.enable_static()
unittest.main() unittest.main()
...@@ -22,6 +22,9 @@ class TestProcessGroup(TestMultipleGpus): ...@@ -22,6 +22,9 @@ class TestProcessGroup(TestMultipleGpus):
def test_process_group_nccl(self): def test_process_group_nccl(self):
self.run_mnist_2gpu('process_group_nccl.py') self.run_mnist_2gpu('process_group_nccl.py')
def test_process_group_gloo(self):
self.run_mnist_2gpu('process_group_gloo.py')
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -16,6 +16,7 @@ from __future__ import print_function ...@@ -16,6 +16,7 @@ from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
import paddle
import paddle import paddle
import paddle.fluid.core as core import paddle.fluid.core as core
...@@ -1001,4 +1002,5 @@ create_test_cudnn_channel_last_fp16_class( ...@@ -1001,4 +1002,5 @@ create_test_cudnn_channel_last_fp16_class(
TestWithDilation_AsyPadding, grad_check=False) TestWithDilation_AsyPadding, grad_check=False)
if __name__ == '__main__': if __name__ == '__main__':
paddle.enable_static()
unittest.main() unittest.main()
...@@ -771,13 +771,13 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase): ...@@ -771,13 +771,13 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase):
self.assertTrue(np.array_equal(egr_tensor.numpy(), ori_arr)) self.assertTrue(np.array_equal(egr_tensor.numpy(), ori_arr))
ori_place = egr_tensor.place ori_place = egr_tensor.place
new_arr = np.random.rand(4, 4, 16, 32).astype('float32') new_arr = np.random.rand(4, 16, 16, 32).astype('float32')
self.assertFalse(np.array_equal(egr_tensor.numpy(), new_arr)) self.assertFalse(np.array_equal(egr_tensor.numpy(), new_arr))
egr_tensor._set_value(new_arr) egr_tensor.set_value(new_arr)
self.assertEqual(egr_tensor.stop_gradient, True) self.assertEqual(egr_tensor.stop_gradient, True)
self.assertTrue(egr_tensor.place._equals(ori_place)) self.assertTrue(egr_tensor.place._equals(ori_place))
self.assertEqual(egr_tensor.shape, [4, 4, 16, 32]) self.assertEqual(egr_tensor.shape, [4, 16, 16, 32])
self.assertTrue(np.array_equal(egr_tensor.numpy(), new_arr)) self.assertTrue(np.array_equal(egr_tensor.numpy(), new_arr))
...@@ -880,7 +880,7 @@ class EagerParamBaseUsageTestCase(unittest.TestCase): ...@@ -880,7 +880,7 @@ class EagerParamBaseUsageTestCase(unittest.TestCase):
new_weight = np.ones([1, 3]).astype('float32') new_weight = np.ones([1, 3]).astype('float32')
self.assertFalse(np.array_equal(linear.weight.numpy(), new_weight)) self.assertFalse(np.array_equal(linear.weight.numpy(), new_weight))
linear.weight._set_value(new_weight) linear.weight.set_value(new_weight)
self.assertTrue(np.array_equal(linear.weight.numpy(), new_weight)) self.assertTrue(np.array_equal(linear.weight.numpy(), new_weight))
self.assertTrue(linear.weight.place._equals(ori_place)) self.assertTrue(linear.weight.place._equals(ori_place))
......
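These eager-tensor tests now go through the public set_value() instead of the old _set_value(); the call overwrites the data in place while the place and stop_gradient flag stay unchanged, as the assertions above verify. A short dygraph sketch:

import numpy as np
import paddle

t = paddle.randn([2, 3])                          # float32 by default
t.set_value(np.zeros([2, 3], dtype='float32'))    # in-place overwrite
assert np.array_equal(t.numpy(), np.zeros([2, 3], dtype='float32'))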
...@@ -231,4 +231,5 @@ class TestExpandV2API(unittest.TestCase): ...@@ -231,4 +231,5 @@ class TestExpandV2API(unittest.TestCase):
if __name__ == "__main__": if __name__ == "__main__":
paddle.enable_static()
unittest.main() unittest.main()
...@@ -23,6 +23,7 @@ import paddle.fluid as fluid ...@@ -23,6 +23,7 @@ import paddle.fluid as fluid
from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid import compiler from paddle.fluid import compiler
import paddle.fluid.unique_name as unique_name import paddle.fluid.unique_name as unique_name
import paddle
class TestInplaceANBOpTraining(unittest.TestCase): class TestInplaceANBOpTraining(unittest.TestCase):
...@@ -138,14 +139,14 @@ class TestInplaceANBOpTraining(unittest.TestCase): ...@@ -138,14 +139,14 @@ class TestInplaceANBOpTraining(unittest.TestCase):
outs[0].name if not only_forward else None, outs[0].name if not only_forward else None,
build_strategy=build_strategy, build_strategy=build_strategy,
exec_strategy=exec_strategy) exec_strategy=exec_strategy)
bn_fetches = exe.run(program=comp_prog1, bn_fetches = exe.run(program=main,
feed={'input': data}, feed={'input': data},
fetch_list=fetch_name) fetch_list=fetch_name)
fetch_outs.append(bn_fetches) fetch_outs.append(bn_fetches)
fetch_names.append(fetch_name) fetch_names.append(fetch_name)
for bn_val, inplace_abn_val, name1, name2 in zip(*(fetch_outs + for bn_val, inplace_abn_val, name1, name2 in zip(*(
fetch_names)): fetch_outs + fetch_names)):
self.assertTrue( self.assertTrue(
np.allclose( np.allclose(
bn_val, inplace_abn_val, atol=1e-2), bn_val, inplace_abn_val, atol=1e-2),
...@@ -156,6 +157,7 @@ class TestInplaceANBOpTraining(unittest.TestCase): ...@@ -156,6 +157,7 @@ class TestInplaceANBOpTraining(unittest.TestCase):
def test_op(self): def test_op(self):
use_cudas = [False, True] if core.is_compiled_with_cuda() else [False] use_cudas = [False, True] if core.is_compiled_with_cuda() else [False]
#use_cudas = [False]
for use_cuda in use_cudas: for use_cuda in use_cudas:
place = core.CUDAPlace(0) if use_cuda else core.CPUPlace() place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
layouts = ["NCHW", "NHWC"] layouts = ["NCHW", "NHWC"]
...@@ -186,4 +188,5 @@ class TestInplaceANBOpTraining(unittest.TestCase): ...@@ -186,4 +188,5 @@ class TestInplaceANBOpTraining(unittest.TestCase):
if __name__ == '__main__': if __name__ == '__main__':
paddle.enable_static()
unittest.main() unittest.main()
...@@ -21,6 +21,7 @@ import paddle.fluid as fluid ...@@ -21,6 +21,7 @@ import paddle.fluid as fluid
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
import paddle.fluid.core as core import paddle.fluid.core as core
import gradient_checker import gradient_checker
import paddle
from decorator_helper import prog_scope from decorator_helper import prog_scope
...@@ -167,4 +168,5 @@ class TestBatchNormDoubleGradCheckCase6(TestBatchNormDoubleGradCheckCase5): ...@@ -167,4 +168,5 @@ class TestBatchNormDoubleGradCheckCase6(TestBatchNormDoubleGradCheckCase5):
if __name__ == "__main__": if __name__ == "__main__":
paddle.enable_static()
unittest.main() unittest.main()
...@@ -24,6 +24,7 @@ from simple_nets import init_data, simple_fc_net, fc_with_batchnorm ...@@ -24,6 +24,7 @@ from simple_nets import init_data, simple_fc_net, fc_with_batchnorm
import seresnext_net import seresnext_net
from test_parallel_executor_transformer import transformer, get_feed_data_reader, DeviceType from test_parallel_executor_transformer import transformer, get_feed_data_reader, DeviceType
from fake_reader import fake_imdb_reader from fake_reader import fake_imdb_reader
import paddle
def lstm_net(use_feed): def lstm_net(use_feed):
...@@ -309,4 +310,5 @@ class TestProgramPruneBackward(unittest.TestCase): ...@@ -309,4 +310,5 @@ class TestProgramPruneBackward(unittest.TestCase):
if __name__ == '__main__': if __name__ == '__main__':
paddle.enable_static()
unittest.main() unittest.main()
...@@ -507,4 +507,5 @@ class TestReshapeZeroTensor(unittest.TestCase): ...@@ -507,4 +507,5 @@ class TestReshapeZeroTensor(unittest.TestCase):
if __name__ == "__main__": if __name__ == "__main__":
paddle.enable_static()
unittest.main() unittest.main()
...@@ -533,10 +533,6 @@ class TestTensorRegisterHook(unittest.TestCase): ...@@ -533,10 +533,6 @@ class TestTensorRegisterHook(unittest.TestCase):
size=[self.batch_size, self.in_size]).astype('float32') size=[self.batch_size, self.in_size]).astype('float32')
data_t = paddle.to_tensor(data) data_t = paddle.to_tensor(data)
if _in_eager_mode():
with self.assertRaises(TypeError):
out = jit_net(data_t)
else:
with self.assertRaises(AssertionError): with self.assertRaises(AssertionError):
out = jit_net(data_t) out = jit_net(data_t)
......
...@@ -1402,7 +1402,8 @@ def gather(x, index, axis=None, name=None): ...@@ -1402,7 +1402,8 @@ def gather(x, index, axis=None, name=None):
return _C_ops.gather(x, index, None, "axis", axis, "overwrite", False) return _C_ops.gather(x, index, None, "axis", axis, "overwrite", False)
check_variable_and_dtype( check_variable_and_dtype(
x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'], x, 'x',
['float16', 'float32', 'float64', 'int16', 'int32', 'int64', 'uint8'],
'gather') 'gather')
check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather') check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather')
......
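With 'int16' added to the whitelist, a static-graph gather over int16 input now passes the dtype check at graph-build time. A minimal sketch (whether an int16 kernel is registered for actual execution is an assumption, not shown in this hunk):

import paddle

paddle.enable_static()
x = paddle.static.data(name='x', shape=[4, 3], dtype='int16')
index = paddle.static.data(name='index', shape=[2], dtype='int32')
out = paddle.gather(x, index, axis=0)   # no longer rejected by check_variable_and_dtype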
...@@ -43,7 +43,9 @@ class BaseAPI(object): ...@@ -43,7 +43,9 @@ class BaseAPI(object):
self.is_base_api = False self.is_base_api = False
self.invoke = api_item_yaml['invoke'] self.invoke = api_item_yaml['invoke']
else: else:
self.infer_meta = self.parse_infer_meta(api_item_yaml['infer_meta']) if 'infer_meta' in api_item_yaml:
self.infer_meta = self.parse_infer_meta(api_item_yaml[
'infer_meta'])
self.kernel = self.parse_kernel(api_item_yaml['kernel']) self.kernel = self.parse_kernel(api_item_yaml['kernel'])
self.support_selected_rows_kernel = False if len(self.kernel[ self.support_selected_rows_kernel = False if len(self.kernel[
'func']) == 1 else True 'func']) == 1 else True
...@@ -182,9 +184,9 @@ class BaseAPI(object): ...@@ -182,9 +184,9 @@ class BaseAPI(object):
'Tensor': 'Tensor', 'Tensor': 'Tensor',
'Tensor[]': 'std::vector<Tensor>' 'Tensor[]': 'std::vector<Tensor>'
} }
if re.search(r'\(\w*\)', output_item): if re.search(r'\([a-zA-Z0-9_@]*\)', output_item):
result = re.search( result = re.search(
r"(?P<out_type>[a-zA-Z0-9_[\]]+)\s*\((?P<name>\w+)\)", r"(?P<out_type>[a-zA-Z0-9_[\]]+)\s*\((?P<name>[a-zA-Z0-9_@]+)\)",
output_item) output_item)
out_type = result.group('out_type') out_type = result.group('out_type')
assert out_type in output_type_map, \ assert out_type in output_type_map, \
...@@ -499,11 +501,8 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. ...@@ -499,11 +501,8 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self.
def get_kernel_args(self, code_indent): def get_kernel_args(self, code_indent):
input_trans_map = { input_trans_map = {
'const Tensor&': 'const phi::DenseTensor&', 'const Tensor&': 'const phi::DenseTensor&',
'const Tensor &': 'const phi::DenseTensor&',
'const std::vector<Tensor>&': 'const std::vector<Tensor>&':
'const std::vector<phi::DenseTensor>&', 'const std::vector<phi::DenseTensor>&',
'const std::vector<Tensor> &':
'const std::vector<phi::DenseTensor>&',
'const paddle::optional<Tensor>&': 'const paddle::optional<Tensor>&':
'paddle::optional<const phi::DenseTensor&>', 'paddle::optional<const phi::DenseTensor&>',
'const paddle::optional<std::vector<Tensor>>&': 'const paddle::optional<std::vector<Tensor>>&':
...@@ -592,7 +591,6 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. ...@@ -592,7 +591,6 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self.
def get_selected_rows_kernel_args(self, code_indent): def get_selected_rows_kernel_args(self, code_indent):
input_trans_map = { input_trans_map = {
'const Tensor&': 'const phi::SelectedRows&', 'const Tensor&': 'const phi::SelectedRows&',
'const Tensor &': 'const phi::SelectedRows&',
'const paddle::optional<Tensor>&': 'const paddle::optional<Tensor>&':
'paddle::optional<const phi::SelectedRows&>' 'paddle::optional<const phi::SelectedRows&>'
} }
......
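The widened output regex now also matches '@'-qualified names, which the sparse yaml entries further down rely on (for example Tensor(out@SparseCooTensor)). A quick check of the pattern:

import re

pattern = r"(?P<out_type>[a-zA-Z0-9_[\]]+)\s*\((?P<name>[a-zA-Z0-9_@]+)\)"
m = re.search(pattern, 'Tensor(out@SparseCooTensor)')
print(m.group('out_type'), m.group('name'))   # -> Tensor out@SparseCooTensor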
...@@ -105,7 +105,7 @@ def source_include(header_file_path): ...@@ -105,7 +105,7 @@ def source_include(header_file_path):
#include "paddle/phi/api/lib/api_custom_impl.h" #include "paddle/phi/api/lib/api_custom_impl.h"
#include "paddle/phi/api/lib/api_registry.h" #include "paddle/phi/api/lib/api_registry.h"
#include "paddle/phi/api/lib/api_utils.h" #include "paddle/phi/api/lib/api_gen_utils.h"
#include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/data_transform.h"
#include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/kernel_dispatch.h"
#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/api/lib/utils/storage.h"
......
...@@ -56,8 +56,9 @@ class BackwardAPI(BaseAPI): ...@@ -56,8 +56,9 @@ class BackwardAPI(BaseAPI):
# check the attributes of backward # check the attributes of backward
for attr in self.attrs['names']: for attr in self.attrs['names']:
assert attr in fw_attrs['names'] and self.attrs['attr_info'][attr][0] == fw_attrs['attr_info'][attr][0], \ assert (attr in fw_attrs['names'] and self.attrs['attr_info'][attr][0] == fw_attrs['attr_info'][attr][0]) or \
f"{self.api} : Attribute error: The attribute({attr}) of backward isn't consistent with forward api. \ self.attrs['attr_info'][attr][1] is not None, \
f"{self.api} : Attribute error: The attribute({attr}) of backward isn't consistent with forward api or doesn't have default value. \
Please check the args of {self.api} in yaml." Please check the args of {self.api} in yaml."
# check the output of backward # check the output of backward
...@@ -145,7 +146,7 @@ def source_include(header_file_path): ...@@ -145,7 +146,7 @@ def source_include(header_file_path):
#include "paddle/phi/api/lib/api_custom_impl.h" #include "paddle/phi/api/lib/api_custom_impl.h"
#include "paddle/phi/api/lib/api_registry.h" #include "paddle/phi/api/lib/api_registry.h"
#include "paddle/phi/api/lib/api_utils.h" #include "paddle/phi/api/lib/api_gen_utils.h"
#include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/data_transform.h"
#include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/kernel_dispatch.h"
#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/api/lib/utils/storage.h"
......
- sparse_api : conv3d
args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups)
output : Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor)
kernel :
func : sparse_conv3d
layout : x
- sparse_api : to_dense
args : (Tensor x, Backend backend)
output : Tensor(out@DenseTensor)
invoke : to_dense_impl(x, backend)
- sparse_api : to_sparse_coo
args : (Tensor x, Backend backend, int64_t sparse_dim)
output : Tensor(out@SparseCooTensor)
invoke : to_sparse_coo_impl(x, backend, sparse_dim)
- sparse_api : to_sparse_csr
args : (Tensor x, Backend backend)
output : Tensor(out@SparseCsrTensor)
invoke : to_sparse_csr_impl(x, backend)
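These entries feed the generator script that follows: every item keyed by sparse_api becomes one SparseAPI object. A hedged sketch of loading the file (the file name is illustrative, not taken from this diff):

import yaml

with open('sparse_api.yaml') as f:       # illustrative path
    apis = yaml.load(f, Loader=yaml.FullLoader)

for item in apis:
    print(item['sparse_api'], '->', item['output'])
# e.g. conv3d -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor)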
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import yaml
import argparse
import re
from api_base import BaseAPI
class SparseAPI(BaseAPI):
def __init__(self, api_item_yaml):
super(SparseAPI, self).__init__(api_item_yaml)
def get_api_name(self, api_item_yaml):
return api_item_yaml['sparse_api']
def get_api_func_name(self):
return self.api
def get_return_type(self, out_type_list):
return out_type_list[0] if len(
out_type_list) == 1 else "std::tuple<" + ",".join(
out_type_list) + ">"
def gene_api_declaration(self):
return f"""
// {", ".join(self.outputs['names'])}
PADDLE_API {self.outputs['return_type']} {self.get_api_func_name()}({self.args_str['args_declare']});
"""
def get_kernel_tensor_out_type(self, output_name):
sparse_type = 'TensorType::DENSE_TENSOR'
if output_name.endswith('@SparseCooTensor'):
sparse_type = 'TensorType::SPARSE_COO'
elif output_name.endswith('@SparseCsrTensor'):
sparse_type = 'TensorType::SPARSE_CSR'
return sparse_type
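    # For illustration (hypothetical, not part of the generated output): with the
    # output annotations used in sparse_api.yaml, the suffix convention above
    # resolves as
    #   'out@SparseCooTensor' -> 'TensorType::SPARSE_COO'
    #   'out@SparseCsrTensor' -> 'TensorType::SPARSE_CSR'
    #   'out@DenseTensor'     -> 'TensorType::DENSE_TENSOR'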
def gene_output(self,
output_type_list,
set_out_func,
code_indent,
inplace_flag=False):
kernel_output = ""
output_names = []
output_create = ""
if len(output_type_list) == 1:
kernel_output = 'kernel_out'
output_names.append('kernel_out')
inplace_assign = " = " + self.inplace_map[self.outputs['names'][
0]] if inplace_flag and self.inplace_map is not None and self.outputs[
'names'][0] in self.inplace_map else ""
output_create = f"""
{self.outputs['return_type']} out{inplace_assign};
auto* kernel_out = {set_out_func}(&out, {self.get_kernel_tensor_out_type(self.outputs['names'][0])});"""
elif len(output_type_list) > 1:
output_create = f"""
{self.outputs['return_type']} out;"""
for i in range(len(output_type_list)):
kernel_output = kernel_output + f'kernel_out_{i}, '
output_names.append(f'kernel_out_{i}')
if inplace_flag and self.inplace_map is not None and self.outputs[
'names'][i] in self.inplace_map:
output_create = output_create + f"""
std::get<{i}>(out) = {self.inplace_map[self.outputs['names'][i]]};"""
output_create = output_create + f"""
auto* kernel_out_{i} = {set_out_func}(&std::get<{i}>(out), {self.get_kernel_tensor_out_type(self.outputs['names'][i])});"""
kernel_output = kernel_output[:-2]
else:
raise ValueError(
"{} : Output error: the output should not be empty.".format(
self.api))
return kernel_output, output_names, output_create
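    # For illustration: for a single sparse output such as to_sparse_coo, the
    # strings assembled above come out roughly as follows (exact whitespace of
    # the generated C++ may differ):
    #   kernel_output -> 'kernel_out'
    #   output_names  -> ['kernel_out']
    #   output_create -> 'Tensor out;
    #                     auto* kernel_out = SetSparseKernelOutput(&out, TensorType::SPARSE_COO);'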
def gen_sparse_kernel_context(self, kernel_output_names):
input_trans_map = {
            'const Tensor&': 'const phi::TensorBase&',
            'const std::vector<Tensor>&': 'const std::vector<phi::TensorBase>&',
            'const paddle::optional<Tensor>&':
            'paddle::optional<const phi::TensorBase&>'
        }
        out_trans_map = {
            'Tensor': 'phi::TensorBase*',
            'std::vector<Tensor>': 'std::vector<phi::TensorBase*>'
}
input_names = self.inputs['names']
input_infos = self.inputs['input_info']
attr_names = self.attrs['names']
kernel_param = self.kernel['param']
if kernel_param is None:
kernel_param = input_names + attr_names
kernel_context_code = ""
for param in kernel_param:
if param in input_names:
if param in self.optional_vars:
raise ValueError(
f"{self.api} : Unsupport optional input({param}) for sparse api."
)
else:
kernel_context_code = kernel_context_code + f"""
kernel_context.EmplaceBackInput({param}.impl().get());"""
continue
if param in attr_names:
# set attr for kernel_context
if 'ScalarArray' in self.attrs['attr_info'][param][0]:
param = 'phi::ScalarArray(' + param + ')'
elif 'Scalar' in self.attrs['attr_info'][param][0]:
param = 'phi::Scalar(' + param + ')'
elif isinstance(param, bool):
param = str(param).lower()
else:
                    param = str(param)
kernel_context_code = kernel_context_code + f"""
kernel_context.EmplaceBackAttr({param});"""
for out_name in kernel_output_names:
kernel_context_code = kernel_context_code + f"""
kernel_context.EmplaceBackOutput({out_name});"""
return kernel_context_code
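    # For illustration: for the conv3d entry in sparse_api.yaml this method emits
    # roughly one EmplaceBackInput per tensor input, one EmplaceBackAttr per
    # attribute and one EmplaceBackOutput per kernel output, e.g.
    #   kernel_context.EmplaceBackInput(x.impl().get());
    #   kernel_context.EmplaceBackInput(kernel.impl().get());
    #   kernel_context.EmplaceBackAttr(paddings);
    #   ...
    #   kernel_context.EmplaceBackOutput(kernel_out_0);
    #   kernel_context.EmplaceBackOutput(kernel_out_1);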
def gen_sparse_kernel_code(self, inplace_flag=False):
_, kernel_output_names, output_create = self.gene_output(
self.outputs['types'], 'SetSparseKernelOutput', '', inplace_flag)
kernel_context_code = self.gen_sparse_kernel_context(
kernel_output_names)
return f"""
auto phi_kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
"{self.kernel['func'][0]}", {{kernel_backend, kernel_layout, kernel_data_type}});
VLOG(6) << "{self.api} api sparse kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
VLOG(6) << "{self.api} api sparse kernel: " << phi_kernel;
auto* dev_ctx = GetDeviceContextByBackend(kernel_backend);
auto kernel_context = phi::KernelContext(dev_ctx);
{output_create}
{kernel_context_code}
phi_kernel(&kernel_context);
return out;"""
def gene_base_api_code(self, inplace_flag=False):
api_func_name = self.get_api_func_name()
return f"""
PADDLE_API {self.outputs['return_type']} {api_func_name}({self.args_str["args_define"]}) {{
{self.gene_kernel_select()}
{self.gen_sparse_kernel_code(inplace_flag)}
}}
"""
def header_include():
return """
#include <tuple>
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/common/scalar_array.h"
#include "paddle/utils/optional.h"
"""
def source_include(header_file_path):
return f"""
#include "{header_file_path}"
#include <memory>
#include "glog/logging.h"
#include "paddle/phi/api/lib/api_registry.h"
#include "paddle/phi/api/lib/api_gen_utils.h"
#include "paddle/phi/api/lib/data_transform.h"
#include "paddle/phi/api/lib/kernel_dispatch.h"
#include "paddle/phi/api/lib/sparse_api_custom_impl.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/declarations.h"
"""
def api_register():
return """
PD_REGISTER_API(Test);
"""
def api_namespace():
return ("""
namespace paddle {
namespace experimental {
namespace sparse {
""", """
} // namespace sparse
} // namespace experimental
} // namespace paddle
""")
def generate_api(api_yaml_path, header_file_path, source_file_path):
with open(api_yaml_path, 'r') as f:
apis = yaml.load(f, Loader=yaml.FullLoader)
header_file = open(header_file_path, 'w')
source_file = open(source_file_path, 'w')
namespace = api_namespace()
header_file.write("#pragma once\n")
header_file.write(header_include())
header_file.write(namespace[0])
include_header_file = "paddle/phi/api/include/sparse_api.h"
source_file.write(source_include(include_header_file))
source_file.write(namespace[0])
for api in apis:
sparse_api = SparseAPI(api)
header_file.write(sparse_api.gene_api_declaration())
source_file.write(sparse_api.gene_api_code())
header_file.write(namespace[1])
source_file.write(namespace[1])
source_file.write(api_register())
header_file.close()
source_file.close()
def main():
parser = argparse.ArgumentParser(
description='Generate PaddlePaddle C++ Sparse API files')
parser.add_argument(
'--api_yaml_path',
help='path to sparse api yaml file',
default='python/paddle/utils/code_gen/sparse_api.yaml')
parser.add_argument(
'--api_header_path',
help='output of generated api header code file',
default='paddle/phi/api/include/sparse_api.h')
parser.add_argument(
'--api_source_path',
help='output of generated api source code file',
default='paddle/phi/api/lib/sparse_api.cc')
options = parser.parse_args()
api_yaml_path = options.api_yaml_path
header_file_path = options.api_header_path
source_file_path = options.api_source_path
generate_api(api_yaml_path, header_file_path, source_file_path)
if __name__ == '__main__':
main()
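A possible way to drive the generator from a build step, mirroring the argparse defaults above (the paths are the script's own defaults and assume the Paddle source root as the working directory):

# Hypothetical driver: runs the generator with its default paths, assuming the
# current directory is the Paddle source root.
import subprocess

subprocess.run([
    "python", "python/paddle/utils/code_gen/sparse_api_gen.py",
    "--api_yaml_path", "python/paddle/utils/code_gen/sparse_api.yaml",
    "--api_header_path", "paddle/phi/api/include/sparse_api.h",
    "--api_source_path", "paddle/phi/api/lib/sparse_api.cc",
], check=True)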