Commit 9e00395a authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into move_yolo_box_to_phi

@@ -7,9 +7,11 @@ paddle/fluid/op_use_default_grad_maker_DEV.spec
paddle/fluid/op_use_default_grad_maker_PR.spec
paddle/phi/api/backward/backward_api.h
paddle/phi/api/include/api.h
paddle/phi/api/include/sparse_api.h
paddle/phi/api/lib/api.cc
paddle/phi/api/lib/dygraph_api.*
paddle/phi/api/lib/backward_api.cc
paddle/phi/api/lib/sparse_api.cc
paddle/phi/extension.h
paddle/phi/include/*
paddle/phi/infermeta/generated.*
......
cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api)
if (WITH_DISTRIBUTE)
cc_library(processgroup_gloo SRCS ProcessGroupGloo.cc DEPS phi phi_api eager_api gloo_wrapper)
endif()
cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup)
if(WITH_NCCL)
......
@@ -117,6 +117,35 @@ class ProcessGroup {
"ProcessGroup%s does not support receive", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> AllGather(
std::vector<Tensor>& in_tensors /* tensors */, // NOLINT
std::vector<Tensor>& out_tensors /* tensors */) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support AllGather", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> AllToAll(
std::vector<Tensor>& in /* tensors */, // NOLINT
std::vector<Tensor>& out /* tensors */) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support AllToAll", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Reduce(
std::vector<Tensor>& tensors /* tensors */, // NOLINT
const ReduceOptions& opts) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support Reduce", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Scatter(
std::vector<Tensor>& in_tensors /* tensors */, // NOLINT
std::vector<Tensor>& out_tensors /* tensors */, // NOLINT
const ScatterOptions&) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support Scatter", GetBackendName()));
}
protected:
const int rank_;
const int size_;
......
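For orientation, a hedged call-site sketch of the new collective entry points (the process-group pointer pg, the tensor variables, and their sizes are assumptions for illustration, not part of this commit): a backend that does not override one of these virtuals falls through to the PADDLE_THROW defaults above.

// Hypothetical caller; Tensor is paddle::experimental::Tensor as in ProcessGroup.h.
std::vector<Tensor> in = {input_tensor};
std::vector<Tensor> out = {output_tensor};
auto task = pg->AllGather(in, out);  // ProcessGroupNCCL overrides this later in the diff
// On a backend without an override (e.g. Gloo at this point) the call raises
// InvalidArgument: "ProcessGroup%s does not support AllGather".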
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#ifdef _WIN32
#include <gloo/common/win.h>
#include <winsock2.h>
#include <ws2tcpip.h>
#else
#include <netdb.h>
#include <sys/socket.h>
#include <unistd.h>
#endif
#include <gloo/broadcast.h>
#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace distributed {
#ifdef _WIN32
#define GENERATE_FUNC(type, func, ...) \
switch (type) { \
case experimental::DataType::FLOAT32: \
func<float>(__VA_ARGS__); \
break; \
case experimental::DataType::FLOAT64: \
func<double>(__VA_ARGS__); \
break; \
case experimental::DataType::FLOAT16: \
func<gloo::float16>(__VA_ARGS__); \
break; \
case experimental::DataType::INT32: \
func<int32_t>(__VA_ARGS__); \
break; \
case experimental::DataType::INT64: \
func<int64_t>(__VA_ARGS__); \
break; \
default: \
VLOG(0) << "Error: Unknown DataType."; \
exit(-1); \
}
#define HOST_NAME_MAX 256
#else
#define GENERATE_FUNC(type, func, args...) \
switch (type) { \
case experimental::DataType::FLOAT32: \
func<float>(args); \
break; \
case experimental::DataType::FLOAT64: \
func<double>(args); \
break; \
case experimental::DataType::FLOAT16: \
func<gloo::float16>(args); \
break; \
case experimental::DataType::INT32: \
func<int32_t>(args); \
break; \
case experimental::DataType::INT64: \
func<int64_t>(args); \
break; \
default: \
VLOG(0) << "Error: Unknown DataType."; \
exit(-1); \
}
#endif
typedef void (*reduce_func)(void*, const void*, const void*, size_t);
template <typename T>
reduce_func get_function(const ReduceOp& r) {
switch (r) {
case ReduceOp::SUM:
return reduce_func(&::gloo::sum<T>);
case ReduceOp::PRODUCT:
return reduce_func(&::gloo::product<T>);
case ReduceOp::MIN:
return reduce_func(&::gloo::min<T>);
case ReduceOp::MAX:
return reduce_func(&::gloo::max<T>);
case ReduceOp::AVG:
VLOG(0) << "Error: Unsupported ReduceOp::AVG.";
exit(-1);
}
VLOG(0) << "Error: Unknown ReduceOp.";
exit(-1);
}
bool CheckTensorsInCPUPlace(const std::vector<Tensor>& tensors) {
return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) {
return t.place() == PlaceType::kCPU;
});
}
template <typename T>
T* get_data(const Tensor& tensor) {
auto raw_tensor = std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
return static_cast<T*>(raw_tensor->data());
}
template <typename T>
std::vector<T*> get_multi_data(const std::vector<Tensor>& tensors) {
std::vector<T*> ret(tensors.size());
for (size_t i = 0; i < tensors.size(); i++) {
ret[i] = get_data<T>(tensors[i]);
}
return ret;
}
template <typename T, typename P>
void set_output(P& opts, const Tensor& tensor) { // NOLINT
opts.setOutput(get_data<T>(tensor), tensor.numel());
}
template <typename T, typename P>
void set_input(P& opts, const Tensor& tensor) { // NOLINT
opts.setInput(get_data<T>(tensor), tensor.numel());
}
template <typename T, typename P>
void set_outputs(P& opts, const std::vector<Tensor>& tensors) { // NOLINT
opts.setOutputs(get_multi_data<T>(tensors), tensors[0].numel());
}
template <typename T, typename P>
void set_inputs(P& opts, const std::vector<Tensor>& tensors) { // NOLINT
opts.setInputs(get_multi_data<T>(tensors), tensors[0].numel());
}
ProcessGroupGloo::GlooTask::GlooTask(int rank,
const std::vector<Tensor>& inputs,
CommType comm_type)
: ProcessGroup::Task(rank, inputs, comm_type) {
PADDLE_ENFORCE_EQ(CheckTensorsInCPUPlace(inputs), true,
platform::errors::Fatal(
"Only CPU place is supported for ProcessGroupGloo."));
}
ProcessGroupGloo::ProcessGroupGloo(const std::shared_ptr<GlooStore>& store,
int rank, int world_size,
const std::shared_ptr<GlooOptions> options)
: ProcessGroup(rank, world_size), _tag(0), _store(store) {
_context = std::make_shared<gloo::rendezvous::Context>(rank, world_size);
auto prefix_store =
::gloo::rendezvous::PrefixStore(std::to_string(0), *_store);
_context->connectFullMesh(prefix_store, options->device);
}
class BroadcastGlooTask : public ProcessGroupGloo::GlooTask {
public:
BroadcastGlooTask(const std::shared_ptr<gloo::Context>& context,
const std::vector<Tensor>& inputs, int rank, int root,
uint32_t tag)
: ProcessGroupGloo::GlooTask(rank, inputs, CommType::BROADCAST),
_context(context),
_root(root),
_inputs(inputs),
_tag(tag) {}
void Run() override { _do_broadcast(_inputs[0]); }
private:
std::shared_ptr<gloo::Context> _context;
const int _root;
std::vector<Tensor> _inputs{};
const uint32_t _tag;
void _do_broadcast(const Tensor& tensor) {
gloo::BroadcastOptions opts(_context);
const auto& dtype = tensor.type();
GENERATE_FUNC(dtype, set_output, opts, tensor);
opts.setRoot(_root);
opts.setTag(_tag);
gloo::broadcast(opts);
}
};
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Broadcast(
std::vector<Tensor>& inputs, const BroadcastOptions& opts) {
auto root = opts.source_rank;
std::unique_ptr<BroadcastGlooTask> task;
auto tag = next_tag();
auto context = get_context();
task = std::make_unique<BroadcastGlooTask>(context, inputs, rank_, root, tag);
task->Run();
return task;
}
class AllreduceGlooTask : public ProcessGroupGloo::GlooTask {
public:
AllreduceGlooTask(int rank, const std::shared_ptr<gloo::Context>& context,
std::vector<Tensor>& inputs, ReduceOp reduce_op, // NOLINT
uint32_t tag)
: ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLREDUCE),
_context(context),
_inputs(inputs),
_reduce_op(reduce_op),
_tag(tag) {}
void Run() override { _do_allreduce(_inputs); }
private:
std::shared_ptr<gloo::Context> _context;
std::vector<Tensor> _inputs;
const ReduceOp _reduce_op;
uint32_t _tag;
gloo::AllreduceOptions::Func _get_function(const experimental::DataType type,
const ReduceOp op) {
gloo::AllreduceOptions::Func fn;
GENERATE_FUNC(type, _get_function_impl, fn, op);
return fn;
}
template <typename T>
void _get_function_impl(gloo::AllreduceOptions::Func& fn, // NOLINT
const ReduceOp op) {
fn = get_function<T>(op);
}
void _do_allreduce(std::vector<Tensor>& tensors) { // NOLINT
const auto& dtype = tensors[0].type();
gloo::AllreduceOptions opts(_context);
GENERATE_FUNC(dtype, set_inputs, opts, tensors);
GENERATE_FUNC(dtype, set_outputs, opts, tensors);
opts.setReduceFunction(_get_function(dtype, _reduce_op));
opts.setTag(_tag);
gloo::allreduce(opts);
}
};
std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::AllReduce(
std::vector<Tensor>& inputs, const AllreduceOptions& opts) {
auto tag = next_tag();
std::shared_ptr<GlooTask> task;
auto context = get_context();
task = std::make_shared<AllreduceGlooTask>(rank_, context, inputs,
opts.reduce_op, tag);
task->Run();
return task;
}
std::shared_ptr<::gloo::transport::Device>
ProcessGroupGloo::createDeviceForInterface(const std::string& ifname) {
::gloo::transport::tcp::attr attr;
attr.iface = ifname;
return ::gloo::transport::tcp::CreateDevice(attr);
}
std::shared_ptr<::gloo::transport::Device>
ProcessGroupGloo::createDeviceForHostname(const std::string& hostname) {
::gloo::transport::tcp::attr attr;
attr.hostname = hostname;
return ::gloo::transport::tcp::CreateDevice(attr);
}
std::shared_ptr<::gloo::transport::Device>
ProcessGroupGloo::createDefaultDevice() {
std::array<char, HOST_NAME_MAX> hostname{};
auto ret = ::gethostname(hostname.data(), HOST_NAME_MAX);
PADDLE_ENFORCE_EQ(ret, 0, platform::errors::Fatal(
"Get hostname error for createDefaultDevice."));
::addrinfo* result;
result = tcputils::get_addr_info(hostname.data(), "", 0, AF_UNSPEC);
::addrinfo* cur;
for (cur = result; cur != nullptr; cur = cur->ai_next) {
SocketType socket =
::socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol);
if (socket == -1) {
continue;
}
ret = ::bind(socket, cur->ai_addr, cur->ai_addrlen);
#ifdef _WIN32
closesocket(socket);
#else
close(socket);
#endif
if (ret == -1) {
continue;
}
break;
}
freeaddrinfo(result);
if (cur != nullptr) {
return createDeviceForHostname(hostname.data());
}
return createDeviceForHostname("127.0.0.1");
}
} // namespace distributed
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <future>
#include <mutex>
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#ifdef PADDLE_WITH_GLOO
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
#include "paddle/fluid/distributed/store/store.h"
#include "paddle/fluid/distributed/store/tcp_store.h"
constexpr const char* GLOO_BACKEND_NAME = "GLOO";
namespace paddle {
namespace distributed {
class ProcessGroupGloo : public ProcessGroup {
public:
class GlooTask : public ProcessGroup::Task,
public std::enable_shared_from_this<GlooTask> {
public:
explicit GlooTask(int rank, const std::vector<Tensor>& input_tensors,
CommType comm_type);
~GlooTask() = default;
virtual void Run() = 0;
bool Wait(std::chrono::milliseconds timeout) override { return true; }
bool IsCompleted() override { return true; }
void Synchronize() override {}
protected:
friend class ProcessGroupGloo;
};
class GlooStore : public ::gloo::rendezvous::Store {
public:
explicit GlooStore(
const std::shared_ptr<paddle::distributed::TCPStore>& store)
: _store(store) {}
~GlooStore() = default;
std::vector<char> get(const std::string& key) override {
VLOG(3) << "GlooStore::get";
auto value = _store->get(key);
return std::vector<char>(value.begin(), value.end());
}
void wait(const std::vector<std::string>& keys) override {
VLOG(3) << "GlooStore::wait";
for (auto& key : keys) {
_store->wait(key);
}
}
void set(const std::string& key, const std::vector<char>& value) override {
VLOG(3) << "GlooStore::set";
std::vector<uint8_t> tmp(value.begin(), value.end());
_store->set(key, tmp);
}
void wait(const std::vector<std::string>& keys,
const std::chrono::milliseconds& timeout) override {
VLOG(3) << "GlooStore::wait";
for (auto& key : keys) {
_store->wait(key);
}
// wait(keys);
}
protected:
std::shared_ptr<paddle::distributed::TCPStore> _store;
};
class GlooOptions {
public:
GlooOptions() = default;
~GlooOptions() = default;
static std::shared_ptr<GlooOptions> create() {
return std::make_shared<GlooOptions>();
}
std::shared_ptr<::gloo::transport::Device> device;
};
explicit ProcessGroupGloo(const std::shared_ptr<GlooStore>& store, int rank,
int world_size,
std::shared_ptr<GlooOptions> options);
~ProcessGroupGloo() = default;
std::shared_ptr<ProcessGroup::Task> Broadcast(
std::vector<Tensor>& inputs,
const BroadcastOptions& = BroadcastOptions()) override;
std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<Tensor>& inputs,
const AllreduceOptions& opts = AllreduceOptions()) override;
std::shared_ptr<::gloo::Context> get_context() { return _context; }
uint64_t next_tag() { return _tag++; }
const std::string GetBackendName() const override {
return GLOO_BACKEND_NAME;
}
// Helper functions for Gloo.
static std::shared_ptr<::gloo::transport::Device> createDeviceForHostname(
const std::string& hostname);
static std::shared_ptr<::gloo::transport::Device> createDeviceForInterface(
const std::string& ifname);
static std::shared_ptr<::gloo::transport::Device> createDefaultDevice();
protected:
uint32_t _tag;
std::shared_ptr<gloo::rendezvous::Context> _context;
std::shared_ptr<GlooStore> _store;
};
} // namespace distributed
} // namespace paddle
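A hedged construction sketch for the new Gloo process group, using only the pieces declared above (host, port, rank, and world_size are placeholders; paddle::distributed:: qualifiers are mostly omitted for brevity):

// Assumed setup code, not part of the commit:
auto tcp_store = std::make_shared<paddle::distributed::TCPStore>(
    "127.0.0.1", 6170, /*is_master=*/rank == 0, /*num_workers=*/world_size,
    std::chrono::seconds(300));
auto store = std::make_shared<ProcessGroupGloo::GlooStore>(tcp_store);
auto opts = ProcessGroupGloo::GlooOptions::create();
opts->device = ProcessGroupGloo::createDefaultDevice();
auto pg = std::make_shared<ProcessGroupGloo>(store, rank, world_size, opts);
std::vector<Tensor> tensors = {cpu_tensor};  // CPU tensors only (see GlooTask check)
pg->AllReduce(tensors);                      // runs synchronously through GlooTask::Run()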
@@ -473,5 +473,148 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv(
return task;
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllGather(
std::vector<Tensor>& in_tensors, std::vector<Tensor>& out_tensors) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors), true,
platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
return Collective(
in_tensors, out_tensors,
[&](const Tensor& input, Tensor& output, ncclComm_t comm,
const gpuStream_t& stream) {
auto input_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
auto output_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
return platform::dynload::ncclAllGather(
input_tensor->data(), output_tensor->data(), input_tensor->numel(),
platform::ToNCCLDataType(input.type()), comm, stream);
},
CommType::ALLGATHER);
}
void* GetPointerByOffset(void* raw_pointer, size_t offset,
experimental::DataType type) {
if (type == experimental::DataType::FLOAT32) {
return reinterpret_cast<void*>(reinterpret_cast<float*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::FLOAT64) {
return reinterpret_cast<void*>(reinterpret_cast<double*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::INT32) {
return reinterpret_cast<void*>(reinterpret_cast<int32_t*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::INT64) {
return reinterpret_cast<void*>(reinterpret_cast<int64_t*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::FLOAT16) {
return reinterpret_cast<void*>(reinterpret_cast<int16_t*>(raw_pointer) +
offset);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"This datatype in nccl is not supported."));
}
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllToAll(
std::vector<Tensor>& in_tensors, std::vector<Tensor>& out_tensors) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
return Collective(
in_tensors, out_tensors,
[&](const Tensor& input, Tensor& output, ncclComm_t comm,
const gpuStream_t& stream) {
auto input_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
auto output_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
size_t offset = 0;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
for (auto i = 0; i < size_; i++) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
GetPointerByOffset(input_tensor->data(), offset, input.type()),
input_tensor->numel() / size_,
platform::ToNCCLDataType(input.type()), i, comm, stream));
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
GetPointerByOffset(output_tensor->data(), offset, input.type()),
input_tensor->numel() / size_,
platform::ToNCCLDataType(input.type()), i, comm, stream));
offset += input_tensor->numel() / size_;
}
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
},
CommType::ALLREDUCE);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Reduce(
std::vector<Tensor>& tensors, const ReduceOptions& opts) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
return Collective(
tensors, tensors,
[&](const Tensor& input, Tensor& output, ncclComm_t comm,
const gpuStream_t& stream) {
auto input_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
auto output_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce(
input_tensor->data(), output_tensor->data(), input.numel(),
platform::ToNCCLDataType(input.type()),
ToNCCLRedType(opts.reduce_op), opts.root_rank, comm, stream));
},
CommType::REDUCE);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Scatter(
std::vector<Tensor>& in_tensors, std::vector<Tensor>& out_tensors,
const ScatterOptions& opts) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
return Collective(
in_tensors, out_tensors,
[&](const Tensor& input, Tensor& output, ncclComm_t comm,
const gpuStream_t& stream) {
auto input_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
auto output_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
size_t offset = 0;
if (rank_ == opts.root_rank) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
for (auto i = 0; i < size_; i++) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
GetPointerByOffset(input_tensor->data(), offset, input.type()),
input_tensor->numel() / size_,
platform::ToNCCLDataType(input.type()), i, comm, stream));
offset += input_tensor->numel() / size_;
}
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
output_tensor->data(), input_tensor->numel() / size_,
platform::ToNCCLDataType(input.type()), opts.root_rank, comm,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
} else {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
output_tensor->data(), input_tensor->numel() / size_,
platform::ToNCCLDataType(input.type()), opts.root_rank, comm,
stream));
}
},
CommType::SCATTER);
}
} // namespace distributed
} // namespace paddle
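A small worked example of the chunking arithmetic that AllToAll and Scatter above rely on (the numbers are illustrative, not from the commit): each input tensor's numel must be divisible by size_, every peer gets numel / size_ elements, and the send offset advances by that amount per rank.

// Illustrative only: size_ == 4 and an input tensor of 8 elements.
size_t numel = 8, world_size = 4, offset = 0;
for (size_t i = 0; i < world_size; ++i) {
  // rank i is sent the elements [offset, offset + numel / world_size)
  offset += numel / world_size;  // offset walks 0 -> 2 -> 4 -> 6
}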
@@ -98,6 +98,20 @@ class ProcessGroupNCCL : public ProcessGroup {
std::shared_ptr<ProcessGroup::Task> Recv(std::vector<Tensor>& tensors,
int src_rank) override;
std::shared_ptr<ProcessGroup::Task> AllGather(
std::vector<Tensor>& in_tensors,
std::vector<Tensor>& out_tensors) override;
std::shared_ptr<ProcessGroup::Task> AllToAll(
std::vector<Tensor>& in, std::vector<Tensor>& out) override;
std::shared_ptr<ProcessGroup::Task> Reduce(
std::vector<Tensor>& tensors, const ReduceOptions& opts) override;
std::shared_ptr<ProcessGroup::Task> Scatter(std::vector<Tensor>& in_tensors,
std::vector<Tensor>& out_tensors,
const ScatterOptions&) override;
protected:
virtual std::shared_ptr<ProcessGroupNCCL::NCCLTask> CreateTask(
std::vector<Place> places, int rank, CommType opType,
......
@@ -36,5 +36,14 @@ struct BarrierOptions {
std::vector<int> place_ids;
};
struct ReduceOptions {
ReduceOp reduce_op = ReduceOp::SUM;
int root_rank = 0;
};
struct ScatterOptions {
int root_rank = 0;
};
} // namespace distributed
} // namespace paddle
@@ -32,6 +32,8 @@ class Store {
virtual int64_t add(const std::string& key, int64_t value) = 0;
virtual std::vector<uint8_t> get(const std::string& key) = 0;
virtual void wait(const std::string& key) = 0;
virtual void set(const std::string& key,
const std::vector<uint8_t>& value) = 0;
virtual const std::chrono::seconds& timeout() const { return _timeout; }
......
@@ -27,11 +27,13 @@ namespace detail {
constexpr int INFTIME = -1;
std::unique_ptr<MasterDaemon> MasterDaemon::start(SocketType socket) {
return std::make_unique<MasterDaemon>(socket);
}
std::unique_ptr<MasterDaemon> MasterDaemon::start(SocketType socket,
int nranks) {
return std::make_unique<MasterDaemon>(socket, nranks);
}
MasterDaemon::MasterDaemon(SocketType socket) : _listen_socket(socket) {
MasterDaemon::MasterDaemon(SocketType socket, int nranks)
: _listen_socket(socket), _nranks(nranks) {
_background_thread = std::thread{&MasterDaemon::run, this};
}
@@ -64,6 +66,13 @@ void MasterDaemon::_do_add(SocketType socket) {
tcputils::send_value<int64_t>(socket, new_value);
}
void MasterDaemon::_do_set(SocketType socket) {
VLOG(3) << "MasterDaemon::_do_set";
std::string key = tcputils::receive_string(socket);
auto value = tcputils::receive_vector<uint8_t>(socket);
_store[key] = value;
}
void MasterDaemon::_do_get(SocketType socket) {
std::string key = tcputils::receive_string(socket);
auto iter = _store.find(key);
@@ -71,16 +80,15 @@ void MasterDaemon::_do_get(SocketType socket) {
iter, _store.end(),
platform::errors::InvalidArgument("Key %s not found in TCPStore.", key));
std::vector<uint8_t> value = iter->second;
VLOG(3) << "TCPStore: value ("
<< std::stoll(std::string(reinterpret_cast<char*>(value.data()),
value.size()))
<< ") for key (" << key << ").";
tcputils::send_vector<uint8_t>(socket, value);
}
void MasterDaemon::_do_stop(SocketType socket) {
VLOG(3) << "MasterDaemon::_do_stop";
ReplyType value = ReplyType::STOP_WAIT;
_stop = true;
if (--_nranks == 0) {
_stop = true;
}
tcputils::send_value<ReplyType>(socket, value);
}
@@ -140,21 +148,27 @@ void MasterDaemon::run() {
case Command::GET:
_do_get(fds[i].fd);
break;
case Command::SET:
_do_set(fds[i].fd);
break;
case Command::WAIT:
_do_wait(fds[i].fd);
break;
case Command::STOP:
_do_stop(fds[i].fd);
break;
default:
VLOG(0) << "Unknow command: " << static_cast<int>(command);
exit(-1);
}
}
}
}
std::unique_ptr<TCPServer> TCPServer::create(uint16_t port) {
std::unique_ptr<TCPServer> TCPServer::create(uint16_t port, int nranks) {
int socket = tcputils::tcp_listen("", std::to_string(port), AF_INET);
auto server = std::make_unique<TCPServer>();
server->_master_daemon = MasterDaemon::start(socket);
server->_master_daemon = MasterDaemon::start(socket, nranks);
return server;
}
@@ -200,7 +214,7 @@ TCPStore::TCPStore(std::string host, uint16_t port, bool is_master,
size_t num_workers, std::chrono::seconds timeout)
: Store(timeout), _is_master(is_master), _num_workers(num_workers) {
if (_is_master) {
_server = detail::TCPServer::create(port);
_server = detail::TCPServer::create(port, num_workers);
}
_client = detail::TCPClient::connect(host, port);
@@ -213,36 +227,41 @@ void TCPStore::waitWorkers() {
}
add(_init_key, 1);
if (_server) {
auto begin = std::chrono::steady_clock::now();
do {
auto value = get(_init_key);
int completed = std::stoi(std::string(value.begin(), value.end()));
VLOG(3) << completed << " worker ready, total " << _num_workers;
if (completed >= _num_workers) {
break;
}
const auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(
std::chrono::steady_clock::now() - begin);
std::this_thread::sleep_for(std::chrono::milliseconds(100));
if (_timeout != tcputils::kNoTimeout && elapsed > _timeout) {
PADDLE_ENFORCE_EQ(
completed, _num_workers,
platform::errors::InvalidArgument(
"TCPStore timeouted and not all workers got ready."));
}
} while (true);
}
VLOG(3) << "TCPStore initialized.";
}
int64_t TCPStore::add(const std::string& key, int64_t value) {
VLOG(3) << "TCPStore add.";
_client->send_command_for_key(Command::ADD, _key_prefix + key);
_client->send_value<std::int64_t>(value);
return _client->receive_value<std::int64_t>();
}
void TCPStore::set(const std::string& key, const std::vector<uint8_t>& value) {
VLOG(3) << "TCPStore set.";
_client->send_command_for_key(Command::SET, _key_prefix + key);
_client->send_vector<std::uint8_t>(value);
}
std::vector<uint8_t> TCPStore::get(const std::string& key) {
wait(key);
_client->send_command_for_key(Command::GET, _key_prefix + key);
@@ -252,6 +271,7 @@ std::vector<uint8_t> TCPStore::get(const std::string& key) {
void TCPStore::wait(const std::string& key) {
ReplyType reply;
VLOG(3) << "TCPStore wait.";
do {
_client->send_command_for_key(Command::WAIT, _key_prefix + key);
@@ -262,6 +282,7 @@ void TCPStore::wait(const std::string& key) {
TCPStore::~TCPStore() {
_client->send_command_for_key(Command::STOP, "");
VLOG(3) << "~TCPStore";
ReplyType ret = _client->receive_value<ReplyType>();
PADDLE_ENFORCE_EQ(ret, ReplyType::STOP_WAIT,
platform::errors::InvalidArgument(
......
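A hedged round-trip sketch for the new SET command (host, port, and key names are placeholders; the constructor arguments follow the TCPStore signature shown above):

paddle::distributed::TCPStore store("127.0.0.1", 6170, /*is_master=*/true,
                                    /*num_workers=*/1, std::chrono::seconds(300));
std::vector<uint8_t> payload = {'o', 'k'};
store.set("init/flag", payload);        // client issues Command::SET, daemon stores the bytes
auto fetched = store.get("init/flag");  // wait(key) first, then Command::GET returns them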
@@ -27,15 +27,16 @@ namespace paddle {
namespace distributed {
enum class ReplyType { WAITING, STOP_WAIT };
enum class Command { ADD, GET, WAIT, STOP };
enum class Command { ADD, GET, SET, WAIT, STOP };
namespace detail {
class MasterDaemon {
public:
static std::unique_ptr<MasterDaemon> start(SocketType listen_socket);
static std::unique_ptr<MasterDaemon> start(SocketType listen_socket,
int nranks);
MasterDaemon() = delete;
explicit MasterDaemon(SocketType listen_socket);
explicit MasterDaemon(SocketType listen_socket, int nranks);
~MasterDaemon();
private:
@@ -43,18 +44,20 @@ class MasterDaemon {
void _do_add(SocketType socket);
void _do_wait(SocketType socket);
void _do_get(SocketType socket);
void _do_set(SocketType socket);
void _do_stop(SocketType socket);
SocketType _listen_socket;
std::vector<SocketType> _sockets;
std::unordered_map<std::string, std::vector<uint8_t>> _store;
std::thread _background_thread{};
int _nranks;
bool _stop = false;
};
class TCPServer {
public:
TCPServer() = default;
static std::unique_ptr<TCPServer> create(std::uint16_t port);
static std::unique_ptr<TCPServer> create(std::uint16_t port, int nranks);
private:
std::unique_ptr<MasterDaemon> _master_daemon;
@@ -97,6 +100,7 @@ class TCPStore : public Store {
int64_t add(const std::string& key, int64_t value) override;
std::vector<uint8_t> get(const std::string& key) override;
void wait(const std::string& key) override;
void set(const std::string& key, const std::vector<uint8_t>& value) override;
private:
void waitWorkers();
......
@@ -46,9 +46,10 @@ void close_socket(SocketType socket) {
hints.ai_socktype = SOCK_STREAM;
const char* node = host.empty() ? nullptr : host.c_str();
const char* port_cstr = port.empty() ? nullptr : port.c_str();
int n;
n = ::getaddrinfo(node, port.c_str(), &hints, &res);
n = ::getaddrinfo(node, port_cstr, &hints, &res);
const char* gai_err = ::gai_strerror(n);
const char* proto =
(family == AF_INET ? "IPv4" : family == AF_INET6 ? "IPv6" : "");
......
@@ -24,11 +24,14 @@ class GradNodeAccumulation : public GradNodeBase {
public:
// Constructor: configure fwd input tensors to grad node
explicit GradNodeAccumulation(AutogradMeta* meta) : GradNodeBase(1, 1) {
VLOG(6) << "Construct GradNodeAccumulation";
weak_grad_ = meta->WeakGrad();
SetDefaultGradInOutMeta();
}
~GradNodeAccumulation() override = default;
~GradNodeAccumulation() override {
VLOG(6) << "Destruct GradNodeAccumulation";
}
// Functor: perform backward computations
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
......
@@ -46,7 +46,7 @@ class GradNodeScale : public GradNodeBase {
const std::vector<paddle::experimental::Tensor>& tensors);
void SetAttributes_scale(float scale);
std::string name() override { return ""; }
// Members: define fwd input tensors
// For Scale there is no fwd input tensor needed
private:
......
@@ -996,6 +996,29 @@ static std::string GenerateGradNodeCreationContent(
// then generate: "egr::AutogradMeta* p_autograd_out =
// egr::EagerUtils::autograd_meta("op_proto->outputs()[0].name()")"
std::string get_autograd_meta_str = " // Prepare Autograd Meta \n";
// If single output slotname and not duplicable,
// then generate: "egr::AutogradMeta* p_autograd_out =
// egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")"
for (const proto::OpProto::Var& output : out_vars) {
const std::string& output_name = output.name();
const std::string& output_autograd_name = "p_autograd_" + output_name;
if (output.duplicable()) {
const char* GET_MULTI_AUTOGRAD_META_TEMPLATE =
" std::vector<egr::AutogradMeta*> %s = "
"egr::EagerUtils::autograd_meta(&%s);\n";
get_autograd_meta_str += paddle::string::Sprintf(
GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
} else {
const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
" egr::AutogradMeta* %s = "
"egr::EagerUtils::autograd_meta(&%s);\n";
get_autograd_meta_str += paddle::string::Sprintf(
GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
}
}
VLOG(6) << "Generated outputs autograd_meta";
for (const proto::OpProto::Var& input : in_vars) {
const std::string& input_name = input.name();
const std::string& input_autograd_name = "p_autograd_" + input_name;
@@ -1024,31 +1047,6 @@ static std::string GenerateGradNodeCreationContent(
}
VLOG(6) << "Generated inputs autograd_meta";
// If single output slotname and not duplicable,
// then generate: "egr::AutogradMeta* p_autograd_out =
// egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")"
for (const proto::OpProto::Var& output : out_vars) {
const std::string& output_name = output.name();
const std::string& output_autograd_name = "p_autograd_" + output_name;
// Skip Intermediate Tensor
if (output.duplicable()) {
const char* GET_MULTI_AUTOGRAD_META_TEMPLATE =
" std::vector<egr::AutogradMeta*> %s = "
"egr::EagerUtils::autograd_meta(&%s);\n";
get_autograd_meta_str += paddle::string::Sprintf(
GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
} else {
const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
" egr::AutogradMeta* %s = "
"egr::EagerUtils::autograd_meta(&%s);\n";
get_autograd_meta_str += paddle::string::Sprintf(
GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
}
}
VLOG(6) << "Generated outputs autograd_meta";
std::string prepare_autograd_meta_str = ""; std::string prepare_autograd_meta_str = "";
prepare_autograd_meta_str += get_autograd_meta_str; prepare_autograd_meta_str += get_autograd_meta_str;
prepare_autograd_meta_str += "\n"; prepare_autograd_meta_str += "\n";
...@@ -1204,11 +1202,12 @@ static std::string GenerateGradNodeCreationContent( ...@@ -1204,11 +1202,12 @@ static std::string GenerateGradNodeCreationContent(
" %s" " %s"
" bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(%s);\n" " bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(%s);\n"
" if(require_any_grad) {\n" " if(require_any_grad) {\n"
" VLOG(6) << \" Construct Grad for %s \"; \n"
" egr::EagerUtils::PassStopGradient(%s);\n" " egr::EagerUtils::PassStopGradient(%s);\n"
"%s\n }"; "%s\n }";
std::string grad_node_creation_body_str = paddle::string::Sprintf( std::string grad_node_creation_body_str = paddle::string::Sprintf(
GRAD_NODE_CREATION_TEMPLATE, prepare_autograd_meta_str, GRAD_NODE_CREATION_TEMPLATE, prepare_autograd_meta_str,
compute_require_grad_args, pass_stop_gradient_args, compute_require_grad_args, op_type, pass_stop_gradient_args,
grad_node_creation_str); grad_node_creation_str);
return grad_node_creation_body_str; return grad_node_creation_body_str;
@@ -2083,22 +2082,24 @@ static std::string GenerateGradNodeHeaderContents(
const char* GRAD_NODE_TEMPLATE =
"class GradNode%s : public egr::GradNodeBase {\n"
" public:\n"
" GradNode%s() : egr::GradNodeBase() {}\n"
" GradNode%s() : egr::GradNodeBase() { VLOG(7) << \" Construct "
"GradNode%s \"; }\n"
" GradNode%s(size_t bwd_in_slot_num, size_t bwd_out_slot_num) : "
"egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {}\n"
" ~GradNode%s() override = default;\n"
"egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) { VLOG(7) << \" "
"Construct GradNode%s \"; }\n"
" ~GradNode%s() override { VLOG(6) << \" Destruct GradNode%s \"; }\n"
"\n"
" virtual std::vector<std::vector<paddle::experimental::Tensor>> "
"operator()(const "
"std::vector<std::vector<paddle::experimental::Tensor>>& grads) "
"override;\n"
"\n"
" std::string name() override { return \" GradNode%s \"; } \n "
"\n"
" // SetX, SetY, ...\n"
"%s\n"
" // SetAttrMap\n"
"%s\n"
" std::string name() { return \"GradNode%s\"; }\n"
"\n"
" private:\n"
" // TensorWrappers\n"
"%s\n"
@@ -2195,8 +2196,8 @@ static std::string GenerateGradNodeHeaderContents(
VLOG(6) << "Generated TensorWrapper";
std::string grad_node_str = paddle::string::Sprintf(
GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type,
set_tensor_wrappers_str, set_attr_map_str, op_type,
GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, op_type, op_type,
op_type, op_type, set_tensor_wrappers_str, set_attr_map_str,
tensor_wrapper_members_str, attr_members_str);
return grad_node_str;
......
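To make the template change above concrete, this is roughly what the updated GRAD_NODE_TEMPLATE would expand to for a hypothetical op "relu" (whitespace approximated, member sections elided; this expansion is an illustration, not output copied from the generator):

class GradNoderelu : public egr::GradNodeBase {
 public:
  GradNoderelu() : egr::GradNodeBase() { VLOG(7) << " Construct GradNoderelu "; }
  GradNoderelu(size_t bwd_in_slot_num, size_t bwd_out_slot_num)
      : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {
    VLOG(7) << " Construct GradNoderelu ";
  }
  ~GradNoderelu() override { VLOG(6) << " Destruct GradNoderelu "; }

  virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
      const std::vector<std::vector<paddle::experimental::Tensor>>& grads) override;

  std::string name() override { return " GradNoderelu "; }
  // SetX, SetY, ... / SetAttrMap / TensorWrappers elided
};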
@@ -213,8 +213,12 @@ def ParseYamlReturns(string):
returns = [x.strip() for x in string.strip().split(",")]
for i in range(len(returns)):
ret = returns[i]
returns_list.append(["", ret, i])
ret_type = returns[i]
assert ret_type in yaml_types_mapping.keys()
ret_type = yaml_types_mapping[ret_type]
returns_list.append(["", ret_type, i])
return returns_list
@@ -534,7 +538,7 @@ class {} : public egr::GradNodeBase {{
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads) override;
std::string name() override {{ return \" {} \"; }}
// SetTensorWrapperX, SetTensorWrapperY, ...
{}
// SetAttributes
@@ -549,8 +553,9 @@ class {} : public egr::GradNodeBase {{
"""
node_declaration_str = NODE_DECLARATION_TEMPLATE.format(
grad_node_name, grad_node_name, grad_node_name, grad_node_name,
set_tensor_wrapper_methods_str, set_attribute_methods_str,
tensor_wrapper_members_str, attribute_members_str)
grad_node_name, grad_node_name, grad_node_name, grad_node_name,
grad_node_name, set_tensor_wrapper_methods_str,
set_attribute_methods_str, tensor_wrapper_members_str,
attribute_members_str)
return node_declaration_str
......
@@ -48,12 +48,16 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
}
visited.insert(node);
PADDLE_ENFORCE_NOT_NULL(
node,
paddle::platform::errors::Fatal(
"We got null node when we traverse the backward graph, and this "
"should not happened please check your code and contact us."));
// Find and append next nodes
const std::vector<std::vector<Edge>>& edges = node->GetEdges();
for (const auto& edge_list : edges) {
for (const Edge& edge : edge_list) {
GradNodeBase* next_node = edge.GetMutableGradNode().get();
// Next node could be nullptr if it is leaf tensor with no
// AccumulationNode attached
// Or it could also originated from dispensable inputs
@@ -67,7 +71,6 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
}
}
}
return node_in_degree_map;
}
......
@@ -30,6 +30,7 @@
namespace egr {
GradNodeBase::GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num) {
VLOG(6) << "Construct GradNodeBase";
bwd_in_meta_.resize(bwd_in_slot_num);
bwd_out_meta_.resize(bwd_out_slot_num);
// adj_edges has the same num as backward outputs
@@ -49,11 +50,15 @@ void GradNodeBase::AddEdges(std::vector<AutogradMeta*>* metas, size_t slot_id) {
// its pre-ops
if (meta && !meta->StopGradient()) {
auto node = meta->GetMutableGradNode();
if (node) {
if (node && node.get()) {
VLOG(6) << "Add Edges for slot: " << slot_id
<< " which is: " << meta->GetMutableGradNode()->name();
adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
meta->OutRankInfo());
} else {
meta->SetGradNode(std::make_shared<egr::GradNodeAccumulation>(meta));
VLOG(6) << "Add Edges for slot: " << slot_id
<< " which is: " << meta->GetMutableGradNode()->name();
adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
meta->OutRankInfo());
}
@@ -70,7 +75,7 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) {
"inputs's slot num."));
if (meta && !meta->StopGradient()) {
auto node = meta->GetMutableGradNode();
if (node) {
if (node && node.get()) {
VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from "
<< this->name() << " to " << meta->GetMutableGradNode()->name();
adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
......
@@ -76,10 +76,10 @@ class GradSlotMeta {
class GradNodeBase {
public:
GradNodeBase() = default;
GradNodeBase() { VLOG(6) << "Construct GradNodeBase"; }
GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num);
// TODO(jiabin): Should we have other constructor here?
virtual ~GradNodeBase() = default;
virtual ~GradNodeBase() { VLOG(6) << "Destruct GradNodeBase"; }
/**
* operator() designed to contian the real backward execution logic, it should
......
@@ -30,6 +30,7 @@ class GradTestNode : public egr::GradNodeBase {
GradTestNode(float val, int in_num, int out_num)
: GradNodeBase(in_num, out_num), val_(val) {}
GradTestNode() : GradNodeBase() { val_ = 1.0; }
std::string name() override { return "GradTestNode"; }
std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads)
override {
......
@@ -122,9 +122,10 @@ paddle::experimental::Tensor* EagerUtils::mutable_grad(
void EagerUtils::SetHistory(std::vector<AutogradMeta*>* autograd_metas,
const std::shared_ptr<GradNodeBase>& grad_node) {
for (const auto& autograd_meta : *autograd_metas) {
if (dynamic_cast<GradNodeAccumulation*>(autograd_meta->GradNode())) {
VLOG(6) << "Warning: Reseting GradNodeAccumulation for leaf tensor is "
"detected";
if (autograd_meta->GradNode()) {
VLOG(7) << "Should not set grad node twice, original node is:"
<< autograd_meta->GradNode()->name()
<< "current is: " << grad_node->name();
}
autograd_meta->SetGradNode(grad_node);
}
@@ -132,11 +133,11 @@ void EagerUtils::SetHistory(std::vector<AutogradMeta*>* autograd_metas,
void EagerUtils::SetHistory(AutogradMeta* autograd_meta,
const std::shared_ptr<GradNodeBase>& grad_node) {
if (dynamic_cast<GradNodeAccumulation*>(autograd_meta->GradNode())) {
VLOG(6)
<< "Warning: Reseting GradNodeAccumulation for leaf tensor is detected";
if (autograd_meta->GradNode()) {
VLOG(7) << "Should not set grad node twice, original node is:"
<< autograd_meta->GradNode()->name()
<< "current is: " << grad_node->name();
}
autograd_meta->SetGradNode(grad_node);
}
......
@@ -12,12 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include <gtest/gtest.h>
#include <boost/logic/tribool.hpp>
#include <random>
#include <unordered_set>
#include <random>
#include <string>
#include <unordered_set>
#include <boost/logic/tribool.hpp>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
#include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
@@ -25,7 +26,7 @@
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/place.h"
USE_OP(batch_norm);
USE_OP_ITSELF(batch_norm);
USE_OP_DEVICE_KERNEL(batch_norm, MKLDNN);
USE_OP(conv2d_transpose);
USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN);
......
@@ -409,7 +409,7 @@ class ThreadPoolTempl {
return false;
}
platform::RecordEvent("SleepWaitForWork",
platform::TracerEventType::UserDefined, 2);
platform::TracerEventType::UserDefined, 10);
ec_.CommitWait(waiter);
blocked_--;
return true;
......
@@ -2106,15 +2106,19 @@ void OperatorWithKernel::BuildPhiKernelContext(
for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
phi::TensorBase* tensor_out = nullptr;
auto* var = outs_vector[offset];
if (var) {
if (var->template IsType<framework::LoDTensor>()) {
tensor_out = var->template GetMutable<framework::LoDTensor>();
} else if (var->template IsType<phi::SelectedRows>()) {
tensor_out = var->template GetMutable<phi::SelectedRows>();
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported output `%s` type when call pt kernel.",
framework::ToTypeName(var->Type())));
}
}
pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
}
@@ -2215,8 +2219,6 @@ void OperatorWithKernel::BuildPhiKernelContext(
vector_int_attr.end());
pt_kernel_context->EmplaceBackAttr(vector_int64_attr);
}
// TODO(YuanRisheng) Need support vector<int64_t> attr
} else if (attr_defs[i].type_index ==
std::type_index(typeid(std::vector<int32_t>))) {
const auto& vector_int_attr = BOOST_GET_CONST(std::vector<int>, attr);
......
@@ -314,15 +314,18 @@ void BuildDygraphPhiKernelContext(
phi::TensorBase* tensor_out = nullptr;
auto* var = outs_vector[offset]->MutableVar();
if (var) {
if (var->template IsType<phi::DenseTensor>()) {
tensor_out = var->template GetMutable<phi::DenseTensor>();
} else if (var->template IsType<phi::SelectedRows>()) {
tensor_out = var->template GetMutable<phi::SelectedRows>();
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported output `%s` type when call pt kernel.",
framework::ToTypeName(var->Type())));
}
}
kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out);
}
kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i);
......
@@ -1289,15 +1289,3 @@ REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp,
ops::BatchNormDoubleGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(batch_norm_grad_grad, ops::BatchNormDoubleGradOp,
ops::BatchNormDoubleGradOpInplaceInferer);
REGISTER_OP_CPU_KERNEL(
batch_norm, ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>,
ops::BatchNormKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
batch_norm_grad,
ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
batch_norm_grad_grad,
ops::BatchNormDoubleGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::BatchNormDoubleGradKernel<paddle::platform::CPUDeviceContext, double>);
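The removed REGISTER_OP_CPU_KERNEL blocks are not lost functionality: the batch_norm CPU kernels now live in phi and are registered from the phi side. A standalone analogue of the mechanism involved — a registry keyed by op/dtype whose entries simply move from the operator's source file to the kernel's own source file (everything below is an illustrative stand-in, not Paddle's actual registry):

#include <functional>
#include <iostream>
#include <map>
#include <string>

using Kernel = std::function<void()>;

// function-local static keeps initialization order safe across translation units
std::map<std::string, Kernel>& Registry() {
  static std::map<std::string, Kernel> r;
  return r;
}

struct Registrar {
  Registrar(const std::string& key, Kernel k) { Registry()[key] = std::move(k); }
};

// before: this line lived next to the operator definition (batch_norm_op.cc);
// after:  the equivalent registration lives next to the kernel implementation.
static Registrar batch_norm_float("batch_norm/float", [] { std::cout << "bn<float>\n"; });

int main() { Registry()["batch_norm/float"](); }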
...@@ -25,10 +25,10 @@ limitations under the License. */ ...@@ -25,10 +25,10 @@ limitations under the License. */
#include "paddle/fluid/operators/conv_cudnn_helper.h" #include "paddle/fluid/operators/conv_cudnn_helper.h"
#endif #endif
#include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/operators/math/padding.h"
#include "paddle/fluid/platform/cudnn_workspace_helper.h" #include "paddle/fluid/platform/cudnn_workspace_helper.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/kernels/funcs/padding.h"
DECLARE_bool(cudnn_deterministic); DECLARE_bool(cudnn_deterministic);
DECLARE_uint64(conv_workspace_size_limit); DECLARE_uint64(conv_workspace_size_limit);
...@@ -148,7 +148,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> { ...@@ -148,7 +148,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
in_data_dims, strides, ksize); in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_input; Tensor transformed_input;
std::vector<int> padding_common(data_dim, 0); std::vector<int> padding_common(data_dim, 0);
...@@ -196,13 +196,13 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> { ...@@ -196,13 +196,13 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
T pad_value(0.0); T pad_value(0.0);
switch (rank) { switch (rank) {
case 4: { case 4: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_input_channel, pad_value, dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input); &transformed_input);
} break; } break;
case 5: { case 5: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, transformed_input_channel, pad_value, dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input); &transformed_input);
} break; } break;
default: default:
...@@ -488,7 +488,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -488,7 +488,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
// cuDNN only supports padding the same amount on every dimension. // cuDNN only supports padding the same amount on every dimension.
// So we create a new padded input tensor. // So we create a new padded input tensor.
int data_dim = strides.size(); // 2d or 3d int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_input(input->type()); Tensor transformed_input(input->type());
Tensor transformed_input_grad(input->type()); Tensor transformed_input_grad(input->type());
std::vector<int> padding_common(data_dim, 0); std::vector<int> padding_common(data_dim, 0);
...@@ -544,13 +544,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -544,13 +544,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
T pad_value(0.0); T pad_value(0.0);
switch (rank) { switch (rank) {
case 4: { case 4: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_input_channel, pad_value, dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input); &transformed_input);
} break; } break;
case 5: { case 5: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, transformed_input_channel, pad_value, dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input); &transformed_input);
} break; } break;
default: default:
...@@ -956,7 +956,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> { ...@@ -956,7 +956,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
in_data_dims, strides, ksize); in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_X(X->type()); Tensor transformed_X(X->type());
Tensor transformed_ddX(X->type()); Tensor transformed_ddX(X->type());
...@@ -1004,20 +1004,22 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> { ...@@ -1004,20 +1004,22 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
T pad_value(0.0); T pad_value(0.0);
switch (rank) { switch (rank) {
case 4: { case 4: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); dev_ctx, input_pad, transformed_X_channel, pad_value,
&transformed_X);
if (ddX) { if (ddX) {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_ddX_channel, pad_value, dev_ctx, input_pad, transformed_ddX_channel, pad_value,
&transformed_ddX); &transformed_ddX);
} }
} break; } break;
case 5: { case 5: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); dev_ctx, input_pad, transformed_X_channel, pad_value,
&transformed_X);
if (ddX) { if (ddX) {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, transformed_ddX_channel, pad_value, dev_ctx, input_pad, transformed_ddX_channel, pad_value,
&transformed_ddX); &transformed_ddX);
} }
} break; } break;
......
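Throughout this file the substantive change is that math::PadFunction, which took the whole framework::ExecutionContext, becomes phi::funcs::PadFunction taking the device context directly, so every call site swaps ctx for dev_ctx. A standalone analogue of that dependency narrowing (minimal invented types, not Paddle's):

#include <iostream>

struct DeviceContext {
  void Launch(const char* what) const { std::cout << "launch " << what << "\n"; }
};
struct ExecutionContext {
  DeviceContext dev;
  const DeviceContext& device_context() const { return dev; }
};

// before: the helper reached into the framework-level context
void PadOld(const ExecutionContext& ctx) { ctx.device_context().Launch("pad"); }

// after: the helper depends only on the device context, so phi can call it too
void PadNew(const DeviceContext& dev_ctx) { dev_ctx.Launch("pad"); }

int main() {
  ExecutionContext ctx;
  PadOld(ctx);                    // old call shape: Pad(ctx, ...)
  PadNew(ctx.device_context());   // new call shape: Pad(dev_ctx, ...), as in the diff
}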
...@@ -21,8 +21,8 @@ limitations under the License. */ ...@@ -21,8 +21,8 @@ limitations under the License. */
#include "paddle/fluid/operators/conv_cudnn_helper.h" #include "paddle/fluid/operators/conv_cudnn_helper.h"
#endif #endif
#include "paddle/fluid/operators/conv_transpose_op.h" #include "paddle/fluid/operators/conv_transpose_op.h"
#include "paddle/fluid/operators/math/padding.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/padding.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -108,7 +108,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> { ...@@ -108,7 +108,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
in_data_dims, strides, ksize); in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
std::vector<int> input_pad(input_transpose.dims().size() * 2, 0); std::vector<int> input_pad(input_transpose.dims().size() * 2, 0);
Tensor transformed_input; Tensor transformed_input;
...@@ -139,12 +139,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> { ...@@ -139,12 +139,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
T pad_value(0.0); T pad_value(0.0);
switch (rank) { switch (rank) {
case 4: { case 4: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, input_transpose, pad_value, &transformed_input); dev_ctx, input_pad, input_transpose, pad_value,
&transformed_input);
} break; } break;
case 5: { case 5: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, input_transpose, pad_value, &transformed_input); dev_ctx, input_pad, input_transpose, pad_value,
&transformed_input);
} break; } break;
default: default:
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
...@@ -375,7 +377,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -375,7 +377,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
in_data_dims, strides, ksize); in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
std::vector<int> input_pad(input_transpose.dims().size() * 2, 0); std::vector<int> input_pad(input_transpose.dims().size() * 2, 0);
Tensor transformed_output_grad; Tensor transformed_output_grad;
...@@ -407,13 +409,13 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -407,13 +409,13 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
T pad_value(0.0); T pad_value(0.0);
switch (rank) { switch (rank) {
case 4: { case 4: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, output_grad_transpose, pad_value, dev_ctx, input_pad, output_grad_transpose, pad_value,
&transformed_output_grad); &transformed_output_grad);
} break; } break;
case 5: { case 5: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, output_grad_transpose, pad_value, dev_ctx, input_pad, output_grad_transpose, pad_value,
&transformed_output_grad); &transformed_output_grad);
} break; } break;
default: default:
...@@ -735,7 +737,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> { ...@@ -735,7 +737,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
in_data_dims, strides, ksize); in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_X(X->type()); Tensor transformed_X(X->type());
Tensor transformed_ddX(X->type()); Tensor transformed_ddX(X->type());
...@@ -794,26 +796,28 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> { ...@@ -794,26 +796,28 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
T pad_value(0.0); T pad_value(0.0);
switch (rank) { switch (rank) {
case 4: { case 4: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); dev_ctx, input_pad, transformed_X_channel, pad_value,
&transformed_X);
if (dO) { if (dO) {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_dO_channel, pad_value, dev_ctx, input_pad, transformed_dO_channel, pad_value,
&transformed_dO); &transformed_dO);
} }
if (ddX) { if (ddX) {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_ddX_channel, pad_value, dev_ctx, input_pad, transformed_ddX_channel, pad_value,
&transformed_ddX); &transformed_ddX);
} }
} break; } break;
case 5: { case 5: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); dev_ctx, input_pad, transformed_X_channel, pad_value,
&transformed_X);
if (ddX) { if (ddX) {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, transformed_ddX_channel, pad_value, dev_ctx, input_pad, transformed_ddX_channel, pad_value,
&transformed_ddX); &transformed_ddX);
} }
} break; } break;
......
...@@ -184,15 +184,15 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, ...@@ -184,15 +184,15 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
bool is_fix_seed, int seed_val, const Tensor& x, bool is_fix_seed, int seed_val, const Tensor& x,
const Tensor* seed, Tensor* mask, Tensor* y) { const Tensor* seed, Tensor* mask, Tensor* y) {
auto& place = *dev_ctx.eigen_device(); auto& place = *dev_ctx.eigen_device();
int64_t x_numel = x.numel();
auto stream = dev_ctx.stream();
auto* x_data = x.data<T>();
auto* y_data = y->data<T>();
if (!is_test) { if (!is_test) {
int64_t x_numel = x.numel();
auto stream = dev_ctx.stream();
auto* mask_data = mask->data<uint8_t>(); auto* mask_data = mask->data<uint8_t>();
size_t size = phi::product(mask->dims()); size_t size = phi::product(mask->dims());
auto* x_data = x.data<T>();
auto* y_data = y->data<T>();
if (dropout_prob == 1.0f) { if (dropout_prob == 1.0f) {
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
...@@ -254,12 +254,24 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, ...@@ -254,12 +254,24 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
} }
#endif #endif
} else { } else {
auto X = EigenMatrix<T>::Reshape(x, 1);
auto Y = EigenMatrix<T>::Reshape(*y, 1);
if (upscale_in_train) { if (upscale_in_train) {
                      Y.device(place) = X; // TODO: can y share data with x directly?
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
hipMemcpyAsync(y_data, x_data, sizeof(T) * x_numel,
hipMemcpyDeviceToDevice, stream));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemcpyAsync(y_data, x_data, sizeof(T) * x_numel,
cudaMemcpyDeviceToDevice, stream));
#endif
} else { } else {
Y.device(place) = X * static_cast<T>(1.0f - dropout_prob); T factor = static_cast<T>(1.0f - dropout_prob);
std::vector<const framework::Tensor*> ins = {&x};
std::vector<framework::Tensor*> outs = {y};
auto functor = phi::funcs::ScaleFunctor<T>(factor);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
&outs, functor);
} }
} }
} }
......
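In the inference branch of the dropout driver, the Eigen expressions are replaced by explicit primitives: with upscale_in_train the output is a plain device-to-device copy of the input (cudaMemcpyAsync/hipMemcpyAsync above), otherwise every element is scaled by (1 - dropout_prob) through an elementwise ScaleFunctor launch. A host-side standalone sketch of that branch using std:: containers instead of GPU tensors:

#include <algorithm>
#include <cstdio>
#include <vector>

void DropoutInfer(const std::vector<float>& x, float dropout_prob,
                  bool upscale_in_train, std::vector<float>* y) {
  y->resize(x.size());
  if (upscale_in_train) {
    // output == input: a plain copy (device-to-device memcpy in the diff)
    std::copy(x.begin(), x.end(), y->begin());
  } else {
    // downscale-in-infer: scale every element by (1 - p) (ScaleFunctor in the diff)
    const float factor = 1.0f - dropout_prob;
    std::transform(x.begin(), x.end(), y->begin(),
                   [factor](float v) { return v * factor; });
  }
}

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f}, y;
  DropoutInfer(x, 0.5f, /*upscale_in_train=*/false, &y);
  std::printf("%.1f %.1f %.1f\n", y[0], y[1], y[2]);  // 0.5 1.0 1.5
}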
...@@ -17,8 +17,8 @@ limitations under the License. */ ...@@ -17,8 +17,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/operators/math/padding.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/kernels/funcs/padding.h"
DECLARE_int64(cudnn_exhaustive_search_times); DECLARE_int64(cudnn_exhaustive_search_times);
...@@ -86,7 +86,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -86,7 +86,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
in_data_dims, strides, ksize); in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_input; Tensor transformed_input;
std::vector<int> padding_common(data_dim, 0); std::vector<int> padding_common(data_dim, 0);
...@@ -118,13 +118,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -118,13 +118,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
T pad_value(0.0); T pad_value(0.0);
switch (rank) { switch (rank) {
case 4: { case 4: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 4>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
ctx, input_pad, transformed_input_channel, pad_value, dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input); &transformed_input);
} break; } break;
case 5: { case 5: {
math::PadFunction<paddle::platform::CUDADeviceContext, T, 5>( phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
ctx, input_pad, transformed_input_channel, pad_value, dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input); &transformed_input);
} break; } break;
default: default:
......
...@@ -32,7 +32,7 @@ namespace platform = paddle::platform; ...@@ -32,7 +32,7 @@ namespace platform = paddle::platform;
namespace op = paddle::operators; namespace op = paddle::operators;
using Tensor = paddle::framework::Tensor; using Tensor = paddle::framework::Tensor;
USE_OP(batch_norm); USE_OP_ITSELF(batch_norm);
USE_CUDA_ONLY_OP(fused_bn_add_activation); USE_CUDA_ONLY_OP(fused_bn_add_activation);
USE_CUDA_ONLY_OP(fused_bn_add_activation_grad); USE_CUDA_ONLY_OP(fused_bn_add_activation_grad);
......
...@@ -45,6 +45,8 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> { ...@@ -45,6 +45,8 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
axis = static_cast<int>(cpu_axis.data<int32_t>()[0]); axis = static_cast<int>(cpu_axis.data<int32_t>()[0]);
} else if (axis_type == framework::proto::VarType::INT64) { } else if (axis_type == framework::proto::VarType::INT64) {
axis = static_cast<int>(cpu_axis.data<int64_t>()[0]); axis = static_cast<int>(cpu_axis.data<int64_t>()[0]);
} else if (axis_type == framework::proto::VarType::INT16) {
axis = static_cast<int>(cpu_axis.data<int16_t>()[0]);
} }
} }
const auto &place = ctx.GetPlace(); const auto &place = ctx.GetPlace();
...@@ -57,6 +59,9 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> { ...@@ -57,6 +59,9 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
} else if (index_type == framework::proto::VarType::INT64) { } else if (index_type == framework::proto::VarType::INT64) {
phi::funcs::GatherV2CUDAFunction<T, int64_t>(x, index, axis, output, phi::funcs::GatherV2CUDAFunction<T, int64_t>(x, index, axis, output,
dev_ctx); dev_ctx);
} else if (index_type == framework::proto::VarType::INT16) {
phi::funcs::GatherV2CUDAFunction<T, int16_t>(x, index, axis, output,
dev_ctx);
} }
return; return;
} }
...@@ -67,6 +72,8 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> { ...@@ -67,6 +72,8 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
phi::funcs::GPUGather<T, int>(dev_ctx, *x, *index, output); phi::funcs::GPUGather<T, int>(dev_ctx, *x, *index, output);
} else if (index_type == framework::proto::VarType::INT64) { } else if (index_type == framework::proto::VarType::INT64) {
phi::funcs::GPUGather<T, int64_t>(dev_ctx, *x, *index, output); phi::funcs::GPUGather<T, int64_t>(dev_ctx, *x, *index, output);
} else if (index_type == framework::proto::VarType::INT16) {
phi::funcs::GPUGather<T, int16_t>(dev_ctx, *x, *index, output);
} }
} }
}; };
...@@ -134,6 +141,7 @@ REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>, ...@@ -134,6 +141,7 @@ REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>,
ops::GatherOpCUDAKernel<double>, ops::GatherOpCUDAKernel<double>,
ops::GatherOpCUDAKernel<int64_t>, ops::GatherOpCUDAKernel<int64_t>,
ops::GatherOpCUDAKernel<int>, ops::GatherOpCUDAKernel<int>,
ops::GatherOpCUDAKernel<int16_t>,
ops::GatherOpCUDAKernel<plat::float16>, ops::GatherOpCUDAKernel<plat::float16>,
ops::GatherOpCUDAKernel<plat::bfloat16>); ops::GatherOpCUDAKernel<plat::bfloat16>);
REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>, REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>,
......
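The gather changes above add int16 support in two places: the axis/index dtype dispatch gains an INT16 branch, and the op is registered for int16_t data. A standalone sketch of the underlying idea — the gather itself is templated on the index type, so a new index dtype is just one more instantiation plus a dispatch branch:

#include <cstdint>
#include <iostream>
#include <vector>

template <typename T, typename IndexT>
std::vector<T> Gather(const std::vector<T>& x, const std::vector<IndexT>& index) {
  std::vector<T> out;
  out.reserve(index.size());
  for (IndexT i : index) out.push_back(x[static_cast<size_t>(i)]);  // pick rows by index
  return out;
}

int main() {
  std::vector<float> x = {10.f, 20.f, 30.f, 40.f};
  std::vector<int16_t> idx16 = {3, 0, 2};   // int16 indices are now accepted
  std::vector<int64_t> idx64 = {1, 1};
  for (float v : Gather(x, idx16)) std::cout << v << ' ';
  std::cout << '\n';
  for (float v : Gather(x, idx64)) std::cout << v << ' ';
  std::cout << '\n';
}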
...@@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/binary.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -21,20 +24,6 @@ class GatherTreeOp : public framework::OperatorWithKernel { ...@@ -21,20 +24,6 @@ class GatherTreeOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Ids"), "Input", "Ids", "GatherTree");
OP_INOUT_CHECK(ctx->HasInput("Parents"), "Input", "Parents", "GatherTree");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GatherTree");
auto ids_dims = ctx->GetInputDim("Ids");
auto parents_dims = ctx->GetInputDim("Parents");
PADDLE_ENFORCE_EQ(ids_dims == parents_dims, true,
platform::errors::InvalidArgument(
"The shape of Input(Parents) must be same with the "
"shape of Input(Ids)."));
ctx->SetOutputDim("Out", ids_dims);
}
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
...@@ -72,4 +61,8 @@ selected ids. ...@@ -72,4 +61,8 @@ selected ids.
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(gather_tree, ops::GatherTreeOp, ops::GatherTreeOpMaker); DELCARE_INFER_SHAPE_FUNCTOR(gather_tree, GatherTreeInferShapeFunctor,
PT_INFER_META(phi::GatherTreeMeta));
REGISTER_OPERATOR(gather_tree, ops::GatherTreeOp, ops::GatherTreeOpMaker,
GatherTreeInferShapeFunctor);
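The hand-written InferShape above is deleted in favour of an infer-shape functor declared from phi::GatherTreeMeta, so the same shape rule serves both the static-graph path and phi. The rule the meta function has to express is small: Ids and Parents must have identical shapes, and Out inherits Ids' shape. A standalone sketch with a simplified MetaTensor stand-in:

#include <iostream>
#include <stdexcept>
#include <vector>

struct MetaTensor { std::vector<int64_t> dims; };  // simplified stand-in

void GatherTreeMetaSketch(const MetaTensor& ids, const MetaTensor& parents, MetaTensor* out) {
  if (ids.dims != parents.dims)
    throw std::invalid_argument(
        "The shape of Input(Parents) must be same with the shape of Input(Ids).");
  out->dims = ids.dims;  // Out takes Ids' shape
}

int main() {
  MetaTensor ids{{10, 4, 2}}, parents{{10, 4, 2}}, out;
  GatherTreeMetaSketch(ids, parents, &out);
  std::cout << out.dims.size() << " dims\n";
}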
...@@ -26,27 +26,6 @@ namespace paddle { ...@@ -26,27 +26,6 @@ namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
template <typename T>
class CPUGaussianRandomKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
float mean = context.Attr<float>("mean");
float std = context.Attr<float>("std");
auto* tensor = context.Output<framework::Tensor>("Out");
std::normal_distribution<T> dist(mean, std);
auto shape = GetShape(context);
tensor->Resize(shape);
int64_t size = tensor->numel();
T* data = tensor->mutable_data<T>(context.GetPlace());
unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
auto engine = framework::GetCPURandomEngine(seed);
for (int64_t i = 0; i < size; ++i) {
data[i] = dist(*engine);
}
}
}; // namespace operators
template <typename T> template <typename T>
class CPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> { class CPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
...@@ -194,8 +173,6 @@ Used to initialize tensors with gaussian random generator. ...@@ -194,8 +173,6 @@ Used to initialize tensors with gaussian random generator.
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp, REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp,
ops::GaussianRandomOpMaker); ops::GaussianRandomOpMaker);
REGISTER_OP_CPU_KERNEL(gaussian_random, ops::CPUGaussianRandomKernel<float>,
ops::CPUGaussianRandomKernel<double>);
REGISTER_OP_CPU_KERNEL(gaussian_random_batch_size_like, REGISTER_OP_CPU_KERNEL(gaussian_random_batch_size_like,
ops::CPUGaussianRandomBatchSizeLikeKernel<float>, ops::CPUGaussianRandomBatchSizeLikeKernel<float>,
ops::CPUGaussianRandomBatchSizeLikeKernel<double>); ops::CPUGaussianRandomBatchSizeLikeKernel<double>);
......
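The deleted CPUGaussianRandomKernel is a plain seeded normal fill (mean/std from attributes, engine from the seed); that logic now lives in the phi gaussian_random kernel. For reference, a standalone version of the same fill loop using the standard-library engine (the real code uses framework::GetCPURandomEngine so the global seed is honoured):

#include <cstdint>
#include <cstdio>
#include <random>
#include <vector>

std::vector<float> GaussianFill(int64_t size, float mean, float std_dev, unsigned seed) {
  std::mt19937_64 engine(seed);                     // stand-in for the framework engine
  std::normal_distribution<float> dist(mean, std_dev);
  std::vector<float> data(static_cast<size_t>(size));
  for (auto& v : data) v = dist(engine);            // element-wise draw, as in the deleted loop
  return data;
}

int main() {
  auto t = GaussianFill(4, 0.f, 1.f, /*seed=*/42);
  for (float v : t) std::printf("%f ", v);
  std::printf("\n");
}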
...@@ -52,53 +52,6 @@ struct GaussianGenerator { ...@@ -52,53 +52,6 @@ struct GaussianGenerator {
} }
}; };
template <typename T>
class GPUGaussianRandomKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* tensor = context.Output<framework::Tensor>("Out");
unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
bool seed_flag = false;
if (seed == 0) {
std::random_device rd;
seed = rd();
seed_flag = true;
}
T mean = static_cast<T>(context.Attr<float>("mean"));
T std = static_cast<T>(context.Attr<float>("std"));
auto shape = GetShape(context);
tensor->Resize(shape);
auto& dev_cxt =
context.template device_context<platform::CUDADeviceContext>();
T* data = tensor->mutable_data<T>(dev_cxt.GetPlace());
int64_t size = tensor->numel();
int device_id = context.GetPlace().GetDeviceId();
auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
if (gen_cuda->GetIsInitPy() && seed_flag) {
if (FLAGS_use_curand) {
using MT = typename details::MPTypeTrait<T>::Type;
distribution::normal_distribution<MT> dist;
distribution::normal_transform<MT> trans(mean, std);
distribution::distribution_and_transform<T>(dev_cxt, tensor, dist,
trans);
} else {
auto seed_offset = gen_cuda->IncrementOffset(1);
int64_t gen_offset = size * seed_offset.second;
auto func =
GaussianGenerator<T>(mean, std, seed_offset.first, gen_offset);
IndexKernel<T, GaussianGenerator<T>>(dev_cxt, tensor, func);
}
} else {
auto func = GaussianGenerator<T>(mean, std, seed);
IndexKernel<T, GaussianGenerator<T>>(dev_cxt, tensor, func);
}
}
};
template <typename T> template <typename T>
class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> { class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
public: public:
...@@ -136,11 +89,6 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> { ...@@ -136,11 +89,6 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OP_CUDA_KERNEL(
gaussian_random,
paddle::operators::GPUGaussianRandomKernel<paddle::platform::float16>,
paddle::operators::GPUGaussianRandomKernel<float>,
paddle::operators::GPUGaussianRandomKernel<double>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
gaussian_random_batch_size_like, gaussian_random_batch_size_like,
paddle::operators::GPUGaussianRandomBatchSizeLikeKernel< paddle::operators::GPUGaussianRandomBatchSizeLikeKernel<
......
...@@ -17,6 +17,8 @@ ...@@ -17,6 +17,8 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/phi/kernels/batch_norm_grad_kernel.h"
#include "paddle/phi/kernels/batch_norm_kernel.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -202,8 +204,7 @@ class InplaceABNOpGradMaker : public framework::SingleGradOpMaker<T> { ...@@ -202,8 +204,7 @@ class InplaceABNOpGradMaker : public framework::SingleGradOpMaker<T> {
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class InplaceABNKernel class InplaceABNKernel : public framework::OpKernel<T> {
: public paddle::operators::BatchNormKernel<DeviceContext, T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<Tensor>("X"); auto* x = ctx.Input<Tensor>("X");
...@@ -213,7 +214,33 @@ class InplaceABNKernel ...@@ -213,7 +214,33 @@ class InplaceABNKernel
auto activation = auto activation =
GetInplaceABNActivationType(ctx.Attr<std::string>("activation")); GetInplaceABNActivationType(ctx.Attr<std::string>("activation"));
auto& place = *ctx.template device_context<DeviceContext>().eigen_device(); auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
BatchNormKernel<DeviceContext, T>::Compute(ctx);
auto* scale = ctx.Input<Tensor>("Scale");
auto* bias = ctx.Input<Tensor>("Bias");
auto* mean = ctx.Input<Tensor>("Mean");
auto* variance = ctx.Input<Tensor>("Variance");
auto momentum = ctx.Attr<float>("momentum");
auto epsilon = ctx.Attr<float>("epsilon");
auto data_layout = ctx.Attr<std::string>("data_layout");
auto is_test = ctx.Attr<bool>("is_test");
auto use_global_stats = ctx.Attr<bool>("use_global_stats");
auto trainable_statistics = ctx.Attr<bool>("trainable_statistics");
auto fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
auto* mean_out = ctx.Output<Tensor>("MeanOut");
auto* variance_out = ctx.Output<Tensor>("VarianceOut");
auto* saved_mean = ctx.Output<Tensor>("SavedMean");
auto* saved_variance = ctx.Output<Tensor>("SavedVariance");
auto* reserve_space = ctx.Output<Tensor>("ReserveSpace");
auto& dev_ctx = ctx.device_context<DeviceContext>();
phi::BatchNormKernel<T>(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*x, *scale, *bias, *mean, *variance, momentum, epsilon, data_layout,
is_test, use_global_stats, trainable_statistics, fuse_with_relu, y,
mean_out, variance_out, saved_mean, saved_variance, reserve_space);
auto cur_y = EigenVector<T>::Flatten(*y); auto cur_y = EigenVector<T>::Flatten(*y);
InplaceABNActivation<DeviceContext, T> functor; InplaceABNActivation<DeviceContext, T> functor;
...@@ -222,8 +249,7 @@ class InplaceABNKernel ...@@ -222,8 +249,7 @@ class InplaceABNKernel
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class InplaceABNGradKernel class InplaceABNGradKernel : public framework::OpKernel<T> {
: public paddle::operators::BatchNormGradKernel<DeviceContext, T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto* y = ctx.Input<Tensor>("Y"); auto* y = ctx.Input<Tensor>("Y");
...@@ -244,7 +270,52 @@ class InplaceABNGradKernel ...@@ -244,7 +270,52 @@ class InplaceABNGradKernel
InplaceABNActivation<DeviceContext, T> functor; InplaceABNActivation<DeviceContext, T> functor;
functor.GradCompute(ctx, activation, place, cur_y, cur_y, cur_dy, cur_dy); functor.GradCompute(ctx, activation, place, cur_y, cur_y, cur_dy, cur_dy);
BatchNormGradKernel<DeviceContext, T>::Compute(ctx); // BatchNormGradKernel<DeviceContext, T>::Compute(ctx);
auto* scale = ctx.Input<Tensor>("Scale");
auto* bias = ctx.Input<Tensor>("Bias");
auto* saved_mean = ctx.Input<Tensor>("SavedMean");
auto* saved_variance = ctx.Input<Tensor>("SavedVariance");
auto momentum = ctx.Attr<float>("momentum");
auto epsilon = ctx.Attr<float>("epsilon");
auto data_layout = ctx.Attr<std::string>("data_layout");
auto is_test = ctx.Attr<bool>("is_test");
auto use_global_stats = ctx.Attr<bool>("use_global_stats");
auto trainable_statistics = ctx.Attr<bool>("trainable_statistics");
auto fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
auto* scale_grad = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto* bias_grad = ctx.Output<Tensor>(framework::GradVarName("Bias"));
auto* reserve_space = ctx.Input<Tensor>("ReserveSpace");
auto* mean = ctx.Input<Tensor>("ReserveSpace");
auto* variance = ctx.Input<Tensor>("ReserveSpace");
paddle::optional<const Tensor&> space_opt = paddle::none;
paddle::optional<const Tensor&> mean_opt = paddle::none;
paddle::optional<const Tensor&> variance_opt = paddle::none;
if (reserve_space != nullptr) {
space_opt = *reserve_space;
}
if (mean != nullptr) {
mean_opt = *mean;
}
if (variance != nullptr) {
variance_opt = *variance;
}
auto& dev_ctx = ctx.device_context<DeviceContext>();
phi::BatchNormGradRawKernel<T>(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*d_y, *y, *scale, *bias, *saved_mean, *saved_variance, space_opt,
mean_opt, variance_opt, momentum, epsilon, data_layout, is_test,
use_global_stats, trainable_statistics, fuse_with_relu, true, d_x,
scale_grad, bias_grad);
} }
}; };
......
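InplaceABNKernel stops inheriting from BatchNormKernel and instead unpacks the attributes itself and calls phi::BatchNormKernel as a free function, bridging the two worlds with the ConvertToPhiContext cast shown above. A standalone analogue of that composition-over-inheritance move (invented minimal types; the placeholder math stands in for the real batch-norm computation):

#include <iostream>

struct CPUContext {};
struct Tensor { float value = 0.f; };

// the "phi-style" kernel: a free function taking the device context and explicit args
void BatchNormSketch(const CPUContext& /*ctx*/, const Tensor& x, float epsilon, Tensor* y) {
  y->value = x.value / (1.f + epsilon);   // placeholder math
}

// an op that reuses it by calling, not inheriting, then applies its own activation
void InplaceABNSketch(const CPUContext& ctx, const Tensor& x, float epsilon, Tensor* y) {
  BatchNormSketch(ctx, x, epsilon, y);
  y->value = y->value > 0 ? y->value : 0;  // e.g. ReLU applied in place on top
}

int main() {
  CPUContext ctx;
  Tensor x{2.f}, y;
  InplaceABNSketch(ctx, x, 1e-5f, &y);
  std::cout << y.value << "\n";
}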
...@@ -15,14 +15,15 @@ limitations under the License. */ ...@@ -15,14 +15,15 @@ limitations under the License. */
#include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/operators/inplace_abn_op.h" #include "paddle/fluid/operators/inplace_abn_op.h"
#include "paddle/fluid/operators/sync_batch_norm_op.cu.h" #include "paddle/fluid/operators/sync_batch_norm_op.cu.h"
#include "paddle/phi/kernels/batch_norm_grad_kernel.h"
#include "paddle/phi/kernels/batch_norm_kernel.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class InplaceABNKernel class InplaceABNKernel
: public paddle::operators::SyncBatchNormKernel<DeviceContext, T>, : public paddle::operators::SyncBatchNormKernel<DeviceContext, T> {
public paddle::operators::BatchNormKernel<DeviceContext, T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto* y = ctx.Output<Tensor>("Y"); auto* y = ctx.Output<Tensor>("Y");
...@@ -36,7 +37,33 @@ class InplaceABNKernel ...@@ -36,7 +37,33 @@ class InplaceABNKernel
if (ctx.Attr<bool>("use_sync_bn")) { if (ctx.Attr<bool>("use_sync_bn")) {
SyncBatchNormKernel<DeviceContext, T>::Compute(ctx); SyncBatchNormKernel<DeviceContext, T>::Compute(ctx);
} else { } else {
BatchNormKernel<DeviceContext, T>::Compute(ctx); // BatchNormKernel<DeviceContext, T>::Compute(ctx);
auto* scale = ctx.Input<Tensor>("Scale");
auto* bias = ctx.Input<Tensor>("Bias");
auto* mean = ctx.Input<Tensor>("Mean");
auto* variance = ctx.Input<Tensor>("Variance");
auto momentum = ctx.Attr<float>("momentum");
auto epsilon = ctx.Attr<float>("epsilon");
auto data_layout = ctx.Attr<std::string>("data_layout");
auto is_test = ctx.Attr<bool>("is_test");
auto use_global_stats = ctx.Attr<bool>("use_global_stats");
auto trainable_statistics = ctx.Attr<bool>("trainable_statistics");
auto fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
auto* mean_out = ctx.Output<Tensor>("MeanOut");
auto* variance_out = ctx.Output<Tensor>("VarianceOut");
auto* saved_mean = ctx.Output<Tensor>("SavedMean");
auto* saved_variance = ctx.Output<Tensor>("SavedVariance");
auto* reserve_space = ctx.Output<Tensor>("ReserveSpace");
auto& dev_ctx = ctx.device_context<DeviceContext>();
phi::BatchNormKernel<T>(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*x, *scale, *bias, *mean, *variance, momentum, epsilon, data_layout,
is_test, use_global_stats, trainable_statistics, fuse_with_relu, y,
mean_out, variance_out, saved_mean, saved_variance, reserve_space);
} }
auto cur_y = EigenVector<T>::Flatten(*y); auto cur_y = EigenVector<T>::Flatten(*y);
...@@ -49,8 +76,7 @@ class InplaceABNKernel ...@@ -49,8 +76,7 @@ class InplaceABNKernel
// https://kevinzakka.github.io/2016/09/14/batch_normalization/ // https://kevinzakka.github.io/2016/09/14/batch_normalization/
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class InplaceABNGradKernel class InplaceABNGradKernel
: public paddle::operators::SyncBatchNormGradKernel<DeviceContext, T>, : public paddle::operators::SyncBatchNormGradKernel<DeviceContext, T> {
public paddle::operators::BatchNormGradKernel<DeviceContext, T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const auto* y = ctx.Input<Tensor>("Y"); const auto* y = ctx.Input<Tensor>("Y");
...@@ -74,7 +100,50 @@ class InplaceABNGradKernel ...@@ -74,7 +100,50 @@ class InplaceABNGradKernel
if (ctx.Attr<bool>("use_sync_bn")) { if (ctx.Attr<bool>("use_sync_bn")) {
SyncBatchNormGradKernel<DeviceContext, T>::Compute(ctx); SyncBatchNormGradKernel<DeviceContext, T>::Compute(ctx);
} else { } else {
BatchNormGradKernel<DeviceContext, T>::Compute(ctx); auto* scale = ctx.Input<Tensor>("Scale");
auto* bias = ctx.Input<Tensor>("Bias");
auto* saved_mean = ctx.Input<Tensor>("SavedMean");
auto* saved_variance = ctx.Input<Tensor>("SavedVariance");
auto momentum = ctx.Attr<float>("momentum");
auto epsilon = ctx.Attr<float>("epsilon");
auto data_layout = ctx.Attr<std::string>("data_layout");
auto is_test = ctx.Attr<bool>("is_test");
auto use_global_stats = ctx.Attr<bool>("use_global_stats");
auto trainable_statistics = ctx.Attr<bool>("trainable_statistics");
auto fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
auto* scale_grad = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto* bias_grad = ctx.Output<Tensor>(framework::GradVarName("Bias"));
auto* reserve_space = ctx.Input<Tensor>("ReserveSpace");
auto* mean = ctx.Input<Tensor>("ReserveSpace");
auto* variance = ctx.Input<Tensor>("ReserveSpace");
paddle::optional<const Tensor&> space_opt = paddle::none;
paddle::optional<const Tensor&> mean_opt = paddle::none;
paddle::optional<const Tensor&> variance_opt = paddle::none;
if (reserve_space != nullptr) {
space_opt = *reserve_space;
}
if (mean != nullptr) {
mean_opt = *mean;
}
if (variance != nullptr) {
variance_opt = *variance;
}
auto& dev_ctx = ctx.device_context<DeviceContext>();
phi::BatchNormGradRawKernel<T>(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*d_y, *y, *scale, *bias, *saved_mean, *saved_variance, space_opt,
mean_opt, variance_opt, momentum, epsilon, data_layout, is_test,
use_global_stats, trainable_statistics, fuse_with_relu, true, d_x,
scale_grad, bias_grad);
} }
} }
}; };
......
...@@ -389,11 +389,12 @@ __global__ void DoubleGradComputeDDYWithGlobal( ...@@ -389,11 +389,12 @@ __global__ void DoubleGradComputeDDYWithGlobal(
} }
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, void NormDoubleGradFunctor(const DeviceContext &ctx,
const DataLayout data_layout, const Tensor *X, const DataLayout data_layout, const Tensor *X,
const Tensor *Scale, const Tensor *dY, const Tensor *Scale, const Tensor *dY,
const Tensor *Saved_mean, const Tensor *Saved_mean,
const Tensor *Saved_variance, const double epsilon, const Tensor *Saved_variance, const Tensor *Mean,
const Tensor *Variance, const double epsilon,
const bool use_global_stats, const Tensor *ddX, const bool use_global_stats, const Tensor *ddX,
const Tensor *ddScale, const Tensor *ddBias, const Tensor *ddScale, const Tensor *ddBias,
Tensor *dX, Tensor *dScale, Tensor *ddY) { Tensor *dX, Tensor *dScale, Tensor *ddY) {
...@@ -404,8 +405,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, ...@@ -404,8 +405,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data<T>()); const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data<T>());
const T *ddbias_data = (ddBias == nullptr ? nullptr : ddBias->data<T>()); const T *ddbias_data = (ddBias == nullptr ? nullptr : ddBias->data<T>());
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); phi::funcs::SetConstant<DeviceContext, T> set_constant;
phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_constant;
auto &x_dims = X->dims(); auto &x_dims = X->dims();
const int C = (data_layout == DataLayout::kNCHW ? x_dims[1] const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
...@@ -416,7 +416,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, ...@@ -416,7 +416,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
Tensor scale_tmp; Tensor scale_tmp;
if (!Scale) { if (!Scale) {
scale_tmp.mutable_data<T>({C}, ctx.GetPlace()); scale_tmp.mutable_data<T>({C}, ctx.GetPlace());
set_constant(dev_ctx, &scale_tmp, static_cast<T>(1)); set_constant(ctx, &scale_tmp, static_cast<T>(1));
} }
const T *scale_data = Scale ? Scale->data<T>() : scale_tmp.data<T>(); const T *scale_data = Scale ? Scale->data<T>() : scale_tmp.data<T>();
#ifdef __HIPCC__ #ifdef __HIPCC__
...@@ -424,15 +424,15 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, ...@@ -424,15 +424,15 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
#else #else
const int block = 512; const int block = 512;
#endif #endif
int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); int max_threads = ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1); const int max_blocks = std::max(max_threads / block, 1);
int grid = std::min(C, max_blocks); int grid = std::min(C, max_blocks);
int grid1 = (num + block - 1) / block; int grid1 = (num + block - 1) / block;
const T *mean_data, *variance_data; const T *mean_data, *variance_data;
if (use_global_stats) { if (use_global_stats) {
const auto *running_mean = ctx.Input<Tensor>("Mean"); const auto *running_mean = Mean;
const auto *running_var = ctx.Input<Tensor>("Variance"); const auto *running_var = Variance;
const auto *running_mean_data = running_mean->template data<T>(); const auto *running_mean_data = running_mean->template data<T>();
const auto *running_var_data = running_var->template data<T>(); const auto *running_var_data = running_var->template data<T>();
mean_data = running_mean_data; mean_data = running_mean_data;
...@@ -440,34 +440,35 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, ...@@ -440,34 +440,35 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
} else { } else {
const T *smean_data = Saved_mean->data<T>(); const T *smean_data = Saved_mean->data<T>();
const T *svariance_data = Saved_variance->data<T>(); const T *svariance_data = Saved_variance->data<T>();
mean_data = smean_data; mean_data = smean_data;
variance_data = svariance_data; variance_data = svariance_data;
} }
if (dX) { if (dX) {
T *dx_data = dX->mutable_data<T>(ctx.GetPlace()); T *dx_data = dX->mutable_data<T>(ctx.GetPlace());
set_constant(dev_ctx, dX, static_cast<T>(0)); set_constant(ctx, dX, static_cast<T>(0));
if (use_global_stats) { if (use_global_stats) {
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
DoubleGradComputeDXWithGlobal< DoubleGradComputeDXWithGlobal<
T, DataLayout::kNHWC><<<grid1, block, 0, dev_ctx.stream()>>>( T, DataLayout::kNHWC><<<grid1, block, 0, ctx.stream()>>>(
dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num,
dx_data); dx_data);
} else { } else {
DoubleGradComputeDXWithGlobal< DoubleGradComputeDXWithGlobal<
T, DataLayout::kNCHW><<<grid1, block, 0, dev_ctx.stream()>>>( T, DataLayout::kNCHW><<<grid1, block, 0, ctx.stream()>>>(
dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num,
dx_data); dx_data);
} }
} else { } else {
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
DoubleGradComputeDX< DoubleGradComputeDX<
T, block, DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>( T, block, DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, x_data, mean_data, variance_data, ddx_data, dy_data, scale_data,
ddscale_data, N, C, sample_size, epsilon, dx_data); ddscale_data, N, C, sample_size, epsilon, dx_data);
} else { } else {
DoubleGradComputeDX< DoubleGradComputeDX<
T, block, DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>( T, block, DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, x_data, mean_data, variance_data, ddx_data, dy_data, scale_data,
ddscale_data, N, C, sample_size, epsilon, dx_data); ddscale_data, N, C, sample_size, epsilon, dx_data);
} }
...@@ -475,28 +476,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, ...@@ -475,28 +476,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
} }
if (dScale) { if (dScale) {
T *dscale_data = dScale->mutable_data<T>(ctx.GetPlace()); T *dscale_data = dScale->mutable_data<T>(ctx.GetPlace());
set_constant(dev_ctx, dScale, static_cast<T>(0)); set_constant(ctx, dScale, static_cast<T>(0));
if (use_global_stats) { if (use_global_stats) {
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
DoubleGradComputeDScaleWithGlobal< DoubleGradComputeDScaleWithGlobal<
T, block, DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>( T, block, DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
ddx_data, variance_data, dy_data, epsilon, N, C, sample_size, ddx_data, variance_data, dy_data, epsilon, N, C, sample_size,
dscale_data); dscale_data);
} else { } else {
DoubleGradComputeDScaleWithGlobal< DoubleGradComputeDScaleWithGlobal<
T, block, DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>( T, block, DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
ddx_data, variance_data, dy_data, epsilon, N, C, sample_size, ddx_data, variance_data, dy_data, epsilon, N, C, sample_size,
dscale_data); dscale_data);
} }
} else { } else {
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
DoubleGradComputeDScale< DoubleGradComputeDScale<
T, block, DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>( T, block, DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
x_data, mean_data, variance_data, ddx_data, dy_data, N, C, x_data, mean_data, variance_data, ddx_data, dy_data, N, C,
sample_size, epsilon, dscale_data); sample_size, epsilon, dscale_data);
} else { } else {
DoubleGradComputeDScale< DoubleGradComputeDScale<
T, block, DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>( T, block, DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
x_data, mean_data, variance_data, ddx_data, dy_data, N, C, x_data, mean_data, variance_data, ddx_data, dy_data, N, C,
sample_size, epsilon, dscale_data); sample_size, epsilon, dscale_data);
} }
...@@ -504,28 +505,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, ...@@ -504,28 +505,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
} }
if (ddY) { if (ddY) {
T *ddy_data = ddY->mutable_data<T>(ctx.GetPlace()); T *ddy_data = ddY->mutable_data<T>(ctx.GetPlace());
set_constant(dev_ctx, ddY, static_cast<T>(0)); set_constant(ctx, ddY, static_cast<T>(0));
if (use_global_stats) { if (use_global_stats) {
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
DoubleGradComputeDDYWithGlobal< DoubleGradComputeDDYWithGlobal<
T, DataLayout::kNHWC><<<grid1, block, 0, dev_ctx.stream()>>>( T, DataLayout::kNHWC><<<grid1, block, 0, ctx.stream()>>>(
ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data, ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data,
ddscale_data, epsilon, C, sample_size, num, ddy_data); ddscale_data, epsilon, C, sample_size, num, ddy_data);
} else { } else {
DoubleGradComputeDDYWithGlobal< DoubleGradComputeDDYWithGlobal<
T, DataLayout::kNCHW><<<grid1, block, 0, dev_ctx.stream()>>>( T, DataLayout::kNCHW><<<grid1, block, 0, ctx.stream()>>>(
ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data, ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data,
ddscale_data, epsilon, C, sample_size, num, ddy_data); ddscale_data, epsilon, C, sample_size, num, ddy_data);
} }
} else { } else {
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
DoubleGradComputeDDY< DoubleGradComputeDDY<
T, block, DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>( T, block, DataLayout::kNHWC><<<grid, block, 0, ctx.stream()>>>(
x_data, mean_data, variance_data, ddscale_data, ddbias_data, x_data, mean_data, variance_data, ddscale_data, ddbias_data,
ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data); ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data);
} else { } else {
DoubleGradComputeDDY< DoubleGradComputeDDY<
T, block, DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>( T, block, DataLayout::kNCHW><<<grid, block, 0, ctx.stream()>>>(
x_data, mean_data, variance_data, ddscale_data, ddbias_data, x_data, mean_data, variance_data, ddscale_data, ddbias_data,
ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data); ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data);
} }
......
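NormDoubleGradFunctor is re-signed to take the device context as a template parameter and the running Mean/Variance tensors as arguments instead of reading them through ctx.Input<Tensor>(...), which is what allows a phi kernel to call it. The statistics selection it performs boils down to the small rule below (standalone sketch with plain pointers in place of Tensors):

#include <array>
#include <cstdio>

// use_global_stats selects the running statistics; otherwise the per-batch
// saved statistics are used, exactly as in the branch above.
template <typename T>
const T* PickStats(bool use_global_stats, const T* running, const T* saved) {
  return use_global_stats ? running : saved;
}

int main() {
  std::array<float, 2> running{0.0f, 1.0f}, saved{0.1f, 0.9f};
  std::printf("%.1f\n", PickStats(true,  running.data(), saved.data())[1]);   // 1.0
  std::printf("%.1f\n", PickStats(false, running.data(), saved.data())[1]);   // 0.9
}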
...@@ -20,7 +20,7 @@ limitations under the License. */ ...@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/padding.h" #include "paddle/phi/kernels/funcs/padding.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -50,8 +50,9 @@ class PadConstantLikeKernel : public framework::OpKernel<T> { ...@@ -50,8 +50,9 @@ class PadConstantLikeKernel : public framework::OpKernel<T> {
pads[j * 2 + 1] = static_cast<int>(in_x->dims()[j] - in_y->dims()[j]); pads[j * 2 + 1] = static_cast<int>(in_x->dims()[j] - in_y->dims()[j]);
} }
math::PaddingFunctor<DeviceContext, T>(rank, context, pads, pad_value, phi::funcs::PaddingFunctor<DeviceContext, T>(
*in_y, out); rank, context.template device_context<DeviceContext>(), pads, pad_value,
*in_y, out);
} }
}; };
...@@ -82,8 +83,9 @@ class PadConstantLikeGradKernel : public framework::OpKernel<T> { ...@@ -82,8 +83,9 @@ class PadConstantLikeGradKernel : public framework::OpKernel<T> {
pads[j * 2 + 1] = static_cast<int>(in_dout->dims()[j] - in_y->dims()[j]); pads[j * 2 + 1] = static_cast<int>(in_dout->dims()[j] - in_y->dims()[j]);
} }
math::PaddingGradFunctor<DeviceContext, T>(rank, context, pads, *in_dout, phi::funcs::PaddingGradFunctor<DeviceContext, T>(
d_y); rank, context.template device_context<DeviceContext>(), pads, *in_dout,
d_y);
} }
}; };
......
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/pad_op.h"
#include <memory> #include <memory>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/complex.h"
namespace paddle { namespace paddle {
...@@ -167,40 +167,3 @@ REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker, ...@@ -167,40 +167,3 @@ REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker,
REGISTER_OPERATOR(pad_grad, ops::PadOpGrad, REGISTER_OPERATOR(pad_grad, ops::PadOpGrad,
ops::PadOpDoubleGradMaker<paddle::framework::OpDesc>, ops::PadOpDoubleGradMaker<paddle::framework::OpDesc>,
ops::PadOpDoubleGradMaker<paddle::imperative::OpBase>); ops::PadOpDoubleGradMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
pad, ops::PadKernel<paddle::platform::CPUDeviceContext, float>,
ops::PadKernel<paddle::platform::CPUDeviceContext, double>,
ops::PadKernel<paddle::platform::CPUDeviceContext, int>,
ops::PadKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::PadKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::PadKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CPU_KERNEL(
pad_grad, ops::PadGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::PadGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::PadGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::PadGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CUDA_KERNEL(
pad, ops::PadKernel<paddle::platform::CUDADeviceContext, double>,
ops::PadKernel<paddle::platform::CUDADeviceContext, float>,
ops::PadKernel<paddle::platform::CUDADeviceContext, int>,
ops::PadKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::PadKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::PadKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::PadKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CUDA_KERNEL(
pad_grad, ops::PadGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::PadGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::PadGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::PadGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::PadGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/padding.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class PadKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto pads = context.Attr<std::vector<int>>("paddings");
float pad_value = context.Attr<float>("pad_value");
auto* x = context.Input<Tensor>("X");
auto* out = context.Output<Tensor>("Out");
out->mutable_data<T>(context.GetPlace());
int rank = x->dims().size();
math::PaddingFunctor<DeviceContext, T>(rank, context, pads,
static_cast<T>(pad_value), *x, out);
}
};
template <typename DeviceContext, typename T>
class PadGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto pads = context.Attr<std::vector<int>>("paddings");
auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
if (d_x == nullptr) {
return;
}
d_x->mutable_data<T>(context.GetPlace());
int rank = d_out->dims().size();
math::PaddingGradFunctor<DeviceContext, T>(rank, context, pads, *d_out,
d_x);
}
};
} // namespace operators
} // namespace paddle
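The deleted pad_op.h kernels are thin wrappers: read the paddings attribute and the pad value, then delegate to the padding functor; the phi pad kernels keep that structure. The padding semantics themselves — paddings stored as (before, after) pairs per dimension — look like this standalone 1-D sketch:

#include <algorithm>
#include <cstdio>
#include <vector>

// paddings come in (before, after) pairs per dimension, as in the attribute read
// by the deleted kernel; shown here for the 1-D case only.
std::vector<float> Pad1D(const std::vector<float>& x, int before, int after, float pad_value) {
  std::vector<float> out(before + x.size() + after, pad_value);
  std::copy(x.begin(), x.end(), out.begin() + before);
  return out;
}

int main() {
  auto out = Pad1D({1.f, 2.f}, /*before=*/1, /*after=*/2, /*pad_value=*/0.f);
  for (float v : out) std::printf("%.0f ", v);  // 0 1 2 0 0
  std::printf("\n");
}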
...@@ -20,9 +20,11 @@ namespace cub = hipcub; ...@@ -20,9 +20,11 @@ namespace cub = hipcub;
#endif #endif
#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/math.h" #include "paddle/fluid/operators/math.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/core/hostdevice.h" #include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -42,71 +44,86 @@ static inline int NumBlocks(const int N) { ...@@ -42,71 +44,86 @@ static inline int NumBlocks(const int N) {
} }
template <typename T> template <typename T>
__global__ void GPUSigmoidForward(const T *x_data, const T *label_data, struct NonzeroFunctor {
const int ignore_index, const int limit, HOSTDEVICE explicit inline NonzeroFunctor() {}
T *out_data, T *counts) { HOSTDEVICE inline T operator()(const T x) const {
CUDA_KERNEL_LOOP(i, limit) { return static_cast<T>(static_cast<double>(x) != 0);
T x = x_data[i]; }
T label = label_data[i]; };
T eps = static_cast<T>(1e-5);
T diff = label - static_cast<T>(ignore_index); template <typename T>
struct SigmoidFwdFunctor {
T ignore_index_;
T eps = static_cast<T>(1e-5);
HOSTDEVICE inline SigmoidFwdFunctor(const T ignore_index)
: ignore_index_(ignore_index) {}
HOSTDEVICE inline phi::Array<T, 2> operator()(const T x, const T label) {
T counts;
T out_data;
T diff = label - static_cast<T>(ignore_index_);
if ((diff > -eps) && (diff < eps)) { if ((diff > -eps) && (diff < eps)) {
out_data[i] = static_cast<T>(0.); out_data = static_cast<T>(0.);
counts[i] = 0; counts = 0;
} else { } else {
T term1 = (x > 0) ? x : 0; T term1 = (x > 0) ? x : 0;
T term2 = x * label; T term2 = x * label;
T term3 = real_log(static_cast<T>(1) + real_exp(static_cast<T>(-abs(x)))); T term3 = real_log(static_cast<T>(1) + real_exp(static_cast<T>(-abs(x))));
out_data[i] = term1 - term2 + term3;
counts[i] = 1; out_data = term1 - term2 + term3;
counts = 1;
} }
} phi::Array<T, 2> outs;
}
template <typename T, int BlockDim> outs[0] = out_data;
__global__ void Sum(const T *counts, int num, const T eps, T *sum) { outs[1] = counts;
typedef cub::BlockReduce<double, BlockDim> BlockReduce; return outs;
__shared__ typename BlockReduce::TempStorage temp_storage;
T in = 0;
for (int i = threadIdx.x; i < num; i += BlockDim) {
in += counts[i];
} }
__syncthreads(); };
auto out =
BlockReduce(temp_storage).Reduce(static_cast<double>(in), cub::Sum());
__syncthreads();
if (threadIdx.x == 0) {
T a = out > eps ? out : eps;
sum[0] = a;
}
}
template <typename T> template <typename T>
__global__ void Div(T *loss, const int num, const T *norm) { struct SigmoidBwdFunctor {
CUDA_KERNEL_LOOP(i, num) { loss[i] /= norm[0]; } T ignore_index_;
} T eps = static_cast<T>(1e-5);
template <typename T> HOSTDEVICE inline SigmoidBwdFunctor(const T ignore_index)
__global__ void GPUSigmoidBackward(const T *x_data, const T *label_data, : ignore_index_(ignore_index) {}
const int ignore_index, const T *dout_data,
const int limit, T *dx_data, T *counts) { HOSTDEVICE inline phi::Array<T, 2> operator()(const T x, const T label,
CUDA_KERNEL_LOOP(i, limit) { const T dout) {
T x = x_data[i]; T counts;
T label = label_data[i]; T dx_data;
T dout = dout_data[i];
T eps = static_cast<T>(1e-5); T diff = label - static_cast<T>(ignore_index_);
T diff = label - static_cast<T>(ignore_index);
if ((diff > -eps) && (diff < eps)) { if ((diff > -eps) && (diff < eps)) {
dx_data[i] = static_cast<T>(0.); dx_data = static_cast<T>(0.);
counts[i] = 0; counts = 0;
} else { } else {
T simoid_x = static_cast<T>(1) / (static_cast<T>(1) + real_exp(-x)); T simoid_x = static_cast<T>(1) / (static_cast<T>(1) + real_exp(-x));
T diff = simoid_x - label; T diff = simoid_x - label;
dx_data[i] = dout * diff; dx_data = dout * diff;
counts[i] = 1; counts = 1;
} }
phi::Array<T, 2> outs;
outs[0] = dx_data;
outs[1] = counts;
return outs;
} }
} };
template <typename T>
struct DivFunctor {
const T norm_;
HOSTDEVICE inline DivFunctor(const T norm) : norm_(norm) {}
HOSTDEVICE inline T operator()(T loss) {
loss /= norm_;
return loss;
}
};
// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) // Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X)))
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
...@@ -123,20 +140,48 @@ class GPUSigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> { ...@@ -123,20 +140,48 @@ class GPUSigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
bool normalize = context.Attr<bool>("normalize"); bool normalize = context.Attr<bool>("normalize");
// Temporary memory // Temporary memory
auto cnt_ptr = memory::Alloc(dev_ctx, Labels->numel() * sizeof(T)); Tensor *counts_tensor = new Tensor();
T *counts = reinterpret_cast<T *>(cnt_ptr->ptr()); counts_tensor->mutable_data<T>(context.GetPlace(),
Labels->numel() * sizeof(T));
counts_tensor->Resize(Out->dims());
int limit = Out->numel(); int limit = Out->numel();
int blocks = NumBlocks(limit); int blocks = NumBlocks(limit);
int threads = kNumCUDAThreads; int threads = kNumCUDAThreads;
GPUSigmoidForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>( std::vector<const framework::Tensor *> ins = {X, Labels};
X->data<T>(), Labels->data<T>(), ignore_index, limit, out_data, counts); std::vector<framework::Tensor *> outs = {Out, counts_tensor};
auto functor = SigmoidFwdFunctor<T>(ignore_index);
constexpr int Size = 2;
phi::funcs::ElementwiseKernel<T, decltype(functor), Size>(dev_ctx, ins,
&outs, functor);
if (normalize) { if (normalize) {
auto norm_ptr = memory::Alloc(dev_ctx, sizeof(T)); T *counts = counts_tensor->mutable_data<T>(context.GetPlace());
T *norm = reinterpret_cast<T *>(norm_ptr->ptr()); Tensor *norm_tensor = new Tensor();
Sum<T, kNumCUDAThreads><<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>( norm_tensor->mutable_data<T>(context.GetPlace(), sizeof(T));
counts, limit, static_cast<T>(1e-5), norm); auto dims = phi::vectorize(counts_tensor->dims());
Div<T><<<blocks, threads, 0, dev_ctx.stream()>>>(out_data, limit, norm); std::vector<int> reduce_dim = {};
for (int i = 0; i < dims.size(); i++) {
reduce_dim.push_back(i);
}
TensorReduceImpl<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
context.cuda_device_context(), *counts_tensor, norm_tensor,
NonzeroFunctor<T>(), reduce_dim, dev_ctx.stream());
T *norm = norm_tensor->mutable_data<T>(context.GetPlace());
auto norm_cpu_mem = memory::Alloc(platform::CPUPlace(), sizeof(T));
T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr());
memory::Copy(platform::CPUPlace(), norm_cpu_ptr, dev_ctx.GetPlace(), norm,
sizeof(T), dev_ctx.stream());
auto eps = static_cast<T>(1e-5);
*norm_cpu_ptr = *norm_cpu_ptr > eps ? *norm_cpu_ptr : eps;
std::vector<const framework::Tensor *> div_ins = {Out};
std::vector<framework::Tensor *> div_outs = {Out};
auto div_functor = DivFunctor<T>(*norm_cpu_ptr);
phi::funcs::ElementwiseKernel<T>(dev_ctx, div_ins, &div_outs,
div_functor);
delete norm_tensor;
delete counts_tensor;
} }
} }
}; };
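For readers of the fused functors above, a small numpy reference of what the forward kernel computes (an illustrative sketch, not part of the change): the numerically stable loss max(x, 0) - x * label + log(1 + exp(-|x|)), with positions whose label equals ignore_index producing zero loss and zero count, and, when normalize is set, the whole loss divided by the eps-clamped number of counted elements, mirroring the reduce-and-divide path on the GPU.

import numpy as np

def sigmoid_bce_with_logits(x, label, ignore_index, normalize=False, eps=1e-5):
    x = np.asarray(x, dtype=np.float64)
    label = np.asarray(label, dtype=np.float64)
    # numerically stable form: max(x, 0) - x * label + log(1 + exp(-|x|))
    loss = np.maximum(x, 0) - x * label + np.log1p(np.exp(-np.abs(x)))
    counts = (np.abs(label - ignore_index) >= eps).astype(np.float64)
    loss = loss * counts                  # ignored positions contribute nothing
    if normalize:
        norm = max(counts.sum(), eps)     # mirrors the eps clamp on the reduced count
        loss = loss / norm
    return loss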
...@@ -157,22 +202,48 @@ class GPUSigmoidCrossEntropyWithLogitsGradKernel ...@@ -157,22 +202,48 @@ class GPUSigmoidCrossEntropyWithLogitsGradKernel
auto &dev_ctx = context.cuda_device_context(); auto &dev_ctx = context.cuda_device_context();
// Temporary memory // Temporary memory
auto cnt_ptr = memory::Alloc(dev_ctx, X->numel() * sizeof(T)); Tensor *counts_tensor = new Tensor();
T *counts = reinterpret_cast<T *>(cnt_ptr->ptr()); counts_tensor->mutable_data<T>(context.GetPlace(),
Labels->numel() * sizeof(T));
counts_tensor->Resize(dX->dims());
int limit = dX->numel(); int limit = dX->numel();
int blocks = NumBlocks(limit); int blocks = NumBlocks(limit);
int threads = kNumCUDAThreads; int threads = kNumCUDAThreads;
GPUSigmoidBackward<T><<<blocks, threads, 0, dev_ctx.stream()>>>( std::vector<const framework::Tensor *> ins = {X, Labels, dOut};
X->data<T>(), Labels->data<T>(), ignore_index, dOut->data<T>(), limit, std::vector<framework::Tensor *> outs = {dX, counts_tensor};
dx_data, counts); auto functor = SigmoidBwdFunctor<T>(ignore_index);
constexpr int Size = 2;
phi::funcs::ElementwiseKernel<T, decltype(functor), Size>(dev_ctx, ins,
&outs, functor);
bool normalize = context.Attr<bool>("normalize"); bool normalize = context.Attr<bool>("normalize");
if (normalize) { if (normalize) {
auto norm_ptr = memory::Alloc(dev_ctx, sizeof(T)); T *counts = counts_tensor->mutable_data<T>(context.GetPlace());
T *norm = reinterpret_cast<T *>(norm_ptr->ptr()); Tensor *norm_tensor = new Tensor();
Sum<T, kNumCUDAThreads><<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>( norm_tensor->mutable_data<T>(context.GetPlace(), sizeof(T));
counts, limit, static_cast<T>(1e-5), norm); auto dims = phi::vectorize(counts_tensor->dims());
Div<T><<<blocks, threads, 0, dev_ctx.stream()>>>(dx_data, limit, norm); std::vector<int> reduce_dim = {};
for (int i = 0; i < dims.size(); i++) {
reduce_dim.push_back(i);
}
TensorReduceImpl<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
context.cuda_device_context(), *counts_tensor, norm_tensor,
NonzeroFunctor<T>(), reduce_dim, dev_ctx.stream());
T *norm = norm_tensor->mutable_data<T>(context.GetPlace());
auto norm_cpu_mem = memory::Alloc(platform::CPUPlace(), sizeof(T));
T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr());
memory::Copy(platform::CPUPlace(), norm_cpu_ptr, dev_ctx.GetPlace(), norm,
sizeof(T), dev_ctx.stream());
auto eps = static_cast<T>(1e-5);
*norm_cpu_ptr = *norm_cpu_ptr > eps ? *norm_cpu_ptr : eps;
std::vector<const framework::Tensor *> div_ins = {dX};
std::vector<framework::Tensor *> div_outs = {dX};
auto div_functor = DivFunctor<T>(*norm_cpu_ptr);
phi::funcs::ElementwiseKernel<T>(dev_ctx, div_ins, &div_outs,
div_functor);
delete norm_tensor;
} }
} }
}; };
......
...@@ -23,9 +23,9 @@ ...@@ -23,9 +23,9 @@
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/conj_op.h" #include "paddle/fluid/operators/conj_op.h"
#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/operators/math/padding.h"
#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/kernels/funcs/padding.h"
#if defined(__NVCC__) || defined(__HIPCC__) #if defined(__NVCC__) || defined(__HIPCC__)
#include "thrust/device_vector.h" #include "thrust/device_vector.h"
#endif #endif
...@@ -389,8 +389,9 @@ class FFTR2CGradKernel : public framework::OpKernel<T> { ...@@ -389,8 +389,9 @@ class FFTR2CGradKernel : public framework::OpKernel<T> {
std::vector<int> pads(rank * 2, 0); std::vector<int> pads(rank * 2, 0);
pads[axes.back() * 2 + 1] = zero_length; pads[axes.back() * 2 + 1] = zero_length;
paddle::operators::math::PaddingFunctor<DeviceContext, C>( phi::funcs::PaddingFunctor<DeviceContext, C>(
rank, ctx, pads, static_cast<C>(0), *dy, &full_dy); rank, ctx.template device_context<DeviceContext>(), pads,
static_cast<C>(0), *dy, &full_dy);
fft_c2c_func(dev_ctx, &full_dy, &complex_dx, axes, normalization, fft_c2c_func(dev_ctx, &full_dy, &complex_dx, axes, normalization,
!forward); !forward);
} }
......
...@@ -23,12 +23,9 @@ namespace paddle { ...@@ -23,12 +23,9 @@ namespace paddle {
namespace platform { namespace platform {
bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type) { bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type) {
auto& ops = get_kl1_ops();
auto v = get_xpu_version(type.place_.device); auto v = get_xpu_version(type.place_.device);
if (v == phi::backends::xpu::XPUVersion::XPU2) { auto& ops = (v == phi::backends::xpu::XPUVersion::XPU1) ? get_kl1_ops()
ops = get_kl2_ops(); : get_kl2_ops();
}
if (ops.find(op_name) != ops.end() && if (ops.find(op_name) != ops.end() &&
ops[op_name].find(type) != ops[op_name].end()) { ops[op_name].find(type) != ops[op_name].end()) {
return true; return true;
...@@ -78,12 +75,9 @@ bool is_in_xpu_black_list(const std::string& op_name) { ...@@ -78,12 +75,9 @@ bool is_in_xpu_black_list(const std::string& op_name) {
#ifdef PADDLE_WITH_XPU_KP #ifdef PADDLE_WITH_XPU_KP
bool is_xpu_kp_support_op(const std::string& op_name, bool is_xpu_kp_support_op(const std::string& op_name,
const pOpKernelType& type) { const pOpKernelType& type) {
auto& ops = get_kl1_ops();
auto v = get_xpu_version(type.place_.device); auto v = get_xpu_version(type.place_.device);
if (v == phi::backends::xpu::XPUVersion::XPU2) { auto& ops = (v == phi::backends::xpu::XPUVersion::XPU1) ? get_kl1_ops()
ops = get_kp_ops(); : get_kp_ops();
}
if (ops.find(op_name) != ops.end() && if (ops.find(op_name) != ops.end() &&
ops[op_name].find(type) != ops[op_name].end()) { ops[op_name].find(type) != ops[op_name].end()) {
return true; return true;
......
...@@ -28,6 +28,7 @@ limitations under the License. */ ...@@ -28,6 +28,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
...@@ -161,6 +162,8 @@ void LoadCustomDevice(const std::string &library_dir) { ...@@ -161,6 +162,8 @@ void LoadCustomDevice(const std::string &library_dir) {
#endif #endif
void InitDevices() { void InitDevices() {
// set name at the entry point of Paddle
platform::SetCurrentThreadName("MainThread");
// CUPTI attribute should be set before any CUDA context is created (see CUPTI // CUPTI attribute should be set before any CUDA context is created (see CUPTI
// documentation about CUpti_ActivityAttribute). // documentation about CUpti_ActivityAttribute).
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
......
...@@ -30,8 +30,7 @@ TEST(ThreadInfo, TestThreadNameUtils) { ...@@ -30,8 +30,7 @@ TEST(ThreadInfo, TestThreadNameUtils) {
using paddle::platform::GetCurrentThreadName; using paddle::platform::GetCurrentThreadName;
using paddle::platform::SetCurrentThreadName; using paddle::platform::SetCurrentThreadName;
using paddle::platform::GetAllThreadNames; using paddle::platform::GetAllThreadNames;
EXPECT_EQ("unset", GetCurrentThreadName()); SetCurrentThreadName("MainThread");
EXPECT_TRUE(SetCurrentThreadName("MainThread"));
EXPECT_FALSE(SetCurrentThreadName("MainThread")); EXPECT_FALSE(SetCurrentThreadName("MainThread"));
auto names = GetAllThreadNames(); auto names = GetAllThreadNames();
EXPECT_TRUE(names.find(GetCurrentThreadStdId()) != names.end()); EXPECT_TRUE(names.find(GetCurrentThreadStdId()) != names.end());
......
...@@ -189,7 +189,10 @@ struct ThreadEventSection { ...@@ -189,7 +189,10 @@ struct ThreadEventSection {
class ThreadEventRecorder { class ThreadEventRecorder {
public: public:
ThreadEventRecorder() { thread_id_ = GetCurrentThreadSysId(); } ThreadEventRecorder() {
thread_id_ = GetCurrentThreadSysId();
thread_name_ = GetCurrentThreadName();
}
DISABLE_COPY_AND_ASSIGN(ThreadEventRecorder); DISABLE_COPY_AND_ASSIGN(ThreadEventRecorder);
...@@ -202,7 +205,7 @@ class ThreadEventRecorder { ...@@ -202,7 +205,7 @@ class ThreadEventRecorder {
ThreadEventSection GatherEvents() { ThreadEventSection GatherEvents() {
ThreadEventSection thr_sec; ThreadEventSection thr_sec;
thr_sec.thread_name = GetCurrentThreadName(); thr_sec.thread_name = thread_name_;
thr_sec.thread_id = thread_id_; thr_sec.thread_id = thread_id_;
thr_sec.events = std::move(base_evt_cntr_.Reduce()); thr_sec.events = std::move(base_evt_cntr_.Reduce());
return thr_sec; return thr_sec;
...@@ -210,6 +213,7 @@ class ThreadEventRecorder { ...@@ -210,6 +213,7 @@ class ThreadEventRecorder {
private: private:
uint64_t thread_id_; uint64_t thread_id_;
std::string thread_name_;
EventContainer<CommonEvent> base_evt_cntr_; EventContainer<CommonEvent> base_evt_cntr_;
}; };
......
...@@ -85,6 +85,9 @@ if(NOT ON_INFER) ...@@ -85,6 +85,9 @@ if(NOT ON_INFER)
if (WITH_NCCL) if (WITH_NCCL)
set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl) set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl)
endif() endif()
if (WITH_GLOO)
set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_gloo)
endif()
set(PYBIND_SRCS ${PYBIND_SRCS} distributed_py.cc) set(PYBIND_SRCS ${PYBIND_SRCS} distributed_py.cc)
endif() endif()
......
...@@ -31,9 +31,15 @@ namespace pybind { ...@@ -31,9 +31,15 @@ namespace pybind {
using TCPStore = paddle::distributed::TCPStore; using TCPStore = paddle::distributed::TCPStore;
void BindTCPStore(py::module* m) { void BindTCPStore(py::module* m) {
py::class_<TCPStore>(*m, "TCPStore") py::class_<TCPStore, std::shared_ptr<TCPStore>>(*m, "TCPStore")
.def( .def(py::init([](std::string hostname, uint16_t port, bool is_master,
py::init<std::string, uint16_t, bool, size_t, std::chrono::seconds>()) size_t world_size, std::chrono::seconds timeout) {
return std::make_shared<TCPStore>(hostname, port, is_master,
world_size, timeout);
}),
py::arg("hostname"), py::arg("port"), py::arg("is_master"),
py::arg("world_size"), py::arg("timeout"),
py::call_guard<py::gil_scoped_release>())
.def("add", &TCPStore::add) .def("add", &TCPStore::add)
.def("get", &TCPStore::get); .def("get", &TCPStore::get);
} }
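The rewritten TCPStore binding above accepts keyword arguments and releases the GIL while the store is constructed. A hypothetical Python-side sketch (the module path, address, and timeout value are assumptions, not taken from this diff):

import datetime
from paddle.fluid import core  # assumed location of the compiled bindings

# one process (the master) hosts the store; the others connect to it
store = core.TCPStore("127.0.0.1", 6170, is_master=True, world_size=2,
                      timeout=datetime.timedelta(seconds=900))
store.add("init_counter", 1)
value = store.get("init_counter")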
......
...@@ -35,6 +35,11 @@ limitations under the License. */ ...@@ -35,6 +35,11 @@ limitations under the License. */
#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
#endif #endif
#if defined(PADDLE_WITH_GLOO)
#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"
#include "paddle/fluid/distributed/store/tcp_store.h"
#endif
namespace py = pybind11; namespace py = pybind11;
namespace paddle { namespace paddle {
...@@ -42,6 +47,14 @@ namespace pybind { ...@@ -42,6 +47,14 @@ namespace pybind {
using Tensor = paddle::experimental::Tensor; using Tensor = paddle::experimental::Tensor;
#if defined(PADDLE_WITH_GLOO)
using ProcessGroupGloo = paddle::distributed::ProcessGroupGloo;
using GlooStore = paddle::distributed::ProcessGroupGloo::GlooStore;
using GlooOptions = paddle::distributed::ProcessGroupGloo::GlooOptions;
#endif
static std::string GLOO_SOCKET_IFNAME_ENV = "GLOO_SOCKET_IFNAME"; // NOLINT
void BindDistributed(py::module *m) { void BindDistributed(py::module *m) {
py::enum_<distributed::ReduceOp>(*m, "ReduceOp") py::enum_<distributed::ReduceOp>(*m, "ReduceOp")
.value("SUM", distributed::ReduceOp::SUM) .value("SUM", distributed::ReduceOp::SUM)
...@@ -64,6 +77,11 @@ void BindDistributed(py::module *m) { ...@@ -64,6 +77,11 @@ void BindDistributed(py::module *m) {
.def(py::init<>()) .def(py::init<>())
.def_readwrite("place_ids", &distributed::BarrierOptions::place_ids); .def_readwrite("place_ids", &distributed::BarrierOptions::place_ids);
py::class_<distributed::ReduceOptions>(*m, "ReduceOptions")
.def(py::init<>())
.def_readwrite("reduce_op", &distributed::ReduceOptions::reduce_op)
.def_readwrite("source_root", &distributed::ReduceOptions::root_rank);
auto ProcessGroup = auto ProcessGroup =
py::class_<distributed::ProcessGroup, py::class_<distributed::ProcessGroup,
std::shared_ptr<distributed::ProcessGroup>>(*m, "ProcessGroup") std::shared_ptr<distributed::ProcessGroup>>(*m, "ProcessGroup")
...@@ -121,6 +139,58 @@ void BindDistributed(py::module *m) { ...@@ -121,6 +139,58 @@ void BindDistributed(py::module *m) {
return self.Recv(tensors, src); return self.Recv(tensors, src);
}, },
py::arg("tensor"), py::arg("src"), py::arg("tensor"), py::arg("src"),
py::call_guard<py::gil_scoped_release>())
.def("all_gather",
[](distributed::ProcessGroup &self, py::handle py_in_tensor,
py::handle py_out_tensor) {
auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
std::vector<Tensor> in_tensors = {in_tensor};
std::vector<Tensor> out_tensors = {out_tensor};
return self.AllGather(in_tensors, out_tensors);
},
py::arg("in"), py::arg("out"),
py::call_guard<py::gil_scoped_release>())
.def("alltoall",
[](distributed::ProcessGroup &self, py::handle py_in_tensor,
py::handle py_out_tensor) {
auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
std::vector<Tensor> in_tensors = {in_tensor};
std::vector<Tensor> out_tensors = {out_tensor};
return self.AllToAll(in_tensors, out_tensors);
},
py::arg("in"), py::arg("out"),
py::call_guard<py::gil_scoped_release>())
.def("reduce",
[](distributed::ProcessGroup &self, py::handle py_in_tensor,
int dst, distributed::ReduceOp op) {
auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
distributed::ReduceOptions opts;
opts.reduce_op = op;
opts.root_rank = dst;
std::vector<Tensor> tensors = {in_tensor};
return self.Reduce(tensors, opts);
},
py::arg("tensor"), py::arg("dst"),
py::arg("op") = distributed::ReduceOp::SUM,
py::call_guard<py::gil_scoped_release>())
.def("scatter",
[](distributed::ProcessGroup &self, py::handle py_in_tensor,
py::handle py_out_tensor, int src) {
auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
distributed::ScatterOptions opts;
opts.root_rank = src;
std::vector<Tensor> in_tensors = {in_tensor};
std::vector<Tensor> out_tensors = {out_tensor};
return self.Scatter(in_tensors, out_tensors, opts);
},
py::arg("in"), py::arg("out"), py::arg("src"),
py::call_guard<py::gil_scoped_release>()); py::call_guard<py::gil_scoped_release>());
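A hypothetical sketch of driving the newly bound collectives from Python once a concrete process group (NCCL or Gloo) has been created; the construction of pg and the tensor values are placeholders, not taken from this diff:

import paddle

# pg: an already-constructed ProcessGroup instance (see the Gloo sketch below)
x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0])
out = paddle.zeros_like(x)

task = pg.all_gather(x, out)   # in/out tensors, returns a task object
task.synchronize()

task = pg.reduce(x, dst=0)     # op defaults to ReduceOp.SUM
task.synchronize()

task = pg.scatter(x, out, src=0)
task.synchronize()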
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL)
...@@ -129,6 +199,7 @@ void BindDistributed(py::module *m) { ...@@ -129,6 +199,7 @@ void BindDistributed(py::module *m) {
*m, "ProcessGroupNCCL", ProcessGroup) *m, "ProcessGroupNCCL", ProcessGroup)
.def(py::init<const distributed::ProcessGroupStrategy &, int, int>(), .def(py::init<const distributed::ProcessGroupStrategy &, int, int>(),
py::call_guard<py::gil_scoped_release>()); py::call_guard<py::gil_scoped_release>());
#endif
py::class_<distributed::ProcessGroup::Task, py::class_<distributed::ProcessGroup::Task,
std::shared_ptr<distributed::ProcessGroup::Task>>(*m, "task") std::shared_ptr<distributed::ProcessGroup::Task>>(*m, "task")
...@@ -138,7 +209,6 @@ void BindDistributed(py::module *m) { ...@@ -138,7 +209,6 @@ void BindDistributed(py::module *m) {
py::call_guard<py::gil_scoped_release>()) py::call_guard<py::gil_scoped_release>())
.def("synchronize", &distributed::ProcessGroup::Task::Synchronize, .def("synchronize", &distributed::ProcessGroup::Task::Synchronize,
py::call_guard<py::gil_scoped_release>()); py::call_guard<py::gil_scoped_release>());
#endif
// define parallel strategy, it will be removed // define parallel strategy, it will be removed
py::class_<distributed::ProcessGroupStrategy> pg_strategy( py::class_<distributed::ProcessGroupStrategy> pg_strategy(
...@@ -178,6 +248,45 @@ void BindDistributed(py::module *m) { ...@@ -178,6 +248,45 @@ void BindDistributed(py::module *m) {
self.nrings_ = nrings; self.nrings_ = nrings;
}); });
#if defined(PADDLE_WITH_GLOO)
py::class_<GlooOptions>(*m, "GlooOptions")
.def(py::init<>())
.def_readwrite("_device", &GlooOptions::device)
.def_static("create", &GlooOptions::create);
py::class_<GlooStore, std::shared_ptr<GlooStore>>(*m, "GlooStore")
.def(py::init(
[](const std::shared_ptr<paddle::distributed::TCPStore> &store) {
return std::make_shared<GlooStore>(store);
}),
py::call_guard<py::gil_scoped_release>());
py::class_<ProcessGroupGloo, std::shared_ptr<ProcessGroupGloo>>(
*m, "ProcessGroupGloo", ProcessGroup)
.def(py::init<const std::shared_ptr<GlooStore> &, int, int,
std::shared_ptr<GlooOptions> &>(),
py::call_guard<py::gil_scoped_release>())
.def(py::init([](const std::shared_ptr<GlooStore> &store, int rank,
int world_size) {
auto opts = GlooOptions::create();
char *ifname = getenv(GLOO_SOCKET_IFNAME_ENV.c_str());
if (ifname && strlen(ifname) > 1) {
opts->device = ProcessGroupGloo::createDeviceForInterface(
std::string(ifname));
} else {
opts->device = ProcessGroupGloo::createDefaultDevice();
}
return std::make_shared<ProcessGroupGloo>(store, rank, world_size,
opts);
}),
py::arg("store"), py::arg("rank"),
py::arg("world_size"), // py::arg("timeout") =
// kProcessGroupDefaultTimeout,
py::call_guard<py::gil_scoped_release>())
.def_static("create_default_device",
&ProcessGroupGloo::createDefaultDevice);
#endif
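Putting the Gloo pieces together, a hypothetical construction sequence (module path and values assumed): if GLOO_SOCKET_IFNAME is set to an interface name longer than one character it selects the network device, otherwise a default device is created.

import os
import datetime
from paddle.fluid import core  # assumed location of the compiled bindings

os.environ["GLOO_SOCKET_IFNAME"] = "eth0"  # optional; otherwise a default device is used

rank, world_size = 0, 2
tcp_store = core.TCPStore("127.0.0.1", 6170, is_master=(rank == 0),
                          world_size=world_size,
                          timeout=datetime.timedelta(seconds=900))
gloo_store = core.GlooStore(tcp_store)
pg = core.ProcessGroupGloo(gloo_store, rank, world_size)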
m->def("eager_assign_group_by_size", m->def("eager_assign_group_by_size",
[](py::handle py_tensors, std::vector<bool> is_sparse_gradient, [](py::handle py_tensors, std::vector<bool> is_sparse_gradient,
std::vector<size_t> group_size_limits, std::vector<size_t> group_size_limits,
......
...@@ -19,6 +19,7 @@ limitations under the License. */ ...@@ -19,6 +19,7 @@ limitations under the License. */
#include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/accumulation/accumulation_node.h"
#include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/api/all.h"
#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h"
#include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/grad_node_info.h"
#include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/hooks.h"
...@@ -30,10 +31,12 @@ limitations under the License. */ ...@@ -30,10 +31,12 @@ limitations under the License. */
#include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/eager_utils.h"
#include "paddle/fluid/pybind/exception.h" #include "paddle/fluid/pybind/exception.h"
#include "paddle/fluid/pybind/slice_utils.h"
#include "paddle/phi/api/include/api.h" #include "paddle/phi/api/include/api.h"
#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
...@@ -119,6 +122,29 @@ extern void InitTensorWithNumpyValue(TensorObject* self, ...@@ -119,6 +122,29 @@ extern void InitTensorWithNumpyValue(TensorObject* self,
extern PyTypeObject* p_tensor_type; extern PyTypeObject* p_tensor_type;
Py_ssize_t GetSliceIndexFromPyObject(PyObject* obj) {
if (PyObject_IsInstance(obj, reinterpret_cast<PyObject*>(p_tensor_type))) {
VLOG(6) << "Call GetSliceIndexFromTensor in Eager";
paddle::experimental::Tensor tensor = CastPyArg2Tensor(obj, 0);
PADDLE_ENFORCE_EQ(
tensor.initialized(), true,
paddle::platform::errors::InvalidArgument(
"We can only support initialized tensor in slice, however we got "
"uninitialized tensor %s, please check your code.",
tensor.name()));
return GetSliceIndexFromTensor((*static_cast<phi::DenseTensor*>(
CastPyArg2Tensor(obj, 0).impl().get())));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"We should only get paddle::experimental::Tensor or VarBase in this "
"method, when you reach this means we got another type index."));
}
}
bool PyCheckTensor(PyObject* obj) {
return PyObject_IsInstance(obj, reinterpret_cast<PyObject*>(p_tensor_type));
}
static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args, static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args,
PyObject* kwargs) { PyObject* kwargs) {
EAGER_TRY EAGER_TRY
...@@ -468,16 +494,111 @@ static PyObject* tensor_method_get_underline_tensor(TensorObject* self, ...@@ -468,16 +494,111 @@ static PyObject* tensor_method_get_underline_tensor(TensorObject* self,
EAGER_CATCH_AND_THROW_RETURN_NULL EAGER_CATCH_AND_THROW_RETURN_NULL
} }
// NOTE(wuweilong): Set value and not change self's original place static PyObject* tensor__getitem_index_not_tensor(TensorObject* self,
static PyObject* tensor_method_set_value(TensorObject* self, PyObject* args, PyObject* args,
PyObject* kwargs) { PyObject* kwargs) {
EAGER_TRY EAGER_TRY
VLOG(4) << "Value " << self->tensor.name(); PyObject* _index = PyTuple_GET_ITEM(args, 0);
pybind11::object numpy_value = VLOG(4) << "Call _getitem_index_not_tensor";
pybind11::object(pybind11::handle(PyTuple_GET_ITEM(args, 0)), true); std::vector<int> slice_axes, slice_starts, slice_ends, slice_strides,
InitTensorWithNumpyValue(self, numpy_value, false); decrease_axis, none_axes, infer_flags, list_select_idxs;
Py_INCREF(Py_None); // if index is a list, list_select_flag will be true
return Py_None; bool list_select_flag = false;
PADDLE_ENFORCE_EQ(
self->tensor.is_initialized(), true,
platform::errors::InvalidArgument(
"tensor %s has not been initialized, we can only slice initialized "
"tensor please init it first with numpy or other tensor.",
self->tensor.name()));
auto tensor = static_cast<phi::DenseTensor*>(self->tensor.impl().get());
ParseIndexingSlice(tensor, _index, &slice_axes, &slice_starts, &slice_ends,
&slice_strides, &decrease_axis, &none_axes, &infer_flags,
&list_select_idxs, &list_select_flag);
auto out = slice_axes.empty() && !list_select_flag
? self->tensor
: paddle::experimental::Tensor(
egr::Controller::Instance().GenerateUniqueName());
if (!slice_axes.empty()) {
framework::AttributeMap attrs = {{"axes", slice_axes},
{"starts", slice_starts},
{"ends", slice_ends},
{"infer_flags", infer_flags},
{"decrease_axis", decrease_axis}};
std::string op_type = "slice";
for (auto stride : slice_strides) {
if (stride != 1) {
op_type = "strided_slice";
attrs.insert({"strides", slice_strides});
attrs.erase("decrease_axis");
break;
}
}
if (op_type == "slice") {
out = slice_dygraph_function(self->tensor, paddle::experimental::Tensor(),
paddle::experimental::Tensor(),
std::move(attrs));
} else if (op_type == "strided_slice") {
out = strided_slice_dygraph_function(self->tensor, attrs);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Slice is only support slice and strided_slice, but we got %s which "
"is impossible, please check your code first or contact us by "
"issue. ",
op_type));
}
}
if (!none_axes.empty()) {
// Deal with cases when all axes are decreased.
// After slice, the shape of out is [1], which should have been
// [], but Paddle doesn't support scalar.
// In order to ensure the correctness of the final shape of out,
// one dimension of out needs to be decreased.
// For example:
// # x.shape: (2,3,4)
// out = x[0, 1, 1, None] # out.shape : (1)
if (static_cast<int>(decrease_axis.size()) == tensor->dims().size()) {
none_axes.pop_back();
}
if (!none_axes.empty()) {
// Deal with cases that decrease_axes is not empty
// For example:
// # x.shape: (2,3,4)
// out = x[0, 0:2, None] # out.shape : (2, 1, 4)
for (auto& axis : none_axes) {
int len = 0;
for (int da : decrease_axis) {
if (da < axis) {
len++;
}
}
axis -= len;
}
paddle::experimental::Tensor new_out;
framework::AttributeMap attrs = {{"axes", none_axes}};
new_out = std::get<0>(unsqueeze2_dygraph_function(out, std::move(attrs)));
return ToPyObject(new_out);
}
}
// the index is a list
if (list_select_flag) {
auto select_index = paddle::experimental::Tensor(
egr::Controller::Instance().GenerateUniqueName());
auto idx_tensor = std::make_shared<phi::DenseTensor>();
auto* dev_ctx = platform::DeviceContextPool::Instance().Get(
egr::Controller::Instance().GetExpectedPlace());
paddle::framework::TensorFromVector(list_select_idxs, *dev_ctx,
idx_tensor.get());
framework::AttributeMap attrs = {{"dim", 0}};
out = index_select_dygraph_function(self->tensor, select_index,
std::move(attrs));
}
return ToPyObject(out);
EAGER_CATCH_AND_THROW_RETURN_NULL EAGER_CATCH_AND_THROW_RETURN_NULL
} }
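The branches above lower Python indexing onto existing dygraph ops. An illustrative summary (tensor values are placeholders):

import paddle

x = paddle.rand([2, 3, 4])

y = x[0, 1:3]        # integers and plain slices -> slice op (integer axes are decreased)
y = x[:, ::2]        # any step other than 1 -> strided_slice op
y = x[0, 0:2, None]  # None inserts a size-1 axis -> unsqueeze2 after the slice
y = x[[0, 1]]        # a list of ints/bools -> index_select along dimension 0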
...@@ -602,7 +723,8 @@ PyMethodDef variable_methods[] = { ...@@ -602,7 +723,8 @@ PyMethodDef variable_methods[] = {
{"get_tensor", {"get_tensor",
(PyCFunction)(void (*)(void))tensor_method_get_underline_tensor, (PyCFunction)(void (*)(void))tensor_method_get_underline_tensor,
METH_VARARGS | METH_KEYWORDS, NULL}, METH_VARARGS | METH_KEYWORDS, NULL},
{"_set_value", (PyCFunction)(void (*)(void))tensor_method_set_value, {"_getitem_index_not_tensor",
(PyCFunction)(void (*)(void))tensor__getitem_index_not_tensor,
METH_VARARGS | METH_KEYWORDS, NULL}, METH_VARARGS | METH_KEYWORDS, NULL},
{"_register_grad_hook", {"_register_grad_hook",
(PyCFunction)(void (*)(void))tensor_register_grad_hook, (PyCFunction)(void (*)(void))tensor_register_grad_hook,
......
...@@ -16,8 +16,11 @@ limitations under the License. */ ...@@ -16,8 +16,11 @@ limitations under the License. */
#include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/api/all.h"
#include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/scope_guard.h"
#include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/operators/py_func_op.h" #include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/eager_utils.h"
...@@ -184,6 +187,11 @@ paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos) { ...@@ -184,6 +187,11 @@ paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos) {
} }
} }
std::shared_ptr<imperative::VarBase> CastPyArg2VarBase(PyObject* obj,
ssize_t arg_pos) {
return py::cast<std::shared_ptr<imperative::VarBase>>(obj);
}
std::vector<paddle::experimental::Tensor> CastPyArg2VectorOfTensor( std::vector<paddle::experimental::Tensor> CastPyArg2VectorOfTensor(
PyObject* obj, ssize_t arg_pos) { PyObject* obj, ssize_t arg_pos) {
std::vector<paddle::experimental::Tensor> result; std::vector<paddle::experimental::Tensor> result;
...@@ -737,5 +745,6 @@ std::vector<paddle::experimental::Tensor*> GetTensorPtrListFromArgs( ...@@ -737,5 +745,6 @@ std::vector<paddle::experimental::Tensor*> GetTensorPtrListFromArgs(
return result; return result;
} }
} // namespace pybind } // namespace pybind
} // namespace paddle } // namespace paddle
...@@ -14,7 +14,6 @@ limitations under the License. */ ...@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "pybind11/pybind11.h" #include "pybind11/pybind11.h"
#include "pybind11/stl.h" #include "pybind11/stl.h"
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
...@@ -33,6 +32,8 @@ int64_t CastPyArg2AttrLong(PyObject* obj, ssize_t arg_pos); ...@@ -33,6 +32,8 @@ int64_t CastPyArg2AttrLong(PyObject* obj, ssize_t arg_pos);
float CastPyArg2AttrFloat(PyObject* obj, ssize_t arg_pos); float CastPyArg2AttrFloat(PyObject* obj, ssize_t arg_pos);
std::string CastPyArg2AttrString(PyObject* obj, ssize_t arg_pos); std::string CastPyArg2AttrString(PyObject* obj, ssize_t arg_pos);
paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos); paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos);
std::shared_ptr<imperative::VarBase> CastPyArg2VarBase(PyObject* obj,
ssize_t arg_pos);
std::vector<paddle::experimental::Tensor> CastPyArg2VectorOfTensor( std::vector<paddle::experimental::Tensor> CastPyArg2VectorOfTensor(
PyObject* obj, ssize_t arg_pos); PyObject* obj, ssize_t arg_pos);
platform::Place CastPyArg2Place(PyObject* obj, ssize_t arg_pos); platform::Place CastPyArg2Place(PyObject* obj, ssize_t arg_pos);
...@@ -112,5 +113,7 @@ std::vector<paddle::experimental::Tensor*> GetTensorPtrListFromArgs( ...@@ -112,5 +113,7 @@ std::vector<paddle::experimental::Tensor*> GetTensorPtrListFromArgs(
const std::string& op_type, const std::string& arg_name, PyObject* args, const std::string& op_type, const std::string& arg_name, PyObject* args,
ssize_t arg_idx, bool dispensable = false); ssize_t arg_idx, bool dispensable = false);
// end of Slice related methods
} // namespace pybind } // namespace pybind
} // namespace paddle } // namespace paddle
...@@ -54,6 +54,7 @@ limitations under the License. */ ...@@ -54,6 +54,7 @@ limitations under the License. */
#include "paddle/fluid/operators/utils.h" #include "paddle/fluid/operators/utils.h"
#include "paddle/fluid/pybind/op_function.h" #include "paddle/fluid/pybind/op_function.h"
#include "paddle/fluid/pybind/pybind_boost_headers.h" #include "paddle/fluid/pybind/pybind_boost_headers.h"
#include "paddle/fluid/pybind/slice_utils.h"
#include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/pybind/tensor_py.h"
namespace paddle { namespace paddle {
...@@ -319,6 +320,23 @@ static std::string GetTypeName(const imperative::VarBase &var) { ...@@ -319,6 +320,23 @@ static std::string GetTypeName(const imperative::VarBase &var) {
} }
} }
Py_ssize_t GetSliceIndexFromPyObject(PyObject *obj) {
if (py::isinstance<imperative::VarBase>(obj)) {
VLOG(6) << "Call GetSliceIndexFromTensor in Imperative";
return GetSliceIndexFromTensor(
py::cast<std::shared_ptr<imperative::VarBase>>(obj)
->Var()
.Get<framework::LoDTensor>());
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"We should only get paddle::experimental::Tensor or VarBase in this "
"method, when you reach this means we got another type index."));
}
}
bool PyCheckTensor(PyObject *obj) {
return py::isinstance<imperative::VarBase>(obj);
}
using PyNameVarBaseMap = std::unordered_map<std::string, py::handle>; using PyNameVarBaseMap = std::unordered_map<std::string, py::handle>;
// NOTE(zjl): py::handle is a very light wrapper of PyObject *. // NOTE(zjl): py::handle is a very light wrapper of PyObject *.
...@@ -360,18 +378,6 @@ GetVarBaseListFromPyHandle(const py::handle &handle) { ...@@ -360,18 +378,6 @@ GetVarBaseListFromPyHandle(const py::handle &handle) {
return result; return result;
} }
static bool IsNumpyType(PyObject *obj) {
// It is not a good way to judge the type of obj by its type'name. Maybe using
// `PyArray_IsScalar` will be better. However, this interface cannot be used
// by including pybind11, and it needs to compile with numpy.
auto type_name = std::string(Py_TYPE(obj)->tp_name);
return type_name == "numpy.int64" || type_name == "numpy.longlong" ||
type_name == "numpy.int32" || type_name == "numpy.int16";
}
static bool PyCheckTensor(PyObject *obj) {
return py::isinstance<imperative::VarBase>(obj);
}
// cast numpy type form S to T, this may allocate new memory // cast numpy type form S to T, this may allocate new memory
template <class T, class S> template <class T, class S>
...@@ -429,260 +435,6 @@ static imperative::NameVarBaseMap ConvertToNameVarBaseMap( ...@@ -429,260 +435,6 @@ static imperative::NameVarBaseMap ConvertToNameVarBaseMap(
return result; return result;
} }
static bool PyCheckInteger(PyObject *obj) {
#if PY_VERSION_HEX < 0x03000000
return (PyLong_Check(obj) || PyInt_Check(obj)) && !PyBool_Check(obj);
#else
return PyLong_Check(obj) && !PyBool_Check(obj);
#endif
}
static Py_ssize_t GetSliceIndexFromTensor(
const std::shared_ptr<imperative::VarBase> &tensor_index) {
const auto &tensor = tensor_index->Var().Get<framework::LoDTensor>();
if (tensor.numel() == 1) {
if (framework::TransToProtoVarType(tensor.dtype()) ==
framework::proto::VarType::INT32) {
return static_cast<Py_ssize_t>(operators::GetValue<int32_t>(&tensor));
} else if (framework::TransToProtoVarType(tensor.dtype()) ==
framework::proto::VarType::INT64) {
return static_cast<Py_ssize_t>(operators::GetValue<int64_t>(&tensor));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, the type of tensor in slice indices only allows "
"int32 and int64, please check the type of index tensor."));
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, tensor in slice indices only allows 1 element, "
"but received %d.",
tensor.numel()));
}
}
// NOTE(zhiqiu): Revised version of PySlice_GetIndices. From:
// https://github.com/python/cpython/blob/8d21aa21f2cbc6d50aab3f420bb23be1d081dac4/Objects/sliceobject.c#L103
// Original PySlice_GetIndices return wrong result when
// slice_item contains long int, such as arr[:180L].
// NOT sure why this happens !!!
// Besides, PySlice_GetIndices cannot raise error when float in slice item.
// So, I make a revised version of PySlice_GetIndices, named to
// _PySlice_GetIndices. Try to use _PySlice_Unpack which is more robust than
// PySlice_GetIndices in the future.
static int _PySlice_GetIndices(PySliceObject *r, Py_ssize_t length,
Py_ssize_t *start, Py_ssize_t *stop,
Py_ssize_t *step) {
/* XXX support long ints */
if (r->step == Py_None) {
*step = 1;
} else {
if (PyCheckInteger(r->step) || IsNumpyType(r->step)) {
*step = PyLong_AsLong(r->step);
} else if (PyCheckTensor(r->step)) {
*step = GetSliceIndexFromTensor(
py::cast<std::shared_ptr<imperative::VarBase>>(r->step));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, slice indices only allows None, integers, "
"tensor(int) and numpy(int) in slice item, but received %s.",
std::string(Py_TYPE(r->step)->tp_name)));
}
}
if (r->start == Py_None) {
*start = *step < 0 ? length - 1 : 0;
} else {
if (PyCheckInteger(r->start) || IsNumpyType(r->start)) {
*start = PyLong_AsLong(r->start);
} else if (PyCheckTensor(r->start)) {
*start = GetSliceIndexFromTensor(
py::cast<std::shared_ptr<imperative::VarBase>>(r->start));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, slice indices only allows None, integers, "
"tensor(int) and numpy(int) in slice item, but received %s.",
std::string(Py_TYPE(r->start)->tp_name)));
}
if (*start < 0) *start += length;
*start = std::max(*start, static_cast<Py_ssize_t>(0));
}
if (r->stop == Py_None) {
*stop = *step < 0 ? -1 : length;
} else {
if (PyCheckInteger(r->stop) || IsNumpyType(r->stop)) {
*stop = PyLong_AsLong(r->stop);
} else if (PyCheckTensor(r->stop)) {
*stop = GetSliceIndexFromTensor(
py::cast<std::shared_ptr<imperative::VarBase>>(r->stop));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, slice indices only allows None, integers, "
"tensor(int) and numpy(int) in slice item, but received %s.",
std::string(Py_TYPE(r->stop)->tp_name)));
}
if (0 < *step && *stop < 0) *stop += length;
*stop = std::min(*stop, length);
}
if (*stop > length) return -1;
if (*start >= length) return -1;
if (*step == 0) return -1;
return 0;
}
static void ParseIndexingSlice(
framework::LoDTensor *tensor, PyObject *_index,
std::vector<int> *slice_axes, std::vector<int> *slice_starts,
std::vector<int> *slice_ends, std::vector<int> *slice_strides,
std::vector<int> *decrease_axis, std::vector<int> *none_axes,
std::vector<int> *infer_flags, std::vector<int> *list_select_idxs,
bool *list_select_flag) {
// We allow indexing by Integers, Slices, Ellipsis, None, tuples of those
// types, and list of Bool and Integers.
// wrap to tuple
// NOTE(zhiqiu): PyTuple_Pack increases refcount.
PyObject *index = !PyTuple_Check(_index) ? PyTuple_Pack(1, _index) : _index;
DEFINE_PADDLE_SCOPE_GUARD([index, _index]() {
if (!PyTuple_Check(_index)) {
Py_DECREF(index);
VLOG(4) << "Call Py_DECREF";
}
});
PADDLE_ENFORCE_EQ(
tensor->IsInitialized(), true,
platform::errors::InvalidArgument("tensor has not been initialized"));
const auto &shape = tensor->dims();
const int rank = shape.size();
const int size = PyTuple_GET_SIZE(index);
// specified_dims is the number of dimensions which indexed by Interger,
// Slices.
int specified_dims = 0;
int ell_count = 0;
for (int dim = 0; dim < size; ++dim) {
PyObject *slice_item = PyTuple_GetItem(index, dim);
if (PyCheckInteger(slice_item) || PySlice_Check(slice_item)) {
specified_dims++;
} else if (slice_item == Py_Ellipsis) {
ell_count++;
}
}
PADDLE_ENFORCE_LE(ell_count, 1,
platform::errors::InvalidArgument(
"An index can only have a single ellipsis ('...')"));
int none_count = 0;
for (int i = 0, dim = 0; i < size; ++i) {
PyObject *slice_item = PyTuple_GetItem(index, i);
infer_flags->push_back(1);
int dim_len = shape[dim];
if (PyCheckInteger(slice_item) || IsNumpyType(slice_item)) {
// integer, PyLong_AsLong supports both int and long
int start = static_cast<int>(PyLong_AsLong(slice_item));
auto s_t = start;
start = start < 0 ? start + dim_len : start;
if (start >= dim_len || start < 0) {
std::string str_error_message =
"The starting index " + std::to_string(s_t) +
" of slice is out of bounds in tensor " + std::to_string(dim) +
"-th axis, it shound be in the range of [" +
std::to_string(-dim_len) + ", " + std::to_string(dim_len) + ")";
// py::index_error is corresponding to IndexError in Python
// Used to indicate out of bounds access in __getitem__, __setitem__
throw py::index_error(str_error_message);
}
slice_axes->push_back(dim);
slice_starts->push_back(start);
slice_ends->push_back(start + 1);
slice_strides->push_back(1);
decrease_axis->push_back(dim);
dim++;
} else if (PySlice_Check(slice_item)) {
// slice item
Py_ssize_t start, end, step;
PySliceObject *p = reinterpret_cast<PySliceObject *>(slice_item);
_PySlice_GetIndices(p, dim_len, &start, &end, &step);
// :: or : or 0:dim_len:1
if (start == 0 && end == dim_len && step == 1) {
dim++;
continue;
}
slice_axes->push_back(dim);
slice_starts->push_back(start);
slice_ends->push_back(end);
slice_strides->push_back(step);
dim++;
} else if (slice_item == Py_Ellipsis) {
dim += rank - specified_dims;
} else if (slice_item == Py_None) {
none_axes->push_back(dim + none_count);
none_count++;
} else if (PyList_Check(slice_item)) {
*list_select_flag = true;
PADDLE_ENFORCE_EQ(
size, 1,
platform::errors::InvalidArgument(
"When index contains a list, its length is excepted to 1, "
"but received %d",
size));
bool all_bool = true;
int list_size = PyList_GET_SIZE(slice_item);
for (int j = 0; j < list_size; ++j) {
PyObject *list_item = PyList_GetItem(slice_item, j);
if (PyCheckInteger(list_item)) {
all_bool = false;
} else if (!PyBool_Check(list_item)) {
PADDLE_THROW(platform::errors::InvalidArgument(
"Only support int or bool in index list."));
}
}
if (all_bool) {
PADDLE_ENFORCE_EQ(
list_size, shape[0],
platform::errors::InvalidArgument(
"The dimension of bool index doesn't match indexed array along "
"dimension 0, the target dimension is %d, but received %d.",
shape[0], list_size));
for (int j = 0; j < list_size; ++j) {
PyObject *list_item = PyList_GetItem(slice_item, j);
if (list_item == Py_True) {
list_select_idxs->push_back(j);
}
}
} else {
for (int j = 0; j < list_size; ++j) {
PyObject *list_item = PyList_GetItem(slice_item, j);
if (PyCheckInteger(list_item)) {
list_select_idxs->push_back(
static_cast<int>(PyLong_AsLong(list_item)));
} else if (list_item == Py_True) {
list_select_idxs->push_back(1);
} else {
list_select_idxs->push_back(0);
}
}
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, Tensor.__indices__() only allows indexing "
"by Integers, Slices, Ellipsis, None, tuples of these types "
"and list of Bool and Integers, but received "
"%s in %dth slice item",
std::string(Py_TYPE(slice_item)->tp_name), i + 1));
}
}
// valid_index is the number of dimensions exclude None index
const int valid_indexs = size - none_axes->size() - ell_count;
PADDLE_ENFORCE_EQ(valid_indexs <= rank, true,
platform::errors::InvalidArgument(
"Too many indices (%d) for tensor of dimension %d.",
valid_indexs, rank));
}
template <typename P> template <typename P>
static void VarBaseCopy(std::shared_ptr<imperative::VarBase> &src, // NOLINT static void VarBaseCopy(std::shared_ptr<imperative::VarBase> &src, // NOLINT
imperative::VarBase &dst, // NOLINT imperative::VarBase &dst, // NOLINT
......
...@@ -80,6 +80,7 @@ limitations under the License. */ ...@@ -80,6 +80,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/cuda_streams_py.h" #include "paddle/fluid/pybind/cuda_streams_py.h"
#include "paddle/fluid/pybind/distributed_py.h" #include "paddle/fluid/pybind/distributed_py.h"
#include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/io.h" #include "paddle/fluid/pybind/io.h"
#include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/lod_utils.h" #include "paddle/phi/core/lod_utils.h"
...@@ -101,7 +102,6 @@ limitations under the License. */ ...@@ -101,7 +102,6 @@ limitations under the License. */
#include "paddle/fluid/pybind/gloo_context_py.h" #include "paddle/fluid/pybind/gloo_context_py.h"
#include "paddle/fluid/pybind/gloo_wrapper_py.h" #include "paddle/fluid/pybind/gloo_wrapper_py.h"
#include "paddle/fluid/pybind/heter_wrapper_py.h" #include "paddle/fluid/pybind/heter_wrapper_py.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/inference_api.h" #include "paddle/fluid/pybind/inference_api.h"
#include "paddle/fluid/pybind/ir.h" #include "paddle/fluid/pybind/ir.h"
#include "paddle/fluid/pybind/metrics_py.h" #include "paddle/fluid/pybind/metrics_py.h"
...@@ -527,6 +527,7 @@ PYBIND11_MODULE(core_avx, m) { ...@@ -527,6 +527,7 @@ PYBIND11_MODULE(core_avx, m) {
PYBIND11_MODULE(core_noavx, m) { PYBIND11_MODULE(core_noavx, m) {
#endif #endif
BindImperative(&m);
BindEager(&m); BindEager(&m);
BindCudaStream(&m); BindCudaStream(&m);
...@@ -741,8 +742,6 @@ PYBIND11_MODULE(core_noavx, m) { ...@@ -741,8 +742,6 @@ PYBIND11_MODULE(core_noavx, m) {
m.def("_promote_types_if_complex_exists", m.def("_promote_types_if_complex_exists",
&paddle::framework::PromoteTypesIfComplexExists); &paddle::framework::PromoteTypesIfComplexExists);
BindImperative(&m);
py::class_<framework::Tensor> framework_tensor(m, "Tensor", py::class_<framework::Tensor> framework_tensor(m, "Tensor",
py::buffer_protocol()); py::buffer_protocol());
g_framework_tensor_pytype = g_framework_tensor_pytype =
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <Python.h>
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/scope_guard.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
namespace py = pybind11;
namespace paddle {
namespace pybind {
static bool PyCheckTensor(PyObject* obj);
static Py_ssize_t GetSliceIndexFromPyObject(PyObject* obj);
// Slice related methods
static bool PyCheckInteger(PyObject* obj) {
#if PY_VERSION_HEX < 0x03000000
return (PyLong_Check(obj) || PyInt_Check(obj)) && !PyBool_Check(obj);
#else
return PyLong_Check(obj) && !PyBool_Check(obj);
#endif
}
static bool IsNumpyType(PyObject* obj) {
// It is not a good way to judge the type of obj by its type's name. Maybe using
// `PyArray_IsScalar` would be better. However, that interface cannot be used
// with only pybind11 included, since it requires compiling against numpy.
auto type_name = std::string(Py_TYPE(obj)->tp_name);
return type_name == "numpy.int64" || type_name == "numpy.longlong" ||
type_name == "numpy.int32" || type_name == "numpy.int16";
}
static Py_ssize_t GetSliceIndexFromTensor(const phi::DenseTensor& tensor) {
if (tensor.numel() == 1) {
if (framework::TransToProtoVarType(tensor.type()) ==
framework::proto::VarType::INT32) {
return static_cast<Py_ssize_t>(operators::GetValue<int32_t>(&tensor));
} else if (framework::TransToProtoVarType(tensor.type()) ==
framework::proto::VarType::INT64) {
return static_cast<Py_ssize_t>(operators::GetValue<int64_t>(&tensor));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, the type of tensor in slice indices only allows "
"int32 and int64, please check the type of index tensor."));
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, tensor in slice indices only allows 1 element, "
"but received %d.",
tensor.numel()));
}
}
// NOTE(zhiqiu): Revised version of PySlice_GetIndices. From:
// https://github.com/python/cpython/blob/8d21aa21f2cbc6d50aab3f420bb23be1d081dac4/Objects/sliceobject.c#L103
// The original PySlice_GetIndices returns a wrong result when
// slice_item contains a long int, such as arr[:180L].
// NOT sure why this happens !!!
// Besides, PySlice_GetIndices cannot raise an error when a float appears in a
// slice item. So this is a revised version of PySlice_GetIndices, named
// _PySlice_GetIndices. Try to use _PySlice_Unpack, which is more robust than
// PySlice_GetIndices, in the future.
static int _PySlice_GetIndices(PySliceObject* r, Py_ssize_t length,
Py_ssize_t* start, Py_ssize_t* stop,
Py_ssize_t* step) {
/* XXX support long ints */
if (r->step == Py_None) {
*step = 1;
} else {
if (PyCheckInteger(r->step) || IsNumpyType(r->step)) {
*step = PyLong_AsLong(r->step);
} else if (PyCheckTensor(r->step)) {
*step = GetSliceIndexFromPyObject(r->step);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, slice indices only allows None, integers, "
"tensor(int) and numpy(int) in slice item, but received %s.",
std::string(Py_TYPE(r->step)->tp_name)));
}
}
if (r->start == Py_None) {
*start = *step < 0 ? length - 1 : 0;
} else {
if (PyCheckInteger(r->start) || IsNumpyType(r->start)) {
*start = PyLong_AsLong(r->start);
} else if (PyCheckTensor(r->start)) {
*start = GetSliceIndexFromPyObject(r->start);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, slice indices only allows None, integers, "
"tensor(int) and numpy(int) in slice item, but received %s.",
std::string(Py_TYPE(r->start)->tp_name)));
}
if (*start < 0) *start += length;
*start = std::max(*start, static_cast<Py_ssize_t>(0));
}
if (r->stop == Py_None) {
*stop = *step < 0 ? -1 : length;
} else {
if (PyCheckInteger(r->stop) || IsNumpyType(r->stop)) {
*stop = PyLong_AsLong(r->stop);
} else if (PyCheckTensor(r->stop)) {
*stop = GetSliceIndexFromPyObject(r->stop);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, slice indices only allows None, integers, "
"tensor(int) and numpy(int) in slice item, but received %s.",
std::string(Py_TYPE(r->stop)->tp_name)));
}
if (0 < *step && *stop < 0) *stop += length;
*stop = std::min(*stop, length);
}
if (*stop > length) return -1;
if (*start >= length) return -1;
if (*step == 0) return -1;
return 0;
}
static void ParseIndexingSlice(
framework::LoDTensor* tensor, PyObject* _index,
std::vector<int>* slice_axes, std::vector<int>* slice_starts,
std::vector<int>* slice_ends, std::vector<int>* slice_strides,
std::vector<int>* decrease_axis, std::vector<int>* none_axes,
std::vector<int>* infer_flags, std::vector<int>* list_select_idxs,
bool* list_select_flag) {
// We allow indexing by Integers, Slices, Ellipsis, None, tuples of those
// types, and list of Bool and Integers.
// wrap to tuple
// NOTE(zhiqiu): PyTuple_Pack increases refcount.
PyObject* index = !PyTuple_Check(_index) ? PyTuple_Pack(1, _index) : _index;
DEFINE_PADDLE_SCOPE_GUARD([index, _index]() {
if (!PyTuple_Check(_index)) {
Py_DECREF(index);
VLOG(4) << "Call Py_DECREF";
}
});
PADDLE_ENFORCE_EQ(
tensor->IsInitialized(), true,
platform::errors::InvalidArgument("tensor has not been initialized"));
const auto& shape = tensor->dims();
const int rank = shape.size();
const int size = PyTuple_GET_SIZE(index);
// specified_dims is the number of dimensions indexed by Integers and
// Slices.
int specified_dims = 0;
int ell_count = 0;
for (int dim = 0; dim < size; ++dim) {
PyObject* slice_item = PyTuple_GetItem(index, dim);
if (PyCheckInteger(slice_item) || PySlice_Check(slice_item)) {
specified_dims++;
} else if (slice_item == Py_Ellipsis) {
ell_count++;
}
}
PADDLE_ENFORCE_LE(ell_count, 1,
platform::errors::InvalidArgument(
"An index can only have a single ellipsis ('...')"));
int none_count = 0;
for (int i = 0, dim = 0; i < size; ++i) {
PyObject* slice_item = PyTuple_GetItem(index, i);
infer_flags->push_back(1);
int dim_len = shape[dim];
if (PyCheckInteger(slice_item) || IsNumpyType(slice_item)) {
// integer, PyLong_AsLong supports both int and long
int start = static_cast<int>(PyLong_AsLong(slice_item));
auto s_t = start;
start = start < 0 ? start + dim_len : start;
if (start >= dim_len || start < 0) {
std::string str_error_message =
"The starting index " + std::to_string(s_t) +
" of slice is out of bounds in tensor " + std::to_string(dim) +
"-th axis, it shound be in the range of [" +
std::to_string(-dim_len) + ", " + std::to_string(dim_len) + ")";
// py::index_error is corresponding to IndexError in Python
// Used to indicate out of bounds access in __getitem__, __setitem__
throw py::index_error(str_error_message);
}
slice_axes->push_back(dim);
slice_starts->push_back(start);
slice_ends->push_back(start + 1);
slice_strides->push_back(1);
decrease_axis->push_back(dim);
dim++;
} else if (PySlice_Check(slice_item)) {
// slice item
Py_ssize_t start, end, step;
PySliceObject* p = reinterpret_cast<PySliceObject*>(slice_item);
_PySlice_GetIndices(p, dim_len, &start, &end, &step);
// :: or : or 0:dim_len:1
if (start == 0 && end == dim_len && step == 1) {
dim++;
continue;
}
slice_axes->push_back(dim);
slice_starts->push_back(start);
slice_ends->push_back(end);
slice_strides->push_back(step);
dim++;
} else if (slice_item == Py_Ellipsis) {
dim += rank - specified_dims;
} else if (slice_item == Py_None) {
none_axes->push_back(dim + none_count);
none_count++;
} else if (PyList_Check(slice_item)) {
*list_select_flag = true;
PADDLE_ENFORCE_EQ(
size, 1,
platform::errors::InvalidArgument(
"When index contains a list, its length is excepted to 1, "
"but received %d",
size));
bool all_bool = true;
int list_size = PyList_GET_SIZE(slice_item);
for (int j = 0; j < list_size; ++j) {
PyObject* list_item = PyList_GetItem(slice_item, j);
if (PyCheckInteger(list_item)) {
all_bool = false;
} else if (!PyBool_Check(list_item)) {
PADDLE_THROW(platform::errors::InvalidArgument(
"Only support int or bool in index list."));
}
}
if (all_bool) {
PADDLE_ENFORCE_EQ(
list_size, shape[0],
platform::errors::InvalidArgument(
"The dimension of bool index doesn't match indexed array along "
"dimension 0, the target dimension is %d, but received %d.",
shape[0], list_size));
for (int j = 0; j < list_size; ++j) {
PyObject* list_item = PyList_GetItem(slice_item, j);
if (list_item == Py_True) {
list_select_idxs->push_back(j);
}
}
} else {
for (int j = 0; j < list_size; ++j) {
PyObject* list_item = PyList_GetItem(slice_item, j);
if (PyCheckInteger(list_item)) {
list_select_idxs->push_back(
static_cast<int>(PyLong_AsLong(list_item)));
} else if (list_item == Py_True) {
list_select_idxs->push_back(1);
} else {
list_select_idxs->push_back(0);
}
}
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Currently, Tensor.__indices__() only allows indexing "
"by Integers, Slices, Ellipsis, None, tuples of these types "
"and list of Bool and Integers, but received "
"%s in %dth slice item",
std::string(Py_TYPE(slice_item)->tp_name), i + 1));
}
}
// valid_indexs is the number of indexed dimensions, excluding None indices
const int valid_indexs = size - none_axes->size() - ell_count;
PADDLE_ENFORCE_EQ(valid_indexs <= rank, true,
platform::errors::InvalidArgument(
"Too many indices (%d) for tensor of dimension %d.",
valid_indexs, rank));
}
} // namespace pybind
} // namespace paddle
(The diffs for the remaining changed files are collapsed and are not shown here.)