Commit 31363c3f, authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_some_yaml_config

@@ -27,7 +27,6 @@ limitations under the License. */
#include "paddle/phi/core/compat/op_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/phi/core/tensor_utils.h"
namespace paddle {
@@ -101,235 +100,197 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext {
  const InferShapeContext& ctx_;
};

int64_t CompatMetaTensor::numel() const {
  if (is_runtime_) {
    auto* var = BOOST_GET_CONST(Variable*, var_);
    return var->Get<Tensor>().numel();
  } else {
    auto* var = BOOST_GET_CONST(VarDesc*, var_);
    return var->ElementSize();
  }
}

DDim CompatMetaTensor::dims() const {
  if (is_runtime_) {
    auto* var = BOOST_GET_CONST(Variable*, var_);
    if (var->IsType<phi::DenseTensor>()) {
      return var->Get<phi::DenseTensor>().dims();
    } else if (var->IsType<phi::SelectedRows>()) {
      return var->Get<phi::SelectedRows>().dims();
    } else if (var->IsType<framework::LoDTensorArray>()) {
      // use tensor array size as dims
      auto& tensor_array = var->Get<framework::LoDTensorArray>();
      return phi::make_ddim({static_cast<int64_t>(tensor_array.size())});
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "Currently, only can get dims from DenseTensor or SelectedRows or "
          "DenseTensorArray."));
    }
  } else {
    auto* var = BOOST_GET_CONST(VarDesc*, var_);

    return var->GetShape().empty() ? phi::make_ddim({0UL})
                                   : phi::make_ddim(var->GetShape());
  }
}

phi::DataType CompatMetaTensor::dtype() const {
  if (is_runtime_) {
    auto* var = BOOST_GET_CONST(Variable*, var_);
    if (var->IsType<phi::DenseTensor>()) {
      return var->Get<phi::DenseTensor>().dtype();
    } else if (var->IsType<phi::SelectedRows>()) {
      return var->Get<phi::SelectedRows>().dtype();
    } else if (var->IsType<framework::LoDTensorArray>()) {
      // NOTE(chenweihang): do nothing
      // Unsupported get dtype from LoDTensorArray now
      return phi::DataType::UNDEFINED;
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "Currently, only can get dtype from DenseTensor or SelectedRows."));
    }
  } else {
    auto* var = BOOST_GET_CONST(VarDesc*, var_);
    return paddle::framework::TransToPhiDataType(var->GetDataType());
  }
}

DataLayout CompatMetaTensor::layout() const {
  if (is_runtime_) {
    auto* var = BOOST_GET_CONST(Variable*, var_);
    if (var->IsType<phi::DenseTensor>()) {
      return var->Get<phi::DenseTensor>().layout();
    } else if (var->IsType<phi::SelectedRows>()) {
      return var->Get<phi::SelectedRows>().layout();
    } else if (var->IsType<framework::LoDTensorArray>()) {
      // NOTE(chenweihang): do nothing
      // Unsupported get layout from LoDTensorArray now
      return phi::DataLayout::UNDEFINED;
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "Currently, only can get layout from DenseTensor or "
          "SelectedRows."));
    }
  } else {
    // NOTE(chenweihang): do nothing
    // Unsupported get layout for VarDesc now
    return DataLayout::UNDEFINED;
  }
}

void CompatMetaTensor::set_dims(const DDim& dims) {
  if (is_runtime_) {
    auto* var = BOOST_GET(Variable*, var_);
    if (var->IsType<phi::DenseTensor>()) {
      auto* tensor = var->GetMutable<phi::DenseTensor>();
      phi::DenseTensorUtils::GetMutableMeta(tensor)->dims = dims;
    } else if (var->IsType<phi::SelectedRows>()) {
      auto* tensor = var->GetMutable<phi::SelectedRows>()->mutable_value();
      phi::DenseTensorUtils::GetMutableMeta(tensor)->dims = dims;
    } else if (var->IsType<framework::LoDTensorArray>()) {
      auto* tensor_array = var->GetMutable<framework::LoDTensorArray>();
      // Note: Here I want enforce `tensor_array->size() == 0UL`, because
      // inplace using on LoDTensorArray is dangerous, but the unittest
      // `test_list` contains this behavior
      PADDLE_ENFORCE_EQ(dims.size(), 1UL,
                        platform::errors::InvalidArgument(
                            "LoDTensorArray can only have one dimension."));
      // only set the array size for LoDTensorArray input
      tensor_array->resize(dims[0]);
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "Currently, only can set dims from DenseTensor or SelectedRows."));
    }
  } else {
    auto* var = BOOST_GET(VarDesc*, var_);
    var->SetShape(vectorize(dims));
  }
}

void CompatMetaTensor::set_dtype(phi::DataType dtype) {
  if (is_runtime_) {
    auto* var = BOOST_GET(Variable*, var_);
    if (var->IsType<phi::DenseTensor>()) {
      auto* tensor = var->GetMutable<phi::DenseTensor>();
      phi::DenseTensorUtils::GetMutableMeta(tensor)->dtype = dtype;
    } else if (var->IsType<phi::SelectedRows>()) {
      auto* tensor = var->GetMutable<phi::SelectedRows>()->mutable_value();
      phi::DenseTensorUtils::GetMutableMeta(tensor)->dtype = dtype;
    } else if (var->IsType<framework::LoDTensorArray>()) {
      // NOTE(chenweihang): do nothing
      // Unsupported set dtype for LoDTensorArray now
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "Currently, only can set dtype from DenseTensor or SelectedRows."));
    }
  } else {
    auto* var = BOOST_GET(VarDesc*, var_);
    var->SetDataType(paddle::framework::TransToProtoVarType(dtype));
  }
}

void CompatMetaTensor::set_layout(DataLayout layout) {
  if (is_runtime_) {
    auto* var = BOOST_GET(Variable*, var_);
    if (var->IsType<phi::DenseTensor>()) {
      auto* tensor = var->GetMutable<phi::DenseTensor>();
      phi::DenseTensorUtils::GetMutableMeta(tensor)->layout = layout;
    } else if (var->IsType<phi::SelectedRows>()) {
      auto* tensor = var->GetMutable<phi::SelectedRows>()->mutable_value();
      phi::DenseTensorUtils::GetMutableMeta(tensor)->layout = layout;
    } else if (var->IsType<framework::LoDTensorArray>()) {
      // NOTE(chenweihang): do nothing
      // Unsupported set dtype for LoDTensorArray now
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "Currently, only can set layout from DenseTensor or "
          "SelectedRows."));
    }
  } else {
    // NOTE(chenweihang): do nothing
    // Unsupported set layout for VarDesc now
  }
}

void CompatMetaTensor::share_lod(const MetaTensor& meta_tensor) {
  if (is_runtime_) {
    auto* var = BOOST_GET(Variable*, var_);
    if (var->IsType<phi::DenseTensor>()) {
      auto* tensor = var->GetMutable<phi::DenseTensor>();
      phi::DenseTensorUtils::GetMutableMeta(tensor)->lod =
          static_cast<const CompatMetaTensor&>(meta_tensor).GetRuntimeLoD();
    } else {
      // NOTE(chenweihang): do nothing
      // only LoDTensor need to share lod
    }
  } else {
    auto* var = BOOST_GET(VarDesc*, var_);
    var->SetLoDLevel(
        static_cast<const CompatMetaTensor&>(meta_tensor).GetCompileTimeLoD());
  }
}

void CompatMetaTensor::share_dims(const MetaTensor& meta_tensor) {
  set_dims(meta_tensor.dims());
  if (is_runtime_) {
    auto* var = BOOST_GET(Variable*, var_);
    if (var->IsType<phi::SelectedRows>()) {
      auto* selected_rows = var->GetMutable<phi::SelectedRows>();
      auto& input_selected_rows =
          static_cast<const CompatMetaTensor&>(meta_tensor).GetSelectedRows();
      selected_rows->set_rows(input_selected_rows.rows());
      selected_rows->set_height(input_selected_rows.height());
    }
  }
}

void CompatMetaTensor::share_meta(const MetaTensor& meta_tensor) {
  share_dims(meta_tensor);
  set_dtype(meta_tensor.dtype());
  set_layout(meta_tensor.layout());
  // special case: share lod of LoDTensor
  share_lod(meta_tensor);
}
phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
                                            const std::string& op_type) {
...
@@ -18,7 +18,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/phi/core/meta_tensor.h"
namespace phi {
class InferMetaContext;
}  // namespace phi
@@ -39,5 +39,63 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
  } \
  }
// TODO(chenweihang): Support TensorArray later
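// CompatMetaTensor adapts the fluid shape-inference variables to the
// phi::MetaTensor interface: at runtime it wraps a Variable*, at compile
// time a VarDesc*, so phi InferMeta functions can read and write dims,
// dtype, layout and LoD through one API from either context.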
class CompatMetaTensor : public phi::MetaTensor {
public:
CompatMetaTensor(InferShapeVarPtr var, bool is_runtime)
: var_(std::move(var)), is_runtime_(is_runtime) {}
CompatMetaTensor() = default;
CompatMetaTensor(const CompatMetaTensor&) = default;
CompatMetaTensor(CompatMetaTensor&&) = default;
CompatMetaTensor& operator=(const CompatMetaTensor&) = delete;
CompatMetaTensor& operator=(CompatMetaTensor&&) = delete;
int64_t numel() const override;
DDim dims() const override;
phi::DataType dtype() const override;
DataLayout layout() const override;
void set_dims(const DDim& dims) override;
void set_dtype(phi::DataType dtype) override;
void set_layout(DataLayout layout) override;
void share_lod(const MetaTensor& meta_tensor) override;
void share_dims(const MetaTensor& meta_tensor) override;
void share_meta(const MetaTensor& meta_tensor) override;
private:
const LoD& GetRuntimeLoD() const {
auto* var = BOOST_GET_CONST(Variable*, var_);
return var->Get<LoDTensor>().lod();
}
int32_t GetCompileTimeLoD() const {
auto* var = BOOST_GET_CONST(VarDesc*, var_);
return var->GetLoDLevel();
}
const phi::SelectedRows& GetSelectedRows() const {
PADDLE_ENFORCE_EQ(is_runtime_, true,
platform::errors::Unavailable(
"Only can get Tensor from MetaTensor in rumtime."));
auto* var = BOOST_GET_CONST(Variable*, var_);
PADDLE_ENFORCE_EQ(var->IsType<phi::SelectedRows>(), true,
platform::errors::Unavailable(
"The Tensor in MetaTensor is not SelectedRows."));
return var->Get<phi::SelectedRows>();
}
InferShapeVarPtr var_;
bool is_runtime_;
};
}  // namespace framework
}  // namespace paddle
@@ -15,9 +15,12 @@
#pragma once
#include <functional>
#include <future>
#include <memory>
#include <string>
#include <type_traits>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
@@ -25,6 +28,29 @@ namespace framework {
constexpr const char* kQueueEmptyEvent = "QueueEmpty";
constexpr const char* kQueueDestructEvent = "QueueDestruct";
// For std::function
// https://stackoverflow.com/questions/25421346/how-to-create-an-stdfunction-from-a-move-capturing-lambda-expression
template <typename OnlyMovable>
class FakeCopyable {
public:
explicit FakeCopyable(OnlyMovable&& obj) : obj_(std::move(obj)) {
static_assert(std::is_copy_constructible<OnlyMovable>::value == false,
"Need not to use FakeCopyable");
}
FakeCopyable(FakeCopyable&& other) : obj_(std::move(other.obj_)) {}
FakeCopyable(const FakeCopyable& other) {
PADDLE_THROW(platform::errors::Unavailable(
"Never use the copy constructor of FakeCopyable."));
}
OnlyMovable& Get() { return obj_; }
private:
OnlyMovable obj_;
};
class EventsWaiter;
struct WorkQueueOptions {
@@ -78,6 +104,22 @@ class WorkQueue {
  virtual void AddTask(std::function<void()> fn) = 0;
// Higher cost than AddTask
template <typename F, typename... Args>
std::future<typename std::result_of<F(Args...)>::type> AddAwaitableTask(
F&& f, Args&&... args) {
using ReturnType = typename std::result_of<F(Args...)>::type;
std::function<ReturnType()> task =
std::bind(std::forward<F>(f), std::forward<Args>(args)...);
std::promise<ReturnType> prom;
std::future<ReturnType> res = prom.get_future();
AddTask([
t = std::move(task),
p = FakeCopyable<std::promise<ReturnType>>(std::move(prom))
]() mutable { p.Get().set_value(t()); });
return res;
}
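  // AddAwaitableTask wraps the callable and its arguments in a std::function,
  // stores the matching std::promise inside FakeCopyable so the move-only
  // promise can live in the copyable closure required by std::function, and
  // returns the std::future through which the caller waits for the result.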
  // See WorkQueueOptions.track_task for details
  // virtual void WaitQueueEmpty() = 0;
@@ -102,6 +144,22 @@ class WorkQueueGroup {
  virtual void AddTask(size_t queue_idx, std::function<void()> fn) = 0;
// Higher cost than AddTask
template <typename F, typename... Args>
std::future<typename std::result_of<F(Args...)>::type> AddAwaitableTask(
size_t queue_idx, F&& f, Args&&... args) {
using ReturnType = typename std::result_of<F(Args...)>::type;
std::function<ReturnType()> task =
std::bind(std::forward<F>(f), std::forward<Args>(args)...);
std::promise<ReturnType> prom;
std::future<ReturnType> res = prom.get_future();
AddTask(queue_idx, [
t = std::move(task),
p = FakeCopyable<std::promise<ReturnType>>(std::move(prom))
]() mutable { p.Get().set_value(t()); });
return res;
}
  // See WorkQueueOptions.track_task for details
  // virtual void WaitQueueGroupEmpty() = 0;
...
@@ -60,11 +60,13 @@ TEST(WorkQueue, TestSingleThreadedWorkQueue) {
    }
    finished = true;
  });
auto handle = work_queue->AddAwaitableTask([]() { return 1234; });
  // WaitQueueEmpty
  EXPECT_EQ(finished.load(), false);
  events_waiter.WaitEvent();
  EXPECT_EQ(finished.load(), true);
  EXPECT_EQ(counter.load(), kLoopNum);
EXPECT_EQ(handle.get(), 1234);
}
TEST(WorkQueue, TestMultiThreadedWorkQueue) {
@@ -146,6 +148,9 @@ TEST(WorkQueue, TestWorkQueueGroup) {
      ++counter;
    }
  });
int random_num = 123456;
auto handle =
queue_group->AddAwaitableTask(1, [random_num]() { return random_num; });
  // WaitQueueGroupEmpty
  events_waiter.WaitEvent();
  EXPECT_EQ(counter.load(), kLoopNum * kExternalLoopNum + kLoopNum);
@@ -154,4 +159,5 @@ TEST(WorkQueue, TestWorkQueueGroup) {
  events_waiter.WaitEvent();
  queue_group.reset();
  EXPECT_EQ(events_waiter.WaitEvent(), paddle::framework::kQueueDestructEvent);
EXPECT_EQ(handle.get(), random_num);
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Part of the following code in this file refs to
// https://github.com/msracver/Deformable-ConvNets/blob/master/faster_rcnn/operator_cxx/deformable_convolution.cu
//
// Copyright (c) 2017 Microsoft
// Licensed under The Apache-2.0 License [see LICENSE for details]
// \file deformable_psroi_pooling.cu
// \brief
// \author Yi Li, Guodong Zhang, Jifeng Dai
#pragma once
#include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
template <typename T>
HOSTDEVICE T DmcnGetGradientWeight(T argmax_h, T argmax_w, const int h,
const int w, const int height,
const int width) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
T weight = 0;
weight = (h == argmax_h_low && w == argmax_w_low)
? (h + 1 - argmax_h) * (w + 1 - argmax_w)
: weight;
weight = (h == argmax_h_low && w == argmax_w_high)
? (h + 1 - argmax_h) * (argmax_w + 1 - w)
: weight;
weight = (h == argmax_h_high && w == argmax_w_low)
? (argmax_h + 1 - h) * (w + 1 - argmax_w)
: weight;
weight = (h == argmax_h_high && w == argmax_w_high)
? (argmax_h + 1 - h) * (argmax_w + 1 - w)
: weight;
return weight;
}
template <typename T>
HOSTDEVICE T DmcnGetCoordinateWeight(T argmax_h, T argmax_w, const int height,
const int width, const T* im_data,
const int data_width, const int bp_dir) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
T weight = 0;
if (bp_dir == 0) {
weight += (argmax_h_low >= 0 && argmax_w_low >= 0)
? -1 * (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_low * data_width + argmax_w_low]
: 0;
weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1)
? -1 * (argmax_w - argmax_w_low) *
im_data[argmax_h_low * data_width + argmax_w_high]
: 0;
weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0)
? (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_high * data_width + argmax_w_low]
: 0;
weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
? (argmax_w - argmax_w_low) *
im_data[argmax_h_high * data_width + argmax_w_high]
: 0;
} else if (bp_dir == 1) {
weight += (argmax_h_low >= 0 && argmax_w_low >= 0)
? -1 * (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_low]
: 0;
weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1)
? (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_high]
: 0;
weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0)
? -1 * (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_low]
: 0;
weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
? (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_high]
: 0;
}
return weight;
}
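// DmcnIm2colBilinear samples the input at a fractional position (h, w) by
// bilinear interpolation: the four surrounding pixels are weighted by the
// products of the fractional distances, and neighbors that fall outside the
// image contribute 0.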
template <typename T>
HOSTDEVICE T DmcnIm2colBilinear(const T* bottom_data, const int data_width,
const int height, const int width, T h, T w) {
int h_low = floor(h);
int w_low = floor(w);
int h_high = h_low + 1;
int w_high = w_low + 1;
T lh = h - h_low;
T lw = w - w_low;
T hh = 1 - lh;
T hw = 1 - lw;
T v1 =
(h_low >= 0 && w_low >= 0) ? bottom_data[h_low * data_width + w_low] : 0;
T v2 = (h_low >= 0 && w_high <= width - 1)
? bottom_data[h_low * data_width + w_high]
: 0;
T v3 = (h_high <= height - 1 && w_low >= 0)
? bottom_data[h_high * data_width + w_low]
: 0;
T v4 = (h_high <= height - 1 && w_high <= width - 1)
? bottom_data[h_high * data_width + w_high]
: 0;
T w1 = hh * hw;
T w2 = hh * lw;
T w3 = lh * hw;
T w4 = lh * lw;
return w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
}
@@ -12,9 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/deformable_conv_op.h"
#include <memory>
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/multiary.h"
namespace paddle {
namespace operators {
@@ -108,158 +110,6 @@ $$
class DeformableConvOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "deformable_conv");
OP_INOUT_CHECK(ctx->HasInput("Offset"), "Input", "Offset",
"deformable_conv)");
OP_INOUT_CHECK(ctx->HasInput("Mask"), "Input", "Mask", "deformable_conv");
OP_INOUT_CHECK(ctx->HasInput("Filter"), "Input", "Filter",
"deformable_conv");
OP_INOUT_CHECK(ctx->HasOutput("Output"), "Output", "Output",
"deformable_conv");
auto in_dims = ctx->GetInputDim("Input");
auto filter_dims = ctx->GetInputDim("Filter");
auto offset_dims = ctx->GetInputDim("Offset");
auto mask_dims = ctx->GetInputDim("Mask");
std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
std::vector<int> dilations =
ctx->Attrs().Get<std::vector<int>>("dilations");
int groups = ctx->Attrs().Get<int>("groups");
int deformable_groups = ctx->Attrs().Get<int>("deformable_groups");
int im2col_step = ctx->Attrs().Get<int>("im2col_step");
PADDLE_ENFORCE_EQ(
in_dims.size(), 4,
platform::errors::InvalidArgument(
"Conv input should be 4-D tensor, get %u", in_dims.size()));
PADDLE_ENFORCE_EQ(in_dims.size(), filter_dims.size(),
platform::errors::InvalidArgument(
"Conv input dimension and filter dimension should be "
"the same. The difference is [%d]: [%d]",
in_dims.size(), filter_dims.size()));
PADDLE_ENFORCE_EQ(in_dims.size() - strides.size(), 2U,
platform::errors::InvalidArgument(
"Conv input dimension and strides "
"dimension should be consistent. But received input "
"dimension:[%d], strides dimension:[%d]",
in_dims.size(), strides.size()));
PADDLE_ENFORCE_EQ(paddings.size(), strides.size(),
platform::errors::InvalidArgument(
"Conv paddings dimension and Conv strides dimension "
"should be the same. The difference is [%d]: [%d]",
paddings.size(), strides.size()));
PADDLE_ENFORCE_EQ(
in_dims[1], filter_dims[1] * groups,
platform::errors::InvalidArgument(
"The number of input channels should be equal to filter "
"channels * groups. The difference is [%d]: [%d]",
in_dims[1], filter_dims[1] * groups));
PADDLE_ENFORCE_EQ(
filter_dims[0] % groups, 0,
platform::errors::InvalidArgument(
"The number of output channels should be divided by groups. But "
"received output channels:[%d], groups:[%d]",
filter_dims[0], groups));
PADDLE_ENFORCE_EQ(
filter_dims[0] % deformable_groups, 0,
platform::errors::InvalidArgument(
"The number of output channels should be "
"divided by deformable groups. The difference is [%d]: [%d]",
filter_dims[0] % groups, 0));
if (in_dims[0] > im2col_step) {
PADDLE_ENFORCE_EQ(
in_dims[0] % im2col_step, 0U,
platform::errors::InvalidArgument(
"Input batchsize must be smaller than or divide im2col_step. But "
"received Input batchsize:[%d], im2col_step:[%d]",
in_dims[0], im2col_step));
}
for (size_t i = 0; i < strides.size(); ++i) {
PADDLE_ENFORCE_GT(strides[i], 0U, platform::errors::InvalidArgument(
"stride %d size incorrect", i));
}
for (size_t i = 0; i < dilations.size(); ++i) {
PADDLE_ENFORCE_GT(dilations[i], 0U, platform::errors::InvalidArgument(
"dilation %d size incorrect", i));
}
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
if ((!ctx->IsRuntime()) &&
(in_dims[i + 2] <= 0 || filter_dims[i + 2] <= 0)) {
output_shape.push_back(-1);
} else {
output_shape.push_back(ConvOutputSize(in_dims[i + 2],
filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
}
PADDLE_ENFORCE_EQ(
output_shape[1] % deformable_groups, 0U,
platform::errors::InvalidArgument(
"output num_filter must divide deformable group size. But received "
"output num_filter:[%d], deformable group size:[%d]",
output_shape[1], deformable_groups));
if (ctx->IsRuntime()) {
PADDLE_ENFORCE_EQ(output_shape[2], offset_dims[2],
platform::errors::InvalidArgument(
"output height must equal to offset map height. "
"The difference is [%d]: [%d]",
output_shape[2], offset_dims[2]));
PADDLE_ENFORCE_EQ(output_shape[3], offset_dims[3],
platform::errors::InvalidArgument(
"output width must equal to offset map width. The "
"difference is [%d]: [%d]",
output_shape[3], offset_dims[3]));
PADDLE_ENFORCE_EQ(offset_dims[1] % (filter_dims[2] * filter_dims[3]), 0U,
platform::errors::InvalidArgument(
"offset filter must divide deformable group size. "
"But received [%d]: [%d]",
offset_dims[1], filter_dims[2] * filter_dims[3]));
PADDLE_ENFORCE_EQ(
offset_dims[1] / (2 * filter_dims[2] * filter_dims[3]),
deformable_groups,
platform::errors::InvalidArgument(
"offset filter must divide deformable group size. But received "
"[%d]: [%d]",
offset_dims[1] / (2 * filter_dims[2] * filter_dims[3]),
deformable_groups));
PADDLE_ENFORCE_EQ(output_shape[2], mask_dims[2],
platform::errors::InvalidArgument(
"output height must equal to mask map height. The "
"difference is [%d] vs [%d]",
output_shape[2], mask_dims[2]));
PADDLE_ENFORCE_EQ(output_shape[3], mask_dims[3],
platform::errors::InvalidArgument(
"output width must equal to mask map width. The "
"difference is [%d] vs [%d]",
output_shape[3], mask_dims[3]));
PADDLE_ENFORCE_EQ(mask_dims[1] % (filter_dims[2] * filter_dims[3]), 0U,
platform::errors::InvalidArgument(
"mask filter must divide deformable group size. "
"But received [%d]: [%d]",
mask_dims[1], filter_dims[2] * filter_dims[3]));
PADDLE_ENFORCE_EQ(mask_dims[1] / (filter_dims[2] * filter_dims[3]),
deformable_groups,
platform::errors::InvalidArgument(
"mask filter must divide deformable group size. "
"But received [%d]: [%d]",
mask_dims[1] / (filter_dims[2] * filter_dims[3]),
deformable_groups));
}
ctx->SetOutputDim("Output", phi::make_ddim(output_shape));
}
 protected:
  framework::OpKernelType GetExpectedKernelType(
@@ -331,13 +181,13 @@ class DeformableConvGradOp : public framework::OperatorWithKernel {
}  // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(deformable_conv, DeformableConvInferShapeFunctor,
                            PD_INFER_META(phi::DeformableConvInferMeta));
REGISTER_OPERATOR(deformable_conv, ops::DeformableConvOp,
                  ops::DeformableConvOpMaker,
                  ops::DeformableConvGradOpMaker<paddle::framework::OpDesc>,
                  ops::DeformableConvGradOpMaker<paddle::imperative::OpBase>,
                  DeformableConvInferShapeFunctor);
REGISTER_OPERATOR(deformable_conv_grad, ops::DeformableConvGradOp);
REGISTER_OP_CPU_KERNEL(deformable_conv_grad,
ops::DeformableConvGradCPUKernel<float>,
ops::DeformableConvGradCPUKernel<double>);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Part of the following code in this file refs to
// https://github.com/msracver/Deformable-ConvNets/blob/master/DCNv2_op/nn/modulated_deformable_im2col.cuh
//
// Copyright (c) 2018 Microsoft
// Licensed under The MIT License [see LICENSE for details]
// \file modulated_deformable_im2col.cuh
// \brief
// \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/deformable_conv_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaximumNumBlocks = 4096;
static inline int NumBlocks(const int N) {
return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
kNumMaximumNumBlocks);
}
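// Kernel launches cap the grid at kNumMaximumNumBlocks; each kernel below
// iterates with a grid-stride loop (thread index advanced by
// blockDim.x * gridDim.x), so a capped grid still covers all nthreads
// elements.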
template <typename T>
__device__ T DmcnGetGradientWeight(T argmax_h, T argmax_w, const int h,
const int w, const int height,
const int width) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
T weight = 0;
if (h == argmax_h_low && w == argmax_w_low)
weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
if (h == argmax_h_low && w == argmax_w_high)
weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
if (h == argmax_h_high && w == argmax_w_low)
weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
if (h == argmax_h_high && w == argmax_w_high)
weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
return weight;
}
template <typename T>
__global__ void ModulatedDeformableCol2imGpuKernel(
const int nthreads, const T* data_col, const T* data_offset,
const T* data_mask, const int channels, const int height, const int width,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int channel_per_deformable_group,
const int batch_size, const int deformable_group, const int height_col,
const int width_col, T* grad_im) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x;
for (size_t thread = index; thread < nthreads; thread += offset) {
const int j = (thread / width_col / height_col / batch_size) % kernel_w;
const int i =
(thread / width_col / height_col / batch_size / kernel_w) % kernel_h;
const int c =
thread / width_col / height_col / batch_size / kernel_w / kernel_h;
const int deformable_group_index = c / channel_per_deformable_group;
int w_out = thread % width_col;
int h_out = (thread / width_col) % height_col;
int b = (thread / width_col / height_col) % batch_size;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const T* data_offset_ptr = data_offset +
(b * deformable_group + deformable_group_index) *
2 * kernel_h * kernel_w * height_col *
width_col;
const T* data_mask_ptr = data_mask +
(b * deformable_group + deformable_group_index) *
kernel_h * kernel_w * height_col * width_col;
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
const int data_mask_hw_ptr =
((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T mask = data_mask_ptr[data_mask_hw_ptr];
const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
const T cur_top_grad = data_col[thread] * mask;
const int cur_h = static_cast<int>(cur_inv_h_data);
const int cur_w = static_cast<int>(cur_inv_w_data);
for (int dy = -2; dy <= 2; dy++) {
for (int dx = -2; dx <= 2; dx++) {
if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
abs(cur_inv_w_data - (cur_w + dx)) < 1) {
int cur_bottom_grad_pos =
((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
T weight =
DmcnGetGradientWeight(cur_inv_h_data, cur_inv_w_data, cur_h + dy,
cur_w + dx, height, width);
platform::CudaAtomicAdd(grad_im + cur_bottom_grad_pos,
weight * cur_top_grad);
}
}
}
}
}
template <typename T>
inline void ModulatedDeformableCol2im(
const platform::DeviceContext& ctx, const T* data_col, const T* data_offset,
const T* data_mask, const std::vector<int64_t> im_shape,
const std::vector<int64_t> col_shape,
const std::vector<int64_t> kernel_shape, const std::vector<int> pad,
const std::vector<int> stride, const std::vector<int> dilation,
const int deformable_group, T* grad_im) {
int channel_per_deformable_group = im_shape[0] / deformable_group;
int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
int blocks = NumBlocks(num_kernels);
int threads = kNumCUDAThreads;
ModulatedDeformableCol2imGpuKernel<T><<<
blocks, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
num_kernels, data_col, data_offset, data_mask, im_shape[0], im_shape[1],
im_shape[2], kernel_shape[2], kernel_shape[3], pad[0], pad[1], stride[0],
stride[1], dilation[0], dilation[1], channel_per_deformable_group,
col_shape[1], deformable_group, col_shape[2], col_shape[3], grad_im);
}
template <typename T>
__device__ T DmcnGetCoordinateWeight(T argmax_h, T argmax_w, const int height,
const int width, const T* im_data,
const int data_width, const int bp_dir) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
T weight = 0;
if (bp_dir == 0) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += -1 * (argmax_w - argmax_w_low) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_w - argmax_w_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
} else if (bp_dir == 1) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += -1 * (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
}
return weight;
}
template <typename T>
__device__ T DmcnIm2colBilinear(const T* bottom_data, const int data_width,
const int height, const int width, T h, T w) {
int h_low = floor(h);
int w_low = floor(w);
int h_high = h_low + 1;
int w_high = w_low + 1;
T lh = h - h_low;
T lw = w - w_low;
T hh = 1 - lh, hw = 1 - lw;
T v1 = 0;
if (h_low >= 0 && w_low >= 0) v1 = bottom_data[h_low * data_width + w_low];
T v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
v2 = bottom_data[h_low * data_width + w_high];
T v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
v3 = bottom_data[h_high * data_width + w_low];
T v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
v4 = bottom_data[h_high * data_width + w_high];
T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
template <typename T>
__global__ void ModulatedDeformableCol2imCoordGpuKernel(
const int nthreads, const T* data_col, const T* data_im,
const T* data_offset, const T* data_mask, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int offset_channels, const int deformable_group, const int height_col,
const int width_col, T* grad_offset, T* grad_mask) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x;
for (size_t i = index; i < nthreads; i += offset) {
T val = 0, mval = 0;
const int w = i % width_col;
const int h = (i / width_col) % height_col;
const int c = (i / width_col / height_col) % offset_channels;
const int b = (i / width_col / height_col) / offset_channels;
const int deformable_group_index = c / (2 * kernel_h * kernel_w);
const int col_step = kernel_h * kernel_w;
int cnt = 0;
const T* data_col_ptr = data_col +
deformable_group_index *
channel_per_deformable_group * batch_size *
width_col * height_col;
const T* data_im_ptr = data_im +
(b * deformable_group + deformable_group_index) *
channel_per_deformable_group / kernel_h /
kernel_w * height * width;
const T* data_offset_ptr = data_offset +
(b * deformable_group + deformable_group_index) *
2 * kernel_h * kernel_w * height_col *
width_col;
const T* data_mask_ptr = data_mask +
(b * deformable_group + deformable_group_index) *
kernel_h * kernel_w * height_col * width_col;
const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
for (int col_c = offset_c / 2; col_c < channel_per_deformable_group;
col_c += col_step) {
const int col_pos =
(((col_c * batch_size + b) * height_col) + h) * width_col + w;
const int bp_dir = offset_c % 2;
int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
int i =
(col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
int w_out = col_pos % width_col;
int h_out = (col_pos / width_col) % height_col;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const int data_offset_h_ptr =
(((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
const int data_offset_w_ptr =
(((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
w_out);
const int data_mask_hw_ptr =
(((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T mask = data_mask_ptr[data_mask_hw_ptr];
T inv_h = h_in + i * dilation_h + offset_h;
T inv_w = w_in + j * dilation_w + offset_w;
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
inv_h = inv_w = -2;
} else {
mval += data_col_ptr[col_pos] *
DmcnIm2colBilinear(data_im_ptr + cnt * height * width, width,
height, width, inv_h, inv_w);
}
const T weight = DmcnGetCoordinateWeight(
inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,
width, bp_dir);
val += weight * data_col_ptr[col_pos] * mask;
cnt += 1;
}
grad_offset[i] = val;
if (offset_c % 2 == 0)
grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h *
kernel_w +
offset_c / 2) *
height_col +
h) *
width_col +
w] = mval;
}
}
template <typename T>
inline void ModulatedDeformableCol2imCoord(
const platform::DeviceContext& ctx, const T* data_col, const T* data_im,
const T* data_offset, const T* data_mask,
const std::vector<int64_t> im_shape, const std::vector<int64_t> col_shape,
const std::vector<int64_t> kernel_shape, const std::vector<int> paddings,
const std::vector<int> strides, const std::vector<int> dilations,
const int deformable_groups, T* grad_offset, T* grad_mask) {
int num_kernels = 2 * kernel_shape[2] * kernel_shape[3] * col_shape[1] *
col_shape[2] * col_shape[3] * deformable_groups;
int channel_per_deformable_group = col_shape[0] / deformable_groups;
int blocks = NumBlocks(num_kernels);
int threads = kNumCUDAThreads;
ModulatedDeformableCol2imCoordGpuKernel<T><<<
blocks, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
num_kernels, data_col, data_im, data_offset, data_mask, im_shape[0],
im_shape[1], im_shape[2], kernel_shape[2], kernel_shape[3], paddings[0],
paddings[1], strides[0], strides[1], dilations[0], dilations[1],
channel_per_deformable_group, col_shape[1],
2 * kernel_shape[2] * kernel_shape[3] * deformable_groups,
deformable_groups, col_shape[2], col_shape[3], grad_offset, grad_mask);
}
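// ModulatedDeformableIm2colGpuKernel builds the column buffer used by the
// GEMMs: for every output location and kernel tap it reads the learned
// offset pair and modulation mask, bilinearly samples the input at the
// displaced position, scales the sample by the mask, and writes it into
// data_col.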
template <typename T>
__global__ void ModulatedDeformableIm2colGpuKernel(
const int nthreads, const T* data_im, const T* data_offset,
const T* data_mask, const int height, const int width, const int kernel_h,
const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int num_channels, const int deformable_group, const int height_col,
const int width_col, T* data_col) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x;
for (size_t i = index; i < nthreads; i += offset) {
const int w_col = i % width_col;
const int h_col = (i / width_col) % height_col;
const int b_col = (i / width_col) / height_col % batch_size;
const int c_im = (i / width_col / height_col) / batch_size;
const int c_col = c_im * kernel_h * kernel_w;
const int deformable_group_index = c_im / channel_per_deformable_group;
const int h_in = h_col * stride_h - pad_h;
const int w_in = w_col * stride_w - pad_w;
T* data_col_ptr =
data_col +
((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
const T* data_im_ptr =
data_im + (b_col * num_channels + c_im) * height * width;
const T* data_offset_ptr =
data_offset +
(b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
kernel_w * height_col * width_col;
const T* data_mask_ptr =
data_mask +
(b_col * deformable_group + deformable_group_index) * kernel_h *
kernel_w * height_col * width_col;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
w_col;
const int data_mask_hw_ptr =
((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T mask = data_mask_ptr[data_mask_hw_ptr];
T val = static_cast<T>(0);
const T h_im = h_in + i * dilation_h + offset_h;
const T w_im = w_in + j * dilation_w + offset_w;
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
val =
DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im);
}
*data_col_ptr = val * mask;
data_col_ptr += batch_size * height_col * width_col;
}
}
}
}
template <typename T>
inline void ModulatedDeformableIm2col(
const platform::DeviceContext& ctx, const T* data_im, const T* data_offset,
const T* data_mask, const std::vector<int64_t> im_shape,
const std::vector<int64_t> col_shape,
const std::vector<int64_t> filter_shape, const std::vector<int> paddings,
const std::vector<int> strides, const std::vector<int> dilations,
const int deformable_groups, T* data_col) {
int channel_per_deformable_group = im_shape[0] / deformable_groups;
int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
int blocks = NumBlocks(num_kernels);
int threads = kNumCUDAThreads;
ModulatedDeformableIm2colGpuKernel<T><<<
blocks, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
num_kernels, data_im, data_offset, data_mask, im_shape[1], im_shape[2],
filter_shape[2], filter_shape[3], paddings[0], paddings[1], strides[0],
strides[1], dilations[0], dilations[1], channel_per_deformable_group,
col_shape[1], im_shape[0], deformable_groups, col_shape[2], col_shape[3],
data_col);
}
template <typename T>
__global__ void FilterGradAddupGpuKernel(const int nthreads, const int n,
const int height, const int width,
const T* dweight_3d, T* filter_grad) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x;
for (size_t i = index; i < nthreads; i += offset) {
filter_grad[i] = filter_grad[i] + dweight_3d[i];
}
}
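// DeformableConvGradCUDAKernel processes the batch in slices of im2col_step:
// for each slice it (1) back-propagates the output gradient through the GEMM
// into the column buffer, (2) uses ModulatedDeformableCol2imCoord to produce
// the offset and mask gradients, (3) uses ModulatedDeformableCol2im to
// accumulate the input gradient, and (4) re-runs ModulatedDeformableIm2col on
// the input so the filter gradient can be computed per group as
// out_grad * col^T and accumulated by FilterGradAddupGpuKernel.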
template <typename DeviceContext, typename T>
class DeformableConvGradCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const Tensor* output_grad =
ctx.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
Tensor* offset_grad = ctx.Output<Tensor>(framework::GradVarName("Offset"));
Tensor* mask_grad = ctx.Output<Tensor>(framework::GradVarName("Mask"));
const Tensor* input = ctx.Input<Tensor>("Input");
Tensor offset = *ctx.Input<Tensor>("Offset");
Tensor mask = *ctx.Input<Tensor>("Mask");
Tensor filter = *ctx.Input<Tensor>("Filter");
if (!input_grad && !filter_grad && !offset_grad && !mask_grad) return;
int groups = ctx.Attr<int>("groups");
int deformable_groups = ctx.Attr<int>("deformable_groups");
int im2col_step = ctx.Attr<int>("im2col_step");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
auto& dev_ctx = ctx.cuda_device_context();
const int batch_size = static_cast<int>(input->dims()[0]);
framework::DDim input_shape =
phi::slice_ddim(input->dims(), 1, input->dims().size());
std::vector<int64_t> input_shape_vec = phi::vectorize(input_shape);
std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(phi::vectorize(output_grad->dims()));
std::vector<int64_t> col_buffer_shape_vec(filter_shape_vec.size());
col_buffer_shape_vec[0] =
input->dims()[1] * filter.dims()[2] * filter.dims()[3];
col_buffer_shape_vec[1] = im2col_step;
for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) {
col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2];
}
framework::DDim col_shape(phi::make_ddim(col_buffer_shape_vec));
std::vector<int64_t> output_buffer_shape_vec(1);
output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] *
output_shape_vec[2] * output_shape_vec[3];
framework::DDim output_shape(phi::make_ddim(output_buffer_shape_vec));
Tensor col_buffer;
Tensor output_buffer;
col_buffer = ctx.AllocateTmpTensor<T, DeviceContext>(col_shape, dev_ctx);
output_buffer =
ctx.AllocateTmpTensor<T, DeviceContext>(output_shape, dev_ctx);
output_buffer.ShareDataWith(*output_grad);
int64_t M =
input_shape_vec[0] / groups * filter_shape_vec[2] * filter_shape_vec[3];
int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
int64_t K = output_shape_vec[1] / groups;
framework::DDim weight_3d_shape = {groups, K, M};
framework::DDim out_grad_4d_shape = {batch_size / im2col_step, groups, K,
N};
framework::DDim col_buffer_3d_shape = {groups, M, N};
framework::DDim filter_grad_shape = {groups, K, M};
Tensor weight_3d;
weight_3d.ShareDataWith(filter).Resize(weight_3d_shape);
Tensor out_grad_4d;
out_grad_4d.ShareDataWith(output_buffer).Resize(out_grad_4d_shape);
Tensor col_buffer_3d;
col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape);
phi::funcs::SetConstant<DeviceContext, T> set_zero;
auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
col_buffer.mutable_data<T>(ctx.GetPlace());
col_buffer_3d.mutable_data<T>(ctx.GetPlace());
out_grad_4d.mutable_data<T>(ctx.GetPlace());
int input_dim = input->numel() / input->dims()[0];
int input_offset_dim = offset.numel() / offset.dims()[0];
int input_mask_dim = mask.numel() / mask.dims()[0];
if (filter_grad) {
filter_grad->mutable_data<T>(ctx.GetPlace());
filter_grad->Resize(filter_grad_shape);
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
}
if (input_grad) {
input_grad->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, input_grad, static_cast<T>(0));
}
if (offset_grad && mask_grad) {
offset_grad->mutable_data<T>(ctx.GetPlace());
mask_grad->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, offset_grad, static_cast<T>(0));
set_zero(dev_ctx, mask_grad, static_cast<T>(0));
}
for (int i = 0; i < batch_size / im2col_step; ++i) {
Tensor out_grad_3d = out_grad_4d.Slice(i, i + 1).Resize(
phi::slice_ddim(out_grad_4d.dims(), 1, out_grad_4d.dims().size()));
for (int g = 0; g < groups; ++g) {
Tensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize(
phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size()));
Tensor out_grad_3d_slice = out_grad_3d.Slice(g, g + 1).Resize(
phi::slice_ddim(out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
Tensor col_buffer_3d_slice =
col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
blas.MatMul(weight_3d_slice, true, out_grad_3d_slice, false, T(1.0),
&col_buffer_3d_slice, T(0.0));
}
col_buffer.Resize(col_shape);
T* col_buffer_ptr = col_buffer.data<T>();
const T* input_ptr = input->data<T>();
const T* offset_ptr = offset.data<T>();
const T* mask_ptr = mask.data<T>();
if (mask_grad && offset_grad) {
T* offset_grad_ptr = offset_grad->data<T>();
T* mask_grad_ptr = mask_grad->data<T>();
ModulatedDeformableCol2imCoord(
ctx.device_context(), col_buffer_ptr,
input_ptr + i * im2col_step * input_dim,
offset_ptr + i * im2col_step * input_offset_dim,
mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec,
col_buffer_shape_vec, filter_shape_vec, paddings, strides,
dilations, deformable_groups,
offset_grad_ptr + i * im2col_step * input_offset_dim,
mask_grad_ptr + i * im2col_step * input_mask_dim);
}
if (input_grad) {
T* input_grad_ptr = input_grad->data<T>();
ModulatedDeformableCol2im(
ctx.device_context(), col_buffer_ptr,
offset_ptr + i * im2col_step * input_offset_dim,
mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec,
col_buffer_shape_vec, filter_shape_vec, paddings, strides,
dilations, deformable_groups,
input_grad_ptr + i * im2col_step * input_dim);
input_grad->Resize(input->dims());
}
ModulatedDeformableIm2col(
ctx.device_context(), input_ptr + i * im2col_step * input_dim,
offset_ptr + i * im2col_step * input_offset_dim,
mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec,
col_buffer_shape_vec, filter_shape_vec, paddings, strides, dilations,
deformable_groups, col_buffer_ptr);
col_buffer_3d.Resize(col_buffer_3d_shape);
if (filter_grad) {
Tensor dweight_3d;
dweight_3d =
ctx.AllocateTmpTensor<T, DeviceContext>(filter_grad_shape, dev_ctx);
for (int g = 0; g < groups; ++g) {
Tensor out_grad_3d_slice =
out_grad_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
Tensor col_buffer_3d_slice =
col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
Tensor dweight_3d_slice = dweight_3d.Slice(g, g + 1).Resize(
phi::slice_ddim(dweight_3d.dims(), 1, dweight_3d.dims().size()));
blas.MatMul(out_grad_3d_slice, false, col_buffer_3d_slice, true,
T(1.0), &dweight_3d_slice, T(0.0));
}
FilterGradAddupGpuKernel<
T><<<NumBlocks(dweight_3d.numel()), kNumCUDAThreads, 0,
ctx.cuda_device_context().stream()>>>(
dweight_3d.numel(), groups, K, M, dweight_3d.data<T>(),
filter_grad->data<T>());
}
}
if (filter_grad) {
filter_grad->Resize(filter.dims());
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(deformable_conv_grad,
ops::DeformableConvGradCUDAKernel<CUDA, float>,
ops::DeformableConvGradCUDAKernel<CUDA, double>);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Part of the following code in this file refs to
// https://github.com/msracver/Deformable-ConvNets/blob/master/faster_rcnn/operator_cxx/deformable_convolution.cu
//
// Copyright (c) 2017 Microsoft
// Licensed under The Apache-2.0 License [see LICENSE for details]
// \file deformable_psroi_pooling.cu
// \brief
// \author Yi Li, Guodong Zhang, Jifeng Dai
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/deformable_conv_func.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using CPUDeviceContext = platform::CPUDeviceContext;
template <typename T>
void ModulatedDeformableCol2imCPUKernel(
const int num_kernels, const T* data_col, const T* data_offset,
const T* data_mask, const int channels, const int height, const int width,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int channel_per_deformable_group,
const int batch_size, const int deformable_group, const int height_col,
const int width_col, T* grad_im) {
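  // Each iteration handles one element of the column buffer: it recovers the
  // fractional sampling location (input position + dilation + learned offset),
  // scales the incoming gradient by the modulation mask, and scatters it to
  // the neighbouring integer pixels with bilinear gradient weights.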
for (int thread = 0; thread < num_kernels; thread++) {
const int j = (thread / width_col / height_col / batch_size) % kernel_w;
const int i =
(thread / width_col / height_col / batch_size / kernel_w) % kernel_h;
const int c =
thread / width_col / height_col / batch_size / kernel_w / kernel_h;
const int deformable_group_index = c / channel_per_deformable_group;
int w_out = thread % width_col;
int h_out = (thread / width_col) % height_col;
int b = (thread / width_col / height_col) % batch_size;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const T* data_offset_ptr = data_offset +
(b * deformable_group + deformable_group_index) *
2 * kernel_h * kernel_w * height_col *
width_col;
const T* data_mask_ptr = data_mask +
(b * deformable_group + deformable_group_index) *
kernel_h * kernel_w * height_col * width_col;
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
const int data_mask_hw_ptr =
((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T mask = data_mask_ptr[data_mask_hw_ptr];
const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
const T cur_top_grad = data_col[thread] * mask;
const int cur_h = static_cast<int>(cur_inv_h_data);
const int cur_w = static_cast<int>(cur_inv_w_data);
for (int dy = -2; dy <= 2; dy++) {
for (int dx = -2; dx <= 2; dx++) {
if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
abs(cur_inv_w_data - (cur_w + dx)) < 1) {
int cur_bottom_grad_pos =
((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
T weight =
DmcnGetGradientWeight(cur_inv_h_data, cur_inv_w_data, cur_h + dy,
cur_w + dx, height, width);
*(grad_im + cur_bottom_grad_pos) =
*(grad_im + cur_bottom_grad_pos) + weight * cur_top_grad;
}
}
}
}
}
template <typename T>
static inline void ModulatedDeformableCol2imCPU(
const platform::CPUDeviceContext& ctx, const T* data_col,
const T* data_offset, const T* data_mask,
const std::vector<int64_t> im_shape, const std::vector<int64_t> col_shape,
const std::vector<int64_t> kernel_shape, const std::vector<int> pad,
const std::vector<int> stride, const std::vector<int> dilation,
const int deformable_group, T* grad_im) {
int channel_per_deformable_group = im_shape[0] / deformable_group;
int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
ModulatedDeformableCol2imCPUKernel(
num_kernels, data_col, data_offset, data_mask, im_shape[0], im_shape[1],
im_shape[2], kernel_shape[2], kernel_shape[3], pad[0], pad[1], stride[0],
stride[1], dilation[0], dilation[1], channel_per_deformable_group,
col_shape[1], deformable_group, col_shape[2], col_shape[3], grad_im);
}
template <typename T>
void ModulatedDeformableCol2imCoordCPUKernel(
const int num_kernels, const T* data_col, const T* data_im,
const T* data_offset, const T* data_mask, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int offset_channels, const int deformable_group, const int height_col,
const int width_col, T* grad_offset, T* grad_mask) {
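  // Each iteration handles one offset channel entry (x or y of one kernel
  // tap). It accumulates d(loss)/d(offset) through the bilinear coordinate
  // weight and, for even offset channels (one per kernel tap), also writes
  // d(loss)/d(mask) as the bilinearly sampled input value weighted by the
  // incoming column gradient.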
for (int i = 0; i < num_kernels; i++) {
T val = 0, mval = 0;
const int w = i % width_col;
const int h = (i / width_col) % height_col;
const int c = (i / width_col / height_col) % offset_channels;
const int b = (i / width_col / height_col) / offset_channels;
const int deformable_group_index = c / (2 * kernel_h * kernel_w);
const int col_step = kernel_h * kernel_w;
int cnt = 0;
const T* data_col_ptr = data_col +
deformable_group_index *
channel_per_deformable_group * batch_size *
width_col * height_col;
const T* data_im_ptr = data_im +
(b * deformable_group + deformable_group_index) *
channel_per_deformable_group / kernel_h /
kernel_w * height * width;
const T* data_offset_ptr = data_offset +
(b * deformable_group + deformable_group_index) *
2 * kernel_h * kernel_w * height_col *
width_col;
const T* data_mask_ptr = data_mask +
(b * deformable_group + deformable_group_index) *
kernel_h * kernel_w * height_col * width_col;
const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
for (int col_c = offset_c / 2; col_c < channel_per_deformable_group;
col_c += col_step) {
const int col_pos =
(((col_c * batch_size + b) * height_col) + h) * width_col + w;
const int bp_dir = offset_c % 2;
int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
int i =
(col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
int w_out = col_pos % width_col;
int h_out = (col_pos / width_col) % height_col;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const int data_offset_h_ptr =
(((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
const int data_offset_w_ptr =
(((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
w_out);
const int data_mask_hw_ptr =
(((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T mask = data_mask_ptr[data_mask_hw_ptr];
T inv_h = h_in + i * dilation_h + offset_h;
T inv_w = w_in + j * dilation_w + offset_w;
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
inv_h = inv_w = -2;
} else {
mval += data_col_ptr[col_pos] *
DmcnIm2colBilinear(data_im_ptr + cnt * height * width, width,
height, width, inv_h, inv_w);
}
const T weight = DmcnGetCoordinateWeight(
inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,
width, bp_dir);
val += weight * data_col_ptr[col_pos] * mask;
cnt += 1;
}
grad_offset[i] = val;
if (offset_c % 2 == 0)
grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h *
kernel_w +
offset_c / 2) *
height_col +
h) *
width_col +
w] = mval;
}
}
template <typename T>
static inline void ModulatedDeformableCol2imCoordCPU(
const platform::CPUDeviceContext& ctx, const T* data_col, const T* data_im,
const T* data_offset, const T* data_mask,
const std::vector<int64_t> im_shape, const std::vector<int64_t> col_shape,
const std::vector<int64_t> kernel_shape, const std::vector<int> paddings,
const std::vector<int> strides, const std::vector<int> dilations,
const int deformable_groups, T* grad_offset, T* grad_mask) {
int num_kernels = 2 * kernel_shape[2] * kernel_shape[3] * col_shape[1] *
col_shape[2] * col_shape[3] * deformable_groups;
int channel_per_deformable_group = col_shape[0] / deformable_groups;
ModulatedDeformableCol2imCoordCPUKernel(
num_kernels, data_col, data_im, data_offset, data_mask, im_shape[0],
im_shape[1], im_shape[2], kernel_shape[2], kernel_shape[3], paddings[0],
paddings[1], strides[0], strides[1], dilations[0], dilations[1],
channel_per_deformable_group, col_shape[1],
2 * kernel_shape[2] * kernel_shape[3] * deformable_groups,
deformable_groups, col_shape[2], col_shape[3], grad_offset, grad_mask);
}
template <typename T>
void ModulatedDeformableIm2colCPUKernel(
const int num_kernels, const T* data_im, const T* data_offset,
const T* data_mask, const int height, const int width, const int kernel_h,
const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int num_channels, const int deformable_group, const int height_col,
const int width_col, T* data_col) {
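  // Modulated im2col: for every output location and input channel, sample the
  // input at (base position + dilation + learned offset) with bilinear
  // interpolation, multiply by the modulation mask, and write the result into
  // the column buffer.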
for (int i = 0; i < num_kernels; i++) {
const int w_col = i % width_col;
const int h_col = (i / width_col) % height_col;
const int b_col = (i / width_col) / height_col % batch_size;
const int c_im = (i / width_col / height_col) / batch_size;
const int c_col = c_im * kernel_h * kernel_w;
const int deformable_group_index = c_im / channel_per_deformable_group;
const int h_in = h_col * stride_h - pad_h;
const int w_in = w_col * stride_w - pad_w;
T* data_col_ptr =
data_col +
((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
const T* data_im_ptr =
data_im + (b_col * num_channels + c_im) * height * width;
const T* data_offset_ptr =
data_offset +
(b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
kernel_w * height_col * width_col;
const T* data_mask_ptr =
data_mask +
(b_col * deformable_group + deformable_group_index) * kernel_h *
kernel_w * height_col * width_col;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
w_col;
const int data_mask_hw_ptr =
((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T mask = data_mask_ptr[data_mask_hw_ptr];
T val = static_cast<T>(0);
const T h_im = h_in + i * dilation_h + offset_h;
const T w_im = w_in + j * dilation_w + offset_w;
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
val =
DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im);
}
*data_col_ptr = val * mask;
data_col_ptr += batch_size * height_col * width_col;
}
}
}
}
template <typename T>
static inline void ModulatedDeformableIm2colCPU(
const platform::CPUDeviceContext& ctx, const T* data_im,
const T* data_offset, const T* data_mask,
const std::vector<int64_t> im_shape, const std::vector<int64_t> col_shape,
const std::vector<int64_t> filter_shape, const std::vector<int> paddings,
const std::vector<int> strides, const std::vector<int> dilations,
const int deformable_groups, T* data_col) {
int channel_per_deformable_group = im_shape[0] / deformable_groups;
int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
// get outputs of im2col with offset by bilinear interpolation
ModulatedDeformableIm2colCPUKernel(
num_kernels, data_im, data_offset, data_mask, im_shape[1], im_shape[2],
filter_shape[2], filter_shape[3], paddings[0], paddings[1], strides[0],
strides[1], dilations[0], dilations[1], channel_per_deformable_group,
col_shape[1], im_shape[0], deformable_groups, col_shape[2], col_shape[3],
data_col);
}
template <typename T>
void FilterGradAddupCPUKernel(const int nthreads, const int n, const int height,
const int width, const T* dweight_3d,
T* filter_grad) {
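  // Accumulate the per-slice weight gradient into the (flattened) filter
  // gradient; n, height and width are unused in this CPU version.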
for (int i = 0; i < nthreads; i++) {
filter_grad[i] = filter_grad[i] + dweight_3d[i];
}
}
template <typename T>
class DeformableConvGradCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const Tensor* output_grad =
ctx.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
Tensor* offset_grad = ctx.Output<Tensor>(framework::GradVarName("Offset"));
Tensor* mask_grad = ctx.Output<Tensor>(framework::GradVarName("Mask"));
const Tensor* input = ctx.Input<Tensor>("Input");
Tensor offset = *ctx.Input<Tensor>("Offset");
Tensor mask = *ctx.Input<Tensor>("Mask");
Tensor filter = *ctx.Input<Tensor>("Filter");
if (!input_grad && !filter_grad && !offset_grad && !mask_grad) return;
int groups = ctx.Attr<int>("groups");
int deformable_groups = ctx.Attr<int>("deformable_groups");
int im2col_step = ctx.Attr<int>("im2col_step");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
auto& dev_ctx = ctx.template device_context<CPUDeviceContext>();
const int batch_size = static_cast<int>(input->dims()[0]);
framework::DDim input_shape =
phi::slice_ddim(input->dims(), 1, input->dims().size());
std::vector<int64_t> input_shape_vec = phi::vectorize(input_shape);
std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(phi::vectorize(output_grad->dims()));
std::vector<int64_t> col_buffer_shape_vec(filter_shape_vec.size());
col_buffer_shape_vec[0] =
input->dims()[1] * filter.dims()[2] * filter.dims()[3];
col_buffer_shape_vec[1] = im2col_step;
for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) {
col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2];
}
framework::DDim col_shape(phi::make_ddim(col_buffer_shape_vec));
std::vector<int64_t> output_buffer_shape_vec(1);
output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] *
output_shape_vec[2] * output_shape_vec[3];
framework::DDim output_shape(phi::make_ddim(output_buffer_shape_vec));
Tensor col_buffer;
Tensor output_buffer;
col_buffer = ctx.AllocateTmpTensor<T, CPUDeviceContext>(col_shape, dev_ctx);
output_buffer =
ctx.AllocateTmpTensor<T, CPUDeviceContext>(output_shape, dev_ctx);
output_buffer.ShareDataWith(*output_grad);
int64_t M =
input_shape_vec[0] / groups * filter_shape_vec[2] * filter_shape_vec[3];
int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
int64_t K = output_shape_vec[1] / groups;
framework::DDim weight_3d_shape = {groups, K, M};
framework::DDim out_grad_4d_shape = {batch_size / im2col_step, groups, K,
N};
framework::DDim col_buffer_3d_shape = {groups, M, N};
framework::DDim filter_grad_shape = {groups, K, M};
Tensor weight_3d;
weight_3d.ShareDataWith(filter).Resize(weight_3d_shape);
Tensor out_grad_4d;
out_grad_4d.ShareDataWith(output_buffer).Resize(out_grad_4d_shape);
Tensor col_buffer_3d;
col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape);
phi::funcs::SetConstant<CPUDeviceContext, T> set_zero;
auto blas = phi::funcs::GetBlas<CPUDeviceContext, T>(dev_ctx);
col_buffer.mutable_data<T>(ctx.GetPlace());
col_buffer_3d.mutable_data<T>(ctx.GetPlace());
out_grad_4d.mutable_data<T>(ctx.GetPlace());
int input_dim = input->numel() / input->dims()[0];
int input_offset_dim = offset.numel() / offset.dims()[0];
int input_mask_dim = mask.numel() / mask.dims()[0];
if (filter_grad) {
filter_grad->mutable_data<T>(ctx.GetPlace());
filter_grad->Resize(filter_grad_shape);
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
}
if (input_grad) {
input_grad->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, input_grad, static_cast<T>(0));
}
if (offset_grad && mask_grad) {
offset_grad->mutable_data<T>(ctx.GetPlace());
mask_grad->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, offset_grad, static_cast<T>(0));
set_zero(dev_ctx, mask_grad, static_cast<T>(0));
}
for (int i = 0; i < batch_size / im2col_step; ++i) {
Tensor out_grad_3d = out_grad_4d.Slice(i, i + 1).Resize(
phi::slice_ddim(out_grad_4d.dims(), 1, out_grad_4d.dims().size()));
for (int g = 0; g < groups; ++g) {
Tensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize(
phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size()));
Tensor out_grad_3d_slice = out_grad_3d.Slice(g, g + 1).Resize(
phi::slice_ddim(out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
Tensor col_buffer_3d_slice =
col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
blas.MatMul(weight_3d_slice, true, out_grad_3d_slice, false, T(1.0),
&col_buffer_3d_slice, T(0.0));
}
col_buffer.Resize(col_shape);
T* col_buffer_ptr = col_buffer.data<T>();
const T* input_ptr = input->data<T>();
const T* offset_ptr = offset.data<T>();
const T* mask_ptr = mask.data<T>();
if (mask_grad && offset_grad) {
T* offset_grad_ptr = offset_grad->data<T>();
T* mask_grad_ptr = mask_grad->data<T>();
// get grad of offset and mask
ModulatedDeformableCol2imCoordCPU(
ctx.template device_context<CPUDeviceContext>(), col_buffer_ptr,
input_ptr + i * im2col_step * input_dim,
offset_ptr + i * im2col_step * input_offset_dim,
mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec,
col_buffer_shape_vec, filter_shape_vec, paddings, strides,
dilations, deformable_groups,
offset_grad_ptr + i * im2col_step * input_offset_dim,
mask_grad_ptr + i * im2col_step * input_mask_dim);
}
if (input_grad) {
T* input_grad_ptr = input_grad->data<T>();
// get grad of input
ModulatedDeformableCol2imCPU(
ctx.template device_context<CPUDeviceContext>(), col_buffer_ptr,
offset_ptr + i * im2col_step * input_offset_dim,
mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec,
col_buffer_shape_vec, filter_shape_vec, paddings, strides,
dilations, deformable_groups,
input_grad_ptr + i * im2col_step * input_dim);
input_grad->Resize(input->dims());
}
ModulatedDeformableIm2colCPU(
ctx.template device_context<CPUDeviceContext>(),
input_ptr + i * im2col_step * input_dim,
offset_ptr + i * im2col_step * input_offset_dim,
mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec,
col_buffer_shape_vec, filter_shape_vec, paddings, strides, dilations,
deformable_groups, col_buffer_ptr);
col_buffer_3d.Resize(col_buffer_3d_shape);
if (filter_grad) {
Tensor dweight_3d;
dweight_3d = ctx.AllocateTmpTensor<T, CPUDeviceContext>(
filter_grad_shape, dev_ctx);
for (int g = 0; g < groups; ++g) {
Tensor out_grad_3d_slice =
out_grad_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
Tensor col_buffer_3d_slice =
col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
Tensor dweight_3d_slice = dweight_3d.Slice(g, g + 1).Resize(
phi::slice_ddim(dweight_3d.dims(), 1, dweight_3d.dims().size()));
blas.MatMul(out_grad_3d_slice, false, col_buffer_3d_slice, true,
T(1.0), &dweight_3d_slice, T(0.0));
}
// update grad of weights
FilterGradAddupCPUKernel(dweight_3d.numel(), groups, K, M,
dweight_3d.data<T>(), filter_grad->data<T>());
}
}
if (filter_grad) {
filter_grad->Resize(filter.dims());
}
}
};
} // namespace operators
} // namespace paddle
...@@ -12,9 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/deformable_conv_v1_op.h"
#include <memory>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/multiary.h"
namespace paddle {
namespace operators {
...@@ -113,128 +115,6 @@ $$
class DeformableConvV1Op : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input",
"deformable_conv_v1");
OP_INOUT_CHECK(ctx->HasInput("Offset"), "Input", "Offset",
"deformable_conv_v1");
OP_INOUT_CHECK(ctx->HasInput("Filter"), "Input", "Filter",
"deformable_conv_v1");
OP_INOUT_CHECK(ctx->HasOutput("Output"), "Output", "Output",
"deformable_conv_v1");
auto in_dims = ctx->GetInputDim("Input");
auto filter_dims = ctx->GetInputDim("Filter");
auto offset_dims = ctx->GetInputDim("Offset");
std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
std::vector<int> dilations =
ctx->Attrs().Get<std::vector<int>>("dilations");
int groups = ctx->Attrs().Get<int>("groups");
int deformable_groups = ctx->Attrs().Get<int>("deformable_groups");
int im2col_step = ctx->Attrs().Get<int>("im2col_step");
PADDLE_ENFORCE_EQ(
in_dims.size(), 4,
platform::errors::InvalidArgument(
"Conv input should be 4-D tensor, get %u", in_dims.size()));
PADDLE_ENFORCE_EQ(in_dims.size(), filter_dims.size(),
platform::errors::InvalidArgument(
"Conv input dimension and filter dimension should be "
"the same. the difference is [%d] vs [%d]",
in_dims.size(), filter_dims.size()));
PADDLE_ENFORCE_EQ(
in_dims.size() - strides.size(), 2U,
platform::errors::InvalidArgument(
"Conv input dimension and strides "
"dimension should be consistent., But received [%d]: [%d]",
in_dims.size(), strides.size()));
PADDLE_ENFORCE_EQ(paddings.size(), strides.size(),
platform::errors::InvalidArgument(
"Conv paddings dimension and Conv strides dimension "
"should be the same. The difference is [%d] vs [%d]",
paddings.size(), strides.size()));
PADDLE_ENFORCE_EQ(
in_dims[1], filter_dims[1] * groups,
platform::errors::InvalidArgument(
"The number of input channels should be equal to filter "
"channels * groups. The difference is [%d]: [%d]",
in_dims[1], filter_dims[1] * groups));
PADDLE_ENFORCE_EQ(
filter_dims[0] % groups, 0,
platform::errors::InvalidArgument(
"The number of output channels should be divided by groups. But"
"received output channels: [%d], groups: [%d]",
filter_dims[0], groups));
PADDLE_ENFORCE_EQ(
filter_dims[0] % deformable_groups, 0,
platform::errors::InvalidArgument(
"The number of output channels should be "
"divided by deformable groups. But received [%d]: [%d]",
filter_dims[0], deformable_groups));
if (in_dims[0] > im2col_step) {
PADDLE_ENFORCE_EQ(in_dims[0] % im2col_step, 0U,
platform::errors::InvalidArgument(
"Input batchsize must be smaller than or divide "
"im2col_step, But received [%d]: [%d]",
in_dims[0], im2col_step));
}
for (size_t i = 0; i < strides.size(); ++i) {
PADDLE_ENFORCE_GT(strides[i], 0U, platform::errors::InvalidArgument(
"stride %d size incorrect", i));
}
for (size_t i = 0; i < dilations.size(); ++i) {
PADDLE_ENFORCE_GT(dilations[i], 0U, platform::errors::InvalidArgument(
"dilation %d size incorrect", i));
}
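    // The spatial output size follows the usual convolution formula,
    // o = (i + 2 * pad - (dilation * (k - 1) + 1)) / stride + 1.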
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
if ((!ctx->IsRuntime()) &&
(in_dims[i + 2] <= 0 || filter_dims[i + 2] <= 0)) {
output_shape.push_back(-1);
} else {
output_shape.push_back(ConvOutputSize(in_dims[i + 2],
filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
}
if (ctx->IsRuntime()) {
PADDLE_ENFORCE_EQ(output_shape[1] % deformable_groups, 0U,
platform::errors::InvalidArgument(
"output num_filter must divide deformable group "
"size. But received [%d]: [%d]",
output_shape[1], deformable_groups));
PADDLE_ENFORCE_EQ(output_shape[2], offset_dims[2],
platform::errors::InvalidArgument(
"output height must equal to offset map height. "
"The difference is [%d]: [%d]",
output_shape[2], offset_dims[2]));
PADDLE_ENFORCE_EQ(output_shape[3], offset_dims[3],
platform::errors::InvalidArgument(
"output width must equal to offset map width. The "
"difference is [%d]: [%d]",
output_shape[3], offset_dims[3]));
PADDLE_ENFORCE_EQ(offset_dims[1] % (filter_dims[2] * filter_dims[3]), 0U,
platform::errors::InvalidArgument(
"offset filter must divide deformable group size. "
"But received [%d]: [%d]",
offset_dims[1], filter_dims[2] * filter_dims[3]));
PADDLE_ENFORCE_EQ(
offset_dims[1] / (2 * filter_dims[2] * filter_dims[3]),
deformable_groups,
platform::errors::InvalidArgument(
"offset filter must divide deformable group size. But received "
"[%d]: [%d]",
offset_dims[1] / (2 * filter_dims[2] * filter_dims[3]),
deformable_groups));
}
ctx->SetOutputDim("Output", phi::make_ddim(output_shape));
}
 protected:
  framework::OpKernelType GetExpectedKernelType(
...@@ -300,15 +180,12 @@ class DeformableConvV1GradOp : public framework::OperatorWithKernel {
}  // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(deformable_conv, DeformableConvV1InferShapeFunctor,
                            PD_INFER_META(phi::DeformableConvInferMeta));
REGISTER_OPERATOR(deformable_conv_v1, ops::DeformableConvV1Op,
                  ops::DeformableConvV1OpMaker,
                  ops::DeformableConvV1GradOpMaker<paddle::framework::OpDesc>,
                  ops::DeformableConvV1GradOpMaker<paddle::imperative::OpBase>,
                  DeformableConvV1InferShapeFunctor);
REGISTER_OPERATOR(deformable_conv_v1_grad, ops::DeformableConvV1GradOp);
REGISTER_OP_CPU_KERNEL(deformable_conv_v1,
ops::DeformableConvV1CPUKernel<float>,
ops::DeformableConvV1CPUKernel<double>);
REGISTER_OP_CPU_KERNEL(deformable_conv_v1_grad,
ops::DeformableConvV1GradCPUKernel<float>,
ops::DeformableConvV1GradCPUKernel<double>);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Part of the following code in this file refs to
// https://github.com/msracver/Deformable-ConvNets/blob/master/faster_rcnn/operator_cxx/deformable_convolution.cu
//
// Copyright (c) 2017 Microsoft
// Licensed under The Apache-2.0 License [see LICENSE for details]
// \file deformable_psroi_pooling.cu
// \brief
// \author Yi Li, Guodong Zhang, Jifeng Dai
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/deformable_conv_filter.cu.h"
#include "paddle/fluid/operators/deformable_conv_func.h"
#include "paddle/fluid/operators/deformable_conv_v1_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using CUDADeviceContext = paddle::platform::CUDADeviceContext;
static constexpr int kNumCUDAThread = 512;
static constexpr int kNumMaximumNumBlock = 4096;
static inline int NumBlock(const int N) {
return std::min((N + kNumCUDAThread - 1) / kNumCUDAThread,
kNumMaximumNumBlock);
}
template <typename T>
__global__ void DeformableCol2imCUDAKernel(
const int nthreads, const T* data_col, const T* data_offset,
const int channels, const int height, const int width, const int kernel_h,
const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int deformable_group, const int height_col, const int width_col,
T* grad_im) {
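  // Grid-stride loop over column-buffer elements; each one scatters its
  // gradient to the integer pixels around the offset sampling location using
  // bilinear gradient weights. CudaAtomicAdd is required because several
  // columns can map to the same input pixel.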
int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x;
for (size_t thread = index; thread < nthreads; thread += offset) {
const int j = (thread / width_col / height_col / batch_size) % kernel_w;
const int i =
(thread / width_col / height_col / batch_size / kernel_w) % kernel_h;
const int c =
thread / width_col / height_col / batch_size / kernel_w / kernel_h;
const int deformable_group_index = c / channel_per_deformable_group;
int w_out = thread % width_col;
int h_out = (thread / width_col) % height_col;
int b = (thread / width_col / height_col) % batch_size;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const T* data_offset_ptr = data_offset +
(b * deformable_group + deformable_group_index) *
2 * kernel_h * kernel_w * height_col *
width_col;
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
const T cur_top_grad = data_col[thread];
const int cur_h = static_cast<int>(cur_inv_h_data);
const int cur_w = static_cast<int>(cur_inv_w_data);
for (int dy = -2; dy <= 2; dy++) {
for (int dx = -2; dx <= 2; dx++) {
if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
abs(cur_inv_w_data - (cur_w + dx)) < 1) {
int cur_bottom_grad_pos =
((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
T weight =
DmcnGetGradientWeight(cur_inv_h_data, cur_inv_w_data, cur_h + dy,
cur_w + dx, height, width);
platform::CudaAtomicAdd(grad_im + cur_bottom_grad_pos,
weight * cur_top_grad);
}
}
}
}
}
template <typename T>
inline void DeformableCol2im(const platform::CUDADeviceContext& ctx,
const T* data_col, const T* data_offset,
const std::vector<int64_t> im_shape,
const std::vector<int64_t> col_shape,
const std::vector<int64_t> kernel_shape,
const std::vector<int> pad,
const std::vector<int> stride,
const std::vector<int> dilation,
const int deformable_group, T* grad_im) {
int channel_per_deformable_group = im_shape[0] / deformable_group;
int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
int blocks = NumBlock(num_kernels);
int threads = kNumCUDAThread;
DeformableCol2imCUDAKernel<T><<<
blocks, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
num_kernels, data_col, data_offset, im_shape[0], im_shape[1], im_shape[2],
kernel_shape[2], kernel_shape[3], pad[0], pad[1], stride[0], stride[1],
dilation[0], dilation[1], channel_per_deformable_group, col_shape[1],
deformable_group, col_shape[2], col_shape[3], grad_im);
}
template <typename T>
__global__ void DeformableCol2imCoordCUDAKernel(
const int nthreads, const T* data_col, const T* data_im,
const T* data_offset, const int channels, const int height, const int width,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int channel_per_deformable_group,
const int batch_size, const int offset_channels, const int deformable_group,
const int height_col, const int width_col, T* grad_offset) {
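  // One thread per offset channel entry (x or y of a kernel tap); it sums the
  // coordinate-weight contributions of every input channel in the deformable
  // group that samples with this offset to form d(loss)/d(offset). The v1 op
  // has no modulation mask, so mval is computed but unused here.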
int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x;
for (size_t i = index; i < nthreads; i += offset) {
T val = 0, mval = 0;
const int w = i % width_col;
const int h = (i / width_col) % height_col;
const int c = (i / width_col / height_col) % offset_channels;
const int b = (i / width_col / height_col) / offset_channels;
const int deformable_group_index = c / (2 * kernel_h * kernel_w);
const int col_step = kernel_h * kernel_w;
int cnt = 0;
const T* data_col_ptr = data_col +
deformable_group_index *
channel_per_deformable_group * batch_size *
width_col * height_col;
const T* data_im_ptr = data_im +
(b * deformable_group + deformable_group_index) *
channel_per_deformable_group / kernel_h /
kernel_w * height * width;
const T* data_offset_ptr = data_offset +
(b * deformable_group + deformable_group_index) *
2 * kernel_h * kernel_w * height_col *
width_col;
const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
for (int col_c = offset_c / 2; col_c < channel_per_deformable_group;
col_c += col_step) {
const int col_pos =
(((col_c * batch_size + b) * height_col) + h) * width_col + w;
const int bp_dir = offset_c % 2;
int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
int i =
(col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
int w_out = col_pos % width_col;
int h_out = (col_pos / width_col) % height_col;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const int data_offset_h_ptr =
(((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
const int data_offset_w_ptr =
(((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
w_out);
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
T inv_h = h_in + i * dilation_h + offset_h;
T inv_w = w_in + j * dilation_w + offset_w;
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
inv_h = inv_w = -2;
} else {
mval += data_col_ptr[col_pos] *
DmcnIm2colBilinear(data_im_ptr + cnt * height * width, width,
height, width, inv_h, inv_w);
}
const T weight = DmcnGetCoordinateWeight(
inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,
width, bp_dir);
val += weight * data_col_ptr[col_pos];
cnt += 1;
}
grad_offset[i] = val;
}
}
template <typename T>
inline void DeformableCol2imCoord(
const platform::CUDADeviceContext& ctx, const T* data_col, const T* data_im,
const T* data_offset, const std::vector<int64_t> im_shape,
const std::vector<int64_t> col_shape,
const std::vector<int64_t> kernel_shape, const std::vector<int> paddings,
const std::vector<int> strides, const std::vector<int> dilations,
const int deformable_groups, T* grad_offset) {
int num_kernels = 2 * kernel_shape[2] * kernel_shape[3] * col_shape[1] *
col_shape[2] * col_shape[3] * deformable_groups;
int channel_per_deformable_group = col_shape[0] / deformable_groups;
int blocks = NumBlock(num_kernels);
int threads = kNumCUDAThread;
DeformableCol2imCoordCUDAKernel<T><<<
blocks, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
num_kernels, data_col, data_im, data_offset, im_shape[0], im_shape[1],
im_shape[2], kernel_shape[2], kernel_shape[3], paddings[0], paddings[1],
strides[0], strides[1], dilations[0], dilations[1],
channel_per_deformable_group, col_shape[1],
2 * kernel_shape[2] * kernel_shape[3] * deformable_groups,
deformable_groups, col_shape[2], col_shape[3], grad_offset);
}
template <typename T>
__global__ void DeformableIm2colCUDAKernel(
const int nthreads, const T* data_im, const T* data_offset,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int num_channels, const int deformable_group, const int height_col,
const int width_col, T* data_col) {
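  // Deformable im2col (v1): sample the input at base position + dilation +
  // learned offset with bilinear interpolation and write it into the column
  // buffer; unlike the modulated version there is no mask multiplication.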
int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x;
for (size_t i = index; i < nthreads; i += offset) {
const int w_col = i % width_col;
const int h_col = (i / width_col) % height_col;
const int b_col = (i / width_col) / height_col % batch_size;
const int c_im = (i / width_col / height_col) / batch_size;
const int c_col = c_im * kernel_h * kernel_w;
const int deformable_group_index = c_im / channel_per_deformable_group;
const int h_in = h_col * stride_h - pad_h;
const int w_in = w_col * stride_w - pad_w;
T* data_col_ptr =
data_col +
((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
const T* data_im_ptr =
data_im + (b_col * num_channels + c_im) * height * width;
const T* data_offset_ptr =
data_offset +
(b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
kernel_w * height_col * width_col;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
w_col;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
T val = static_cast<T>(0);
const T h_im = h_in + i * dilation_h + offset_h;
const T w_im = w_in + j * dilation_w + offset_w;
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
val =
DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im);
}
*data_col_ptr = val;
data_col_ptr += batch_size * height_col * width_col;
}
}
}
}
template <typename T>
inline void DeformableIm2col(const platform::CUDADeviceContext& ctx,
const T* data_im, const T* data_offset,
const std::vector<int64_t> im_shape,
const std::vector<int64_t> col_shape,
const std::vector<int64_t> filter_shape,
const std::vector<int> paddings,
const std::vector<int> strides,
const std::vector<int> dilations,
const int deformable_groups, T* data_col) {
int channel_per_deformable_group = im_shape[0] / deformable_groups;
int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
int blocks = NumBlock(num_kernels);
int threads = kNumCUDAThread;
// get outputs of im2col with offset by bilinear interpolation
DeformableIm2colCUDAKernel<T><<<
blocks, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
num_kernels, data_im, data_offset, im_shape[1], im_shape[2],
filter_shape[2], filter_shape[3], paddings[0], paddings[1], strides[0],
strides[1], dilations[0], dilations[1], channel_per_deformable_group,
col_shape[1], im_shape[0], deformable_groups, col_shape[2], col_shape[3],
data_col);
}
template <typename T>
class DeformableConvV1CUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const Tensor* input = ctx.Input<Tensor>("Input");
const Tensor offset = *ctx.Input<Tensor>("Offset");
Tensor filter = *ctx.Input<Tensor>("Filter");
Tensor* output = ctx.Output<Tensor>("Output");
output->mutable_data<T>(ctx.GetPlace());
auto& dev_ctx = ctx.template device_context<CUDADeviceContext>();
const int groups = ctx.Attr<int>("groups");
const int deformable_groups = ctx.Attr<int>("deformable_groups");
const int im2col_step = ctx.Attr<int>("im2col_step");
const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
const std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
const std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(phi::vectorize(output->dims()));
// col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w}
std::vector<int64_t> col_buffer_shape_vec(filter_shape_vec.size());
col_buffer_shape_vec[0] =
input->dims()[1] * filter.dims()[2] * filter.dims()[3];
col_buffer_shape_vec[1] = im2col_step;
for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) {
col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2];
}
framework::DDim col_shape(phi::make_ddim(col_buffer_shape_vec));
std::vector<int64_t> output_buffer_shape_vec(1);
output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] *
output_shape_vec[2] * output_shape_vec[3];
framework::DDim output_shape(phi::make_ddim(output_buffer_shape_vec));
Tensor col_buffer;
Tensor output_buffer;
col_buffer =
ctx.AllocateTmpTensor<T, CUDADeviceContext>(col_shape, dev_ctx);
output_buffer =
ctx.AllocateTmpTensor<T, CUDADeviceContext>(output_shape, dev_ctx);
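    // Per-group GEMM dimensions for the forward pass:
    //   M = C_out / groups, N = im2col_step * o_h * o_w,
    //   K = C_in / groups * k_h * k_w;
    // output[g] (M x N) = weight[g] (M x K) * col[g] (K x N).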
int64_t M = output_shape_vec[1] / groups;
int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
int64_t K =
input->dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups;
Tensor weight_3d;
weight_3d.ShareDataWith(filter).Resize(phi::make_ddim({groups, M, K}));
Tensor col_buffer_3d;
col_buffer_3d.ShareDataWith(col_buffer)
.Resize(phi::make_ddim({groups, K, N}));
Tensor output_4d;
output_4d.ShareDataWith(output_buffer)
.Resize(phi::make_ddim({batch_size / im2col_step, groups, M, N}));
output_4d.mutable_data<T>(ctx.GetPlace());
framework::DDim input_shape =
phi::slice_ddim(input->dims(), 1, input->dims().size());
std::vector<int64_t> input_shape_vec = phi::vectorize(input_shape);
int input_dim = input->numel() / input->dims()[0];
int input_offset_dim = offset.numel() / offset.dims()[0];
auto blas = phi::funcs::GetBlas<CUDADeviceContext, T>(dev_ctx);
const T* input_ptr = input->data<T>();
const T* offset_ptr = offset.data<T>();
col_buffer.mutable_data<T>(ctx.GetPlace());
T* col_buffer_ptr = col_buffer.data<T>();
for (int i = 0; i < batch_size / im2col_step; ++i) {
DeformableIm2col(dev_ctx, input_ptr + i * im2col_step * input_dim,
offset_ptr + i * im2col_step * input_offset_dim,
input_shape_vec, col_buffer_shape_vec, filter_shape_vec,
paddings, strides, dilations, deformable_groups,
col_buffer_ptr);
Tensor output_3d = output_4d.Slice(i, i + 1).Resize(
phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size()));
// get the product of pixel and weight
for (int g = 0; g < groups; ++g) {
Tensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize(
phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size()));
Tensor col_buffer_3d_slice =
col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
Tensor output_3d_slice = output_3d.Slice(g, g + 1).Resize(
phi::slice_ddim(output_3d.dims(), 1, output_3d.dims().size()));
blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, false, T(1.0),
&output_3d_slice, T(0.0));
}
}
output->ShareDataWith(output_buffer)
.Resize(phi::make_ddim(output_shape_vec));
}
};
template <typename T>
class DeformableConvV1GradCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const Tensor* output_grad =
ctx.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
Tensor* offset_grad = ctx.Output<Tensor>(framework::GradVarName("Offset"));
const Tensor* input = ctx.Input<Tensor>("Input");
Tensor offset = *ctx.Input<Tensor>("Offset");
Tensor filter = *ctx.Input<Tensor>("Filter");
if (!input_grad && !filter_grad && !offset_grad) return;
int groups = ctx.Attr<int>("groups");
int deformable_groups = ctx.Attr<int>("deformable_groups");
int im2col_step = ctx.Attr<int>("im2col_step");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
auto& dev_ctx = ctx.template device_context<CUDADeviceContext>();
const int batch_size = static_cast<int>(input->dims()[0]);
framework::DDim input_shape =
phi::slice_ddim(input->dims(), 1, input->dims().size());
std::vector<int64_t> input_shape_vec = phi::vectorize(input_shape);
std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(phi::vectorize(output_grad->dims()));
std::vector<int64_t> col_buffer_shape_vec(filter_shape_vec.size());
col_buffer_shape_vec[0] =
input->dims()[1] * filter.dims()[2] * filter.dims()[3];
col_buffer_shape_vec[1] = im2col_step;
for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) {
col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2];
}
framework::DDim col_shape(phi::make_ddim(col_buffer_shape_vec));
std::vector<int64_t> output_buffer_shape_vec(1);
output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] *
output_shape_vec[2] * output_shape_vec[3];
framework::DDim output_shape(phi::make_ddim(output_buffer_shape_vec));
Tensor col_buffer;
Tensor output_buffer;
col_buffer =
ctx.AllocateTmpTensor<T, CUDADeviceContext>(col_shape, dev_ctx);
output_buffer =
ctx.AllocateTmpTensor<T, CUDADeviceContext>(output_shape, dev_ctx);
output_buffer.ShareDataWith(*output_grad);
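    // Backward GEMM dimensions (same convention as the modulated kernel):
    //   M = C_in / groups * k_h * k_w, N = im2col_step * o_h * o_w,
    //   K = C_out / groups.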
int64_t M =
input_shape_vec[0] / groups * filter_shape_vec[2] * filter_shape_vec[3];
int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
int64_t K = output_shape_vec[1] / groups;
framework::DDim weight_3d_shape = {groups, K, M};
framework::DDim out_grad_4d_shape = {batch_size / im2col_step, groups, K,
N};
framework::DDim col_buffer_3d_shape = {groups, M, N};
framework::DDim filter_grad_shape = {groups, K, M};
Tensor weight_3d;
weight_3d.ShareDataWith(filter).Resize(weight_3d_shape);
Tensor out_grad_4d;
out_grad_4d.ShareDataWith(output_buffer).Resize(out_grad_4d_shape);
Tensor col_buffer_3d;
col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape);
phi::funcs::SetConstant<CUDADeviceContext, T> set_zero;
auto blas = phi::funcs::GetBlas<CUDADeviceContext, T>(dev_ctx);
col_buffer.mutable_data<T>(ctx.GetPlace());
col_buffer_3d.mutable_data<T>(ctx.GetPlace());
out_grad_4d.mutable_data<T>(ctx.GetPlace());
int input_dim = input->numel() / input->dims()[0];
int input_offset_dim = offset.numel() / offset.dims()[0];
if (filter_grad) {
filter_grad->mutable_data<T>(ctx.GetPlace());
filter_grad->Resize(filter_grad_shape);
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
}
if (input_grad) {
input_grad->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, input_grad, static_cast<T>(0));
}
if (offset_grad) {
offset_grad->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, offset_grad, static_cast<T>(0));
}
for (int i = 0; i < batch_size / im2col_step; ++i) {
Tensor out_grad_3d = out_grad_4d.Slice(i, i + 1).Resize(
phi::slice_ddim(out_grad_4d.dims(), 1, out_grad_4d.dims().size()));
for (int g = 0; g < groups; ++g) {
Tensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize(
phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size()));
Tensor out_grad_3d_slice = out_grad_3d.Slice(g, g + 1).Resize(
phi::slice_ddim(out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
Tensor col_buffer_3d_slice =
col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
blas.MatMul(weight_3d_slice, true, out_grad_3d_slice, false, T(1.0),
&col_buffer_3d_slice, T(0.0));
}
col_buffer.Resize(col_shape);
T* col_buffer_ptr = col_buffer.data<T>();
const T* input_ptr = input->data<T>();
const T* offset_ptr = offset.data<T>();
if (offset_grad) {
T* offset_grad_ptr = offset_grad->data<T>();
// get grad of offset
DeformableCol2imCoord(
dev_ctx, col_buffer_ptr, input_ptr + i * im2col_step * input_dim,
offset_ptr + i * im2col_step * input_offset_dim, input_shape_vec,
col_buffer_shape_vec, filter_shape_vec, paddings, strides,
dilations, deformable_groups,
offset_grad_ptr + i * im2col_step * input_offset_dim);
}
if (input_grad) {
T* input_grad_ptr = input_grad->data<T>();
// get grad of input
DeformableCol2im(dev_ctx, col_buffer_ptr,
offset_ptr + i * im2col_step * input_offset_dim,
input_shape_vec, col_buffer_shape_vec,
filter_shape_vec, paddings, strides, dilations,
deformable_groups,
input_grad_ptr + i * im2col_step * input_dim);
input_grad->Resize(input->dims());
}
DeformableIm2col(dev_ctx, input_ptr + i * im2col_step * input_dim,
offset_ptr + i * im2col_step * input_offset_dim,
input_shape_vec, col_buffer_shape_vec, filter_shape_vec,
paddings, strides, dilations, deformable_groups,
col_buffer_ptr);
col_buffer_3d.Resize(col_buffer_3d_shape);
if (filter_grad) {
Tensor dweight_3d;
dweight_3d = ctx.AllocateTmpTensor<T, CUDADeviceContext>(
filter_grad_shape, dev_ctx);
for (int g = 0; g < groups; ++g) {
Tensor out_grad_3d_slice =
out_grad_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
Tensor col_buffer_3d_slice =
col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
Tensor dweight_3d_slice = dweight_3d.Slice(g, g + 1).Resize(
phi::slice_ddim(dweight_3d.dims(), 1, dweight_3d.dims().size()));
blas.MatMul(out_grad_3d_slice, false, col_buffer_3d_slice, true,
T(1.0), &dweight_3d_slice, T(0.0));
}
FilterGradAddupCUDAKernel<T><<<NumBlock(dweight_3d.numel()),
kNumCUDAThread, 0, dev_ctx.stream()>>>(
dweight_3d.numel(), groups, K, M, dweight_3d.data<T>(),
filter_grad->data<T>());
}
}
if (filter_grad) {
filter_grad->Resize(filter.dims());
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(deformable_conv_v1,
ops::DeformableConvV1CUDAKernel<float>,
ops::DeformableConvV1CUDAKernel<double>);
REGISTER_OP_CUDA_KERNEL(deformable_conv_v1_grad,
ops::DeformableConvV1GradCUDAKernel<float>,
ops::DeformableConvV1GradCUDAKernel<double>);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Part of the following code in this file refs to
// https://github.com/msracver/Deformable-ConvNets/blob/master/faster_rcnn/operator_cxx/deformable_convolution.cu
//
// Copyright (c) 2017 Microsoft
// Licensed under The Apache-2.0 License [see LICENSE for details]
// \file deformable_psroi_pooling.cu
// \brief
// \author Yi Li, Guodong Zhang, Jifeng Dai
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/deformable_conv_func.h"
#include "paddle/fluid/operators/deformable_conv_op.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using CPUDeviceContext = platform::CPUDeviceContext;
template <typename T>
void DeformableCol2imCPUKernel(
const int num_kernels, const T* data_col, const T* data_offset,
const int channels, const int height, const int width, const int kernel_h,
const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int deformable_group, const int height_col, const int width_col,
T* grad_im) {
for (int thread = 0; thread < num_kernels; thread++) {
const int j = (thread / width_col / height_col / batch_size) % kernel_w;
const int i =
(thread / width_col / height_col / batch_size / kernel_w) % kernel_h;
const int c =
thread / width_col / height_col / batch_size / kernel_w / kernel_h;
const int deformable_group_index = c / channel_per_deformable_group;
int w_out = thread % width_col;
int h_out = (thread / width_col) % height_col;
int b = (thread / width_col / height_col) % batch_size;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const T* data_offset_ptr = data_offset +
(b * deformable_group + deformable_group_index) *
2 * kernel_h * kernel_w * height_col *
width_col;
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
const T cur_top_grad = data_col[thread];
const int cur_h = static_cast<int>(cur_inv_h_data);
const int cur_w = static_cast<int>(cur_inv_w_data);
for (int dy = -2; dy <= 2; dy++) {
for (int dx = -2; dx <= 2; dx++) {
if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
abs(cur_inv_w_data - (cur_w + dx)) < 1) {
int cur_bottom_grad_pos =
((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
T weight =
DmcnGetGradientWeight(cur_inv_h_data, cur_inv_w_data, cur_h + dy,
cur_w + dx, height, width);
*(grad_im + cur_bottom_grad_pos) =
*(grad_im + cur_bottom_grad_pos) + weight * cur_top_grad;
}
}
}
}
}
template <typename T>
inline void DeformableCol2imCPU(const platform::CPUDeviceContext& ctx,
const T* data_col, const T* data_offset,
const std::vector<int64_t> im_shape,
const std::vector<int64_t> col_shape,
const std::vector<int64_t> kernel_shape,
const std::vector<int> pad,
const std::vector<int> stride,
const std::vector<int> dilation,
const int deformable_group, T* grad_im) {
int channel_per_deformable_group = im_shape[0] / deformable_group;
int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
DeformableCol2imCPUKernel(
num_kernels, data_col, data_offset, im_shape[0], im_shape[1], im_shape[2],
kernel_shape[2], kernel_shape[3], pad[0], pad[1], stride[0], stride[1],
dilation[0], dilation[1], channel_per_deformable_group, col_shape[1],
deformable_group, col_shape[2], col_shape[3], grad_im);
}
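// DeformableCol2imCoordCPUKernel computes the gradient w.r.t. the learned
// offsets: each grad_offset entry corresponds to one offset coordinate
// (h or w, selected by bp_dir) at one kernel position and output location,
// and accumulates, over the input channels of its deformable group, the
// column gradient times the derivative of the bilinear sample w.r.t. that
// coordinate (DmcnGetCoordinateWeight). mval is accumulated but unused in
// this mask-free v1 kernel.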
template <typename T>
void DeformableCol2imCoordCPUKernel(
const int num_kernels, const T* data_col, const T* data_im,
const T* data_offset, const int channels, const int height, const int width,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int channel_per_deformable_group,
const int batch_size, const int offset_channels, const int deformable_group,
const int height_col, const int width_col, T* grad_offset) {
for (int i = 0; i < num_kernels; i++) {
T val = 0, mval = 0;
const int w = i % width_col;
const int h = (i / width_col) % height_col;
const int c = (i / width_col / height_col) % offset_channels;
const int b = (i / width_col / height_col) / offset_channels;
const int deformable_group_index = c / (2 * kernel_h * kernel_w);
const int col_step = kernel_h * kernel_w;
int cnt = 0;
const T* data_col_ptr = data_col +
deformable_group_index *
channel_per_deformable_group * batch_size *
width_col * height_col;
const T* data_im_ptr = data_im +
(b * deformable_group + deformable_group_index) *
channel_per_deformable_group / kernel_h /
kernel_w * height * width;
const T* data_offset_ptr = data_offset +
(b * deformable_group + deformable_group_index) *
2 * kernel_h * kernel_w * height_col *
width_col;
const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
for (int col_c = offset_c / 2; col_c < channel_per_deformable_group;
col_c += col_step) {
const int col_pos =
(((col_c * batch_size + b) * height_col) + h) * width_col + w;
const int bp_dir = offset_c % 2;
int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
int i =
(col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
int w_out = col_pos % width_col;
int h_out = (col_pos / width_col) % height_col;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const int data_offset_h_ptr =
(((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
const int data_offset_w_ptr =
(((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
w_out);
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
T inv_h = h_in + i * dilation_h + offset_h;
T inv_w = w_in + j * dilation_w + offset_w;
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
inv_h = inv_w = -2;
} else {
mval += data_col_ptr[col_pos] *
DmcnIm2colBilinear(data_im_ptr + cnt * height * width, width,
height, width, inv_h, inv_w);
}
const T weight = DmcnGetCoordinateWeight(
inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,
width, bp_dir);
val += weight * data_col_ptr[col_pos];
cnt += 1;
}
grad_offset[i] = val;
}
}
template <typename T>
inline void DeformableCol2imCoordCPU(
const platform::CPUDeviceContext& ctx, const T* data_col, const T* data_im,
const T* data_offset, const std::vector<int64_t> im_shape,
const std::vector<int64_t> col_shape,
const std::vector<int64_t> kernel_shape, const std::vector<int> paddings,
const std::vector<int> strides, const std::vector<int> dilations,
const int deformable_groups, T* grad_offset) {
int num_kernels = 2 * kernel_shape[2] * kernel_shape[3] * col_shape[1] *
col_shape[2] * col_shape[3] * deformable_groups;
int channel_per_deformable_group = col_shape[0] / deformable_groups;
DeformableCol2imCoordCPUKernel(
num_kernels, data_col, data_im, data_offset, im_shape[0], im_shape[1],
im_shape[2], kernel_shape[2], kernel_shape[3], paddings[0], paddings[1],
strides[0], strides[1], dilations[0], dilations[1],
channel_per_deformable_group, col_shape[1],
2 * kernel_shape[2] * kernel_shape[3] * deformable_groups,
deformable_groups, col_shape[2], col_shape[3], grad_offset);
}
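// DeformableIm2colCPUKernel builds the forward column buffer: for every
// (input channel, batch slot, output location) it samples the input at each
// kernel position shifted by the learned offsets via bilinear interpolation
// and writes the result into data_col, laid out as
// [c_in * k_h * k_w, im2col_step, out_h, out_w].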
template <typename T>
void DeformableIm2colCPUKernel(
const int num_kernels, const T* data_im, const T* data_offset,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int num_channels, const int deformable_group, const int height_col,
const int width_col, T* data_col) {
for (int i = 0; i < num_kernels; i++) {
const int w_col = i % width_col;
const int h_col = (i / width_col) % height_col;
const int b_col = (i / width_col) / height_col % batch_size;
const int c_im = (i / width_col / height_col) / batch_size;
const int c_col = c_im * kernel_h * kernel_w;
const int deformable_group_index = c_im / channel_per_deformable_group;
const int h_in = h_col * stride_h - pad_h;
const int w_in = w_col * stride_w - pad_w;
T* data_col_ptr =
data_col +
((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
const T* data_im_ptr =
data_im + (b_col * num_channels + c_im) * height * width;
const T* data_offset_ptr =
data_offset +
(b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
kernel_w * height_col * width_col;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
w_col;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
T val = static_cast<T>(0);
const T h_im = h_in + i * dilation_h + offset_h;
const T w_im = w_in + j * dilation_w + offset_w;
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
val =
DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im);
}
*data_col_ptr = val;
data_col_ptr += batch_size * height_col * width_col;
}
}
}
}
template <typename T>
inline void DeformableIm2colCPU(const platform::CPUDeviceContext& ctx,
const T* data_im, const T* data_offset,
const std::vector<int64_t> im_shape,
const std::vector<int64_t> col_shape,
const std::vector<int64_t> filter_shape,
const std::vector<int> paddings,
const std::vector<int> strides,
const std::vector<int> dilations,
const int deformable_groups, T* data_col) {
int channel_per_deformable_group = im_shape[0] / deformable_groups;
int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
// get outputs of im2col with offset by bilinear interpolation
DeformableIm2colCPUKernel(
num_kernels, data_im, data_offset, im_shape[1], im_shape[2],
filter_shape[2], filter_shape[3], paddings[0], paddings[1], strides[0],
strides[1], dilations[0], dilations[1], channel_per_deformable_group,
col_shape[1], im_shape[0], deformable_groups, col_shape[2], col_shape[3],
data_col);
}
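// Forward CPU kernel of deformable conv v1: for each im2col_step-sized chunk
// of the batch, build the offset-aware column buffer and multiply it by the
// filter weights group by group.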
template <typename T>
class DeformableConvV1CPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<Tensor>("Input");
auto* offset = ctx.Input<Tensor>("Offset");
Tensor filter = *ctx.Input<Tensor>("Filter");
Tensor* output = ctx.Output<Tensor>("Output");
output->mutable_data<T>(ctx.GetPlace());
auto& dev_ctx = ctx.template device_context<CPUDeviceContext>();
const int groups = ctx.Attr<int>("groups");
const int deformable_groups = ctx.Attr<int>("deformable_groups");
const int im2col_step = ctx.Attr<int>("im2col_step");
const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
const std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
const std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(phi::vectorize(output->dims()));
// col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w}
std::vector<int64_t> col_buffer_shape_vec(filter_shape_vec.size());
col_buffer_shape_vec[0] =
input->dims()[1] * filter.dims()[2] * filter.dims()[3];
col_buffer_shape_vec[1] = im2col_step;
for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) {
col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2];
}
framework::DDim col_shape(phi::make_ddim(col_buffer_shape_vec));
std::vector<int64_t> output_buffer_shape_vec(1);
output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] *
output_shape_vec[2] * output_shape_vec[3];
framework::DDim output_shape(phi::make_ddim(output_buffer_shape_vec));
Tensor col_buffer;
Tensor output_buffer;
col_buffer = ctx.AllocateTmpTensor<T, CPUDeviceContext>(col_shape, dev_ctx);
output_buffer =
ctx.AllocateTmpTensor<T, CPUDeviceContext>(output_shape, dev_ctx);
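    // Per-group GEMM shapes: output[g] (M x N) = weight[g] (M x K) *
    // col_buffer[g] (K x N), with M = out_channels / groups,
    // N = im2col_step * out_h * out_w and K = in_channels * k_h * k_w / groups.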
int64_t M = output_shape_vec[1] / groups;
int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
int64_t K =
input->dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups;
Tensor weight_3d;
weight_3d.ShareDataWith(filter).Resize(phi::make_ddim({groups, M, K}));
Tensor col_buffer_3d;
col_buffer_3d.ShareDataWith(col_buffer)
.Resize(phi::make_ddim({groups, K, N}));
Tensor output_4d;
output_4d.ShareDataWith(output_buffer)
.Resize(phi::make_ddim({batch_size / im2col_step, groups, M, N}));
output_4d.mutable_data<T>(ctx.GetPlace());
framework::DDim input_shape =
phi::slice_ddim(input->dims(), 1, input->dims().size());
std::vector<int64_t> input_shape_vec = phi::vectorize(input_shape);
int input_dim = input->numel() / input->dims()[0];
int input_offset_dim = offset->numel() / offset->dims()[0];
auto blas = phi::funcs::GetBlas<CPUDeviceContext, T>(dev_ctx);
const T* input_ptr = input->data<T>();
const T* offset_ptr = offset->data<T>();
col_buffer.mutable_data<T>(ctx.GetPlace());
T* col_buffer_ptr = col_buffer.data<T>();
for (int i = 0; i < batch_size / im2col_step; ++i) {
DeformableIm2colCPU(dev_ctx, input_ptr + i * im2col_step * input_dim,
offset_ptr + i * im2col_step * input_offset_dim,
input_shape_vec, col_buffer_shape_vec,
filter_shape_vec, paddings, strides, dilations,
deformable_groups, col_buffer_ptr);
Tensor output_3d = output_4d.Slice(i, i + 1).Resize(
phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size()));
// get the product of pixel and weight
for (int g = 0; g < groups; ++g) {
Tensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize(
phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size()));
Tensor col_buffer_3d_slice =
col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
Tensor output_3d_slice = output_3d.Slice(g, g + 1).Resize(
phi::slice_ddim(output_3d.dims(), 1, output_3d.dims().size()));
blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, false, T(1.0),
&output_3d_slice, T(0.0));
}
}
output->ShareDataWith(output_buffer)
.Resize(phi::make_ddim(output_shape_vec));
}
};
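// Backward CPU kernel of deformable conv v1: per chunk, compute the
// column-buffer gradient from the output gradient, turn it into offset and
// input gradients via the col2im routines above, then rebuild the forward
// column buffer to form the filter gradient.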
template <typename T>
class DeformableConvV1GradCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const Tensor* output_grad =
ctx.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
Tensor* offset_grad = ctx.Output<Tensor>(framework::GradVarName("Offset"));
const Tensor* input = ctx.Input<Tensor>("Input");
Tensor offset = *ctx.Input<Tensor>("Offset");
Tensor filter = *ctx.Input<Tensor>("Filter");
if (!input_grad && !filter_grad && !offset_grad) return;
int groups = ctx.Attr<int>("groups");
int deformable_groups = ctx.Attr<int>("deformable_groups");
int im2col_step = ctx.Attr<int>("im2col_step");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
auto& dev_ctx = ctx.template device_context<CPUDeviceContext>();
const int batch_size = static_cast<int>(input->dims()[0]);
framework::DDim input_shape =
phi::slice_ddim(input->dims(), 1, input->dims().size());
std::vector<int64_t> input_shape_vec = phi::vectorize(input_shape);
std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(phi::vectorize(output_grad->dims()));
std::vector<int64_t> col_buffer_shape_vec(filter_shape_vec.size());
col_buffer_shape_vec[0] =
input->dims()[1] * filter.dims()[2] * filter.dims()[3];
col_buffer_shape_vec[1] = im2col_step;
for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) {
col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2];
}
framework::DDim col_shape(phi::make_ddim(col_buffer_shape_vec));
std::vector<int64_t> output_buffer_shape_vec(1);
output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] *
output_shape_vec[2] * output_shape_vec[3];
framework::DDim output_shape(phi::make_ddim(output_buffer_shape_vec));
Tensor col_buffer;
Tensor output_buffer;
col_buffer = ctx.AllocateTmpTensor<T, CPUDeviceContext>(col_shape, dev_ctx);
output_buffer =
ctx.AllocateTmpTensor<T, CPUDeviceContext>(output_shape, dev_ctx);
output_buffer.ShareDataWith(*output_grad);
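    // Backward GEMM shapes: M = in_channels / groups * k_h * k_w,
    // N = im2col_step * out_h * out_w, K = out_channels / groups.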
int64_t M =
input_shape_vec[0] / groups * filter_shape_vec[2] * filter_shape_vec[3];
int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
int64_t K = output_shape_vec[1] / groups;
framework::DDim weight_3d_shape = {groups, K, M};
framework::DDim out_grad_4d_shape = {batch_size / im2col_step, groups, K,
N};
framework::DDim col_buffer_3d_shape = {groups, M, N};
framework::DDim filter_grad_shape = {groups, K, M};
Tensor weight_3d;
weight_3d.ShareDataWith(filter).Resize(weight_3d_shape);
Tensor out_grad_4d;
out_grad_4d.ShareDataWith(output_buffer).Resize(out_grad_4d_shape);
Tensor col_buffer_3d;
col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape);
phi::funcs::SetConstant<CPUDeviceContext, T> set_zero;
auto blas = phi::funcs::GetBlas<CPUDeviceContext, T>(dev_ctx);
col_buffer.mutable_data<T>(ctx.GetPlace());
col_buffer_3d.mutable_data<T>(ctx.GetPlace());
out_grad_4d.mutable_data<T>(ctx.GetPlace());
int input_dim = input->numel() / input->dims()[0];
int input_offset_dim = offset.numel() / offset.dims()[0];
if (filter_grad) {
filter_grad->mutable_data<T>(ctx.GetPlace());
filter_grad->Resize(filter_grad_shape);
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
}
if (input_grad) {
input_grad->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, input_grad, static_cast<T>(0));
}
if (offset_grad) {
offset_grad->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, offset_grad, static_cast<T>(0));
}
for (int i = 0; i < batch_size / im2col_step; ++i) {
Tensor out_grad_3d = out_grad_4d.Slice(i, i + 1).Resize(
phi::slice_ddim(out_grad_4d.dims(), 1, out_grad_4d.dims().size()));
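      // d(col_buffer)[g] (M x N) = weight[g]^T (M x K) * d(output)[g] (K x N)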
for (int g = 0; g < groups; ++g) {
Tensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize(
phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size()));
Tensor out_grad_3d_slice = out_grad_3d.Slice(g, g + 1).Resize(
phi::slice_ddim(out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
Tensor col_buffer_3d_slice =
col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
blas.MatMul(weight_3d_slice, true, out_grad_3d_slice, false, T(1.0),
&col_buffer_3d_slice, T(0.0));
}
col_buffer.Resize(col_shape);
T* col_buffer_ptr = col_buffer.data<T>();
const T* input_ptr = input->data<T>();
const T* offset_ptr = offset.data<T>();
if (offset_grad) {
T* offset_grad_ptr = offset_grad->data<T>();
// get grad of offset
DeformableCol2imCoordCPU(
dev_ctx, col_buffer_ptr, input_ptr + i * im2col_step * input_dim,
offset_ptr + i * im2col_step * input_offset_dim, input_shape_vec,
col_buffer_shape_vec, filter_shape_vec, paddings, strides,
dilations, deformable_groups,
offset_grad_ptr + i * im2col_step * input_offset_dim);
}
if (input_grad) {
T* input_grad_ptr = input_grad->data<T>();
// get grad of input
DeformableCol2imCPU(dev_ctx, col_buffer_ptr,
offset_ptr + i * im2col_step * input_offset_dim,
input_shape_vec, col_buffer_shape_vec,
filter_shape_vec, paddings, strides, dilations,
deformable_groups,
input_grad_ptr + i * im2col_step * input_dim);
input_grad->Resize(input->dims());
}
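      // Rebuild the forward column buffer from the current input chunk; it is
      // needed below to form the filter gradient.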
DeformableIm2colCPU(dev_ctx, input_ptr + i * im2col_step * input_dim,
offset_ptr + i * im2col_step * input_offset_dim,
input_shape_vec, col_buffer_shape_vec,
filter_shape_vec, paddings, strides, dilations,
deformable_groups, col_buffer_ptr);
col_buffer_3d.Resize(col_buffer_3d_shape);
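      // d(weight)[g] (K x M) = d(output)[g] (K x N) * col_buffer[g]^T (N x M);
      // the per-chunk result is then accumulated into filter_grad by
      // FilterGradAddupCPUKernel.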
if (filter_grad) {
Tensor dweight_3d;
dweight_3d = ctx.AllocateTmpTensor<T, CPUDeviceContext>(
filter_grad_shape, dev_ctx);
for (int g = 0; g < groups; ++g) {
Tensor out_grad_3d_slice =
out_grad_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
Tensor col_buffer_3d_slice =
col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
Tensor dweight_3d_slice = dweight_3d.Slice(g, g + 1).Resize(
phi::slice_ddim(dweight_3d.dims(), 1, dweight_3d.dims().size()));
blas.MatMul(out_grad_3d_slice, false, col_buffer_3d_slice, true,
T(1.0), &dweight_3d_slice, T(0.0));
}
// update grad of weights
FilterGradAddupCPUKernel(dweight_3d.numel(), groups, K, M,
dweight_3d.data<T>(), filter_grad->data<T>());
}
}
if (filter_grad) {
filter_grad->Resize(filter.dims());
}
}
};
} // namespace operators
} // namespace paddle
...@@ -17,7 +17,10 @@ limitations under the License. */ ...@@ -17,7 +17,10 @@ limitations under the License. */
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -270,70 +273,24 @@ class Flatten2GradOp : public framework::OperatorWithKernel { ...@@ -270,70 +273,24 @@ class Flatten2GradOp : public framework::OperatorWithKernel {
class FlattenContiguousRangeOp : public framework::OperatorWithKernel { class FlattenContiguousRangeOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FlattenContiguousRange"); OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FlattenContiguousRange");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out",
"FlattenContiguousRange"); "FlattenContiguousRange");
const auto &start_axis = ctx->Attrs().Get<int>("start_axis"); const auto &start_axis = ctx->Attrs().Get<int>("start_axis");
const auto &stop_axis = ctx->Attrs().Get<int>("stop_axis"); const auto &stop_axis = ctx->Attrs().Get<int>("stop_axis");
const auto &in_dims = ctx->GetInputDim("X");
int in_dims_size = in_dims.size();
int real_start_axis = start_axis, real_stop_axis = stop_axis;
if (start_axis < 0) {
real_start_axis = start_axis + in_dims_size;
}
if (stop_axis < 0) {
real_stop_axis = stop_axis + in_dims_size;
}
PADDLE_ENFORCE_GE(
real_stop_axis, real_start_axis,
platform::errors::InvalidArgument("The stop_axis should be greater"
"than or equal to start_axis."));
const auto &out_dims = // Construct MetaTensor for InferMeta Func
GetOutputShape(real_start_axis, real_stop_axis, in_dims); using CompatMetaTensor = framework::CompatMetaTensor;
ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); CompatMetaTensor x(ctx->GetInputVarPtrs("X")[0], ctx->IsRuntime());
if (in_dims[0] == out_dims[0]) { CompatMetaTensor out(ctx->GetOutputVarPtrs("Out")[0], ctx->IsRuntime());
// Only pass LoD when the first dimension of output and Input(X) std::unique_ptr<CompatMetaTensor> xshape(nullptr);
// are the same. if (ctx->HasOutput("XShape")) {
ctx->ShareLoD("X", "Out"); xshape = std::move(std::unique_ptr<CompatMetaTensor>(new CompatMetaTensor(
} ctx->GetOutputVarPtrs("XShape")[0], ctx->IsRuntime())));
if (!ctx->HasOutput("XShape")) return;
// OP_INOUT_CHECK(ctx->HasOutput("XShape"), "Output", "XShape", "Flatten2");
std::vector<int64_t> xshape_dims(in_dims.size() + 1);
xshape_dims[0] = 0;
for (int i = 0; i < in_dims.size(); ++i) {
xshape_dims[i + 1] = in_dims[i];
} }
ctx->SetOutputDim("XShape", phi::make_ddim(xshape_dims)); phi::FlattenWithXShapeInferMeta(x, start_axis, stop_axis, &out,
ctx->ShareLoD("X", "XShape"); xshape.get());
}
static std::vector<int32_t> GetOutputShape(const int start_axis,
const int stop_axis,
const framework::DDim &in_dims) {
int64_t outer = 1;
std::vector<int32_t> out_shape;
int in_dims_size = in_dims.size();
out_shape.reserve(in_dims_size - stop_axis + start_axis);
for (int i = 0; i < start_axis; ++i) {
out_shape.push_back(in_dims[i]);
}
for (int i = start_axis; i <= stop_axis; i++) {
if (in_dims[i] == -1 || outer == -1) {
outer = -1;
} else {
outer *= in_dims[i];
}
}
out_shape.push_back(outer);
for (int i = stop_axis + 1; i < in_dims_size; i++) {
out_shape.push_back(in_dims[i]);
}
return out_shape;
} }
}; };
...@@ -487,30 +444,3 @@ REGISTER_OP_CPU_KERNEL( ...@@ -487,30 +444,3 @@ REGISTER_OP_CPU_KERNEL(
ops::Flatten2GradKernel<paddle::platform::CPUDeviceContext, int>, ops::Flatten2GradKernel<paddle::platform::CPUDeviceContext, int>,
ops::Flatten2GradKernel<paddle::platform::CPUDeviceContext, int8_t>, ops::Flatten2GradKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::Flatten2GradKernel<paddle::platform::CPUDeviceContext, int64_t>); ops::Flatten2GradKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
flatten_contiguous_range,
ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext,
float>,
ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext,
double>,
ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext,
uint8_t>,
ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext, int>,
ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext,
int8_t>,
ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext,
int64_t>);
REGISTER_OP_CPU_KERNEL(
flatten_contiguous_range_grad,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext,
float>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext,
double>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext,
uint8_t>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext,
int>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext,
int8_t>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext,
int64_t>);
...@@ -47,34 +47,3 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -47,34 +47,3 @@ REGISTER_OP_CUDA_KERNEL(
ops::Flatten2GradKernel<paddle::platform::CUDADeviceContext, int>, ops::Flatten2GradKernel<paddle::platform::CUDADeviceContext, int>,
ops::Flatten2GradKernel<paddle::platform::CUDADeviceContext, int8_t>, ops::Flatten2GradKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::Flatten2GradKernel<paddle::platform::CUDADeviceContext, int64_t>); ops::Flatten2GradKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
flatten_contiguous_range,
ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext,
float>,
ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext,
plat::float16>,
ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext,
double>,
ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext,
uint8_t>,
ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext, int>,
ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext,
int8_t>,
ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext,
int64_t>);
REGISTER_OP_CUDA_KERNEL(
flatten_contiguous_range_grad,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
float>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
plat::float16>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
double>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
uint8_t>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
int>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
int8_t>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
int64_t>);
...@@ -119,46 +119,5 @@ class Flatten2GradKernel : public framework::OpKernel<T> { ...@@ -119,46 +119,5 @@ class Flatten2GradKernel : public framework::OpKernel<T> {
} }
}; };
template <typename DeviceContext, typename T>
class FlattenContiguousRangeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto *in = context.Input<framework::LoDTensor>("X");
auto *out = context.Output<framework::LoDTensor>("Out");
out->mutable_data(context.GetPlace(), in->type());
auto &start_axis = context.Attr<int>("start_axis");
auto &stop_axis = context.Attr<int>("stop_axis");
auto &dev_ctx = context.device_context<DeviceContext>();
// call new kernel
phi::FlattenKernel<T, typename paddle::framework::ConvertToPhiContext<
DeviceContext>::TYPE>(
static_cast<const typename paddle::framework::ConvertToPhiContext<
DeviceContext>::TYPE &>(dev_ctx),
*in, start_axis, stop_axis, out);
}
};
template <typename DeviceContext, typename T>
class FlattenContiguousRangeGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *d_x = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
auto *d_out =
ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
auto *xshape = ctx.Input<framework::LoDTensor>("XShape");
d_x->mutable_data(ctx.GetPlace(), d_out->type());
auto &dev_ctx = ctx.device_context<DeviceContext>();
// call new kernel
phi::FlattenGradKernel<T, typename paddle::framework::ConvertToPhiContext<
DeviceContext>::TYPE>(
static_cast<const typename paddle::framework::ConvertToPhiContext<
DeviceContext>::TYPE &>(dev_ctx),
*d_out, *xshape, d_x);
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -41,27 +41,4 @@ REGISTER_OP_XPU_KERNEL( ...@@ -41,27 +41,4 @@ REGISTER_OP_XPU_KERNEL(
ops::Flatten2GradKernel<paddle::platform::XPUDeviceContext, int>, ops::Flatten2GradKernel<paddle::platform::XPUDeviceContext, int>,
ops::Flatten2GradKernel<paddle::platform::XPUDeviceContext, int8_t>, ops::Flatten2GradKernel<paddle::platform::XPUDeviceContext, int8_t>,
ops::Flatten2GradKernel<paddle::platform::XPUDeviceContext, int64_t>); ops::Flatten2GradKernel<paddle::platform::XPUDeviceContext, int64_t>);
REGISTER_OP_XPU_KERNEL(
flatten_contiguous_range,
ops::FlattenContiguousRangeKernel<paddle::platform::XPUDeviceContext,
float>,
ops::FlattenContiguousRangeKernel<paddle::platform::XPUDeviceContext,
plat::float16>,
ops::FlattenContiguousRangeKernel<paddle::platform::XPUDeviceContext, int>,
ops::FlattenContiguousRangeKernel<paddle::platform::XPUDeviceContext,
int8_t>,
ops::FlattenContiguousRangeKernel<paddle::platform::XPUDeviceContext,
int64_t>);
REGISTER_OP_XPU_KERNEL(
flatten_contiguous_range_grad,
ops::FlattenContiguousRangeGradKernel<paddle::platform::XPUDeviceContext,
float>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::XPUDeviceContext,
plat::float16>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::XPUDeviceContext,
int>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::XPUDeviceContext,
int8_t>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::XPUDeviceContext,
int64_t>);
#endif #endif
...@@ -868,16 +868,22 @@ static PyObject* tensor_register_grad_hook(TensorObject* self, PyObject* args, ...@@ -868,16 +868,22 @@ static PyObject* tensor_register_grad_hook(TensorObject* self, PyObject* args,
int64_t hook_id; int64_t hook_id;
if (egr::egr_utils_api::IsLeafTensor(self->tensor)) { if (egr::egr_utils_api::IsLeafTensor(self->tensor)) {
VLOG(6) << "Register hook for leaf tensor: " << self->tensor.name(); VLOG(6) << "Register hook for leaf tensor: " << self->tensor.name();
auto autograd_meta = egr::EagerUtils::unsafe_autograd_meta(self->tensor);
if (autograd_meta && !autograd_meta->StopGradient()) {
if (!autograd_meta->GetMutableGradNode()) {
VLOG(6) << "Detected NULL grad_node, Leaf tensor should have had "
"grad_node with type: GradNodeAccumulation.";
autograd_meta->SetGradNode(
std::make_shared<egr::GradNodeAccumulation>(autograd_meta));
}
}
std::shared_ptr<egr::GradNodeBase> grad_node = std::shared_ptr<egr::GradNodeBase> grad_node =
egr::EagerUtils::grad_node(self->tensor); egr::EagerUtils::grad_node(self->tensor);
PADDLE_ENFORCE(
grad_node.get() != nullptr,
paddle::platform::errors::Fatal("Detected NULL grad_node,"
"Leaf tensor should have had grad_node "
"with type: GradNodeAccumulation."));
auto rank_info = auto rank_info =
egr::EagerUtils::unsafe_autograd_meta(self->tensor)->OutRankInfo(); egr::EagerUtils::unsafe_autograd_meta(self->tensor)->OutRankInfo();
PyObject* hook_func = PyTuple_GET_ITEM(args, 0); PyObject* hook_func = PyTuple_GET_ITEM(args, 0);
auto accumulation_grad_node = auto accumulation_grad_node =
...@@ -948,8 +954,8 @@ static PyObject* tensor_register_reduce_hook(TensorObject* self, PyObject* args, ...@@ -948,8 +954,8 @@ static PyObject* tensor_register_reduce_hook(TensorObject* self, PyObject* args,
EAGER_CATCH_AND_THROW_RETURN_NULL EAGER_CATCH_AND_THROW_RETURN_NULL
} }
static PyObject* set_grad_type(TensorObject* self, PyObject* args, static PyObject* tensor__set_grad_type(TensorObject* self, PyObject* args,
PyObject* kwargs) { PyObject* kwargs) {
EAGER_TRY EAGER_TRY
auto var_type = pybind::CastPyArg2ProtoType(PyTuple_GET_ITEM(args, 0), 0); auto var_type = pybind::CastPyArg2ProtoType(PyTuple_GET_ITEM(args, 0), 0);
auto grad_tensor = auto grad_tensor =
...@@ -963,6 +969,42 @@ static PyObject* set_grad_type(TensorObject* self, PyObject* args, ...@@ -963,6 +969,42 @@ static PyObject* set_grad_type(TensorObject* self, PyObject* args,
EAGER_CATCH_AND_THROW_RETURN_NULL EAGER_CATCH_AND_THROW_RETURN_NULL
} }
static PyObject* tensor__clear(TensorObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
self->tensor.reset();
  Py_INCREF(Py_None);
  return Py_None;
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* tensor__copy_gradient_from(TensorObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
auto src = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 0), 0);
if (self->tensor.is_initialized()) {
PADDLE_ENFORCE_EQ(self->tensor.dtype(), src.dtype(),
platform::errors::PreconditionNotMet(
"Tensor %s has different data type with Tensor %s",
self->tensor.name(), src.name()));
PADDLE_ENFORCE_EQ(self->tensor.impl()->type_info().id(),
src.impl()->type_info().id(),
platform::errors::PreconditionNotMet(
"Tensor %s has different type with Tensor %s, Tensor "
"ShareGradientDataWith cannot be performed!",
self->tensor.name(), src.name()));
}
VLOG(6) << "Tensor copy gradient from: " << src.name();
auto* p_grad = egr::EagerUtils::mutable_grad(self->tensor);
if (p_grad) {
PADDLE_ENFORCE_EQ(src.initialized(), true,
platform::errors::InvalidArgument(
"Tensor %s has not been initialized", src.name()));
p_grad->set_impl(src.impl());
}
Py_INCREF(Py_None);
return Py_None;
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* tensor_method_get_non_zero_indices(TensorObject* self, static PyObject* tensor_method_get_non_zero_indices(TensorObject* self,
PyObject* args, PyObject* args,
PyObject* kwargs) { PyObject* kwargs) {
...@@ -1117,7 +1159,12 @@ PyMethodDef variable_methods[] = { ...@@ -1117,7 +1159,12 @@ PyMethodDef variable_methods[] = {
{"_register_backward_hook", {"_register_backward_hook",
(PyCFunction)(void (*)(void))tensor_register_reduce_hook, (PyCFunction)(void (*)(void))tensor_register_reduce_hook,
METH_VARARGS | METH_KEYWORDS, NULL}, METH_VARARGS | METH_KEYWORDS, NULL},
{"_set_grad_type", (PyCFunction)(void (*)(void))set_grad_type, {"_set_grad_type", (PyCFunction)(void (*)(void))tensor__set_grad_type,
METH_VARARGS | METH_KEYWORDS, NULL},
{"_clear", (PyCFunction)(void (*)(void))tensor__clear,
METH_VARARGS | METH_KEYWORDS, NULL},
{"_copy_gradient_from",
(PyCFunction)(void (*)(void))tensor__copy_gradient_from,
METH_VARARGS | METH_KEYWORDS, NULL}, METH_VARARGS | METH_KEYWORDS, NULL},
/***the method of sparse tensor****/ /***the method of sparse tensor****/
{"non_zero_indices", {"non_zero_indices",
......
...@@ -655,6 +655,7 @@ void BindImperative(py::module *m_ptr) { ...@@ -655,6 +655,7 @@ void BindImperative(py::module *m_ptr) {
} else { } else {
act_name = name.cast<std::string>(); act_name = name.cast<std::string>();
} }
VLOG(4) << "Init VarBase :" << act_name;
new (&self) imperative::VarBase(act_name); new (&self) imperative::VarBase(act_name);
self.SetPersistable(persistable); self.SetPersistable(persistable);
self.SetType(type); self.SetType(type);
......
...@@ -829,6 +829,8 @@ PYBIND11_MODULE(core_noavx, m) { ...@@ -829,6 +829,8 @@ PYBIND11_MODULE(core_noavx, m) {
[](const framework::Tensor &self) { [](const framework::Tensor &self) {
return reinterpret_cast<uintptr_t>(self.data()); return reinterpret_cast<uintptr_t>(self.data());
}) })
.def("_slice", &framework::Tensor::Slice)
.def("_numel", &framework::Tensor::numel)
.def("_is_initialized", .def("_is_initialized",
[](const framework::Tensor &self) { return self.IsInitialized(); }) [](const framework::Tensor &self) { return self.IsInitialized(); })
.def("_get_dims", .def("_get_dims",
......
...@@ -427,9 +427,7 @@ class PADDLE_API Tensor final { ...@@ -427,9 +427,7 @@ class PADDLE_API Tensor final {
* @param blocking, Should we copy this in sync way. * @param blocking, Should we copy this in sync way.
* @return void * @return void
*/ */
void copy_(const Tensor& src, void copy_(const Tensor& src, const phi::Place& target_place, bool blocking);
const phi::Place& target_place,
const bool blocking);
/** /**
* @brief Cast datatype from one to another * @brief Cast datatype from one to another
* *
......
...@@ -84,26 +84,26 @@ void Tensor::copy_(const Tensor &src, ...@@ -84,26 +84,26 @@ void Tensor::copy_(const Tensor &src,
if (is_initialized()) { if (is_initialized()) {
PADDLE_ENFORCE_EQ(dtype(), PADDLE_ENFORCE_EQ(dtype(),
src.dtype(), src.dtype(),
platform::errors::PreconditionNotMet( phi::errors::PreconditionNotMet(
"Tensor %s has different data type with Tensor %s, " "Tensor %s has different data type with Tensor %s, "
"Tensor Copy cannot be performed!", "Tensor Copy cannot be performed!",
name(), name(),
src.name())); src.name()));
PADDLE_ENFORCE_EQ(impl()->type_info().id(), PADDLE_ENFORCE_EQ(impl()->type_info().id(),
src.impl()->type_info().id(), src.impl()->type_info().id(),
platform::errors::PreconditionNotMet( phi::errors::PreconditionNotMet(
"Tensor %s has different type with Tensor %s, Tensor " "Tensor %s has different type with Tensor %s, Tensor "
"Copy cannot be performed!", "Copy cannot be performed!",
name(), name(),
src.name())); src.name()));
PADDLE_ENFORCE_EQ(target_place, PADDLE_ENFORCE_EQ(target_place,
inner_place(), inner_place(),
platform::errors::PreconditionNotMet( phi::errors::PreconditionNotMet(
"Place is different of dst tensor and args %s, which " "Place is different of dst tensor and args %s, which "
"current tensor holds %s " "current tensor holds %s "
"Copy cannot be performed!", "Copy cannot be performed!",
target_place.DebugString(), target_place,
inner_place().DebugString())); inner_place()));
kernel_key_set.backend_set = kernel_key_set.backend_set =
kernel_key_set.backend_set | kernel_key_set.backend_set |
BackendSet(phi::TransToPhiBackend(inner_place())); BackendSet(phi::TransToPhiBackend(inner_place()));
...@@ -177,7 +177,7 @@ void Tensor::copy_(const Tensor &src, ...@@ -177,7 +177,7 @@ void Tensor::copy_(const Tensor &src,
blocking, blocking,
static_cast<phi::SelectedRows *>(impl_.get())); static_cast<phi::SelectedRows *>(impl_.get()));
} else { } else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument( PADDLE_THROW(phi::errors::InvalidArgument(
"We currently only support dense tensor copy for now and if u need to " "We currently only support dense tensor copy for now and if u need to "
"copy selected rows please raise a issue.")); "copy selected rows please raise a issue."));
} }
......
...@@ -516,6 +516,215 @@ void ConcatInferMeta(const std::vector<MetaTensor*>& x, ...@@ -516,6 +516,215 @@ void ConcatInferMeta(const std::vector<MetaTensor*>& x,
out->share_lod(*x.at(0)); out->share_lod(*x.at(0));
} }
inline int ConvOutputSize(
int input_size, int filter_size, int dilation, int padding, int stride) {
const int dkernel = dilation * (filter_size - 1) + 1;
int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
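  // Example: input_size = 32, filter_size = 3, dilation = 1, padding = 1 and
  // stride = 1 give dkernel = 3 and output_size = (32 + 2 - 3) / 1 + 1 = 32.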
PADDLE_ENFORCE_GT(
output_size,
0,
phi::errors::InvalidArgument(
"The output's size is expected to be greater than 0. "
"But recieved: output's size is %d. The output's size is computed by "
"((input_size + 2 * padding - (dilation * (filter_size - 1) + 1)) / "
"stride + 1), where input_size is %d, padding is %d, "
"filter_size is %d, dilation is %d, stride is %d.",
output_size,
input_size,
padding,
filter_size,
dilation,
stride));
return output_size;
}
void DeformableConvInferMeta(const MetaTensor& x,
const MetaTensor& offset,
const MetaTensor& filter,
paddle::optional<const MetaTensor&> mask,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
int deformable_groups,
int groups,
int im2col_step,
MetaTensor* out,
MetaConfig config) {
auto in_dims = x.dims();
auto offset_dims = offset.dims();
auto filter_dims = filter.dims();
PADDLE_ENFORCE_EQ(
in_dims.size(),
4,
phi::errors::InvalidArgument("Conv input should be 4-D tensor, get %u",
in_dims.size()));
PADDLE_ENFORCE_EQ(in_dims.size(),
filter_dims.size(),
phi::errors::InvalidArgument(
"Conv input dimension and filter dimension should be "
"the same. The difference is [%d]: [%d]",
in_dims.size(),
filter_dims.size()));
PADDLE_ENFORCE_EQ(in_dims.size() - strides.size(),
2U,
phi::errors::InvalidArgument(
"Conv input dimension and strides "
"dimension should be consistent. But received input "
"dimension:[%d], strides dimension:[%d]",
in_dims.size(),
strides.size()));
PADDLE_ENFORCE_EQ(paddings.size(),
strides.size(),
phi::errors::InvalidArgument(
"Conv paddings dimension and Conv strides dimension "
"should be the same. The difference is [%d]: [%d]",
paddings.size(),
strides.size()));
PADDLE_ENFORCE_EQ(
in_dims[1],
filter_dims[1] * groups,
phi::errors::InvalidArgument(
"The number of input channels should be equal to filter "
"channels * groups. The difference is [%d]: [%d]",
in_dims[1],
filter_dims[1] * groups));
PADDLE_ENFORCE_EQ(
filter_dims[0] % groups,
0,
phi::errors::InvalidArgument(
"The number of output channels should be divided by groups. But "
"received output channels:[%d], groups:[%d]",
filter_dims[0],
groups));
PADDLE_ENFORCE_EQ(
filter_dims[0] % deformable_groups,
0,
phi::errors::InvalidArgument(
"The number of output channels should be "
"divided by deformable groups. The difference is [%d]: [%d]",
filter_dims[0] % groups,
0));
if (in_dims[0] > im2col_step) {
PADDLE_ENFORCE_EQ(
in_dims[0] % im2col_step,
0U,
phi::errors::InvalidArgument(
"Input batchsize must be smaller than or divide im2col_step. But "
"received Input batchsize:[%d], im2col_step:[%d]",
in_dims[0],
im2col_step));
}
for (size_t i = 0; i < strides.size(); ++i) {
PADDLE_ENFORCE_GT(
strides[i],
0U,
phi::errors::InvalidArgument("stride %d size incorrect", i));
}
for (size_t i = 0; i < dilations.size(); ++i) {
PADDLE_ENFORCE_GT(
dilations[i],
0U,
phi::errors::InvalidArgument("dilation %d size incorrect", i));
}
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
if (!config.is_runtime &&
(in_dims[i + 2] <= 0 || filter_dims[i + 2] <= 0)) {
output_shape.push_back(-1);
} else {
output_shape.push_back(ConvOutputSize(in_dims[i + 2],
filter_dims[i + 2],
dilations[i],
paddings[i],
strides[i]));
}
}
PADDLE_ENFORCE_EQ(
output_shape[1] % deformable_groups,
0U,
phi::errors::InvalidArgument(
"output num_filter must divide deformable group size. But received "
"output num_filter:[%d], deformable group size:[%d]",
output_shape[1],
deformable_groups));
if (config.is_runtime) {
PADDLE_ENFORCE_EQ(output_shape[2],
offset_dims[2],
phi::errors::InvalidArgument(
"output height must equal to offset map height. "
"The difference is [%d]: [%d]",
output_shape[2],
offset_dims[2]));
PADDLE_ENFORCE_EQ(output_shape[3],
offset_dims[3],
phi::errors::InvalidArgument(
"output width must equal to offset map width. The "
"difference is [%d]: [%d]",
output_shape[3],
offset_dims[3]));
PADDLE_ENFORCE_EQ(offset_dims[1] % (filter_dims[2] * filter_dims[3]),
0U,
phi::errors::InvalidArgument(
"offset filter must divide deformable group size. "
"But received [%d]: [%d]",
offset_dims[1],
filter_dims[2] * filter_dims[3]));
PADDLE_ENFORCE_EQ(
offset_dims[1] / (2 * filter_dims[2] * filter_dims[3]),
deformable_groups,
phi::errors::InvalidArgument(
"offset filter must divide deformable group size. But received "
"[%d]: [%d]",
offset_dims[1] / (2 * filter_dims[2] * filter_dims[3]),
deformable_groups));
if (mask) {
auto mask_dims = mask->dims();
PADDLE_ENFORCE_EQ(output_shape[2],
mask_dims[2],
phi::errors::InvalidArgument(
"output height must equal to mask map height. The "
"difference is [%d] vs [%d]",
output_shape[2],
mask_dims[2]));
PADDLE_ENFORCE_EQ(output_shape[3],
mask_dims[3],
phi::errors::InvalidArgument(
"output width must equal to mask map width. The "
"difference is [%d] vs [%d]",
output_shape[3],
mask_dims[3]));
PADDLE_ENFORCE_EQ(mask_dims[1] % (filter_dims[2] * filter_dims[3]),
0U,
phi::errors::InvalidArgument(
"mask filter must divide deformable group size. "
"But received [%d]: [%d]",
mask_dims[1],
filter_dims[2] * filter_dims[3]));
PADDLE_ENFORCE_EQ(mask_dims[1] / (filter_dims[2] * filter_dims[3]),
deformable_groups,
phi::errors::InvalidArgument(
"mask filter must divide deformable group size. "
"But received [%d]: [%d]",
mask_dims[1] / (filter_dims[2] * filter_dims[3]),
deformable_groups));
}
}
out->set_dims(phi::make_ddim(output_shape));
out->set_dtype(x.dtype());
}
void HierarchicalSigmoidInferMeta(const MetaTensor& x, void HierarchicalSigmoidInferMeta(const MetaTensor& x,
const MetaTensor& w, const MetaTensor& w,
const MetaTensor& label, const MetaTensor& label,
......
...@@ -120,6 +120,19 @@ void ConcatInferMeta(const std::vector<MetaTensor*>& x, ...@@ -120,6 +120,19 @@ void ConcatInferMeta(const std::vector<MetaTensor*>& x,
MetaTensor* out, MetaTensor* out,
MetaConfig config = MetaConfig()); MetaConfig config = MetaConfig());
void DeformableConvInferMeta(const MetaTensor& x,
const MetaTensor& offset,
const MetaTensor& filter,
paddle::optional<const MetaTensor&> mask,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
int deformable_groups,
int groups,
int im2col_step,
MetaTensor* out,
MetaConfig config = MetaConfig());
void HierarchicalSigmoidInferMeta(const MetaTensor& x, void HierarchicalSigmoidInferMeta(const MetaTensor& x,
const MetaTensor& w, const MetaTensor& w,
const MetaTensor& label, const MetaTensor& label,
......
...@@ -352,6 +352,14 @@ void FlattenInferMeta(const MetaTensor& x, ...@@ -352,6 +352,14 @@ void FlattenInferMeta(const MetaTensor& x,
int start_axis, int start_axis,
int stop_axis, int stop_axis,
MetaTensor* out) { MetaTensor* out) {
FlattenWithXShapeInferMeta(x, start_axis, stop_axis, out, nullptr);
}
void FlattenWithXShapeInferMeta(const MetaTensor& x,
int start_axis,
int stop_axis,
MetaTensor* out,
MetaTensor* xshape) {
auto x_dims = x.dims(); auto x_dims = x.dims();
int in_dims_size = x_dims.size(); int in_dims_size = x_dims.size();
if (start_axis < 0) { if (start_axis < 0) {
...@@ -394,6 +402,14 @@ void FlattenInferMeta(const MetaTensor& x, ...@@ -394,6 +402,14 @@ void FlattenInferMeta(const MetaTensor& x,
// are the same. // are the same.
out->share_lod(x); out->share_lod(x);
} }
if (xshape == nullptr) return;
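  // XShape only records the input shape (prefixed with a leading 0); the
  // flatten grad kernel uses it to recover the original dims of X.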
std::vector<int64_t> xshape_dims(x_dims.size() + 1);
xshape_dims[0] = 0;
for (int i = 0; i < x_dims.size(); ++i) {
xshape_dims[i + 1] = x_dims[i];
}
xshape->set_dims(phi::make_ddim(xshape_dims));
xshape->share_lod(x);
} }
void GumbelSoftmaxInferMeta(const MetaTensor& x, void GumbelSoftmaxInferMeta(const MetaTensor& x,
......
...@@ -86,6 +86,12 @@ void FlattenInferMeta(const MetaTensor& x, ...@@ -86,6 +86,12 @@ void FlattenInferMeta(const MetaTensor& x,
int stop_axis, int stop_axis,
MetaTensor* out); MetaTensor* out);
void FlattenWithXShapeInferMeta(const MetaTensor& x,
int start_axis,
int stop_axis,
MetaTensor* out,
MetaTensor* xshape);
void GumbelSoftmaxInferMeta(const MetaTensor& x, void GumbelSoftmaxInferMeta(const MetaTensor& x,
float temperature, float temperature,
bool hard, bool hard,
......
...@@ -27,12 +27,14 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel) ...@@ -27,12 +27,14 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel)
# Some kernels depend on some targets that are not commonly used. # Some kernels depend on some targets that are not commonly used.
# These targets are not suitable for common dependencies. # These targets are not suitable for common dependencies.
# In this case, you need to manually generate them here. # In this case, you need to manually generate them here.
set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel set(MANUAL_BUILD_KERNELS deformable_conv_kernel deformable_conv_grad_kernel eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel
hierarchical_sigmoid_kernel hierarchical_sigmoid_grad_kernel hierarchical_sigmoid_kernel hierarchical_sigmoid_grad_kernel
matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel
put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel
softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel
triangular_solve_grad_kernel determinant_grad_kernel reduce_kernel) triangular_solve_grad_kernel determinant_grad_kernel reduce_kernel)
kernel_library(deformable_conv_kernel DEPS ${COMMON_KERNEL_DEPS} deformable_conv_functor)
kernel_library(deformable_conv_grad_kernel DEPS ${COMMON_KERNEL_DEPS} deformable_conv_functor)
kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function) kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function)
kernel_library(hierarchical_sigmoid_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_bit_code) kernel_library(hierarchical_sigmoid_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_bit_code)
kernel_library(hierarchical_sigmoid_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_bit_code) kernel_library(hierarchical_sigmoid_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_bit_code)
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/deformable_conv_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h"
namespace phi {
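// ModulatedDeformableCol2imCPUKernel is the mask-aware (v2) variant of the
// col2im gradient: when a modulation mask is given, the upstream column
// gradient is scaled by the mask before being scattered back to grad_im with
// bilinear gradient weights.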
template <typename T>
inline void ModulatedDeformableCol2imCPUKernel(
const int num_kernels,
const T* data_col,
const T* data_offset,
const T* data_mask,
const int channels,
const int height,
const int width,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int channel_per_deformable_group,
const int batch_size,
const int deformable_group,
const int height_col,
const int width_col,
T* grad_im) {
for (int thread = 0; thread < num_kernels; thread++) {
const int j = (thread / width_col / height_col / batch_size) % kernel_w;
const int i =
(thread / width_col / height_col / batch_size / kernel_w) % kernel_h;
const int c =
thread / width_col / height_col / batch_size / kernel_w / kernel_h;
const int deformable_group_index = c / channel_per_deformable_group;
int w_out = thread % width_col;
int h_out = (thread / width_col) % height_col;
int b = (thread / width_col / height_col) % batch_size;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const T* data_offset_ptr = data_offset +
(b * deformable_group + deformable_group_index) *
2 * kernel_h * kernel_w * height_col *
width_col;
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
const int data_mask_hw_ptr =
((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
T cur_top_grad = data_col[thread];
if (data_mask) {
const T* data_mask_ptr = data_mask +
(b * deformable_group + deformable_group_index) *
kernel_h * kernel_w * height_col * width_col;
const T mask = data_mask_ptr[data_mask_hw_ptr];
cur_top_grad *= mask;
}
const int cur_h = static_cast<int>(cur_inv_h_data);
const int cur_w = static_cast<int>(cur_inv_w_data);
for (int dy = -2; dy <= 2; dy++) {
for (int dx = -2; dx <= 2; dx++) {
if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
abs(cur_inv_w_data - (cur_w + dx)) < 1) {
int cur_bottom_grad_pos =
((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
T weight = DmcnGetGradientWeight(cur_inv_h_data,
cur_inv_w_data,
cur_h + dy,
cur_w + dx,
height,
width);
*(grad_im + cur_bottom_grad_pos) =
*(grad_im + cur_bottom_grad_pos) + weight * cur_top_grad;
}
}
}
}
}
template <typename T, typename Context>
void ModulatedDeformableCol2im(const Context& dev_ctx,
const T* data_col,
const T* data_offset,
const T* data_mask,
const std::vector<int64_t>& im_shape,
const std::vector<int64_t>& col_shape,
const std::vector<int64_t>& kernel_shape,
const std::vector<int>& pad,
const std::vector<int>& stride,
const std::vector<int>& dilation,
const int deformable_group,
T* grad_im) {
int channel_per_deformable_group = im_shape[0] / deformable_group;
int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
ModulatedDeformableCol2imCPUKernel(num_kernels,
data_col,
data_offset,
data_mask,
im_shape[0],
im_shape[1],
im_shape[2],
kernel_shape[2],
kernel_shape[3],
pad[0],
pad[1],
stride[0],
stride[1],
dilation[0],
dilation[1],
channel_per_deformable_group,
col_shape[1],
deformable_group,
col_shape[2],
col_shape[3],
grad_im);
}
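// Gradient w.r.t. the offsets and, optionally, the modulation mask: val
// accumulates the offset gradient, while mval accumulates the mask gradient,
// which is written only for even offset channels (one mask value per kernel
// position and deformable group).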
template <typename T>
void ModulatedDeformableCol2imCoordCPUKernel(
const int num_kernels,
const T* data_col,
const T* data_im,
const T* data_offset,
const T* data_mask,
const int channels,
const int height,
const int width,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int channel_per_deformable_group,
const int batch_size,
const int offset_channels,
const int deformable_group,
const int height_col,
const int width_col,
T* grad_offset,
T* grad_mask) {
for (int i = 0; i < num_kernels; i++) {
T val = 0, mval = 0;
const int w = i % width_col;
const int h = (i / width_col) % height_col;
const int c = (i / width_col / height_col) % offset_channels;
const int b = (i / width_col / height_col) / offset_channels;
const int deformable_group_index = c / (2 * kernel_h * kernel_w);
const int col_step = kernel_h * kernel_w;
int cnt = 0;
const T* data_col_ptr = data_col +
deformable_group_index *
channel_per_deformable_group * batch_size *
width_col * height_col;
const T* data_im_ptr = data_im +
(b * deformable_group + deformable_group_index) *
channel_per_deformable_group / kernel_h /
kernel_w * height * width;
const T* data_offset_ptr = data_offset +
(b * deformable_group + deformable_group_index) *
2 * kernel_h * kernel_w * height_col *
width_col;
const T* data_mask_ptr =
data_mask
? data_mask +
(b * deformable_group + deformable_group_index) * kernel_h *
kernel_w * height_col * width_col
: nullptr;
const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
for (int col_c = offset_c / 2; col_c < channel_per_deformable_group;
col_c += col_step) {
const int col_pos =
(((col_c * batch_size + b) * height_col) + h) * width_col + w;
const int bp_dir = offset_c % 2;
int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
int i =
(col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
int w_out = col_pos % width_col;
int h_out = (col_pos / width_col) % height_col;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const int data_offset_h_ptr =
(((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
const int data_offset_w_ptr =
(((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
w_out);
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
T inv_h = h_in + i * dilation_h + offset_h;
T inv_w = w_in + j * dilation_w + offset_w;
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
inv_h = inv_w = -2;
} else {
mval += data_col_ptr[col_pos] *
funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width,
width,
height,
width,
inv_h,
inv_w);
}
const T weight =
DmcnGetCoordinateWeight(inv_h,
inv_w,
height,
width,
data_im_ptr + cnt * height * width,
width,
bp_dir);
if (data_mask_ptr) {
const int data_mask_hw_ptr =
(((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
const T mask = data_mask_ptr[data_mask_hw_ptr];
val += weight * data_col_ptr[col_pos] * mask;
} else {
val += weight * data_col_ptr[col_pos];
}
cnt += 1;
}
grad_offset[i] = val;
if (grad_mask && offset_c % 2 == 0)
grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h *
kernel_w +
offset_c / 2) *
height_col +
h) *
width_col +
w] = mval;
}
}
template <typename T, typename Context>
void ModulatedDeformableCol2imCoord(const Context& dev_ctx,
const T* data_col,
const T* data_im,
const T* data_offset,
const T* data_mask,
const std::vector<int64_t>& im_shape,
const std::vector<int64_t>& col_shape,
const std::vector<int64_t>& kernel_shape,
const std::vector<int>& paddings,
const std::vector<int>& strides,
const std::vector<int>& dilations,
const int deformable_groups,
T* grad_offset,
T* grad_mask) {
int num_kernels = 2 * kernel_shape[2] * kernel_shape[3] * col_shape[1] *
col_shape[2] * col_shape[3] * deformable_groups;
int channel_per_deformable_group = col_shape[0] / deformable_groups;
ModulatedDeformableCol2imCoordCPUKernel(
num_kernels,
data_col,
data_im,
data_offset,
data_mask,
im_shape[0],
im_shape[1],
im_shape[2],
kernel_shape[2],
kernel_shape[3],
paddings[0],
paddings[1],
strides[0],
strides[1],
dilations[0],
dilations[1],
channel_per_deformable_group,
col_shape[1],
2 * kernel_shape[2] * kernel_shape[3] * deformable_groups,
deformable_groups,
col_shape[2],
col_shape[3],
grad_offset,
grad_mask);
}
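// FilterGradAddup adds the per-step weight gradient (dweight_3d) element-wise
// into the running filter gradient; the n/height/width arguments are unused
// in this CPU implementation.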
template <typename T, typename Context>
void FilterGradAddup(const Context& dev_ctx,
const int nthreads,
const int n,
const int height,
const int width,
const T* dweight_3d,
T* filter_grad) {
for (int i = 0; i < nthreads; i++) {
filter_grad[i] = filter_grad[i] + dweight_3d[i];
}
}
} // namespace phi
PD_REGISTER_KERNEL(deformable_conv_grad,
CPU,
ALL_LAYOUT,
phi::DeformableConvGradKernel,
float,
double) {}
...@@ -18,126 +18,6 @@ ...@@ -18,126 +18,6 @@
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/deformable_conv_kernel_impl.h" #include "paddle/phi/kernels/impl/deformable_conv_kernel_impl.h"
namespace phi {
template <typename T>
inline void ModulatedDeformableIm2colCPUKernel(
const int num_kernels,
const T* data_im,
const T* data_offset,
const T* data_mask,
const int height,
const int width,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int channel_per_deformable_group,
const int batch_size,
const int num_channels,
const int deformable_group,
const int height_col,
const int width_col,
T* data_col) {
for (int i = 0; i < num_kernels; i++) {
const int w_col = i % width_col;
const int h_col = (i / width_col) % height_col;
const int b_col = (i / width_col) / height_col % batch_size;
const int c_im = (i / width_col / height_col) / batch_size;
const int c_col = c_im * kernel_h * kernel_w;
const int deformable_group_index = c_im / channel_per_deformable_group;
const int h_in = h_col * stride_h - pad_h;
const int w_in = w_col * stride_w - pad_w;
T* data_col_ptr =
data_col +
((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
const T* data_im_ptr =
data_im + (b_col * num_channels + c_im) * height * width;
const T* data_offset_ptr =
data_offset +
(b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
kernel_w * height_col * width_col;
const T* data_mask_ptr =
data_mask +
(b_col * deformable_group + deformable_group_index) * kernel_h *
kernel_w * height_col * width_col;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
w_col;
const int data_mask_hw_ptr =
((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T mask = data_mask_ptr[data_mask_hw_ptr];
T val = static_cast<T>(0);
const T h_im = h_in + i * dilation_h + offset_h;
const T w_im = w_in + j * dilation_w + offset_w;
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
val =
DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im);
}
*data_col_ptr = val * mask;
data_col_ptr += batch_size * height_col * width_col;
}
}
}
}
template <typename T, typename Context>
void ModulatedDeformableIm2col(const Context& dev_ctx,
const T* data_im,
const T* data_offset,
const T* data_mask,
const std::vector<int64_t>& im_shape,
const std::vector<int64_t>& col_shape,
const std::vector<int64_t>& filter_shape,
const std::vector<int>& paddings,
const std::vector<int>& strides,
const std::vector<int>& dilations,
const int deformable_groups,
T* data_col) {
int channel_per_deformable_group = im_shape[0] / deformable_groups;
int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
// get outputs of im2col with offset by bilinear interpolation
ModulatedDeformableIm2colCPUKernel(num_kernels,
data_im,
data_offset,
data_mask,
im_shape[1],
im_shape[2],
filter_shape[2],
filter_shape[3],
paddings[0],
paddings[1],
strides[0],
strides[1],
dilations[0],
dilations[1],
channel_per_deformable_group,
col_shape[1],
im_shape[0],
deformable_groups,
col_shape[2],
col_shape[3],
data_col);
}
} // namespace phi
PD_REGISTER_KERNEL(deformable_conv,
CPU,
ALL_LAYOUT,
...
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void DeformableConvGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& offset,
const DenseTensor& filter,
paddle::optional<const DenseTensor&> mask,
const DenseTensor& out_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
int deformable_groups,
int groups,
int im2col_step,
DenseTensor* dx,
DenseTensor* offset_grad,
DenseTensor* filter_grad,
DenseTensor* mask_grad);
} // namespace phi
@@ -15,6 +15,7 @@
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/utils/optional.h"
namespace phi {
@@ -23,7 +24,7 @@ void DeformableConvKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& offset,
const DenseTensor& filter,
paddle::optional<const DenseTensor&> mask,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
...
@@ -25,6 +25,7 @@ void FlattenGradKernel(const Context& dev_ctx,
const DenseTensor& xshape,
DenseTensor* x_grad) {
auto xshape_dims = xshape.dims();
dev_ctx.Alloc(x_grad, out_grad.dtype());
auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size());
phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad);
x_grad->Resize(x_dims);
...
@@ -27,6 +27,7 @@ void FlattenKernel(const Context& dev_ctx,
int start_axis,
int stop_axis,
DenseTensor* out) {
dev_ctx.Alloc(out, x.dtype());
auto out_dims = out->dims();
phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
out->Resize(out_dims);
@@ -43,7 +44,6 @@ void FlattenWithXShape(const Context& dev_ctx,
DenseTensor* out,
DenseTensor* xshape) {
FlattenKernel<T, Context>(dev_ctx, x, start_axis, stop_axis, out);
funcs::SetXShape(x, xshape);
}
} // namespace phi
...
@@ -3,6 +3,7 @@ add_subdirectory(blas)
add_subdirectory(lapack)
add_subdirectory(detail)
math_library(deformable_conv_functor DEPS dense_tensor)
math_library(concat_and_split_functor DEPS dense_tensor)
math_library(gru_compute DEPS activation_functions math_function)
math_library(lstm_compute DEPS activation_functions)
...
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/funcs/deformable_conv_functor.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
namespace phi {
namespace funcs {
template <typename T>
inline void ModulatedDeformableIm2colCPUKernel(
const int num_kernels,
const T* data_im,
const T* data_offset,
const T* data_mask,
const int height,
const int width,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int channel_per_deformable_group,
const int batch_size,
const int num_channels,
const int deformable_group,
const int height_col,
const int width_col,
T* data_col) {
for (int i = 0; i < num_kernels; i++) {
const int w_col = i % width_col;
const int h_col = (i / width_col) % height_col;
const int b_col = (i / width_col) / height_col % batch_size;
const int c_im = (i / width_col / height_col) / batch_size;
const int c_col = c_im * kernel_h * kernel_w;
const int deformable_group_index = c_im / channel_per_deformable_group;
const int h_in = h_col * stride_h - pad_h;
const int w_in = w_col * stride_w - pad_w;
T* data_col_ptr =
data_col +
((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
const T* data_im_ptr =
data_im + (b_col * num_channels + c_im) * height * width;
const T* data_offset_ptr =
data_offset +
(b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
kernel_w * height_col * width_col;
const T* data_mask_ptr =
data_mask
? data_mask +
(b_col * deformable_group + deformable_group_index) *
kernel_h * kernel_w * height_col * width_col
: nullptr;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
w_col;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
T val = static_cast<T>(0);
const T h_im = h_in + i * dilation_h + offset_h;
const T w_im = w_in + j * dilation_w + offset_w;
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
val =
DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im);
}
*data_col_ptr = val;
if (data_mask_ptr) {
const int data_mask_hw_ptr =
((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
const T mask = data_mask_ptr[data_mask_hw_ptr];
*data_col_ptr *= mask;
}
data_col_ptr += batch_size * height_col * width_col;
}
}
}
}
template <typename T, typename Context>
void ModulatedDeformableIm2col(const Context& dev_ctx,
const T* data_im,
const T* data_offset,
const T* data_mask,
const std::vector<int64_t>& im_shape,
const std::vector<int64_t>& col_shape,
const std::vector<int64_t>& filter_shape,
const std::vector<int>& paddings,
const std::vector<int>& strides,
const std::vector<int>& dilations,
const int deformable_groups,
T* data_col) {
int channel_per_deformable_group = im_shape[0] / deformable_groups;
int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
// get outputs of im2col with offset by bilinear interpolation
ModulatedDeformableIm2colCPUKernel(num_kernels,
data_im,
data_offset,
data_mask,
im_shape[1],
im_shape[2],
filter_shape[2],
filter_shape[3],
paddings[0],
paddings[1],
strides[0],
strides[1],
dilations[0],
dilations[1],
channel_per_deformable_group,
col_shape[1],
im_shape[0],
deformable_groups,
col_shape[2],
col_shape[3],
data_col);
}
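// Note on the expected layouts (as used by the callers in this commit):
// im_shape is {C_in, H, W} for one sample, and col_shape is roughly
// {C_in * kernel_h * kernel_w, im2col_step, H_out, W_out}, so every kernel tap
// at every output location receives one bilinearly sampled (and optionally
// mask-modulated) value in data_col.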
template void ModulatedDeformableIm2col(
const phi::CPUContext& dev_ctx,
const float* data_im,
const float* data_offset,
const float* data_mask,
const std::vector<int64_t>& im_shape,
const std::vector<int64_t>& col_shape,
const std::vector<int64_t>& filter_shape,
const std::vector<int>& paddings,
const std::vector<int>& strides,
const std::vector<int>& dilations,
const int deformable_groups,
float* data_col);
template void ModulatedDeformableIm2col(
const phi::CPUContext& dev_ctx,
const double* data_im,
const double* data_offset,
const double* data_mask,
const std::vector<int64_t>& im_shape,
const std::vector<int64_t>& col_shape,
const std::vector<int64_t>& filter_shape,
const std::vector<int>& paddings,
const std::vector<int>& strides,
const std::vector<int>& dilations,
const int deformable_groups,
double* data_col);
} // namespace funcs
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/funcs/deformable_conv_functor.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
namespace phi {
namespace funcs {
static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaximumNumBlocks = 4096;
static inline int NumBlocks(const int N) {
return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
kNumMaximumNumBlocks);
}
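// The block count is capped at kNumMaximumNumBlocks; this is safe because the
// kernels below use a grid-stride loop (i += blockDim.x * gridDim.x), so every
// element is still visited even when num_kernels exceeds blocks * threads.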
template <typename T>
__global__ void ModulatedDeformableIm2colGpuKernel(
const int nthreads,
const T* data_im,
const T* data_offset,
const T* data_mask,
const int height,
const int width,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int channel_per_deformable_group,
const int batch_size,
const int num_channels,
const int deformable_group,
const int height_col,
const int width_col,
T* data_col) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x;
for (size_t i = index; i < nthreads; i += offset) {
const int w_col = i % width_col;
const int h_col = (i / width_col) % height_col;
const int b_col = (i / width_col) / height_col % batch_size;
const int c_im = (i / width_col / height_col) / batch_size;
const int c_col = c_im * kernel_h * kernel_w;
const int deformable_group_index = c_im / channel_per_deformable_group;
const int h_in = h_col * stride_h - pad_h;
const int w_in = w_col * stride_w - pad_w;
T* data_col_ptr =
data_col +
((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
const T* data_im_ptr =
data_im + (b_col * num_channels + c_im) * height * width;
const T* data_offset_ptr =
data_offset +
(b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
kernel_w * height_col * width_col;
const T* data_mask_ptr =
data_mask
? data_mask +
(b_col * deformable_group + deformable_group_index) *
kernel_h * kernel_w * height_col * width_col
: nullptr;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
w_col;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
T val = static_cast<T>(0);
const T h_im = h_in + i * dilation_h + offset_h;
const T w_im = w_in + j * dilation_w + offset_w;
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
val =
DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im);
}
*data_col_ptr = val;
if (data_mask_ptr) {
const int data_mask_hw_ptr =
((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
const T mask = data_mask_ptr[data_mask_hw_ptr];
*data_col_ptr *= mask;
}
data_col_ptr += batch_size * height_col * width_col;
}
}
}
}
template <typename T, typename Context>
void ModulatedDeformableIm2col(const Context& dev_ctx,
const T* data_im,
const T* data_offset,
const T* data_mask,
const std::vector<int64_t>& im_shape,
const std::vector<int64_t>& col_shape,
const std::vector<int64_t>& filter_shape,
const std::vector<int>& paddings,
const std::vector<int>& strides,
const std::vector<int>& dilations,
const int deformable_groups,
T* data_col) {
int channel_per_deformable_group = im_shape[0] / deformable_groups;
int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
int blocks = NumBlocks(num_kernels);
int threads = kNumCUDAThreads;
ModulatedDeformableIm2colGpuKernel<
T><<<blocks, threads, 0, dev_ctx.stream()>>>(num_kernels,
data_im,
data_offset,
data_mask,
im_shape[1],
im_shape[2],
filter_shape[2],
filter_shape[3],
paddings[0],
paddings[1],
strides[0],
strides[1],
dilations[0],
dilations[1],
channel_per_deformable_group,
col_shape[1],
im_shape[0],
deformable_groups,
col_shape[2],
col_shape[3],
data_col);
}
template void ModulatedDeformableIm2col(
const phi::GPUContext& dev_ctx,
const float* data_im,
const float* data_offset,
const float* data_mask,
const std::vector<int64_t>& im_shape,
const std::vector<int64_t>& col_shape,
const std::vector<int64_t>& filter_shape,
const std::vector<int>& paddings,
const std::vector<int>& strides,
const std::vector<int>& dilations,
const int deformable_groups,
float* data_col);
template void ModulatedDeformableIm2col(
const phi::GPUContext& dev_ctx,
const double* data_im,
const double* data_offset,
const double* data_mask,
const std::vector<int64_t>& im_shape,
const std::vector<int64_t>& col_shape,
const std::vector<int64_t>& filter_shape,
const std::vector<int>& paddings,
const std::vector<int>& strides,
const std::vector<int>& dilations,
const int deformable_groups,
double* data_col);
} // namespace funcs
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
namespace funcs {
template <typename T>
HOSTDEVICE T DmcnIm2colBilinear(const T* bottom_data,
const int data_width,
const int height,
const int width,
T h,
T w) {
int h_low = floor(h);
int w_low = floor(w);
int h_high = h_low + 1;
int w_high = w_low + 1;
T lh = h - h_low;
T lw = w - w_low;
T hh = 1 - lh;
T hw = 1 - lw;
T v1 =
(h_low >= 0 && w_low >= 0) ? bottom_data[h_low * data_width + w_low] : 0;
T v2 = (h_low >= 0 && w_high <= width - 1)
? bottom_data[h_low * data_width + w_high]
: 0;
T v3 = (h_high <= height - 1 && w_low >= 0)
? bottom_data[h_high * data_width + w_low]
: 0;
T v4 = (h_high <= height - 1 && w_high <= width - 1)
? bottom_data[h_high * data_width + w_high]
: 0;
T w1 = hh * hw;
T w2 = hh * lw;
T w3 = lh * hw;
T w4 = lh * lw;
return w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
}
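// Worked example of the bilinear weights above: sampling at h = w = 0.5 inside
// a patch gives lh = lw = 0.5, so w1 = w2 = w3 = w4 = 0.25 and the result is
// the average of the four neighboring pixels; neighbors that fall outside the
// image contribute 0 through the boundary checks on v1..v4.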
template <typename T, typename Context>
void ModulatedDeformableIm2col(const Context& dev_ctx,
const T* data_im,
const T* data_offset,
const T* data_mask,
const std::vector<int64_t>& im_shape,
const std::vector<int64_t>& col_shape,
const std::vector<int64_t>& filter_shape,
const std::vector<int>& paddings,
const std::vector<int>& strides,
const std::vector<int>& dilations,
const int deformable_groups,
T* data_col);
} // namespace funcs
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/deformable_conv_grad_kernel.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h"
namespace phi {
static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaximumNumBlocks = 4096;
static inline int NumBlocks(const int N) {
return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
kNumMaximumNumBlocks);
}
template <typename T>
__global__ void ModulatedDeformableCol2imGpuKernel(
const int nthreads,
const T* data_col,
const T* data_offset,
const T* data_mask,
const int channels,
const int height,
const int width,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int channel_per_deformable_group,
const int batch_size,
const int deformable_group,
const int height_col,
const int width_col,
T* grad_im) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x;
for (size_t thread = index; thread < nthreads; thread += offset) {
const int j = (thread / width_col / height_col / batch_size) % kernel_w;
const int i =
(thread / width_col / height_col / batch_size / kernel_w) % kernel_h;
const int c =
thread / width_col / height_col / batch_size / kernel_w / kernel_h;
const int deformable_group_index = c / channel_per_deformable_group;
int w_out = thread % width_col;
int h_out = (thread / width_col) % height_col;
int b = (thread / width_col / height_col) % batch_size;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const T* data_offset_ptr = data_offset +
(b * deformable_group + deformable_group_index) *
2 * kernel_h * kernel_w * height_col *
width_col;
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
const int data_mask_hw_ptr =
((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
T cur_top_grad = data_col[thread];
if (data_mask) {
const T* data_mask_ptr = data_mask +
(b * deformable_group + deformable_group_index) *
kernel_h * kernel_w * height_col * width_col;
const T mask = data_mask_ptr[data_mask_hw_ptr];
cur_top_grad *= mask;
}
const int cur_h = static_cast<int>(cur_inv_h_data);
const int cur_w = static_cast<int>(cur_inv_w_data);
for (int dy = -2; dy <= 2; dy++) {
for (int dx = -2; dx <= 2; dx++) {
if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
abs(cur_inv_w_data - (cur_w + dx)) < 1) {
int cur_bottom_grad_pos =
((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
T weight = DmcnGetGradientWeight(cur_inv_h_data,
cur_inv_w_data,
cur_h + dy,
cur_w + dx,
height,
width);
paddle::platform::CudaAtomicAdd(grad_im + cur_bottom_grad_pos,
weight * cur_top_grad);
}
}
}
}
}
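// The 5x5 (dy, dx) scan above is wider than strictly needed: DmcnGetGradientWeight
// is non-zero only for the floor/ceil neighbors of (cur_inv_h_data, cur_inv_w_data),
// so at most four pixels actually receive a contribution, and CudaAtomicAdd is used
// because different columns can map to the same input pixel.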
template <typename T, typename Context>
void ModulatedDeformableCol2im(const Context& dev_ctx,
const T* data_col,
const T* data_offset,
const T* data_mask,
const std::vector<int64_t>& im_shape,
const std::vector<int64_t>& col_shape,
const std::vector<int64_t>& kernel_shape,
const std::vector<int>& pad,
const std::vector<int>& stride,
const std::vector<int>& dilation,
const int deformable_group,
T* grad_im) {
int channel_per_deformable_group = im_shape[0] / deformable_group;
int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
int blocks = NumBlocks(num_kernels);
int threads = kNumCUDAThreads;
ModulatedDeformableCol2imGpuKernel<
T><<<blocks, threads, 0, dev_ctx.stream()>>>(num_kernels,
data_col,
data_offset,
data_mask,
im_shape[0],
im_shape[1],
im_shape[2],
kernel_shape[2],
kernel_shape[3],
pad[0],
pad[1],
stride[0],
stride[1],
dilation[0],
dilation[1],
channel_per_deformable_group,
col_shape[1],
deformable_group,
col_shape[2],
col_shape[3],
grad_im);
}
template <typename T>
__global__ void ModulatedDeformableCol2imCoordGpuKernel(
const int nthreads,
const T* data_col,
const T* data_im,
const T* data_offset,
const T* data_mask,
const int channels,
const int height,
const int width,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int channel_per_deformable_group,
const int batch_size,
const int offset_channels,
const int deformable_group,
const int height_col,
const int width_col,
T* grad_offset,
T* grad_mask) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x;
for (size_t i = index; i < nthreads; i += offset) {
T val = 0, mval = 0;
const int w = i % width_col;
const int h = (i / width_col) % height_col;
const int c = (i / width_col / height_col) % offset_channels;
const int b = (i / width_col / height_col) / offset_channels;
const int deformable_group_index = c / (2 * kernel_h * kernel_w);
const int col_step = kernel_h * kernel_w;
int cnt = 0;
const T* data_col_ptr = data_col +
deformable_group_index *
channel_per_deformable_group * batch_size *
width_col * height_col;
const T* data_im_ptr = data_im +
(b * deformable_group + deformable_group_index) *
channel_per_deformable_group / kernel_h /
kernel_w * height * width;
const T* data_offset_ptr = data_offset +
(b * deformable_group + deformable_group_index) *
2 * kernel_h * kernel_w * height_col *
width_col;
const T* data_mask_ptr =
data_mask
? data_mask +
(b * deformable_group + deformable_group_index) * kernel_h *
kernel_w * height_col * width_col
: nullptr;
const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
for (int col_c = offset_c / 2; col_c < channel_per_deformable_group;
col_c += col_step) {
const int col_pos =
(((col_c * batch_size + b) * height_col) + h) * width_col + w;
const int bp_dir = offset_c % 2;
int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
int i =
(col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
int w_out = col_pos % width_col;
int h_out = (col_pos / width_col) % height_col;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const int data_offset_h_ptr =
(((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
const int data_offset_w_ptr =
(((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
w_out);
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
T inv_h = h_in + i * dilation_h + offset_h;
T inv_w = w_in + j * dilation_w + offset_w;
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
inv_h = inv_w = -2;
} else {
mval += data_col_ptr[col_pos] *
funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width,
width,
height,
width,
inv_h,
inv_w);
}
const T weight =
DmcnGetCoordinateWeight(inv_h,
inv_w,
height,
width,
data_im_ptr + cnt * height * width,
width,
bp_dir);
if (data_mask_ptr) {
const int data_mask_hw_ptr =
(((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
const T mask = data_mask_ptr[data_mask_hw_ptr];
val += weight * data_col_ptr[col_pos] * mask;
} else {
val += weight * data_col_ptr[col_pos];
}
cnt += 1;
}
grad_offset[i] = val;
if (grad_mask && offset_c % 2 == 0)
grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h *
kernel_w +
offset_c / 2) *
height_col +
h) *
width_col +
w] = mval;
}
}
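// Each thread above handles one offset channel at one output location: even
// offset channels (offset_c % 2 == 0) correspond to the h-offset and odd ones
// to the w-offset (bp_dir), and the mask gradient is written once per kernel
// tap, guarded by the offset_c % 2 == 0 branch and a non-null grad_mask.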
template <typename T, typename Context>
void ModulatedDeformableCol2imCoord(const Context& dev_ctx,
const T* data_col,
const T* data_im,
const T* data_offset,
const T* data_mask,
const std::vector<int64_t>& im_shape,
const std::vector<int64_t>& col_shape,
const std::vector<int64_t>& kernel_shape,
const std::vector<int>& paddings,
const std::vector<int>& strides,
const std::vector<int>& dilations,
const int deformable_groups,
T* grad_offset,
T* grad_mask) {
int num_kernels = 2 * kernel_shape[2] * kernel_shape[3] * col_shape[1] *
col_shape[2] * col_shape[3] * deformable_groups;
int channel_per_deformable_group = col_shape[0] / deformable_groups;
int blocks = NumBlocks(num_kernels);
int threads = kNumCUDAThreads;
ModulatedDeformableCol2imCoordGpuKernel<
T><<<blocks, threads, 0, dev_ctx.stream()>>>(
num_kernels,
data_col,
data_im,
data_offset,
data_mask,
im_shape[0],
im_shape[1],
im_shape[2],
kernel_shape[2],
kernel_shape[3],
paddings[0],
paddings[1],
strides[0],
strides[1],
dilations[0],
dilations[1],
channel_per_deformable_group,
col_shape[1],
2 * kernel_shape[2] * kernel_shape[3] * deformable_groups,
deformable_groups,
col_shape[2],
col_shape[3],
grad_offset,
grad_mask);
}
template <typename T>
__global__ void FilterGradAddupGpuKernel(const int nthreads,
const int n,
const int height,
const int width,
const T* dweight_3d,
T* filter_grad) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x;
for (size_t i = index; i < nthreads; i += offset) {
filter_grad[i] = filter_grad[i] + dweight_3d[i];
}
}
template <typename T, typename Context>
void FilterGradAddup(const Context& dev_ctx,
const int nthreads,
const int n,
const int height,
const int width,
const T* dweight_3d,
T* filter_grad) {
FilterGradAddupGpuKernel<
T><<<NumBlocks(nthreads), kNumCUDAThreads, 0, dev_ctx.stream()>>>(
nthreads, n, height, width, dweight_3d, filter_grad);
}
} // namespace phi
PD_REGISTER_KERNEL(deformable_conv_grad,
GPU,
ALL_LAYOUT,
phi::DeformableConvGradKernel,
float,
double) {}
@@ -16,142 +16,8 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/impl/deformable_conv_kernel_impl.h"
namespace phi {
static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaximumNumBlocks = 4096;
static inline int NumBlocks(const int N) {
return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
kNumMaximumNumBlocks);
}
template <typename T>
__global__ void ModulatedDeformableIm2colGpuKernel(
const int nthreads,
const T* data_im,
const T* data_offset,
const T* data_mask,
const int height,
const int width,
const int kernel_h,
const int kernel_w,
const int pad_h,
const int pad_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int channel_per_deformable_group,
const int batch_size,
const int num_channels,
const int deformable_group,
const int height_col,
const int width_col,
T* data_col) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x;
for (size_t i = index; i < nthreads; i += offset) {
const int w_col = i % width_col;
const int h_col = (i / width_col) % height_col;
const int b_col = (i / width_col) / height_col % batch_size;
const int c_im = (i / width_col / height_col) / batch_size;
const int c_col = c_im * kernel_h * kernel_w;
const int deformable_group_index = c_im / channel_per_deformable_group;
const int h_in = h_col * stride_h - pad_h;
const int w_in = w_col * stride_w - pad_w;
T* data_col_ptr =
data_col +
((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
const T* data_im_ptr =
data_im + (b_col * num_channels + c_im) * height * width;
const T* data_offset_ptr =
data_offset +
(b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
kernel_w * height_col * width_col;
const T* data_mask_ptr =
data_mask +
(b_col * deformable_group + deformable_group_index) * kernel_h *
kernel_w * height_col * width_col;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
w_col;
const int data_mask_hw_ptr =
((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T mask = data_mask_ptr[data_mask_hw_ptr];
T val = static_cast<T>(0);
const T h_im = h_in + i * dilation_h + offset_h;
const T w_im = w_in + j * dilation_w + offset_w;
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
val =
DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im);
}
*data_col_ptr = val * mask;
data_col_ptr += batch_size * height_col * width_col;
}
}
}
}
template <typename T, typename Context>
void ModulatedDeformableIm2col(const Context& dev_ctx,
const T* data_im,
const T* data_offset,
const T* data_mask,
const std::vector<int64_t>& im_shape,
const std::vector<int64_t>& col_shape,
const std::vector<int64_t>& filter_shape,
const std::vector<int>& paddings,
const std::vector<int>& strides,
const std::vector<int>& dilations,
const int deformable_groups,
T* data_col) {
int channel_per_deformable_group = im_shape[0] / deformable_groups;
int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
int blocks = NumBlocks(num_kernels);
int threads = kNumCUDAThreads;
ModulatedDeformableIm2colGpuKernel<
T><<<blocks, threads, 0, dev_ctx.stream()>>>(num_kernels,
data_im,
data_offset,
data_mask,
im_shape[1],
im_shape[2],
filter_shape[2],
filter_shape[3],
paddings[0],
paddings[1],
strides[0],
strides[1],
dilations[0],
dilations[1],
channel_per_deformable_group,
col_shape[1],
im_shape[0],
deformable_groups,
col_shape[2],
col_shape[3],
data_col);
}
} // namespace phi
PD_REGISTER_KERNEL(deformable_conv,
GPU,
ALL_LAYOUT,
...
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/deformable_conv_functor.h"
namespace phi {
template <typename T>
HOSTDEVICE T DmcnGetGradientWeight(T argmax_h,
T argmax_w,
const int h,
const int w,
const int height,
const int width) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
T weight = 0;
weight = (h == argmax_h_low && w == argmax_w_low)
? (h + 1 - argmax_h) * (w + 1 - argmax_w)
: weight;
weight = (h == argmax_h_low && w == argmax_w_high)
? (h + 1 - argmax_h) * (argmax_w + 1 - w)
: weight;
weight = (h == argmax_h_high && w == argmax_w_low)
? (argmax_h + 1 - h) * (w + 1 - argmax_w)
: weight;
weight = (h == argmax_h_high && w == argmax_w_high)
? (argmax_h + 1 - h) * (argmax_w + 1 - w)
: weight;
return weight;
}
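// DmcnGetGradientWeight returns the bilinear coefficient with which the integer
// pixel (h, w) participates in the sample taken at (argmax_h, argmax_w); the
// col2im pass uses it to scatter column gradients back onto grad_im.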
template <typename T>
HOSTDEVICE T DmcnGetCoordinateWeight(T argmax_h,
T argmax_w,
const int height,
const int width,
const T* im_data,
const int data_width,
const int bp_dir) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
T weight = 0;
if (bp_dir == 0) {
weight += (argmax_h_low >= 0 && argmax_w_low >= 0)
? -1 * (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_low * data_width + argmax_w_low]
: 0;
weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1)
? -1 * (argmax_w - argmax_w_low) *
im_data[argmax_h_low * data_width + argmax_w_high]
: 0;
weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0)
? (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_high * data_width + argmax_w_low]
: 0;
weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
? (argmax_w - argmax_w_low) *
im_data[argmax_h_high * data_width + argmax_w_high]
: 0;
} else if (bp_dir == 1) {
weight += (argmax_h_low >= 0 && argmax_w_low >= 0)
? -1 * (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_low]
: 0;
weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1)
? (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_high]
: 0;
weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0)
? -1 * (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_low]
: 0;
weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
? (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_high]
: 0;
}
return weight;
}
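// DmcnGetCoordinateWeight is the partial derivative of the bilinear sample with
// respect to the sampling coordinate: bp_dir == 0 differentiates along h and
// bp_dir == 1 along w, which feeds the gradient of the learned offsets.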
template <typename T, typename Context>
void ModulatedDeformableCol2imCoord(const Context& dev_ctx,
const T* data_col,
const T* data_im,
const T* data_offset,
const T* data_mask,
const std::vector<int64_t>& im_shape,
const std::vector<int64_t>& col_shape,
const std::vector<int64_t>& kernel_shape,
const std::vector<int>& paddings,
const std::vector<int>& strides,
const std::vector<int>& dilations,
const int deformable_groups,
T* grad_offset,
T* grad_mask);
template <typename T, typename Context>
void ModulatedDeformableCol2im(const Context& dev_ctx,
const T* data_col,
const T* data_offset,
const T* data_mask,
const std::vector<int64_t>& im_shape,
const std::vector<int64_t>& col_shape,
const std::vector<int64_t>& kernel_shape,
const std::vector<int>& pad,
const std::vector<int>& stride,
const std::vector<int>& dilation,
const int deformable_group,
T* grad_im);
template <typename T, typename Context>
void FilterGradAddup(const Context& dev_ctx,
const int nthreads,
const int n,
const int height,
const int width,
const T* dweight_3d,
T* filter_grad);
template <typename T, typename Context>
void DeformableConvGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& offset,
const DenseTensor& filter,
paddle::optional<const DenseTensor&> mask,
const DenseTensor& out_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
int deformable_groups,
int groups,
int im2col_step,
DenseTensor* dx,
DenseTensor* offset_grad,
DenseTensor* filter_grad,
DenseTensor* mask_grad) {
const int batch_size = static_cast<int>(x.dims()[0]);
DDim input_shape = phi::slice_ddim(x.dims(), 1, x.dims().size());
std::vector<int64_t> input_shape_vec = phi::vectorize(input_shape);
std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(phi::vectorize(out_grad.dims()));
std::vector<int64_t> col_buffer_shape_vec(filter_shape_vec.size());
col_buffer_shape_vec[0] = x.dims()[1] * filter.dims()[2] * filter.dims()[3];
col_buffer_shape_vec[1] = im2col_step;
for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) {
col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2];
}
std::vector<int64_t> output_buffer_shape_vec(1);
output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] *
output_shape_vec[2] * output_shape_vec[3];
DenseTensor col_buffer = Empty<T>(dev_ctx, col_buffer_shape_vec);
DenseTensor output_buffer;
output_buffer.ShareDataWith(out_grad).Resize(
make_ddim(output_buffer_shape_vec));
int64_t M =
input_shape_vec[0] / groups * filter_shape_vec[2] * filter_shape_vec[3];
int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
int64_t K = output_shape_vec[1] / groups;
DDim weight_3d_shape = {groups, K, M};
DDim out_grad_4d_shape = {batch_size / im2col_step, groups, K, N};
DDim col_buffer_3d_shape = {groups, M, N};
DDim filter_grad_shape = {groups, K, M};
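// Shapes used by the two GEMMs below (per group g):
//   M = (C_in / groups) * kernel_h * kernel_w,
//   N = im2col_step * H_out * W_out,
//   K = C_out / groups.
// col_buffer[g] (M x N) = filter[g]^T (M x K) * out_grad[g] (K x N) recovers the
// column gradients, and dweight[g] (K x M) = out_grad[g] (K x N) * col_buffer[g]^T
// (N x M) accumulates the filter gradient.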
DenseTensor weight_3d;
weight_3d.ShareDataWith(filter).Resize(weight_3d_shape);
DenseTensor out_grad_4d;
out_grad_4d.ShareDataWith(output_buffer).Resize(out_grad_4d_shape);
DenseTensor col_buffer_3d;
col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape);
phi::funcs::SetConstant<Context, T> set_zero;
auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
int input_dim = x.numel() / x.dims()[0];
int input_offset_dim = offset.numel() / offset.dims()[0];
int input_mask_dim = mask ? mask->numel() / mask->dims()[0] : 0;
if (filter_grad) {
Full<T>(dev_ctx,
{filter_grad_shape.Get(), filter_grad_shape.size()},
0,
filter_grad);
}
if (dx) {
dev_ctx.template Alloc<T>(dx);
set_zero(dev_ctx, dx, static_cast<T>(0));
}
if (offset_grad) {
dev_ctx.template Alloc<T>(offset_grad);
set_zero(dev_ctx, offset_grad, static_cast<T>(0));
if (mask_grad) {
dev_ctx.template Alloc<T>(mask_grad);
set_zero(dev_ctx, mask_grad, static_cast<T>(0));
}
}
for (int i = 0; i < batch_size / im2col_step; ++i) {
DenseTensor out_grad_3d = out_grad_4d.Slice(i, i + 1).Resize(
phi::slice_ddim(out_grad_4d.dims(), 1, out_grad_4d.dims().size()));
for (int g = 0; g < groups; ++g) {
DenseTensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize(
phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size()));
DenseTensor out_grad_3d_slice = out_grad_3d.Slice(g, g + 1).Resize(
phi::slice_ddim(out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
DenseTensor col_buffer_3d_slice =
col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
blas.MatMul(weight_3d_slice,
true,
out_grad_3d_slice,
false,
T(1.0),
&col_buffer_3d_slice,
T(0.0));
}
col_buffer.Resize(make_ddim(col_buffer_shape_vec));
T* col_buffer_ptr = col_buffer.data<T>();
const T* input_ptr = x.data<T>();
const T* offset_ptr = offset.data<T>();
const T* mask_data_ptr =
mask ? mask->data<T>() + i * im2col_step * input_mask_dim : nullptr;
if (offset_grad) {
T* offset_grad_ptr = offset_grad->data<T>();
T* mask_grad_data_ptr =
mask_grad ? mask_grad->data<T>() + i * im2col_step * input_mask_dim
: nullptr;
// get grad of offset and mask
ModulatedDeformableCol2imCoord(
dev_ctx,
col_buffer_ptr,
input_ptr + i * im2col_step * input_dim,
offset_ptr + i * im2col_step * input_offset_dim,
mask_data_ptr,
input_shape_vec,
col_buffer_shape_vec,
filter_shape_vec,
paddings,
strides,
dilations,
deformable_groups,
offset_grad_ptr + i * im2col_step * input_offset_dim,
mask_grad_data_ptr);
}
if (dx) {
T* dx_ptr = dx->data<T>();
// get grad of input
ModulatedDeformableCol2im(dev_ctx,
col_buffer_ptr,
offset_ptr + i * im2col_step * input_offset_dim,
mask_data_ptr,
input_shape_vec,
col_buffer_shape_vec,
filter_shape_vec,
paddings,
strides,
dilations,
deformable_groups,
dx_ptr + i * im2col_step * input_dim);
dx->Resize(x.dims());
}
funcs::ModulatedDeformableIm2col(
dev_ctx,
input_ptr + i * im2col_step * input_dim,
offset_ptr + i * im2col_step * input_offset_dim,
mask_data_ptr,
input_shape_vec,
col_buffer_shape_vec,
filter_shape_vec,
paddings,
strides,
dilations,
deformable_groups,
col_buffer_ptr);
col_buffer_3d.Resize(col_buffer_3d_shape);
if (filter_grad) {
DenseTensor dweight_3d = Empty<T>(
dev_ctx, {filter_grad_shape.Get(), filter_grad_shape.size()});
for (int g = 0; g < groups; ++g) {
DenseTensor out_grad_3d_slice = out_grad_3d.Slice(g, g + 1).Resize(
phi::slice_ddim(out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
DenseTensor col_buffer_3d_slice =
col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
DenseTensor dweight_3d_slice = dweight_3d.Slice(g, g + 1).Resize(
phi::slice_ddim(dweight_3d.dims(), 1, dweight_3d.dims().size()));
blas.MatMul(out_grad_3d_slice,
false,
col_buffer_3d_slice,
true,
T(1.0),
&dweight_3d_slice,
T(0.0));
}
// update grad of weights
FilterGradAddup<T>(dev_ctx,
dweight_3d.numel(),
groups,
K,
M,
dweight_3d.data<T>(),
filter_grad->data<T>());
}
}
if (filter_grad) {
filter_grad->Resize(filter.dims());
}
}
} // namespace phi
@@ -18,66 +18,17 @@
#include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/deformable_conv_functor.h"
#include "paddle/utils/optional.h"
namespace phi {
template <typename T>
HOSTDEVICE T DmcnIm2colBilinear(const T* bottom_data,
const int data_width,
const int height,
const int width,
T h,
T w) {
int h_low = floor(h);
int w_low = floor(w);
int h_high = h_low + 1;
int w_high = w_low + 1;
T lh = h - h_low;
T lw = w - w_low;
T hh = 1 - lh;
T hw = 1 - lw;
T v1 =
(h_low >= 0 && w_low >= 0) ? bottom_data[h_low * data_width + w_low] : 0;
T v2 = (h_low >= 0 && w_high <= width - 1)
? bottom_data[h_low * data_width + w_high]
: 0;
T v3 = (h_high <= height - 1 && w_low >= 0)
? bottom_data[h_high * data_width + w_low]
: 0;
T v4 = (h_high <= height - 1 && w_high <= width - 1)
? bottom_data[h_high * data_width + w_high]
: 0;
T w1 = hh * hw;
T w2 = hh * lw;
T w3 = lh * hw;
T w4 = lh * lw;
return w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
}
template <typename T, typename Context>
void ModulatedDeformableIm2col(const Context& dev_ctx,
const T* data_im,
const T* data_offset,
const T* data_mask,
const std::vector<int64_t>& im_shape,
const std::vector<int64_t>& col_shape,
const std::vector<int64_t>& filter_shape,
const std::vector<int>& paddings,
const std::vector<int>& strides,
const std::vector<int>& dilations,
const int deformable_groups,
T* data_col);
template <typename T, typename Context>
void DeformableConvKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& offset,
const DenseTensor& filter,
paddle::optional<const DenseTensor&> mask,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
@@ -125,28 +76,31 @@ void DeformableConvKernel(const Context& dev_ctx,
int input_dim = x.numel() / x.dims()[0];
int input_offset_dim = offset.numel() / offset.dims()[0];
int input_mask_dim = mask ? mask->numel() / mask->dims()[0] : 0;
const T* input_ptr = x.data<T>();
const T* offset_ptr = offset.data<T>();
const T* mask_ptr = mask ? mask->data<T>() : nullptr;
T* col_buffer_ptr = col_buffer.data<T>();
auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
for (int i = 0; i < batch_size / im2col_step; ++i) {
const T* temp_mask_ptr =
mask_ptr ? mask_ptr + i * im2col_step * input_mask_dim : nullptr;
funcs::ModulatedDeformableIm2col(
dev_ctx,
input_ptr + i * im2col_step * input_dim,
offset_ptr + i * im2col_step * input_offset_dim,
temp_mask_ptr,
input_shape_vec,
col_buffer_shape_vec,
filter_shape_vec,
paddings,
strides,
dilations,
deformable_groups,
col_buffer_ptr);
DenseTensor output_3d = output_4d.Slice(i, i + 1).Resize(
phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size()));
// get the product of pixel and weight
...
@@ -29,6 +29,34 @@ KernelSignature DeformableConvOpArgumentMapping(
{"Output"});
}
KernelSignature DeformableConvGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"deformable_conv_grad",
{"Input", "Offset", "Filter", "Mask", GradVarName("Output")},
{"strides",
"paddings",
"dilations",
"deformable_groups",
"groups",
"im2col_step"},
{GradVarName("Input"),
GradVarName("Offset"),
GradVarName("Filter"),
GradVarName("Mask")});
}
} // namespace phi
PD_REGISTER_BASE_KERNEL_NAME(deformable_conv_v1, deformable_conv);
PD_REGISTER_BASE_KERNEL_NAME(deformable_conv_v1_grad, deformable_conv_grad);
PD_REGISTER_ARG_MAPPING_FN(deformable_conv,
phi::DeformableConvOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(deformable_conv_grad,
phi::DeformableConvGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(deformable_conv_v1,
phi::DeformableConvOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(deformable_conv_v1_grad,
phi::DeformableConvGradOpArgumentMapping);
@@ -15,6 +15,7 @@
import abc
import numpy as np
import paddle
from .utils import to_list
from paddle.io import DataLoader, DistributedBatchSampler
@@ -51,10 +52,11 @@ class NonIterableGeneratorLoader(DistributedDataLoader):
places,
batch_size=1,
epochs=1,
steps_per_epoch=None,
data_parallel_world_size=None,
data_parallel_rank=None,
drop_last=False,
inputs=[]):
self.feed_list = feed_list
self.places = places
self.steps_per_epoch = steps_per_epoch
@@ -62,6 +64,8 @@
dataset, batch_size, epochs, data_parallel_world_size,
data_parallel_rank, drop_last)
self._inner_dataloader = self._create_inner_dataloader()
self._steps = self._infer_steps()
self._inputs = inputs
def __iter__(self):
self._cur_step = 0
@@ -69,22 +73,38 @@
return self
def __next__(self):
if self._cur_step < self._steps:
self._cur_step += 1
else:
self._inner_dataloader.reset()
raise StopIteration
def _infer_steps(self):
if self.steps_per_epoch is not None:
return self.steps_per_epoch
try:
steps_per_epoch = len(self.dataset) // self.batch_size
except:
raise ValueError(
"Pleace set `steps_per_epoch` or implement `__len__` methond in dataset class."
)
return steps_per_epoch
def _create_inner_dataloader(self):
def data_generator():
batch_data = None
for step, data in enumerate(self.dataset):
if not isinstance(data, list):
data = to_list(data)
if batch_data is None:
batch_data = [[] for i in range(len(data))]
for idx in range(len(data)):
batch_data[idx].append(data[idx])
if (step + 1) % self.batch_size == 0:
yield batch_data
batch_data = None
dataloader = paddle.fluid.io.DataLoader.from_generator(
...
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
import re
import os
import errno
import pickle
import warnings
import logging
import numpy as np
import paddle
from paddle import fluid
from paddle.fluid import core
from paddle.fluid.framework import static_only
from .utils import get_dist_attr
from .converter import Converter
from .process_group import _g_process_group_map
from ..utils import get_logger
def check_filename(re_exp, filename):
if re.search(re_exp, filename):
return True
else:
return False
def _process_path(path):
filename = os.path.basename(path)
if filename == "":
raise ValueError(
"path should be of 'dirname/filename' format, but received filename is empty string"
)
try:
dirname = os.path.dirname(path)
os.makedirs(dirname)
except OSError as e:
if e.errno != errno.EEXIST:
raise
return dirname, filename
class DistributedSaver:
def __init__(self):
self._logger = get_logger(logging.INFO)
def save(self, path, serial_program, dist_main_program, dist_context):
dirname, filename = _process_path(path)
rank_id = paddle.distributed.get_rank()
# save serial program when rank id is 0
if rank_id == 0:
self._save_rank_mapping(dirname)
serial_model_filename = filename + "_serial.pdmodel"
serial_model_path = os.path.join(dirname, serial_model_filename)
with open(serial_model_path, "wb") as f:
f.write(serial_program.desc.serialize_to_string())
# save distributed main program
dist_model_filename = filename + "_dist" + str(rank_id) + ".pdmodel"
dist_model_path = os.path.join(dirname, dist_model_filename)
with open(dist_model_path, "wb") as f:
f.write(dist_main_program.desc.serialize_to_string())
# save distributed params
dist_param_filename = filename + "_dist" + str(rank_id) + ".pdparams"
dist_param_path = os.path.join(dirname, dist_param_filename)
dist_param = {
k: np.array(v)
for k, v in dist_main_program.state_dict().items()
}
with open(dist_param_path, "wb") as f:
pickle.dump(dist_param, f)
# save distributed attribute
dist_attr_filename = filename + "_dist" + str(rank_id) + ".pdattr"
dist_attr_path = os.path.join(dirname, dist_attr_filename)
dist_attrs = get_dist_attr(dist_main_program, dist_context)
with open(dist_attr_path, "wb") as f:
pickle.dump(dist_attrs, f)
# TODO:save cluster.json
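# Illustrative sketch (hypothetical path "ckpt/model", rank 0): save() writes
#   ckpt/model_serial.pdmodel   - serialized serial program (rank 0 only)
#   ckpt/model_dist0.pdmodel    - this rank's distributed main program
#   ckpt/model_dist0.pdparams   - pickled {param name: numpy array} state dict
#   ckpt/model_dist0.pdattr     - pickled distributed attributes
#   ckpt/rank_mapping.csv       - ring/rank mapping (rank 0 only)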
def load(self,
path,
program,
dist_context,
strict=True,
load_optimizer=True):
# TODO: if `program` is None, load `path.pdmodel`.
filename = os.path.basename(path)
if filename == "":
raise ValueError(
"path should be of 'dirname/filename' format, but received filename is empty string"
)
dirname = os.path.dirname(path)
# load path.pdparam
param_file_list = []
for param_file in os.listdir(dirname):
if check_filename('{}(.*)_dist(.*).pdparams'.format(filename),
param_file):
param_file_list.append(os.path.join(dirname, param_file))
param_file_list.sort()
self._logger.info("Load distributed attribute file: {}".format(
param_file_list))
param_dict = {}
for param_file in param_file_list:
with open(param_file, 'rb') as f:
state_dict_info = pickle.load(f, encoding='latin1')
for name, value in state_dict_info.items():
if name in param_dict:
param_dict[name].append(np.array(value))
else:
param_dict[name] = [np.array(value)]
# load path.pdattr
dist_attr_file_list = []
for dist_attr_file in os.listdir(dirname):
if check_filename('{}(.*)_dist(.*).pdattr'.format(filename),
dist_attr_file):
dist_attr_file_list.append(
os.path.join(dirname, dist_attr_file))
dist_attr_file_list.sort()
self._logger.info("Load distributed attribute file: {}".format(
dist_attr_file_list))
pre_dist_attr = {}
for dist_attr_file in dist_attr_file_list:
with open(dist_attr_file, 'rb') as f:
dist_attr = pickle.load(f, encoding='latin1')
for name, attr in dist_attr.items():
if name not in pre_dist_attr:
pre_dist_attr[name] = attr
# get current dist_attr
cur_dist_attr = get_dist_attr(program, dist_context)
        # convert saved parameters from the previous dist_attr layout to the current one
converter = Converter(param_dict, pre_dist_attr, cur_dist_attr)
param_dict = converter.convert(strict=strict)
program.set_state_dict(param_dict)
def save_inference_model(self, path, feed_vars, fetch_vars, exe, **kwargs):
dirname, filename = _process_path(path)
# save distributed inference program
rank_id = paddle.distributed.get_rank()
if rank_id == 0:
self._save_rank_mapping(dirname)
op_role_key = core.op_proto_and_checker_maker.kOpRoleAttrName()
op_role_forward = int(core.op_proto_and_checker_maker.OpRole.Forward)
dist_main_prog = kwargs.get('program', None)
if not dist_main_prog:
dist_main_prog = fluid.default_main_program()
global_block = dist_main_prog.global_block()
ops = global_block.ops
        feed_vars_names = [var.name for var in feed_vars]
        fetch_vars_names = [var.name for var in fetch_vars]
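        # Scan the forward-role ops: read/feed/recv_v2 ops contribute extra feed
        # variables, send_v2 ops contribute extra fetch variables, and last_idx
        # tracks the last forward op that sends or produces a fetched variable.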
last_idx = -1
for idx, op in enumerate(ops):
if op.attr(op_role_key) != op_role_forward:
continue
if op.type == "read" or op.type == "feed" or op.type == 'recv_v2':
feed_vars_names += op.output("Out")
if op.type == "send_v2":
fetch_vars_names += op.input("X")
last_idx = max(idx, last_idx)
for out_name in op.output_arg_names:
if out_name in fetch_vars_names:
last_idx = max(idx, last_idx)
used_inputs = []
used_outputs = []
for idx, op in enumerate(ops):
if idx > last_idx:
break
used_inputs += op.input_arg_names
used_outputs += op.output_arg_names
dist_feed_vars_names = list(set(feed_vars_names) & set(used_inputs))
dist_fetch_vars_names = list(set(fetch_vars_names) & set(used_outputs))
dist_feed_vars = [
global_block.vars[name] for name in dist_feed_vars_names
]
dist_fetch_vars = [
global_block.vars[name] for name in dist_fetch_vars_names
]
# NOTE: `paddle.static.save_inference_model` does not support subblock.
dist_filename = filename + "_dist" + str(rank_id)
dist_path = os.path.join(dirname, dist_filename)
paddle.static.save_inference_model(
dist_path,
dist_feed_vars,
dist_fetch_vars,
exe,
program=dist_main_prog)
    def _save_rank_mapping(self, dirname):
        path = os.path.join(dirname, 'rank_mapping.csv')
        with open(path, 'w') as f:
            f.write('[ring_id -> ranks]\n')
            for process_group in _g_process_group_map.values():
                ring_id = process_group._group_id
                ranks = [str(rank) for rank in process_group._ranks]
                f.write(str(ring_id) + "," + ",".join(ranks) + '\n')

            f.write('[rank -> ring_ids]\n')
            rank_to_id_dict = {}
            for process_group in _g_process_group_map.values():
                ring_id = process_group._group_id
                for rank in process_group._ranks:
                    if rank in rank_to_id_dict:
                        rank_to_id_dict[rank].append(str(ring_id))
                    else:
                        rank_to_id_dict[rank] = [str(ring_id)]
            for rank, ring_ids in rank_to_id_dict.items():
                f.write(str(rank) + "," + ",".join(ring_ids) + "\n")
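A minimal sketch of the file layout DistributedSaver.save is expected to produce, assuming a hypothetical checkpoint prefix "./ckpt/mlp" and rank 0; the names follow the naming scheme used in save() and _save_rank_mapping() above.

import os

ckpt_dir, prefix = "./ckpt", "mlp"   # hypothetical arguments, for illustration only
expected_files = [
    prefix + "_serial.pdmodel",      # serialized serial program (written on rank 0)
    prefix + "_dist0.pdmodel",       # distributed main program for rank 0
    prefix + "_dist0.pdparams",      # pickled numpy parameter dict for rank 0
    prefix + "_dist0.pdattr",        # pickled distributed attributes for rank 0
    "rank_mapping.csv",              # '[ring_id -> ranks]' / '[rank -> ring_ids]' sections (rank 0)
]
for name in expected_files:
    print(os.path.join(ckpt_dir, name))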
...@@ -19,138 +19,158 @@ from collections import defaultdict ...@@ -19,138 +19,158 @@ from collections import defaultdict
import paddle import paddle
from paddle import fluid from paddle import fluid
from paddle.io import Dataset from paddle.io import Dataset
from paddle.fluid.backward import append_backward from paddle.metric import Metric
import paddle.fluid.core as core
from paddle.static import InputSpec from paddle.static import InputSpec
from paddle.fluid import core
from paddle.fluid import program_guard from paddle.fluid import program_guard
from paddle.fluid.backward import append_backward
from paddle.fluid.framework import Operator from paddle.fluid.framework import Operator
from paddle.fluid.framework import _current_expected_place as _get_device from paddle.fluid.framework import _current_expected_place as _get_device
from paddle.fluid.dygraph.parallel import ParallelEnv from paddle.fluid.dygraph.parallel import ParallelEnv
from paddle.distributed.passes import new_pass, PassContext from paddle.distributed.passes import new_pass, PassContext
from paddle.distributed.utils import get_logger from paddle.distributed.utils import get_logger
from .dist_loader import NonIterableGeneratorLoader
from .dist_op import DistributedOperator
from .dist_tensor import DistributedTensor
from .dist_context import DistributedContext
from .dist_context import get_default_distributed_context
from .dist_context import set_default_distributed_context
from .process_group import get_all_process_groups
from .process_group import get_process_group
from .process_group import get_world_process_group
from .process_group import _g_process_group_map, ProcessGroup
from .completion import Completer
from .partitioner import Partitioner
from .reshard import reshard, HAS_SENT, HAS_RECV, HAS_ALLGATHER
from .cluster import Cluster
from .mapper import mapping from .mapper import mapping
from .cluster import Cluster
from .reshard import reshard
from .planner import Planner from .planner import Planner
from .utils import make_data_unshard from .completion import Completer
from .utils import set_grad_var_shape from .partitioner import Partitioner
from .utils import print_program_with_dist_attr from .dist_op import DistributedOperator
from .utils import SerialProgramInfo from .dist_saver import DistributedSaver
from .dist_loader import NonIterableGeneratorLoader
from .utils import make_data_unshard, set_grad_var_shape
from .utils import print_program_with_dist_attr, to_list
from .process_group import get_all_process_groups, get_world_process_group
from .dist_context import DistributedContext, get_default_distributed_context
paddle.enable_static() paddle.enable_static()
def to_list(value):
if value is None:
return value
if isinstance(value, (list, tuple)):
return list(value)
return [value]
class Engine: class Engine:
def __init__(self, model=None, data_spec=None, cluster=None, strategy=None): def __init__(self,
model=None,
inputs_spec=None,
labels_spec=None,
cluster=None,
strategy=None):
self.model = model self.model = model
self.data_spec = data_spec self.inputs_spec = self._validate_spec(inputs_spec)
self.labels_spec = self._validate_spec(labels_spec)
self.cluster = cluster self.cluster = cluster
self.strategy = strategy self.strategy = strategy
self._executor = None self._executor = None
self._orig_main_prog = fluid.default_main_program() self._orig_main_prog = fluid.default_main_program()
self._orig_startup_prog = fluid.default_startup_program() self._orig_startup_prog = fluid.default_startup_program()
self._orig_dist_context = get_default_distributed_context()
self._serial_main_progs = {} self._serial_main_progs = {}
self._serial_startup_progs = {} self._serial_startup_progs = {}
self._dist_main_progs = defaultdict(dict) self._dist_main_progs = defaultdict(dict) # dist main programs
self._dist_startup_progs = defaultdict(dict) self._dist_startup_progs = defaultdict(dict) # dist startup programs
self._orig_dist_context = get_default_distributed_context()
self._dist_contexts = {} self._dist_contexts = {}
self._pass_contexts = {} self._pass_contexts = {}
self._cur_rank = paddle.distributed.get_rank() self._cur_rank = paddle.distributed.get_rank()
self._logger = get_logger(logging.INFO) self._logger = get_logger(logging.INFO)
self._saver = DistributedSaver()
self._feed_vars = {}
self._fetch_vars = {}
def prepare(self, def prepare(self,
optimizer=None, optimizer=None,
loss=None, loss=None,
metrics=None, metrics=None,
mode="train", mode='train',
all_ranks=False): all_ranks=False):
self.optimizer = optimizer self._optimizer = optimizer
self.loss = loss # TODO: check loss type
self.metrics = metrics self._loss = loss
self._metrics = to_list(metrics)
for m in ['train', 'predict']:
self.mode = m
self._build(m) # build forward program
self._plan(m) # completion & planner
self._parallel(m, all_ranks) # parallel
self._initialize(m) # init comm and startup program
self.mode = mode self.mode = mode
self._build()
self._plan()
if not all_ranks:
self._parallel(self._cur_rank)
else:
world_process_group = get_world_process_group()
all_ranks = world_process_group.ranks
for rank in all_ranks:
self._parallel(rank)
self._place = _get_device()
if isinstance(self._place, fluid.CUDAPlace):
self._place = fluid.CUDAPlace(ParallelEnv().dev_id)
if self._executor is None:
self._executor = paddle.static.Executor(self._place)
def _build(self): def _build(self, mode):
serial_main_prog = self._serial_main_progs.get(self.mode, None) serial_main_prog = self._serial_main_progs.get(mode, None)
if serial_main_prog is not None: if serial_main_prog is not None:
return return
losses = []
metrics = []
serial_main_prog = self._orig_main_prog.clone() serial_main_prog = self._orig_main_prog.clone()
serial_startup_prog = self._orig_startup_prog.clone() serial_startup_prog = self._orig_startup_prog.clone()
with fluid.program_guard(serial_main_prog, serial_startup_prog): with fluid.program_guard(serial_main_prog, serial_startup_prog):
inputs_spec = self.data_spec[0] inputs_spec = self.inputs_spec
labels_spec = self.data_spec[1] labels_spec = self.labels_spec if self.labels_spec else []
inputs = [s._create_feed_layer() for s in to_list(inputs_spec)] inputs = [s._create_feed_layer() for s in inputs_spec]
labels = [s._create_feed_layer() for s in to_list(labels_spec)] labels = [s._create_feed_layer() for s in labels_spec]
self._input_vars = inputs
self._label_vars = labels
self._feed_vars = self._input_vars + self._label_vars
outputs = to_list(self.model(*inputs)) outputs = to_list(self.model(*inputs))
if self.mode != "predict" and self.loss: if mode != "predict" and self._loss:
loss = self.loss(*(outputs + labels)) losses = to_list(self._loss(*(outputs + labels)))
self._loss_var = loss
self._feed_vars[mode] = {"inputs": inputs, "labels": labels}
self._fetch_vars = {"outputs": outputs, "loss": loss}
self._serial_main_progs[self.mode] = serial_main_prog self._fetch_vars[mode] = {
self._serial_startup_progs[self.mode] = serial_startup_prog "outputs": outputs,
self._dist_contexts[self.mode] = DistributedContext( "loss": losses,
serial_main_prog, serial_startup_prog, "metrics": metrics
self._dist_main_progs[self.mode], }
self._dist_startup_progs[self.mode])
self._pass_contexts[self.mode] = PassContext() self._serial_main_progs[mode] = serial_main_prog
self._serial_startup_progs[mode] = serial_startup_prog
def _plan(self): self._dist_contexts[mode] = DistributedContext(
serial_main_prog, serial_startup_prog, self._dist_main_progs[mode],
self._dist_startup_progs[mode])
self._pass_contexts[mode] = PassContext()
def _plan(self, mode):
# Complete the distributed annotation # Complete the distributed annotation
serial_main_prog = self._serial_main_progs[self.mode] serial_main_prog = self._serial_main_progs[mode]
self._completer = Completer(self._dist_contexts[self.mode]) self._completer = Completer(self._dist_contexts[mode])
self._completer.complete_forward_annotation(serial_main_prog) self._completer.complete_forward_annotation(serial_main_prog)
# TODO: add auto planner process # TODO: add auto planner process
# parse forward sub block # parse forward sub block
self._dist_contexts[self.mode].block_state.parse_forward_blocks( self._dist_contexts[mode].block_state.parse_forward_blocks(
serial_main_prog) serial_main_prog)
def _parallel(self, rank): def _parallel(self, mode, all_ranks=False):
serial_main_program = self._serial_main_progs[self.mode] if not all_ranks:
serial_startup_program = self._serial_startup_progs[self.mode] self._parallel_program(mode, self._cur_rank)
dist_context = self._dist_contexts[self.mode] else:
if self.mode != "predict" and self.loss: world_process_group = get_world_process_group()
all_ranks = world_process_group.ranks
for rank in all_ranks:
self._parallel_program(mode, rank)
def _initialize(self, mode):
# Traverse different rank programs and traverse each op of them,
# instantiate communication by process_mapping.
all_process_groups = get_all_process_groups()
for process_group in all_process_groups:
if self._cur_rank not in process_group.ranks:
continue
process_group.instantiate()
# initialize
self._place = _get_device()
if isinstance(self._place, fluid.CUDAPlace):
self._place = fluid.CUDAPlace(ParallelEnv().dev_id)
if self._executor is None:
self._executor = paddle.static.Executor(self._place)
dist_startup_prog = self._dist_startup_progs[mode][self._cur_rank]
self._executor.run(dist_startup_prog)
def _parallel_program(self, mode, rank):
serial_main_program = self._serial_main_progs[mode]
serial_startup_program = self._serial_startup_progs[mode]
dist_context = self._dist_contexts[mode]
if mode == "train" and self._optimizer:
# Generate backward # Generate backward
serial_loss = self._loss_var serial_loss = self._fetch_vars[mode]["loss"][0]
params_grads = self._generate_backward( params_grads = self._generate_backward(
serial_main_program, serial_startup_program, serial_loss) serial_main_program, serial_startup_program, serial_loss)
# Apply pre optimization passes # Apply pre optimization passes
...@@ -172,8 +192,23 @@ class Engine: ...@@ -172,8 +192,23 @@ class Engine:
# Apply post optimization passes # Apply post optimization passes
self._apply_post_optimization(dist_main_prog, dist_startup_prog, self._apply_post_optimization(dist_main_prog, dist_startup_prog,
rank, dist_params_grads) rank, dist_params_grads)
self._dist_main_progs[self.mode][rank] = dist_main_prog else:
self._dist_startup_progs[self.mode][rank] = dist_startup_prog # Do logical partition
partitioner = Partitioner(dist_context, rank)
dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition(
serial_main_program, serial_startup_program, [])
# Do reshard process
make_data_unshard(dist_main_prog, dist_startup_prog, dist_context)
reshard(dist_main_prog, dist_startup_prog, rank, dist_context, [],
1)
# clone program for test
if mode != 'train':
dist_main_prog = dist_main_prog.clone(for_test=True)
dist_startup_prog = dist_startup_prog.clone(for_test=True)
self._dist_main_progs[mode][rank] = dist_main_prog
self._dist_startup_progs[mode][rank] = dist_startup_prog
def _generate_backward(self, main_program, startup_program, loss): def _generate_backward(self, main_program, startup_program, loss):
with program_guard(main_program, startup_program): with program_guard(main_program, startup_program):
...@@ -187,7 +222,7 @@ class Engine: ...@@ -187,7 +222,7 @@ class Engine:
def _generate_optimizer(self, main_program, startup_program, params_grads): def _generate_optimizer(self, main_program, startup_program, params_grads):
with program_guard(main_program, startup_program): with program_guard(main_program, startup_program):
optimizer_ops = copy.deepcopy(self.optimizer).apply_gradients( optimizer_ops = copy.deepcopy(self._optimizer).apply_gradients(
params_grads) params_grads)
self._completer.complete_update_annotation(main_program) self._completer.complete_update_annotation(main_program)
return optimizer_ops return optimizer_ops
...@@ -239,42 +274,87 @@ class Engine: ...@@ -239,42 +274,87 @@ class Engine:
[main_program], [startup_program], [main_program], [startup_program],
self._pass_contexts[self.mode]) self._pass_contexts[self.mode])
def fit(self, train_data, batch_size=1, epochs=1, steps_per_epoch=1000): def fit(self, train_data, batch_size=1, epochs=1, steps_per_epoch=None):
# TODO: callbacks
# TODO: evaluate after training
self.mode = 'train'
assert isinstance(train_data, Dataset) assert isinstance(train_data, Dataset)
assert steps_per_epoch is not None
train_dataloader = self._create_dataloader(train_data, batch_size, train_dataloader = self._create_dataloader(train_data, batch_size,
epochs, steps_per_epoch) epochs, steps_per_epoch)
self._init_communication()
dist_startup_prog = self._dist_startup_progs["train"][self._cur_rank] outputs = []
self._executor.run(dist_startup_prog)
for epoch in range(epochs): for epoch in range(epochs):
# train_dataloader.start()
# for step in range(steps_per_epoch):
# logs = self.train_step(None)
# self._logger.info(logs)
# train_dataloader.reset()
for step, data in enumerate(train_dataloader): for step, data in enumerate(train_dataloader):
logs = self._train_step(data) logs, loss = self._train_step(data)
outputs.append(loss)
train_logs = { train_logs = {
"train_" + name: val "train_" + name: val
for name, val in logs.items() for name, val in logs.items()
} }
self._logger.info(train_logs) self._logger.info(train_logs)
return outputs
def predict(self,
test_data,
batch_size=1,
use_program_cache=False,
return_numpy=True):
self.mode = 'predict'
# TODO: need check dataset
test_dataloader = self._create_dataloader(test_data, batch_size)
outputs = []
for step, data in enumerate(test_dataloader):
logs, outs = self._predict_step(data, use_program_cache,
return_numpy)
outputs.append(outs)
predict_logs = {
"predict_" + name: val
for name, val in logs.items()
}
self._logger.info(predict_logs)
return outputs
def _train_step(self, data): def _train_step(self, data):
logs = {} logs = {}
dist_main_prog = self._dist_main_progs["train"][self._cur_rank] dist_main_prog = self._dist_main_progs[self.mode][self._cur_rank]
if self._loss_var.name not in dist_main_prog.global_block().vars: fetch_var = self._fetch_vars[self.mode]["loss"][0]
if fetch_var.name not in dist_main_prog.global_block().vars:
loss = self._executor.run(dist_main_prog) loss = self._executor.run(dist_main_prog)
logs["loss"] = None logs["loss"] = None
else: else:
fetch_list = self._loss_var loss = self._executor.run(dist_main_prog,
loss = self._executor.run(dist_main_prog, fetch_list=fetch_list) fetch_list=to_list(fetch_var))
logs["loss"] = loss logs["loss"] = loss
return logs return logs, loss
def _predict_step(self, data, use_program_cache=False, return_numpy=True):
logs = {}
dist_main_prog = self._dist_main_progs[self.mode][self._cur_rank]
fetch_var = []
for var in self._fetch_vars[self.mode]["outputs"]:
if var.name in dist_main_prog.global_block().vars:
fetch_var.append(var)
        if not fetch_var:
outs = self._executor.run(dist_main_prog,
use_program_cache=use_program_cache)
logs["pred"] = outs
else:
outs = self._executor.run(dist_main_prog,
fetch_list=fetch_var,
use_program_cache=use_program_cache,
return_numpy=return_numpy)
logs["pred"] = outs
return logs, outs
def _create_dataloader(self, dataset, batch_size, epochs, steps_per_epoch): def _create_dataloader(self,
feed_list = self._input_vars + self._label_vars dataset,
batch_size,
epochs=1,
steps_per_epoch=None):
feed_list = self._feed_vars[self.mode]["inputs"] + self._feed_vars[
self.mode]["labels"]
dist_main_prog = self._dist_main_progs[self.mode][self._cur_rank] dist_main_prog = self._dist_main_progs[self.mode][self._cur_rank]
dist_startup_prog = self._dist_startup_progs[self.mode][self._cur_rank] dist_startup_prog = self._dist_startup_progs[self.mode][self._cur_rank]
dist_context = self._dist_contexts[self.mode] dist_context = self._dist_contexts[self.mode]
...@@ -284,8 +364,15 @@ class Engine: ...@@ -284,8 +364,15 @@ class Engine:
op_size = len(dist_main_block.ops) op_size = len(dist_main_block.ops)
places = paddle.static.cuda_places() places = paddle.static.cuda_places()
with fluid.program_guard(dist_main_prog, dist_startup_prog): with fluid.program_guard(dist_main_prog, dist_startup_prog):
inputs = self._feed_vars[self.mode]["inputs"]
dataloader = NonIterableGeneratorLoader( dataloader = NonIterableGeneratorLoader(
dataset, feed_list, places, batch_size, epochs, steps_per_epoch) dataset,
feed_list,
places,
batch_size,
epochs,
steps_per_epoch,
inputs=inputs)
new_op_size = len(dist_main_block.ops) new_op_size = len(dist_main_block.ops)
for _ in range(new_op_size - 1, op_size - 1, -1): for _ in range(new_op_size - 1, op_size - 1, -1):
op = dist_main_block.ops[new_op_size - 1] op = dist_main_block.ops[new_op_size - 1]
...@@ -312,17 +399,49 @@ class Engine: ...@@ -312,17 +399,49 @@ class Engine:
dist_main_block._sync_with_cpp() dist_main_block._sync_with_cpp()
return dataloader return dataloader
def _init_communication(self): def _validate_spec(self, specs):
# Traverse different rank programs and traverse each op of them, specs = to_list(specs)
# instantiate communication by process_mapping. if specs is not None:
all_process_groups = get_all_process_groups() for i, spec in enumerate(specs):
for process_group in all_process_groups: assert isinstance(spec, InputSpec)
if self._cur_rank not in process_group.ranks: if spec.name is None:
continue raise ValueError(
process_group.instantiate() "Requires Input[{}].name != None, but receive `None` with {}."
.format(i, spec))
return specs
def save(self, path, training=True, mode=None):
if not mode:
mode = self.mode
if training:
assert 'train' in self._serial_main_progs, "training model is not ready, please call `engine.prepare(mode='train')` first."
serial_program = self._serial_main_progs["train"]
dist_main_prog = self._dist_main_progs["train"][self._cur_rank]
dist_context = self._dist_contexts["train"]
self._saver.save(
path,
serial_program=serial_program,
dist_main_program=dist_main_prog,
dist_context=dist_context)
else:
assert mode, "Please set the 'mode' you want to save."
feed_vars = self._feed_vars[mode]['inputs']
fetch_vars = self._fetch_vars[mode]['outputs']
dist_main_prog = self._dist_main_progs[mode][self._cur_rank]
self._saver.save_inference_model(
path,
feed_vars,
fetch_vars,
self._executor,
program=dist_main_prog)
# def save(self, path, training=True): def load(self, path, strict=True, load_optimizer=True, mode=None):
# pass if not mode:
mode = self.mode
assert mode, "Please set the 'mode' you want to load."
# def load(self, path, strict=True, load_optimizer=True): dist_main_prog = self._dist_main_progs[mode][self._cur_rank]
# pass dist_context = self._dist_contexts[mode]
self._saver.load(path, dist_main_prog, dist_context, strict,
load_optimizer)
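A condensed sketch of the intended workflow, assembled from the Engine methods above; the names mlp_model, train_dataset, test_dataset, optimizer, loss_fn, inputs_spec, labels_spec and dist_strategy are assumptions supplied by the caller (the tests later in this commit build a concrete MLP version of the same flow).

# Sketch only -- all caller-provided objects above are assumptions, not part of this commit.
engine = Engine(mlp_model,
                inputs_spec=inputs_spec,
                labels_spec=labels_spec,
                strategy=dist_strategy)

# prepare() builds, completes, partitions and initializes both the 'train'
# and 'predict' programs, then fixes the active mode.
engine.prepare(optimizer=optimizer, loss=loss_fn, mode='train')

losses = engine.fit(train_dataset, batch_size=8, epochs=1, steps_per_epoch=10)

engine.save('./ckpt/model')                      # training checkpoint
engine.load('./ckpt/model')                      # restore converted parameters
engine.save('./ckpt/model_inf', training=False, mode='predict')  # inference model

outputs = engine.predict(test_dataset, batch_size=8)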
...@@ -1416,3 +1416,11 @@ def set_dist_op_desc_original_id(dist_op_desc, op_desc, dist_context): ...@@ -1416,3 +1416,11 @@ def set_dist_op_desc_original_id(dist_op_desc, op_desc, dist_context):
    # Third, print error information if we cannot find the original id     # Third, print error information if we cannot find the original id
else: else:
assert False, "Cannot find the original id in the distributed context" assert False, "Cannot find the original id in the distributed context"
def to_list(value):
if value is None:
return value
if isinstance(value, (list, tuple)):
return list(value)
return [value]
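# For illustration: to_list(None) -> None, to_list((1, 2)) -> [1, 2], to_list(3) -> [3]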
...@@ -797,6 +797,34 @@ def monkey_patch_varbase(): ...@@ -797,6 +797,34 @@ def monkey_patch_varbase():
def value(self): def value(self):
return self return self
@framework.dygraph_only
def _slice(self, begin_idx, end_idx):
return core.eager.Tensor(self.get_tensor()._slice(begin_idx, end_idx))
@framework.dygraph_only
def _numel(self):
return self.get_tensor()._numel()
@framework.dygraph_only
def cpu(self):
if self.place.is_cpu_place():
return self
else:
res = self._copy_to(core.CPUPlace(), True)
res.stop_gradient = self.stop_gradient
res.persistable = self.persistable
return res
@framework.dygraph_only
def cuda(self, device_id, blocking):
if self.place.is_gpu_place():
return self
else:
res = self._copy_to(core.CUDAPlace(device_id), True)
res.stop_gradient = self.stop_gradient
res.persistable = self.persistable
return res
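    # Illustrative usage (assumes eager mode and a CUDA build):
    #   t_gpu = some_tensor.cuda(0, True)  # returns self if already on a GPU place
    #   t_cpu = t_gpu.cpu()                # copy to CPUPlace, keeping stop_gradient/persistable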
if core._in_eager_mode() and not hasattr(core, "eager"): if core._in_eager_mode() and not hasattr(core, "eager"):
return return
...@@ -820,6 +848,10 @@ def monkey_patch_varbase(): ...@@ -820,6 +848,10 @@ def monkey_patch_varbase():
setattr(core.eager.Tensor, "_set_grad_ivar", _set_grad_ivar) setattr(core.eager.Tensor, "_set_grad_ivar", _set_grad_ivar)
setattr(core.eager.Tensor, "clone", clone) setattr(core.eager.Tensor, "clone", clone)
setattr(core.eager.Tensor, "value", value) setattr(core.eager.Tensor, "value", value)
setattr(core.eager.Tensor, "cpu", cpu)
setattr(core.eager.Tensor, "cuda", cuda)
setattr(core.eager.Tensor, "_slice", _slice)
setattr(core.eager.Tensor, "_numel", _numel)
else: else:
setattr(core.VarBase, "__name__", "Tensor") setattr(core.VarBase, "__name__", "Tensor")
setattr(core.VarBase, "grad", grad) setattr(core.VarBase, "grad", grad)
......
...@@ -108,10 +108,8 @@ def train(): ...@@ -108,10 +108,8 @@ def train():
grad_clip=None) grad_clip=None)
dataset = MyDataset(batch_num * batch_size) dataset = MyDataset(batch_num * batch_size)
data_spec = [ inputs_spec = InputSpec([batch_size, hidden_size], 'float32', 'x')
InputSpec([batch_size, hidden_size], 'float32', 'x'), labels_spec = InputSpec([batch_size], 'int64', 'label')
InputSpec([batch_size], 'int64', 'label')
]
dist_strategy = fleet.DistributedStrategy() dist_strategy = fleet.DistributedStrategy()
dist_strategy.amp = False dist_strategy.amp = False
...@@ -121,11 +119,18 @@ def train(): ...@@ -121,11 +119,18 @@ def train():
dist_strategy.semi_auto = True dist_strategy.semi_auto = True
fleet.init(is_collective=True, strategy=dist_strategy) fleet.init(is_collective=True, strategy=dist_strategy)
engine = Engine(mlp, data_spec, strategy=dist_strategy) engine = Engine(
mlp,
inputs_spec=inputs_spec,
labels_spec=labels_spec,
strategy=dist_strategy)
engine.prepare(optimizer, loss) engine.prepare(optimizer, loss)
engine.fit(dataset, engine.fit(dataset,
batch_size=batch_size, batch_size=batch_size,
steps_per_epoch=batch_num * batch_size) steps_per_epoch=batch_num * batch_size)
engine.save('./mlp')
engine.load('./mlp')
engine.save('./mlp_inf', training=False, mode='predict')
if __name__ == "__main__": if __name__ == "__main__":
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import time
import copy
import os
import numpy as np
import subprocess
import paddle
import paddle.nn as nn
import paddle.fluid as fluid
import paddle.static as static
import paddle.nn.functional as F
import paddle.utils as utils
from paddle.fluid import layers
from paddle.io import Dataset, IterableDataset, DataLoader
from paddle.static import InputSpec
from paddle.distributed import fleet
import paddle.distributed.auto_parallel as auto
from paddle.distributed.auto_parallel.engine import Engine
paddle.enable_static()
global_process_mesh = auto.ProcessMesh(mesh=[0, 1])
batch_size = 1
batch_num = 10
hidden_size = 1024
image_size = hidden_size
paddle.seed(44)
class MyDataset(Dataset):
def __init__(self, num_samples):
super(MyDataset, self).__init__()
self.num_samples = num_samples
def __getitem__(self, index):
input = np.random.uniform(size=image_size).astype("float32")
return input
def __len__(self):
return self.num_samples
class MLPLayer(nn.Layer):
def __init__(self,
hidden_size=1024,
intermediate_size=4 * 1024,
dropout_ratio=0.1,
initializer_range=0.02):
super(MLPLayer, self).__init__()
d_model = hidden_size
dim_feedforward = intermediate_size
weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
mean=0.0, std=initializer_range))
bias_attr = None
self.linear0 = nn.Linear(
d_model, dim_feedforward, weight_attr, bias_attr=bias_attr)
self.linear1 = nn.Linear(
dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=bias_attr)
self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train")
def forward(self, input):
out = self.norm(input)
out = self.linear0(input)
auto.shard_tensor(
self.linear0.weight,
dist_attr={
"process_mesh": global_process_mesh,
"dims_mapping": [-1, 0]
})
out = F.gelu(out, approximate=True)
out = self.linear1(out)
auto.shard_tensor(
self.linear1.weight,
dist_attr={
"process_mesh": global_process_mesh,
"dims_mapping": [0, -1]
})
out = self.dropout(out)
out = self.linear2(out)
return out
def train():
mlp = MLPLayer(
hidden_size=hidden_size,
intermediate_size=4 * hidden_size,
dropout_ratio=0.1,
initializer_range=0.02)
dataset = MyDataset(batch_num * batch_size)
inputs_spec = InputSpec([batch_size, hidden_size], 'float32', 'x')
dist_strategy = fleet.DistributedStrategy()
# init parallel optimizer
dist_strategy.semi_auto = True
fleet.init(is_collective=True, strategy=dist_strategy)
engine = Engine(mlp, inputs_spec=inputs_spec, strategy=dist_strategy)
engine.prepare(mode='predict')
engine.predict(dataset, batch_size=batch_size)
if __name__ == "__main__":
train()
...@@ -42,6 +42,34 @@ class TestEngineAPI(unittest.TestCase): ...@@ -42,6 +42,34 @@ class TestEngineAPI(unittest.TestCase):
log_path = os.path.join(file_dir, "log") log_path = os.path.join(file_dir, "log")
if os.path.exists(log_path): if os.path.exists(log_path):
shutil.rmtree(log_path) shutil.rmtree(log_path)
files_path = [path for path in os.listdir('.') if '.pd' in path]
for path in files_path:
if os.path.exists(path):
os.remove(path)
if os.path.exists('rank_mapping.csv'):
os.remove('rank_mapping.csv')
def test_engine_predict(self):
file_dir = os.path.dirname(os.path.abspath(__file__))
launch_model_path = os.path.join(file_dir, "engine_predict_api.py")
if os.environ.get("WITH_COVERAGE", "OFF") == "ON":
coverage_args = ["-m", "coverage", "run", "--branch", "-p"]
else:
coverage_args = []
cmd = [sys.executable, "-u"] + coverage_args + [
"-m", "launch", "--gpus", "0,1", launch_model_path
]
process = subprocess.Popen(cmd)
process.wait()
self.assertEqual(process.returncode, 0)
# Remove unnecessary files
log_path = os.path.join(file_dir, "log")
if os.path.exists(log_path):
shutil.rmtree(log_path)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -634,20 +634,39 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase): ...@@ -634,20 +634,39 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase):
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
tensor3 = tensor2._copy_to(core.CUDAPlace(0), True) tensor3 = tensor2._copy_to(core.CUDAPlace(0), True)
self.assertTrue(np.array_equal(tensor3.numpy(), arr2)) self.assertTrue(np.array_equal(tensor3.numpy(), arr2))
self.assertTrue(tensor3.persistable, True) self.assertEqual(tensor3.persistable, True)
self.assertTrue(tensor3.stop_gradient, True) self.assertEqual(tensor3.stop_gradient, True)
self.assertTrue(tensor3.place.is_gpu_place()) self.assertTrue(tensor3.place.is_gpu_place())
tensor4 = paddle.to_tensor([1, 2, 3], place='gpu_pinned')
tensor5 = tensor4._copy_to(core.CUDAPlace(0), True) tensor4 = tensor2.cuda(0, True)
self.assertTrue(np.array_equal(tensor4.numpy(), arr2))
self.assertEqual(tensor4.persistable, True)
self.assertEqual(tensor4.stop_gradient, False)
self.assertTrue(tensor4.place.is_gpu_place())
tensor5 = tensor4.cpu()
self.assertTrue(np.array_equal(tensor5.numpy(), arr2))
self.assertEqual(tensor5.persistable, True)
self.assertEqual(tensor5.stop_gradient, False)
self.assertTrue(tensor5.place.is_cpu_place())
tensor10 = paddle.to_tensor([1, 2, 3], place='gpu_pinned')
tensor11 = tensor10._copy_to(core.CUDAPlace(0), True)
self.assertTrue( self.assertTrue(
np.array_equal(tensor4.numpy(), tensor5.numpy())) np.array_equal(tensor10.numpy(), tensor11.numpy()))
else: else:
tensor3 = tensor2._copy_to(core.CPUPlace(), True) tensor3 = tensor2._copy_to(core.CPUPlace(), True)
self.assertTrue(np.array_equal(tensor3.numpy(), arr2)) self.assertTrue(np.array_equal(tensor3.numpy(), arr2))
self.assertTrue(tensor3.persistable, True) self.assertEqual(tensor3.persistable, True)
self.assertTrue(tensor3.stop_gradient, True) self.assertEqual(tensor3.stop_gradient, True)
self.assertTrue(tensor3.place.is_cpu_place()) self.assertTrue(tensor3.place.is_cpu_place())
tensor4 = tensor2.cpu()
self.assertTrue(np.array_equal(tensor4.numpy(), arr2))
self.assertEqual(tensor4.persistable, True)
self.assertEqual(tensor4.stop_gradient, False)
self.assertTrue(tensor4.place.is_cpu_place())
def test_share_buffer_to(self): def test_share_buffer_to(self):
with _test_eager_guard(): with _test_eager_guard():
arr = np.ones([4, 16, 16, 32]).astype('float32') arr = np.ones([4, 16, 16, 32]).astype('float32')
...@@ -784,6 +803,34 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase): ...@@ -784,6 +803,34 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase):
self.assertEqual(egr_tensor.shape, [4, 16, 16, 32]) self.assertEqual(egr_tensor.shape, [4, 16, 16, 32])
self.assertTrue(np.array_equal(egr_tensor.numpy(), new_arr)) self.assertTrue(np.array_equal(egr_tensor.numpy(), new_arr))
def test_sharding_related_api(self):
with _test_eager_guard():
arr0 = np.random.rand(4, 16, 16, 32).astype('float32')
egr_tensor1 = core.eager.Tensor(arr0,
core.CPUPlace(), True, False,
"numpy_tensor1", False)
self.assertEqual(egr_tensor1._numel(), 32768)
self.assertEqual(egr_tensor1._slice(0, 2)._numel(), 16384)
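            # 4 * 16 * 16 * 32 = 32768 elements in total; keeping the first two
            # entries of dim 0 leaves 2 * 16 * 16 * 32 = 16384 elements.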
def test_copy_gradient_from(self):
with _test_eager_guard():
np_x = np.random.random((2, 2))
np_y = np.random.random((2, 2))
x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False)
y = paddle.to_tensor(np_y, dtype="float64")
out = x + x
out.backward()
x._copy_gradient_from(y)
self.assertTrue(np.array_equal(x.grad.numpy(), np_y))
def test_clear(self):
with _test_eager_guard():
np_x = np.random.random((3, 8, 8))
x = paddle.to_tensor(np_x, dtype="float64")
self.assertTrue(x._is_initialized())
x._clear()
self.assertFalse(x._is_initialized())
class EagerParamBaseUsageTestCase(unittest.TestCase): class EagerParamBaseUsageTestCase(unittest.TestCase):
def test_print(self): def test_print(self):
......
...@@ -170,6 +170,180 @@ class TestDygraphInplace(unittest.TestCase): ...@@ -170,6 +170,180 @@ class TestDygraphInplace(unittest.TestCase):
grad_var_a = var_a.grad.numpy() grad_var_a = var_a.grad.numpy()
self.assertTrue(np.array_equal(grad_var_a_inplace, grad_var_a)) self.assertTrue(np.array_equal(grad_var_a_inplace, grad_var_a))
# inplace + hook
def test_backward_success_3(self):
        # var_b is modified in place before it is used; check that the inplace
        # operator does not lead to incorrect gradient computation.
def double_hook(grad):
grad = grad * 2
return grad
grad_var_a, grad_var_a_inplace = 0, 1
with paddle.fluid.dygraph.guard():
with _test_eager_guard():
var_a = paddle.to_tensor(self.input_var_numpy).astype(
self.dtype)
var_a.stop_gradient = False
helper = var_a.register_hook(double_hook)
var_b = var_a**2
var_c = self.inplace_api_processing(
var_b) # var_b is modified inplace before using it
# Here, the gradient computation will use the value of var_b
var_d = var_c**2
loss = var_d.sum()
loss.backward()
grad_var_a_inplace = var_a.grad.numpy()
with paddle.fluid.dygraph.guard():
with _test_eager_guard():
var_a = paddle.to_tensor(self.input_var_numpy).astype(
self.dtype)
var_a.stop_gradient = False
helper = var_a.register_hook(double_hook)
var_b = var_a**2
var_c = self.non_inplace_api_processing(var_b)
var_d = var_c**2
loss = var_d.sum()
loss.backward()
grad_var_a = var_a.grad.numpy()
self.assertTrue(self.np_compare(grad_var_a_inplace, grad_var_a))
# inplace + hook
def test_backward_success_4(self):
        # Although var_b is modified in place after it is used, it is not used in gradient computation.
# The inplace operator doesn't result in incorrect gradient computation.
def double_hook(grad):
grad = grad * 2
return grad
grad_var_a, grad_var_a_inplace = 0, 1
with paddle.fluid.dygraph.guard():
with _test_eager_guard():
var_a = paddle.to_tensor(self.input_var_numpy).astype(
self.dtype)
var_a.stop_gradient = False
var_a.register_hook(double_hook)
var_b = var_a**2
var_c = self.inplace_api_processing(
var_b) # var_b is modified inplace before using it
var_d = var_c + var_c # Here, the grad op of sum doesn't use the value of var_b
loss = var_d.sum()
loss.backward()
grad_var_a_inplace = var_a.grad.numpy()
with paddle.fluid.dygraph.guard():
with _test_eager_guard():
var_a = paddle.to_tensor(self.input_var_numpy).astype(
self.dtype)
var_a.stop_gradient = False
var_a.register_hook(double_hook)
var_b = var_a**2
var_c = self.non_inplace_api_processing(
                    var_b)  # var_b is not modified in place in this branch
var_d = var_c + var_c # Here, the grad op of sum doesn't use the value of var_b
loss = var_d.sum()
loss.backward()
grad_var_a = var_a.grad.numpy()
self.assertTrue(np.array_equal(grad_var_a_inplace, grad_var_a))
# inplace + hook
def test_backward_success_5(self):
        # var_b is modified in place before it is used; check that the inplace
        # operator does not lead to incorrect gradient computation.
def double_hook(grad):
grad = grad * 2
return grad
grad_var_a, grad_var_a_inplace = 0, 1
with paddle.fluid.dygraph.guard():
with _test_eager_guard():
var_a = paddle.to_tensor(self.input_var_numpy).astype(
self.dtype)
var_a.stop_gradient = False
var_b = var_a**2
var_b.register_hook(double_hook)
var_c = self.inplace_api_processing(
var_b) # var_b is modified inplace before using it
# Here, the gradient computation will use the value of var_b
var_d = var_c**2
loss = var_d.sum()
loss.backward()
grad_var_a_inplace = var_a.grad.numpy()
with paddle.fluid.dygraph.guard():
with _test_eager_guard():
var_a = paddle.to_tensor(self.input_var_numpy).astype(
self.dtype)
var_a.stop_gradient = False
var_b = var_a**2
var_b.register_hook(double_hook)
var_c = self.non_inplace_api_processing(var_b)
var_d = var_c**2
loss = var_d.sum()
loss.backward()
grad_var_a = var_a.grad.numpy()
self.assertTrue(self.np_compare(grad_var_a_inplace, grad_var_a))
# inplace + hook
def test_backward_success_6(self):
        # Although var_b is modified in place before it is used, it is not used in gradient computation.
# The inplace operator doesn't result in incorrect gradient computation.
def double_hook(grad):
grad = grad * 2
return grad
grad_var_a, grad_var_a_inplace = 0, 1
with paddle.fluid.dygraph.guard():
with _test_eager_guard():
var_a = paddle.to_tensor(self.input_var_numpy).astype(
self.dtype)
var_a.stop_gradient = False
var_b = var_a**2
var_b.register_hook(double_hook)
var_c = self.inplace_api_processing(
var_b) # var_b is modified inplace before using it
var_d = var_c + var_c # Here, the grad op of sum doesn't use the value of var_b
loss = var_d.sum()
loss.backward()
grad_var_a_inplace = var_a.grad.numpy()
with paddle.fluid.dygraph.guard():
with _test_eager_guard():
var_a = paddle.to_tensor(self.input_var_numpy).astype(
self.dtype)
var_a.stop_gradient = False
var_b = var_a**2
var_b.register_hook(double_hook)
var_c = self.non_inplace_api_processing(
                    var_b)  # var_b is not modified in place in this branch
var_d = var_c + var_c # Here, the grad op of sum doesn't use the value of var_b
loss = var_d.sum()
loss.backward()
grad_var_a = var_a.grad.numpy()
self.assertTrue(np.array_equal(grad_var_a_inplace, grad_var_a))
class TestDygraphInplaceUnsqueeze(TestDygraphInplace): class TestDygraphInplaceUnsqueeze(TestDygraphInplace):
def non_inplace_api_processing(self, var): def non_inplace_api_processing(self, var):
......