Commit ef905598, authored by wanghaox, committed by wanghaox

fix some code issues

 #include <paddle/capi.h>
 #include <time.h>
 #include "../common/common.h"

 #define CONFIG_BIN "./trainer_config.bin"

@@ -27,20 +28,19 @@ int main() {
   CHECK(paddle_arguments_resize(in_args, 1));

   // Create input matrix.
-  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 10,
+  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1,
                                            /* size */ 784,
                                            /* useGPU */ false);
   srand(time(0));
-  std::vector<paddle_real> input;
-  input.resize(784 * 10);
-  for (int i = 0; i < input.size(); ++i) {
-    input[i] = rand() / ((float)RAND_MAX);
+
+  paddle_real* array;
+
+  // Get First row.
+  CHECK(paddle_matrix_get_row(mat, 0, &array));
+
+  for (int i = 0; i < 784; ++i) {
+    array[i] = rand() / ((float)RAND_MAX);
   }
-  // Set value for the input matrix
-  CHECK(paddle_matrix_set_value(mat, input.data()));

   CHECK(paddle_arguments_set_value(in_args, 0, mat));

@@ -53,17 +53,18 @@ int main() {
   CHECK(paddle_arguments_get_value(out_args, 0, prob));

-  std::std::vector<paddle_real> result;
-  int height;
-  int width;
-  CHECK(paddle_matrix_get_shape(prob, &height, &width);
-  result.resize(height * width);
-  CHECK(paddle_matrix_get_value(prob, result.data()));
+  uint64_t height;
+  uint64_t width;
+  CHECK(paddle_matrix_get_shape(prob, &height, &width));
+  CHECK(paddle_matrix_get_row(prob, 0, &array));

-  printf("Prob: ");
+  printf("Prob: \n");
   for (int i = 0; i < height * width; ++i) {
-    printf("%.2f ", result[i]);
+    printf("%.4f ", array[i]);
+    if ((i + 1) % width == 0) {
+      printf("\n");
+    }
   }
   printf("\n");
...
@@ -17,24 +17,47 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-class RoiPoolOp : public framework::OperatorWithKernel {
+class ROIPoolOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;

   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of RoiPoolOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Rois"),
-                   "Input(Rois) of RoiPoolOp should not be null.");
+                   "Input(X) of ROIPoolOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("ROIs"),
+                   "Input(ROIs) of ROIPoolOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of RoiPoolOp should not be null.");
+                   "Output(Out) of ROIPoolOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Argmax"),
-                   "Output(Argmax) of RoiPoolOp should not be null.");
+                   "Output(Argmax) of ROIPoolOp should not be null.");
     auto input_dims = ctx->GetInputDim("X");
+    auto rois_dims = ctx->GetInputDim("ROIs");

-    // Initialize the output's dims to maximum,
-    // and re-set to real dims by the value of Rois at kernel
-    ctx->SetOutputDim("Out", input_dims);
+    PADDLE_ENFORCE(input_dims.size() == 4,
+                   "The format of input tensor is NCHW.");
+    PADDLE_ENFORCE(rois_dims.size() == 2,
+                   "ROIs should be a 2-D tensor of shape (num_rois, 5)"
+                   "given as [[batch_id, x1, y1, x2, y2], …].");
+
+    int pooled_height = ctx->Attrs().Get<int>("pooled_height");
+    int pooled_width = ctx->Attrs().Get<int>("pooled_width");
+    float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
+
+    PADDLE_ENFORCE_GT(pooled_height, 0,
+                      "The pooled output height must greater than 0");
+    PADDLE_ENFORCE_GT(pooled_width, 0,
+                      "The pooled output width must greater than 0");
+    PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
+                      "The spatial scale must greater than 0");
+
+    auto out_dims = input_dims;
+    out_dims[0] = rois_dims[0];
+    out_dims[1] = input_dims[1];
+    out_dims[2] = pooled_height;
+    out_dims[3] = pooled_width;
+
+    ctx->SetOutputDim("Out", out_dims);
+    ctx->SetOutputDim("Argmax", out_dims);
   }

  protected:
@@ -46,7 +69,7 @@ class RoiPoolOp : public framework::OperatorWithKernel {
   }
 };
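To make the new InferShape logic above concrete, here is a minimal Python sketch with made-up shapes (the numbers are ours, not from the commit): a 4-D NCHW feature map plus a (num_rois, 5) ROI tensor yields Out and Argmax of shape (num_rois, channels, pooled_height, pooled_width).

# Hypothetical shapes, used only to illustrate the shape rule in InferShape above.
input_dims = (2, 256, 14, 14)        # NCHW feature map
rois_dims = (5, 5)                   # five ROIs, each [batch_id, x1, y1, x2, y2]
pooled_height, pooled_width = 7, 7

out_dims = (rois_dims[0], input_dims[1], pooled_height, pooled_width)
print(out_dims)                      # (5, 256, 7, 7); Argmax has the same shape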
-class RoiPoolGradOp : public framework::OperatorWithKernel {
+class ROIPoolGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;

@@ -67,44 +90,51 @@ class RoiPoolGradOp : public framework::OperatorWithKernel {
   }
 };

-class RoiPoolOpMaker : public framework::OpProtoAndCheckerMaker {
+class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  RoiPoolOpMaker(framework::OpProto* proto,
-                 framework::OpAttrChecker* op_checker)
+  ROIPoolOpMaker(framework::OpProto* proto,
+                 framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "(Tensor), "
-             "the input of RoiPoolOp.");
-    AddInput("Rois",
+             "the input of ROIPoolOp. "
+             "The format of input tensor is NCHW. Where N is batch size, "
+             "C is the number of input channels, "
+             "H is the height of the feature, and "
+             "W is the width of the feature.");
+    AddInput("ROIs",
              "(Tensor), "
-             "RoIs (Regions of Interest) to pool over. "
-             "Should be a 2-D tensor of shape (num_rois, 5)"
-             "given as [[batch_id, x1, y1, x2, y2], …].");
+             "ROIs (Regions of Interest) to pool over. "
+             "should be a 2-D tensor of shape (num_rois, 5)"
+             "given as [[batch_id, x1, y1, x2, y2], …]. "
+             "Where batch_id is the id of the data, "
+             "(x1, y1) is the top left coordinates, and "
+             "(x2, y2) is the bottom right coordinates.");
     AddOutput("Out",
               "(Tensor), "
-              "RoI pooled output 4-D tensor of shape "
+              "The output of ROIPoolOp is a 4-D tensor with shape "
               "(num_rois, channels, pooled_h, pooled_w).");
     AddOutput("Argmax",
               "(Tensor), "
               "Argmaxes corresponding to indices in X used "
               "for gradient computation. Only output "
               "if arg “is_test” is false.").AsIntermediate();
     AddAttr<float>("spatial_scale",
                    "(float, default 1.0), "
                    "Multiplicative spatial scale factor "
                    "to translate ROI coords from their input scale "
                    "to the scale used when pooling.")
         .SetDefault(1.0);
     AddAttr<int>("pooled_height",
                  "(int, default 1), "
                  "The pooled output height.")
         .SetDefault(1);
     AddAttr<int>("pooled_width",
                  "(int, default 1), "
                  "The pooled output width.")
         .SetDefault(1);
     AddComment(R"DOC(
-RoiPool operator
+ROIPool operator

 ROI Pooling for Faster-RCNN. The link below is a further introduction:
 https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn
@@ -116,11 +146,11 @@ https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP(roi_pool, ops::RoiPoolOp, ops::RoiPoolOpMaker,
-            roi_pool_grad, ops::RoiPoolGradOp);
+REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker,
+            roi_pool_grad, ops::ROIPoolGradOp);
 REGISTER_OP_CPU_KERNEL(
     roi_pool,
-    ops::CPURoiPoolOpKernel<paddle::platform::CPUPlace, float>);
+    ops::CPUROIPoolOpKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
     roi_pool_grad,
-    ops::CPURoiPoolGradOpKernel<paddle::platform::CPUPlace, float>);
+    ops::CPUROIPoolGradOpKernel<paddle::platform::CPUPlace, float>);
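As a readable cross-check on the operator documented by ROIPoolOpMaker, the following NumPy sketch mirrors the per-ROI, per-bin max pooling that the kernels below perform. The function name and the choice to return only the pooled values (omitting Argmax) are our own simplifications, not code from this commit.

import numpy as np

def roi_pool_ref(x, rois, pooled_h, pooled_w, spatial_scale):
    """Reference ROI max pooling: x is NCHW, rois is (num_rois, 5) int64."""
    num_rois, channels = rois.shape[0], x.shape[1]
    height, width = x.shape[2], x.shape[3]
    out = np.zeros((num_rois, channels, pooled_h, pooled_w), dtype=x.dtype)
    for n in range(num_rois):
        batch_id, x1, y1, x2, y2 = rois[n]
        # Scale ROI coordinates into the feature map, as the kernels do.
        x1, y1, x2, y2 = [int(round(v * spatial_scale)) for v in (x1, y1, x2, y2)]
        roi_h = max(y2 - y1 + 1, 1)
        roi_w = max(x2 - x1 + 1, 1)
        bin_h, bin_w = roi_h / float(pooled_h), roi_w / float(pooled_w)
        for ph in range(pooled_h):
            for pw in range(pooled_w):
                hstart = min(max(int(np.floor(ph * bin_h)) + y1, 0), height)
                hend = min(max(int(np.ceil((ph + 1) * bin_h)) + y1, 0), height)
                wstart = min(max(int(np.floor(pw * bin_w)) + x1, 0), width)
                wend = min(max(int(np.ceil((pw + 1) * bin_w)) + x1, 0), width)
                if hend > hstart and wend > wstart:  # empty bins stay zero
                    region = x[int(batch_id), :, hstart:hend, wstart:wend]
                    out[n, :, ph, pw] = region.max(axis=(1, 2))
    return out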
@@ -12,91 +12,80 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/platform/cuda_helper.h"
 #include "paddle/operators/roi_pool_op.h"
+#include "paddle/platform/cuda_helper.h"

 namespace paddle {
 namespace operators {

-#define FLT_MAX __FLT_MAX__
-
-constexpr int PADDLE_OPERATORS_ROIPOOL_CUDA_NUM_THREADS = 512;
-constexpr int PADDLE_OPERATORS_ROIPOOL_MAXIMUM_NUM_BLOCKS = 4096;
-
-inline int PADDLE_OPERATORS_ROIPOOL_GET_BLOCKS(const int N) {
-  return std::min((N + PADDLE_OPERATORS_ROIPOOL_CUDA_NUM_THREADS - 1)
-                      / PADDLE_OPERATORS_ROIPOOL_CUDA_NUM_THREADS,
-                  PADDLE_OPERATORS_ROIPOOL_MAXIMUM_NUM_BLOCKS);
-}
+static constexpr int kNumCUDAThreads = 512;
+static constexpr int kNumMaxinumNumBlocks = 4096;
+static constexpr int kROISize = 5;
+
+static inline int NumBlocks(const int N) {
+  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
+                  kNumMaxinumNumBlocks);
+}

 template <typename T>
-__global__ void GPURoiPoolForward(
-    const int nthreads,
-    const T* input_data,
-    const int64_t* input_rois,
-    const float spatial_scale,
-    const int channels,
-    const int height,
-    const int width,
-    const int pooled_height,
-    const int pooled_width,
-    T* output_data,
-    int64_t* argmax_data) {
+__global__ void GPUROIPoolForward(
+    const int nthreads, const T* input_data, const int64_t* input_rois,
+    const float spatial_scale, const int channels, const int height,
+    const int width, const int pooled_height, const int pooled_width,
+    T* output_data, int64_t* argmax_data) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int offset = blockDim.x * gridDim.x;
   for (size_t i = index; i < nthreads; i += offset) {
     int pw = index % pooled_width;
     int ph = (index / pooled_width) % pooled_height;
     int c = (index / pooled_width / pooled_height) % channels;
     int n = index / pooled_width / pooled_height / channels;

-    const int64_t* offset_input_rois = input_rois + n * 5;
+    const int64_t* offset_input_rois = input_rois + n * kROISize;
     int roi_batch_ind = offset_input_rois[0];
     int roi_start_w = round(offset_input_rois[1] * spatial_scale);
     int roi_start_h = round(offset_input_rois[2] * spatial_scale);
     int roi_end_w = round(offset_input_rois[3] * spatial_scale);
     int roi_end_h = round(offset_input_rois[4] * spatial_scale);

     int roi_width = max(roi_end_w - roi_start_w + 1, 1);
     int roi_height = max(roi_end_h - roi_start_h + 1, 1);
-    T bin_size_h = static_cast<T>(roi_height)
-                   / static_cast<T>(pooled_height);
-    T bin_size_w = static_cast<T>(roi_width)
-                   / static_cast<T>(pooled_width);
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

     int hstart = static_cast<int>(floor(static_cast<T>(ph) * bin_size_h));
     int wstart = static_cast<int>(floor(static_cast<T>(pw) * bin_size_w));
     int hend = static_cast<int>(ceil(static_cast<T>(ph + 1) * bin_size_h));
     int wend = static_cast<int>(ceil(static_cast<T>(pw + 1) * bin_size_w));

     hstart = min(max(hstart + roi_start_h, 0), height);
     hend = min(max(hend + roi_start_h, 0), height);
     wstart = min(max(wstart + roi_start_w, 0), width);
     wend = min(max(wend + roi_start_w, 0), width);
     bool is_empty = (hend <= hstart) || (wend <= wstart);

-    T maxval = is_empty ? 0 : -FLT_MAX;
+    T maxval = is_empty ? 0 : -std::numeric_limits<float>::max();
     int maxidx = -1;
     const T* offset_input_data =
         input_data + (roi_batch_ind * channels + c) * height * width;
     for (int h = hstart; h < hend; ++h) {
       for (int w = wstart; w < wend; ++w) {
         int input_data_index = h * width + w;
         if (offset_input_data[input_data_index] > maxval) {
           maxval = offset_input_data[input_data_index];
           maxidx = input_data_index;
         }
       }
     }
     output_data[index] = maxval;
     if (argmax_data) {
       argmax_data[index] = maxidx;
     }
   }
 }
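The forward kernel above walks a flat index with a grid-stride loop and decodes it into (n, c, ph, pw) by repeated modulo and division. A small Python check (illustrative sizes only) shows this is exactly C-order unraveling:

import numpy as np

num_rois, channels, pooled_h, pooled_w = 3, 4, 2, 2
shape = (num_rois, channels, pooled_h, pooled_w)
for index in range(num_rois * channels * pooled_h * pooled_w):
    pw = index % pooled_w
    ph = (index // pooled_w) % pooled_h
    c = (index // pooled_w // pooled_h) % channels
    n = index // pooled_w // pooled_h // channels
    assert (n, c, ph, pw) == tuple(int(v) for v in np.unravel_index(index, shape))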
 template <typename T>
-__global__ void GPURoiPoolBackward(
+__global__ void GPUROIPoolBackward(
     const int nthreads,
     const int64_t* input_rois,
     const T* output_grad,
@@ -117,7 +106,7 @@ __global__ void GPURoiPoolBackward(
     int c = (index / pooled_width / pooled_height) % channels;
     int n = index / pooled_width / pooled_height / channels;

-    const int64_t* offset_input_rois = input_rois + n * 5;
+    const int64_t* offset_input_rois = input_rois + n * kROISize;
     int roi_batch_ind = offset_input_rois[0];
     int input_offset = (roi_batch_ind * channels + c) * height * width;
     int output_offset = (n * channels + c) * pooled_height * pooled_width;
@@ -135,11 +124,11 @@ __global__ void GPURoiPoolBackward(

 template <typename Place, typename T>
-class GPURoiPoolOpKernel : public framework::OpKernel<T> {
+class GPUROIPoolOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* in = ctx.Input<Tensor>("X");
-    auto* rois = ctx.Input<Tensor>("Rois");
+    auto* rois = ctx.Input<Tensor>("ROIs");
     auto* out = ctx.Output<Tensor>("Out");
     auto* argmax = ctx.Output<Tensor>("Argmax");
@@ -147,31 +136,17 @@ class GPURoiPoolOpKernel : public framework::OpKernel<T> {
     auto pooled_width = ctx.Attr<int>("pooled_width");
     auto spatial_scale = ctx.Attr<float>("spatial_scale");

-    PADDLE_ENFORCE_GT(pooled_height, 0,
-                      "The pooled output height must greater than 0");
-    PADDLE_ENFORCE_GT(pooled_width, 0,
-                      "The pooled output width must greater than 0");
-    PADDLE_ENFORCE_GT(spatial_scale, 0,
-                      "The spatial scale must greater than 0");
-
     auto in_dims = in->dims();
     auto in_stride = framework::stride(in_dims);
     int channels = in_dims[1];
     int height = in_dims[2];
     int width = in_dims[3];

-    int rois_num = rois->dims()[0];
-    auto out_dims = in_dims;
-    out_dims[0] = rois_num;
-    out_dims[1] = in_dims[1];
-    out_dims[2] = pooled_height;
-    out_dims[3] = pooled_width;
-    out->Resize(out_dims);
+    size_t rois_num = rois->dims()[0];

     out->mutable_data<T>(ctx.GetPlace());
     math::SetConstant<Place, T> set_zero;
     set_zero(ctx.device_context(), out, static_cast<T>(0));
-    argmax->Resize(out->dims());
     argmax->mutable_data<int64_t>(ctx.GetPlace());
     math::SetConstant<Place, int64_t> set_init;
     set_init(ctx.device_context(), argmax, static_cast<int64_t>(-1));
@@ -179,10 +154,10 @@ class GPURoiPoolOpKernel : public framework::OpKernel<T> {
     if (rois_num== 0) return;

     int output_size = out->numel();
-    int blocks = PADDLE_OPERATORS_ROIPOOL_GET_BLOCKS(output_size);
-    int threads = PADDLE_OPERATORS_ROIPOOL_CUDA_NUM_THREADS;
+    int blocks = NumBlocks(output_size);
+    int threads = kNumCUDAThreads;

-    GPURoiPoolForward<T>
+    GPUROIPoolForward<T>
         <<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
             output_size,
             in->data<T>(),
@@ -195,17 +170,15 @@ class GPURoiPoolOpKernel : public framework::OpKernel<T> {
             pooled_width,
             out->mutable_data<T>(ctx.GetPlace()),
             argmax->mutable_data<int64_t>(ctx.GetPlace()));
-
-    return;
   }
 };

 template <typename Place, typename T>
-class GPURoiPoolGradOpKernel : public framework::OpKernel<T> {
+class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* in = ctx.Input<Tensor>("X");
-    auto* rois = ctx.Input<Tensor>("Rois");
+    auto* rois = ctx.Input<Tensor>("ROIs");
     auto* argmax = ctx.Input<Tensor>("Argmax");
     auto* out_grad =
@@ -217,23 +190,22 @@ class GPURoiPoolGradOpKernel : public framework::OpKernel<T> {
     auto pooled_width = ctx.Attr<int>("pooled_width");
     auto spatial_scale = ctx.Attr<float>("spatial_scale");

-    int rois_num = rois->dims()[0];
+    size_t rois_num = rois->dims()[0];
     int channels = in->dims()[1];
     int height = in->dims()[2];
     int width = in->dims()[3];

     if (x_grad) {
-      x_grad->Resize(in->dims());
       x_grad->mutable_data<T>(ctx.GetPlace());
       math::SetConstant<Place, T> set_zero;
       set_zero(ctx.device_context(), x_grad, static_cast<T>(0));

       int output_grad_size = out_grad->numel();
-      int blocks = PADDLE_OPERATORS_ROIPOOL_GET_BLOCKS(output_grad_size);
-      int threads = PADDLE_OPERATORS_ROIPOOL_CUDA_NUM_THREADS;
+      int blocks = NumBlocks(output_grad_size);
+      int threads = kNumCUDAThreads;

       if (output_grad_size > 0) {
-        GPURoiPoolBackward<T>
+        GPUROIPoolBackward<T>
             <<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
                 output_grad_size,
                 rois->data<int64_t>(),
@@ -248,7 +220,6 @@ class GPURoiPoolGradOpKernel : public framework::OpKernel<T> {
                 pooled_width,
                 x_grad->mutable_data<T>(ctx.GetPlace()));
       }
-
-      return;
     }
   }
 };
@@ -259,7 +230,7 @@ class GPURoiPoolGradOpKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
     roi_pool,
-    ops::GPURoiPoolOpKernel<paddle::platform::GPUPlace, float>);
+    ops::GPUROIPoolOpKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(
     roi_pool_grad,
-    ops::GPURoiPoolGradOpKernel<paddle::platform::GPUPlace, float>);
+    ops::GPUROIPoolGradOpKernel<paddle::platform::GPUPlace, float>);
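The kernel launches above size the grid with NumBlocks, a capped ceiling division; the grid-stride loop inside the kernels covers whatever the cap cuts off. A quick Python sketch of the arithmetic (numbers are illustrative only):

def num_blocks(n, threads=512, max_blocks=4096):
    # Mirrors NumBlocks in the CUDA file: ceil(n / threads), capped at max_blocks.
    return min((n + threads - 1) // threads, max_blocks)

print(num_blocks(100000))      # 196 blocks of 512 threads
print(num_blocks(100000000))   # capped at 4096; the grid-stride loop handles the rest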
@@ -25,11 +25,11 @@ using LoDTensor = framework::LoDTensor;
 using LoD = framework::LoD;

 template <typename Place, typename T>
-class CPURoiPoolOpKernel : public framework::OpKernel<T> {
+class CPUROIPoolOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* in = ctx.Input<Tensor>("X");
-    auto* rois = ctx.Input<Tensor>("Rois");
+    auto* rois = ctx.Input<Tensor>("ROIs");
     auto* out = ctx.Output<Tensor>("Out");
     auto* argmax = ctx.Output<Tensor>("Argmax");
@@ -37,13 +37,6 @@ class CPURoiPoolOpKernel : public framework::OpKernel<T> {
     auto pooled_width = ctx.Attr<int>("pooled_width");
     auto spatial_scale = ctx.Attr<float>("spatial_scale");

-    PADDLE_ENFORCE_GT(pooled_height, 0,
-                      "The pooled output height must greater than 0");
-    PADDLE_ENFORCE_GT(pooled_width, 0,
-                      "The pooled output width must greater than 0");
-    PADDLE_ENFORCE_GT(spatial_scale, 0,
-                      "The spatial scale must greater than 0");
-
     auto in_dims = in->dims();
     int batch_size = in_dims[0];
     int channels = in_dims[1];
@@ -51,18 +44,10 @@ class CPURoiPoolOpKernel : public framework::OpKernel<T> {
     int width = in_dims[3];
     int rois_num = rois->dims()[0];

-    auto out_dims = in_dims;
-    out_dims[0] = rois_num;
-    out_dims[1] = channels;
-    out_dims[2] = pooled_height;
-    out_dims[3] = pooled_width;
-    out->Resize(out_dims);
-    argmax->Resize(out->dims());
-
     auto in_stride = framework::stride(in_dims);
     auto argmax_stride = framework::stride(argmax->dims());
     auto roi_stride = framework::stride(rois->dims());
-    auto out_stride = framework::stride(out_dims);
+    auto out_stride = framework::stride(out->dims());

     const T* input_data = in->data<T>();
     const int64_t* rois_data = rois->data<int64_t>();
@@ -124,7 +109,8 @@ class CPURoiPoolOpKernel : public framework::OpKernel<T> {
           // Define an empty pooling region to be zero
           bool is_empty = (hend <= hstart) || (wend <= wstart);
-          output_data[pool_index] = is_empty ? 0 : -__FLT_MAX__;
+          output_data[pool_index] =
+              is_empty ? 0 : -std::numeric_limits<float>::max();

           for (int h = hstart; h < hend; ++h) {
             for (int w = wstart; w < wend; ++w) {
@@ -150,11 +136,11 @@ class CPURoiPoolOpKernel : public framework::OpKernel<T> {
 };

 template <typename Place, typename T>
-class CPURoiPoolGradOpKernel : public framework::OpKernel<T> {
+class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* in = ctx.Input<Tensor>("X");
-    auto* rois = ctx.Input<Tensor>("Rois");
+    auto* rois = ctx.Input<Tensor>("ROIs");
     auto* argmax = ctx.Input<Tensor>("Argmax");
     auto* out_grad =
@@ -188,9 +174,9 @@ class CPURoiPoolGradOpKernel : public framework::OpKernel<T> {
     for (size_t n = 0; n < rois_num; ++n) {
       size_t roi_batch_idx = rois_data[0];
       T* batch_grad_data = x_grad_data + batch_offset * roi_batch_idx;
-      for (size_t c = 0; c < channels; ++c) {
-        for (size_t ph = 0; ph < pooled_height; ++ph) {
-          for (size_t pw = 0; pw < pooled_width; ++pw) {
+      for (int c = 0; c < channels; ++c) {
+        for (int ph = 0; ph < pooled_height; ++ph) {
+          for (int pw = 0; pw < pooled_width; ++pw) {
             size_t pool_index = ph * pooled_width + pw;
             if (argmax_data[pool_index] >= 0) {
...
@@ -4,8 +4,7 @@ import math
 import sys
 from op_test import OpTest

-class TestSequenceSliceOp(OpTest):
+class TestROIPoolOp(OpTest):
     def set_data(self):
         self.init_test_case()
         self.make_rois()
@@ -13,7 +12,7 @@ class TestSequenceSliceOp(OpTest):
         self.inputs = {
             'X': self.x,
-            'Rois': self.rois}
+            'ROIs': self.rois}

         self.attrs = {
             'spatial_scale': self.spatial_scale,
...
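The test above feeds ROIs in the same [[batch_id, x1, y1, x2, y2], ...] layout the operator expects. The commit does not show make_rois, so the helper below is only a hedged guess at how such ROIs could be generated; the name, arguments, and box-size constraints are our assumptions, not the project's code.

import numpy as np

def make_random_rois(batch_size, height, width, rois_per_batch,
                     pooled_h, pooled_w, spatial_scale):
    # Hypothetical helper: random integer boxes in the original image scale,
    # each large enough to cover the pooled grid once scaled by spatial_scale.
    max_x = int(width / spatial_scale)
    max_y = int(height / spatial_scale)
    rois = []
    for batch_id in range(batch_size):
        for _ in range(rois_per_batch):
            x1 = np.random.randint(0, max_x - pooled_w)
            y1 = np.random.randint(0, max_y - pooled_h)
            x2 = np.random.randint(x1 + pooled_w, max_x)
            y2 = np.random.randint(y1 + pooled_h, max_y)
            rois.append([batch_id, x1, y1, x2, y2])
    return np.asarray(rois, dtype='int64')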