提交 11bcb43a 编写于 作者: T typhoonzero

fix merge issue

...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
|---|---| |---|---|
| backyes | Yan-Fei Wang | | backyes | Yan-Fei Wang |
| beckett1124 | Bin Qi | | beckett1124 | Bin Qi |
| Canpio | Jia-Yi Feng | | JiayiFeng | Jia-Yi Feng |
| chengxiaohua1105 | Xiao-Hua Cheng | | chengxiaohua1105 | Xiao-Hua Cheng |
| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang | | cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
| cxysteven | Xing-Yi Cheng | | cxysteven | Xing-Yi Cheng |
......
...@@ -82,7 +82,7 @@ language = 'zh_CN' ...@@ -82,7 +82,7 @@ language = 'zh_CN'
# List of patterns, relative to source directory, that match files and # List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files. # directories to ignore when looking for source files.
exclude_patterns = ['_build', '**/*_en*', '*_en*'] exclude_patterns = ['_build', '**/*_en*', '*_en*', 'api/*']
# The reST default role (used for this markup: `text`) to use for all # The reST default role (used for this markup: `text`) to use for all
# documents. # documents.
......
...@@ -82,7 +82,7 @@ language = None ...@@ -82,7 +82,7 @@ language = None
# List of patterns, relative to source directory, that match files and # List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files. # directories to ignore when looking for source files.
exclude_patterns = ['_build', '**/*_cn*', '*_cn*'] exclude_patterns = ['_build', '**/*_cn*', '*_cn*', 'api/*']
# The reST default role (used for this markup: `text`) to use for all # The reST default role (used for this markup: `text`) to use for all
# documents. # documents.
......
...@@ -11,7 +11,6 @@ if(MOBILE_INFERENCE) ...@@ -11,7 +11,6 @@ if(MOBILE_INFERENCE)
else() else()
add_subdirectory(pserver) add_subdirectory(pserver)
add_subdirectory(trainer) add_subdirectory(trainer)
add_subdirectory(string)
add_subdirectory(scripts) add_subdirectory(scripts)
if(WITH_C_API) if(WITH_C_API)
......
...@@ -4,3 +4,4 @@ add_subdirectory(framework) ...@@ -4,3 +4,4 @@ add_subdirectory(framework)
add_subdirectory(operators) add_subdirectory(operators)
add_subdirectory(pybind) add_subdirectory(pybind)
add_subdirectory(inference) add_subdirectory(inference)
add_subdirectory(string)
...@@ -314,5 +314,15 @@ DDim stride(const DDim& ddim) { ...@@ -314,5 +314,15 @@ DDim stride(const DDim& ddim) {
} }
return framework::make_ddim(strides); return framework::make_ddim(strides);
} }
// Builds the "strided numel" of `ddim`: entry d holds the product of the
// extents of dimensions [d, rank), i.e. the number of elements spanned from
// dimension d to the innermost one. E.g. dims [4, 20, 100] give
// [8000, 2000, 100].
DDim stride_numel(const framework::DDim& ddim) {
  const int rank = ddim.size();
  std::vector<int64_t> numel_from(rank);
  // The innermost entry is simply the extent of the last dimension.
  numel_from[rank - 1] = ddim[rank - 1];
  // Walk outwards, multiplying in one more dimension at each step.
  for (int d = rank - 1; d > 0; --d) {
    numel_from[d - 1] = numel_from[d] * ddim[d - 1];
  }
  return framework::make_ddim(numel_from);
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -125,6 +125,8 @@ DDim flatten_to_2d(const DDim& src, int num_col_dims); ...@@ -125,6 +125,8 @@ DDim flatten_to_2d(const DDim& src, int num_col_dims);
DDim flatten_to_1d(const DDim& src); DDim flatten_to_1d(const DDim& src);
DDim stride(const DDim& ddim); DDim stride(const DDim& ddim);
DDim stride_numel(const DDim& ddim);
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
......
...@@ -20,7 +20,7 @@ limitations under the License. */ ...@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/string/piece.h" #include "paddle/fluid/string/piece.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
......
...@@ -37,9 +37,8 @@ class Vector { ...@@ -37,9 +37,8 @@ class Vector {
// Fill vector with value. The vector size is `count`. // Fill vector with value. The vector size is `count`.
explicit Vector(size_t count, const T& value = T()) { explicit Vector(size_t count, const T& value = T()) {
if (count == 0) { InitEmpty();
InitEmpty(); if (count != 0) {
} else {
resize(count); resize(count);
T* ptr = begin(); T* ptr = begin();
for (size_t i = 0; i < count; ++i) { for (size_t i = 0; i < count; ++i) {
...@@ -122,6 +121,10 @@ class Vector { ...@@ -122,6 +121,10 @@ class Vector {
const T* begin() const { return &this->operator[](0); } const T* begin() const { return &this->operator[](0); }
const T* end() const { return &this->operator[](size()); } const T* end() const { return &this->operator[](size()); }
// Read-only pointer to the first element; forwards to the const begin().
const T* cbegin() const { return this->begin(); }
// Read-only one-past-the-end pointer; forwards to the const end().
const T* cend() const { return this->end(); }
const T& back() const { const T& back() const {
auto it = end(); auto it = end();
--it; --it;
...@@ -244,7 +247,9 @@ class Vector { ...@@ -244,7 +247,9 @@ class Vector {
bool operator==(const Vector<T>& other) const { bool operator==(const Vector<T>& other) const {
if (size() != other.size()) return false; if (size() != other.size()) return false;
for (auto it1 = begin(), it2 = other.begin(); it1 < end(); ++it1, ++it2) { auto it1 = cbegin();
auto it2 = other.cbegin();
for (; it1 < cend(); ++it1, ++it2) {
if (*it1 != *it2) { if (*it1 != *it2) {
return false; return false;
} }
......
...@@ -26,10 +26,10 @@ TEST(mixed_vector, CPU_VECTOR) { ...@@ -26,10 +26,10 @@ TEST(mixed_vector, CPU_VECTOR) {
for (int i = 0; i < 10; ++i) { for (int i = 0; i < 10; ++i) {
tmp.push_back(i); tmp.push_back(i);
} }
ASSERT_EQ(tmp.size(), 10); ASSERT_EQ(tmp.size(), 10UL);
vec<int> tmp2; vec<int> tmp2;
tmp2 = tmp; tmp2 = tmp;
ASSERT_EQ(tmp2.size(), 10); ASSERT_EQ(tmp2.size(), 10UL);
for (int i = 0; i < 10; ++i) { for (int i = 0; i < 10; ++i) {
ASSERT_EQ(tmp2[i], i); ASSERT_EQ(tmp2[i], i);
ASSERT_EQ(tmp2[i], tmp[i]); ASSERT_EQ(tmp2[i], tmp[i]);
...@@ -58,7 +58,7 @@ TEST(mixed_vector, GPU_VECTOR) { ...@@ -58,7 +58,7 @@ TEST(mixed_vector, GPU_VECTOR) {
for (int i = 0; i < 10; ++i) { for (int i = 0; i < 10; ++i) {
tmp.push_back(i); tmp.push_back(i);
} }
ASSERT_EQ(tmp.size(), 10); ASSERT_EQ(tmp.size(), 10UL);
paddle::platform::CUDAPlace gpu(0); paddle::platform::CUDAPlace gpu(0);
multiply_10<<<1, 1, 0, GetCUDAStream(gpu)>>>(tmp.MutableData(gpu)); multiply_10<<<1, 1, 0, GetCUDAStream(gpu)>>>(tmp.MutableData(gpu));
...@@ -79,7 +79,7 @@ TEST(mixed_vector, MultiGPU) { ...@@ -79,7 +79,7 @@ TEST(mixed_vector, MultiGPU) {
for (int i = 0; i < 10; ++i) { for (int i = 0; i < 10; ++i) {
tmp.push_back(i); tmp.push_back(i);
} }
ASSERT_EQ(tmp.size(), 10); ASSERT_EQ(tmp.size(), 10UL);
paddle::platform::CUDAPlace gpu0(0); paddle::platform::CUDAPlace gpu0(0);
paddle::platform::SetDeviceId(0); paddle::platform::SetDeviceId(0);
multiply_10<<<1, 1, 0, GetCUDAStream(gpu0)>>>(tmp.MutableData(gpu0)); multiply_10<<<1, 1, 0, GetCUDAStream(gpu0)>>>(tmp.MutableData(gpu0));
...@@ -91,3 +91,10 @@ TEST(mixed_vector, MultiGPU) { ...@@ -91,3 +91,10 @@ TEST(mixed_vector, MultiGPU) {
ASSERT_EQ(tmp[i], i * 100); ASSERT_EQ(tmp[i], i * 100);
} }
} }
// Constructs a Vector through the (count, value) constructor and verifies
// that each of the first 10 elements holds the fill value 10.
TEST(mixed_vector, InitWithCount) {
  paddle::framework::Vector<int> filled(10, 10);
  for (int idx = 0; idx != 10; ++idx) {
    ASSERT_EQ(filled[idx], 10);
  }
}
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
#include <mutex> // for call_once #include <mutex> // for call_once
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "paddle/string/printf.h" #include "paddle/fluid/string/printf.h"
DEFINE_bool(benchmark, false, DEFINE_bool(benchmark, false,
"Doing memory benchmark. It will make deleting scope synchronized, " "Doing memory benchmark. It will make deleting scope synchronized, "
......
...@@ -28,17 +28,18 @@ class ConcatKernel : public framework::OpKernel<T> { ...@@ -28,17 +28,18 @@ class ConcatKernel : public framework::OpKernel<T> {
auto ins = ctx.MultiInput<framework::Tensor>("X"); auto ins = ctx.MultiInput<framework::Tensor>("X");
auto* out = ctx.Output<framework::Tensor>("Out"); auto* out = ctx.Output<framework::Tensor>("Out");
int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis")); int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
const size_t n = ins.size(); auto place = ctx.GetPlace();
out->mutable_data<T>(place);
auto out_stride = framework::stride_numel(out->dims());
size_t output_offset = 0; size_t output_offset = 0;
out->mutable_data<T>(ctx.GetPlace()); for (auto* in : ins) {
auto out_stride = framework::stride(out->dims()); auto in_stride = framework::stride_numel(in->dims());
for (size_t i = 0; i < n; i++) { StridedNumelCopyWithAxis<T>(ctx.device_context(), axis,
auto& in = ins[i]; out->data<T>() + output_offset, out_stride,
auto axis_dim = in->dims()[axis]; in->data<T>(), in_stride);
auto in_stride = framework::stride(in->dims()); output_offset += in_stride[axis];
StridedMemcpy<T>(ctx.device_context(), in->data<T>(), in_stride,
in->dims(), out_stride, out->data<T>() + output_offset);
output_offset += axis_dim * in_stride[axis];
} }
} }
}; };
...@@ -50,17 +51,16 @@ class ConcatGradKernel : public framework::OpKernel<T> { ...@@ -50,17 +51,16 @@ class ConcatGradKernel : public framework::OpKernel<T> {
auto* in = ctx.Input<framework::Tensor>(framework::GradVarName("Out")); auto* in = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto outs = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X")); auto outs = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis")); int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
const size_t n = outs.size();
size_t input_offset = 0; size_t input_offset = 0;
auto in_stride = framework::stride(in->dims()); auto in_stride = framework::stride_numel(in->dims());
for (size_t i = 0; i < n; i++) {
auto& out = outs[i]; for (auto& out : outs) {
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
size_t axis_dim = out->dims()[axis]; auto out_stride = framework::stride_numel(out->dims());
auto out_stride = framework::stride(out->dims()); StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(),
StridedMemcpy<T>(ctx.device_context(), in->data<T>() + input_offset, out_stride, in->data<T>() + input_offset,
in_stride, out->dims(), out_stride, out->data<T>()); in_stride);
input_offset += axis_dim * in_stride[axis]; input_offset += out_stride[axis];
} }
} }
}; };
......
...@@ -27,7 +27,7 @@ limitations under the License. */ ...@@ -27,7 +27,7 @@ limitations under the License. */
#include "paddle/fluid/operators/detail/grpc_server.h" #include "paddle/fluid/operators/detail/grpc_server.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h" #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include "paddle/fluid/operators/detail/simple_block_queue.h" #include "paddle/fluid/operators/detail/simple_block_queue.h"
#include "paddle/string/printf.h" #include "paddle/fluid/string/printf.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -98,6 +98,7 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -98,6 +98,7 @@ class ListenAndServOp : public framework::OperatorBase {
// the gradients arrives, just add suffix 0~n and merge the gradient. // the gradients arrives, just add suffix 0~n and merge the gradient.
rpc_service_->SetCond(0); rpc_service_->SetCond(0);
size_t recv_var_cnt = 0; size_t recv_var_cnt = 0;
size_t update_param_cnt = 0;
int batch_barrier = 0; int batch_barrier = 0;
while (batch_barrier != fan_in) { while (batch_barrier != fan_in) {
const detail::MessageWithName &v = rpc_service_->Get(); const detail::MessageWithName &v = rpc_service_->Get();
...@@ -122,11 +123,10 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -122,11 +123,10 @@ class ListenAndServOp : public framework::OperatorBase {
} }
} }
VLOG(3) << "recv " << recv_var_cnt << " parmeters for one barrier."; VLOG(3) << "recv " << recv_var_cnt << " parmeters for one barrier.";
// TODO(Yancey1989): merge SelectedRows variables here
if (exit_flag) { if (exit_flag) {
rpc_service_->ShutDown(); rpc_service_->ShutDown();
} }
VLOG(3) << "run optimize graph...";
try { try {
executor.Run(*program, &recv_scope, block->ID(), /*global_block*/ executor.Run(*program, &recv_scope, block->ID(), /*global_block*/
false /*create_local_scope*/, false /*create_vars*/); false /*create_local_scope*/, false /*create_vars*/);
...@@ -134,7 +134,7 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -134,7 +134,7 @@ class ListenAndServOp : public framework::OperatorBase {
LOG(ERROR) << "run sub program error " << e.what(); LOG(ERROR) << "run sub program error " << e.what();
} }
rpc_service_->SetCond(1); rpc_service_->SetCond(1);
rpc_service_->WaitClientGet(recv_var_cnt); rpc_service_->WaitClientGet(update_param_cnt);
grads_counter_.clear(); grads_counter_.clear();
} // while(true) } // while(true)
} }
......
...@@ -22,7 +22,7 @@ limitations under the License. */ ...@@ -22,7 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/string/printf.h" #include "paddle/fluid/string/printf.h"
USE_NO_KERNEL_OP(send); USE_NO_KERNEL_OP(send);
USE_NO_KERNEL_OP(listen_and_serv); USE_NO_KERNEL_OP(listen_and_serv);
......
...@@ -29,7 +29,9 @@ class SequenceExpandOp : public framework::OperatorWithKernel { ...@@ -29,7 +29,9 @@ class SequenceExpandOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasOutput("Out")); PADDLE_ENFORCE(ctx->HasOutput("Out"));
PADDLE_ENFORCE(ctx->HasInput("Y")); PADDLE_ENFORCE(ctx->HasInput("Y"));
framework::DDim out_dim; framework::DDim out_dim;
out_dim = ctx->GetInputDim("Y"); auto y_dim = ctx->GetInputDim("Y");
out_dim = ctx->GetInputDim("X");
out_dim[0] = y_dim[0];
ctx->ShareLoD("Y", "Out"); ctx->ShareLoD("Y", "Out");
ctx->SetOutputDim("Out", out_dim); ctx->SetOutputDim("Out", out_dim);
} }
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <chrono>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/operators/strided_memcpy.h"
...@@ -27,18 +28,18 @@ class SplitOpKernel : public framework::OpKernel<T> { ...@@ -27,18 +28,18 @@ class SplitOpKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X"); auto* in = ctx.Input<framework::Tensor>("X");
auto outs = ctx.MultiOutput<framework::Tensor>("Out"); auto outs = ctx.MultiOutput<framework::Tensor>("Out");
auto in_stride = framework::stride(in->dims()); auto in_stride = framework::stride_numel(in->dims());
int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis")); int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
const size_t n = outs.size(); auto place = ctx.GetPlace();
size_t input_offset = 0; size_t input_offset = 0;
for (size_t i = 0; i < n; i++) { for (auto& out : outs) {
auto& out = outs[i];
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
size_t axis_dim = out->dims()[axis]; auto out_stride = framework::stride_numel(out->dims());
auto out_stride = framework::stride(out->dims()); StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(),
StridedMemcpy<T>(ctx.device_context(), in->data<T>() + input_offset, out_stride, in->data<T>() + input_offset,
in_stride, out->dims(), out_stride, out->data<T>()); in_stride);
input_offset += axis_dim * in_stride[axis]; input_offset += out_stride[axis];
} }
} }
}; };
......
...@@ -41,5 +41,62 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src, ...@@ -41,5 +41,62 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src,
StridedCopyDimVisitor<T> func(dev_ctx, src, src_stride, dst_stride, dst); StridedCopyDimVisitor<T> func(dev_ctx, src, src_stride, dst_stride, dst);
boost::apply_visitor(func, dst_dim); boost::apply_visitor(func, dst_dim);
} }
// Strided numel memory copy from src to dst by the specified axis.
//
// A "strided numel" DDim holds, at position i, the number of elements spanned
// by dimensions [i, rank). For example, for a tensor with dims [4, 20, 100]
// the strided numel is [8000, 2000, 100].
//
// The copy is issued as `dst_stride_numel[0] / dst_stride_numel[axis]`
// contiguous chunks. Chunk i is read from `src + i * src_stride_numel[axis]`
// and written to `dst + i * dst_stride_numel[axis]`. Each chunk holds the
// smaller of the two axis strides, so neither buffer is overrun whether src
// is the smaller tensor (concat) or the larger one (split / concat grad).
//
// Params:
//   ctx              device context; its place selects a CPU or CUDA copy.
//   axis             the dimension along which src and dst may differ.
//   dst, src         destination / source element pointers (already offset
//                    by the caller to the first element to write / read).
//   dst_stride_numel strided numel of the destination tensor.
//   src_stride_numel strided numel of the source tensor.
//
// NOTE: The src and dst tensor should have the same elements
// except the specified axis.
template <typename T>
inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
                                     int64_t axis, T* dst,
                                     const framework::DDim& dst_stride_numel,
                                     const T* src,
                                     const framework::DDim& src_stride_numel) {
  // Validate the ranks first: everything below indexes both DDims.
  PADDLE_ENFORCE_EQ(src_stride_numel.size(), dst_stride_numel.size(),
                    "src and dst tensor should have the same dims size.");

  int64_t before = dst_stride_numel[0] / dst_stride_numel[axis];
  int64_t src_after = src_stride_numel[axis];
  int64_t dst_after = dst_stride_numel[axis];
  // Elements per chunk: never more than either side's row can hold.
  int64_t copy_numel = src_after < dst_after ? src_after : dst_after;
  auto place = ctx.GetPlace();

  // Check every dimension, not only those before `axis`, so that the
  // trailing dimensions are verified as well.
  const int64_t rank = dst_stride_numel.size();
  for (int64_t i = 0; i < rank; ++i) {
    if (i < axis) {
      PADDLE_ENFORCE_EQ(src_stride_numel[i] / src_stride_numel[axis],
                        dst_stride_numel[i] / dst_stride_numel[axis],
                        "src and dst should have the same elements "
                        "except the specified axis.");
    } else if (i == axis) {
      // The tensors are allowed to differ along the axis itself.
      continue;
    } else {
      PADDLE_ENFORCE_EQ(src_stride_numel[i], dst_stride_numel[i],
                        "src and dst should have the same elements "
                        "except the specified axis.");
    }
  }

  const bool on_cpu = platform::is_cpu_place(place);
  for (int64_t i = 0; i < before; ++i) {
    if (on_cpu) {
      auto& cpu_place = boost::get<platform::CPUPlace>(place);
      memory::Copy(cpu_place, dst + i * dst_after, cpu_place,
                   src + i * src_after, sizeof(T) * copy_numel);
    } else {
#ifdef PADDLE_WITH_CUDA
      auto& gpu_place = boost::get<platform::CUDAPlace>(place);
      auto& cuda_ctx =
          reinterpret_cast<const platform::CUDADeviceContext&>(ctx);
      memory::Copy(gpu_place, dst + i * dst_after, gpu_place,
                   src + i * src_after, sizeof(T) * copy_numel,
                   cuda_ctx.stream());
#else
      PADDLE_THROW("Paddle is not compiled with GPU");
#endif
    }
  }
}
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -22,69 +22,43 @@ class TargetAssignOp : public framework::OperatorWithKernel { ...@@ -22,69 +22,43 @@ class TargetAssignOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
// checkout inputs PADDLE_ENFORCE(ctx->HasInput("X"),
PADDLE_ENFORCE(ctx->HasInput("EncodedGTBBox"), "Input(X) of TargetAssignOp should not be null");
"Input(EncodedGTBBox) of TargetAssignOp should not be null");
PADDLE_ENFORCE(ctx->HasInput("GTScoreLabel"),
"Input(GTScoreLabel) of TargetAssignOp should not be null");
PADDLE_ENFORCE(ctx->HasInput("MatchIndices"), PADDLE_ENFORCE(ctx->HasInput("MatchIndices"),
"Input(MatchIndices) of TargetAssignOp should not be null"); "Input(MatchIndices) of TargetAssignOp should not be null");
PADDLE_ENFORCE(ctx->HasInput("NegIndices"),
"Input(NegIndices) of TargetAssignOp should not be null"); PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of TargetAssignOp should not be null.");
// checkout outputs PADDLE_ENFORCE(ctx->HasOutput("OutWeight"),
PADDLE_ENFORCE( "Output(OutWeight) of TargetAssignOp should not be null.");
ctx->HasOutput("PredBBoxLabel"),
"Output(PredBBoxLabel) of TargetAssignOp should not be null."); auto in_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE(
ctx->HasOutput("PredBBoxWeight"),
"Output(PredBBoxWeight) of TargetAssignOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("PredScoreLabel"),
"Output(PredScoreLabel) of TargetAssignOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("PredScoreWeight"),
"Output(PredScoreWeight) of TargetAssignOp should not be null.");
auto blabel_dims = ctx->GetInputDim("EncodedGTBBox");
auto slabel_dims = ctx->GetInputDim("GTScoreLabel");
auto mi_dims = ctx->GetInputDim("MatchIndices"); auto mi_dims = ctx->GetInputDim("MatchIndices");
auto neg_dims = ctx->GetInputDim("NegIndices");
PADDLE_ENFORCE_EQ(blabel_dims.size(), 3UL, PADDLE_ENFORCE_EQ(in_dims.size(), 3, "The rank of Input(X) must be 3.");
"The rank of Input(EncodedGTBBox) must be 3."); PADDLE_ENFORCE_EQ(mi_dims.size(), 2,
PADDLE_ENFORCE_EQ(slabel_dims.size(), 2UL,
"The rank of Input(GTScoreLabel) must be 2.");
PADDLE_ENFORCE_EQ(mi_dims.size(), 2UL,
"The rank of Input(MatchIndices) must be 2."); "The rank of Input(MatchIndices) must be 2.");
PADDLE_ENFORCE_EQ(neg_dims.size(), 2UL,
"The rank of Input(NegIndices) must be 2."); if (ctx->HasInput("NegIndices")) {
auto neg_dims = ctx->GetInputDim("NegIndices");
PADDLE_ENFORCE_EQ(blabel_dims[0], slabel_dims[0], PADDLE_ENFORCE_EQ(neg_dims.size(), 2,
"The 1st dimension (means the total number of " "The rank of Input(NegIndices) must be 2.");
"ground-truth bounding boxes) of Input(EncodedGTBBox) " PADDLE_ENFORCE_EQ(neg_dims[1], 1,
"and Input(GTScoreLabel) must be the same."); "The last dimenstion of Out(NegIndices) must be 1.");
PADDLE_ENFORCE_EQ(blabel_dims[1], mi_dims[1], }
"The 2nd dimension (means the number of priod boxes) "
"of Input(EncodedGTBBox) and "
"Input(MatchIndices) must be the same.");
PADDLE_ENFORCE_EQ(blabel_dims[2], 4,
"The 3rd dimension of Input(EncodedGTBBox) must be 4.");
auto n = mi_dims[0]; auto n = mi_dims[0];
auto np = mi_dims[1]; auto m = mi_dims[1];
ctx->SetOutputDim("PredBBoxLabel", {n, np, 4}); auto k = in_dims[in_dims.size() - 1];
ctx->SetOutputDim("PredBBoxWeight", {n, np, 1}); ctx->SetOutputDim("Out", {n, m, k});
ctx->SetOutputDim("PredScoreLabel", {n, np, 1}); ctx->SetOutputDim("OutWeight", {n, m, 1});
ctx->SetOutputDim("PredScoreWeight", {n, np, 1});
} }
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType( framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
ctx.Input<framework::LoDTensor>("EncodedGTBBox")->type()),
ctx.device_context()); ctx.device_context());
} }
}; };
...@@ -93,102 +67,87 @@ class TargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -93,102 +67,87 @@ class TargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
TargetAssignOpMaker(OpProto* proto, OpAttrChecker* op_checker) TargetAssignOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("EncodedGTBBox", AddInput("X",
"(LoDTensor), The encoded ground-truth bounding boxes with shape " "(LoDTensor), This input is a 3D LoDTensor with shape [M, P, K]. "
"[Ng, Np, 4], where Ng is the total number of ground-truth boxes " "Some elements in X will be assigned to Out based on the "
"in this mini-batch, Np the number of predictions, 4 is the " "MatchIndices and NegIndices.");
"number of coordinate in [xmin, ymin, xmax, ymax] layout.");
AddInput("GTScoreLabel",
"(LoDTensor, default LoDTensor<int>), The input ground-truth "
"labels with shape [Ng, 1], where the Ng is the same as it in "
"the input of EncodedGTBBox.");
AddInput("MatchIndices", AddInput("MatchIndices",
"(Tensor, default Tensor<int>), The input matched indices " "(Tensor, default Tensor<int>), The input matched indices "
"with shape [N, Np], where N is the batch size, Np is the same " "with shape [N, P], If MatchIndices[i][j] is -1, the j-th entity "
"as it in the input of EncodedGTBBox. If MatchIndices[i][j] " "of column is not matched to any entity of row in i-th instance.");
"is -1, the j-th prior box is not matched to any ground-truh "
"box in i-th instance.");
AddInput("NegIndices", AddInput("NegIndices",
"(LoDTensor, default LoDTensor<int>), The input negative example " "(LoDTensor, default LoDTensor<int>), The input negative example "
"indices with shape [Neg, 1], where is the total number of " "indices are an optional input with shape [Neg, 1], where Neg is "
"negative example indices."); "the total number of negative example indices.")
AddAttr<int>("background_label", .AsDispensable();
"(int, default 0), Label index of background class.") AddAttr<int>("mismatch_value",
"(int, default 0), Fill this value to the "
"mismatched location.")
.SetDefault(0); .SetDefault(0);
AddOutput("PredBBoxLabel", AddOutput("Out",
"(Tensor), The output encoded ground-truth labels " "(Tensor), The output is a 3D Tensor with shape [N, P, K], "
"with shape [N, Np, 4], N is the batch size and Np, 4 is the " "N and P is the same as they are in NegIndices, K is the "
"same as they in input of EncodedGTBBox. If MatchIndices[i][j] " "same as it in input of X. If MatchIndices[i][j] "
"is -1, the PredBBoxLabel[i][j][:] is the encoded ground-truth " "is -1, the Out[i][j][0 : K] is the mismatch_value.");
"box for background_label in i-th instance."); AddOutput("OutWeight",
AddOutput("PredBBoxWeight", "(Tensor), The weight for output with the shape of [N, P, 1]");
"(Tensor), The weight for PredBBoxLabel with the shape "
"of [N, Np, 1]");
AddOutput("PredScoreLabel",
"(Tensor, default Tensor<int>), The output score labels for "
"each predictions with shape [N, Np, 1]. If MatchIndices[i][j] "
"is -1, PredScoreLabel[i][j] = background_label.");
AddOutput("PredScoreWeight",
"(Tensor), The weight for PredScoreLabel with the shape "
"of [N, Np, 1]");
AddComment(R"DOC( AddComment(R"DOC(
This operator is, for given the encoded boxes between prior boxes and This operator can be, for given the target bounding boxes or labels,
ground-truth boxes and ground-truth class labels, to assign classification to assign classification and regression targets to each prediction as well as
and regression targets to each prior box as well as weights to each weights to prediction. The weights is used to specify which prediction would
prior box. The weights is used to specify which prior box would not contribute not contribute to training loss.
to training loss.
For each instance, the output `Out` and`OutWeight` are assigned based on
For each instance, the output `PredBBoxLabel`, `PredBBoxWeight`, `MatchIndices` and `NegIndices`.
`PredScoreLabel` and `PredScoreWeight` are assigned based on `MatchIndices`. Assumed that the row offset for each instance in `X` is called lod,
Assumed that the row offset for each instance in `EncodedGTBBox` is called lod, this operator assigns classification/regression targets by performing the
this operato assigns classification/regression targets by performing the
following steps: following steps:
1. Assigning all outpts based on `MatchIndices`: 1. Assigning all outpts based on `MatchIndices`:
If id = MatchIndices[i][j] > 0, If id = MatchIndices[i][j] > 0,
PredBBoxLabel[i][j] = EncodedGTBBox[lod[i] + id][j] Out[i][j][0 : K] = X[lod[i] + id][j % P][0 : K]
PredBBoxWeight[i][j] = 1. OutWeight[i][j] = 1.
PredScoreLabel[i][j] = GTScoreLabel[lod[i] + id]
PredScoreWeight[i][j] = 1.
Otherwise, Otherwise,
PredBBoxLabel[j][j] = [0., 0., 0., 0.] Out[j][j][0 : K] = {mismatch_value, mismatch_value, ...}
PredBBoxWeight[i][j] = 0. OutWeight[i][j] = 0.
PredScoreLabel[i][j] = background_label
PredScoreWeight[i][j] = 0.
2. Assigning PredScoreWeight based on `NegIndices`: 2. Assigning OutWeight based on `NegIndices` if `NegIndices` is provided:
Assumed that the row offset for each instance in `NegIndices` is caleed neg_lod, Assumed that the row offset for each instance in `NegIndices` is called neg_lod,
for i-th instance and all ids of NegIndices in this instance: for i-th instance and each `id` of NegIndices in this instance:
PredScoreLabel[i][id] = background_label Out[i][id][0 : K] = {mismatch_value, mismatch_value, ...}
PredScoreWeight[i][id] = 1.0 OutWeight[i][id] = 1.0
)DOC"); )DOC");
} }
}; };
template <typename T> template <typename T, typename WT>
struct NegTargetAssignFunctor<platform::CPUDeviceContext, T> { struct NegTargetAssignFunctor<platform::CPUDeviceContext, T, WT> {
void operator()(const platform::CPUDeviceContext& ctx, const int* neg_indices, void operator()(const platform::CPUDeviceContext& ctx, const int* neg_indices,
const size_t* lod, const int num, const int num_prior_box, const size_t* lod, const int N, const int M, const int K,
const int background_label, int* out_label, T* out_label_wt) { const int mismatch_value, T* out, WT* out_wt) {
for (int i = 0; i < num; ++i) { for (int i = 0; i < N; ++i) {
for (size_t j = lod[i]; j < lod[i + 1]; ++j) { for (size_t j = lod[i]; j < lod[i + 1]; ++j) {
int id = neg_indices[j]; int id = neg_indices[j];
out_label[i * num_prior_box + id] = background_label; int off = (i * M + id) * K;
out_label_wt[i * num_prior_box + id] = static_cast<T>(1.0); for (int k = 0; k < K; ++k) {
out[off + k] = mismatch_value;
out_wt[off + k] = static_cast<WT>(1.0);
}
} }
} }
} }
}; };
template struct NegTargetAssignFunctor<platform::CPUDeviceContext, float>; template struct NegTargetAssignFunctor<platform::CPUDeviceContext, int, float>;
template struct NegTargetAssignFunctor<platform::CPUDeviceContext, double>; template struct NegTargetAssignFunctor<platform::CPUDeviceContext, float,
float>;
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -198,5 +157,5 @@ REGISTER_OP_WITHOUT_GRADIENT(target_assign, ops::TargetAssignOp, ...@@ -198,5 +157,5 @@ REGISTER_OP_WITHOUT_GRADIENT(target_assign, ops::TargetAssignOp,
ops::TargetAssignOpMaker); ops::TargetAssignOpMaker);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
target_assign, target_assign,
ops::TargetAssignKernel<paddle::platform::CPUDeviceContext, float>, ops::TargetAssignKernel<paddle::platform::CPUDeviceContext, int, float>,
ops::TargetAssignKernel<paddle::platform::CPUDeviceContext, double>); ops::TargetAssignKernel<paddle::platform::CPUDeviceContext, float, float>);
...@@ -17,39 +17,41 @@ limitations under the License. */ ...@@ -17,39 +17,41 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename T> template <typename T, typename WT>
__global__ void NegTargetAssignKernel(const int* neg_indices, const size_t* lod, __global__ void NegTargetAssignKernel(const int* neg_indices, const size_t* lod,
const int num, const int num_prior_box, const int N, const int M, const int K,
const int background_label, const int mismatch_value, T* out,
int* out_label, T* out_label_wt) { WT* out_wt) {
int bidx = blockIdx.x; int bidx = blockIdx.x;
int st = lod[bidx]; int st = lod[bidx];
int ed = lod[bidx + 1]; int ed = lod[bidx + 1];
int row_start = bidx * num_prior_box; int row_start = bidx * M;
for (int i = st + threadIdx.x; i < ed; i += blockDim.x) { for (int i = st + threadIdx.x; i < ed; i += blockDim.x) {
int id = row_start + neg_indices[i]; int id = row_start + neg_indices[i];
out_label[id] = background_label; for (int k = 0; k < K; ++k) {
out_label_wt[id] = 1.; out[id * K + k] = T(mismatch_value);
out_wt[id * K + k] = WT(1.);
}
} }
} }
template <typename T> template <typename T, typename WT>
struct NegTargetAssignFunctor<platform::CUDADeviceContext, T> { struct NegTargetAssignFunctor<platform::CUDADeviceContext, T, WT> {
void operator()(const platform::CUDADeviceContext& ctx, void operator()(const platform::CUDADeviceContext& ctx,
const int* neg_indices, const size_t* lod, const int num, const int* neg_indices, const size_t* lod, const int N,
const int num_prior_box, const int background_label, const int M, const int K, const int mismatch_value, T* out,
int* out_label, T* out_label_wt) { WT* out_wt) {
const int block_size = 256; const int block_size = 256;
const int grid_size = num; const int grid_size = N;
NegTargetAssignKernel<T><<<grid_size, block_size, 0, ctx.stream()>>>( NegTargetAssignKernel<T, WT><<<grid_size, block_size, 0, ctx.stream()>>>(
neg_indices, lod, num, num_prior_box, background_label, out_label, neg_indices, lod, N, M, K, mismatch_value, out, out_wt);
out_label_wt);
} }
}; };
template struct NegTargetAssignFunctor<platform::CUDADeviceContext, float>; template struct NegTargetAssignFunctor<platform::CUDADeviceContext, int, float>;
template struct NegTargetAssignFunctor<platform::CUDADeviceContext, double>; template struct NegTargetAssignFunctor<platform::CUDADeviceContext, float,
float>;
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -57,5 +59,5 @@ template struct NegTargetAssignFunctor<platform::CUDADeviceContext, double>; ...@@ -57,5 +59,5 @@ template struct NegTargetAssignFunctor<platform::CUDADeviceContext, double>;
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
target_assign, target_assign,
ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, float>, ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, int, float>,
ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, double>); ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, float, float>);
...@@ -19,140 +19,113 @@ limitations under the License. */ ...@@ -19,140 +19,113 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename T, typename WT>
template <typename T>
struct TargetAssignFunctor { struct TargetAssignFunctor {
const T* gt_box_; const T* in_;
const int* gt_label_;
const int* match_indices_; const int* match_indices_;
const size_t* lod_; const size_t* lod_;
const int background_label_; const int mismatch_value_;
const int64_t num_; const int64_t N_;
const int64_t num_prior_box_; const int64_t M_;
const int64_t P_;
T* out_box_; const int64_t K_;
T* out_box_wt_;
int* out_label_; T* out_;
T* out_label_wt_; WT* out_wt_;
TargetAssignFunctor(const T* gt_box, const int* gt_label, TargetAssignFunctor(const T* input, const int* match_indices,
const int* match_indices, const size_t* lod, const size_t* lod, const int mismatch_value,
const int background_label, const int64_t num, const int64_t N, const int64_t M, const int64_t P,
const int64_t np, T* out_box, T* out_box_wt, const int64_t K, T* out, WT* out_wt)
int* out_label, T* out_label_wt) : in_(input),
: gt_box_(gt_box),
gt_label_(gt_label),
match_indices_(match_indices), match_indices_(match_indices),
lod_(lod), lod_(lod),
background_label_(background_label), mismatch_value_(mismatch_value),
num_(num), N_(N),
num_prior_box_(np), M_(M),
out_box_(out_box), P_(P),
out_box_wt_(out_box_wt), K_(K),
out_label_(out_label), out_(out),
out_label_wt_(out_label_wt) {} out_wt_(out_wt) {}
HOSTDEVICE void operator()(size_t i) const { HOSTDEVICE void operator()(size_t i) const {
int row = i / num_prior_box_; int h = i / M_;
int col = i - row * num_prior_box_; int w = i - h * M_;
size_t row_off = lod_[row]; size_t off = lod_[h];
int offset = row * num_prior_box_ + col; int id = match_indices_[i];
int id = match_indices_[offset]; T* out = out_ + i * K_;
T* obox = out_box_ + offset * 4; WT* out_wt = out_wt_ + i;
int* olabel = out_label_ + offset;
T* obox_wt = out_box_wt_ + offset;
T* olabel_wt = out_label_wt_ + offset;
if (id > -1) { if (id > -1) {
const T* gtbox = gt_box_ + ((row_off + id) * num_prior_box_ + col) * 4; int w_off = w % P_;
const T* in = in_ + ((off + id) * P_ + w_off) * K_;
obox[0] = gtbox[0]; for (int64_t k = 0; k < K_; ++k) {
obox[1] = gtbox[1]; out[k] = in[k];
obox[2] = gtbox[2]; }
obox[3] = gtbox[3]; out_wt[0] = static_cast<WT>(1.);
olabel[0] = gt_label_[row_off + id];
obox_wt[0] = static_cast<T>(1.);
olabel_wt[0] = static_cast<T>(1.);
} else { } else {
obox[0] = static_cast<T>(0.); for (int64_t k = 0; k < K_; ++k) {
obox[1] = static_cast<T>(0.); out[k] = static_cast<T>(mismatch_value_);
obox[2] = static_cast<T>(0.); }
obox[3] = static_cast<T>(0.); out_wt[0] = static_cast<WT>(0.);
olabel[0] = background_label_;
obox_wt[0] = static_cast<T>(0.);
olabel_wt[0] = static_cast<T>(0.);
} }
} }
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T, typename WT>
struct NegTargetAssignFunctor { struct NegTargetAssignFunctor {
void operator()(const platform::DeviceContext& ctx, const int* neg_indices, void operator()(const platform::DeviceContext& ctx, const int* neg_indices,
const size_t* lod, const int num, const int num_prior_box, const size_t* lod, const int N, const int M, const int K,
const int background_label, int* out_label, const int mismatch_value, T* out, WT* out_wt) const;
T* out_label_wt) const;
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T, typename WT>
class TargetAssignKernel : public framework::OpKernel<T> { class TargetAssignKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto* enc_gt_box = ctx.Input<framework::LoDTensor>("EncodedGTBBox"); auto* x = ctx.Input<framework::LoDTensor>("X");
auto* gt_label = ctx.Input<framework::LoDTensor>("GTScoreLabel");
auto* match_indices = ctx.Input<framework::Tensor>("MatchIndices"); auto* match_indices = ctx.Input<framework::Tensor>("MatchIndices");
auto* neg_indices = ctx.Input<framework::LoDTensor>("NegIndices");
auto* out_box = ctx.Output<framework::Tensor>("PredBBoxLabel");
auto* out_box_wt = ctx.Output<framework::Tensor>("PredBBoxWeight");
auto* out_label = ctx.Output<framework::Tensor>("PredScoreLabel");
auto* out_label_wt = ctx.Output<framework::Tensor>("PredScoreWeight");
PADDLE_ENFORCE_EQ(enc_gt_box->lod().size(), 1UL); auto* out = ctx.Output<framework::Tensor>("Out");
PADDLE_ENFORCE_EQ(gt_label->lod().size(), 1UL); auto* out_wt = ctx.Output<framework::Tensor>("OutWeight");
PADDLE_ENFORCE_EQ(neg_indices->lod().size(), 1UL);
int background_label = ctx.Attr<int>("background_label"); PADDLE_ENFORCE_EQ(x->lod().size(), 1UL);
int mismatch_value = ctx.Attr<int>("mismatch_value");
const T* box_data = enc_gt_box->data<T>(); const T* x_data = x->data<T>();
const int* label_data = gt_label->data<int>();
const int* match_idx_data = match_indices->data<int>(); const int* match_idx_data = match_indices->data<int>();
const int* neg_idx_data = neg_indices->data<int>();
T* obox_data = out_box->mutable_data<T>(ctx.GetPlace()); T* out_data = out->mutable_data<T>(ctx.GetPlace());
T* obox_wt_data = out_box_wt->mutable_data<T>(ctx.GetPlace()); WT* out_wt_data = out_wt->mutable_data<WT>(ctx.GetPlace());
int* olabel_data = out_label->mutable_data<int>(ctx.GetPlace());
T* olabel_wt_data = out_label_wt->mutable_data<T>(ctx.GetPlace());
int64_t num = match_indices->dims()[0]; int64_t n = match_indices->dims()[0];
int64_t num_prior_box = match_indices->dims()[1]; int64_t m = match_indices->dims()[1];
int64_t p = x->dims()[1];
int64_t k = x->dims()[2];
auto gt_lod = enc_gt_box->lod().back(); auto x_lod = x->lod().back();
auto gt_label_lod = gt_label->lod().back(); size_t* x_lod_data = x_lod.MutableData(ctx.GetPlace());
auto neg_lod = neg_indices->lod().back();
for (size_t i = 0; i < gt_lod.size(); ++i) {
PADDLE_ENFORCE_EQ(gt_lod.data()[i], gt_label_lod.data()[i]);
}
size_t* gt_lod_data = gt_lod.MutableData(ctx.GetPlace());
size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace());
TargetAssignFunctor<T> functor(box_data, label_data, match_idx_data, TargetAssignFunctor<T, WT> functor(x_data, match_idx_data, x_lod_data,
gt_lod_data, background_label, num, mismatch_value, n, m, p, k, out_data,
num_prior_box, obox_data, obox_wt_data, out_wt_data);
olabel_data, olabel_wt_data);
auto& device_ctx = ctx.template device_context<DeviceContext>(); auto& device_ctx = ctx.template device_context<DeviceContext>();
platform::ForRange<DeviceContext> for_range(device_ctx, platform::ForRange<DeviceContext> for_range(device_ctx, n * m);
num * num_prior_box);
for_range(functor); for_range(functor);
NegTargetAssignFunctor<DeviceContext, T> neg_trg_functor; auto* neg_indices = ctx.Input<framework::LoDTensor>("NegIndices");
neg_trg_functor(device_ctx, neg_idx_data, neg_lod_data, num, num_prior_box, if (neg_indices) {
background_label, olabel_data, olabel_wt_data); PADDLE_ENFORCE_EQ(neg_indices->lod().size(), 1UL);
const int* neg_idx_data = neg_indices->data<int>();
auto neg_lod = neg_indices->lod().back();
size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace());
NegTargetAssignFunctor<DeviceContext, T, WT> neg_trg_functor;
neg_trg_functor(device_ctx, neg_idx_data, neg_lod_data, n, m, k,
mismatch_value, out_data, out_wt_data);
}
} }
}; };
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
#include "paddle/string/printf.h" #include "paddle/fluid/string/printf.h"
#include <ostream> #include <ostream>
#include <sstream> #include <sstream>
......
...@@ -23,8 +23,8 @@ limitations under the License. */ ...@@ -23,8 +23,8 @@ limitations under the License. */
#include <string> #include <string>
#include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/macros.h"
#include "paddle/string/printf.h" #include "paddle/fluid/string/printf.h"
#include "paddle/string/to_string.h" #include "paddle/fluid/string/to_string.h"
#ifdef __GNUC__ #ifdef __GNUC__
#include <cxxabi.h> // for __cxa_demangle #include <cxxabi.h> // for __cxa_demangle
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/string/piece.h" #include "paddle/fluid/string/piece.h"
using StringPiece = paddle::string::Piece; using StringPiece = paddle::string::Piece;
using paddle::string::HasPrefix; using paddle::string::HasPrefix;
......
...@@ -35,7 +35,7 @@ limitations under the License. */ ...@@ -35,7 +35,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/exception.h" #include "paddle/fluid/pybind/exception.h"
#include "paddle/fluid/pybind/pybind.h" #include "paddle/fluid/pybind/pybind.h"
#include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/pybind/tensor_py.h"
#include "paddle/string/to_string.h" #include "paddle/fluid/string/to_string.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/string/piece.h" #include "piece.h"
#include <string.h> #include <string.h>
......
...@@ -28,7 +28,7 @@ namespace string { ...@@ -28,7 +28,7 @@ namespace string {
// its syntax is simple as it doesn't own/manage the string, it is // its syntax is simple as it doesn't own/manage the string, it is
// cheap to construct Pieces and pass them around. // cheap to construct Pieces and pass them around.
class Piece { class Piece {
public: public:
static const size_t npos = static_cast<size_t>(-1); static const size_t npos = static_cast<size_t>(-1);
// We provide non-explicit singleton constructors so users can // We provide non-explicit singleton constructors so users can
...@@ -55,7 +55,7 @@ public: ...@@ -55,7 +55,7 @@ public:
// Return a string that contains the copy of the referenced data. // Return a string that contains the copy of the referenced data.
std::string ToString() const { return std::string(data_, size_); } std::string ToString() const { return std::string(data_, size_); }
private: private:
const char* data_; const char* data_;
size_t size_; size_t size_;
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/string/piece.h" #include "paddle/fluid/string/piece.h"
#include <sstream> #include <sstream>
......
...@@ -71,7 +71,7 @@ ...@@ -71,7 +71,7 @@
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include "paddle/string/tinyformat/tinyformat.h" // https://github.com/c42f/tinyformat #include "tinyformat/tinyformat.h" // https://github.com/c42f/tinyformat
namespace paddle { namespace paddle {
namespace string { namespace string {
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/string/printf.h" #include "printf.h"
#include <string> #include <string>
...@@ -24,6 +24,6 @@ TEST(StringPrintf, StringPrintf) { ...@@ -24,6 +24,6 @@ TEST(StringPrintf, StringPrintf) {
long hour = 14; long hour = 14;
int min = 44; int min = 44;
EXPECT_EQ(std::string("Wednesday, July 27, 14:44"), EXPECT_EQ(std::string("Wednesday, July 27, 14:44"),
paddle::string::Sprintf( paddle::string::Sprintf("%s, %s %d, %.2d:%.2d", weekday, month, day,
"%s, %s %d, %.2d:%.2d", weekday, month, day, hour, min)); hour, min));
} }
...@@ -147,7 +147,7 @@ namespace detail { ...@@ -147,7 +147,7 @@ namespace detail {
// Test whether type T1 is convertible to type T2 // Test whether type T1 is convertible to type T2
template <typename T1, typename T2> template <typename T1, typename T2>
struct is_convertible { struct is_convertible {
private: private:
// two types of different size // two types of different size
struct fail { struct fail {
char dummy[2]; char dummy[2];
...@@ -160,7 +160,7 @@ private: ...@@ -160,7 +160,7 @@ private:
static succeed tryConvert(const T2 &); static succeed tryConvert(const T2 &);
static const T1 &makeT1(); static const T1 &makeT1();
public: public:
// Standard trick: the (...) version of tryConvert will be chosen from // Standard trick: the (...) version of tryConvert will be chosen from
// the overload set only if the version taking a T2 doesn't match. // the overload set only if the version taking a T2 doesn't match.
// Then we compare the sizes of the return types to check which // Then we compare the sizes of the return types to check which
...@@ -170,8 +170,7 @@ public: ...@@ -170,8 +170,7 @@ public:
// Format the value by casting to type fmtT. This default implementation // Format the value by casting to type fmtT. This default implementation
// should never be called. // should never be called.
template <typename T, template <typename T, typename fmtT,
typename fmtT,
bool convertible = is_convertible<T, fmtT>::value> bool convertible = is_convertible<T, fmtT>::value>
struct formatValueAsType { struct formatValueAsType {
static void invoke(std::ostream & /*out*/, const T & /*value*/) { assert(0); } static void invoke(std::ostream & /*out*/, const T & /*value*/) { assert(0); }
...@@ -241,11 +240,8 @@ TINYFORMAT_DEFINE_FORMAT_TRUNCATED_CSTR(char) ...@@ -241,11 +240,8 @@ TINYFORMAT_DEFINE_FORMAT_TRUNCATED_CSTR(char)
/// operator<< to format the type T, with special cases for the %c and %p /// operator<< to format the type T, with special cases for the %c and %p
/// conversions. /// conversions.
template <typename T> template <typename T>
inline void formatValue(std::ostream &out, inline void formatValue(std::ostream &out, const char * /*fmtBegin*/,
const char * /*fmtBegin*/, const char *fmtEnd, int ntrunc, const T &value) {
const char *fmtEnd,
int ntrunc,
const T &value) {
// The mess here is to support the %c and %p conversions: if these // The mess here is to support the %c and %p conversions: if these
// conversions are active we try to convert the type to a char or const // conversions are active we try to convert the type to a char or const
// void* respectively and format that instead of the value itself. For the // void* respectively and format that instead of the value itself. For the
...@@ -267,25 +263,22 @@ inline void formatValue(std::ostream &out, ...@@ -267,25 +263,22 @@ inline void formatValue(std::ostream &out,
} }
// Overloaded version for char types to support printing as an integer // Overloaded version for char types to support printing as an integer
#define TINYFORMAT_DEFINE_FORMATVALUE_CHAR(charType) \ #define TINYFORMAT_DEFINE_FORMATVALUE_CHAR(charType) \
inline void formatValue(std::ostream &out, \ inline void formatValue(std::ostream &out, const char * /*fmtBegin*/, \
const char * /*fmtBegin*/, \ const char *fmtEnd, int /**/, charType value) { \
const char *fmtEnd, \ switch (*(fmtEnd - 1)) { \
int /**/, \ case 'u': \
charType value) { \ case 'd': \
switch (*(fmtEnd - 1)) { \ case 'i': \
case 'u': \ case 'o': \
case 'd': \ case 'X': \
case 'i': \ case 'x': \
case 'o': \ out << static_cast<int>(value); \
case 'X': \ break; \
case 'x': \ default: \
out << static_cast<int>(value); \ out << value; \
break; \ break; \
default: \ } \
out << value; \
break; \
} \
} }
// per 3.9.1: char, signed char and unsigned char are all distinct types // per 3.9.1: char, signed char and unsigned char are all distinct types
TINYFORMAT_DEFINE_FORMATVALUE_CHAR(char) TINYFORMAT_DEFINE_FORMATVALUE_CHAR(char)
...@@ -482,7 +475,7 @@ namespace detail { ...@@ -482,7 +475,7 @@ namespace detail {
// each argument to be allocated as a homogenous array inside FormatList // each argument to be allocated as a homogenous array inside FormatList
// whereas a naive implementation based on inheritance does not. // whereas a naive implementation based on inheritance does not.
class FormatArg { class FormatArg {
public: public:
FormatArg() {} FormatArg() {}
template <typename T> template <typename T>
...@@ -491,22 +484,17 @@ public: ...@@ -491,22 +484,17 @@ public:
m_formatImpl(&formatImpl<T>), m_formatImpl(&formatImpl<T>),
m_toIntImpl(&toIntImpl<T>) {} m_toIntImpl(&toIntImpl<T>) {}
void format(std::ostream &out, void format(std::ostream &out, const char *fmtBegin, const char *fmtEnd,
const char *fmtBegin,
const char *fmtEnd,
int ntrunc) const { int ntrunc) const {
m_formatImpl(out, fmtBegin, fmtEnd, ntrunc, m_value); m_formatImpl(out, fmtBegin, fmtEnd, ntrunc, m_value);
} }
int toInt() const { return m_toIntImpl(m_value); } int toInt() const { return m_toIntImpl(m_value); }
private: private:
template <typename T> template <typename T>
static void formatImpl(std::ostream &out, static void formatImpl(std::ostream &out, const char *fmtBegin,
const char *fmtBegin, const char *fmtEnd, int ntrunc, const void *value) {
const char *fmtEnd,
int ntrunc,
const void *value) {
formatValue(out, fmtBegin, fmtEnd, ntrunc, *static_cast<const T *>(value)); formatValue(out, fmtBegin, fmtEnd, ntrunc, *static_cast<const T *>(value));
} }
...@@ -516,11 +504,8 @@ private: ...@@ -516,11 +504,8 @@ private:
} }
const void *m_value; const void *m_value;
void (*m_formatImpl)(std::ostream &out, void (*m_formatImpl)(std::ostream &out, const char *fmtBegin,
const char *fmtBegin, const char *fmtEnd, int ntrunc, const void *value);
const char *fmtEnd,
int ntrunc,
const void *value);
int (*m_toIntImpl)(const void *value); int (*m_toIntImpl)(const void *value);
}; };
...@@ -569,12 +554,10 @@ inline const char *printFormatStringLiteral(std::ostream &out, ...@@ -569,12 +554,10 @@ inline const char *printFormatStringLiteral(std::ostream &out,
// necessary to pull out variable width and precision . The function returns a // necessary to pull out variable width and precision . The function returns a
// pointer to the character after the end of the current format spec. // pointer to the character after the end of the current format spec.
inline const char *streamStateFromFormat(std::ostream &out, inline const char *streamStateFromFormat(std::ostream &out,
bool &spacePadPositive, bool &spacePadPositive, int &ntrunc,
int &ntrunc,
const char *fmtStart, const char *fmtStart,
const detail::FormatArg *formatters, const detail::FormatArg *formatters,
int &argIndex, int &argIndex, int numFormatters) {
int numFormatters) {
if (*fmtStart != '%') { if (*fmtStart != '%') {
TINYFORMAT_ERROR( TINYFORMAT_ERROR(
"tinyformat: Not enough conversion specifiers in format string"); "tinyformat: Not enough conversion specifiers in format string");
...@@ -750,10 +733,8 @@ inline const char *streamStateFromFormat(std::ostream &out, ...@@ -750,10 +733,8 @@ inline const char *streamStateFromFormat(std::ostream &out,
} }
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
inline void formatImpl(std::ostream &out, inline void formatImpl(std::ostream &out, const char *fmt,
const char *fmt, const detail::FormatArg *formatters, int numFormatters) {
const detail::FormatArg *formatters,
int numFormatters) {
// Saved stream state // Saved stream state
std::streamsize origWidth = out.width(); std::streamsize origWidth = out.width();
std::streamsize origPrecision = out.precision(); std::streamsize origPrecision = out.precision();
...@@ -765,13 +746,9 @@ inline void formatImpl(std::ostream &out, ...@@ -765,13 +746,9 @@ inline void formatImpl(std::ostream &out,
fmt = printFormatStringLiteral(out, fmt); fmt = printFormatStringLiteral(out, fmt);
bool spacePadPositive = false; bool spacePadPositive = false;
int ntrunc = -1; int ntrunc = -1;
const char *fmtEnd = streamStateFromFormat(out, const char *fmtEnd =
spacePadPositive, streamStateFromFormat(out, spacePadPositive, ntrunc, fmt, formatters,
ntrunc, argIndex, numFormatters);
fmt,
formatters,
argIndex,
numFormatters);
if (argIndex >= numFormatters) { if (argIndex >= numFormatters) {
// Check args remain after reading any variable width/precision // Check args remain after reading any variable width/precision
TINYFORMAT_ERROR("tinyformat: Not enough format arguments"); TINYFORMAT_ERROR("tinyformat: Not enough format arguments");
...@@ -820,15 +797,14 @@ inline void formatImpl(std::ostream &out, ...@@ -820,15 +797,14 @@ inline void formatImpl(std::ostream &out,
/// information has been stripped from the arguments, leaving just enough of a /// information has been stripped from the arguments, leaving just enough of a
/// common interface to perform formatting as required. /// common interface to perform formatting as required.
class FormatList { class FormatList {
public: public:
FormatList(detail::FormatArg *formatters, int N) FormatList(detail::FormatArg *formatters, int N)
: m_formatters(formatters), m_N(N) {} : m_formatters(formatters), m_N(N) {}
friend void vformat(std::ostream &out, friend void vformat(std::ostream &out, const char *fmt,
const char *fmt,
const FormatList &list); const FormatList &list);
private: private:
const detail::FormatArg *m_formatters; const detail::FormatArg *m_formatters;
int m_N; int m_N;
}; };
...@@ -841,7 +817,7 @@ namespace detail { ...@@ -841,7 +817,7 @@ namespace detail {
// Format list subclass with fixed storage to avoid dynamic allocation // Format list subclass with fixed storage to avoid dynamic allocation
template <int N> template <int N>
class FormatListN : public FormatList { class FormatListN : public FormatList {
public: public:
template <typename... Args> template <typename... Args>
FormatListN(const Args &... args) FormatListN(const Args &... args)
: FormatList(&m_formatterStore[0], N), : FormatList(&m_formatterStore[0], N),
...@@ -849,14 +825,14 @@ public: ...@@ -849,14 +825,14 @@ public:
static_assert(sizeof...(args) == N, "Number of args must be N"); static_assert(sizeof...(args) == N, "Number of args must be N");
} }
private: private:
FormatArg m_formatterStore[N]; FormatArg m_formatterStore[N];
}; };
// Special 0-arg version - MSVC says zero-sized C array in struct is nonstandard // Special 0-arg version - MSVC says zero-sized C array in struct is nonstandard
template <> template <>
class FormatListN<0> : public FormatList { class FormatListN<0> : public FormatList {
public: public:
FormatListN() : FormatList(0, 0) {} FormatListN() : FormatList(0, 0) {}
}; };
......
...@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/string/to_string.h" #include "to_string.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
constexpr char kOutputString[] = "User Defined Output"; constexpr char kOutputString[] = "User Defined Output";
class UserDefinedClass { class UserDefinedClass {
public: public:
}; };
std::ostream& operator<<(std::ostream& s, const UserDefinedClass& ins) { std::ostream& operator<<(std::ostream& s, const UserDefinedClass& ins) {
......
...@@ -115,8 +115,8 @@ EOF ...@@ -115,8 +115,8 @@ EOF
-DWITH_AVX=${WITH_AVX:-ON} \ -DWITH_AVX=${WITH_AVX:-ON} \
-DWITH_SWIG_PY=ON \ -DWITH_SWIG_PY=ON \
-DWITH_STYLE_CHECK=OFF -DWITH_STYLE_CHECK=OFF
make -j `nproc` gen_proto_py make -j `nproc` gen_proto_py framework_py_proto
make -j `nproc` paddle_python make -j `nproc` copy_paddle_pybind
make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs
popd popd
fi fi
......
...@@ -6,9 +6,9 @@ mkdir -p $TRAVIS_BUILD_DIR/build ...@@ -6,9 +6,9 @@ mkdir -p $TRAVIS_BUILD_DIR/build
cd $TRAVIS_BUILD_DIR/build cd $TRAVIS_BUILD_DIR/build
# Compile Documentation only. # Compile Documentation only.
cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON -DWITH_STYLE_CHECK=OFF
make -j `nproc` gen_proto_py make -j `nproc` gen_proto_py framework_py_proto
make -j `nproc` paddle_python make -j `nproc` copy_paddle_pybind
make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs
# check websites for broken links # check websites for broken links
......
...@@ -33,6 +33,57 @@ class VarBlock: ...@@ -33,6 +33,57 @@ class VarBlock:
return "%s:%d:%d" % (self.varname, self.offset, self.size) return "%s:%d:%d" % (self.varname, self.offset, self.size)
class UnionFind(object):
    """ Union-find data struct.

    Union-find is a data struct that keeps track of a set of elements
    partitioned into a number of disjoint (non-overlapping) subsets.

    Reference:
    https://en.wikipedia.org/wiki/Disjoint-set_data_structure

    Args:
        elementes(list): The initial element list. Elements are used as
            dictionary keys, so they must be hashable. (The parameter name
            is misspelled; it is kept unchanged for backward compatibility.)
    """

    def __init__(self, elementes=None):
        self._parents = []  # index -> parent index
        self._index = {}  # element -> index
        self._curr_idx = 0
        if not elementes:
            elementes = []
        # Every element starts as the root of its own singleton set.
        for ele in elementes:
            self._parents.append(self._curr_idx)
            self._index.update({ele: self._curr_idx})
            self._curr_idx += 1

    def find(self, x):
        """Return the root index of the set containing x.

        Returns -1 when x was not part of the initial element list.
        While walking up to the root, every visited node is re-pointed to
        its grandparent, which shortens the path for later lookups.
        """
        if x not in self._index:
            return -1
        idx = self._index[x]
        while idx != self._parents[idx]:
            t = self._parents[idx]
            self._parents[idx] = self._parents[t]
            idx = t
        return idx

    def union(self, x, y):
        """Merge the sets containing x and y.

        This is a no-op when x and y are already in the same set, or when
        either of them is not a tracked element.
        """
        x_root = self.find(x)
        y_root = self.find(y)
        # find() signals "unknown element" with -1. Using -1 as a list index
        # would silently rewrite the parent of the last tracked element (or
        # store -1 as a parent), so unknown elements must be rejected here.
        if x_root < 0 or y_root < 0:
            return
        if x_root == y_root:
            return
        self._parents[x_root] = y_root

    def is_connected(self, x, y):
        """Return True if x and y are tracked and share the same root index.

        Untracked elements are never connected to anything; without this
        guard two unknown elements would compare equal through the -1
        sentinel returned by find().
        """
        x_root = self.find(x)
        if x_root < 0:
            return False
        return x_root == self.find(y)
def same_or_split_var(p_name, var_name): def same_or_split_var(p_name, var_name):
return p_name == var_name or p_name.startswith(var_name + ".block") return p_name == var_name or p_name.startswith(var_name + ".block")
...@@ -203,6 +254,21 @@ class DistributeTranspiler: ...@@ -203,6 +254,21 @@ class DistributeTranspiler:
(varname, self.trainer_id) (varname, self.trainer_id)
startup_prog.global_block().rename_var(varname, new_var_name) startup_prog.global_block().rename_var(varname, new_var_name)
# self.lr_param_mapping = self._create_lr_param_mapping()
# def _create_lr_param_mapping(self):
# lr_mapping = dict()
# for _, opt_op in enumerate(self.optimize_ops):
# if not opt_op.inputs or not opt_op.inputs.has_key("LearningRate") \
# or not opt_op.inputs.has_key("Param"):
# continue
# lr = opt_op.inputs["LearningRate"].name
# param = opt_op.inputs["Param"].name
# if not lr_mapping.has_key(lr):
# lr_mapping.update({lr: list()})
# lr_mapping[lr].append(param)
# return lr_mapping
def _create_vars_from_blocklist(self, program, block_list): def _create_vars_from_blocklist(self, program, block_list):
# Create respective variables using the block_list # Create respective variables using the block_list
block_map = dict() block_map = dict()
...@@ -333,50 +399,15 @@ class DistributeTranspiler: ...@@ -333,50 +399,15 @@ class DistributeTranspiler:
pass pass
return orig_shape return orig_shape
def _op_input_var(self, op, varname): def _fetch_var_names(self, param_dict):
pass res = []
if not param_dict:
def _is_op_on_pserver(self, endpoint, all_ops, idx): return res
""" for _, values in param_dict.iteritems():
Recursively check if the op need to run on current server. if not isinstance(values, list):
Assume that ops are in the execution order. values = [values]
""" res += [v.name for v in values]
param_names = [ return res
p.name for p in self.param_grad_ep_mapping[endpoint]["params"]
]
op = all_ops[idx]
input_names = set(op.input_names)
# TODO(typhoonzero): using Param and Grad input name to identify
# that the operator is an optimization operator, need a better way.
if "Param" in input_names:
if op.input("Param")[0] in param_names:
return True
else:
for n in param_names:
if same_or_split_var(n, op.input("Param")[0]) \
and n != op.input("Param")[0]:
return True
return False
else:
j = idx - 1
while j >= 0:
prev_op = all_ops[j]
# NOTE(typhoonzero): consider list input/output
prev_output_names = prev_op.desc.output_arg_names()
prev_input_names = prev_op.desc.input_arg_names()
found1 = False
found2 = False
for varname in op.desc.input_arg_names():
if varname in prev_output_names:
found1 = self._is_op_on_pserver(endpoint, all_ops, j)
# later ops may produce output for prev op's next batch use.
for varname in op.desc.output_arg_names():
if varname in prev_input_names:
found2 = self._is_op_on_pserver(endpoint, all_ops, j)
if found1 or found2:
return True
j -= 1
return False
def _append_pserver_ops(self, optimize_block, opt_op, endpoint): def _append_pserver_ops(self, optimize_block, opt_op, endpoint):
program = optimize_block.program program = optimize_block.program
...@@ -394,11 +425,7 @@ class DistributeTranspiler: ...@@ -394,11 +425,7 @@ class DistributeTranspiler:
# do not append this op if current endpoint # do not append this op if current endpoint
# is not dealing with this grad block # is not dealing with this grad block
return return
merged_var = program.global_block().create_var( merged_var = program.global_block().vars[grad_block.name]
name=grad_block.name,
persistable=grad_block.persistable,
dtype=grad_block.dtype,
shape=grad_block.shape)
# append merging ops if trainers > 1 # append merging ops if trainers > 1
if self.trainers > 1: if self.trainers > 1:
vars2merge = self._create_var_for_trainers( vars2merge = self._create_var_for_trainers(
...@@ -429,13 +456,19 @@ class DistributeTranspiler: ...@@ -429,13 +456,19 @@ class DistributeTranspiler:
shape=param_block.shape) shape=param_block.shape)
new_inputs[key] = tmpvar new_inputs[key] = tmpvar
elif key == "LearningRate":
# The learning rate variable has already been created by a non-optimize op,
# so don't create it again.
new_inputs[key] = program.global_block().vars[opt_op.input(key)[
0]]
for key in opt_op.input_names: for key in opt_op.input_names:
if key in ["Param", "Grad"]: new_shape = None
if key in ["Param", "Grad", "LearningRate"]:
continue continue
var = program.global_block().vars[opt_op.input(key)[0]]
# update accumulator variable shape # update accumulator variable shape
param_shape = new_inputs["Param"].shape param_shape = new_inputs["Param"].shape
var = program.global_block().vars[opt_op.input(key)[0]]
new_shape = self._get_optimizer_input_shape(opt_op.type, key, new_shape = self._get_optimizer_input_shape(opt_op.type, key,
var.shape, param_shape) var.shape, param_shape)
tmpvar = program.global_block().create_var( tmpvar = program.global_block().create_var(
...@@ -446,12 +479,11 @@ class DistributeTranspiler: ...@@ -446,12 +479,11 @@ class DistributeTranspiler:
new_inputs[key] = tmpvar new_inputs[key] = tmpvar
# change output's ParamOut variable # change output's ParamOut variable
outputs = self._get_output_map_from_op(program.global_block(), opt_op) opt_op.outputs["ParamOut"] = new_inputs["Param"]
outputs["ParamOut"] = new_inputs["Param"]
optimize_block.append_op( optimize_block.append_op(
type=opt_op.type, type=opt_op.type,
inputs=new_inputs, inputs=new_inputs,
outputs=outputs, outputs=opt_op.outputs,
attrs=opt_op.attrs) attrs=opt_op.attrs)
def _append_pserver_non_opt_ops(self, optimize_block, opt_op): def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
...@@ -459,11 +491,10 @@ class DistributeTranspiler: ...@@ -459,11 +491,10 @@ class DistributeTranspiler:
# Append the ops for parameters that do not need to be optimized/updated # Append the ops for parameters that do not need to be optimized/updated
inputs = self._get_input_map_from_op(self.program.global_block().vars, inputs = self._get_input_map_from_op(self.program.global_block().vars,
opt_op) opt_op)
for var in inputs.itervalues(): for varlist in inputs.itervalues():
if type(var) == list: if not isinstance(varlist, list):
varlist = var varlist = [varlist]
else:
varlist = [var]
for var in varlist: for var in varlist:
if not program.global_block().vars.has_key(var.name): if not program.global_block().vars.has_key(var.name):
program.global_block().create_var( program.global_block().create_var(
...@@ -475,12 +506,70 @@ class DistributeTranspiler: ...@@ -475,12 +506,70 @@ class DistributeTranspiler:
outputs = self._get_output_map_from_op(self.program.global_block().vars, outputs = self._get_output_map_from_op(self.program.global_block().vars,
opt_op) opt_op)
for varlist in outputs.itervalues():
if not isinstance(varlist, list):
varlist = [varlist]
for var in varlist:
program.global_block().create_var(
name=var.name,
persistable=var.persistable,
dtype=var.dtype,
shape=var.shape)
optimize_block.append_op( optimize_block.append_op(
type=opt_op.type, type=opt_op.type,
inputs=inputs, inputs=inputs,
outputs=outputs, outputs=outputs,
attrs=opt_op.attrs) attrs=opt_op.attrs)
def _is_op_connected(self, op1, op2):
# If one op's input is another op's output or
# one op's output is another op's input, we say
# the two operator is connected.
op1_input_names = self._fetch_var_names(op1.inputs)
op1_output_names = self._fetch_var_names(op1.outputs)
op2_input_names = self._fetch_var_names(op2.inputs)
op2_output_names = self._fetch_var_names(op2.outputs)
if set(op1_output_names) & set(op2_input_names) or \
set(op1_input_names) & set(op2_output_names):
return True
return False
def _create_ufind(self, optimize_ops):
    """Group the optimize ops into connected sets with a union-find.

    A ``UnionFind`` is created over ``optimize_ops``; every pair of ops
    (each op is also paired with itself) that ``self._is_op_connected``
    reports as connected is merged with ``union``.

    Args:
        optimize_ops: indexable sequence of operators.

    Returns:
        The populated ``UnionFind`` instance.
    """
    op_sets = UnionFind(optimize_ops)
    op_count = len(optimize_ops)
    for first_idx in xrange(op_count):
        # The inner range starts at first_idx, so each unordered pair is
        # visited exactly once.
        for second_idx in xrange(first_idx, op_count):
            first_op = optimize_ops[first_idx]
            second_op = optimize_ops[second_idx]
            if not self._is_op_connected(first_op, second_op):
                continue
            op_sets.union(first_op, second_op)
    return op_sets
def _is_opt_op(self, op):
# NOTE: It's a HACK implement.
# optimize op: SGDOptimize, MomentumOptimizer, AdamOptimizer and etc...
if "Param" in op.input_names and \
"LearningRate" in op.input_names:
return True
return False
def _is_opt_op_on_pserver(self, endpoint, op):
param_names = [
p.name for p in self.param_grad_ep_mapping[endpoint]["params"]
]
if op.input("Param") in param_names:
return True
else:
for n in param_names:
param = op.input("Param")
if same_or_split_var(n, param) and n != param:
return True
return False
return False
def get_pserver_program(self, endpoint): def get_pserver_program(self, endpoint):
""" """
Get pserver side program using the endpoint Get pserver side program using the endpoint
...@@ -514,17 +603,30 @@ class DistributeTranspiler: ...@@ -514,17 +603,30 @@ class DistributeTranspiler:
recv_inputs.append(var) recv_inputs.append(var)
# step6 # step6
optimize_block = pserver_program.create_block(0) optimize_block = pserver_program.create_block(0)
# Iterate through the ops and append ops as needed # step 6.1
for idx, opt_op in enumerate(self.optimize_ops): # Create a union-find data struct by optimize ops,
is_op_on_pserver = self._is_op_on_pserver(endpoint, # If two ops are connected, we could add these two ops
self.optimize_ops, idx) # into one set.
if not is_op_on_pserver: ufind = self._create_ufind(self.optimize_ops)
continue # step 6.2
if "Grad" in opt_op.desc.input_arg_names(): # Iterate through the ops and append optimize op which
self._append_pserver_ops(optimize_block, opt_op, endpoint) # located on current pserver
else: opt_op_on_pserver = []
self._append_pserver_non_opt_ops(optimize_block, opt_op) for _, op in enumerate(self.optimize_ops):
if self._is_opt_op(op) and self._is_opt_op_on_pserver(endpoint, op):
opt_op_on_pserver.append(op)
# step 6.3
# Iterate through the ops, and if an op and an optimize op
# that is located on the current pserver are in one set, then
# append it to the sub-program.
for _, op in enumerate(self.optimize_ops):
for _, opt_op in enumerate(opt_op_on_pserver):
if ufind.is_connected(op, opt_op):
if self._is_opt_op(op):
self._append_pserver_ops(optimize_block, op, endpoint)
else:
self._append_pserver_non_opt_ops(optimize_block, op)
break
# Append the listen_and_serv op # Append the listen_and_serv op
pserver_program.global_block().append_op( pserver_program.global_block().append_op(
type="listen_and_serv", type="listen_and_serv",
......
...@@ -117,6 +117,7 @@ def monkey_patch_variable(): ...@@ -117,6 +117,7 @@ def monkey_patch_variable():
tmp_name = unique_tmp_name() tmp_name = unique_tmp_name()
out = self.block.create_var(name=tmp_name, dtype=lhs_dtype) out = self.block.create_var(name=tmp_name, dtype=lhs_dtype)
self.block.append_op( self.block.append_op(
type=op_type, type=op_type,
inputs={'X': [self], inputs={'X': [self],
......
...@@ -99,7 +99,7 @@ elif training_role == "TRAINER": ...@@ -99,7 +99,7 @@ elif training_role == "TRAINER":
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
for pass_id in range(PASS_NUM): for pass_id in range(PASS_NUM):
for data in train_reader(): for data in train_reader():
avg_cost_np = exe.run(fluid.default_main_program(), avg_cost_np = exe.run(t.get_trainer_program(),
feed=feeder.feed(data), feed=feeder.feed(data),
fetch_list=[avg_cost]) fetch_list=[avg_cost])
print("avg_cost_np", avg_cost_np) print("avg_cost_np", avg_cost_np)
......
...@@ -64,9 +64,7 @@ exe = fluid.Executor(place) ...@@ -64,9 +64,7 @@ exe = fluid.Executor(place)
[res1, res2] = exe.run(prog, fetch_list=[out1, out2]) [res1, res2] = exe.run(prog, fetch_list=[out1, out2])
test_pass = res1.shape == (10, 2) and res2.shape == (10, 1) if not (res1.shape == (10, 2) and res2.shape == (10, 1)):
if not test_pass:
exit(1) exit(1)
exit(0) exit(0)
...@@ -73,5 +73,20 @@ class TestSequenceExpandCase3(TestSequenceExpand): ...@@ -73,5 +73,20 @@ class TestSequenceExpandCase3(TestSequenceExpand):
self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
class TestSequenceExpandCase4(TestSequenceExpand):
    """Variant of TestSequenceExpand with a 2x5 X and a two-level lod on Y."""

    def set_data(self):
        """Populate ``self.inputs`` with the X and Y (data, lod) pairs."""
        x_values = [0.1, 0.3, 0.2, 0.15, 0.25, 0.2, 0.15, 0.25, 0.1, 0.3]
        x_data = np.array(x_values).reshape([2, 5]).astype('float32')
        x_lod = [[0, 1, 2]]
        # Y's contents are random; only its shape and lod are fixed.
        y_data = np.random.uniform(0.1, 1, [2, 1]).astype('float32')
        y_lod = [[0, 1, 2], [0, 1, 2]]
        self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -20,11 +20,11 @@ from op_test import OpTest ...@@ -20,11 +20,11 @@ from op_test import OpTest
class TestSplitOp(OpTest): class TestSplitOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "split" self.op_type = "split"
axis = 0 axis = 1
x = np.random.random((4, 2, 5)).astype('float32') x = np.random.random((4, 5, 6)).astype('float32')
out = np.split(x, [1, 3], axis) out = np.split(x, [2, 3], axis)
self.inputs = {'X': x} self.inputs = {'X': x}
self.attrs = {'axis': axis, 'sections': [1, 2, 1]} self.attrs = {'axis': axis, 'sections': [2, 1, 2]}
self.outputs = {'Out': [('out%d' % i, out[i]) \ self.outputs = {'Out': [('out%d' % i, out[i]) \
for i in xrange(len(out))]} for i in xrange(len(out))]}
......
...@@ -43,7 +43,7 @@ def gen_match_and_neg_indices(num_prior, gt_lod, neg_lod): ...@@ -43,7 +43,7 @@ def gen_match_and_neg_indices(num_prior, gt_lod, neg_lod):
def target_assign(encoded_box, gt_label, match_indices, neg_indices, gt_lod, def target_assign(encoded_box, gt_label, match_indices, neg_indices, gt_lod,
neg_lod, background_label): neg_lod, mismatch_value):
batch_size, num_prior = match_indices.shape batch_size, num_prior = match_indices.shape
# init target bbox # init target bbox
...@@ -52,7 +52,7 @@ def target_assign(encoded_box, gt_label, match_indices, neg_indices, gt_lod, ...@@ -52,7 +52,7 @@ def target_assign(encoded_box, gt_label, match_indices, neg_indices, gt_lod,
trg_box_wt = np.zeros((batch_size, num_prior, 1)).astype('float32') trg_box_wt = np.zeros((batch_size, num_prior, 1)).astype('float32')
# init target label # init target label
trg_label = np.ones((batch_size, num_prior, 1)).astype('int32') trg_label = np.ones((batch_size, num_prior, 1)).astype('int32')
trg_label = trg_label * background_label trg_label = trg_label * mismatch_value
# init weight for target label # init weight for target label
trg_label_wt = np.zeros((batch_size, num_prior, 1)).astype('float32') trg_label_wt = np.zeros((batch_size, num_prior, 1)).astype('float32')
...@@ -65,53 +65,90 @@ def target_assign(encoded_box, gt_label, match_indices, neg_indices, gt_lod, ...@@ -65,53 +65,90 @@ def target_assign(encoded_box, gt_label, match_indices, neg_indices, gt_lod,
# target bbox # target bbox
for v, c in zip(col_val + gt_start, col_ids[0].tolist()): for v, c in zip(col_val + gt_start, col_ids[0].tolist()):
trg_box[i][c][:] = encoded_box[v][c][:] trg_box[i][c][:] = encoded_box[v][c][:]
# weight for target bbox # weight for target bbox
trg_box_wt[i][col_ids] = 1.0 trg_box_wt[i][col_ids] = 1.0
trg_label[i][col_ids] = gt_label[col_val + gt_start] trg_label[i][col_ids] = gt_label[col_val + gt_start]
trg_label_wt[i][col_ids] = 1.0 trg_label_wt[i][col_ids] = 1.0
# set target label weight to 1.0 for the negative samples # set target label weight to 1.0 for the negative samples
neg_ids = neg_indices[neg_lod[i]:neg_lod[i + 1]] if neg_indices is not None:
trg_label_wt[i][neg_ids] = 1.0 neg_ids = neg_indices[neg_lod[i]:neg_lod[i + 1]]
trg_label_wt[i][neg_ids] = 1.0
return trg_box, trg_box_wt, trg_label, trg_label_wt return trg_box, trg_box_wt, trg_label, trg_label_wt
class TestTargetAssginOp(OpTest): class TestTargetAssginFloatType(OpTest):
def setUp(self): def setUp(self):
self.op_type = "target_assign" self.op_type = "target_assign"
num_prior = 120
num_class = 21
gt_lod = [0, 5, 11, 23]
neg_lod = [0, 4, 7, 13]
mismatch_value = 0
batch_size = len(gt_lod) - 1
num_gt = gt_lod[-1]
encoded_box = np.random.random((num_gt, num_prior, 4)).astype('float32')
gt_label = np.random.randint(
num_class, size=(num_gt, 1)).astype('int32')
match_indices, neg_indices = gen_match_and_neg_indices(num_prior,
gt_lod, neg_lod)
out, out_wt, _, _ = target_assign(encoded_box, gt_label, match_indices,
neg_indices, gt_lod, neg_lod,
mismatch_value)
# assign regression targets
x = encoded_box
self.inputs = {
'X': (x, [gt_lod]),
'MatchIndices': match_indices,
}
self.attrs = {'mismatch_value': mismatch_value}
self.outputs = {
'Out': out,
'OutWeight': out_wt,
}
def test_check_output(self):
self.check_output()
class TestTargetAssginIntType(OpTest):
def setUp(self):
self.op_type = "target_assign"
num_prior = 120 num_prior = 120
num_class = 21 num_class = 21
gt_lod = [0, 5, 11, 23] gt_lod = [0, 5, 11, 23]
neg_lod = [0, 4, 7, 13] neg_lod = [0, 4, 7, 13]
mismatch_value = 0
batch_size = len(gt_lod) - 1 batch_size = len(gt_lod) - 1
num_gt = gt_lod[-1] num_gt = gt_lod[-1]
background_label = 0
encoded_box = np.random.random((num_gt, num_prior, 4)).astype('float32') encoded_box = np.random.random((num_gt, num_prior, 4)).astype('float32')
gt_label = np.random.randint( gt_label = np.random.randint(
num_class, size=(num_gt, 1)).astype('int32') num_class, size=(num_gt, 1)).astype('int32')
match_indices, neg_indices = gen_match_and_neg_indices(num_prior, match_indices, neg_indices = gen_match_and_neg_indices(num_prior,
gt_lod, neg_lod) gt_lod, neg_lod)
trg_box, trg_box_wt, trg_label, trg_label_wt = target_assign(
encoded_box, gt_label, match_indices, neg_indices, gt_lod, neg_lod,
background_label)
_, _, out, out_wt, = target_assign(encoded_box, gt_label, match_indices,
neg_indices, gt_lod, neg_lod,
mismatch_value)
# assign classification targets
x = np.reshape(gt_label, (num_gt, 1, 1))
self.inputs = { self.inputs = {
'EncodedGTBBox': (encoded_box, [gt_lod]), 'X': (x, [gt_lod]),
'GTScoreLabel': (gt_label, [gt_lod]), 'MatchIndices': match_indices,
'MatchIndices': (match_indices),
'NegIndices': (neg_indices, [neg_lod]), 'NegIndices': (neg_indices, [neg_lod]),
} }
self.attrs = {'background_label': background_label} self.attrs = {'mismatch_value': mismatch_value}
self.outputs = { self.outputs = {
'PredBBoxLabel': (trg_box), 'Out': out,
'PredBBoxWeight': (trg_box_wt), 'OutWeight': out_wt,
'PredScoreLabel': (trg_label),
'PredScoreWeight': (trg_label_wt),
} }
def test_check_output(self): def test_check_output(self):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册