diff --git a/CMakeLists.txt b/CMakeLists.txt index e5b2f32fba7cf6b2f1eb9356833b3ff3a0be4c6d..ed704585d8a6bf3befd9a549aa5a62a33fea3da9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -62,7 +62,6 @@ option(WITH_DISTRIBUTE "Compile with distributed support" OFF) option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF) option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) -option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF) option(WITH_CONTRIB "Compile the third-party contributation" OFF) option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF) option(WITH_ANAKIN "Compile with Anakin library" OFF) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 586778feeeeecb0c6e5efdfa375f9786d5a5eaa7..08c29bb03f57aae3c485baffef88e830579dfcb5 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -175,6 +175,7 @@ paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dim paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None)) paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)) diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc index 8f4bab25ed4919881baf19a961a52aa229e06a8f..19248b4dfee1da81d18cd2effac08ba68dde80fb 100644 --- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h" #include +#include "paddle/fluid/framework/op_proto_maker.h" namespace paddle { namespace framework { @@ -36,6 +37,8 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetInput("X", inputs); } op->SetOutput("Out", outputs); + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kForward)); } // a->OP0->b diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc index 06286a109d01af638e74e06ccc83e2a5500663ea..2db7d95cae1c8c59691fd642e2462e92ed58814f 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/ir/fc_fuse_pass.h" #include +#include "paddle/fluid/framework/op_proto_maker.h" namespace paddle { namespace framework { @@ -32,6 +33,8 @@ void SetOp(ProgramDesc* prog, const std::string& type, op->SetInput("X", inputs); } op->SetOutput("Out", outputs); + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kForward)); 
} // a->OP0->b diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 265a128e95e6205159de67d87d5cb8ca1ad7d475..813f620d7ceb6f341e5fa6e96fb5ca0face96f51 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -23,8 +23,62 @@ limitations under the License. */ namespace paddle { namespace framework { namespace ir { +namespace { + +void CheckProgram(const ProgramDesc &program) { + std::map visit; +#define _INT(role) static_cast(role) + + for (size_t i = 0; i < program.Size(); ++i) { + for (OpDesc *op : program.Block(i).AllOps()) { + // For backward compatibility, some program doesn't have role added. + if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue; + int role_id = boost::get( + op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); + visit[role_id] = true; + switch (role_id) { + case _INT(OpRole::kForward): + PADDLE_ENFORCE( + visit.find(_INT(OpRole::kBackward)) == visit.end(), + "Cannot add forward operator before backward operator."); + break; + case _INT(OpRole::kBackward): + case _INT(OpRole::kBackward) | _INT(OpRole::kLoss): + PADDLE_ENFORCE( + visit.find(_INT(OpRole::kOptimize)) == visit.end(), + "Cannot add backward operator before optimize operator."); + break; + case _INT(OpRole::kForward) | _INT(OpRole::kLoss): + PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) | + _INT(OpRole::kLoss)) == visit.end(), + "Cannot add backward|loss operator before " + "forward|loss operator."); + PADDLE_ENFORCE( + visit.find(_INT(OpRole::kOptimize)) == visit.end(), + "Cannot add backward operator before optimize operator."); + break; + case _INT(OpRole::kOptimize): + case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched): + PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(), + "Optimize operators must follow backward operator."); + break; + case _INT(OpRole::kLRSched): + case _INT(OpRole::kDist): + case _INT(OpRole::kRPC): + case _INT(OpRole::kNotSpecified): + break; + default: + LOG(FATAL) << "Unknown operator role. Don't add new role because " + "you don't know what you are doing."; + } + } + } +#undef _INT +} +} // namespace Graph::Graph(const ProgramDesc &program) : program_(program) { + CheckProgram(program_); // Make the nodes id start from 0. Node::ResetId(); auto var_nodes = InitFromProgram(program_); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 29b604afbfcfc2bac67e447db8cd4c671c036dbe..b20d70132256bd5df7411c46ff4eb246b1f14ba8 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -259,6 +259,15 @@ GraphPatternDetector::DetectPatterns() { return result; } +bool GraphItemCMP(const std::pair &a, + const std::pair &b) { + if (a.first != b.first) { + return a.first < b.first; + } else { + return a.second < b.second; + } +} + // TODO(Superjomn) enhance the function as it marks unique unique as duplicates // see https://github.com/PaddlePaddle/Paddle/issues/13550 void GraphPatternDetector::UniquePatterns( @@ -267,12 +276,16 @@ void GraphPatternDetector::UniquePatterns( std::vector result; std::unordered_set set; + std::hash hasher; for (auto &g : *subgraphs) { - size_t key = 0; - for (auto &item : g) { - key ^= std::hash{}(item.first); - key ^= std::hash{}(item.second); + // Sort the items in the sub-graph, and transform to a string key. 
+ std::vector> sorted_keys(g.begin(), g.end()); + std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemCMP); + std::stringstream ss; + for (auto &item : sorted_keys) { + ss << item.first << ":" << item.second; } + auto key = hasher(ss.str()); if (!set.count(key)) { result.emplace_back(g); set.insert(key); diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 1e7da9a69c7cbf8c13306656599a759515802b76..669d08c70c9b7453264806b346a6c9eb211cfd4a 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -418,7 +418,7 @@ void LoDTensor::MergeLoDTensor( PADDLE_ENFORCE_EQ(new_lod.size(), lod.size()); for (size_t j = 0; j < lod.size(); ++j) { auto &sub_lod = new_lod[j]; - auto &offset = sub_lod.back(); + size_t offset = sub_lod.back(); for (size_t k = 1; k < lod[j].size(); ++k) { sub_lod.push_back(lod[j][k] + offset); } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 9259bb740a8a2408e4dc7be21711560fdf250752..45fc36c70633204dbfadbd10757c08b009d2cc74 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -354,18 +354,18 @@ void OperatorBase::GenerateTemporaryNames() { } } -static bool VarIsTensor(const Variable* var) { - return var->IsType() || var->IsType(); +static bool VarIsTensor(const Variable& var) { + return var.IsType() || var.IsType(); } -const Tensor* GetTensorFromVar(Variable* var) { - if (var->IsType()) { - return var->GetMutable(); - } else if (var->IsType()) { - return var->GetMutable()->mutable_value(); +const Tensor* GetTensorFromVar(const Variable& var) { + if (var.IsType()) { + return static_cast(&(var.Get())); + } else if (var.IsType()) { + return &(var.Get().value()); } else { PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", - var->Type().name()); + var.Type().name()); } } @@ -415,8 +415,7 @@ bool ExecutionContext::HasOutput(const std::string& name) const { template <> const Tensor* ExecutionContext::Input(const std::string& name) const { auto* var = InputVar(name); - return var == nullptr ? nullptr - : GetTensorFromVar(const_cast(var)); + return var == nullptr ? nullptr : GetTensorFromVar(*var); } template <> @@ -428,7 +427,7 @@ const std::vector ExecutionContext::MultiInput( std::transform(names.begin(), names.end(), std::back_inserter(res), [&](const std::string& sub_name) { auto var = scope_.FindVar(sub_name); - return var == nullptr ? nullptr : GetTensorFromVar(var); + return var == nullptr ? nullptr : GetTensorFromVar(*var); }); return res; } @@ -770,8 +769,10 @@ void OperatorWithKernel::TransferInplaceVarsBack( for (auto& var_name : inplace_vars) { VLOG(3) << "share inplace var " + var_name + " back to it's original scope"; auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name)); - auto* transformed_tensor = - GetTensorFromVar(transfer_scope.FindVar(var_name)); + auto* var = transfer_scope.FindVar(var_name); + PADDLE_ENFORCE(var != nullptr, "The var[%s] should not be nullptr", + var_name); + auto* transformed_tensor = GetTensorFromVar(*var); original_tensor->ShareDataWith(*transformed_tensor); } } @@ -784,11 +785,11 @@ Scope* OperatorWithKernel::TryTransferData( for (auto& var_name : var_name_item.second) { auto* var = scope.FindVar(var_name); // Only tensor can be tranfer to another device. 
- if (var == nullptr || !VarIsTensor(var)) { + if (var == nullptr || !VarIsTensor(*var)) { continue; } - auto* tensor_in = GetTensorFromVar(var); + auto* tensor_in = GetTensorFromVar(*var); if (!tensor_in->IsInitialized()) { continue; } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index a04d2834eb94c2d8df9c6e48782d10bb3254a6dd..96ad3205235b921a7cf60ed674a8350f74d18509 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -63,7 +63,7 @@ inline std::string GradVarName(const std::string& var_name) { } proto::VarType::Type GetDataTypeOfVar(const Variable* var); -const Tensor* GetTensorFromVar(Variable* var); +const Tensor* GetTensorFromVar(const Variable& var); class OperatorBase; class ExecutionContext; diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index cb2061c06a429d8e8116001a4aa4e8c46ea13428..a0a9a573603ceb6b577529101cb331adbc81337a 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -75,6 +75,19 @@ TEST(Tensor, MutableData) { platform::CPUPlace()); EXPECT_EQ(p1, p2); } + // Not sure if it's desired, but currently, Tensor type can be changed. + { + framework::Tensor src_tensor; + int8_t* p1 = src_tensor.mutable_data(framework::make_ddim({1}), + platform::CPUPlace()); + EXPECT_NE(p1, nullptr); + *p1 = 1; + + uint8_t* p2 = src_tensor.mutable_data(framework::make_ddim({1}), + platform::CPUPlace()); + EXPECT_NE(p2, nullptr); + EXPECT_EQ(static_cast(p2[0]), 1); + } #ifdef PADDLE_WITH_CUDA { diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 69bcbc0e5891f95af4de8dfd49a25648ca920ab1..ca1e01c89f07c4ffc3979a6a6c3728328e0a1819 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -153,6 +153,12 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, auto src_gpu_place = boost::get(src_place); auto dst_gpu_place = boost::get(dst_place); memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); + } else if (platform::is_cuda_pinned_place(src_place) && + platform::is_gpu_place(dst_place)) { + auto src_pinned_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); + memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size, + nullptr); } #endif } diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index dbbe8bcba69a1d87e21c8eae18834fb708e8b1e4..d31c8e3b7d66a0cdb2c4725783c9a24f049c666d 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -1,3 +1,6 @@ +if(WITH_TESTING) + include(test.cmake) # some generic cmake funtion for inference +endif() # analysis and tensorrt must be added before creating static library, # otherwise, there would be undefined reference to them in static library. 
add_subdirectory(analysis) diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index d4d2fd4634f9e11f3f002e11e177c332ced49885..0354f9e6e9588af601210b8a71ae98c1f90d62f0 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -20,22 +20,17 @@ cc_test(test_node SRCS node_tester.cc DEPS analysis) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis paddle_fluid) -function (inference_analysis_test TARGET) - if(WITH_TESTING) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS ARGS EXTRA_DEPS) - cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - set(mem_opt "") - if(WITH_GPU) - set(mem_opt "--fraction_of_gpu_memory_to_use=0.5") - endif() - cc_test(${TARGET} - SRCS "${analysis_test_SRCS}" - DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS} - ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS}) - set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec) - endif(WITH_TESTING) +function(inference_analysis_test TARGET) + if(WITH_TESTING) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS ARGS EXTRA_DEPS) + cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + inference_base_test(${TARGET} + SRCS ${analysis_test_SRCS} + DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS} + ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR} ${analysis_test_ARGS}) + endif() endfunction(inference_analysis_test) inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api) diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc index 1682011c3d8cc9927a4b026b370671798cace625..50ce20621fb289023ecccf7bb39d98169765d5ee 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/inference/analysis/data_flow_graph.h" +#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/inference/analysis/ut_helper.h" @@ -130,6 +131,8 @@ void SetOp(framework::ProgramDesc* prog, const std::string& type, op->SetType(type); op->SetInput("Xs", inputs); op->SetOutput("Xs", outputs); + op->SetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(framework::OpRole::kForward)); } TEST(DataFlowGraph, Build_IR_Graph) { diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index a55426f74f988176aeb180e48d1af8632ed3b5c7..49a9ebe3ddec1e4fd59ae1155a706859e249d25c 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -17,39 +17,12 @@ if(APPLE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move") endif(APPLE) - -set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB} - ) +set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB}) if(WITH_GPU AND TENSORRT_FOUND) set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor) endif() -function(inference_api_test TARGET_NAME) - if (WITH_TESTING) - set(options "") - set(oneValueArgs SRC) - set(multiValueArgs ARGS) - cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - if (WITH_GPU) - cc_test(${TARGET_NAME} - SRCS ${inference_test_SRC} - DEPS "${inference_deps}" - ARGS --dirname=${PYTHON_TESTS_DIR}/book/ --fraction_of_gpu_memory_to_use=0.15) - else() - cc_test(${TARGET_NAME} - SRCS ${inference_test_SRC} - DEPS "${inference_deps}" - ARGS --dirname=${PYTHON_TESTS_DIR}/book/) - endif() - if(inference_test_ARGS) - set_tests_properties(${TARGET_NAME} - PROPERTIES DEPENDS "${inference_test_ARGS}") - endif() - endif(WITH_TESTING) -endfunction(inference_api_test) - cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS reset_tensor_array lod_tensor scope) cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor) @@ -59,8 +32,11 @@ cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) -inference_api_test(test_api_impl SRC api_impl_tester.cc - ARGS test_word2vec test_image_classification) +if(WITH_TESTING) + inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps} + ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book) + set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification) +endif() cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api ARGS --dirname=${PYTHON_TESTS_DIR}/book) @@ -68,8 +44,10 @@ if(WITH_GPU AND TENSORRT_FOUND) cc_library(paddle_inference_tensorrt_subgraph_engine SRCS api_tensorrt_subgraph_engine.cc DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter zero_copy_tensor_dummy) - -inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec) + if(WITH_TESTING) + inference_base_test(test_api_tensorrt_subgraph_engine SRCS api_tensorrt_subgraph_engine_tester.cc DEPS 
${inference_deps} + ARGS --dirname=${WORD2VEC_MODEL_DIR}) + endif() endif() if (WITH_ANAKIN AND WITH_MKL) # only needed in CI diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index 1d4dfb8649563ab23ffeec1f79bb305fd2ebae26..5152b8670ddb206f0927c03149684af4a096df42 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -22,12 +22,14 @@ limitations under the License. */ #include "paddle/fluid/inference/tests/test_helper.h" #ifdef __clang__ -#define ACC_DIFF 4e-2 +#define ACC_DIFF 4e-3 #else -#define ACC_DIFF 1e-2 +#define ACC_DIFF 1e-3 #endif -DEFINE_string(dirname, "", "Directory of the inference model."); +DEFINE_string(word2vec_dirname, "", + "Directory of the word2vec inference model."); +DEFINE_string(book_dirname, "", "Directory of the book inference model."); namespace paddle { @@ -49,7 +51,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { NativeConfig GetConfig() { NativeConfig config; - config.model_dir = FLAGS_dirname + "/word2vec.inference.model"; + config.model_dir = FLAGS_word2vec_dirname; LOG(INFO) << "dirname " << config.model_dir; config.fraction_of_gpu_memory = 0.15; #ifdef PADDLE_WITH_CUDA @@ -116,7 +118,7 @@ void MainImageClassification(bool use_gpu) { NativeConfig config = GetConfig(); config.use_gpu = use_gpu; config.model_dir = - FLAGS_dirname + "/image_classification_resnet.inference.model"; + FLAGS_book_dirname + "/image_classification_resnet.inference.model"; const bool is_combined = false; std::vector> feed_target_shapes = @@ -220,7 +222,7 @@ void MainThreadsImageClassification(bool use_gpu) { NativeConfig config = GetConfig(); config.use_gpu = use_gpu; config.model_dir = - FLAGS_dirname + "/image_classification_resnet.inference.model"; + FLAGS_book_dirname + "/image_classification_resnet.inference.model"; auto main_predictor = CreatePaddlePredictor(config); std::vector jobs(num_jobs); diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc index 702158ea3bcab854eece3ccd40724d92efcbae67..89c9a65cb06ba565f0e0cbdb9b6031c6adbcb64e 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc @@ -29,13 +29,13 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) { //# 1. Create PaddlePredictor with a config. 
NativeConfig config0; - config0.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config0.model_dir = FLAGS_dirname; config0.use_gpu = true; config0.fraction_of_gpu_memory = 0.3; config0.device = 0; MixedRTConfig config1; - config1.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config1.model_dir = FLAGS_dirname; config1.use_gpu = true; config1.fraction_of_gpu_memory = 0.3; config1.device = 0; diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 1ac655bdbbf7c45bfdde2c5fa8026fab2c891903..ff718077c1ba6b10fe87aac10d84f96a23ad6bba 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -62,7 +62,7 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_GPU=$TEST_GPU_CPU \ -DWITH_STATIC_LIB=$WITH_STATIC_LIB make -j - word2vec_model=${PADDLE_ROOT}'/build/python/paddle/fluid/tests/book/word2vec.inference.model' + word2vec_model=$DATA_DIR'/word2vec/word2vec.inference.model' if [ -d $word2vec_model ]; then for use_gpu in $use_gpu_list; do ./simple_on_word2vec \ diff --git a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc index 487fc7b14e2c04af1e17efff91de0bfeed15c8a7..5446fd4d4256c10442a53ea09a447cf308cbd681 100644 --- a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc +++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc @@ -70,12 +70,8 @@ void Main(bool use_gpu) { // The outputs' buffers are in CPU memory. for (size_t i = 0; i < std::min(static_cast(5), num_elements); i++) { - // Here will result random fail, for that the model is trained by CI, the - // train phase is not stable, so the result will be random. - // TODO(Superjomn) will restore after the model is upload. 
- // CHECK_NEAR(static_cast(outputs.front().data.data())[i], - // result[i], - // 0.001); + CHECK_NEAR(static_cast(outputs.front().data.data())[i], result[i], + 0.001); } } } diff --git a/paddle/fluid/inference/test.cmake b/paddle/fluid/inference/test.cmake new file mode 100644 index 0000000000000000000000000000000000000000..ab3a30ce6bba14a7d5ec700a159d90031e6b5dc7 --- /dev/null +++ b/paddle/fluid/inference/test.cmake @@ -0,0 +1,31 @@ +set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com" CACHE STRING "inference download url") +set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING + "A path setting inference demo download directories.") +function (inference_download install_dir url filename) + message(STATUS "Download inference test stuff from ${url}/${filename}") + execute_process(COMMAND bash -c "mkdir -p ${install_dir}") + execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}") + message(STATUS "finish downloading ${filename}") +endfunction() + +function (inference_download_and_uncompress install_dir url filename) + inference_download(${install_dir} ${url} ${filename}) + execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}") +endfunction() + +set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec") +if (NOT EXISTS ${WORD2VEC_INSTALL_DIR}) + inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz") +endif() +set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model") + +function (inference_base_test TARGET) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS ARGS DEPS) + cmake_parse_arguments(base_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + if(WITH_GPU) + set(mem_opt "--fraction_of_gpu_memory_to_use=0.5") + endif() + cc_test(${TARGET} SRCS ${base_test_SRCS} DEPS ${base_test_DEPS} ARGS ${mem_opt} ${base_test_ARGS}) +endfunction() diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index c3dd1f433691e1c96e9f38ef7b595befad26408f..71fdc67068b3d92a774db82f569d212f6cffad78 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -1,18 +1,4 @@ -set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com") -set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING - "A path setting inference demo download directories.") set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor) -function (inference_download install_dir url filename) - message(STATUS "Download inference test stuff from ${url}/${filename}") - execute_process(COMMAND bash -c "mkdir -p ${install_dir}") - execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}") - message(STATUS "finish downloading ${filename}") -endfunction() - -function (inference_download_and_uncompress install_dir url filename) - inference_download(${install_dir} ${url} ${filename}) - execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}") -endfunction() function(download_model_and_data install_dir model_name data_name) if (NOT EXISTS ${install_dir}) diff --git a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..ed71594ba5781590f3291d56c4ba1a4443003bd5 --- /dev/null +++ b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc 
@@ -0,0 +1,112 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using ScopedSpatialTransformerDescriptor = + platform::ScopedSpatialTransformerDescriptor; + +template +class CUDNNAffineGridOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + auto* theta = ctx.Input("Theta"); + auto* output = ctx.Output("Output"); + const T* theta_data = theta->data(); + + int n = theta->dims()[0]; + auto size_attr = ctx.Attr>("output_shape"); + Tensor h_sizes; + int* h_size_data; + if (size_attr.size() == 0) { + auto* output_shape = ctx.Input("OutputShape"); + framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes); + h_size_data = h_sizes.data(); + } else { + h_size_data = h_sizes.mutable_data({4}, platform::CPUPlace()); + h_size_data[0] = n; + h_size_data[1] = size_attr[1]; + h_size_data[2] = size_attr[2]; + h_size_data[3] = size_attr[3]; + } + + T* output_data = output->mutable_data( + {n, h_size_data[2], h_size_data[3], 2}, ctx.GetPlace()); + ScopedSpatialTransformerDescriptor st_desc; + cudnnSpatialTransformerDescriptor_t cudnn_st_desc = + st_desc.descriptor(4, h_size_data); + + PADDLE_ENFORCE(platform::dynload::cudnnSpatialTfGridGeneratorForward( + handle, cudnn_st_desc, theta_data, output_data)); + } +}; + +template +class CUDNNAffineGridGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + auto output_grad = ctx.Input(framework::GradVarName("Output")); + auto theta_grad = ctx.Output(framework::GradVarName("Theta")); + + int n = output_grad->dims()[0]; + auto size_attr = ctx.Attr>("output_shape"); + Tensor h_sizes; + int* h_size_data; + if (size_attr.size() == 0) { + auto* output_shape = ctx.Input("OutputShape"); + framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes); + h_size_data = h_sizes.data(); + } else { + h_size_data = h_sizes.mutable_data({4}, platform::CPUPlace()); + h_size_data[0] = n; + h_size_data[1] = size_attr[1]; + h_size_data[2] = size_attr[2]; + h_size_data[3] = size_attr[3]; + } + + ScopedSpatialTransformerDescriptor st_desc; + cudnnSpatialTransformerDescriptor_t cudnn_st_desc = + st_desc.descriptor(4, h_size_data); + + const T* output_grad_data = output_grad->data(); + T* theta_grad_data = theta_grad->mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE(platform::dynload::cudnnSpatialTfGridGeneratorBackward( + handle, cudnn_st_desc, 
output_grad_data, theta_grad_data)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace plat = paddle::platform; +REGISTER_OP_KERNEL(affine_grid, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNAffineGridOpKernel, + paddle::operators::CUDNNAffineGridOpKernel); +REGISTER_OP_KERNEL(affine_grid_grad, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNAffineGridGradOpKernel, + paddle::operators::CUDNNAffineGridGradOpKernel); diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0ea28265a245c9cd1a35a79324a33f7cf208a159 --- /dev/null +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -0,0 +1,233 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/affine_grid_op.h" +#include +#include "paddle/fluid/framework/op_registry.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cudnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct Linspace { + framework::Tensor operator()(T start, T end, int count, + const framework::ExecutionContext& ctx) { + Tensor numbers; + T* number_data = numbers.mutable_data({count}, platform::CPUPlace()); + T slice = (end - start) / (T)(count - 1); + for (int i = 0; i < count; ++i) { + number_data[i] = start + (T)i * slice; + } + return numbers; + } +}; + +class AffineGridOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Theta"), + "Input(Theta) of AffineGridOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Output"), + "Output(Output) of AffineGridOp should not be null."); + auto theta_dims = ctx->GetInputDim("Theta"); + PADDLE_ENFORCE(theta_dims.size() == 3, + "AffineGrid's Input(Theta) should be 3-D tensor."); + + auto output_shape = ctx->Attrs().Get>("output_shape"); + if (output_shape.size() == 0) { + PADDLE_ENFORCE(ctx->HasInput("OutputShape"), + "Input(OutputShape) of AffineGridOp should not be null if " + "attr(output_shape) is not configured."); + auto output_shape_dims = ctx->GetInputDim("OutputShape"); + PADDLE_ENFORCE(output_shape_dims.size() == 1, + "AffineGrid's Input(OutputShape) should be 1-D tensor."); + } else { + PADDLE_ENFORCE(output_shape.size() == 4, + "The size of attr(output_shape) should be 4."); + } + + PADDLE_ENFORCE(theta_dims[1] == 2, "Input(theta) dims[1] should be 2."); + PADDLE_ENFORCE(theta_dims[2] == 3, "Input(theta) dims[2] should be 3."); + // N * H * W * 2 + ctx->SetOutputDim("Output", + framework::make_ddim({theta_dims[0], -1, -1, 2})); + ctx->ShareLoD("Theta", "Output"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::LibraryType library{framework::LibraryType::kPlain}; +#ifdef 
PADDLE_WITH_CUDA + if (platform::CanCUDNNBeUsed(ctx)) { + library = framework::LibraryType::kCUDNN; + } +#endif + auto data_type = framework::ToDataType(ctx.Input("Theta")->type()); + return framework::OpKernelType(data_type, ctx.GetPlace(), + framework::DataLayout::kAnyLayout, library); + } +}; + +class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "Theta", + "(Tensor) A batch of affine transform parameters with shape [N, 2, 3]. " + "It is used to transform coordinate (x_0, y_0) to coordinate (x_1, " + "y_1)."); + AddInput("OutputShape", + "(Tensor) The shape of target image with format [N, C, H, W].") + .AsDispensable(); + AddOutput("Output", "(Tensor) Output Tensor with shape [N, H, W, 2]."); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(true); + AddAttr>( + "output_shape", + "The target output image shape with format [N, C, H, W].") + .SetDefault(std::vector()); + + AddComment(R"DOC( + It generates a grid of (x,y) coordinates using the parameters of the + affine transformation that correspond to a set of points where the input + feature map should be sampled to produce the transformed output feature map. + + Given: + Theta = [[[x_11, x_12, x_13] + [x_14, x_15, x_16]] + [[x_21, x_22, x_23] + [x_24, x_25, x_26]]] + + OutputShape = [2, 3, 5, 5] + + Step 1: + + Generate relative coordinates according to OutputShape. + The values of relative coordinates are in the interval between -1 and 1. + The shape of the relative coordinates is [2, H, W] as below: + + C = [[[-1. -1. -1. -1. -1. ] + [-0.5 -0.5 -0.5 -0.5 -0.5] + [ 0. 0. 0. 0. 0. ] + [ 0.5 0.5 0.5 0.5 0.5] + [ 1. 1. 1. 1. 1. ]] + [[-1. -0.5 0. 0.5 1. ] + [-1. -0.5 0. 0.5 1. ] + [-1. -0.5 0. 0.5 1. ] + [-1. -0.5 0. 0.5 1. ] + [-1. -0.5 0. 0.5 1. ]]] + C[0] is the coordinates in height axis and C[1] is the coordinates in width axis. + + Step2: + Tanspose and reshape C to shape [H * W, 2] and append ones to last dimension. The we get: + C_ = [[-1. -1. 1. ] + [-0.5 -1. 1. ] + [ 0. -1. 1. ] + [ 0.5 -1. 1. ] + [ 1. -1. 1. ] + [-1. -0.5 1. ] + [-0.5 -0.5 1. ] + [ 0. -0.5 1. ] + [ 0.5 -0.5 1. ] + [ 1. -0.5 1. ] + [-1. 0. 1. ] + [-0.5 0. 1. ] + [ 0. 0. 1. ] + [ 0.5 0. 1. ] + [ 1. 0. 1. ] + [-1. 0.5 1. ] + [-0.5 0.5 1. ] + [ 0. 0.5 1. ] + [ 0.5 0.5 1. ] + [ 1. 0.5 1. ] + [-1. 1. 1. ] + [-0.5 1. 1. ] + [ 0. 1. 1. ] + [ 0.5 1. 1. ] + [ 1. 1. 1. 
]] + Step3: + Compute output by equation $$Output[i] = C_ * Theta[i]^T$$ + )DOC"); + } +}; + +class AffineGridOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + auto theta_dims = ctx->GetInputDim("Theta"); + if (ctx->HasOutput(framework::GradVarName("Theta"))) { + ctx->SetOutputDim(framework::GradVarName("Theta"), theta_dims); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::LibraryType library_{framework::LibraryType::kPlain}; +#ifdef PADDLE_WITH_CUDA + if (platform::CanCUDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kCUDNN; + } +#endif + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Theta")->type()), + ctx.GetPlace(), framework::DataLayout::kAnyLayout, library_); + } +}; + +class AffineGridGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("affine_grid_grad"); + op->SetInput("Theta", Input("Theta")); + op->SetInput("OutputShape", Input("OutputShape")); + op->SetInput(framework::GradVarName("Output"), OutputGrad("Output")); + + op->SetAttrMap(Attrs()); + + op->SetOutput(framework::GradVarName("Theta"), InputGrad("Theta")); + return std::unique_ptr(op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(affine_grid, ops::AffineGridOp, ops::AffineGridOpMaker, + ops::AffineGridGradMaker); +REGISTER_OPERATOR(affine_grid_grad, ops::AffineGridOpGrad); + +REGISTER_OP_CPU_KERNEL( + affine_grid, + ops::AffineGridOpKernel, + ops::AffineGridOpKernel); +REGISTER_OP_CPU_KERNEL( + affine_grid_grad, + ops::AffineGridGradOpKernel, + ops::AffineGridGradOpKernel); diff --git a/paddle/fluid/operators/affine_grid_op.h b/paddle/fluid/operators/affine_grid_op.h new file mode 100644 index 0000000000000000000000000000000000000000..07e26c292c3bafc4d98bd392a9e1e21a9eb383a8 --- /dev/null +++ b/paddle/fluid/operators/affine_grid_op.h @@ -0,0 +1,190 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenTensor = framework::EigenTensor; + +using Array1 = Eigen::DSizes; +using Array2 = Eigen::DSizes; +using Array3 = Eigen::DSizes; +using Array4 = Eigen::DSizes; + +/** + *Return a tensor with evenly spaced numbers over a specified interval. 
+ */ +template +struct Linspace { + framework::Tensor operator()(T start, T end, int count, + const framework::ExecutionContext& ctx); +}; + +template +class AffineGridOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& place = *ctx.template device_context().eigen_device(); + auto* theta = ctx.Input("Theta"); + int n = theta->dims()[0]; + + auto size_attr = ctx.Attr>("output_shape"); + int h = 0; + int w = 0; + if (size_attr.size() == 0) { + auto* output_shape = ctx.Input("OutputShape"); + Tensor h_sizes; + framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes); + const int* h_size_data = h_sizes.data(); + h = h_size_data[2]; + w = h_size_data[3]; + } else { + h = size_attr[2]; + w = size_attr[3]; + } + + auto* output = ctx.Output("Output"); + output->mutable_data({n, h, w, 2}, ctx.GetPlace()); + + math::SetConstant()( + ctx.template device_context(), output, + static_cast(0)); + + Linspace linspace; + // Get indexes of height with shape [height, width, 1] + auto h_idx = linspace((T)-1, (T)1, h, ctx); + auto h_idx_t = EigenTensor::From(h_idx); + // Get indexes of width with shape [height, width, 1] + auto w_idx = linspace((T)-1, (T)1, w, ctx); + auto w_idx_t = EigenTensor::From(w_idx); + // Get constant ones tensor with shape [height, width, 1] + Tensor ones; + ones.mutable_data({h, w, 1}, ctx.GetPlace()); + auto ones_t = EigenTensor::From(ones).setConstant((T)1); + // Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and + // ones + Tensor grid; + grid.mutable_data({n, h, w, 3}, ctx.GetPlace()); + auto grid_t = EigenTensor::From(grid); + + grid_t.device(place) = w_idx_t.reshape(Array2(1, w)) + .broadcast(Array2(h, 1)) + .reshape(Array3(h, w, 1)) + .concatenate(h_idx_t.reshape(Array2(1, h)) + .broadcast(Array2(w, 1)) + .shuffle(Array2(1, 0)) + .reshape(Array3(h, w, 1)), + 2) + .eval() + .concatenate(ones_t, 2) + .reshape(Array4(1, h, w, 3)) + .broadcast(Array4(n, 1, 1, 1)); + + // output = grid * theta.T + // TODO(wanghaoshuang): Refine batched matrix multiply + auto blas = math::GetBlas(ctx); + for (int i = 0; i < n; ++i) { + Tensor sliced_grid = grid.Slice(i, i + 1).Resize({h * w, 3}); + Tensor sliced_theta = theta->Slice(i, i + 1).Resize({2, 3}); + Tensor sliced_out = output->Slice(i, i + 1).Resize({h * w, 2}); + blas.MatMul(sliced_grid, false, sliced_theta, true, T(1), &sliced_out, + T(0)); + } + } +}; + +template +class AffineGridGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& place = *ctx.template device_context().eigen_device(); + auto output_grad = ctx.Input(framework::GradVarName("Output")); + auto theta_grad = ctx.Output(framework::GradVarName("Theta")); + + int n = output_grad->dims()[0]; + auto size_attr = ctx.Attr>("output_shape"); + int h = 0; + int w = 0; + if (size_attr.size() == 0) { + auto* output_shape = ctx.Input("OutputShape"); + Tensor h_sizes; + framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes); + const int* h_size_data = h_sizes.data(); + h = h_size_data[2]; + w = h_size_data[3]; + } else { + h = size_attr[2]; + w = size_attr[3]; + } + + theta_grad->mutable_data({n, 2, 3}, ctx.GetPlace()); + + math::SetConstant()( + ctx.template device_context(), theta_grad, + static_cast(0)); + + Linspace linspace; + + // Get indexes of height with shape [height, width, 1] + auto h_idx = linspace((T)-1, (T)1, h, ctx); + auto h_idx_t = EigenTensor::From(h_idx); + // Get 
indexes of width with shape [height, width, 1] + auto w_idx = linspace((T)-1, (T)1, w, ctx); + auto w_idx_t = EigenTensor::From(w_idx); + // Get constant ones tensor with shape [height, width, 1] + Tensor ones; + ones.mutable_data({h, w, 1}, ctx.GetPlace()); + auto ones_t = EigenTensor::From(ones).setConstant((T)1); + // Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and + // ones + Tensor grid; + grid.mutable_data({n, h, w, 3}, ctx.GetPlace()); + auto grid_t = EigenTensor::From(grid); + grid_t.device(place) = w_idx_t.reshape(Array2(1, w)) + .broadcast(Array2(h, 1)) + .reshape(Array3(h, w, 1)) + .concatenate(h_idx_t.reshape(Array2(1, h)) + .broadcast(Array2(w, 1)) + .shuffle(Array2(1, 0)) + .reshape(Array3(h, w, 1)), + 2) + .eval() + .concatenate(ones_t, 2) + .reshape(Array4(1, h, w, 3)) + .broadcast(Array4(n, 1, 1, 1)); + // output = grid * theta.T + // TODO(wanghaoshuang): Refine batched matrix multiply + auto blas = math::GetBlas(ctx); + for (int i = 0; i < n; ++i) { + Tensor sliced_grid = grid.Slice(i, i + 1).Resize({h * w, 3}); + Tensor sliced_out_grad = output_grad->Slice(i, i + 1).Resize({h * w, 2}); + Tensor sliced_theta_grad = theta_grad->Slice(i, i + 1).Resize({2, 3}); + blas.MatMul(sliced_out_grad, true, sliced_grid, false, T(1), + &sliced_theta_grad, T(0)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/delete_var_op.cc b/paddle/fluid/operators/delete_var_op.cc index d7a9bfbc437dbf4c723b9c87ff62ec6b62c38638..89416f7ab5d07ddac5b540b9bb361f831c1ef360 100644 --- a/paddle/fluid/operators/delete_var_op.cc +++ b/paddle/fluid/operators/delete_var_op.cc @@ -32,6 +32,11 @@ class DeleteVarOp : public framework::OperatorBase { } }; +class DeleteVarOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override {} +}; + class DeleteVarOpInfoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -48,4 +53,5 @@ It should not be configured by users directly. REGISTER_OPERATOR(delete_var, paddle::operators::DeleteVarOp, paddle::framework::EmptyGradOpMaker, - paddle::operators::DeleteVarOpInfoMaker); + paddle::operators::DeleteVarOpInfoMaker, + paddle::operators::DeleteVarOpShapeInference); diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 6fe30630e9683f59044b216b9e9b1f7dd647b1e2..d19ac9839c90a116265b761e3b1b3f855e2d95e8 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -67,6 +67,7 @@ class SumOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { auto x_vars = ctx.MultiInputVar("X"); + auto x_vars_name = ctx.Inputs("X"); framework::LibraryType library{framework::LibraryType::kPlain}; framework::DataLayout layout{framework::DataLayout::kAnyLayout}; @@ -81,10 +82,11 @@ class SumOp : public framework::OperatorWithKernel { if (x_vars[0]->IsType()) { int dtype = -1; - for (auto& x_var : x_vars) { + for (size_t idx = 0; idx < x_vars.size(); ++idx) { + PADDLE_ENFORCE(x_vars[idx] != nullptr, + "Input var[%s] should not be nullptr", x_vars_name[idx]); // FIXME(zcd): The input x_var may be SelectedRows or LoDTensor. 
- auto tensor = framework::GetTensorFromVar( - const_cast(x_var)); + auto tensor = framework::GetTensorFromVar(*x_vars[idx]); if (tensor->numel() == 0) { continue; } diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index bb8b14bb9fa41942c3aa653ca224c0842fbf9a00..1ad66f0525e4857533d5cf5ef3ee041aec297794 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -341,6 +341,28 @@ class ScopedPoolingDescriptor { DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor); }; +class ScopedSpatialTransformerDescriptor { + public: + ScopedSpatialTransformerDescriptor() { + PADDLE_ENFORCE(dynload::cudnnCreateSpatialTransformerDescriptor(&desc_)); + } + ~ScopedSpatialTransformerDescriptor() { + PADDLE_ENFORCE(dynload::cudnnDestroySpatialTransformerDescriptor(desc_)); + } + + template + inline cudnnSpatialTransformerDescriptor_t descriptor(const int nbDims, + const int dimA[]) { + PADDLE_ENFORCE(dynload::cudnnSetSpatialTransformerNdDescriptor( + desc_, CUDNN_SAMPLER_BILINEAR, CudnnDataType::type, nbDims, dimA)); + return desc_; + } + + private: + cudnnSpatialTransformerDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedSpatialTransformerDescriptor); +}; + inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) { bool use_cudnn = ctx.Attr("use_cudnn"); use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace()); diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index e6353f67ef118072a2d8e49111e8ecc486589998..d3d754b6f58d25a9dfacafaf55d50b353a71ee6d 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -65,44 +65,51 @@ extern void EnforceCUDNNLoaded(const char* fn_name); * include all needed cudnn functions in HPPL * different cudnn version has different interfaces **/ -#define CUDNN_DNN_ROUTINE_EACH(__macro) \ - __macro(cudnnSetTensor4dDescriptor); \ - __macro(cudnnSetTensor4dDescriptorEx); \ - __macro(cudnnSetTensorNdDescriptor); \ - __macro(cudnnGetTensorNdDescriptor); \ - __macro(cudnnGetConvolutionNdForwardOutputDim); \ - __macro(cudnnGetConvolutionForwardAlgorithm); \ - __macro(cudnnCreateTensorDescriptor); \ - __macro(cudnnDestroyTensorDescriptor); \ - __macro(cudnnCreateFilterDescriptor); \ - __macro(cudnnSetFilter4dDescriptor); \ - __macro(cudnnSetFilterNdDescriptor); \ - __macro(cudnnGetFilterNdDescriptor); \ - __macro(cudnnSetPooling2dDescriptor); \ - __macro(cudnnSetPoolingNdDescriptor); \ - __macro(cudnnGetPoolingNdDescriptor); \ - __macro(cudnnDestroyFilterDescriptor); \ - __macro(cudnnCreateConvolutionDescriptor); \ - __macro(cudnnCreatePoolingDescriptor); \ - __macro(cudnnDestroyPoolingDescriptor); \ - __macro(cudnnSetConvolution2dDescriptor); \ - __macro(cudnnDestroyConvolutionDescriptor); \ - __macro(cudnnSetConvolutionNdDescriptor); \ - __macro(cudnnGetConvolutionNdDescriptor); \ - __macro(cudnnDeriveBNTensorDescriptor); \ - __macro(cudnnCreate); \ - __macro(cudnnDestroy); \ - __macro(cudnnSetStream); \ - __macro(cudnnActivationForward); \ - __macro(cudnnConvolutionForward); \ - __macro(cudnnConvolutionBackwardBias); \ - __macro(cudnnGetConvolutionForwardWorkspaceSize); \ - __macro(cudnnTransformTensor); \ - __macro(cudnnPoolingForward); \ - __macro(cudnnPoolingBackward); \ - __macro(cudnnSoftmaxBackward); \ - __macro(cudnnSoftmaxForward); \ - __macro(cudnnGetVersion); \ +#define CUDNN_DNN_ROUTINE_EACH(__macro) \ + __macro(cudnnSetTensor4dDescriptor); \ + __macro(cudnnSetTensor4dDescriptorEx); \ + 
__macro(cudnnSetTensorNdDescriptor); \ + __macro(cudnnGetTensorNdDescriptor); \ + __macro(cudnnGetConvolutionNdForwardOutputDim); \ + __macro(cudnnGetConvolutionForwardAlgorithm); \ + __macro(cudnnCreateTensorDescriptor); \ + __macro(cudnnDestroyTensorDescriptor); \ + __macro(cudnnCreateFilterDescriptor); \ + __macro(cudnnSetFilter4dDescriptor); \ + __macro(cudnnSetFilterNdDescriptor); \ + __macro(cudnnGetFilterNdDescriptor); \ + __macro(cudnnSetPooling2dDescriptor); \ + __macro(cudnnSetPoolingNdDescriptor); \ + __macro(cudnnGetPoolingNdDescriptor); \ + __macro(cudnnDestroyFilterDescriptor); \ + __macro(cudnnCreateConvolutionDescriptor); \ + __macro(cudnnCreatePoolingDescriptor); \ + __macro(cudnnDestroyPoolingDescriptor); \ + __macro(cudnnSetConvolution2dDescriptor); \ + __macro(cudnnDestroyConvolutionDescriptor); \ + __macro(cudnnSetConvolutionNdDescriptor); \ + __macro(cudnnGetConvolutionNdDescriptor); \ + __macro(cudnnDeriveBNTensorDescriptor); \ + __macro(cudnnCreateSpatialTransformerDescriptor); \ + __macro(cudnnSetSpatialTransformerNdDescriptor); \ + __macro(cudnnDestroySpatialTransformerDescriptor); \ + __macro(cudnnSpatialTfGridGeneratorForward); \ + __macro(cudnnSpatialTfGridGeneratorBackward); \ + __macro(cudnnSpatialTfSamplerForward); \ + __macro(cudnnSpatialTfSamplerBackward); \ + __macro(cudnnCreate); \ + __macro(cudnnDestroy); \ + __macro(cudnnSetStream); \ + __macro(cudnnActivationForward); \ + __macro(cudnnConvolutionForward); \ + __macro(cudnnConvolutionBackwardBias); \ + __macro(cudnnGetConvolutionForwardWorkspaceSize); \ + __macro(cudnnTransformTensor); \ + __macro(cudnnPoolingForward); \ + __macro(cudnnPoolingBackward); \ + __macro(cudnnSoftmaxBackward); \ + __macro(cudnnSoftmaxForward); \ + __macro(cudnnGetVersion); \ __macro(cudnnGetErrorString); CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index a29562b0692684a52a2f022023ea57c3ca1ef712..d7676f89ab5e781f910f98d03e72d5f7c1023a9a 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -147,7 +147,6 @@ function cmake_gen() { -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} -DCUDNN_ROOT=/usr/ -DWITH_TESTING=${WITH_TESTING:-ON} - -DWITH_FAST_BUNDLE_TEST=ON -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} @@ -180,7 +179,6 @@ EOF -DWITH_PYTHON=${WITH_PYTHON:-ON} \ -DCUDNN_ROOT=/usr/ \ -DWITH_TESTING=${WITH_TESTING:-ON} \ - -DWITH_FAST_BUNDLE_TEST=ON \ -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 604f3eacd75beff306915b224b30c369dd3a486f..22c60c1cbe4faa8577fa655766e42694652e498d 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -884,12 +884,13 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs): load_prog = Program() load_block = load_prog.global_block() + need_delete_vars = [] for var_tuple in slice_vars_and_attrs: orig_var = var_tuple[0] start = var_tuple[1] slice_var = var_tuple[2] - end = start + reduce(lambda x, y: x * y, slice_var.shape) + end = start + slice_var.shape[0] clone_orig_var = load_block.create_var( name=orig_var.name, @@ -917,5 +918,8 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs): attrs={'axes': [0], 'starts': [start], 'ends': [end]}) - + need_delete_vars.append(clone_orig_var) + load_block.append_op( + type='delete_var', + 
inputs={'X': need_delete_vars}, ) executor.run(load_prog) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 69f0f8dc898b710256d3b2e9f72a5de94b81485c..cbabb0be229bc83fe75b84afb88d7cd5b5f7e2df 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -155,6 +155,7 @@ __all__ = [ 'sigmoid_cross_entropy_with_logits', 'maxout', 'space_to_depth', + 'affine_grid', 'sequence_reverse', 'affine_channel', 'hash', @@ -711,8 +712,18 @@ def dynamic_gru(input, The first part are weights of the update gate and reset gate with shape :math:`(D \\times 2D)`, and the second part are weights for candidate hidden state with shape :math:`(D \\times D)`. - bias_attr(ParamAttr): The parameter attribute for learnable the - hidden-hidden bias. + + If it is set to None or one attribute of ParamAttr, dynamic_gru will + create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. Default: None. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias + of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates + the bias in the update gate, reset gate and candidate calculations. + If it is set to False, no bias will be applied to the update gate, + reset gate and candidate calculations. If it is set to None or one + attribute of ParamAttr, dynamic_gru will create ParamAttr as + bias_attr. If the Initializer of the bias_attr is not set, the bias + is initialized zero. Default: None. is_reverse(bool): Whether to compute reversed GRU, default :attr:`False`. gate_activation(str): The activation for update gate and reset gate. @@ -811,10 +822,29 @@ def gru_unit(input, Args: input (Variable): The fc transformed input value of current step. - hidden (Variable): The hidden value of lstm unit from previous step. + hidden (Variable): The hidden value of gru unit from previous step. size (integer): The input dimension value. - param_attr (ParamAttr): The weight parameters for gru unit. Default: None - bias_attr (ParamAttr): The bias parameters for gru unit. Default: None + param_attr(ParamAttr|None): The parameter attribute for the learnable + hidden-hidden weight matrix. Note: + + - The shape of the weight matrix is :math:`(T \\times 3D)`, where + :math:`D` is the hidden size. + - All elements in the weight matrix can be divided into two parts. + The first part are weights of the update gate and reset gate with + shape :math:`(D \\times 2D)`, and the second part are weights for + candidate hidden state with shape :math:`(D \\times D)`. + + If it is set to None or one attribute of ParamAttr, gru_unit will + create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. Default: None. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias + of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates + the bias in the update gate, reset gate and candidate calculations. + If it is set to False, no bias will be applied to the update gate, + reset gate and candidate calculations. If it is set to None or one + attribute of ParamAttr, gru_unit will create ParamAttr as + bias_attr. If the Initializer of the bias_attr is not set, the bias + is initialized zero. Default: None. activation (string): The activation type for cell (actNode). Default: 'tanh' gate_activation (string): The activation type for gates (actGate). @@ -4444,7 +4474,10 @@ def transpose(x, perm, name=None): Examples: .. 
@@ -4444,7 +4474,10 @@ def transpose(x, perm, name=None):
     Examples:
         .. code-block:: python
 
-            x = fluid.layers.data(name='x', shape=[5, 10, 15], dtype='float32')
+            # use append_batch_size=False to avoid prepending extra
+            # batch size in shape
+            x = fluid.layers.data(name='x', shape=[5, 10, 15],
+                                  dtype='float32', append_batch_size=False)
             x_transposed = layers.transpose(x, perm=[1, 0, 2])
 
     """
@@ -6109,6 +6142,124 @@ def crop(x, shape=None, offsets=None, name=None):
     return out
 
+
+def affine_grid(theta, out_shape, name=None):
+    """
+    It generates a grid of (x,y) coordinates using the parameters of
+    the affine transformation that correspond to a set of points where
+    the input feature map should be sampled to produce the transformed
+    output feature map.
+
+    .. code-block:: text
+
+        * Case 1:
+
+          Given:
+
+              theta = [[[x_11, x_12, x_13]
+                        [x_14, x_15, x_16]]
+                       [[x_21, x_22, x_23]
+                        [x_24, x_25, x_26]]]
+
+              out_shape = [2, 3, 5, 5]
+
+          Step 1:
+
+              Generate normalized coordinates according to out_shape.
+              The values of the normalized coordinates are in the interval between -1 and 1.
+              The shape of the normalized coordinates is [2, H, W] as below:
+
+              C = [[[-1.  -1.  -1.  -1.  -1. ]
+                    [-0.5 -0.5 -0.5 -0.5 -0.5]
+                    [ 0.   0.   0.   0.   0. ]
+                    [ 0.5  0.5  0.5  0.5  0.5]
+                    [ 1.   1.   1.   1.   1. ]]
+                   [[-1.  -0.5  0.   0.5  1. ]
+                    [-1.  -0.5  0.   0.5  1. ]
+                    [-1.  -0.5  0.   0.5  1. ]
+                    [-1.  -0.5  0.   0.5  1. ]
+                    [-1.  -0.5  0.   0.5  1. ]]]
+              C[0] is the coordinates along the height axis and C[1] is the
+              coordinates along the width axis.
+
+          Step 2:
+
+              Transpose and reshape C to shape [H * W, 2] and append ones to
+              the last dimension. Then we get:
+
+              C_ = [[-1.  -1.   1. ]
+                    [-0.5 -1.   1. ]
+                    [ 0.  -1.   1. ]
+                    [ 0.5 -1.   1. ]
+                    [ 1.  -1.   1. ]
+                    [-1.  -0.5  1. ]
+                    [-0.5 -0.5  1. ]
+                    [ 0.  -0.5  1. ]
+                    [ 0.5 -0.5  1. ]
+                    [ 1.  -0.5  1. ]
+                    [-1.   0.   1. ]
+                    [-0.5  0.   1. ]
+                    [ 0.   0.   1. ]
+                    [ 0.5  0.   1. ]
+                    [ 1.   0.   1. ]
+                    [-1.   0.5  1. ]
+                    [-0.5  0.5  1. ]
+                    [ 0.   0.5  1. ]
+                    [ 0.5  0.5  1. ]
+                    [ 1.   0.5  1. ]
+                    [-1.   1.   1. ]
+                    [-0.5  1.   1. ]
+                    [ 0.   1.   1. ]
+                    [ 0.5  1.   1. ]
+                    [ 1.   1.   1. ]]
+
+          Step 3:
+
+              Compute output by equation $$Output[i] = C_ * Theta[i]^T$$
+
+    Args:
+        theta (Variable): A batch of affine transform parameters with shape [N, 2, 3].
+        out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W].
+            out_shape can be a Variable or a list or tuple.
+        name(str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
+
+    Returns:
+        Variable: The output with shape [N, H, W, 2].
+
+    Raises:
+        ValueError: If the type of arguments is not supported.
+
+    Examples:
+
+        .. code-block:: python
+
+            theta = fluid.layers.data(name="x", shape=[2, 3], dtype="float32")
+            out_shape = fluid.layers.data(name="y", shape=[-1], dtype="float32")
+            data = fluid.layers.affine_grid(theta, out_shape)
+
+            # or
+            data = fluid.layers.affine_grid(theta, [5, 3, 28, 28])
+
+    """
+    helper = LayerHelper('affine_grid')
+
+    if not (isinstance(out_shape, list) or isinstance(out_shape, tuple) or \
+            isinstance(out_shape, Variable)):
+        raise ValueError("The out_shape should be a list, tuple or Variable.")
+
+    if not isinstance(theta, Variable):
+        raise ValueError("The theta should be a Variable.")
+
+    out = helper.create_variable_for_type_inference(theta.dtype)
+    ipts = {'Theta': theta}
+    attrs = {}
+    if isinstance(out_shape, Variable):
+        ipts['OutputShape'] = out_shape
+    else:
+        attrs['output_shape'] = out_shape
+
+    helper.append_op(
+        type='affine_grid',
+        inputs=ipts,
+        outputs={'Output': out},
+        attrs=None if len(attrs) == 0 else attrs)
+    return out
+
+
 def rank_loss(label, left, right, name=None):
     """
     **Rank loss layer for RankNet**
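Reviewer note: the three steps in the affine_grid docstring above condense to a few lines of NumPy. This is an illustration of the math only, not the operator's implementation (which runs through the affine_grid op and the new cuDNN spatial-transformer wrappers):

    import numpy as np

    def affine_grid_ref(theta, size):
        # size = [n, c, h, w]; returns the [n, h, w, 2] sampling grid.
        n, _, h, w = size
        # Steps 1-2: normalized (x, y, 1) coordinates, one row per pixel,
        # x varying fastest, matching the C_ matrix above.
        ys, xs = np.meshgrid(
            np.linspace(-1, 1, h), np.linspace(-1, 1, w), indexing='ij')
        coords = np.stack(
            [xs, ys, np.ones_like(xs)], axis=-1).reshape(h * w, 3)
        # Step 3: Output[i] = C_ * Theta[i]^T.
        out = np.stack([coords.dot(t.T) for t in theta])
        return out.reshape(n, h, w, 2)

    identity = np.array([[1., 0., 0.], [0., 1., 0.]])
    grid = affine_grid_ref(np.stack([identity, identity]), [2, 3, 5, 5])
    assert grid.shape == (2, 5, 5, 2)  # identity theta reproduces C_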
diff --git a/python/paddle/fluid/tests/unittests/dist_save_load.py b/python/paddle/fluid/tests/unittests/dist_save_load.py
new file mode 100644
index 0000000000000000000000000000000000000000..edc60550058f53da456c21de4b41142b907743df
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dist_save_load.py
@@ -0,0 +1,174 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import sys
+import signal
+import subprocess
+import argparse
+import time
+import math
+import random
+from multiprocessing import Process
+from functools import reduce
+
+import numpy as np
+import unittest
+import six
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+from paddle.fluid import io
+
+from test_dist_base import TestDistRunnerBase, runtime_main, RUN_STEP
+from dist_simnet_bow import TestDistSimnetBow2x2, DATA_URL, DATA_MD5
+
+
+class TestDistSaveLoad2x2(TestDistSimnetBow2x2):
+    def _load_persistable_vars(self, executor, dirname, program):
+        def _is_checkpoint_var(var):
+            """
+            The checkpoint does not save or load every variable.
+            Variables whose type is FEED_MINIBATCH/FETCH_LIST/RAW, or whose
+            name contains @GRAD, are discarded.
+
+            :param var(Variable)
+            """
+            if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
+                    var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
+                    var.desc.type() == core.VarDesc.VarType.RAW:
+                return False
+            # @GRAD marks gradient variables; checkpoint will not save them.
+            if "@GRAD" in var.name:
+                return False
+            # .trainer_ marks distributed training variables; checkpoint will not save them.
+            if ".trainer_" in var.name:
+                return False
+            # .block marks distributed training variables; checkpoint will not save them.
+            if ".block" in var.name:
+                return False
+
+            if "tmp_" in var.name:
+                return False
+
+            return var.persistable
+
+        io.load_vars(
+            executor,
+            dirname=dirname,
+            main_program=program,
+            predicate=_is_checkpoint_var,
+            filename=None)
+
+    def run_pserver(self, args):
+        self.get_model(batch_size=2)
+        # NOTE: pserver should not call memory optimize
+        t = self.get_transpiler(args.trainer_id,
+                                fluid.default_main_program(), args.endpoints,
+                                args.trainers, args.sync_mode)
+        pserver_prog = t.get_pserver_program(args.current_endpoint)
+        startup_prog = t.get_startup_program(args.current_endpoint,
+                                             pserver_prog)
+
+        need_load = bool(int(os.getenv("LOAD", "0")))
+        model_dir = os.getenv("MODEL_DIR", "")
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup_prog)
+
+        if need_load and model_dir:
+            self._load_persistable_vars(exe, model_dir, startup_prog)
+        exe.run(pserver_prog)
+
+    def run_trainer(self, args):
+        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
+            self.get_model(batch_size=2)
+
+        if args.mem_opt:
+            fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)
+        if args.is_dist:
+            t = self.get_transpiler(args.trainer_id,
+                                    fluid.default_main_program(),
+                                    args.endpoints, args.trainers,
+                                    args.sync_mode)
+
+            trainer_prog = t.get_trainer_program()
+        else:
+            trainer_prog = fluid.default_main_program()
+
+        if args.use_cuda:
+            place = fluid.CUDAPlace(0)
+        else:
+            place = fluid.CPUPlace()
+
+        startup_exe = fluid.Executor(place)
+        startup_exe.run(fluid.default_startup_program())
+
+        strategy = fluid.ExecutionStrategy()
+        strategy.num_threads = 1
+        strategy.allow_op_delay = False
+
+        build_stra = fluid.BuildStrategy()
+
+        if args.use_reduce:
+            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
+        else:
+            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
+
+        exe = fluid.ParallelExecutor(
+            args.use_cuda,
+            loss_name=avg_cost.name,
+            exec_strategy=strategy,
+            build_strategy=build_stra)
+
+        feed_var_list = [
+            var for var in trainer_prog.global_block().vars.values()
+            if var.is_data
+        ]
+
+        feeder = fluid.DataFeeder(feed_var_list, place)
+        reader_generator = train_reader()
+
+        def get_data():
+            origin_batch = next(reader_generator)
+            if args.is_dist and args.use_reader_alloc:
+                new_batch = []
+                for offset, item in enumerate(origin_batch):
+                    if offset % 2 == args.trainer_id:
+                        new_batch.append(item)
+                return new_batch
+            else:
+                return origin_batch
+
+        need_save = bool(int(os.getenv("SAVE", "0")))
+        model_dir = os.getenv("MODEL_DIR", "")
+
+        if need_save:
+            for _ in six.moves.xrange(RUN_STEP):
+                loss, = exe.run(fetch_list=[avg_cost.name],
+                                feed=feeder.feed(get_data()))
+            if need_save and model_dir:
+                io.save_persistables(startup_exe, model_dir, trainer_prog)
+
+        var = np.array(fluid.global_scope().find_var('__fc_b__').get_tensor())
+        print(np.ravel(var).tolist())
+
+
+if __name__ == "__main__":
+    paddle.dataset.common.download(DATA_URL, 'simnet', DATA_MD5, "train")
+    runtime_main(TestDistSaveLoad2x2)
diff --git a/python/paddle/fluid/tests/unittests/test_affine_grid_op.py b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..576d00940c4c7a5e30af5550e14b674a73e7df11
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def AffineGrid(theta, size):
+    n = size[0]
+    w = size[3]
+    h = size[2]
+    h_idx = np.repeat(
+        np.linspace(-1, 1, h)[np.newaxis, :], w, axis=0).T[:, :, np.newaxis]
+    w_idx = np.repeat(
+        np.linspace(-1, 1, w)[np.newaxis, :], h, axis=0)[:, :, np.newaxis]
+    grid = np.concatenate(
+        [w_idx, h_idx, np.ones([h, w, 1])], axis=2)  # h * w * 3
+    grid = np.repeat(grid[np.newaxis, :], size[0], axis=0)  # n * h * w * 3
+
+    ret = np.zeros([n, h * w, 2])
+    theta = theta.transpose([0, 2, 1])
+    for i in range(len(theta)):
+        ret[i] = np.dot(grid[i].reshape([h * w, 3]), theta[i])
+
+    return ret.reshape([n, h, w, 2]).astype("float32")
+
+
+class TestAffineGridOp(OpTest):
+    def setUp(self):
+        self.initTestCase()
+        self.op_type = "affine_grid"
+        theta = np.random.randint(1, 3, self.theta_shape).astype("float32")
+        self.inputs = {'Theta': theta}
+        self.attrs = {"use_cudnn": True}
+        if self.dynamic_shape:
+            self.inputs['OutputShape'] = self.output_shape
+        else:
+            self.attrs['output_shape'] = self.output_shape
+        self.outputs = {'Output': AffineGrid(theta, self.output_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(
+            ['Theta'],
+            'Output',
+            no_grad_set=['OutputShape'],
+            max_relative_error=0.006)
+
+    def initTestCase(self):
+        self.theta_shape = (3, 2, 3)
+        self.output_shape = np.array([3, 2, 5, 7]).astype("int32")
+        self.dynamic_shape = False
+
+
+class TestAffineGridOpCase1(TestAffineGridOp):
+    def initTestCase(self):
+        self.theta_shape = (3, 2, 3)
+        self.output_shape = np.array([3, 2, 5, 7]).astype("int32")
+        self.dynamic_shape = True
+
+
+if __name__ == '__main__':
+    unittest.main()
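Reviewer note: a small end-to-end cross-check of the new layer against the `AffineGrid` reference above. This is a sketch only; it assumes a build where the affine_grid kernel is available on the chosen place:

    import numpy as np
    import paddle.fluid as fluid

    theta_np = np.ones([1, 2, 3]).astype("float32")
    theta = fluid.layers.data(name="theta", shape=[2, 3], dtype="float32")
    grid = fluid.layers.affine_grid(theta, [1, 3, 5, 5])

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())
    out, = exe.run(feed={"theta": theta_np}, fetch_list=[grid])
    assert np.allclose(out, AffineGrid(theta_np, [1, 3, 5, 5]), atol=1e-5)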
diff --git a/python/paddle/fluid/tests/unittests/test_dist_save_load.py b/python/paddle/fluid/tests/unittests/test_dist_save_load.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b50a312341ddb9bffcaf9f685805f80519f8427
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_save_load.py
@@ -0,0 +1,89 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import shutil
+import unittest
+import tempfile
+
+import numpy as np
+
+from test_dist_base import TestDistBase, RUN_STEP
+
+
+class TestDistSaveLoadDense2x2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._enforce_place = "CPU"
+
+    def check_with_place(self,
+                         model_file,
+                         delta=1e-3,
+                         check_error_log=False,
+                         need_envs={}):
+
+        required_envs = {
+            "PATH": os.getenv("PATH", ""),
+            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
+            "http_proxy": ""
+        }
+
+        required_envs.update(need_envs)
+
+        if check_error_log:
+            required_envs["GLOG_v"] = "7"
+            required_envs["GLOG_logtostderr"] = "1"
+
+        model_dir = tempfile.mkdtemp()
+
+        local_env = {}
+        local_env["SAVE"] = "1"
+        local_env["MODEL_DIR"] = model_dir
+        local_env.update(required_envs)
+
+        cluster_env = {}
+        cluster_env["LOAD"] = "1"
+        cluster_env["MODEL_DIR"] = model_dir
+        cluster_env.update(required_envs)
+
+        local_var = self._run_local(model_file, local_env, check_error_log)
+        tr0_var, tr1_var = self._run_cluster(model_file, cluster_env,
+                                             check_error_log)
+
+        shutil.rmtree(model_dir)
+
+        local_np = np.array(eval(local_var[0]))
+        train0_np = np.array(eval(tr0_var[0]))
+        train1_np = np.array(eval(tr1_var[0]))
+        self.assertTrue(np.allclose(local_np, train0_np, atol=delta))
+        self.assertTrue(np.allclose(local_np, train1_np, atol=delta))
+        self.assertTrue(np.allclose(train0_np, train1_np, atol=delta))
+
+    def test_dist(self):
+        need_envs = {
+            "IS_DISTRIBUTED": '0',
+            "IS_SPARSE": '0',
+            'IS_SELF_CONTAINED_LR': '1'
+        }
+        self.check_with_place(
+            "dist_save_load.py",
+            delta=0,
+            check_error_log=False,
+            need_envs=need_envs)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index c4511a98b0667ecccaa8f63b3064c4fc4e86cc78..986fdd9ff27fe2be54ce97f330028b4ae2358714 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -283,6 +283,25 @@ class TestDecayedAdagrad(TranspilerTest):
         trainer, _ = self.get_trainer()
 
 
+class TestFtrl(TranspilerTest):
+    def net_conf(self):
+        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
+        y_predict = fluid.layers.fc(input=x,
+                                    size=1000,
+                                    act=None,
+                                    param_attr=fluid.ParamAttr(name='fc_w'),
+                                    bias_attr=fluid.ParamAttr(name='fc_b'))
+        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+        avg_cost = fluid.layers.mean(cost)
+        opt = fluid.optimizer.Ftrl(learning_rate=0.1)
+        opt.minimize(avg_cost)
+
+    def transpiler_test_impl(self):
+        pserver, startup = self.get_pserver(self.pserver1_ep)
+        trainer, _ = self.get_trainer()
+
+
 class TestLRDecayConditional(TranspilerTest):
     def net_conf(self):
         x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
@@ -405,18 +424,43 @@ class TestL2DecayWithPiecewise(TranspilerTest):
             ["sum", "scale", "scale", "elementwise_add", "momentum"])
 
 
+class TestEmptyPserverOptimizeBlocks(TranspilerTest):
+    def net_conf(self):
+        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
+        # only one parameter
+        y_predict = fluid.layers.fc(input=x,
+                                    size=1000,
+                                    act=None,
+                                    param_attr=fluid.ParamAttr(name='fc_w'),
+                                    bias_attr=False)
+        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+        avg_cost = fluid.layers.mean(cost)
+        sgd_optimizer = fluid.optimizer.SGD(learning_rate=1.0)
+        sgd_optimizer.minimize(avg_cost)
+
+    def transpiler_test_impl(self):
+        config = fluid.DistributeTranspilerConfig()
+        config.slice_var_up = False
+
+        pserver, startup = self.get_pserver(ep=self.pserver2_ep, config=config)
+
+        self.assertEqual(len(pserver.blocks), 2)
+        self.assertEqual(len(pserver.blocks[1].ops), 0)
+
+
 class TestDistLookupTableBase(TranspilerTest):
     def network_with_table(self, is_sparse, is_distributed):
         self.table_size = 1000
         self.emb_size = 64
         self.lookup_table_name = 'shared_w'
 
-        def emb_pool(ids):
+        def emb_pool(ids, table_name, is_distributed):
             emb = fluid.layers.embedding(
                 input=ids,
                 size=[self.table_size, self.emb_size],
                 dtype='float32',
-                param_attr=self.lookup_table_name,  # share parameter
+                param_attr=table_name,
                 is_sparse=is_sparse,
                 is_distributed=is_distributed)
             pool = fluid.layers.sequence_pool(input=emb, pool_type='average')
@@ -426,9 +470,13 @@ class TestDistLookupTableBase(TranspilerTest):
             name='title_ids', shape=[1], dtype='int64', lod_level=1)
         brand_ids = fluid.layers.data(
             name='brand_ids', shape=[1], dtype='int64', lod_level=1)
-        title_emb = emb_pool(title_ids)
-        brand_emb = emb_pool(brand_ids)
-        fc0 = fluid.layers.concat(input=[title_emb, brand_emb], axis=1)
+        profile_ids = fluid.layers.data(
+            name='profile_ids', shape=[1], dtype='int64', lod_level=1)
+        title_emb = emb_pool(title_ids, self.lookup_table_name, is_distributed)
+        brand_emb = emb_pool(brand_ids, self.lookup_table_name, is_distributed)
+        profile_emb = emb_pool(profile_ids, "profile_emb", False)
+        fc0 = fluid.layers.concat(
+            input=[title_emb, brand_emb, profile_emb], axis=1)
         predict = fluid.layers.fc(input=fc0,
                                   size=2,
                                   act=None,
@@ -449,7 +497,7 @@ class TestLocalLookupTable(TestDistLookupTableBase):
     def transpiler_test_impl(self):
         pserver1, startup1 = self.get_pserver(self.pserver1_ep)
 
-        self.assertEqual(len(pserver1.blocks), 3)
+        self.assertEqual(len(pserver1.blocks), 4)
         # 0 listen_and_serv
         # 1 optimize for fc_w or fc_b adam
         self.assertEqual([op.type for op in pserver1.blocks[1].ops],
@@ -459,16 +507,23 @@ class TestLocalLookupTable(TestDistLookupTableBase):
         self.assertEqual([op.type for op in pserver1.blocks[2].ops],
                          ["sum", "scale", "adam", "scale", "scale"])
 
+        # 3 optimize for the second table (profile_emb) adam
+        # NOTE: if param is not selected rows, the grad will be scaled to grad / trainer_num
+        self.assertEqual([op.type for op in pserver1.blocks[3].ops],
+                         ["sum", "scale", "adam", "scale", "scale"])
+
         trainer, _ = self.get_trainer()
         self.assertEqual(len(trainer.blocks), 1)
         ops = [
             'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
-            'concat', 'mul', 'elementwise_add', 'cross_entropy', 'mean',
-            'fill_constant', 'mean_grad', 'cross_entropy_grad',
-            'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad',
-            'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
-            'lookup_table_grad', 'sum', 'split_selected_rows', 'send',
-            'send_barrier', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat'
+            'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
+            'cross_entropy', 'mean', 'fill_constant', 'mean_grad',
+            'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad',
+            'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
+            'split_selected_rows', 'send', 'sequence_pool_grad',
+            'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
+            'sum', 'split_selected_rows', 'send', 'send_barrier', 'recv',
+            'recv', 'recv', 'recv', 'fetch_barrier', 'concat', 'concat'
         ]
         self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
@@ -480,39 +535,45 @@ class TestDistLookupTable(TestDistLookupTableBase):
     def transpiler_test_impl(self):
         pserver1, startup1 = self.get_pserver(self.pserver1_ep)
 
-        self.assertEqual(len(pserver1.blocks), 5)
+        self.assertEqual(len(pserver1.blocks), 6)
         # 0 listen_and_serv
         # 1 optimize for fc_w or fc_b adam
         self.assertEqual([op.type for op in pserver1.blocks[1].ops],
                          ["sum", "scale", "adam", "scale", "scale"])
-        # 2 optimize for table sgd
+        # 2 optimize for profile_emb adam
         self.assertEqual([op.type for op in pserver1.blocks[2].ops],
+                         ["sum", "scale", "adam", "scale", "scale"])
+        # 3 optimize for table sgd
+        self.assertEqual([op.type for op in pserver1.blocks[3].ops],
                          ["sum", "sgd"])
-        # 3 prefetch -> lookup_sparse_table for data0
-        self.assertEqual([op.type for op in pserver1.blocks[3].ops],
+        # 4 prefetch -> lookup_sparse_table for data0
+        self.assertEqual([op.type for op in pserver1.blocks[4].ops],
                          ["lookup_sparse_table"])
-        # 4 save table
-        self.assertEqual([op.type for op in pserver1.blocks[4].ops], ["save"])
+        # 5 save table
+        self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
 
         trainer, trainer_startup = self.get_trainer()
         self.assertEqual(len(trainer.blocks), 1)
         ops = [
             'split_ids', 'prefetch', 'merge_ids', 'sequence_pool',
-            'sequence_pool', 'concat', 'mul', 'elementwise_add',
-            'cross_entropy', 'mean', 'fill_constant', 'mean_grad',
-            'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad',
-            'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
-            'sequence_pool_grad', 'lookup_table_grad', 'sum', 'split_ids',
-            'send', 'send_barrier', 'recv', 'recv', 'fetch_barrier'
+            'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul',
+            'elementwise_add', 'cross_entropy', 'mean', 'fill_constant',
+            'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send',
+            'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
+            'lookup_table_grad', 'split_selected_rows', 'send',
+            'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
+            'lookup_table_grad', 'sum', 'split_ids', 'send', 'send_barrier',
+            'recv', 'recv', 'recv', 'fetch_barrier', 'concat'
        ]
         self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
-
         startup_ops = [
             'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
             'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
             'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
-            'fill_constant', 'fill_constant', 'uniform_random', 'recv', 'recv',
-            'fetch_barrier', 'fake_init'
+            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
+            'fill_constant', 'fill_constant', 'uniform_random',
+            'uniform_random', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat',
+            'fake_init'
         ]
         self.assertEqual([op.type for op in trainer_startup.blocks[0].ops],
                          startup_ops)
@@ -526,7 +587,7 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase):
         config = fluid.DistributeTranspilerConfig()
         pserver1, startup1 = self.get_pserver(self.pserver1_ep, config, False)
 
-        self.assertEqual(len(pserver1.blocks), 3)
+        self.assertEqual(len(pserver1.blocks), 4)
         # 0 listen_and_serv
         # 1 optimize for fc_w or fc_b adam
         self.assertEqual([op.type for op in pserver1.blocks[1].ops],
@@ -535,17 +596,23 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase):
         # NOTE: if param is not selected rows, the grad will be scaled to grad / trainer_num
         self.assertEqual([op.type for op in pserver1.blocks[2].ops],
                          ["adam", "scale", "scale"])
+        # 3 optimize for table adam
+        # NOTE: if param is not selected rows, the grad will be scaled to grad / trainer_num
+        self.assertEqual([op.type for op in pserver1.blocks[3].ops],
+                         ["adam", "scale", "scale"])
 
         trainer, _ = self.get_trainer(config)
         self.assertEqual(len(trainer.blocks), 1)
         ops = [
             'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
-            'concat', 'mul', 'elementwise_add', 'cross_entropy', 'mean',
-            'fill_constant', 'mean_grad', 'cross_entropy_grad',
-            'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad',
-            'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
-            'lookup_table_grad', 'sum', 'split_selected_rows', 'send', 'recv',
-            'recv', 'recv', 'concat'
+            'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
+            'cross_entropy', 'mean', 'fill_constant', 'mean_grad',
+            'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad',
+            'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
+            'split_selected_rows', 'send', 'sequence_pool_grad',
+            'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
+            'sum', 'split_selected_rows', 'send', 'recv', 'recv', 'recv',
+            'recv', 'concat', 'concat'
         ]
         self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
@@ -559,29 +626,34 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
 
         pserver1, startup1 = self.get_pserver(self.pserver1_ep, config, False)
 
-        self.assertEqual(len(pserver1.blocks), 5)
+        self.assertEqual(len(pserver1.blocks), 6)
         # 0 listen_and_serv
         # 1 optimize for fc_w or fc_b adam
         self.assertEqual([op.type for op in pserver1.blocks[1].ops],
                          ["adam", "scale", "scale"])
-        # 2 optimize for table sgd
-        self.assertEqual([op.type for op in pserver1.blocks[2].ops], ["sgd"])
-        # 3 prefetch -> lookup_sparse_table for data0
-        self.assertEqual([op.type for op in pserver1.blocks[3].ops],
+        # 2 optimize for table adam
+        self.assertEqual([op.type for op in pserver1.blocks[2].ops],
+                         ["adam", "scale", "scale"])
+        # 3 optimize for table sgd
+        self.assertEqual([op.type for op in pserver1.blocks[3].ops], ["sgd"])
+        # 4 prefetch -> lookup_sparse_table for data0
+        self.assertEqual([op.type for op in pserver1.blocks[4].ops],
                          ["lookup_sparse_table"])
-        # 4 save table
-        self.assertEqual([op.type for op in pserver1.blocks[4].ops], ["save"])
+        # 5 save table
+        self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
 
         trainer, _ = self.get_trainer(config)
         self.assertEqual(len(trainer.blocks), 1)
         ops = [
             'split_ids', 'prefetch', 'merge_ids', 'sequence_pool',
-            'sequence_pool', 'concat', 'mul', 'elementwise_add',
-            'cross_entropy', 'mean', 'fill_constant', 'mean_grad',
-            'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad',
-            'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
-            'sequence_pool_grad', 'lookup_table_grad', 'sum', 'split_ids',
-            'send', 'recv', 'recv'
+            'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul',
+            'elementwise_add', 'cross_entropy', 'mean', 'fill_constant',
+            'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send',
+            'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
+            'lookup_table_grad', 'split_selected_rows', 'send',
+            'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
+            'lookup_table_grad', 'sum', 'split_ids', 'send', 'recv', 'recv',
+            'recv', 'concat'
         ]
         self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 9dd733a54d7868e6d22308166b04066670c9fade..3e93322460de915db5cb80f4fe520fccfb9c1558 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -876,6 +876,22 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(out)
             print(str(program))
 
+    def test_affine_grid(self):
+        program = Program()
+        with program_guard(program):
+            theta = layers.data(name="theta", shape=[2, 3], dtype="float32")
+            out_shape = layers.data(
+                name="out_shape", shape=[-1], dtype="float32")
+            data_0 = layers.affine_grid(theta, out_shape)
+            data_1 = layers.affine_grid(theta, [5, 3, 28, 28])
+
+            self.assertIsNotNone(data_0)
+            self.assertIsNotNone(data_1)
+            print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
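Reviewer note: both `TestEmptyPserverOptimizeBlocks` above and `run_pserver_with_empty_block` in the next file exercise the same corner case: with `slice_var_up = False`, whole parameters are placed round-robin across pservers, so one endpoint can end up with nothing to optimize. A toy model of the placement (illustration only, not transpiler code):

    # One parameter, two pservers: the second endpoint receives nothing
    # and therefore needs the empty optimize block added in this patch.
    params = ["fc_w"]
    pservers = ["127.0.0.1:6174", "127.0.0.1:6175"]
    assignment = {ep: [] for ep in pservers}
    for i, p in enumerate(params):
        assignment[pservers[i % len(pservers)]].append(p)
    assert assignment["127.0.0.1:6175"] == []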
diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
index 48b52a5412eb99fbc7a5c8534a766ede4954e849..a0358f8b401e301312b5b9c0b18733d4275045e3 100644
--- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
@@ -55,6 +55,46 @@ def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id):
     exe.run(pserver_prog)
 
 
+def run_pserver_with_empty_block(use_cuda, sync_mode, ip, port, trainers,
+                                 trainer_id):
+    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
+    y_predict = fluid.layers.fc(input=x, size=1, act=None, bias_attr=False)
+    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+    # loss function
+    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+    avg_cost = fluid.layers.mean(cost)
+
+    # optimizer
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+    sgd_optimizer.minimize(avg_cost)
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    ps1 = ip + ":" + str(int(port) + 1)
+    ps2 = ip + ":" + port
+    pserver_endpoints = ps1 + "," + ps2
+
+    config = fluid.DistributeTranspilerConfig()
+    config.slice_var_up = False
+    t = fluid.DistributeTranspiler(config=config)
+    t.transpile(
+        trainer_id,
+        pservers=pserver_endpoints,
+        trainers=trainers,
+        sync_mode=sync_mode)
+    pserver_prog = t.get_pserver_program(ps2)
+
+    # pserver2 has no parameter
+    assert (len(pserver_prog.blocks) == 2)
+    assert (len(pserver_prog.blocks[1].ops) == 0)
+
+    pserver_startup = t.get_startup_program(ps2, pserver_prog)
+    exe.run(pserver_startup)
+    exe.run(pserver_prog)
+
+
 class TestListenAndServOp(OpTest):
     def setUp(self):
         self.ps_timeout = 5
@@ -63,9 +103,9 @@ class TestListenAndServOp(OpTest):
         self.trainers = 1
         self.trainer_id = 0
 
-    def _start_pserver(self, use_cuda, sync_mode):
+    def _start_pserver(self, use_cuda, sync_mode, pserver_func):
         p = Process(
-            target=run_pserver,
+            target=pserver_func,
             args=(use_cuda, sync_mode, self.ip, self.port, self.trainers,
                   self.trainer_id))
         p.daemon = True
@@ -92,7 +132,24 @@ class TestListenAndServOp(OpTest):
 
     def test_handle_signal_in_serv_op(self):
         # run pserver on CPU in sync mode
-        p1 = self._start_pserver(False, True)
+        p1 = self._start_pserver(False, True, run_pserver)
+        self._wait_ps_ready(p1.pid)
+
+        # raise SIGINT to pserver
+        os.kill(p1.pid, signal.SIGINT)
+        p1.join()
+
+        # run pserver on CPU in async mode
+        p2 = self._start_pserver(False, False, run_pserver)
+        self._wait_ps_ready(p2.pid)
+
+        # raise SIGTERM to pserver
+        os.kill(p2.pid, signal.SIGTERM)
+        p2.join()
+
+    def test_list_and_serv_run_empty_optimize_block(self):
+        # run pserver on CPU in sync mode
+        p1 = self._start_pserver(False, True, run_pserver_with_empty_block)
         self._wait_ps_ready(p1.pid)
 
         # raise SIGTERM to pserver
@@ -100,7 +157,7 @@ class TestListenAndServOp(OpTest):
         p1.join()
 
         # run pserver on CPU in async mode
-        p2 = self._start_pserver(False, False)
+        p2 = self._start_pserver(False, False, run_pserver_with_empty_block)
         self._wait_ps_ready(p2.pid)
 
         # raise SIGTERM to pserver
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_pin_memory.py b/python/paddle/fluid/tests/unittests/test_py_reader_pin_memory.py
new file mode 100644
index 0000000000000000000000000000000000000000..b913127ad625eb25de3ec36edd2161019ed09749
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_pin_memory.py
@@ -0,0 +1,130 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import numpy as np
+from threading import Thread
+
+
+def user_reader(inputs):
+    def _reader():
+        for d in inputs:
+            yield d
+
+    return _reader
+
+
+def batch_feeder(batch_reader, pin_memory=False, img_dtype="float32"):
+    def _feeder():
+        for batch_data in batch_reader():
+            sample_batch = []
+            label_batch = []
+            for sample, label in batch_data:
+                sample_batch.append(sample)
+                label_batch.append([label])
+            tensor = core.LoDTensor()
+            label = core.LoDTensor()
+            place = core.CUDAPinnedPlace() if pin_memory else core.CPUPlace()
+            tensor.set(np.array(sample_batch, dtype=img_dtype), place)
+            label.set(np.array(label_batch, dtype="int64"), place)
+            yield [tensor, label]
+
+    return _feeder
+
+
+class TestPyReader(unittest.TestCase):
+    def setUp(self):
+        self.capacity = 10
+        self.shapes = [(-1, 3, 2, 1), (-1, 1)]
+        self.lod_levels = [0, 0]
+        self.dtypes = ['float32', 'int64']
+
+    def test_pin_memory_pyreader(self):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+            ) else fluid.CPUPlace()
+            executor = fluid.Executor(place)
+
+            data_file = fluid.layers.py_reader(
+                capacity=self.capacity,
+                dtypes=self.dtypes,
+                lod_levels=self.lod_levels,
+                shapes=self.shapes)
+            read_out_data = fluid.layers.read_file(data_file)
+
+            self.inputs = []
+            for _ in range(10):
+                sample = np.random.uniform(
+                    low=0, high=1, size=[3, 2, 1]).astype("float32")
+                label = np.random.uniform(
+                    low=0, high=10, size=[1]).astype("int64")
+                self.inputs.append((sample, label))
+
+            self.input_tensors = []
+            for d, l in batch_feeder(
+                    paddle.batch(
+                        user_reader(self.inputs), batch_size=2),
+                    pin_memory=True
+                    if fluid.core.is_compiled_with_cuda() else False)():
+                ta = fluid.LoDTensorArray()
+                ta.append(d)
+                ta.append(l)
+                self.input_tensors.append(ta)
+
+            self.batched_inputs = []
+            for batch in paddle.batch(user_reader(self.inputs),
+                                      batch_size=2)():
+                feed_d = []
+                feed_l = []
+                for d, l in batch:
+                    feed_d.append(d)
+                    feed_l.append([l])
+                self.batched_inputs.append([feed_d, feed_l])
+
+            data_file.decorate_tensor_provider(
+                batch_feeder(
+                    paddle.batch(
+                        user_reader(self.inputs), batch_size=2),
+                    pin_memory=True
+                    if fluid.core.is_compiled_with_cuda() else False))
+
+            executor.run(fluid.default_startup_program())
+            self.outputs = []
+
+            data_file.start()
+            for _ in self.input_tensors:
+                self.outputs.append(
+                    executor.run(fetch_list=list(read_out_data)))
+            data_file.reset()
+            self.validate()
+
+    def validate(self):
+        self.assertEqual(len(self.batched_inputs), len(self.outputs))
+        for in_data_list, out_data_list in zip(self.batched_inputs,
+                                               self.outputs):
+            self.assertEqual(len(in_data_list), len(out_data_list))
+            in_data_list_np = [
+                np.array(in_lod_tensor) for in_lod_tensor in in_data_list
+            ]
+            for in_data, out_data in zip(in_data_list_np, out_data_list):
+                self.assertTrue((in_data == out_data).all())
+
+
+if __name__ == '__main__':
+    unittest.main()
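Reviewer note: the test above hands over LoDTensors that already live in CUDA pinned (page-locked) host memory, which is what makes the later host-to-device copies cheap. The placement decision reduces to the following (a sketch mirroring `batch_feeder`):

    import numpy as np
    import paddle.fluid.core as core

    # Pin the batch when CUDA is available; otherwise use plain CPU memory.
    place = core.CUDAPinnedPlace() if core.is_compiled_with_cuda() \
        else core.CPUPlace()

    t = core.LoDTensor()
    t.set(np.random.rand(2, 3, 2, 1).astype("float32"), place)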
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 8daac0f43b41b9497812a07fa2f96bffb727413d..9066fc9d1bf13176862f6debf0ed0bedaaaf3eba 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -35,6 +35,7 @@ import sys
 import numpy as np
 import collections
 import six
+import logging
 
 from .ps_dispatcher import RoundRobin, HashName, PSDispatcher
 from .. import core, framework
@@ -767,6 +768,15 @@ in a single call.")
             prefetch_var_name_to_block_id.extend(
                 lookup_table_var_name_to_block_id)
 
+        if len(optimize_blocks) == 0:
+            logging.warn("pserver [" + str(endpoint) +
+                         "] has no optimize block!")
+            pre_block_idx = pserver_program.num_blocks - 1
+            empty_block = pserver_program._create_block(pre_block_idx)
+            optimize_blocks.append(empty_block)
+
+        # In some cases a parameter server has no parameters to optimize,
+        # so we give it an empty optimize block.
         attrs = {
             "optimize_blocks": optimize_blocks,
             "endpoint": endpoint,
@@ -910,11 +920,11 @@ to transpile() call.")
             block_idx = int(block_name.split(block_suffix)[1])
             orig_var = self.origin_program.global_block().vars[orig_var_name]
 
-            skip_numel = 0
+            skip_dim0 = 0
             slice_vars = self.param_var_mapping[orig_var_name]
             for slice_var in slice_vars[:block_idx]:
-                skip_numel += reduce(lambda x, y: x * y, slice_var.shape)
-            slice_vars_and_attrs.append([orig_var, skip_numel, param])
+                skip_dim0 += slice_var.shape[0]
+            slice_vars_and_attrs.append([orig_var, skip_dim0, param])
 
         return slice_vars_and_attrs
 
@@ -1065,7 +1075,12 @@ to transpile() call.")
         continue_search_lookup_table_op = False
         all_ops = program.global_block().ops
         for op in all_ops:
-            if op.type == LOOKUP_TABLE_TYPE:
+            if op.type == LOOKUP_TABLE_TYPE and self.table_name == op.input(
+                    "W")[0]:
+                if not op.attr('is_distributed'):
+                    raise RuntimeError(
+                        "a lookup_table_op that looks up a distributed "
+                        "embedding table should set is_distributed to true")
                 continue_search_lookup_table_op = True
                 lookup_table_op_index = lookup_table_op_index if lookup_table_op_index != -1 else list(
@@ -1275,7 +1290,6 @@ to transpile() call.")
         }
         outputs = {"ParamOut": [param_var]}
         # only support sgd now
-        import logging
         logging.warn(
             "distribute lookup table only supports sgd optimizer, changing its optimizer to sgd instead of "
             + table_opt_op.type)
@@ -1442,6 +1456,9 @@ to transpile() call.")
         elif op_type == "decayed_adagrad":
             if varkey == "Moment":
                 return param_shape
+        elif op_type == "ftrl":
+            if varkey in ["SquaredAccumulator", "LinearAccumulator"]:
+                return param_shape
         elif op_type == "sgd":
             pass
         else:
diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py
index 5269bd94cec47a5262e2389c5b02f91edd5a7d17..9a13cecc646e8534a157fad882fd97836348deb4 100644
--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
@@ -61,6 +61,9 @@ class InferenceTranspiler(object):
                 raise TypeError("scope should be as Scope type or None")
         use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False))
 
+        if use_mkldnn:
+            self._depthwise_conv_mkldnn(program)
+
         self._fuse_batch_norm(program, place, scope)
         if use_mkldnn:
             self._fuse_conv_bias_mkldnn(program)
@@ -70,6 +73,31 @@ class InferenceTranspiler(object):
                 program)  # ResNet residual block merging
         self._fuse_bn_relu_mkldnn(program)
 
+    def _depthwise_conv_mkldnn(self, program):
+        '''
+        Transpile the program by replacing depthwise_conv2d with conv2d
+        for MKLDNN programs.
+        The result is:
+            - before:
+                - any_other_op->depthwise_conv->any_other_op
+            - after:
+                - any_other_op->conv->any_other_op
+        :param program: program to transpile
+        :type program: Program
+        '''
+        self.block = program.block(0)
+
+        i = 0
+        while i < len(self.block.ops):
+            current_op = self.block.ops[i]
+            if current_op.type == 'depthwise_conv2d':
+                current_op.desc.set_type("conv2d")
+            i = i + 1
+
+        # TODO(luotao): use clone() method to flush the program.desc in force,
+        # since some large program.desc will not be flushed immediately.
+        # And a better solution will be considered later.
+        program = program.clone()
+
     def _fuse_conv_eltwise_mkldnn(self, program):
         '''
         Transpile the program fusing elementwise_add into conv for MKLDNN