diff --git a/CMakeLists.txt b/CMakeLists.txt index 0ab80987b3ad6c4793ceeac1bf3808d2e87fbd5b..231224f9249848b6e4981a98e0538794bf5d3c08 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -136,6 +136,12 @@ else() set(THIRD_PARTY_BUILD_TYPE Release) endif() +if(WITH_MKL) + option(MKL_SPLIT_GEMM "PaddlePaddle MKL gemm would split to small ones" OFF) + if (MKL_SPLIT_GEMM) + add_definitions(-DPADDLE_MKL_SPLIT_GEMM) + endif() +endif() set(WITH_MKLML ${WITH_MKL}) if (NOT DEFINED WITH_MKLDNN) if (WITH_MKL AND AVX2_FOUND) diff --git a/benchmark/paddle/image/run.sh b/benchmark/paddle/image/run.sh index 717ed487ba7657db6535efcb1128a355a0f15eaf..5b58a8d773aab795e5439b0f0e5d81bec66b5f56 100755 --- a/benchmark/paddle/image/run.sh +++ b/benchmark/paddle/image/run.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function train() { diff --git a/benchmark/paddle/image/run_mkl_infer.sh b/benchmark/paddle/image/run_mkl_infer.sh index 62c9bf6efd3810f506fd4592b2ba3a21b1b7f0e7..0fad5e04cc992a3ec97591d3833957bb7517a8f3 100755 --- a/benchmark/paddle/image/run_mkl_infer.sh +++ b/benchmark/paddle/image/run_mkl_infer.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function clock_to_seconds() { diff --git a/benchmark/paddle/image/run_mkl_train.sh b/benchmark/paddle/image/run_mkl_train.sh index 03d2d378fb72e36f765d89af788f6ee96fe21d4e..1583bf134a276a08aa2f8e84dc63adbb205a83d6 100755 --- a/benchmark/paddle/image/run_mkl_train.sh +++ b/benchmark/paddle/image/run_mkl_train.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function train() { diff --git a/benchmark/paddle/image/run_openblas_infer.sh b/benchmark/paddle/image/run_openblas_infer.sh index a9a7b8a66717c4be0543c3fe2db293fe199e3dc4..987381cabc2e793886099212660723c122b73bb0 100755 --- a/benchmark/paddle/image/run_openblas_infer.sh +++ b/benchmark/paddle/image/run_openblas_infer.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function clock_to_seconds() { diff --git a/benchmark/paddle/image/run_openblas_train.sh b/benchmark/paddle/image/run_openblas_train.sh index 
935cff6f2c97d25d6de556cfee25e27dbe49b5b6..cc64e1d09da02087b1737190a0b75dc7758600a6 100755 --- a/benchmark/paddle/image/run_openblas_train.sh +++ b/benchmark/paddle/image/run_openblas_train.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function train() { diff --git a/benchmark/paddle/rnn/run.sh b/benchmark/paddle/rnn/run.sh index e9dfeb2e525979f47e4ef48f7610dc1007900f2c..f99a562b3f88a98560f4bf7aee98ceee9daefe67 100755 --- a/benchmark/paddle/rnn/run.sh +++ b/benchmark/paddle/rnn/run.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function train() { diff --git a/benchmark/tensorflow/image/run.sh b/benchmark/tensorflow/image/run.sh index eade36beb9df5f8d3978939216e058203e024c1a..cf894fe3f2dca24e3acf863d625b3a7008793b83 100755 --- a/benchmark/tensorflow/image/run.sh +++ b/benchmark/tensorflow/image/run.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function test() { diff --git a/benchmark/tensorflow/image/run_multi.sh b/benchmark/tensorflow/image/run_multi.sh index 69faa4331744f2276e7706185ae10bc507f95764..bf1435bc55b90669e0b8bd893b8ed7bbb99d51e2 100755 --- a/benchmark/tensorflow/image/run_multi.sh +++ b/benchmark/tensorflow/image/run_multi.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function test() { diff --git a/benchmark/tensorflow/rnn/run.sh b/benchmark/tensorflow/rnn/run.sh index bb4c69cb95f965eff35f1c5a60376bf1e84f841b..db10eefdea8676ad34fb84a161f0fc1309147824 100755 --- a/benchmark/tensorflow/rnn/run.sh +++ b/benchmark/tensorflow/rnn/run.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function test() { diff --git a/benchmark/tensorflow/rnn/run_multi.sh b/benchmark/tensorflow/rnn/run_multi.sh index c2d7dd597e6da54cd5c4cda311fbbd18486b4647..ec62fc26b51543f2f8ddfc5e73aa6ff7d611e4dd 100755 --- a/benchmark/tensorflow/rnn/run_multi.sh +++ b/benchmark/tensorflow/rnn/run_multi.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function test() { diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 8c0d87db7a353eaa40e3eb31b4f0181ea3d1d33f..69d665b80fde22c10d9d57687b0e45dae7291969 100644 --- 
a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -180,13 +180,13 @@ paddle.fluid.layers.log ArgSpec(args=['x'], varargs=None, keywords=None, default paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True)) -paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, None, 1, True)) +paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.shuffle ArgSpec(args=['reader', 'buffer_size'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.batch ArgSpec(args=['reader', 'batch_size'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.double_buffer ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.random_data_generator ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,)) -paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, 
defaults=(None, None, True)) paddle.fluid.layers.Preprocessor.__init__ ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.Preprocessor.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.layers.Preprocessor.inputs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) @@ -209,9 +209,6 @@ paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs= paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.split_lod_tensor ArgSpec(args=['input', 'mask', 'level'], varargs=None, keywords=None, defaults=(0,)) paddle.fluid.layers.merge_lod_tensor ArgSpec(args=['in_true', 'in_false', 'x', 'mask', 'level'], varargs=None, keywords=None, defaults=(0,)) -paddle.fluid.layers.BlockGuard.__init__ ArgSpec(args=['self', 'main_program'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.BlockGuardWithCompletion.__init__ ArgSpec(args=['self', 'rnn'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.WhileGuard.__init__ ArgSpec(args=['self', 'while_op'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.While.complete ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index e8057c35e8b957cb43e66937a5073a085c6e7708..df55b3d05402f1aeecfd8d4218a637a81d58ed87 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(var_handle SRCS var_handle.cc DEPS place) +cc_library(var_handle SRCS var_handle.cc DEPS place framework_proto) 
cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor) cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index c52980472de8d48e8c21e7c1e53813aa4847cece..f1f8674caf663ce38df5a2eecbcf690b5ca87dc4 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -333,7 +333,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( * Only variables should be the leaves of graph. */ AddOutputToLeafOps(&result); - return std::move(graph); + return graph; } bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const { diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h index c0cd873a1d83fa8c2c7b7cd5acfaad9949bcff7d..e28264eb32756f77ef5baed3dff77ba9f0943160 100644 --- a/paddle/fluid/framework/details/reduce_and_gather.h +++ b/paddle/fluid/framework/details/reduce_and_gather.h @@ -35,14 +35,16 @@ struct ReduceLoDTensor { PADDLE_ENFORCE(!src_tensors_.empty()); auto &t0 = *src_tensors_[0]; PADDLE_ENFORCE_NE(t0.numel(), 0); + dst_tensor_.Resize(t0.dims()); T *dst = dst_tensor_.mutable_data(platform::CPUPlace()); - if (dst != t0.data()) { - std::copy(t0.data(), t0.data() + t0.numel(), dst); - } - for (size_t i = 1; i < src_tensors_.size(); ++i) { + for (size_t i = 0; i < src_tensors_.size(); ++i) { auto &t = *src_tensors_[i]; + if (dst == t.data()) { + continue; + } + PADDLE_ENFORCE_EQ(t.dims(), t0.dims()); PADDLE_ENFORCE_EQ(t.type(), t0.type()); std::transform(t.data(), t.data() + t.numel(), dst, dst, diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h 
b/paddle/fluid/framework/details/ssa_graph_checker.h index f1080610381128325ea0affba760ac66798fd948..2e397e86825a41765a360d31fa8595d17027f3ec 100644 --- a/paddle/fluid/framework/details/ssa_graph_checker.h +++ b/paddle/fluid/framework/details/ssa_graph_checker.h @@ -31,7 +31,7 @@ class SSAGraghBuilderWithChecker : public SSAGraphBuilder { std::unique_ptr Apply(std::unique_ptr graph) const override { auto new_graph = builder_->Apply(std::move(graph)); PADDLE_ENFORCE(IsValidGraph(new_graph.get())); - return std::move(new_graph); + return new_graph; } int GetVarDeviceID(const std::string& var_name) const override { diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h index 411be02988a82b3e35d56833f92fc6fe405a2c3d..cd72162f44ca76aa6340606cf79a73601eae89af 100644 --- a/paddle/fluid/framework/details/ssa_graph_printer.h +++ b/paddle/fluid/framework/details/ssa_graph_printer.h @@ -53,7 +53,7 @@ class SSAGraghBuilderWithPrinter : public SSAGraphBuilder { std::unique_ptr Apply(std::unique_ptr graph) const override { auto new_graph = builder_->Apply(std::move(graph)); printer_->Print(*new_graph, stream_ref_); - return std::move(new_graph); + return new_graph; } int GetVarDeviceID(const std::string& var_name) const override { diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 38cde13fe279d264c51baff71cffcab7b6ebb227..f85c62dd6c4a8033a037b1e001ece6a9cc90ca98 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -171,7 +171,12 @@ void ThreadedSSAGraphExecutor::InsertFetchOps( for (size_t i = 0; i < fetch_tensors.size(); ++i) { auto &var_name = fetch_tensors[i]; - auto &vars = fetched_vars.at(var_name); + auto fetched_var_it = fetched_vars.find(var_name); + PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(), + "Cannot find fetched 
variable.(Perhaps the main_program " + "is not set to ParallelExecutor)"); + + auto &vars = fetched_var_it->second; temp_nodes->emplace_back(new ir::Node("fetch", ir::Node::Type::kOperation)); auto *op = new FetchOpHandle(temp_nodes->back().get(), fetch_data, i, diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index d5ca74cb679b499da725a93b36384ca216ff4957..ee0604383ec9df826fa2abaef1f643ba0da6a096 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -1,5 +1,5 @@ -cc_library(graph SRCS graph.cc DEPS node) cc_library(node SRCS node.cc DEPS proto_desc) +cc_library(graph SRCS graph.cc DEPS node) cc_library(pass SRCS pass.cc DEPS graph node) cc_test(graph_test SRCS graph_test.cc DEPS graph proto_desc op_registry) diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 688f7ba5825bf1a1ab65a0912663481913223e80..702baf932593e14ca4d070a655aab0188075f2e9 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -21,6 +21,7 @@ namespace framework { // NOTE(paddle-dev): This graph contains circle. 
Graph::Graph(const ProgramDesc &program) : program_(program) { + VLOG(3) << "block in program:" << program_.Size(); std::unordered_map all_vars; for (auto *var : program.Block(0).AllVars()) { all_vars.emplace(var->Name(), var); diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index cba0064f38f89c1dd27cfac1ddb2339a5ee6c93f..919029c38f2f26a6f5e02da645c4f7718044cdae 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -312,19 +312,22 @@ void WriteToRecordIO(recordio::Writer *writer, writer->Write(buffer.str()); } -std::vector ReadFromRecordIO( - recordio::Scanner *scanner, const platform::DeviceContext &dev_ctx) { - std::vector result; - if (scanner->HasNext()) { - std::istringstream sin(scanner->Next()); - uint32_t sz; - sin.read(reinterpret_cast(&sz), sizeof(uint32_t)); - result.resize(sz); - for (uint32_t i = 0; i < sz; ++i) { - DeserializeFromStream(sin, &result[i], dev_ctx); - } +bool ReadFromRecordIO(recordio::Scanner *scanner, + const platform::DeviceContext &dev_ctx, + std::vector *result_ptr) { + if (!scanner->HasNext()) { + return false; } - return result; + std::istringstream sin(scanner->Next()); + uint32_t sz; + sin.read(reinterpret_cast(&sz), sizeof(uint32_t)); + auto &result = *result_ptr; + result.resize(sz); + for (uint32_t i = 0; i < sz; ++i) { + DeserializeFromStream(sin, &result[i], dev_ctx); + } + + return true; } std::vector LoDTensor::SplitLoDTensor( diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index 4a2729373b5c63176ed1e856f4acf29fd1e73254..e9b473d547252e80ed26ec61e1a33fbe1742dbe0 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -223,8 +223,9 @@ extern void WriteToRecordIO(recordio::Writer* writer, const std::vector& tensor, const platform::DeviceContext& dev_ctx); -extern std::vector ReadFromRecordIO( - recordio::Scanner* scanner, const platform::DeviceContext& dev_ctx); 
+extern bool ReadFromRecordIO(recordio::Scanner* scanner, + const platform::DeviceContext& dev_ctx, + std::vector* result_ptr); /* * Convert between length-based LoD and offset-based LoD. diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc index 38d3cd96d65f0a54b0ea87b4c677013f3802adfb..cd50aaa26054b78f1b1e8f0d470b397892155a2b 100644 --- a/paddle/fluid/framework/lod_tensor_test.cc +++ b/paddle/fluid/framework/lod_tensor_test.cc @@ -301,11 +301,12 @@ static void TestRecordIO() { { std::unique_ptr stream_ptr(stream); recordio::Scanner scanner(std::move(stream_ptr)); - auto tensors = ReadFromRecordIO(&scanner, ctx); + std::vector tensors; + ASSERT_TRUE(ReadFromRecordIO(&scanner, ctx, &tensors)); ASSERT_EQ(tensors.size(), static_cast(2)); assert_tensor_ok(tensors[0]); assert_tensor_ok(tensors[1]); - tensors = ReadFromRecordIO(&scanner, ctx); + ASSERT_TRUE(ReadFromRecordIO(&scanner, ctx, &tensors)); ASSERT_EQ(tensors.size(), static_cast(2)); assert_tensor_ok(tensors[0]); assert_tensor_ok(tensors[1]); diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc index 5897d320a8b7e5af541098cadff8e78f8324949c..40eafda9bf294f7e8ddd067e9014447f4de1cc0e 100644 --- a/paddle/fluid/framework/reader.cc +++ b/paddle/fluid/framework/reader.cc @@ -67,7 +67,8 @@ void ReaderBase::Start() { } } -ReaderBase::~ReaderBase() { Shutdown(); } +ReaderBase::~ReaderBase() {} +DecoratedReader::~DecoratedReader() { reader_->Shutdown(); } } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h index a8d04feb42456607159bcbede0574fe90dfe995c..82562bf883d88787858912f7039cf8fef003eccf 100644 --- a/paddle/fluid/framework/reader.h +++ b/paddle/fluid/framework/reader.h @@ -25,8 +25,6 @@ namespace paddle { namespace framework { -enum ReaderStatus { kRunning, kStopped }; - class ReaderBase { public: virtual void ReadNext(std::vector* out); @@ -48,6 +46,8 @@ class 
ReaderBase { virtual void StartImpl() {} + enum ReaderStatus { kRunning, kStopped }; + ReaderStatus status_{kRunning}; mutable std::mutex mu_; @@ -74,6 +74,8 @@ class DecoratedReader : public ReaderBase, reader_->InsertDecoratedReader(shared_from_this()); } + ~DecoratedReader(); + protected: void ShutdownImpl() override { reader_->Shutdown(); } diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index f98011e896f4033ef210e0eb69f93ce7800a3cd6..ab693004cfb038fd92afd9c60e0fcb4e16b9f8a9 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -15,6 +15,7 @@ #include #include #include +#include "paddle/fluid/framework/data_type.h" namespace paddle { namespace framework { @@ -261,7 +262,8 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, os.write(out.data(), size); } { // the 3rd field, tensor data - uint64_t size = tensor.memory_size(); + uint64_t size = tensor.numel() * framework::SizeOfType(tensor.type()); + auto* data_ptr = tensor.data(); PADDLE_ENFORCE(size < std::numeric_limits::max(), "Index overflow when writing tensor"); @@ -331,6 +333,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor, tensor->Resize(framework::make_ddim(dims)); void* buf; auto ctx = platform::CPUDeviceContext(); + size_t size = + tensor->numel() * + framework::SizeOfType(framework::ToTypeIndex(desc.data_type())); if (platform::is_gpu_place(dev_ctx.GetPlace())) { #ifdef PADDLE_WITH_CUDA Tensor cpu_tensor; @@ -338,7 +343,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor, framework::VisitDataType( desc.data_type(), DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace())); - is.read(static_cast(buf), cpu_tensor.memory_size()); + is.read(static_cast(buf), size); auto dst_place = dev_ctx.GetPlace(); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); #else @@ -348,7 +353,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor, framework::VisitDataType( desc.data_type(), 
DeserializedDataFunctor(&buf, tensor, ctx.GetPlace())); - is.read(static_cast(buf), tensor->memory_size()); + is.read(static_cast(buf), size); } } } diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index c9eff0fc28c5ff52f902d3d5a0ebb37fa7619e9c..16c9c81258a9fdb7730b9b3e34be990798c91639 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -38,4 +38,6 @@ if(WITH_TESTING) # both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book add_subdirectory(tests/book) endif() -add_subdirectory(api) +if(NOT APPLE) + add_subdirectory(api) +endif() diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index a4625f008c15300b88ef0bce71cd7d8aa473c9a8..b3a1075e5adf4a24bf32017574c061f36c46ba8c 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -22,8 +22,6 @@ #include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h" namespace paddle { -namespace inference { -namespace analysis { DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false, "Enable subgraph to TensorRT engine for acceleration"); @@ -31,6 +29,9 @@ DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false, DEFINE_string(inference_analysis_graphviz_log_root, "./", "Graphviz debuger for data flow graphs."); +namespace inference { +namespace analysis { + class DfgPassManagerImpl final : public DfgPassManager { public: DfgPassManagerImpl() { diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index e9e14fb1947da059c8d126d3da182ce446f6421e..0132bf5b9c6552391aaa19542669487f42b685a7 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -45,14 +45,15 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/analysis/pass_manager.h" namespace paddle { -namespace inference { -namespace analysis { // TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this // flag if not available. DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine); DECLARE_string(inference_analysis_graphviz_log_root); +namespace inference { +namespace analysis { + class Analyzer : public OrderedRegistry { public: // Register all the pass-managers. diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index d7c1a72932a39f878add2bb884e280b91d3c38c0..25a440e7e71fddb38cc515f99d15231675a8172e 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -13,13 +13,21 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/analyzer.h" +#include #include "paddle/fluid/inference/analysis/ut_helper.h" namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, main) { +TEST_F(DFG_Tester, analysis_without_tensorrt) { + FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = false; + Analyzer analyser; + analyser.Run(&argument); +} + +TEST_F(DFG_Tester, analysis_with_tensorrt) { + FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = true; Analyzer analyser; analyser.Run(&argument); } diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc index bd24e8a7d9c20b8cd9c4e41a76ffc33a004a9a69..8a3af0a8ebd5bad7be7046fa399cca4920da3d71 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph.cc @@ -222,10 +222,19 @@ Node *GraphTraits::NodesDFSIterator::operator->() { return stack_.top(); } +inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) { + return node.inlinks.size() == n; +} + GraphTraits::NodesTSIterator::NodesTSIterator( const std::vector 
&source) { PADDLE_ENFORCE(!source.empty(), "Start points of topological sorting should not be empty!"); + // CHECK all the inputs' in-degree is 0 + for (auto *node : source) { + PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0)); + } + std::unordered_set visited; std::unordered_set to_visit{source.begin(), source.end()}; @@ -233,6 +242,11 @@ GraphTraits::NodesTSIterator::NodesTSIterator( while (!to_visit.empty()) { std::vector queue(to_visit.begin(), to_visit.end()); for (auto *p : queue) { + if (p->deleted()) { + visited.insert(p); + to_visit.erase(p); + continue; + } inlink_visited.clear(); std::copy_if(p->inlinks.begin(), p->inlinks.end(), @@ -292,6 +306,37 @@ Node *GraphTraits::NodesTSIterator::operator->() { return sorted_[cursor_]; } +std::pair, std::vector> +ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT + std::unordered_set nodes(graph.begin(), graph.end()); + std::unordered_set inputs; + std::unordered_set outputs; + // Input a Value, check whether its inlink is in the subgraph. + auto inlink_in_subgraph = [&](Node *n) { + for (auto *in : n->inlinks) { + if (nodes.count(in)) return true; + } + return false; + }; + for (auto &node : graph) { + for (auto *in : node->inlinks) { + // The Value that is written by nodes inside a sub-graph shouldn't be the + // input of the sub-graph. 
+ if (!nodes.count(in) && in->type() == Node::Type::kValue && + !inlink_in_subgraph(in)) { + inputs.insert(in); + } + } + for (auto *out : node->outlinks) { + if (!nodes.count(out) && out->type() == Node::Type::kValue) { + outputs.insert(out); + } + } + } + return std::make_pair(std::vector(inputs.begin(), inputs.end()), + std::vector(outputs.begin(), outputs.end())); +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h index 5dd914d1971bfb5bcc0b1db41d73e2b67120bc06..1c60d5de21538043962cc58a6f508aea635fe8c4 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.h +++ b/paddle/fluid/inference/analysis/data_flow_graph.h @@ -133,7 +133,7 @@ struct GraphTraits { private: std::vector sorted_; - int cursor_{0}; + size_t cursor_{0}; }; explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {} @@ -173,36 +173,8 @@ struct GraphTraits { // Extract the inputs and outputs of a graph. The inputs and outputs of a // sub-graph is the inputs nodes and output nodes that doesn't inside the // sub-graph. -static std::pair, std::vector> -ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT - std::unordered_set nodes(graph.begin(), graph.end()); - std::unordered_set inputs; - std::unordered_set outputs; - // Input a Value, check whether its inlink is in the subgraph. - auto inlink_in_subgraph = [&](Node *n) { - for (auto *in : n->inlinks) { - if (nodes.count(in)) return true; - } - return false; - }; - for (auto &node : graph) { - for (auto *in : node->inlinks) { - // The Value that is written by nodes inside a sub-graph shouldn't be the - // input of the sub-graph. 
- if (!nodes.count(in) && in->type() == Node::Type::kValue && - !inlink_in_subgraph(in)) { - inputs.insert(in); - } - } - for (auto *out : node->outlinks) { - if (!nodes.count(out) && out->type() == Node::Type::kValue) { - outputs.insert(out); - } - } - } - return std::make_pair(std::vector(inputs.begin(), inputs.end()), - std::vector(outputs.begin(), outputs.end())); -} +std::pair, std::vector> +ExtractInputAndOutputOfSubGraph(std::vector &graph); } // namespace analysis } // namespace inference diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc index 29ca008123addf07959b965a4b54bf55b18c401d..2328d870422c5a31c22d7b09980aae35e01b2b25 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -22,14 +22,18 @@ namespace paddle { namespace inference { + +DEFINE_int32(tensorrt_max_batchsize, 300, "TensorRT maximum batch size"); +DEFINE_int32(tensorrt_workspace_size, 2048, "TensorRT workspace size"); + namespace analysis { using framework::proto::ProgramDesc; std::vector ExtractParameters( - const std::vector>& nodes); + const std::vector> &nodes); -bool DataFlowGraphToFluidPass::Initialize(Argument* argument) { +bool DataFlowGraphToFluidPass::Initialize(Argument *argument) { ANALYSIS_ARGUMENT_CHECK_FIELD(argument) ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc) PADDLE_ENFORCE(!argument->transformed_program_desc); @@ -47,76 +51,77 @@ bool DataFlowGraphToFluidPass::Initialize(Argument* argument) { bool DataFlowGraphToFluidPass::Finalize() { return true; } -void DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) { - auto traits = GraphTraits(graph); - for (auto it = traits.nodes().begin(); it != traits.nodes().end(); ++it) { - if (it->deleted()) continue; +void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) { + LOG(INFO) << "graph.inputs " << graph->inputs.size(); + for 
(auto &node : GraphTraits(graph).nodes_in_TS()) { + if (node.deleted()) continue; - switch (it->type()) { + switch (node.type()) { case Node::Type::kFunction: { - LOG(INFO) << "add function " << it->repr(); - AddFluidOp(&(*it)); + LOG(INFO) << "add function " << node.repr(); + AddFluidOp(&node); } break; case Node::Type::kFunctionBlock: { - LOG(INFO) << "add engine op " << it->repr() << " , " - << static_cast(&(*it))->subgraph.size(); - AddEngineOp(&(*it)); + LOG(INFO) << "add engine op " << node.repr() << " , " + << static_cast(&node)->subgraph.size(); + AddEngineOp(&node); } break; default: continue; } } + + PADDLE_ENFORCE(argument_->transformed_program_desc.get()); } -void DataFlowGraphToFluidPass::AddFluidOp(Node* node) { - auto* ori_op = static_cast(node->pb_desc()); +void DataFlowGraphToFluidPass::AddFluidOp(Node *node) { + auto *ori_op = static_cast(node->pb_desc()); // currently only the main block is analyzed. - auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex); - auto* op = main_block->add_ops(); + auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex); + auto *op = main_block->add_ops(); *op = *ori_op; // copy the attributes, by default, these will not be changed - // by analysis phrase. + // by analysis phrase. // The inputs and outputs of the existing ops are not changed by tensorrt // subgraph pass. // NOTE It might be changed by other passes in the long run. 
} -void CreateTrtEngineOp(Node* node, const DataFlowGraph& graph, - const framework::proto::BlockDesc& block) { +void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, + const framework::proto::BlockDesc &block) { static int counter{0}; PADDLE_ENFORCE(node->IsFunctionBlock()); framework::OpDesc desc; - auto* func = static_cast(node); + auto *func = static_cast(node); // collect inputs std::vector io; - for (auto* x : func->inlinks) { + for (auto *x : func->inlinks) { io.push_back(x->name()); } desc.SetInput("Xs", io); // collect outputs io.clear(); - for (auto* x : func->outlinks) { + for (auto *x : func->outlinks) { io.push_back(x->name()); } desc.SetOutput("Ys", io); - desc.SetType("tensorrt_engine"); + + PADDLE_ENFORCE(!block.vars().empty(), "the block has no var-desc"); // Set attrs SetAttr(desc.Proto(), "subgraph", block.SerializeAsString()); - SetAttr(desc.Proto(), "engine_unique_key", - "trt-" + std::to_string(counter++)); - SetAttr(desc.Proto(), "max_batch", 100); // TODO(Superjomn) add config latter - SetAttr(desc.Proto(), "max_workspace", - 1024); // TODO(Superjomn) add config latter + SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++)); + SetAttr(desc.Proto(), "max_batch", FLAGS_tensorrt_max_batchsize); + SetAttr(desc.Proto(), "max_workspace", FLAGS_tensorrt_workspace_size); SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes())); node->SetPbMsg(desc.Proto()->SerializeAsString()); } std::vector ExtractParameters( - const std::vector>& nodes) { + const std::vector> &nodes) { std::vector parameters; - for (const auto& node : nodes) { + for (const auto &node : nodes) { if (!node->IsValue()) continue; PADDLE_ENFORCE(!node->pb_msg().empty(), "pb_msg should be set first"); framework::proto::VarDesc var; @@ -128,21 +133,30 @@ std::vector ExtractParameters( return parameters; } -void DataFlowGraphToFluidPass::AddEngineOp(Node* node) { +void DataFlowGraphToFluidPass::AddEngineOp(Node *node) { // TODO(Superjomn) 
Here need to expose some arguments for default setting. PADDLE_ENFORCE(node->IsFunctionBlock()); - auto* block_node = static_cast(node); + auto *block_node = static_cast(node); framework::proto::BlockDesc proto; framework::BlockDesc block_desc(nullptr, &proto); + block_desc.Proto()->set_parent_idx(-1); + block_desc.Proto()->set_idx(0); + LOG(INFO) << "origin variable size: " + << argument_->origin_program_desc->blocks(0).vars().size(); + LOG(INFO) << "transformed variable size: " + << block_desc.Proto()->vars().size(); // copy ops. - for (auto* node : block_node->subgraph) { - auto* op = block_desc.AppendOp(); + for (auto *node : block_node->subgraph) { + auto *op = block_desc.AppendOp(); PADDLE_ENFORCE(!node->pb_msg().empty()); op->Proto()->ParseFromString(node->pb_msg()); } + *block_desc.Proto()->mutable_vars() = + argument_->origin_program_desc->blocks(0).vars(); + PADDLE_ENFORCE(!block_desc.Proto()->vars().empty()); CreateTrtEngineOp(node, *argument_->main_dfg, *block_desc.Proto()); - auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex); - auto* op = main_block->add_ops(); + auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex); + auto *op = main_block->add_ops(); PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block"); op->ParseFromString(node->pb_msg()); } @@ -151,7 +165,7 @@ namespace { class DFG_DebuggerPass : public DFG_GraphvizDrawPass { public: using Config = DFG_GraphvizDrawPass::Config; - explicit DFG_DebuggerPass(const Config& config) + explicit DFG_DebuggerPass(const Config &config) : DFG_GraphvizDrawPass(config) {} std::string repr() const override { return "dfg-to-fluid-debuger-pass"; } @@ -160,7 +174,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass { }; } // namespace -Pass* DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const { +Pass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const { return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( 
FLAGS_inference_analysis_graphviz_log_root, "data_flow_graph_to_fluid_graphviz_debugger")); diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h index edc84b02ed20991e3e7c6c437d2b1fac169bae03..59c47365aa6c8ad5886c4515850d264f69cc4670 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h @@ -26,6 +26,10 @@ namespace paddle { namespace inference { + +DECLARE_int32(tensorrt_max_batchsize); +DECLARE_int32(tensorrt_workspace_size); + namespace analysis { class DataFlowGraphToFluidPass final : public DataFlowGraphPass { public: diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc index 162455b9c4e06b7fbb4bdede30444faf6a8a1509..65842b1e850953e77e3d4d28416609be271af9f1 100644 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc @@ -40,7 +40,7 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) { no++; } // DFG is sensitive to ProgramDesc, be careful to change the existing models. 
- ASSERT_EQ(no, 82); + ASSERT_EQ(no, 83); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc index e918622d74cfb11d83090555be2a768cc14e7742..496921db9eabce1b1e40c7cb13089446ca93321c 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc @@ -28,7 +28,6 @@ bool FluidToDataFlowGraphPass::Initialize(Argument *argument) { ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc); PADDLE_ENFORCE(argument); if (!argument->main_dfg) { - LOG(INFO) << "Init DFG"; argument->main_dfg.reset(new DataFlowGraph); } desc_ = argument->origin_program_desc.get(); @@ -51,6 +50,7 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { v->SetPbMsg(var.SerializeAsString()); var2id[var.name()] = v->id(); } + for (int i = 0; i < main_block.ops_size(); i++) { const auto &op = main_block.ops(i); auto *o = graph->nodes.Create(Node::Type::kFunction); @@ -62,19 +62,31 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { o->SetPbMsg(op.SerializeAsString()); // set inputs and outputs - // TODO(Superjomn) make sure the InputNames is the real variable name. + std::unordered_set inlinks; for (int j = 0; j < op.inputs_size(); j++) { auto &in_var = op.inputs(j); for (int k = 0; k < in_var.arguments_size(); k++) { auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k))); in->outlinks.push_back(o); o->inlinks.push_back(in); + inlinks.insert(in); } } for (int j = 0; j < op.outputs_size(); j++) { auto &out_var = op.outputs(j); for (int k = 0; k < out_var.arguments_size(); k++) { auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]); + if (inlinks.count(out)) { + // Loop found, for example, a = op(a), use SSA, change to a1 = op(a). 
+ auto *out_alias = graph->nodes.Create(Node::Type::kValue); + out_alias->SetName(out->name()); + out_alias->SetPbDesc(out->pb_desc()); + out_alias->SetPbMsg(out->pb_msg()); + var2id[out_alias->name()] = out_alias->id(); // update a -> a0 + LOG(INFO) << "loop found in graph, create SSA alias node [" + << out_alias->repr() << "] for [" << out->repr() << "]"; + out = out_alias; + } out->inlinks.push_back(o); o->outlinks.push_back(out); } diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc index cbca5abdd5fff1672ba5d47a8876489c54ad6947..dadb84059d21adab44159a6145b345460663cb96 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc @@ -24,12 +24,12 @@ namespace analysis { TEST_F(DFG_Tester, Init) { FluidToDataFlowGraphPass pass; pass.Initialize(&argument); - DataFlowGraph graph; - pass.Run(&graph); + pass.Run(argument.main_dfg.get()); // Analysis is sensitive to ProgramDesc, careful to change the original model. 
- ASSERT_EQ(graph.nodes.size(), 37UL); + ASSERT_EQ(argument.main_dfg->nodes.size(), 38UL); pass.Finalize(); - LOG(INFO) << '\n' << graph.DotString(); + ASSERT_FALSE(argument.main_dfg->DotString().empty()); + EXPECT_FALSE(argument.main_dfg->inputs.empty()); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc index 9993de22800bc0aafdcbf46618e6b479ac1eb187..faf876de6d65d20cf7a084cd97392cfc8d791a42 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc @@ -25,6 +25,9 @@ TensorRTSubGraphPass::TensorRTSubGraphPass( void TensorRTSubGraphPass::Run(DataFlowGraph *graph) { SubGraphFuse(graph, node_inside_subgraph_teller_)(); + VLOG(4) << "debug info " + << graph->HumanReadableInfo(false /*show_values*/, + true /*show_functions*/); } } // namespace analysis diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 9d63d08dedf6a1bcdacc51bb83d2ed261bca4117..e28e144fd54cec06b0228ac9c478de7c641455a0 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -82,7 +82,7 @@ inference_api_test(test_api_impl if(WITH_GPU AND TENSORRT_FOUND) cc_library(paddle_inference_tensorrt_subgraph_engine SRCS api_tensorrt_subgraph_engine.cc - DEPS paddle_inference_api analysis tensorrt_engine paddle_fluid_api) + DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter) inference_api_test(test_api_tensorrt_subgraph_engine ARGS test_word2vec) endif() diff --git a/paddle/fluid/inference/api/api_anakin_engine.cc b/paddle/fluid/inference/api/api_anakin_engine.cc index f6f3cb335897b02905e24c229b92f3940a37dbf8..0206ac60103759deda91be741617bde63e003de6 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.cc +++ b/paddle/fluid/inference/api/api_anakin_engine.cc @@ -39,7 +39,7 
@@ bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) { bool PaddleInferenceAnakinPredictor::Run( const std::vector &inputs, - std::vector *output_data) { + std::vector *output_data, int batch_size) { for (const auto &input : inputs) { if (input.dtype != PaddleDType::FLOAT32) { LOG(ERROR) << "Only support float type inputs. " << input.name diff --git a/paddle/fluid/inference/api/api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h index 85ca83cd00756cca04d7b92437e9955d8ab297e7..def096c867ec85624f5b221782ef8b6240923c05 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.h +++ b/paddle/fluid/inference/api/api_anakin_engine.h @@ -37,7 +37,8 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor { // NOTE Unlike the native engine, the buffers of anakin engine's output_data // should be allocated first. bool Run(const std::vector& inputs, - std::vector* output_data) override; + std::vector* output_data, + int batch_size = -1) override; std::unique_ptr Clone() override; diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 786dc8e827806a9cea9dc01788fada2fd754b930..3ae255e13fc4f3ca28a6af62a5d5944d84303fc7 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -108,7 +108,8 @@ NativePaddlePredictor::~NativePaddlePredictor() { } bool NativePaddlePredictor::Run(const std::vector &inputs, - std::vector *output_data) { + std::vector *output_data, + int batch_size) { VLOG(3) << "Predictor::predict"; Timer timer; timer.tic(); diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h index 92e693578ab657004f3c40c09b979897afea1e1f..4f28c3cd34bade4189871210e6168c6c1c610c2c 100644 --- a/paddle/fluid/inference/api/api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -38,7 +38,8 @@ class NativePaddlePredictor : public PaddlePredictor { bool Init(std::shared_ptr parent_scope); bool Run(const std::vector 
&inputs, - std::vector *output_data) override; + std::vector *output_data, + int batch_size = -1) override; std::unique_ptr Clone() override; diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc index 0cdc88fa1eaf3935ce0da143e1e91eb84cd70dcf..c0891e9c281961fa03d278a0f5c676f92672c419 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/utils/singleton.h" +#include "paddle/fluid/operators/tensorrt_engine_op.h" namespace paddle { @@ -64,16 +65,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { return false; } - // Analyze inference_program - Argument argument; - argument.origin_program_desc.reset( - new ProgramDesc(*inference_program_->Proto())); - Singleton::Global().Run(&argument); - CHECK(argument.transformed_program_desc); - VLOG(5) << "transformed program:\n" - << argument.transformed_program_desc->SerializeAsString(); - VLOG(5) << "to prepare executor"; - *inference_program_->Proto() = *argument.transformed_program_desc; + OptimizeInferenceProgram(); ctx_ = executor_->Prepare(*inference_program_, 0); VLOG(5) << "to create variables"; @@ -86,6 +78,29 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { return true; } + bool Run(const std::vector& inputs, + std::vector* output_data, + int batch_size = -1) override { + PADDLE_ENFORCE_GT(batch_size, 0, + "TensorRT engine needs the argument batch_size set"); + FLAGS_tensorrt_engine_batch_size = batch_size; + return NativePaddlePredictor::Run(inputs, output_data, batch_size); + } + + void OptimizeInferenceProgram() { + // Analyze inference_program + Argument argument; + argument.origin_program_desc.reset( + new ProgramDesc(*inference_program_->Proto())); + 
Singleton::Global().Run(&argument); + CHECK(argument.transformed_program_desc); + VLOG(5) << "transformed program:\n" + << argument.transformed_program_desc->SerializeAsString(); + VLOG(5) << "to prepare executor"; + inference_program_.reset( + new framework::ProgramDesc(*argument.transformed_program_desc)); + } + private: TensorRTConfig config_; }; diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index b8ba2d14a5c161d491d838888ea14b776f769f23..2f8b4f8596946988a728b5cf82de251bfda778a9 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -98,7 +98,8 @@ class PaddlePredictor { // responsible for the output tensor's buffer, either allocated or passed from // outside. virtual bool Run(const std::vector& inputs, - std::vector* output_data) = 0; + std::vector* output_data, + int batch_size = -1) = 0; // Clone a predictor that share the model weights, the Cloned predictor should // be thread-safe. 
diff --git a/paddle/fluid/inference/api/test_api.cc b/paddle/fluid/inference/api/test_api.cc index ac8a21a22be6f27311b8ae2507d04d9d1b510e76..7a579610eefda24c911edd28b5f3a178aa10ab1e 100644 --- a/paddle/fluid/inference/api/test_api.cc +++ b/paddle/fluid/inference/api/test_api.cc @@ -35,7 +35,8 @@ class DemoPredictor : public PaddlePredictor { LOG(INFO) << "I get other_config " << config.other_config; } bool Run(const std::vector &inputs, - std::vector *output_data) override { + std::vector *output_data, + int batch_size = 0) override { LOG(INFO) << "Run"; return false; } diff --git a/paddle/fluid/inference/api/test_api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/test_api_tensorrt_subgraph_engine.cc index 585f6d29376c3341c21ff76361d5335512c1b1b6..62d98a796708612e7d4ff8abfd85125978ce22c7 100644 --- a/paddle/fluid/inference/api/test_api_tensorrt_subgraph_engine.cc +++ b/paddle/fluid/inference/api/test_api_tensorrt_subgraph_engine.cc @@ -15,50 +15,79 @@ #include #include #include +#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" namespace paddle { DEFINE_string(dirname, "", "Directory of the inference model."); -void Main(bool use_gpu) { +void CompareTensorRTWithFluid(bool enable_tensorrt) { + FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = enable_tensorrt; + //# 1. Create PaddlePredictor with a config. 
- TensorRTConfig config; - config.model_dir = FLAGS_dirname + "word2vec.inference.model"; - config.use_gpu = use_gpu; - config.fraction_of_gpu_memory = 0.15; - config.device = 0; - auto predictor = + NativeConfig config0; + config0.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config0.use_gpu = true; + config0.fraction_of_gpu_memory = 0.3; + config0.device = 0; + + TensorRTConfig config1; + config1.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config1.use_gpu = true; + config1.fraction_of_gpu_memory = 0.3; + config1.device = 0; + + auto predictor0 = + CreatePaddlePredictor(config0); + auto predictor1 = CreatePaddlePredictor(config); + PaddleEngineKind::kAutoMixedTensorRT>(config1); - for (int batch_id = 0; batch_id < 3; batch_id++) { + for (int batch_id = 0; batch_id < 1; batch_id++) { //# 2. Prepare input. - int64_t data[4] = {1, 2, 3, 4}; + std::vector data(20); + for (int i = 0; i < 20; i++) data[i] = i; - PaddleTensor tensor{.name = "", - .shape = std::vector({4, 1}), - .data = PaddleBuf(data, sizeof(data)), - .dtype = PaddleDType::INT64}; + PaddleTensor tensor{ + .name = "", + .shape = std::vector({10, 1}), + .data = PaddleBuf(data.data(), data.size() * sizeof(int64_t)), + .dtype = PaddleDType::INT64}; // For simplicity, we set all the slots with the same data. std::vector slots(4, tensor); //# 3. Run - std::vector outputs; - CHECK(predictor->Run(slots, &outputs)); + std::vector outputs0; + std::vector outputs1; + CHECK(predictor0->Run(slots, &outputs0)); + CHECK(predictor1->Run(slots, &outputs1, 10)); //# 4. Get output. - ASSERT_EQ(outputs.size(), 1UL); - LOG(INFO) << "output buffer size: " << outputs.front().data.length(); - const size_t num_elements = outputs.front().data.length() / sizeof(float); - // The outputs' buffers are in CPU memory. 
- for (size_t i = 0; i < std::min(5UL, num_elements); i++) { - LOG(INFO) << static_cast(outputs.front().data.data())[i]; + ASSERT_EQ(outputs0.size(), 1UL); + ASSERT_EQ(outputs1.size(), 1UL); + + const size_t num_elements = outputs0.front().data.length() / sizeof(float); + const size_t num_elements1 = outputs1.front().data.length() / sizeof(float); + EXPECT_EQ(num_elements, num_elements1); + + auto *data0 = static_cast(outputs0.front().data.data()); + auto *data1 = static_cast(outputs1.front().data.data()); + + ASSERT_GT(num_elements, 0UL); + for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) { + EXPECT_NEAR(data0[i], data1[i], 1e-3); } } } -TEST(paddle_inference_api_tensorrt_subgraph_engine, main) { Main(true); } +TEST(paddle_inference_api_tensorrt_subgraph_engine, without_tensorrt) { + CompareTensorRTWithFluid(false); +} + +TEST(paddle_inference_api_tensorrt_subgraph_engine, with_tensorrt) { + CompareTensorRTWithFluid(true); +} } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 6697952051c4b1997ca6b550da17a52e64cb3454..968f7eb99ce8519edaa585fd3cb642bd80cc63cc 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -93,6 +93,10 @@ class OpConverter { framework::Scope* scope_{nullptr}; }; +} // namespace tensorrt +} // namespace inference +} // namespace paddle + #define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__) \ struct trt_##op_type__##_converter : public ::paddle::framework::Registrar { \ trt_##op_type__##_converter() { \ @@ -111,7 +115,3 @@ class OpConverter { extern int TouchConverterRegister_##op_type__(); \ static int use_op_converter_trt_##op_type__ __attribute__((unused)) = \ TouchConverterRegister_##op_type__(); - -} // namespace tensorrt -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/engine.cc 
b/paddle/fluid/inference/tensorrt/engine.cc index 596e0fe9da3d272ecb1c0f8dbef09a75d08a4b1a..fefec0df6d03669a294ce9643b666d7416593708 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -26,18 +26,20 @@ namespace paddle { namespace inference { namespace tensorrt { -void TensorRTEngine::Build(const DescType& paddle_model) { +void TensorRTEngine::Build(const DescType &paddle_model) { PADDLE_ENFORCE(false, "not implemented"); } void TensorRTEngine::Execute(int batch_size) { - std::vector buffers; - for (auto& buf : buffers_) { + batch_size_ = batch_size; + std::vector buffers; + for (auto &buf : buffers_) { PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated"); PADDLE_ENFORCE_GT(buf.max_size, 0); PADDLE_ENFORCE(buf.device == DeviceType::GPU); buffers.push_back(buf.buffer); } + PADDLE_ENFORCE_NOT_NULL(stream_); infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr); cudaStreamSynchronize(*stream_); } @@ -45,7 +47,7 @@ void TensorRTEngine::Execute(int batch_size) { TensorRTEngine::~TensorRTEngine() { cudaStreamSynchronize(*stream_); // clean buffer - for (auto& buf : buffers_) { + for (auto &buf : buffers_) { if (buf.device == DeviceType::GPU && buf.buffer != nullptr) { PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer)); buf.buffer = nullptr; @@ -70,32 +72,37 @@ void TensorRTEngine::FreezeNetwork() { // allocate GPU buffers. buffers_.resize(buffer_sizes_.size()); - for (auto& item : buffer_sizes_) { + for (auto &item : buffer_sizes_) { + // The output buffers are not set in the network building phrase, need to + // infer from the TesorRT network. 
if (item.second == 0) { auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str()); auto dims = infer_engine_->getBindingDimensions(slot_offset); item.second = kDataTypeSize[static_cast( infer_engine_->getBindingDataType(slot_offset))] * analysis::AccuDims(dims.d, dims.nbDims); + PADDLE_ENFORCE_GT(item.second, 0); } - auto& buf = buffer(item.first); + + auto &buf = buffer(item.first); + buf.max_size = item.second * max_batch_; CHECK(buf.buffer == nullptr); // buffer should be allocated only once. - PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second)); - VLOG(4) << "buffer malloc " << item.first << " " << item.second << " " - << buf.buffer; - buf.size = buf.max_size = item.second; + PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, buf.max_size)); + PADDLE_ENFORCE_LE(buf.max_size, 1 << 30); // 10G + // buf.size will changed in the runtime. + buf.size = 0; buf.device = DeviceType::GPU; } } -nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name, +nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name, nvinfer1::DataType dtype, - const nvinfer1::Dims& dims) { + const nvinfer1::Dims &dims) { PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate input name %s", name); PADDLE_ENFORCE(infer_network_ != nullptr, "should initnetwork first"); - auto* input = infer_network_->addInput(name.c_str(), dtype, dims); + auto *input = infer_network_->addInput(name.c_str(), dtype, dims); PADDLE_ENFORCE(input, "infer network add input %s failed", name); buffer_sizes_[name] = kDataTypeSize[static_cast(dtype)] * analysis::AccuDims(dims.d, dims.nbDims); @@ -104,12 +111,12 @@ nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name, return input; } -void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset, - const std::string& name) { +void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset, + const std::string &name) { PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output 
name %s", name); - auto* output = layer->getOutput(offset); + auto *output = layer->getOutput(offset); SetITensor(name, output); PADDLE_ENFORCE(output != nullptr); output->setName(name.c_str()); @@ -121,11 +128,11 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset, buffer_sizes_[name] = 0; } -void TensorRTEngine::DeclareOutput(const std::string& name) { +void TensorRTEngine::DeclareOutput(const std::string &name) { PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s", name); - auto* output = TensorRTEngine::GetITensor(name); + auto *output = TensorRTEngine::GetITensor(name); PADDLE_ENFORCE(output != nullptr); output->setName(name.c_str()); PADDLE_ENFORCE(!output->isNetworkInput()); @@ -135,38 +142,45 @@ void TensorRTEngine::DeclareOutput(const std::string& name) { buffer_sizes_[name] = 0; } -void* TensorRTEngine::GetOutputInGPU(const std::string& name) { +void *TensorRTEngine::GetOutputInGPU(const std::string &name) { return buffer(name).buffer; } -void TensorRTEngine::GetOutputInGPU(const std::string& name, void* dst, +void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst, size_t max_size) { // determine data size auto it = buffer_sizes_.find(name); PADDLE_ENFORCE(it != buffer_sizes_.end()); PADDLE_ENFORCE_GT(it->second, 0); PADDLE_ENFORCE_GE(max_size, it->second); - auto& buf = buffer(name); + auto &buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, it->second, cudaMemcpyDeviceToDevice, *stream_), 0); } -void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst, +void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst, size_t max_size) { + VLOG(4) << "get output in cpu"; + auto &buf = buffer(name); + + // Update needed buffer size. 
+ auto slot_offset = infer_engine_->getBindingIndex(name.c_str()); + auto dims = infer_engine_->getBindingDimensions(slot_offset); + buf.size = kDataTypeSize[static_cast( + infer_engine_->getBindingDataType(slot_offset))] * + analysis::AccuDims(dims.d, dims.nbDims); + PADDLE_ENFORCE_LE(buf.size, buf.max_size); // determine data size - auto it = buffer_sizes_.find(name); - PADDLE_ENFORCE(it != buffer_sizes_.end()); - PADDLE_ENFORCE_GT(it->second, 0); - PADDLE_ENFORCE_GE(max_size, it->second); - auto& buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); - PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, it->second, - cudaMemcpyDeviceToHost, *stream_)); + // DEBUG + memset(dst, 0, buf.size); + PADDLE_ENFORCE_EQ( + 0, cudaMemcpy(dst, buf.buffer, buf.size, cudaMemcpyDeviceToHost)); } -Buffer& TensorRTEngine::buffer(const std::string& name) { +Buffer &TensorRTEngine::buffer(const std::string &name) { PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first."); auto it = buffer_sizes_.find(name); PADDLE_ENFORCE(it != buffer_sizes_.end()); @@ -174,19 +188,23 @@ Buffer& TensorRTEngine::buffer(const std::string& name) { return buffers_[slot_offset]; } -void TensorRTEngine::SetInputFromCPU(const std::string& name, const void* data, +void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data, size_t size) { - auto& buf = buffer(name); + auto &buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer); + PADDLE_ENFORCE_NOT_NULL(data); + PADDLE_ENFORCE_NOT_NULL(stream_); PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small"); PADDLE_ENFORCE(buf.device == DeviceType::GPU); + buf.size = size; PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size, cudaMemcpyHostToDevice, *stream_)); } -void TensorRTEngine::SetInputFromGPU(const std::string& name, const void* data, +void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data, size_t size) { - auto& buf = buffer(name); + auto &buf 
= buffer(name); + buf.size = size; PADDLE_ENFORCE_NOT_NULL(buf.buffer); PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small"); PADDLE_ENFORCE(buf.device == DeviceType::GPU); @@ -194,15 +212,15 @@ void TensorRTEngine::SetInputFromGPU(const std::string& name, const void* data, cudaMemcpyDeviceToDevice, *stream_)); } -void TensorRTEngine::SetITensor(const std::string& name, - nvinfer1::ITensor* tensor) { +void TensorRTEngine::SetITensor(const std::string &name, + nvinfer1::ITensor *tensor) { PADDLE_ENFORCE(tensor != nullptr); PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s", name); itensor_map_[name] = tensor; } -nvinfer1::ITensor* TensorRTEngine::GetITensor(const std::string& name) { +nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) { PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name); return itensor_map_[name]; } diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index b06a9bbc6758ae9410b2fce99ef2b1a9e7ab98c0..7064d333f6db754f88c0ac6956a9527a48bf866c 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -57,7 +57,9 @@ class TensorRTEngine : public EngineBase { : max_batch_(max_batch), max_workspace_(max_workspace), stream_(stream ? stream : &default_stream_), - logger_(logger) {} + logger_(logger) { + cudaStreamCreate(&default_stream_); + } virtual ~TensorRTEngine(); @@ -121,6 +123,9 @@ class TensorRTEngine : public EngineBase { int max_batch_; // the max memory size the engine uses int max_workspace_; + + // batch size of the current data, will be updated each Executation. + int batch_size_{-1}; cudaStream_t* stream_; // If stream_ is not set from outside, hold its own stream. 
cudaStream_t default_stream_; diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index e635f0f87d577a1f1ac74687ee60f762be525418..fca3488008ed83418b5e28b8af42d8019aaaa2a4 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -103,6 +103,10 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) { LOG(INFO) << "to get output"; float y_cpu[2] = {-1., -1.}; + auto dims = engine_->GetITensor("y")->getDimensions(); + ASSERT_EQ(dims.nbDims, 3); + ASSERT_EQ(dims.d[0], 2); + ASSERT_EQ(dims.d[1], 1); engine_->GetOutputInCPU("y", &y_cpu[0], sizeof(float) * 2); ASSERT_EQ(y_cpu[0], 4.5); ASSERT_EQ(y_cpu[1], 14.5); diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index d265150f25419509126028e36e629aee3ee6bd0f..4e2002ad24415437ae4f85eba0e90a6c689e2996 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -168,6 +168,8 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP(relu);\n") elseif(${TARGET} STREQUAL "fake_dequantize") file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n") + elseif(${TARGET} STREQUAL "tensorrt_engine_op") + message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference") else() file(APPEND ${pybind_file} "USE_OP(${TARGET});\n") endif() @@ -237,9 +239,9 @@ op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) op_library(softmax_op DEPS softmax) op_library(sequence_softmax_op DEPS softmax) if (WITH_GPU AND TENSORRT_FOUND) - op_library(tensorrt_engine_op DEPS tensorrt_engine) + op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter) nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc - DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter + DEPS tensorrt_engine_op analysis) else() set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op) diff --git 
a/paddle/fluid/operators/auc_op.cc b/paddle/fluid/operators/auc_op.cc index 6bd3e491bccb037406b784147dc9f91049b34d53..5edecd18e673da326ec119cf9a383f24f8045089 100644 --- a/paddle/fluid/operators/auc_op.cc +++ b/paddle/fluid/operators/auc_op.cc @@ -24,15 +24,16 @@ class AucOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Out"), "Input of Out should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Indices"), - "Input of Indices should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Predict"), + "Input of Out should not be null."); PADDLE_ENFORCE(ctx->HasInput("Label"), "Input of Label should not be null."); - auto inference_height = ctx->GetInputDim("Out")[0]; + auto predict_width = ctx->GetInputDim("Predict")[1]; + PADDLE_ENFORCE_EQ(predict_width, 2, "Only support binary classification"); + auto predict_height = ctx->GetInputDim("Predict")[0]; auto label_height = ctx->GetInputDim("Label")[0]; - PADDLE_ENFORCE_EQ(inference_height, label_height, + PADDLE_ENFORCE_EQ(predict_height, label_height, "Out and Label should have same height."); int num_thres = ctx->Attrs().Get("num_thresholds"); @@ -43,14 +44,14 @@ class AucOp : public framework::OperatorWithKernel { ctx->SetOutputDim("FPOut", {num_thres}); ctx->SetOutputDim("FNOut", {num_thres}); - ctx->ShareLoD("Out", /*->*/ "AUC"); + ctx->ShareLoD("Predict", /*->*/ "AUC"); } protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("Out")->type()), + framework::ToDataType(ctx.Input("Predict")->type()), ctx.device_context()); } }; @@ -58,18 +59,13 @@ class AucOp : public framework::OperatorWithKernel { class AucOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("Out", - "A floating point 2D tensor, values are in the range [0, 1]." 
- "Each row is sorted in descending order. This input should be the" - "output of topk." + AddInput("Predict", + "A floating point 2D tensor with shape [batch_size, 2], values " + "are in the range [0, 1]." "Typically, this tensor indicates the probability of each label"); - AddInput("Indices", - "An int 2D tensor, indicating the indices of original" - "tensor before sorting. Typically, this tensor indicates which " - "label the probability stands for."); AddInput("Label", - "A 2D int tensor indicating the label of the training data." - "The height is batch size and width is always 1."); + "A 2D int tensor indicating the label of the training data. " + "shape: [batch_size, 1]"); AddInput("TP", "True-Positive value."); AddInput("FP", "False-Positive value."); AddInput("TN", "True-Negative value."); diff --git a/paddle/fluid/operators/auc_op.h b/paddle/fluid/operators/auc_op.h index 58fefc1600dfb7df3e3d71959c047865ed5e2e39..0a18585edb54a76aff5ae72ecc71e0eebb9f9361 100644 --- a/paddle/fluid/operators/auc_op.h +++ b/paddle/fluid/operators/auc_op.h @@ -31,7 +31,7 @@ template class AucKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* inference = ctx.Input("Out"); + auto* predict = ctx.Input("Predict"); auto* label = ctx.Input("Label"); auto* auc = ctx.Output("AUC"); // Only use output var for now, make sure it's persistable and @@ -41,24 +41,24 @@ class AucKernel : public framework::OpKernel { auto* true_negative = ctx.Output("TNOut"); auto* false_negative = ctx.Output("FNOut"); - float* auc_data = auc->mutable_data(ctx.GetPlace()); + auto* auc_data = auc->mutable_data(ctx.GetPlace()); std::string curve = ctx.Attr("curve"); int num_thresholds = ctx.Attr("num_thresholds"); - std::vector thresholds_list; + std::vector thresholds_list; thresholds_list.reserve(num_thresholds); for (int i = 1; i < num_thresholds - 1; i++) { - thresholds_list[i] = static_cast(i) / (num_thresholds - 1); + 
thresholds_list[i] = static_cast(i) / (num_thresholds - 1); } - const float kEpsilon = 1e-7; + const double kEpsilon = 1e-7; thresholds_list[0] = 0.0f - kEpsilon; thresholds_list[num_thresholds - 1] = 1.0f + kEpsilon; - size_t batch_size = inference->dims()[0]; - size_t inference_width = inference->dims()[1]; + size_t batch_size = predict->dims()[0]; + size_t inference_width = predict->dims()[1]; - const T* inference_data = inference->data(); - const int64_t* label_data = label->data(); + const T* inference_data = predict->data(); + const auto* label_data = label->data(); auto* tp_data = true_positive->mutable_data(ctx.GetPlace()); auto* fn_data = false_negative->mutable_data(ctx.GetPlace()); @@ -66,20 +66,19 @@ class AucKernel : public framework::OpKernel { auto* fp_data = false_positive->mutable_data(ctx.GetPlace()); for (int idx_thresh = 0; idx_thresh < num_thresholds; idx_thresh++) { - // caculate TP, FN, TN, FP for current thresh + // calculate TP, FN, TN, FP for current thresh int64_t tp = 0, fn = 0, tn = 0, fp = 0; for (size_t i = 0; i < batch_size; i++) { - // NOTE: label_data used as bool, labels >0 will be treated as true. + // NOTE: label_data used as bool, labels > 0 will be treated as true. if (label_data[i]) { - // use first(max) data in each row - if (inference_data[i * inference_width] >= + if (inference_data[i * inference_width + 1] >= (thresholds_list[idx_thresh])) { tp++; } else { fn++; } } else { - if (inference_data[i * inference_width] >= + if (inference_data[i * inference_width + 1] >= (thresholds_list[idx_thresh])) { fp++; } else { @@ -94,21 +93,21 @@ class AucKernel : public framework::OpKernel { fp_data[idx_thresh] += fp; } // epsilon to avoid divide by zero. - float epsilon = 1e-6; + double epsilon = 1e-6; // Riemann sum to caculate auc. 
Tensor tp_rate, fp_rate, rec_rate; tp_rate.Resize({num_thresholds}); fp_rate.Resize({num_thresholds}); rec_rate.Resize({num_thresholds}); - float* tp_rate_data = tp_rate.mutable_data(ctx.GetPlace()); - float* fp_rate_data = fp_rate.mutable_data(ctx.GetPlace()); - float* rec_rate_data = rec_rate.mutable_data(ctx.GetPlace()); + auto* tp_rate_data = tp_rate.mutable_data(ctx.GetPlace()); + auto* fp_rate_data = fp_rate.mutable_data(ctx.GetPlace()); + auto* rec_rate_data = rec_rate.mutable_data(ctx.GetPlace()); for (int i = 0; i < num_thresholds; i++) { - tp_rate_data[i] = (static_cast(tp_data[i]) + epsilon) / + tp_rate_data[i] = (static_cast(tp_data[i]) + epsilon) / (tp_data[i] + fn_data[i] + epsilon); fp_rate_data[i] = - static_cast(fp_data[i]) / (fp_data[i] + tn_data[i] + epsilon); - rec_rate_data[i] = (static_cast(tp_data[i]) + epsilon) / + static_cast(fp_data[i]) / (fp_data[i] + tn_data[i] + epsilon); + rec_rate_data[i] = (static_cast(tp_data[i]) + epsilon) / (tp_data[i] + fp_data[i] + epsilon); } *auc_data = 0.0f; diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 675ca36774beb72cc1e9b136ad0b18ce061689ac..6555b8101a90bba8351d2c82313ab12e572a01ee 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -1,33 +1,43 @@ +if(NOT WITH_DISTRIBUTE) + return() +endif() + +if(WITH_GRPC) + set(cc_generic_services "false") +else() + set(cc_generic_services "true") +endif() +configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY) + if(WITH_GRPC) - grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc - request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor - selected_rows memory) + grpc_library(sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc + request_handler_impl.cc 
rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc + PROTO send_recv.proto + DEPS lod_tensor selected_rows memory) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - cc_test(serde_test SRCS grpc_serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr - cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) - cc_test(grpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc - grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor - proto_desc lookup_table_op SERIAL) + cc_test(grpc_serde_test SRCS grpc_serde_test.cc + DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) + cc_test(grpc_server_test SRCS rpc_server_test.cc + DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_table_op SERIAL) return() endif() set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") -set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc + +set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc + brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + +brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc + brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc PROTO send_recv.proto DEPS lod_tensor selected_rows memory) 
-find_library(OPENSSL_CRYPTO_LIBRARY_STATIC NAMES libcrypto.so) -ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY_STATIC}) - +set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy) -find_library(OPENSSL_SSL_LIBRARY_STATIC NAMES libssl.so) -ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY_STATIC}) +cc_test(brpc_server_test SRCS rpc_server_test.cc + DEPS ${brpc_test_depends} SERIAL) -cc_test(brpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_brpc - brpc protobuf leveldb gflags glog - protobuf executor proto_desc lookup_table_op snappystream snappy ssl crypto SERIAL) +cc_test(brpc_serde_test SRCS brpc_serde_test.cc + DEPS ${brpc_test_depends} SERIAL) diff --git a/paddle/fluid/operators/distributed/bytebuffer_stream.cc b/paddle/fluid/operators/distributed/grpc_bytebuffer_stream.cc similarity index 96% rename from paddle/fluid/operators/distributed/bytebuffer_stream.cc rename to paddle/fluid/operators/distributed/grpc_bytebuffer_stream.cc index 6e91b447db838c9095432eda22e9e1171e938d31..d192f54ee0c924b772045d9b6a01701f640e07c7 100644 --- a/paddle/fluid/operators/distributed/bytebuffer_stream.cc +++ b/paddle/fluid/operators/distributed/grpc_bytebuffer_stream.cc @@ -17,7 +17,7 @@ limitations under the License. */ // file and did some modifications so that we can send gRPC // requests without too much copying of the tensor data. 
-#include "paddle/fluid/operators/distributed/bytebuffer_stream.h" +#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/distributed/bytebuffer_stream.h b/paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h similarity index 87% rename from paddle/fluid/operators/distributed/bytebuffer_stream.h rename to paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h index e7de172c79c30761483b5d96f5bad19860208832..e9074574cdd163bbf7e62939df9283352706f840 100644 --- a/paddle/fluid/operators/distributed/bytebuffer_stream.h +++ b/paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h @@ -24,6 +24,7 @@ limitations under the License. */ #include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/zero_copy_stream.h" #include "grpc++/grpc++.h" +#include "paddle/fluid/operators/distributed/variable_response.h" namespace grpc { // A ZeroCopyInputStream that reads from grpc_byte_buffer @@ -107,25 +108,6 @@ class GrpcBufferReader final namespace paddle { namespace operators { namespace distributed { -// Source provides a way for a particular RPC implementation to provide -// received data to ParseFrom. -class Source { - public: - virtual ~Source() {} - - // Return the stream that contains the data to be parsed. - // Note that this method might be invoked more than once if - // ParseFrom needs to fall back to a more expensive parsing method. - // Every call must return a stream pointing at the beginning of - // the serialized RecvTensorResponse. - // - // Note that a subsequent call to contents() invalidates previous - // results of contents(). - // - // Ownership of the returned stream is retained by the Source and - // should not be deleted by the caller. - virtual ::google::protobuf::io::ZeroCopyInputStream* contents() = 0; -}; // A ZeroCopyInputStream that reads from a grpc::ByteBuffer. 
class GrpcByteBufferSource diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index 4d60801b6a6ecaabf1165321e0cb19044d27aa34..52c4bc1e7965323438de959d5eb1f3b4ef4f4cfe 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -20,6 +20,7 @@ limitations under the License. */ #include "glog/logging.h" // For VLOG #include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/distributed/grpc_serde.h" #include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/platform/profiler.h" diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h index d03a3e56aedbe4a008ee9ff187111f7635d14b58..11de84d9e265b2ca75d6d72a1d1e8797763f96a5 100644 --- a/paddle/fluid/operators/distributed/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc_client.h @@ -38,7 +38,10 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/rpc_client.h" +#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN @@ -46,23 +49,6 @@ namespace paddle { namespace operators { namespace distributed { -struct VarHandle { - // RPC endpoint. - std::string ep; - const platform::DeviceContext* ctx; - const framework::Scope* scope; - // Variable name. - std::string name; - // RPC method name. 
- std::string method; - - std::string String() const { - std::ostringstream s; - s << method << " name:[" << name << "], ep:[" << ep << "]"; - return s.str(); - } -}; - void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg); class BaseProcessor { diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc new file mode 100644 index 0000000000000000000000000000000000000000..3f8796713a6b89a308113981614673e07e8d367f --- /dev/null +++ b/paddle/fluid/operators/distributed/grpc_serde.cc @@ -0,0 +1,157 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_CUDA +#include +#endif +#include +#include // NOLINT + +#include "google/protobuf/io/coded_stream.h" +#include "google/protobuf/io/zero_copy_stream.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h" +#include "paddle/fluid/operators/distributed/grpc_serde.h" +#include "paddle/fluid/operators/distributed/grpc_variable_response.h" +#include "paddle/fluid/operators/distributed/proto_encoder_helper.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace operators { +namespace distributed { + +void SerializeToByteBuffer(const std::string& name, framework::Variable* var, + const platform::DeviceContext& ctx, + ::grpc::ByteBuffer* msg, + const std::string& out_name) { + // Default DestroyCallback does nothing, When using GPU + // the CPU buffer need to be freed. + DestroyCallback destroy_callback = [](void* backing) {}; + VarMsg request; + void* payload = nullptr; + size_t payload_size; + + request.set_varname(name); + // Note: normally the profiler is enabled in 1 trainer, hence only + // 1 trainer returns true for ShouldSendProfileState(). It tells PS + // servers the trainer's profiling state so that PS can follow the + // trainer. 
+ if (platform::ShouldSendProfileState()) { + if (platform::IsProfileEnabled()) { + request.set_profile(platform::kEnableProfiler); + } else { + request.set_profile(platform::kDisableProfiler); + } + } + if (!out_name.empty()) { + request.set_out_varname(out_name); + } + if (var->IsType()) { + request.set_type(::sendrecv::LOD_TENSOR); + GetTensorPayload(var, ctx, &request, &payload, &payload_size); + } else if (var->IsType()) { + request.set_type(::sendrecv::SELECTED_ROWS); + GetSelectedRowsPayload(var, ctx, &request, &payload, &payload_size); +#ifdef PADDLE_WITH_CUDA + } else if (var->IsType()) { + request.set_type(::sendrecv::NCCL_ID); +#endif + } else { + PADDLE_THROW("Serialize does not support type: %s", + typeid(var->Type()).name()); + } + + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + // GPU data is copied to CPU buffer when sending, + // free the buffer when possible. + destroy_callback = [](void* backing) { + platform::CUDAPinnedPlace cuda_pinned; + memory::Free(cuda_pinned, backing); + }; +#endif + } + + std::string header; + request.AppendToString(&header); + auto buffer = std::unique_ptr(new char[1024]); + void* buf = buffer.get(); + ProtoEncodeHelper e(static_cast(buf), 1024); + e.WriteRawBytes(std::string(header.data(), header.size())); +// NCCLID is copied directly to the message, return bytebuffer +// with only one slice if serializing NCCLID. 
+#ifdef PADDLE_WITH_CUDA + if (var->IsType()) { + e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, + NCCL_UNIQUE_ID_BYTES); + const ncclUniqueId& uid = var->Get(); + e.WriteRawBytes(std::string(uid.internal, NCCL_UNIQUE_ID_BYTES)); + + // for serialize NCCL_ID + ::grpc::Slice slices(e.size()); + memcpy(const_cast(slices.begin()), e.data(), e.size()); + ::grpc::ByteBuffer tmp(&slices, 1); + msg->Swap(&tmp); + return; + } +#endif + + e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size); + // steal reference of tensor data + ::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows + int num_slices = 2; // only SelectedRows have rows buffer + slices[0] = ::grpc::Slice(e.size()); + memcpy(const_cast(slices[0].begin()), e.data(), e.size()); + slices[1] = ::grpc::Slice( + grpc_slice_new_with_user_data(payload, payload_size, destroy_callback, + static_cast(payload)), + ::grpc::Slice::STEAL_REF); + + if (var->IsType()) { + auto* slr = var->GetMutable(); + ProtoEncodeHelper e2(static_cast(buf), 128); + size_t rows_memory_size = + slr->rows().size() * framework::SizeOfType(typeid(int64_t)); + e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size); + slices[2] = ::grpc::Slice(e2.size()); + memcpy(const_cast(slices[2].begin()), e2.data(), e2.size()); + + slices[3] = ::grpc::Slice( + grpc_slice_new_with_user_data( + const_cast( + reinterpret_cast(slr->rows().data())), + rows_memory_size, [](void* backing) {}, + const_cast( + reinterpret_cast(slr->rows().data()))), + ::grpc::Slice::STEAL_REF); + num_slices = 4; + } + + ::grpc::ByteBuffer tmp(&slices[0], num_slices); + msg->Swap(&tmp); +} + +void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, + const platform::DeviceContext& ctx, + const framework::Scope* scope, + framework::Variable** var) { + operators::distributed::GRPCVariableResponse resp(scope, &ctx); + PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!"); + *var = resp.GetVar(); +} + +} // 
namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc_serde.h b/paddle/fluid/operators/distributed/grpc_serde.h new file mode 100644 index 0000000000000000000000000000000000000000..450c41dcd6b1bf9a33d3bbef3a1c94a2f83ff322 --- /dev/null +++ b/paddle/fluid/operators/distributed/grpc_serde.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" + +#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" + +namespace paddle { +namespace operators { +namespace distributed { + +typedef void (*DestroyCallback)(void*); + +void SerializeToByteBuffer(const std::string& name, framework::Variable* var, + const platform::DeviceContext& ctx, + ::grpc::ByteBuffer* msg, + const std::string& out_varname = std::string()); + +void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, + const platform::DeviceContext& ctx, + const framework::Scope* scope, + framework::Variable** var); + +} 
// namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc_serde_test.cc b/paddle/fluid/operators/distributed/grpc_serde_test.cc index 3d107b533bcb7bfef3f9b13ec99afbd579a62e52..96ea05e74ed76768248a27ab435dc801b7d1b995 100644 --- a/paddle/fluid/operators/distributed/grpc_serde_test.cc +++ b/paddle/fluid/operators/distributed/grpc_serde_test.cc @@ -21,8 +21,10 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/grpc_serde.h" +#include "paddle/fluid/operators/distributed/grpc_variable_response.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/printf.h" @@ -84,7 +86,7 @@ void RunSerdeTestSelectedRows(platform::Place place) { // operators::distributed::DeserializeFromByteBuffer(msg, ctx, &var2); framework::Scope scope; scope.Var("myvar"); - operators::distributed::VariableResponse resp(&scope, &ctx); + operators::distributed::GRPCVariableResponse resp(&scope, &ctx); EXPECT_EQ(resp.Parse(msg), 0); framework::Variable* var2 = resp.GetVar(); @@ -171,7 +173,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) { // deserialize zero-copy framework::Scope scope; scope.Var("myvar"); - operators::distributed::VariableResponse resp(&scope, &ctx); + operators::distributed::GRPCVariableResponse resp(&scope, &ctx); if (from_type == 0) { EXPECT_EQ(resp.Parse(msg), 0); } else { diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc index f35e268f6ad36da02f17db2feb3fbf1fdf6c1e41..8edb00276df3ade1b320fbf2873e8b54ff3e1464 
100644 --- a/paddle/fluid/operators/distributed/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc_server.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/operators/distributed/grpc_serde.h" #include "paddle/fluid/operators/distributed/grpc_server.h" using ::grpc::ServerAsyncResponseWriter; @@ -84,9 +85,9 @@ class RequestSend final : public RequestBase { ::grpc::ServerCompletionQueue* cq, RequestHandler* request_handler, int req_id) : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new VariableResponse(request_handler->scope(), - request_handler->dev_ctx(), - !request_handler->sync_mode())); + request_.reset(new GRPCVariableResponse(request_handler->scope(), + request_handler->dev_ctx(), + !request_handler->sync_mode())); int method_id = static_cast(distributed::GrpcMethod::kSendVariable); service_->RequestAsyncUnary( method_id, &ctx_, request_.get(), &responder_, cq_, cq_, @@ -109,7 +110,7 @@ class RequestSend final : public RequestBase { protected: sendrecv::VoidMessage reply_; - std::shared_ptr request_; + std::shared_ptr request_; ServerAsyncResponseWriter responder_; }; @@ -161,8 +162,8 @@ class RequestPrefetch final : public RequestBase { : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_), local_scope_(nullptr) { - request_.reset(new VariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); + request_.reset(new GRPCVariableResponse(request_handler->scope(), + request_handler->dev_ctx(), true)); int method_id = static_cast(distributed::GrpcMethod::kPrefetchVariable); service_->RequestAsyncUnary( @@ -194,7 +195,7 @@ class RequestPrefetch final : public RequestBase { } protected: - std::shared_ptr request_; + std::shared_ptr request_; ::grpc::ByteBuffer reply_; ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; framework::Scope* local_scope_; @@ -206,8 +207,8 @@ class RequestCheckpointNotify final : public 
RequestBase { ::grpc::ServerCompletionQueue* cq, RequestHandler* request_handler, int req_id) : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new VariableResponse(request_handler->scope(), - request_handler->dev_ctx())); + request_.reset(new GRPCVariableResponse(request_handler->scope(), + request_handler->dev_ctx())); int method_id = static_cast(distributed::GrpcMethod::kCheckpointNotify); service_->RequestAsyncUnary( @@ -234,7 +235,7 @@ class RequestCheckpointNotify final : public RequestBase { } protected: - std::shared_ptr request_; + std::shared_ptr request_; sendrecv::VoidMessage reply_; ServerAsyncResponseWriter responder_; }; diff --git a/paddle/fluid/operators/distributed/grpc_service.h b/paddle/fluid/operators/distributed/grpc_service.h index cdc4e7b79276d6aac55aeac8ac121ca28d2cc1f0..9ae9a31a003cbb1f808fd1127a5dd78511aa3e99 100644 --- a/paddle/fluid/operators/distributed/grpc_service.h +++ b/paddle/fluid/operators/distributed/grpc_service.h @@ -23,8 +23,7 @@ #include #include #include -#include "paddle/fluid/operators/distributed/variable_response.h" - +#include "paddle/fluid/operators/distributed/grpc_variable_response.h" #include "paddle/fluid/platform/profiler.h" // NOTE: This method was originally created by tensorflow @@ -42,17 +41,18 @@ class ServerContext; // Support parsing/unparsing of tensorflow::VariableResponse. // Wire-format is identical to RecvVariableResponse. 
template <> -class SerializationTraits { +class SerializationTraits< + paddle::operators::distributed::GRPCVariableResponse> { public: static Status Serialize( - const paddle::operators::distributed::VariableResponse& msg, + const paddle::operators::distributed::GRPCVariableResponse& msg, grpc_byte_buffer** bp, bool* own_buffer) { PADDLE_ENFORCE(false, "SerializationTraits::Serialize not implemented!"); return Status(); } static Status Deserialize( grpc_byte_buffer* buffer, - paddle::operators::distributed::VariableResponse* msg, + paddle::operators::distributed::GRPCVariableResponse* msg, int max_message_size = INT_MAX) { if (buffer == nullptr) { return Status(StatusCode::INTERNAL, "No payload"); diff --git a/paddle/fluid/operators/distributed/grpc_variable_response.cc b/paddle/fluid/operators/distributed/grpc_variable_response.cc new file mode 100644 index 0000000000000000000000000000000000000000..34d47f3ec0f3025109447b66078b724607d2953a --- /dev/null +++ b/paddle/fluid/operators/distributed/grpc_variable_response.cc @@ -0,0 +1,308 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#ifdef PADDLE_WITH_CUDA +#include +#endif + +#include "paddle/fluid/operators/distributed/grpc_variable_response.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace operators { +namespace distributed { + +enum WireType { + WIRETYPE_VARINT = 0, + WIRETYPE_LENGTH_DELIMITED = 2, +}; + +inline int GetTagFieldNumber(uint32_t tag) { return tag >> 3; } + +inline WireType GetTagWireType(uint32_t tag) { + return static_cast(tag & 0x7); +} + +bool ReadVarintSizeAsInt(::google::protobuf::io::CodedInputStream* input, + int* result) { + uint64_t v; + if (input->ReadVarint64(&v) && v <= static_cast(INT_MAX)) { + *result = static_cast(v); + return true; + } else { + return false; + } +} + +int GRPCVariableResponse::Parse(const ::grpc::ByteBuffer& byte_buffer) { + GrpcByteBufferSource source; + source.Init(byte_buffer); + GrpcByteBufferSourceWrapper r(&source); + + return Parse(&r); +} + +bool ParseLodData(::google::protobuf::io::CodedInputStream* input, + std::vector* lod) { + while (true) { + auto p = input->ReadTagWithCutoff(127); + int tag = GetTagFieldNumber(p.first); + WireType wt = GetTagWireType(p.first); + + if (!p.second) { + return (tag == 0); + } + + switch (tag) { + case sendrecv::VariableMessage_LodData::kLodDataFieldNumber: { + uint64_t v; + if (wt == WIRETYPE_VARINT) { + if (!input->ReadVarint64(&v)) { + return false; + } + lod->push_back(v); + break; + } + + if (wt == WIRETYPE_LENGTH_DELIMITED) { + int num_bytes = 0; + if (!input->ReadVarintSizeAsInt(&num_bytes)) { + return tag; + } + int start_pos = input->CurrentPosition(); + while (input->CurrentPosition() - start_pos < num_bytes) { + uint64_t v; + if (!input->ReadVarint64(&v)) { + return tag; + } + lod->push_back(v); + } + break; + } + + return false; + } + default: { return false; } + } + } + + return true; +} + +int GRPCVariableResponse::Parse(Source* source) { + ::google::protobuf::io::ZeroCopyInputStream* input_stream = + source->contents(); + 
::google::protobuf::io::CodedInputStream input(input_stream); + input.SetTotalBytesLimit(INT_MAX, INT_MAX); + + while (true) { + auto p = input.ReadTagWithCutoff(127); + int tag = GetTagFieldNumber(p.first); + WireType wt = GetTagWireType(p.first); + if (!p.second) { + if (tag != 0) { + return -1; + } + return 0; + } + + switch (tag) { + case sendrecv::VariableMessage::kVarnameFieldNumber: { + uint32_t length; + if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { + return tag; + } + + std::string temp; + if (!input.ReadString(&temp, length)) { + return tag; + } + + meta_.set_varname(temp); + break; + } + case sendrecv::VariableMessage::kTypeFieldNumber: { + uint32_t v; + if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) { + return tag; + } + + meta_.set_type(static_cast<::sendrecv::VarType>(v)); + break; + } + case sendrecv::VariableMessage::kDataTypeFieldNumber: { + uint32_t v = 0; + if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) { + return tag; + } + + meta_.set_data_type(static_cast<::sendrecv::VariableMessage_Type>(v)); + break; + } + case sendrecv::VariableMessage::kDimsFieldNumber: { + // not packed + if (wt == WIRETYPE_VARINT) { + uint64_t v; + if (!input.ReadVarint64(&v)) { + return tag; + } + meta_.add_dims(v); + break; + } + + // packed + if (wt == WIRETYPE_LENGTH_DELIMITED) { + int num_bytes = 0; + if (!input.ReadVarintSizeAsInt(&num_bytes)) { + return tag; + } + int start_pos = input.CurrentPosition(); + while (input.CurrentPosition() - start_pos < num_bytes) { + uint64_t v; + if (!input.ReadVarint64(&v)) { + return tag; + } + meta_.add_dims(v); + } + break; + } + return tag; + } + case sendrecv::VariableMessage::kLodLevelFieldNumber: { + uint64_t v = 0; + if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { + return tag; + } + meta_.set_lod_level(static_cast(v)); + break; + } + case sendrecv::VariableMessage::kLodFieldNumber: { + int length = 0; + if (wt != WIRETYPE_LENGTH_DELIMITED || + 
!ReadVarintSizeAsInt(&input, &length)) { + return tag; + } + + std::pair<::google::protobuf::io::CodedInputStream::Limit, int> p = + input.IncrementRecursionDepthAndPushLimit(length); + + std::vector lod_data; + if (p.second < 0 || !ParseLodData(&input, &lod_data)) { + return tag; + } + + if (!input.DecrementRecursionDepthAndPopLimit(p.first)) { + return tag; + } + + if (lod_data.size() == 0) { + break; + } + + auto lod = meta_.add_lod(); + for (uint32_t i = 0; i < lod_data.size(); i++) { + lod->add_lod_data(lod_data[i]); + } + break; + } + case sendrecv::VariableMessage::kSlrHeightFieldNumber: { + uint64_t v = 0; + if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { + return tag; + } + meta_.set_slr_height(static_cast(v)); + break; + } + case sendrecv::VariableMessage::kSerializedFieldNumber: { + int num_bytes = 0; + if (wt != WIRETYPE_LENGTH_DELIMITED || + !ReadVarintSizeAsInt(&input, &num_bytes)) { + return tag; + } + + if (!ProcSerializedField(tag, &input, num_bytes)) { + return tag; + } + + break; + } + case sendrecv::VariableMessage::kRowsFieldNumber: { + PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || + meta_.type() == sendrecv::LOD_TENSOR) && + meta_.varname() != "", + "meta info should be got first!"); + + int num_bytes = 0; + if (wt != WIRETYPE_LENGTH_DELIMITED || + !ReadVarintSizeAsInt(&input, &num_bytes)) { + return tag; + } + + if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) { + return tag; + } + break; + } + case sendrecv::VariableMessage::kOutVarnameFieldNumber: { + uint32_t length; + if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { + return tag; + } + + std::string temp; + if (!input.ReadString(&temp, length)) { + return tag; + } + + meta_.set_out_varname(temp); + break; + } + case sendrecv::VariableMessage::kProfileFieldNumber: { + uint64_t profiling = 0; + if (!input.ReadVarint64(&profiling)) { + return tag; + } + meta_.set_profile(profiling); + int64_t listener_id = platform::ListenerId(); + if 
(listener_id <= 0) { + break; + } + if (profiling == platform::kEnableProfiler && + !platform::IsProfileEnabled()) { + platform::EnableProfiler(platform::ProfilerState::kCPU); + } else if (profiling == platform::kDisableProfiler && + platform::IsProfileEnabled()) { + // TODO(panyx0718): Should we allow to customize file dir. + platform::DisableProfiler( + platform::EventSortingKey::kDefault, + string::Sprintf("/tmp/profile_ps_%lld", listener_id)); + } + break; + } + default: { + // Unknown tag, return unknown error. + return -1; + } + } + } + + return 0; +} + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc_variable_response.h b/paddle/fluid/operators/distributed/grpc_variable_response.h new file mode 100644 index 0000000000000000000000000000000000000000..89df07c92cd33bcb76c8539b5566d74fa21bba5e --- /dev/null +++ b/paddle/fluid/operators/distributed/grpc_variable_response.h @@ -0,0 +1,58 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type.h" + +#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" + +#include "google/protobuf/io/coded_stream.h" +#include "google/protobuf/io/zero_copy_stream.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h" +#include "paddle/fluid/operators/distributed/variable_response.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class GRPCVariableResponse : public VariableResponse { + public: + GRPCVariableResponse(const framework::Scope* scope, + const platform::DeviceContext* dev_ctx, + bool create_scope = false) + : VariableResponse(scope, dev_ctx, create_scope) {} + + virtual ~GRPCVariableResponse() {} + + int Parse(Source* source) override; + + // return: + // 0:ok. + // -1: unkown error. + // other: number of error field. + int Parse(const ::grpc::ByteBuffer& byte_buffer); +}; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index 271306d5d20f1b849a81a9bfa6436f2faf261204..3d61171dff98d6752be98b4b90577bfd059525ab 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -51,6 +51,23 @@ constexpr char kRequestPassBarrier[] = "RequestPassBarrier"; class RPCServer; +struct VarHandle { + // RPC endpoint. + std::string ep; + const platform::DeviceContext* ctx; + const framework::Scope* scope; + // Variable name. + std::string name; + // RPC method name. 
+ std::string method; + + std::string String() const { + std::ostringstream s; + s << method << " name:[" << name << "], ep:[" << ep << "]"; + return s.str(); + } +}; + class RequestHandler { public: explicit RequestHandler(bool sync_mode) diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index 5e6bff20f5f8c06e1497c697e3aabf7b9cb94ad6..f1f84072d47e58eaa81dd66dc018e17b182bb57b 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -53,7 +53,7 @@ bool RequestSendHandler::Handle(const std::string& varname, // Sync if (varname == BATCH_BARRIER_MESSAGE) { - VLOG(3) << "sync: recv batch barrier message"; + VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE"; rpc_server_->IncreaseBatchBarrier(kRequestSend); } else if (varname == BEGIN_PASS_MESSAGE) { VLOG(3) << "sync: recv begin pass message"; @@ -65,8 +65,7 @@ bool RequestSendHandler::Handle(const std::string& varname, VLOG(3) << "sync: processing received var: " << varname; if (invar == nullptr) { - LOG(ERROR) << "sync: Can not find server side var: " << varname; - PADDLE_THROW("sync: Can not find server side var"); + LOG(FATAL) << "sync: Can not find server side var: " << varname; return false; } if (invar->IsType()) { diff --git a/paddle/fluid/operators/distributed/send_recv.proto b/paddle/fluid/operators/distributed/send_recv.proto.in similarity index 97% rename from paddle/fluid/operators/distributed/send_recv.proto rename to paddle/fluid/operators/distributed/send_recv.proto.in index e0902320cff003797b12ed0204f7f99c44554b62..8b0a09abe1d05dda10eda0030eb91cb9ca40683e 100644 --- a/paddle/fluid/operators/distributed/send_recv.proto +++ b/paddle/fluid/operators/distributed/send_recv.proto.in @@ -1,3 +1,4 @@ + /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,7 +15,7 @@ limitations under the License. */ syntax = "proto3"; package sendrecv; -// option cc_generic_services = true; +option cc_generic_services = @cc_generic_services@; service SendRecvService { // For parameter server round-robin like hashing, do not split tensors. diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc index 98129d9f1014c39347e3409533f2bc10092611d2..98a5dcbbb871f90db8b685da93f6baaec209fab8 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc @@ -12,21 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" - #ifdef PADDLE_WITH_CUDA #include #endif #include #include // NOLINT -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/operators/distributed/bytebuffer_stream.h" -#include "paddle/fluid/operators/distributed/proto_encoder_helper.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/variable_response.h" -#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { @@ -34,6 +28,11 @@ namespace distributed { using VarMsg = sendrecv::VariableMessage; +void* GetVarPayLoad(const std::string varname, int64_t size) { + platform::CUDAPinnedPlace cuda_pinned; + return memory::Alloc(cuda_pinned, size); +} + void GetTensorPayload(framework::Variable* var, const platform::DeviceContext& ctx, VarMsg* request, void** payload, size_t* payload_size) { @@ -58,15 +57,17 @@ void 
GetTensorPayload(framework::Variable* var, if (platform::is_gpu_place(ctx.GetPlace())) { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE(platform::is_gpu_place(tensor.place())); - platform::CUDAPinnedPlace cuda_pinned; + // platform::CUDAPinnedPlace cuda_pinned; auto& gpu_dev_ctx = static_cast(ctx); auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type()); - *payload = memory::Alloc(cuda_pinned, copy_size); + *payload = GetVarPayLoad(request->varname(), copy_size); + platform::CUDAPinnedPlace cuda_pinned; memory::Copy(cuda_pinned, *payload, boost::get(tensor.place()), reinterpret_cast(tensor.data()), copy_size, gpu_dev_ctx.stream()); + ctx.Wait(); #endif } else { @@ -91,10 +92,11 @@ void GetSelectedRowsPayload(framework::Variable* var, auto* tensor = slr->mutable_value(); if (platform::is_gpu_place(ctx.GetPlace())) { #ifdef PADDLE_WITH_CUDA - platform::CUDAPinnedPlace cuda_pinned; auto& gpu_dev_ctx = static_cast(ctx); auto copy_size = tensor->numel() * framework::SizeOfType(tensor->type()); - *payload = memory::Alloc(cuda_pinned, copy_size); + *payload = GetVarPayLoad(request->varname(), copy_size); + + platform::CUDAPinnedPlace cuda_pinned; memory::Copy(cuda_pinned, *payload, boost::get(tensor->place()), reinterpret_cast(tensor->data()), copy_size, @@ -107,126 +109,6 @@ void GetSelectedRowsPayload(framework::Variable* var, *payload_size = tensor->numel() * framework::SizeOfType(tensor->type()); } -void SerializeToByteBuffer(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, - ::grpc::ByteBuffer* msg, - const std::string& out_name) { - // Default DestroyCallback does nothing, When using GPU - // the CPU buffer need to be freed. - DestroyCallback destroy_callback = [](void* backing) {}; - VarMsg request; - void* payload = nullptr; - size_t payload_size; - - request.set_varname(name); - // Note: normally the profiler is enabled in 1 trainer, hence only - // 1 trainer returns true for ShouldSendProfileState(). 
It tells PS - // servers the trainer's profiling state so that PS can follow the - // trainer. - if (platform::ShouldSendProfileState()) { - if (platform::IsProfileEnabled()) { - request.set_profile(platform::kEnableProfiler); - } else { - request.set_profile(platform::kDisableProfiler); - } - } - if (!out_name.empty()) { - request.set_out_varname(out_name); - } - if (var->IsType()) { - request.set_type(::sendrecv::LOD_TENSOR); - GetTensorPayload(var, ctx, &request, &payload, &payload_size); - } else if (var->IsType()) { - request.set_type(::sendrecv::SELECTED_ROWS); - GetSelectedRowsPayload(var, ctx, &request, &payload, &payload_size); -#ifdef PADDLE_WITH_CUDA - } else if (var->IsType()) { - request.set_type(::sendrecv::NCCL_ID); -#endif - } else { - PADDLE_THROW("Serialize does not support type: %s", - typeid(var->Type()).name()); - } - - if (platform::is_gpu_place(ctx.GetPlace())) { -#ifdef PADDLE_WITH_CUDA - // GPU data is copied to CPU buffer when sending, - // free the buffer when possible. - destroy_callback = [](void* backing) { - platform::CUDAPinnedPlace cuda_pinned; - memory::Free(cuda_pinned, backing); - }; -#endif - } - - std::string header; - request.AppendToString(&header); - auto buffer = std::unique_ptr(new char[1024]); - void* buf = buffer.get(); - ProtoEncodeHelper e(static_cast(buf), 1024); - e.WriteRawBytes(std::string(header.data(), header.size())); -// NCCLID is copied directly to the message, return bytebuffer -// with only one slice if serializing NCCLID. 
-#ifdef PADDLE_WITH_CUDA - if (var->IsType()) { - e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, - NCCL_UNIQUE_ID_BYTES); - const ncclUniqueId& uid = var->Get(); - e.WriteRawBytes(std::string(uid.internal, NCCL_UNIQUE_ID_BYTES)); - - // for serialize NCCL_ID - ::grpc::Slice slices(e.size()); - memcpy(const_cast(slices.begin()), e.data(), e.size()); - ::grpc::ByteBuffer tmp(&slices, 1); - msg->Swap(&tmp); - return; - } -#endif - - e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size); - // steal reference of tensor data - ::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows - int num_slices = 2; // only SelectedRows have rows buffer - slices[0] = ::grpc::Slice(e.size()); - memcpy(const_cast(slices[0].begin()), e.data(), e.size()); - slices[1] = ::grpc::Slice( - grpc_slice_new_with_user_data(payload, payload_size, destroy_callback, - static_cast(payload)), - ::grpc::Slice::STEAL_REF); - - if (var->IsType()) { - auto* slr = var->GetMutable(); - ProtoEncodeHelper e2(static_cast(buf), 128); - size_t rows_memory_size = - slr->rows().size() * framework::SizeOfType(typeid(int64_t)); - e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size); - slices[2] = ::grpc::Slice(e2.size()); - memcpy(const_cast(slices[2].begin()), e2.data(), e2.size()); - - slices[3] = ::grpc::Slice( - grpc_slice_new_with_user_data( - const_cast( - reinterpret_cast(slr->rows().data())), - rows_memory_size, [](void* backing) {}, - const_cast( - reinterpret_cast(slr->rows().data()))), - ::grpc::Slice::STEAL_REF); - num_slices = 4; - } - - ::grpc::ByteBuffer tmp(&slices[0], num_slices); - msg->Swap(&tmp); -} - -void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var) { - operators::distributed::VariableResponse resp(scope, &ctx); - PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!"); - *var = resp.GetVar(); -} - } // namespace 
distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h index fe25e73fa608727ba0bb912a82776b330ec8d83a..4d08d3c77afa3c1f2b4d7602f7199558bb5a79c0 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.h +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h @@ -25,24 +25,21 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h" namespace paddle { namespace operators { namespace distributed { -typedef void (*DestroyCallback)(void*); +using VarMsg = sendrecv::VariableMessage; -void SerializeToByteBuffer(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, - ::grpc::ByteBuffer* msg, - const std::string& out_varname = std::string()); +void GetTensorPayload(framework::Variable* var, + const platform::DeviceContext& ctx, VarMsg* request, + void** payload, size_t* payload_size); -void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var); +void GetSelectedRowsPayload(framework::Variable* var, + const platform::DeviceContext& ctx, VarMsg* request, + void** payload, size_t* payload_size); inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) { switch (type) { diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc index 45832c60bf9172497afabac927ba39a7cbfb9a52..466bce18af7cf97014a7b1ba64df68eab193c7c8 100644 --- a/paddle/fluid/operators/distributed/variable_response.cc +++ b/paddle/fluid/operators/distributed/variable_response.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,50 +13,20 @@ // limitations under the License. #include "paddle/fluid/operators/distributed/variable_response.h" - -#include -#include #include -#ifdef PADDLE_WITH_CUDA -#include -#endif -#include "paddle/fluid/platform/profiler.h" - -#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" namespace paddle { namespace operators { namespace distributed { -enum WireType { - WIRETYPE_VARINT = 0, - WIRETYPE_LENGTH_DELIMITED = 2, -}; - -inline int GetTagFieldNumber(uint32_t tag) { return tag >> 3; } - -inline WireType GetTagWireType(uint32_t tag) { - return static_cast(tag & 0x7); -} - -bool ReadVarintSizeAsInt(::google::protobuf::io::CodedInputStream* input, - int* result) { - uint64_t v; - if (input->ReadVarint64(&v) && v <= static_cast(INT_MAX)) { - *result = static_cast(v); - return true; - } else { - return false; - } -} - -bool ReadRaw(::google::protobuf::io::CodedInputStream* input, - const platform::DeviceContext& dev_ctx, platform::Place place, - void* dest, int size) { +bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input, + const platform::DeviceContext& dev_ctx, + platform::Place place, void* dest, + int64_t size) { const void* data = NULL; int size_to_write = 0; - int length = size; + int64_t length = size; int total_written = 0; if (platform::is_gpu_place(place)) { @@ -194,294 +164,49 @@ bool VariableResponse::CopySelectRowsData( return true; } -bool ParseLodData(::google::protobuf::io::CodedInputStream* input, - std::vector* lod) { - while (true) { - auto p = input->ReadTagWithCutoff(127); - int tag = GetTagFieldNumber(p.first); - WireType wt = GetTagWireType(p.first); - - if (!p.second) { - return (tag == 0); - } - - switch (tag) { - case 
sendrecv::VariableMessage_LodData::kLodDataFieldNumber: { - uint64_t v; - if (wt == WIRETYPE_VARINT) { - if (!input->ReadVarint64(&v)) { - return false; - } - lod->push_back(v); - break; - } - - if (wt == WIRETYPE_LENGTH_DELIMITED) { - int num_bytes = 0; - if (!input->ReadVarintSizeAsInt(&num_bytes)) { - return tag; - } - int start_pos = input->CurrentPosition(); - while (input->CurrentPosition() - start_pos < num_bytes) { - uint64_t v; - if (!input->ReadVarint64(&v)) { - return tag; - } - lod->push_back(v); - } - break; - } +bool VariableResponse::ProcSerializedField( + int tag, ::google::protobuf::io::CodedInputStream* input, + int64_t num_bytes) { + PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || + meta_.type() == sendrecv::LOD_TENSOR || + meta_.type() == sendrecv::NCCL_ID) && + meta_.varname() != "", + "meta info should be got first!"); + if (meta_.type() == sendrecv::NCCL_ID) { +#ifdef PADDLE_WITH_CUDA + auto* var = scope_->FindVar(meta_.varname()); + if (var != nullptr) { + ncclUniqueId* id = var->GetMutable(); + if (!ReadRaw(input, *dev_ctx_, platform::CPUPlace(), id->internal, + num_bytes)) { return false; } - default: { return false; } } - } - - return true; -} - -int VariableResponse::Parse(const ::grpc::ByteBuffer& byte_buffer) { - GrpcByteBufferSource source; - source.Init(byte_buffer); - GrpcByteBufferSourceWrapper r(&source); - - return Parse(&r); -} - -int VariableResponse::Parse(Source* source) { - ::google::protobuf::io::ZeroCopyInputStream* input_stream = - source->contents(); - ::google::protobuf::io::CodedInputStream input(input_stream); - input.SetTotalBytesLimit(INT_MAX, INT_MAX); - - while (true) { - auto p = input.ReadTagWithCutoff(127); - int tag = GetTagFieldNumber(p.first); - WireType wt = GetTagWireType(p.first); - if (!p.second) { - if (tag != 0) { - return -1; - } - return 0; - } - - switch (tag) { - case sendrecv::VariableMessage::kVarnameFieldNumber: { - uint32_t length; - if ((wt != WIRETYPE_LENGTH_DELIMITED) || 
!input.ReadVarint32(&length)) { - return tag; - } - - std::string temp; - if (!input.ReadString(&temp, length)) { - return tag; - } - - meta_.set_varname(temp); - break; - } - case sendrecv::VariableMessage::kTypeFieldNumber: { - uint32_t v; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) { - return tag; - } - - meta_.set_type(static_cast<::sendrecv::VarType>(v)); - break; - } - case sendrecv::VariableMessage::kDataTypeFieldNumber: { - uint32_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) { - return tag; - } - - meta_.set_data_type(static_cast<::sendrecv::VariableMessage_Type>(v)); - break; - } - case sendrecv::VariableMessage::kDimsFieldNumber: { - // not packed - if (wt == WIRETYPE_VARINT) { - uint64_t v; - if (!input.ReadVarint64(&v)) { - return tag; - } - meta_.add_dims(v); - break; - } - - // packed - if (wt == WIRETYPE_LENGTH_DELIMITED) { - int num_bytes = 0; - if (!input.ReadVarintSizeAsInt(&num_bytes)) { - return tag; - } - int start_pos = input.CurrentPosition(); - while (input.CurrentPosition() - start_pos < num_bytes) { - uint64_t v; - if (!input.ReadVarint64(&v)) { - return tag; - } - meta_.add_dims(v); - } - break; - } - return tag; - } - case sendrecv::VariableMessage::kLodLevelFieldNumber: { - uint64_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { - return tag; - } - meta_.set_lod_level(static_cast(v)); - break; - } - case sendrecv::VariableMessage::kLodFieldNumber: { - int length = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &length)) { - return tag; - } - - std::pair<::google::protobuf::io::CodedInputStream::Limit, int> p = - input.IncrementRecursionDepthAndPushLimit(length); - - std::vector lod_data; - if (p.second < 0 || !ParseLodData(&input, &lod_data)) { - return tag; - } - - if (!input.DecrementRecursionDepthAndPopLimit(p.first)) { - return false; - } - - if (lod_data.size() == 0) { - break; - } - - auto lod = meta_.add_lod(); - for (uint32_t i = 0; i < 
lod_data.size(); i++) { - lod->add_lod_data(lod_data[i]); - } - break; - } - case sendrecv::VariableMessage::kSlrHeightFieldNumber: { - uint64_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { - return tag; - } - meta_.set_slr_height(static_cast(v)); - break; - } - case sendrecv::VariableMessage::kSerializedFieldNumber: { - PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || - meta_.type() == sendrecv::LOD_TENSOR || - meta_.type() == sendrecv::NCCL_ID) && - meta_.varname() != "", - "meta info should be got first!"); - - int num_bytes = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &num_bytes)) { - return tag; - } - - if (meta_.type() == sendrecv::NCCL_ID) { -#ifdef PADDLE_WITH_CUDA - auto* var = scope_->FindVar(meta_.varname()); - if (var != nullptr) { - ncclUniqueId* id = var->GetMutable(); - if (!ReadRaw(&input, *dev_ctx_, platform::CPUPlace(), id->internal, - num_bytes)) { - return tag; - } - } - break; + return true; #else - PADDLE_THROW("Not compiled with CUDA!"); + PADDLE_THROW("Not compiled with CUDA!"); + return false; #endif - } - - framework::DDim dims = GetDims(meta_.dims()); - if (meta_.type() == sendrecv::LOD_TENSOR) { - PADDLE_ENFORCE(meta_.lod_size() >= 0, - "lod info should be got first!"); - if (!CopyLodTensorData(&input, *dev_ctx_, dims, num_bytes)) { - return tag; - } - break; - } - - if (meta_.type() == sendrecv::SELECTED_ROWS) { - if (!CopySelectRowsTensorData(&input, *dev_ctx_, dims, num_bytes)) { - return tag; - } - break; - } - - return tag; - } - case sendrecv::VariableMessage::kRowsFieldNumber: { - PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || - meta_.type() == sendrecv::LOD_TENSOR) && - meta_.varname() != "", - "meta info should be got first!"); - - int num_bytes = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &num_bytes)) { - return tag; - } - - if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) { - return tag; - } - break; - } - case 
sendrecv::VariableMessage::kOutVarnameFieldNumber: { - uint32_t length; - if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { - return tag; - } + } - std::string temp; - if (!input.ReadString(&temp, length)) { - return tag; - } + framework::DDim dims = GetDims(meta_.dims()); + if (meta_.type() == sendrecv::LOD_TENSOR) { + PADDLE_ENFORCE(meta_.lod_size() >= 0, "lod info should be got first!"); + if (!CopyLodTensorData(input, *dev_ctx_, dims, num_bytes)) { + return false; + } + return true; + } - meta_.set_out_varname(temp); - break; - } - case sendrecv::VariableMessage::kProfileFieldNumber: { - uint64_t profiling = 0; - if (!input.ReadVarint64(&profiling)) { - return tag; - } - meta_.set_profile(profiling); - int64_t listener_id = platform::ListenerId(); - if (listener_id <= 0) { - break; - } - if (profiling == platform::kEnableProfiler && - !platform::IsProfileEnabled()) { - platform::EnableProfiler(platform::ProfilerState::kCPU); - } else if (profiling == platform::kDisableProfiler && - platform::IsProfileEnabled()) { - // TODO(panyx0718): Should we allow to customize file dir. - platform::DisableProfiler( - platform::EventSortingKey::kDefault, - string::Sprintf("/tmp/profile_ps_%lld", listener_id)); - } - break; - } - default: { - // Unknown tag, return unknown error. 
- return -1; - } + if (meta_.type() == sendrecv::SELECTED_ROWS) { + if (!CopySelectRowsTensorData(input, *dev_ctx_, dims, num_bytes)) { + return false; } + return true; } - return 0; + return true; } }; // namespace distributed diff --git a/paddle/fluid/operators/distributed/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h index 1db4a0a522654ff2497b8bd9ee1381b5ab64067a..6aec52ca00f59a42ecca01da8df1680ce4eda432 100644 --- a/paddle/fluid/operators/distributed/variable_response.h +++ b/paddle/fluid/operators/distributed/variable_response.h @@ -22,18 +22,35 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" - #include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/zero_copy_stream.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/distributed/bytebuffer_stream.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" namespace paddle { namespace operators { namespace distributed { +// Source provides a way for a particular RPC implementation to provide +// received data to ParseFrom. +class Source { + public: + virtual ~Source() {} + + // Return the stream that contains the data to be parsed. + // Note that this method might be invoked more than once if + // ParseFrom needs to fall back to a more expensive parsing method. + // Every call must return a stream pointing at the beginning of + // the serialized RecvTensorResponse. + // + // Note that a subsequent call to contents() invalidates previous + // results of contents(). + // + // Ownership of the returned stream is retained by the Source and + // should not be deleted by the caller. 
+ virtual ::google::protobuf::io::ZeroCopyInputStream* contents() = 0; +}; + class VariableResponse { public: VariableResponse(const framework::Scope* scope, @@ -51,22 +68,19 @@ class VariableResponse { } } - // return: - // 0:ok. - // -1: unkown error. - // other: number of error field. - int Parse(Source* source); + int Parse(Source* source, const sendrecv::VariableMessage& meta) { + meta_ = meta; + return Parse(source); + } // return: // 0:ok. // -1: unkown error. // other: number of error field. - int Parse(const ::grpc::ByteBuffer& byte_buffer); - - const framework::Scope& GetLocalScope() const { return *local_scope_; } - - framework::Scope* GetMutableLocalScope() const { return local_scope_; } + virtual int Parse(Source* source) = 0; + inline const framework::Scope& GetLocalScope() const { return *local_scope_; } + inline framework::Scope* GetMutableLocalScope() const { return local_scope_; } inline std::string Varname() const { return meta_.varname(); } inline std::string OutVarname() const { return meta_.out_varname(); } @@ -78,7 +92,11 @@ class VariableResponse { return scope_->FindVar(meta_.varname()); } - private: + protected: + bool ReadRaw(::google::protobuf::io::CodedInputStream* input, + const platform::DeviceContext& dev_ctx, platform::Place place, + void* dest, int64_t size); + bool CopySelectRowsTensorData(::google::protobuf::io::CodedInputStream* input, const platform::DeviceContext& ctx, const framework::DDim& dims, int length); @@ -90,12 +108,16 @@ class VariableResponse { const platform::DeviceContext& ctx, const framework::DDim& dims, int length); - private: + bool ProcSerializedField(int tag, + ::google::protobuf::io::CodedInputStream* input, + int64_t num_bytes); + + protected: const framework::Scope* scope_; const platform::DeviceContext* dev_ctx_; bool create_scope_ = false; framework::Scope* local_scope_ = nullptr; - // only Skeleton + sendrecv::VariableMessage meta_; }; diff --git a/paddle/fluid/operators/math/blas_impl.h 
b/paddle/fluid/operators/math/blas_impl.h index 238bd3f8def9eaa6c18afdab1031c4babfde8ae2..a0802ef90ca7e30a2b22d187cb9092163518d8e9 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -37,6 +37,7 @@ struct CBlas { libxsmm_sgemm(args...); } #endif + template static void AXPY(ARGS... args) { platform::dynload::cblas_saxpy(args...); @@ -76,6 +77,7 @@ struct CBlas { libxsmm_dgemm(args...); } #endif + template static void AXPY(ARGS... args) { platform::dynload::cblas_daxpy(args...); @@ -150,6 +152,7 @@ struct CBlas { } }; #endif + template <> struct CBlas { static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); } @@ -190,30 +193,48 @@ inline bool UseXSMM(const int &m, const int &n, const int &k, return false; } -template <> template -void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, int M, - int N, int K, T alpha, const T *A, - const T *B, T beta, T *C) const { - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? 
N : K; - int ldc = N; +inline void GEMM_WARP(CBLAS_ORDER order, CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, int K, T alpha, + const T *A, int lda, const T *B, int ldb, T beta, T *C, + int ldc) { #ifdef PADDLE_WITH_LIBXSMM - if (UseXSMM(M, N, K, transA != CblasNoTrans, transB != CblasNoTrans, alpha, - beta)) { + if (UseXSMM(M, N, K, transA != CblasNoTrans, transB != CblasNoTrans, alpha, + beta)) { // Note: SMM use ColMajor const char transa = 'N'; const char transb = 'N'; CBlas::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &ldb, A, &lda, &beta, C, &ldc); - } else { + return; + } #endif - CBlas::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, - ldb, beta, C, ldc); -#ifdef PADDLE_WITH_LIBXSMM + +#ifdef PADDLE_MKL_SPLIT_GEMM + constexpr int bs = 2; + if (M % bs == 0 && transA == CblasNoTrans && transB == CblasNoTrans) { + for (int off = 0; off < M; off += bs) { + CBlas::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, bs, N, K, alpha, + A + off * lda, lda, B, ldb, beta, C + off * ldc, ldc); + } + return; } #endif + CBlas::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, + beta, C, ldc); +} + +template <> +template +void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, + int N, int K, T alpha, const T *A, + const T *B, T beta, T *C) const { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + GEMM_WARP(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, + beta, C, ldc); } template <> @@ -222,9 +243,9 @@ void Blas::GEMM(bool transA, bool transB, int M, int N, int K, T alpha, const T *A, int lda, const T *B, int ldb, T beta, T *C, int ldc) const { - CBlas::GEMM(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, - transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A, - lda, B, ldb, beta, C, ldc); + GEMM_WARP(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, + transB == false ? 
CblasNoTrans : CblasTrans, M, N, K, alpha, A, + lda, B, ldb, beta, C, ldc); } template diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc index 078dd448c385dbb8a00025ee2ba08d0c41a4730a..2343e0ee965303c9fdb2ad3faf9ddf6e5bb7782f 100644 --- a/paddle/fluid/operators/math/math_function_test.cc +++ b/paddle/fluid/operators/math/math_function_test.cc @@ -228,3 +228,57 @@ TEST(math_funciton, set_constant) { } delete ctx; } + +template +void GemmWarpTest(int m, int n, int k, T alpha, T beta) { + paddle::framework::Tensor mat_a; + paddle::framework::Tensor mat_b; + paddle::framework::Tensor mat_c_ref; + paddle::framework::Tensor mat_c_mkl; + auto* cpu_place = new paddle::platform::CPUPlace(); + + T* A = mat_a.mutable_data({m, k}, *cpu_place); + T* B = mat_b.mutable_data({k, n}, *cpu_place); + T* CREF = mat_c_ref.mutable_data({m, n}, *cpu_place); + T* CMKL = mat_c_mkl.mutable_data({m, n}, *cpu_place); + + ASSERT_EQ(mat_c_mkl.numel(), mat_c_ref.numel()); + for (int i = 0; i < mat_a.numel(); ++i) { + A[i] = static_cast(i); + } + for (int i = 0; i < mat_b.numel(); ++i) { + B[i] = static_cast(i + 1); + } + for (int i = 0; i < mat_c_ref.numel(); ++i) { + CREF[i] = static_cast(i + 2); + CMKL[i] = CREF[i]; + } + + // this would call gemm_warp + paddle::platform::CPUDeviceContext context(*cpu_place); + GetBlas(context).GEMM(CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, B, + beta, CREF); + + // lda,ldb,ldc follow RowMajor + int lda = k; + int ldb = n; + int ldc = n; + paddle::operators::math::CBlas::GEMM(CblasRowMajor, CblasNoTrans, + CblasNoTrans, m, n, k, alpha, A, lda, + B, ldb, beta, CMKL, ldc); + + for (int i = 0; i < mat_c_mkl.numel(); ++i) { + EXPECT_FLOAT_EQ(CREF[i], CMKL[i]); + } +} + +TEST(math_function, gemm_warp) { + GemmWarpTest(3, 2, 5, 1.f, 0.f); + GemmWarpTest(3, 2, 5, 2.f, 1.f); + GemmWarpTest(8, 5, 6, 1.f, 0.f); + GemmWarpTest(8, 5, 6, 2.f, 1.f); + GemmWarpTest(3, 2, 5, 1.0, 0.0); + GemmWarpTest(3, 2, 
5, 2.0, 1.0); + GemmWarpTest(8, 5, 6, 1.0, 0.0); + GemmWarpTest(8, 5, 6, 2.0, 1.0); +} diff --git a/paddle/fluid/operators/momentum_op.cc b/paddle/fluid/operators/momentum_op.cc index dcd73e3c3e40f80e07b73944d1f0cc57fea010d3..5f43c5810812260c4384349bdb709716c9a182f5 100644 --- a/paddle/fluid/operators/momentum_op.cc +++ b/paddle/fluid/operators/momentum_op.cc @@ -98,7 +98,7 @@ The update equations are as follows: $$ velocity = mu * velocity + gradient \\ if (use\_nesterov): \\ - param = param - gradient * learning\_rate + mu * velocity * learning\_rate \\ + param = param - (gradient + mu * velocity) * learning\_rate \\ else: \\ param = param - learning\_rate * velocity. \\ $$ diff --git a/paddle/fluid/operators/momentum_op.cu b/paddle/fluid/operators/momentum_op.cu index 5eb9d9950248bb50bb823f071c7fff0ddcc47234..a3932db1f3a50305d585cd3d5e86fa1b527df78b 100644 --- a/paddle/fluid/operators/momentum_op.cu +++ b/paddle/fluid/operators/momentum_op.cu @@ -30,7 +30,7 @@ __global__ void MomentumKernel(const T* p, const T* g, const T* v, T g_val = g[i]; T v_new = v[i] * mu + g_val; v_out[i] = v_new; - p_out[i] = p[i] - (g_val - v_new * mu) * lr; + p_out[i] = p[i] - (g_val + v_new * mu) * lr; } } else { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; diff --git a/paddle/fluid/operators/momentum_op.h b/paddle/fluid/operators/momentum_op.h index 04a1929b84a93af6465bacfe7974a1530296946d..264726040fb566a52b8c0cdee0a1524197d2a675 100644 --- a/paddle/fluid/operators/momentum_op.h +++ b/paddle/fluid/operators/momentum_op.h @@ -46,7 +46,7 @@ class MomentumOpKernel : public framework::OpKernel { v_out = v * mu + g; if (use_nesterov) { - p_out = p - (g - v_out * mu) * lr[0]; + p_out = p - (g + v_out * mu) * lr[0]; } else { p_out = p - lr[0] * v_out; } diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt index 9dbcc35e6f5bb01c159980a49dd4b4c9d37d2aab..728197377df04df8c993a48bc282431473fe9959 100644 --- 
a/paddle/fluid/operators/reader/CMakeLists.txt +++ b/paddle/fluid/operators/reader/CMakeLists.txt @@ -15,12 +15,13 @@ function(reader_library TARGET_NAME) PARENT_SCOPE) endfunction() -reader_library(open_files_op SRCS open_files_op.cc) +cc_library(buffered_reader SRCS buffered_reader.cc DEPS reader simple_threadpool) +reader_library(open_files_op SRCS open_files_op.cc DEPS buffered_reader) reader_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc) reader_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc) reader_library(create_batch_reader_op SRCS create_batch_reader_op.cc) reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_op.cc) -reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc) +reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc DEPS buffered_reader) reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc) reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc) reader_library(create_py_reader_op SRCS create_py_reader_op.cc) diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc new file mode 100644 index 0000000000000000000000000000000000000000..ba1b3d3e583868c5c360d4766e14c15538b22b33 --- /dev/null +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reader/buffered_reader.h" +#include + +namespace paddle { +namespace operators { +namespace reader { +BufferedReader::~BufferedReader() { reader_->Shutdown(); } +BufferedReader::BufferedReader( + const std::shared_ptr &reader, + const platform::Place &place, size_t buffer_size) + : framework::DecoratedReader(reader), + thread_pool_(1), + place_(place), + buffer_size_(buffer_size) { + cpu_buffer_.resize(buffer_size); + gpu_buffer_.resize(buffer_size); + ReadTillBufferFullAsync(); +} +void BufferedReader::ReadTillBufferFullAsync() { + PADDLE_ENFORCE_EQ(position_.size(), 0U); + for (size_t i = 0; i < buffer_size_; ++i) { + ReadAsync(i); + } +} +void BufferedReader::ReadAsync(size_t i) { + position_.emplace(thread_pool_.enqueue([this, i]() -> size_t { + TensorVec &cpu = cpu_buffer_[i]; + reader_->ReadNext(&cpu); + + if (cpu.empty()) { + return -1UL; + } + + if (platform::is_gpu_place(place_)) { + TensorVec &gpu = gpu_buffer_[i]; + gpu.resize(cpu.size()); + for (size_t i = 0; i < cpu.size(); ++i) { + framework::TensorCopySync(cpu[i], place_, &gpu[i]); + gpu[i].set_lod(cpu[i].lod()); + } + } + return i; + })); +} +void BufferedReader::ShutdownImpl() { + reader_->Shutdown(); + while (!position_.empty()) { + position_.pop(); + } + prev_pos_ = -1UL; +} +void BufferedReader::StartImpl() { + reader_->Start(); + ReadTillBufferFullAsync(); +} +void BufferedReader::ReadNextImpl(std::vector *out) { + if (position_.empty()) { + out->clear(); + return; + } + size_t i = position_.front().get(); + position_.pop(); + + if (i == -1UL) { + ReadNextImpl(out); + return; + } + + *out = platform::is_gpu_place(place_) ? gpu_buffer_[i] : cpu_buffer_[i]; + + // Do not push current position into ReadAsync. Push the previous position + // Since all computation in fluid are async, change the data of + // current position may cause data error. 
+ if (prev_pos_ != -1Ul) { + ReadAsync(prev_pos_); + } + prev_pos_ = i; +} + +} // namespace reader +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h new file mode 100644 index 0000000000000000000000000000000000000000..cbe2bc1b5fdd69d1a843b768e3289acd621369a6 --- /dev/null +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -0,0 +1,66 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "ThreadPool.h" +#include "paddle/fluid/framework/reader.h" + +namespace paddle { +namespace operators { +namespace reader { + +class BufferedReader : public framework::DecoratedReader { + using TensorVec = std::vector; + using VecFuture = std::future; + + public: + BufferedReader(const std::shared_ptr& reader, + const platform::Place& place, size_t buffer_size); + + ~BufferedReader() override; + + private: + void ReadTillBufferFullAsync(); + + void ReadAsync(size_t i); + + protected: + void ShutdownImpl() override; + void StartImpl() override; + void ReadNextImpl(std::vector* out) override; + + private: + ThreadPool thread_pool_; + platform::Place place_; + const size_t buffer_size_; + + std::queue> position_; + + // The buffer for reading data. 
+ // NOTE: the simplest way to implement buffered reader is do not use any + // buffer, just read async and create futures as buffer size. However, to + // malloc tensors every time is extremely slow. Here we store all data in + // buffers and prevent alloc every time. + std::vector cpu_buffer_; + std::vector gpu_buffer_; + size_t prev_pos_{-1UL}; +}; + +} // namespace reader +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 7b14370f4fd64e8fd5b8d9038006494b88d671dc..ed719f91d0980480aa62a5cd3c1f819e6c0e7475 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -12,83 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include // NOLINT - -#include "paddle/fluid/operators/reader/blocking_queue.h" +#include "paddle/fluid/operators/reader/buffered_reader.h" #include "paddle/fluid/operators/reader/reader_op_registry.h" namespace paddle { namespace operators { namespace reader { - -// 'Double buffer' means we shall maintain two batches of input data at the same -// time. So the kCacheSize shoul be at least 2. -static constexpr size_t kCacheSize = 3; -// There will be two bacthes out of the channel during training: -// 1. the one waiting to be sent to the channel -// 2. the one just be received from the channel, which is also being used by -// subsequent operators. 
-// So the channel size should be kChacheSize - 2 -static constexpr size_t kChannelSize = 1; // kCacheSize - 2 - -class DoubleBufferReader : public framework::DecoratedReader { - public: - explicit DoubleBufferReader( - const std::shared_ptr& reader, - platform::Place target_place = platform::CPUPlace()) - : DecoratedReader(reader), place_(target_place) { - cpu_tensor_cache_.resize(kCacheSize); - gpu_tensor_cache_.resize(kCacheSize); -#ifdef PADDLE_WITH_CUDA - if (platform::is_gpu_place(place_)) { - for (size_t i = 0; i < kCacheSize; ++i) { - ctxs_.emplace_back(new platform::CUDADeviceContext( - boost::get(place_))); - } - } -#endif - StartPrefetcher(); - } - - void ReadNextImpl(std::vector* out) override; - - ~DoubleBufferReader() { EndPrefetcher(); } - - private: - void ShutdownImpl() override { - EndPrefetcher(); - reader_->Shutdown(); - } - - void StartImpl() override { - reader_->Start(); - StartPrefetcher(); - } - - void StartPrefetcher() { - channel_ = new reader::BlockingQueue(kChannelSize); - prefetcher_ = std::thread([this] { PrefetchThreadFunc(); }); - } - - void EndPrefetcher() { - channel_->Close(); - if (prefetcher_.joinable()) { - prefetcher_.join(); - } - delete channel_; - channel_ = nullptr; - } - - void PrefetchThreadFunc(); - - std::thread prefetcher_; - reader::BlockingQueue* channel_; - platform::Place place_; - std::vector> cpu_tensor_cache_; - std::vector> gpu_tensor_cache_; - std::vector> ctxs_; -}; - class CreateDoubleBufferReaderOp : public framework::OperatorBase { public: using framework::OperatorBase::OperatorBase; @@ -118,8 +47,8 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { place = platform::CUDAPlace(static_cast(num)); } - out->Reset(framework::MakeDecoratedReader( - underlying_reader, place)); + out->Reset(framework::MakeDecoratedReader(underlying_reader, + place, 2)); } }; @@ -146,51 +75,6 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase { } }; -void 
DoubleBufferReader::ReadNextImpl(std::vector* out) { - size_t cached_tensor_id; - if (channel_->Receive(&cached_tensor_id)) { - if (platform::is_gpu_place(place_)) { - *out = gpu_tensor_cache_[cached_tensor_id]; - } else { - // CPU place - *out = cpu_tensor_cache_[cached_tensor_id]; - } - } else { - out->clear(); - } -} - -void DoubleBufferReader::PrefetchThreadFunc() { - VLOG(5) << "A new prefetch thread starts."; - size_t cached_tensor_id = 0; - while (true) { - auto& cpu_batch = cpu_tensor_cache_[cached_tensor_id]; - reader_->ReadNext(&cpu_batch); - if (cpu_batch.empty()) { - // The underlying reader have no next data. - break; - } - if (platform::is_gpu_place(place_)) { - auto& gpu_batch = gpu_tensor_cache_[cached_tensor_id]; - gpu_batch.resize(cpu_batch.size()); - for (size_t i = 0; i < cpu_batch.size(); ++i) { - // TODO(fengjiayi): Use asynchronous TensorCopy instead - framework::TensorCopySync(cpu_batch[i], place_, &gpu_batch[i]); - gpu_batch[i].set_lod(cpu_batch[i].lod()); - } - } - if (!channel_->Send(cached_tensor_id)) { - VLOG(5) << "WARNING: The double buffer channel has been closed. 
The " - "prefetch thread will terminate."; - break; - } - ++cached_tensor_id; - cached_tensor_id %= kCacheSize; - } - channel_->Close(); - VLOG(5) << "Prefetch thread terminates."; -} - } // namespace reader } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc index 833776f56eef0ffb2ae5e963919f0482bcd511b8..0f31ca1a94326956ae5e6dffd582daedeb55a9e3 100644 --- a/paddle/fluid/operators/reader/create_py_reader_op.cc +++ b/paddle/fluid/operators/reader/create_py_reader_op.cc @@ -33,6 +33,8 @@ class PyReader : public framework::FileReader { if (!success) out->clear(); } + ~PyReader() { queue_->Close(); } + void Shutdown() override { queue_->Close(); } void Start() override { queue_->ReOpen(); } diff --git a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc index b32f09b22524c8b67ce57cc6022ef46efc2e828d..a08a9dbd0da46e73082cdd24c019e8d210d8bcc4 100644 --- a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc +++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc @@ -33,11 +33,14 @@ class RecordIOFileReader : public framework::FileReader { protected: void ReadNextImpl(std::vector* out) override { + std::unique_ptr> guard; if (ThreadSafe) { - std::lock_guard guard(*mutex_); - *out = framework::ReadFromRecordIO(&scanner_, dev_ctx_); - } else { - *out = framework::ReadFromRecordIO(&scanner_, dev_ctx_); + guard.reset(new std::lock_guard(*mutex_)); + } + + bool ok = framework::ReadFromRecordIO(&scanner_, dev_ctx_, out); + if (!ok) { + out->clear(); } } diff --git a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc index 4b308abc290c10a8a5846672e719b503dfc79b21..3f72890a7cee1453585d50afa04fa62a9b059dc3 100644 --- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc +++ 
b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc @@ -48,9 +48,9 @@ class ShuffleReader : public framework::DecoratedReader { private: void ShutdownImpl() override { + reader_->Shutdown(); buffer_.clear(); iteration_pos_ = 0; - reader_->Shutdown(); } void StartImpl() override { diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc index 9a8d203672fa2d560440d063d93fa5f8523690ef..38223e069975a08791d58d6ae10e2112b79a61fe 100644 --- a/paddle/fluid/operators/reader/open_files_op.cc +++ b/paddle/fluid/operators/reader/open_files_op.cc @@ -12,150 +12,200 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include #include // NOLINT - +#include "ThreadPool.h" +#include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/operators/reader/blocking_queue.h" +#include "paddle/fluid/operators/reader/buffered_reader.h" #include "paddle/fluid/operators/reader/reader_op_registry.h" namespace paddle { namespace operators { namespace reader { -class MultiFileReader : public framework::ReaderBase { +class IReaderContainer { public: - MultiFileReader(const std::vector& file_names, size_t thread_num, - size_t buffer_size) - : buffer_size_(buffer_size) { - readers_.reserve(file_names.size()); - for (const std::string& f_name : file_names) { - readers_.emplace_back(CreateReaderByFileName(f_name)); + virtual ~IReaderContainer() {} + virtual void AppendReader( + std::unique_ptr&& readers) = 0; + virtual void Stop() = 0; + virtual void Start() = 0; + virtual void ReadNext(std::vector* out) = 0; +}; + +class OrderedReaderContainer : public IReaderContainer { + public: + void AppendReader(std::unique_ptr&& reader) override { + pending_.emplace(std::move(reader)); + } + + void Stop() override { + while (!pending_.empty()) { + MoveFrontPendingToDone(); } - prefetchers_.resize(thread_num); - StartNewScheduler(); } - void ReadNextImpl(std::vector* out) 
override; + void Start() override { std::swap(done_, pending_); } - ~MultiFileReader() { EndScheduler(); } + void ReadNext(std::vector* out) override { + if (!pending_.empty()) { + pending_.front()->ReadNext(out); + if (out->empty()) { + MoveFrontPendingToDone(); + ReadNext(out); + } + } else { + out->clear(); + } + } private: - void ShutdownImpl() override { EndScheduler(); } - - void StartImpl() override { StartNewScheduler(); } - - void StartNewScheduler(); - void EndScheduler(); - void ScheduleThreadFunc(); - void PrefetchThreadFunc(size_t reader_idx, size_t thread_idx); - - std::vector> readers_; - std::thread scheduler_; - std::vector prefetchers_; - size_t buffer_size_; - reader::BlockingQueue* waiting_reader_idx_; - reader::BlockingQueue* available_thread_idx_; - reader::BlockingQueue>* buffer_; + void MoveFrontPendingToDone() { + pending_.front()->Shutdown(); + pending_.front()->Start(); + done_.emplace(move(pending_.front())); + pending_.pop(); + } + + std::queue> pending_; + std::queue> done_; }; -void MultiFileReader::ReadNextImpl(std::vector* out) { - if (!buffer_->Receive(out)) { - out->clear(); - } -} +class PreemptiveReaderContainer : public IReaderContainer { + using ReaderList = std::list>; -void MultiFileReader::StartNewScheduler() { - size_t thread_num = prefetchers_.size(); - waiting_reader_idx_ = new reader::BlockingQueue(readers_.size()); - available_thread_idx_ = new reader::BlockingQueue(thread_num); - buffer_ = new reader::BlockingQueue>( - buffer_size_); + struct FutureItem { + std::vector data_; + ReaderList::iterator reader_it_; + std::exception_ptr exception_; + }; - for (size_t i = 0; i < readers_.size(); ++i) { - waiting_reader_idx_->Send(i); - } - waiting_reader_idx_->Close(); - for (size_t i = 0; i < thread_num; ++i) { - available_thread_idx_->Send(i); - } + using FutureList = std::list>; - scheduler_ = std::thread([this] { ScheduleThreadFunc(); }); -} + public: + explicit PreemptiveReaderContainer(size_t thread_num) : 
pool_(thread_num) {} -void MultiFileReader::EndScheduler() { - available_thread_idx_->Close(); - buffer_->Close(); - waiting_reader_idx_->Close(); - if (scheduler_.joinable()) { - scheduler_.join(); - } - delete buffer_; - delete available_thread_idx_; - delete waiting_reader_idx_; -} - -void MultiFileReader::ScheduleThreadFunc() { - VLOG(5) << "MultiFileReader schedule thread starts."; - size_t completed_thread_num = 0; - size_t thread_idx; - while (available_thread_idx_->Receive(&thread_idx)) { - std::thread& prefetcher = prefetchers_[thread_idx]; - if (prefetcher.joinable()) { - prefetcher.join(); - } - size_t reader_idx; - if (waiting_reader_idx_->Receive(&reader_idx)) { - // Still have files to read. Start a new prefetch thread. - prefetcher = std::thread([this, reader_idx, thread_idx] { - PrefetchThreadFunc(reader_idx, thread_idx); - }); - } else { - // No more file to read. - ++completed_thread_num; - if (completed_thread_num == prefetchers_.size()) { - buffer_->Close(); - break; + void Stop() override { + if (!pending_.empty()) { + for (auto& reader : pending_) { + reader->Shutdown(); + } + for (auto& fu : futures_) { + fu.wait(); } + futures_.clear(); + for (auto& reader : pending_) { + reader->Start(); + done_.emplace_back(std::move(reader)); + } + pending_.clear(); + bool timeout; + complete_queue_.PopAll(1000, &timeout); + PADDLE_ENFORCE(!timeout); } } - // If users invoke Shutdown() when scheduler is running, it will close the - // 'avaiable_thread_idx_' and prefecther threads have no way to tell scheduler - // to release their resource. So a check is needed before scheduler ends. 
- for (auto& p : prefetchers_) { - if (p.joinable()) { - p.join(); + + void Start() override { + for (auto& reader : done_) { + AppendReader(std::move(reader)); } + done_.clear(); } - VLOG(5) << "MultiFileReader schedule thread terminates."; -} - -void MultiFileReader::PrefetchThreadFunc(size_t reader_idx, size_t thread_idx) { - VLOG(5) << "The prefetch thread of file idx '" << reader_idx << "' starts."; - std::unique_ptr& reader = readers_[reader_idx]; - while (true) { - std::vector ins; - reader->ReadNext(&ins); - if (ins.empty()) { - reader->Shutdown(); - reader->Start(); - break; + + void ReadNext(std::vector* out) override { + if (!pending_.empty()) { + auto future_it = complete_queue_.Pop(); + FutureItem item = future_it->get(); + if (item.exception_) { + for (auto it = futures_.begin(); it != futures_.end(); ++it) { + if (it != future_it) { + it->wait(); // Wait all other threads complete. + } + } + std::rethrow_exception(item.exception_); + + } else if (item.data_.empty()) { // reader done. + done_.emplace_back(std::move(*item.reader_it_)); + pending_.erase(item.reader_it_); + futures_.erase(future_it); + ReadNext(out); + } else { + *out = item.data_; + // continue read async + ReadAsync(item.reader_it_, &future_it); + } + } else { + out->clear(); } - try { - buffer_->Send(std::move(ins)); - } catch (paddle::platform::EnforceNotMet e) { - VLOG(5) << "WARNING: The buffer channel has been closed. 
The prefetch " - "thread of file idx '" - << reader_idx << "' will terminate."; - break; + } + + private: + void AppendReader(std::unique_ptr&& reader) override { + pending_.emplace_back(std::move(reader)); + auto reader_it = pending_.end(); + --reader_it; + + futures_.emplace_back(); + auto future_it = futures_.end(); + --future_it; + + ReadAsync(reader_it, &future_it); + } + + void ReadAsync(const ReaderList::iterator& reader_it, + FutureList::iterator* future_it_ptr) { + auto& future_it = *future_it_ptr; + *future_it = pool_.enqueue([reader_it, future_it, this] { + try { + FutureItem item; + item.reader_it_ = reader_it; + (*reader_it)->ReadNext(&item.data_); + if (item.data_.empty()) { + (*reader_it)->Shutdown(); + (*reader_it)->Start(); + } + complete_queue_.Push(future_it); + return item; + } catch (...) { + FutureItem item; + item.exception_ = std::current_exception(); + complete_queue_.Push(future_it); + return item; + } + }); + } + + FutureList futures_; + ThreadPool pool_; + framework::BlockingQueue complete_queue_; + std::list> pending_; + std::list> done_; +}; + +class MultiFileReader : public framework::ReaderBase { + public: + MultiFileReader(const std::vector& file_names, + std::unique_ptr&& container) + : container_(std::move(container)) { + for (auto& fn : file_names) { + container_->AppendReader(CreateReaderByFileName(fn)); } } - if (!available_thread_idx_->Send(thread_idx)) { - VLOG(5) << "WARNING: The available_thread_idx_ channel has been closed. 
" - "Fail to send thread_idx."; + ~MultiFileReader() { container_->Stop(); } + + protected: + void ReadNextImpl(std::vector* out) override { + container_->ReadNext(out); } - VLOG(5) << "The prefetch thread of file idx '" << reader_idx - << "' terminates."; -} + void ShutdownImpl() override { container_->Stop(); } + void StartImpl() override { container_->Start(); } + + private: + std::unique_ptr container_; +}; class OpenFilesOp : public framework::OperatorBase { public: @@ -173,13 +223,27 @@ class OpenFilesOp : public framework::OperatorBase { "shape concat's length."); const auto& file_names = Attr>("file_names"); PADDLE_ENFORCE(!file_names.empty(), "No file to be read!"); - const size_t thread_num = Attr("thread_num"); - const size_t buffer_size = Attr("buffer_size"); + bool is_test = Attr("is_test"); auto* out = scope.FindVar(Output("Out")) ->template GetMutable(); - out->Reset( - std::make_shared(file_names, thread_num, buffer_size)); + std::unique_ptr container; + + if (is_test) { + container.reset(new OrderedReaderContainer()); + } else { + container.reset(new PreemptiveReaderContainer( + static_cast(Attr("thread_num")))); + } + + std::shared_ptr reader( + new MultiFileReader(file_names, std::move(container))); + auto buffer_size = Attr("buffer_size"); + if (buffer_size > 1) { + reader = framework::MakeDecoratedReader( + reader, platform::CPUPlace(), buffer_size); + } + out->Reset(reader); } }; @@ -187,9 +251,7 @@ class OpenFilesOpMaker : public FileReaderMakerBase { protected: void Apply() override { AddAttr>("file_names", "Files to be read."); - AddAttr("thread_num", "The maximal concurrent prefetch thread number.") - .GreaterThan(0); - AddAttr("buffer_size", "The size of prefetch buffer.").GreaterThan(0); + AddAttr("is_test", "Used for testing data.").SetDefault(false); AddComment(R"DOC( OpenFiles Operator @@ -197,6 +259,11 @@ class OpenFilesOpMaker : public FileReaderMakerBase { An OpenFilesOp creates a MultiFileReader, which is able to read data 
multi-threaded from multiple files. )DOC"); + AddAttr("thread_num", + "The maximal concurrent prefetch thread number. Used only " + "when is_test = False"); + AddAttr("buffer_size", "The reading buffer of these files.") + .GreaterThan(0); } }; diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc index 647cfc0a0af2be85e2868c6f68cab962c6631a8d..43672d6db92a981f0fbe6e8f7079dafc6ae4052e 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt_engine_op.cc @@ -24,6 +24,9 @@ #include "paddle/fluid/operators/tensorrt_engine_op.h" namespace paddle { + +DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT"); + namespace operators { using inference::Singleton; @@ -52,7 +55,6 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector &shape) { "TensorRT' tensor input requires at least 2 dimensions"); PADDLE_ENFORCE_LE(shape.size(), 4UL, "TensorRT' tensor input requires at most 4 dimensions"); - switch (shape.size()) { case 2: return nvinfer1::Dims2(shape[0], shape[1]); @@ -90,27 +92,36 @@ void TensorRTEngineKernel::Prepare( engine->InitNetwork(); framework::BlockDesc block(nullptr /*programdesc*/, &block_desc); + VLOG(4) << "parsed var size " << block.AllVars().size(); // Add inputs VLOG(4) << "declare inputs"; for (auto &input : context.Inputs("Xs")) { VLOG(4) << "declare input " << input; auto *var = block.FindVar(input); + // TensorRT engine need to create parameters. The parameter's description + // should be set in + PADDLE_ENFORCE(var, "no variable called %s", input); PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR, "TensorRT engine only takes LoDTensor as input"); auto shape = var->GetShape(); + // For the special batch_size placeholder -1, drop it and pass the real + // shape of data. + // TODO(Superjomn) fix this with batch broadcast, or it can't handle + // variational batch size. 
+ if (shape[0] == -1) { + shape[0] = FLAGS_tensorrt_engine_batch_size; + } engine->DeclareInput( input, FluidDataType2TRT( var->Proto()->type().lod_tensor().tensor().data_type()), - Vec2TRT_Dims(var->GetShape())); + Vec2TRT_Dims(shape)); } inference::Singleton::Global().ConvertBlock( block_desc, parameters, context.scope(), engine); // Add outputs - VLOG(4) << "declare outputs"; for (auto &output : context.Outputs("Ys")) { - VLOG(4) << "declare output " << output; engine->DeclareOutput(output); } @@ -151,4 +162,7 @@ REGISTER_OP_CPU_KERNEL( ops::TensorRTEngineKernel, ops::TensorRTEngineKernel); +// A trick to compile with the needed TensorRT op converter. +USE_TRT_CONVERTER(mul) + #endif // PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index 1602a913aeebe43fabe2f9c9036edd18ac4c70fd..a332d70030ffa6a033f6b2b33487a4fd279b7016 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -24,6 +24,9 @@ #include "paddle/fluid/inference/tensorrt/engine.h" namespace paddle { + +DECLARE_int32(tensorrt_engine_batch_size); + namespace operators { using inference::Singleton; @@ -53,7 +56,6 @@ template class TensorRTEngineKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - VLOG(4) << "TensorRTEngineKernel executing"; auto engine_name = context.Attr("engine_uniq_key"); if (!Singleton::Global().HasEngine(engine_name)) { Prepare(context); @@ -61,11 +63,8 @@ class TensorRTEngineKernel : public framework::OpKernel { auto* engine = Singleton::Global().Get(engine_name); auto input_names = context.op().Inputs("Xs"); PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs"); - // Try to determine a batch_size - auto& tensor0 = inference::analysis::GetFromScope( - context.scope(), input_names.front()); - int batch_size = tensor0.dims()[0]; - PADDLE_ENFORCE_LE(batch_size, 
context.Attr("max_batch")); + PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, + context.Attr("max_batch")); // Convert input tensor from fluid to engine. for (const auto& x : context.Inputs("Xs")) { @@ -81,8 +80,8 @@ class TensorRTEngineKernel : public framework::OpKernel { } } // Execute the engine. - PADDLE_ENFORCE_GT(batch_size, 0); - engine->Execute(batch_size); + PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0); + engine->Execute(FLAGS_tensorrt_engine_batch_size); // Convert output tensor from engine to fluid for (const auto& y : context.Outputs("Ys")) { // convert output and copy to fluid. @@ -94,18 +93,21 @@ class TensorRTEngineKernel : public framework::OpKernel { auto* fluid_v = context.scope().FindVar(y); PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y); auto* fluid_t = fluid_v->GetMutable(); - fluid_t->Resize(framework::make_ddim(ddim)); auto size = inference::analysis::AccuDims(dims.d, dims.nbDims); - if (platform::is_cpu_place(fluid_t->place())) { - // TODO(Superjomn) change this float to dtype size. - engine->GetOutputInCPU( - y, fluid_t->mutable_data(platform::CPUPlace()), - size * sizeof(float)); - } else { - engine->GetOutputInGPU( - y, fluid_t->mutable_data(platform::CUDAPlace()), - size * sizeof(float)); - } + fluid_t->Resize(framework::make_ddim(ddim)); + + // TODO(Superjomn) find some way to determine which device to output the + // tensor. + // if (platform::is_cpu_place(fluid_t->place())) { + // TODO(Superjomn) change this float to dtype size. 
+ engine->GetOutputInCPU(y, + fluid_t->mutable_data(platform::CPUPlace()), + size * sizeof(float)); + //} else { + // engine->GetOutputInGPU( + // y, fluid_t->mutable_data(platform::CUDAPlace()), + // size * sizeof(float)); + //} } cudaStreamSynchronize(*engine->stream()); diff --git a/paddle/fluid/recordio/scanner.cc b/paddle/fluid/recordio/scanner.cc index 06a13e6c5b6ea76456e231e3f7b1eb33492b16ea..a0a2f984228db0e7a015630655a3176aa4d1a5a4 100644 --- a/paddle/fluid/recordio/scanner.cc +++ b/paddle/fluid/recordio/scanner.cc @@ -28,6 +28,7 @@ Scanner::Scanner(std::unique_ptr &&stream) Scanner::Scanner(const std::string &filename) : stream_(new std::ifstream(filename)), parser_(*stream_) { + PADDLE_ENFORCE(static_cast(*stream_), "Cannot open file %s", filename); Reset(); } diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 9837117fbbc0b4218dc89fc5de0c176abcd32b83..9e58a39eb0939fa15e9c19e1e6fc89a6f99d9a0c 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -333,8 +333,7 @@ function assert_api_not_changed() { python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec deactivate - # Use git diff --name-only HEAD^ may not get file changes for update commits in one PR - API_CHANGE=`echo $CHANGED_FILES | grep "paddle/fluid/API.spec" || true` + API_CHANGE=`git diff --name-only upstream/develop | grep "paddle/fluid/API.spec" || true` echo "checking API.spec change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}" if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then # TODO: curl -H 'Authorization: token ${TOKEN}' @@ -600,11 +599,11 @@ function main() { cicheck) cmake_gen ${PYTHON_ABI:-""} build - assert_api_not_changed run_test gen_capi_package gen_fluid_inference_lib test_fluid_inference_lib + assert_api_not_changed ;; *) print_usage diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 
782aa933f2ee86274e800045c9356d8072915fc1..79a11a30d602cb33c2583873e0d0f2e15e0fcb8c 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -25,9 +25,6 @@ import numpy __all__ = [ 'split_lod_tensor', 'merge_lod_tensor', - 'BlockGuard', - 'BlockGuardWithCompletion', - 'WhileGuard', 'While', 'Switch', 'lod_rank_table', diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 0665c09bfb52c932219be68ca801cfa951d672d3..07a7ef15ac5cfd39b4cdb30bcff95fc499ae50ae 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -12,14 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. import contextlib +import multiprocessing +import threading -from .. import core -from ..framework import convert_np_dtype_to_dtype_, default_main_program, default_startup_program, Program -from ..unique_name import generate as unique_name +from ..data_feeder import DataFeeder from control_flow import BlockGuard -from ..layer_helper import LayerHelper +from layer_function_generator import templatedoc +from .. 
import core from ..executor import global_scope -from layer_function_generator import generate_layer_fn, templatedoc +from ..framework import convert_np_dtype_to_dtype_, default_main_program, \ + default_startup_program, program_guard, Program +from ..layer_helper import LayerHelper +from ..unique_name import generate as unique_name __all__ = [ 'data', 'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch', @@ -445,7 +449,12 @@ def random_data_generator(low, high, shapes, lod_levels, for_parallel=True): return monkey_patch_reader_methods(main_prog_var) -def py_reader(capacity, shapes, dtypes, lod_levels=None): +def py_reader(capacity, + shapes, + dtypes, + lod_levels=None, + name=None, + use_double_buffer=True): """ Create a reader and blocking queue for data feeding in Python @@ -458,10 +467,13 @@ def py_reader(capacity, shapes, dtypes, lod_levels=None): using `close()` method when unused. Args: + use_double_buffer(bool): Whether use double buffer or not. capacity(int): The maximum capacity of the BlockingQueue. - shapes(list): List of tuples which declaring data shapes. - dtypes(list): List of strs which declaring data type. - lod_levels(list): List of ints which declaring data lod_level. + shapes(list|tuple): List of tuples which declaring data shapes. + dtypes(list|tuple): List of strs which declaring data type. + lod_levels(list|tuple): List of ints which declaring data lod_level. + name(basestring): The prefix Python queue name and Reader name. None will + be generated automatically. 
Returns: tuple(Variable, BlockingQueue): @@ -502,15 +514,23 @@ def py_reader(capacity, shapes, dtypes, lod_levels=None): if lod_levels is None: lod_levels = [0] * len(shapes) - queue_name = unique_name('lod_tensor_blocking_queue') + if name is None: + queue_name = unique_name('lod_tensor_blocking_queue') + reader_name = unique_name('create_py_reader') + double_buffer_name = unique_name('double_buffer') + else: + queue_name = "_".join([name, "queue"]) + reader_name = "_".join([name, "reader"]) + double_buffer_name = "_".join([name, "double_buffer"]) + var = global_scope().var(queue_name) feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes) startup_blk = default_startup_program().current_block() - startup_var = startup_blk.create_var(name=unique_name('create_py_reader')) + startup_var = startup_blk.create_var(name=reader_name) startup_blk.append_op( type='create_py_reader', - inputs={'blocking_queue': queue_name}, + inputs={'blocking_queue': [queue_name]}, outputs={'Out': [startup_var]}, attrs={ 'shape_concat': shape_concat, @@ -524,17 +544,96 @@ def py_reader(capacity, shapes, dtypes, lod_levels=None): main_prog_var = _copy_reader_var_(default_main_program().current_block(), startup_var) - return monkey_patch_reader_methods(main_prog_var), feed_queue + reader = monkey_patch_reader_methods(main_prog_var) + if use_double_buffer: + double_buffer_reader = double_buffer(reader, name=double_buffer_name) + # we return a double buffer reader. However, the reset method comes from + # py_reader. 
+ double_buffer_reader.reset = reader.reset + reader = double_buffer_reader + + # monkey patch py_reader special methods + reader.queue = feed_queue + current_reset_method = reader.reset + reader.thread = None + reader.tensor_provider = None + reader.exited = False + + def start_provide_thread(func): + def __provider_thread__(): + for tensors in func(): + array = core.LoDTensorArray() + for item in tensors: + if not isinstance(item, core.LoDTensor): + tmp = core.LoDTensor() + tmp.set(item, core.CPUPlace()) + item = tmp + + array.append(item) + + if reader.exited: + break + feed_queue.push(array) + if reader.exited: + break + feed_queue.close() + + reader.thread = threading.Thread(target=__provider_thread__) + reader.thread.start() + + def __set_tensor_provider__(func): + reader.tensor_provider = func + + def __set_paddle_reader__(paddle_reader): + with program_guard(Program(), Program()): + feed_list = [] + counter = 0 + for dtype, shape, lod_level in zip(dtypes, shapes, lod_levels): + name = str(counter) + feed_list.append( + data( + name=name, + dtype=dtype, + shape=shape, + lod_level=lod_level)) + counter += 1 + + feeder = DataFeeder(feed_list=feed_list, place=core.CPUPlace()) + paddle_reader = feeder.decorate_reader( + paddle_reader, multi_devices=False) + + def __tensor_provider__(): + for slots in paddle_reader(): + yield [slots[str(idx)] for idx in xrange(counter)] + + __set_tensor_provider__(__tensor_provider__) + + def __reset__(): + current_reset_method() + if reader.thread is not None and reader.tensor_provider is not None: + reader.exited = True + reader.thread.join() + reader.exited = False + + def __start__(): + start_provide_thread(reader.tensor_provider) + + reader.reset = __reset__ + reader.decorate_tensor_provider = __set_tensor_provider__ + reader.decorate_paddle_reader = __set_paddle_reader__ + reader.start = __start__ + + return reader def open_files(filenames, shapes, lod_levels, dtypes, - thread_num=1, + thread_num=None, buffer_size=None, 
pass_num=1, - for_parallel=True): + is_test=None): """ Open files @@ -547,14 +646,14 @@ def open_files(filenames, shapes(list): List of tuples which declaring data shapes. lod_levels(list): List of ints which declaring data lod_level. dtypes(list): List of strs which declaring data type. - thread_num(int): The maximal concurrent prefetch thread number. - buffer_size(int|None): The size of prefetch buffer. If it is setted None, - buffer size will be thread_num * 3. - Default: None + thread_num(None): The number of thread to read files. + Default: min(len(filenames), cpu_number). + buffer_size(None): The buffer size of reader. Default: 3 * thread_num pass_num(int): Number of passes to run. - for_parallel(Bool): Set it as True if you are going to run - subsequent operators in parallel. - Default: True + is_test(bool|None): Whether `open_files` used for testing or not. If it + is used for testing, the order of data generated is same as the file + order. Otherwise, it is not guaranteed the order of data is same + between every epoch. [Default: False]. Returns: Variable: A Reader Variable via which we can get file data. 
@@ -566,15 +665,21 @@ def open_files(filenames, './data2.recordio'], shapes=[(3,224,224), (1)], lod_levels=[0, 0], - dtypes=['float32', 'int64'], - thread_num=2, - buffer_size=2) + dtypes=['float32', 'int64']) # Via the reader, we can use 'read_file' layer to get data: image, label = fluid.layers.io.read_file(reader) """ + if thread_num is None: + thread_num = min(len(filenames), multiprocessing.cpu_count()) + else: + thread_num = int(thread_num) + if buffer_size is None: - buffer_size = thread_num * 3 + buffer_size = 3 * thread_num + else: + buffer_size = int(buffer_size) + if isinstance(filenames, basestring): filenames = [filenames] dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] @@ -588,17 +693,18 @@ def open_files(filenames, multi_file_reader_name = unique_name('multi_file_reader') startup_blk = default_startup_program().current_block() startup_reader = startup_blk.create_var(name=multi_file_reader_name) + attrs = { + 'shape_concat': shape_concat, + 'lod_levels': lod_levels, + 'ranks': ranks, + 'file_names': filenames, + 'thread_num': thread_num, + 'buffer_size': buffer_size + } + if is_test is not None: + attrs['is_test'] = is_test startup_blk.append_op( - type='open_files', - outputs={'Out': [startup_reader]}, - attrs={ - 'shape_concat': shape_concat, - 'lod_levels': lod_levels, - 'ranks': ranks, - 'file_names': filenames, - 'thread_num': thread_num, - 'buffer_size': buffer_size - }) + type='open_files', outputs={'Out': [startup_reader]}, attrs=attrs) startup_reader.desc.set_dtypes(dtypes) startup_reader.persistable = True diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py index 194a16b123c441ac1318b8ce58158f67e2a8093d..e7d7a9e826de95514b6f2e04e7408075ab0b8cb6 100644 --- a/python/paddle/fluid/layers/metric_op.py +++ b/python/paddle/fluid/layers/metric_op.py @@ -114,23 +114,13 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1): prediction = network(image, is_infer=True) 
auc_out=fluid.layers.auc(input=prediction, label=label) """ - - warnings.warn( - "This interface is not recommended, fluid.layers.auc compute the auc at every minibatch, \ - but can not aggregate them and get the pass AUC, because pass \ - auc can not be averaged with weighted from the minibatch auc value. \ - Please use fluid.metrics.Auc, it can compute the auc value via Python natively, \ - which can get every minibatch and every pass auc value.", Warning) helper = LayerHelper("auc", **locals()) - topk_out = helper.create_tmp_variable(dtype=input.dtype) - topk_indices = helper.create_tmp_variable(dtype="int64") - topk_out, topk_indices = nn.topk(input, k=k) - auc_out = helper.create_tmp_variable(dtype="float32") + auc_out = helper.create_tmp_variable(dtype="float64") # make tp, tn, fp, fn persistable, so that can accumulate all batches. - tp = helper.create_global_variable(persistable=True) - tn = helper.create_global_variable(persistable=True) - fp = helper.create_global_variable(persistable=True) - fn = helper.create_global_variable(persistable=True) + tp = helper.create_global_variable(persistable=True, dtype='int64') + tn = helper.create_global_variable(persistable=True, dtype='int64') + fp = helper.create_global_variable(persistable=True, dtype='int64') + fn = helper.create_global_variable(persistable=True, dtype='int64') for var in [tp, tn, fp, fn]: helper.set_variable_initializer( var, Constant( @@ -139,8 +129,7 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1): helper.append_op( type="auc", inputs={ - "Out": [topk_out], - "Indices": [topk_indices], + "Predict": [input], "Label": [label], "TP": [tp], "TN": [tn], @@ -156,4 +145,4 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1): "FPOut": [fp], "FNOut": [fn] }) - return auc_out + return auc_out, [tp, tn, fp, fn] diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 
56124663929d1e33b7144ab57ae3b3c55e1652b3..ab40d0c217f565493b30d9a4cb3a600863122bc7 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -166,7 +166,8 @@ def fc(input, param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable parameters/weights of this layer. bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias - of this layer. If it is set to None, no bias will be added to the output units. + of this layer. If it is set to False, no bias will be added to the output units. + If it is set to None, the bias is initialized zero. Default: None. act (str, default None): Activation to be applied to the output of this layer. is_test(bool): A flag indicating whether execution is in test phase. use_mkldnn(bool): Use mkldnn kernel or not, it is valid only when the mkldnn diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index 17bb0826a6ea86c98a069263dfab84b99e1177ad..b37b09ac81687882443c948569d9c4fca9310f78 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -591,7 +591,7 @@ class Auc(MetricBase): for i in range(self._num_thresholds - 2)] thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon] - # caculate TP, FN, TN, FP count + # calculate TP, FN, TN, FP count for idx_thresh, thresh in enumerate(thresholds): tp, fn, tn, fp = 0, 0, 0, 0 for i, lbl in enumerate(labels): diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 7fc8e106fb43666be9c1ea245994dc1c7ac85d7d..3fe99f55011ab7f745c3ad98ec44dfe277a13e05 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -324,7 +324,7 @@ class MomentumOptimizer(Optimizer): & if (use\_nesterov): - &\quad param = param - gradient * learning\_rate + mu * velocity * learning\_rate + &\quad param = param - (gradient + mu * velocity) * learning\_rate & else: diff --git 
a/python/paddle/fluid/tests/demo/pyreader.py b/python/paddle/fluid/tests/demo/pyreader.py new file mode 100644 index 0000000000000000000000000000000000000000..3a7dbf8106268582c9c771c7ab9e2a7be7a2f313 --- /dev/null +++ b/python/paddle/fluid/tests/demo/pyreader.py @@ -0,0 +1,95 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy + +import paddle +import paddle.dataset.mnist as mnist +import paddle.fluid as fluid +import paddle.v2 + + +def network(is_train): + reader = fluid.layers.py_reader( + capacity=10, + shapes=((-1, 784), (-1, 1)), + dtypes=('float32', 'int64'), + name="train_reader" if is_train else "test_reader") + img, label = fluid.layers.read_file(reader) + + hidden = img + + for i in xrange(2): + hidden = fluid.layers.fc(input=hidden, size=100, act='tanh') + hidden = fluid.layers.dropout( + hidden, dropout_prob=0.5, is_test=not is_train) + + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + return fluid.layers.mean(loss), reader + + +def main(): + train_prog = fluid.Program() + startup_prog = fluid.Program() + + with fluid.program_guard(train_prog, startup_prog): + with fluid.unique_name.guard(): + loss, train_reader = network(True) + adam = fluid.optimizer.Adam(learning_rate=0.01) + adam.minimize(loss) + + test_prog = fluid.Program() + test_startup = fluid.Program() + with 
fluid.program_guard(test_prog, test_startup): + with fluid.unique_name.guard(): + test_loss, test_reader = network(False) + + fluid.Executor(fluid.CUDAPlace(0)).run(startup_prog) + fluid.Executor(fluid.CUDAPlace(0)).run(test_startup) + + trainer = fluid.ParallelExecutor( + use_cuda=True, loss_name=loss.name, main_program=train_prog) + + tester = fluid.ParallelExecutor( + use_cuda=True, share_vars_from=trainer, main_program=test_prog) + + train_reader.decorate_paddle_reader( + paddle.v2.reader.shuffle( + paddle.batch(mnist.train(), 512), buf_size=8192)) + + test_reader.decorate_paddle_reader(paddle.batch(mnist.test(), 512)) + + for epoch_id in xrange(10): + train_reader.start() + try: + while True: + print 'train_loss', numpy.array( + trainer.run(fetch_list=[loss.name])) + except fluid.core.EOFException: + print 'End of epoch', epoch_id + train_reader.reset() + + test_reader.start() + try: + while True: + print 'test loss', numpy.array( + tester.run(fetch_list=[test_loss.name])) + except fluid.core.EOFException: + print 'End of testing' + test_reader.reset() + + +if __name__ == '__main__': + main() diff --git a/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py b/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py index 9425d472a48056e71da5da364f659971ef6c2520..8244617711138d590193b2898de5d2f3aeb1e11e 100644 --- a/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py +++ b/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py @@ -31,7 +31,10 @@ def load_vocab(filename): # load word dict with paddle inner function -word_dict = load_vocab(sys.argv[1]) +if len(sys.argv) == 1: + word_dict = paddle.dataset.imdb.word_dict() +else: + word_dict = load_vocab(sys.argv[1]) word_dict[""] = len(word_dict) print "Dict dim = ", len(word_dict) diff --git a/python/paddle/fluid/tests/demo/text_classification/train.py b/python/paddle/fluid/tests/demo/text_classification/train.py 
index e408684c6e0941a1b317ffeac66f071c1382836d..281c2869d642c7fe41a386c42208ca2da1dc2891 100644 --- a/python/paddle/fluid/tests/demo/text_classification/train.py +++ b/python/paddle/fluid/tests/demo/text_classification/train.py @@ -41,16 +41,14 @@ def network_cfg(is_train, pass_num=100): pass_num=pass_num, shapes=[[-1, 1], [-1, 1]], lod_levels=[1, 0], - dtypes=['int64', 'int64'], - thread_num=1) + dtypes=['int64', 'int64']) test_file_obj = fluid.layers.open_files( filenames=TEST_FILES, pass_num=1, shapes=[[-1, 1], [-1, 1]], lod_levels=[1, 0], - dtypes=['int64', 'int64'], - thread_num=1) + dtypes=['int64', 'int64']) if is_train: file_obj = fluid.layers.shuffle(train_file_obj, buffer_size=1000) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 8c564abf986d351b31e993b62bfe1f17c52a4e10..322d76515e76c3d322ac7c4f989bbc95875cb654 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -48,6 +48,7 @@ list(REMOVE_ITEM TEST_OPS test_warpctc_op) list(REMOVE_ITEM TEST_OPS test_dist_train) list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf) list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed) +list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) @@ -60,3 +61,4 @@ if(WITH_DISTRIBUTE) endif() py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) +py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL) diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py new file mode 100644 index 0000000000000000000000000000000000000000..72bc1729b0f63b23ad7ecb5ad703b984a4c614ac --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/dist_se_resnext.py @@ -0,0 +1,350 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import argparse +import time +import math + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import sys +import signal + +# Fix seed for test +fluid.default_startup_program().random_seed = 1 +fluid.default_main_program().random_seed = 1 + +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "piecewise_decay", + "epochs": [30, 60, 90], + "steps": [0.1, 0.01, 0.001, 0.0001] + } +} + + +class SE_ResNeXt(): + def __init__(self, layers=50): + self.params = train_parameters + self.layers = layers + + def net(self, input, class_dim=1000): + layers = self.layers + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format(supported_layers, layers) + if layers == 50: + cardinality = 32 + reduction_ratio = 16 + depth = [3, 4, 6, 3] + num_filters = [128, 256, 512, 1024] + + conv = self.conv_bn_layer( + input=input, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + conv = fluid.layers.pool2d( + input=conv, + pool_size=3, + pool_stride=2, + 
pool_padding=1, + pool_type='max') + elif layers == 101: + cardinality = 32 + reduction_ratio = 16 + depth = [3, 4, 23, 3] + num_filters = [128, 256, 512, 1024] + + conv = self.conv_bn_layer( + input=input, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + conv = fluid.layers.pool2d( + input=conv, + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + elif layers == 152: + cardinality = 64 + reduction_ratio = 16 + depth = [3, 8, 36, 3] + num_filters = [128, 256, 512, 1024] + + conv = self.conv_bn_layer( + input=input, + num_filters=64, + filter_size=3, + stride=2, + act='relu') + conv = self.conv_bn_layer( + input=conv, num_filters=64, filter_size=3, stride=1, act='relu') + conv = self.conv_bn_layer( + input=conv, + num_filters=128, + filter_size=3, + stride=1, + act='relu') + conv = fluid.layers.pool2d( + input=conv, pool_size=3, pool_stride=2, pool_padding=1, \ + pool_type='max') + + for block in range(len(depth)): + for i in range(depth[block]): + conv = self.bottleneck_block( + input=conv, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + cardinality=cardinality, + reduction_ratio=reduction_ratio) + + pool = fluid.layers.pool2d( + input=conv, pool_size=7, pool_type='avg', global_pooling=True) + drop = fluid.layers.dropout(x=pool, dropout_prob=0.2) + stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0) + out = fluid.layers.fc(input=drop, size=class_dim, act='softmax') + return out + + def shortcut(self, input, ch_out, stride): + ch_in = input.shape[1] + if ch_in != ch_out or stride != 1: + filter_size = 1 + return self.conv_bn_layer(input, ch_out, filter_size, stride) + else: + return input + + def bottleneck_block(self, input, num_filters, stride, cardinality, + reduction_ratio): + conv0 = self.conv_bn_layer( + input=input, num_filters=num_filters, filter_size=1, act='relu') + conv1 = self.conv_bn_layer( + input=conv0, + num_filters=num_filters, + filter_size=3, + stride=stride, + groups=cardinality, + 
act='relu') + conv2 = self.conv_bn_layer( + input=conv1, num_filters=num_filters * 2, filter_size=1, act=None) + scale = self.squeeze_excitation( + input=conv2, + num_channels=num_filters * 2, + reduction_ratio=reduction_ratio) + + short = self.shortcut(input, num_filters * 2, stride) + + return fluid.layers.elementwise_add(x=short, y=scale, act='relu') + + def conv_bn_layer(self, + input, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) / 2, + groups=groups, + act=None, + bias_attr=False) + return fluid.layers.batch_norm(input=conv, act=act) + + def squeeze_excitation(self, input, num_channels, reduction_ratio): + pool = fluid.layers.pool2d( + input=input, pool_size=0, pool_type='avg', global_pooling=True) + stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0) + squeeze = fluid.layers.fc(input=pool, + size=num_channels / reduction_ratio, + act='relu') + stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0) + excitation = fluid.layers.fc(input=squeeze, + size=num_channels, + act='sigmoid') + scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0) + return scale + + +def get_model(batch_size): + # Input data + image = fluid.layers.fill_constant( + shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0) + label = fluid.layers.fill_constant( + shape=[batch_size, 1], dtype='int64', value=0.0) + + # Train program + model = SE_ResNeXt(layers=50) + out = model.net(input=image, class_dim=102) + cost = fluid.layers.cross_entropy(input=out, label=label) + + avg_cost = fluid.layers.mean(x=cost) + acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) + acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) + + # Evaluator + test_program = fluid.default_main_program().clone(for_test=True) + + # Optimization + total_images = 6149 # flowers + epochs = [30, 60, 90] + step = int(total_images / batch_size + 
1) + + bd = [step * e for e in epochs] + base_lr = 0.1 + lr = [] + lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] + + optimizer = fluid.optimizer.Momentum( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr), + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + optimizer.minimize(avg_cost) + + # Reader + train_reader = paddle.batch( + paddle.dataset.flowers.train(), batch_size=batch_size) + test_reader = paddle.batch( + paddle.dataset.flowers.test(), batch_size=batch_size) + + return test_program, avg_cost, train_reader, test_reader, acc_top1, out + + +def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers): + t = fluid.DistributeTranspiler() + t.transpile( + trainer_id=trainer_id, + program=main_program, + pservers=pserver_endpoints, + trainers=trainers) + return t + + +class DistSeResneXt2x2: + def run_pserver(self, pserver_endpoints, trainers, current_endpoint, + trainer_id): + get_model(batch_size=2) + t = get_transpiler(trainer_id, + fluid.default_main_program(), pserver_endpoints, + trainers) + pserver_prog = t.get_pserver_program(current_endpoint) + startup_prog = t.get_startup_program(current_endpoint, pserver_prog) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup_prog) + exe.run(pserver_prog) + + def _wait_ps_ready(self, pid): + retry_times = 20 + while True: + assert retry_times >= 0, "wait ps ready failed" + time.sleep(3) + print("waiting ps ready: ", pid) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. 
+ os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + retry_times -= 1 + + def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True): + test_program, avg_cost, train_reader, test_reader, batch_acc, predict = get_model( + batch_size=20) + if is_dist: + t = get_transpiler(trainer_id, + fluid.default_main_program(), endpoints, + trainers) + trainer_prog = t.get_trainer_program() + else: + trainer_prog = fluid.default_main_program() + + startup_exe = fluid.Executor(place) + startup_exe.run(fluid.default_startup_program()) + + strategy = fluid.ExecutionStrategy() + strategy.num_threads = 1 + strategy.allow_op_delay = False + exe = fluid.ParallelExecutor( + True, + loss_name=avg_cost.name, + exec_strategy=strategy, + num_trainers=trainers, + trainer_id=trainer_id) + + feed_var_list = [ + var for var in trainer_prog.global_block().vars.itervalues() + if var.is_data + ] + + feeder = fluid.DataFeeder(feed_var_list, place) + reader_generator = train_reader() + first_loss, = exe.run(fetch_list=[avg_cost.name]) + print(first_loss) + for i in xrange(5): + loss, = exe.run(fetch_list=[avg_cost.name]) + last_loss, = exe.run(fetch_list=[avg_cost.name]) + print(last_loss) + + +def main(role="pserver", + endpoints="127.0.0.1:9123", + trainer_id=0, + current_endpoint="127.0.0.1:9123", + trainers=1, + is_dist=True): + model = DistSeResneXt2x2() + if role == "pserver": + model.run_pserver(endpoints, trainers, current_endpoint, trainer_id) + else: + p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + model.run_trainer(p, endpoints, trainer_id, trainers, is_dist) + + +if __name__ == "__main__": + if len(sys.argv) != 7: + print( + "Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]" + ) + role = sys.argv[1] + endpoints = sys.argv[2] + trainer_id = int(sys.argv[3]) + current_endpoint = sys.argv[4] + trainers = int(sys.argv[5]) + is_dist = True if sys.argv[6] == 
"TRUE" else False + main( + role=role, + endpoints=endpoints, + trainer_id=trainer_id, + current_endpoint=current_endpoint, + trainers=trainers, + is_dist=is_dist) diff --git a/python/paddle/fluid/tests/unittests/test_auc_op.py b/python/paddle/fluid/tests/unittests/test_auc_op.py index 6bd5e2332a99693f5e53e147491aa83c35859548..6580c70ca68c4ba24919f03d071f6f88fb68953c 100644 --- a/python/paddle/fluid/tests/unittests/test_auc_op.py +++ b/python/paddle/fluid/tests/unittests/test_auc_op.py @@ -15,13 +15,13 @@ import unittest import numpy as np from op_test import OpTest +from paddle.fluid import metrics class TestAucOp(OpTest): def setUp(self): self.op_type = "auc" pred = np.random.random((128, 2)).astype("float32") - indices = np.random.randint(0, 2, (128, 2)) labels = np.random.randint(0, 2, (128, 1)) num_thresholds = 200 tp = np.zeros((num_thresholds, )).astype("int64") @@ -30,8 +30,7 @@ class TestAucOp(OpTest): fn = np.zeros((num_thresholds, )).astype("int64") self.inputs = { - 'Out': pred, - 'Indices': indices, + 'Predict': pred, 'Label': labels, 'TP': tp, 'TN': tn, @@ -39,57 +38,18 @@ class TestAucOp(OpTest): 'FN': fn } self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds} - # NOTE: sklearn use a different way to generate thresholds - # which will cause the result differs slightly: - # from sklearn.metrics import roc_curve, auc - # fpr, tpr, thresholds = roc_curve(labels, pred) - # auc_value = auc(fpr, tpr) - # we caculate AUC again using numpy for testing - kepsilon = 1e-7 # to account for floating point imprecisions - thresholds = [(i + 1) * 1.0 / (num_thresholds - 1) - for i in range(num_thresholds - 2)] - thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon] - # caculate TP, FN, TN, FP count - tp_list = np.ndarray((num_thresholds, )) - fn_list = np.ndarray((num_thresholds, )) - tn_list = np.ndarray((num_thresholds, )) - fp_list = np.ndarray((num_thresholds, )) - for idx_thresh, thresh in enumerate(thresholds): - tp, fn, tn, fp = 0, 0, 0, 0 - 
for i, lbl in enumerate(labels): - if lbl: - if pred[i, 0] >= thresh: - tp += 1 - else: - fn += 1 - else: - if pred[i, 0] >= thresh: - fp += 1 - else: - tn += 1 - tp_list[idx_thresh] = tp - fn_list[idx_thresh] = fn - tn_list[idx_thresh] = tn - fp_list[idx_thresh] = fp - - epsilon = 1e-6 - tpr = (tp_list.astype("float32") + epsilon) / ( - tp_list + fn_list + epsilon) - fpr = fp_list.astype("float32") / (fp_list + tn_list + epsilon) - rec = (tp_list.astype("float32") + epsilon) / ( - tp_list + fp_list + epsilon) - - x = fpr[:num_thresholds - 1] - fpr[1:] - y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0 - auc_value = np.sum(x * y) + python_auc = metrics.Auc(name="auc", + curve='ROC', + num_thresholds=num_thresholds) + python_auc.update(pred, labels) self.outputs = { - 'AUC': auc_value, - 'TPOut': tp_list, - 'FNOut': fn_list, - 'TNOut': tn_list, - 'FPOut': fp_list + 'AUC': python_auc.eval(), + 'TPOut': python_auc.tp_list, + 'FNOut': python_auc.fn_list, + 'TNOut': python_auc.tn_list, + 'FPOut': python_auc.fp_list } def test_check_output(self): diff --git a/python/paddle/fluid/tests/unittests/test_data_balance.py b/python/paddle/fluid/tests/unittests/test_data_balance.py index 6d810920d55ccf069ff408c553069e8f5e590271..aa09b0ea445adccae3f741b53850f8182f3270cc 100644 --- a/python/paddle/fluid/tests/unittests/test_data_balance.py +++ b/python/paddle/fluid/tests/unittests/test_data_balance.py @@ -142,8 +142,7 @@ class TestDataBalance(unittest.TestCase): filenames=[self.lod_data_file_name], shapes=[[-1, 3], [-1, 1]], lod_levels=[1, 0], - dtypes=['float32', 'int32'], - thread_num=1) + dtypes=['float32', 'int32']) ins, label = fluid.layers.read_file(data_reader) place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace() @@ -156,7 +155,7 @@ class TestDataBalance(unittest.TestCase): main_program=main_prog, build_strategy=build_strategy) - if (parallel_exe.device_count > self.batch_size): + if parallel_exe.device_count > self.batch_size: print("WARNING: Unittest 
TestDataBalance skipped. \ For the result is not correct when device count \ is larger than batch size.") @@ -190,3 +189,7 @@ class TestDataBalance(unittest.TestCase): def test_all(self): self.main() self.main_lod() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py new file mode 100644 index 0000000000000000000000000000000000000000..e3e7036f08cb88087ae45fe7d7c7565c102dab8a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -0,0 +1,122 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import argparse +import time +import math + +import unittest +import os +import signal +import subprocess + + +class TestDistSeResneXt2x2(unittest.TestCase): + def setUp(self): + self._trainers = 2 + self._pservers = 2 + self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124" + self._python_interp = "python" + + def start_pserver(self): + ps0_ep, ps1_ep = self._ps_endpoints.split(",") + ps0_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \ + (self._python_interp, self._ps_endpoints, ps0_ep, self._trainers) + ps1_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \ + (self._python_interp, self._ps_endpoints, ps1_ep, self._trainers) + + ps0_proc = subprocess.Popen( + ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + ps1_proc = subprocess.Popen( + ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + return ps0_proc, ps1_proc + + def _wait_ps_ready(self, pid): + retry_times = 20 + while True: + assert retry_times >= 0, "wait ps ready failed" + time.sleep(3) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. 
+ os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + retry_times -= 1 + + def non_test_with_place(self): + # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN + required_envs = { + "PATH": os.getenv("PATH"), + "PYTHONPATH": os.getenv("PYTHONPATH"), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"), + "FLAGS_fraction_of_gpu_memory_to_use": "0.15" + } + # Run local to get a base line + env_local = {"CUDA_VISIBLE_DEVICES": "0"} + env_local.update(required_envs) + local_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d FLASE" % \ + (self._python_interp, "127.0.0.1:1234", "127.0.0.1:1234", 1) + local_proc = subprocess.Popen( + local_cmd.split(" "), stdout=subprocess.PIPE, env=env_local) + local_proc.wait() + local_ret = local_proc.stdout.read() + + # Run dist train to compare with local results + ps0, ps1 = self.start_pserver() + self._wait_ps_ready(ps0.pid) + self._wait_ps_ready(ps1.pid) + + ps0_ep, ps1_ep = self._ps_endpoints.split(",") + tr0_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d TRUE" % \ + (self._python_interp, self._ps_endpoints, ps0_ep, self._trainers) + tr1_cmd = "%s dist_se_resnext.py trainer %s 1 %s %d TRUE" % \ + (self._python_interp, self._ps_endpoints, ps1_ep, self._trainers) + + env0 = {"CUDA_VISIBLE_DEVICES": "0"} + env1 = {"CUDA_VISIBLE_DEVICES": "1"} + env0.update(required_envs) + env1.update(required_envs) + FNULL = open(os.devnull, 'w') + + tr0_proc = subprocess.Popen( + tr0_cmd.split(" "), stdout=subprocess.PIPE, stderr=FNULL, env=env0) + tr1_proc = subprocess.Popen( + tr1_cmd.split(" "), stdout=subprocess.PIPE, stderr=FNULL, env=env1) + + tr0_proc.wait() + tr1_proc.wait() + loss_data0 = tr0_proc.stdout.read() + lines = loss_data0.split("\n") + dist_first_loss = eval(lines[0].replace(" ", ","))[0] + dist_last_loss = eval(lines[1].replace(" ", ","))[0] + + local_lines = local_ret.split("\n") + local_first_loss = eval(local_lines[0])[0] + local_last_loss = eval(local_lines[1])[0] + + self.assertAlmostEqual(local_first_loss, 
dist_first_loss) + self.assertAlmostEqual(local_last_loss, dist_last_loss) + + # check tr0_out + # FIXME: ensure the server process is killed + # replace with ps0.terminate() + os.kill(ps0.pid, signal.SIGKILL) + os.kill(ps1.pid, signal.SIGKILL) + FNULL.close() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py index 49ea18c1691d68e7d1c4093f95f847b72062f0af..e628195e7265ec564bd64a212c4a35fdff495063 100644 --- a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py +++ b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py @@ -102,7 +102,7 @@ class TestLearningRateDecay(unittest.TestCase): exe.run(startup_prog) - # fluid.memory_optimize(main_prog) + fluid.memory_optimize(main_prog) for step in range(10): lr_val, = exe.run(main_prog, feed={}, fetch_list=[decayed_lr]) diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py index aaea9c1809213c5707e8540eebbdd6f269836fdc..c75d3bd276a5b494090c1aa1fea0bb4f2c067173 100644 --- a/python/paddle/fluid/tests/unittests/test_momentum_op.py +++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py @@ -39,7 +39,7 @@ class TestMomentumOp1(OpTest): velocity_out = mu * velocity + grad if use_nesterov: - param_out = param - grad * learning_rate + \ + param_out = param - grad * learning_rate - \ velocity_out * mu * learning_rate else: param_out = param - learning_rate * velocity_out @@ -75,7 +75,7 @@ class TestMomentumOp2(OpTest): velocity_out = mu * velocity + grad if use_nesterov: - param_out = param - grad * learning_rate + \ + param_out = param - grad * learning_rate - \ velocity_out * mu * learning_rate else: param_out = param - learning_rate * velocity_out diff --git a/python/paddle/fluid/tests/unittests/test_multi_file_reader.py 
b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py index dbd510e64ffdd6f3b78b22bb0d37d9a7ba3fd9b5..cb0ea96ff69ce32b0bb1b49f0318c353aa08d388 100644 --- a/python/paddle/fluid/tests/unittests/test_multi_file_reader.py +++ b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py @@ -39,17 +39,17 @@ class TestMultipleReader(unittest.TestCase): copyfile('./mnist_0.recordio', './mnist_1.recordio') copyfile('./mnist_0.recordio', './mnist_2.recordio') - def main(self, thread_num): + def main(self, is_test=False): file_list = [ './mnist_0.recordio', './mnist_1.recordio', './mnist_2.recordio' ] with fluid.program_guard(fluid.Program(), fluid.Program()): data_files = fluid.layers.open_files( filenames=file_list, - thread_num=thread_num, shapes=[(-1, 784), (-1, 1)], lod_levels=[0, 0], - dtypes=['float32', 'int64']) + dtypes=['float32', 'int64'], + is_test=is_test) img, label = fluid.layers.read_file(data_files) if fluid.core.is_compiled_with_cuda(): @@ -71,6 +71,9 @@ class TestMultipleReader(unittest.TestCase): self.assertEqual(batch_count, self.num_batch * 3) def test_main(self): - self.main(thread_num=3) # thread number equals to file number - self.main(thread_num=10) # thread number is larger than file number - self.main(thread_num=2) # thread number is less than file number + self.main(is_test=False) + self.main(is_test=True) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 4580ab85e235e9f73d138058ac6a9aa3beb49a54..b21e16439a5070e5f6d763e1617d4cfffe8bd618 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -33,9 +33,7 @@ def simple_fc_net(use_feed): filenames=[MNIST_RECORDIO_FILE], shapes=[[-1, 784], [-1, 1]], lod_levels=[0, 0], - dtypes=['float32', 'int64'], - thread_num=1, - for_parallel=True) 
+ dtypes=['float32', 'int64']) reader = fluid.layers.io.double_buffer(reader) img, label = fluid.layers.read_file(reader) hidden = img @@ -61,9 +59,7 @@ def fc_with_batchnorm(use_feed): filenames=[MNIST_RECORDIO_FILE], shapes=[[-1, 784], [-1, 1]], lod_levels=[0, 0], - dtypes=['float32', 'int64'], - thread_num=1, - for_parallel=True) + dtypes=['float32', 'int64']) reader = fluid.layers.io.double_buffer(reader) img, label = fluid.layers.read_file(reader) @@ -102,6 +98,16 @@ class TestMNIST(TestParallelExecutorBase): fluid.recordio_writer.convert_reader_to_recordio_file( MNIST_RECORDIO_FILE, reader, feeder) + def _init_data(self, random=True): + np.random.seed(5) + if random: + img = np.random.random(size=[32, 784]).astype(np.float32) + else: + img = np.ones(shape=[32, 784], dtype='float32') + label = np.ones(shape=[32, 1], dtype='int64') + return img, label + + # simple_fc def check_simple_fc_convergence(self, use_cuda, use_reduce=False): if use_cuda and not core.is_compiled_with_cuda(): return @@ -109,8 +115,8 @@ class TestMNIST(TestParallelExecutorBase): self.check_network_convergence( simple_fc_net, use_cuda=use_cuda, allow_op_delay=True) - img = np.zeros(shape=[32, 784], dtype='float32') - label = np.ones(shape=[32, 1], dtype='int64') + img, label = self._init_data() + self.check_network_convergence( simple_fc_net, feed_dict={"image": img, @@ -118,6 +124,37 @@ class TestMNIST(TestParallelExecutorBase): use_cuda=use_cuda, use_reduce=use_reduce) + def check_simple_fc_convergence_with_Reduce(self, use_cuda): + if use_cuda and not core.is_compiled_with_cuda(): + return + self.check_network_convergence( + simple_fc_net, use_cuda=use_cuda, use_reduce=True) + self.check_network_convergence( + simple_fc_net, + use_cuda=use_cuda, + allow_op_delay=True, + use_reduce=True) + + img, label = self._init_data() + + all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( + simple_fc_net, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + 
use_reduce=False) + reduce_first_loss, reduce_last_loss = self.check_network_convergence( + simple_fc_net, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + use_reduce=True) + + for loss in zip(all_reduce_first_loss, reduce_first_loss): + self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) + for loss in zip(all_reduce_last_loss, reduce_last_loss): + self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) + def test_simple_fc(self): # use_cuda self.check_simple_fc_convergence(True) @@ -125,14 +162,15 @@ class TestMNIST(TestParallelExecutorBase): def test_simple_fc_with_new_strategy(self): # use_cuda, use_reduce - self.check_simple_fc_convergence(True, True) - self.check_simple_fc_convergence(False, True) + self.check_simple_fc_convergence_with_Reduce(True) + self.check_simple_fc_convergence_with_Reduce(False) - def check_simple_fc_parallel_accuracy(self, use_cuda, use_reduce=False): + def check_simple_fc_parallel_accuracy(self, use_cuda): if use_cuda and not core.is_compiled_with_cuda(): return - img = np.zeros(shape=[32, 784], dtype='float32') - label = np.ones(shape=[32, 1], dtype='int64') + + img, label = self._init_data(random=False) + single_first_loss, single_last_loss = self.check_network_convergence( method=simple_fc_net, seed=1000, @@ -146,8 +184,7 @@ class TestMNIST(TestParallelExecutorBase): feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - use_parallel_executor=True, - use_reduce=use_reduce) + use_parallel_executor=True) for p_f in parallel_first_loss: self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6) @@ -158,32 +195,53 @@ class TestMNIST(TestParallelExecutorBase): self.check_simple_fc_parallel_accuracy(True) self.check_simple_fc_parallel_accuracy(False) - def test_simple_fc_parallel_accuracy_with_new_strategy(self): - # use_cuda, use_reduce - self.check_simple_fc_parallel_accuracy(True, True) - self.check_simple_fc_parallel_accuracy(False, True) - - def check_batchnorm_fc_convergence(self, use_cuda, 
use_reduce=False): + def check_batchnorm_fc_convergence(self, use_cuda): if use_cuda and not core.is_compiled_with_cuda(): return + self.check_network_convergence(fc_with_batchnorm, use_cuda=use_cuda) - img = np.zeros(shape=[32, 784], dtype='float32') - label = np.ones(shape=[32, 1], dtype='int64') + + img, label = self._init_data() + + self.check_network_convergence( + fc_with_batchnorm, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda) + + def check_batchnorm_fc_convergence_use_reduce(self, use_cuda): + if use_cuda and not core.is_compiled_with_cuda(): + return self.check_network_convergence( + fc_with_batchnorm, use_cuda=use_cuda, use_reduce=True) + + img, label = self._init_data() + + all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( fc_with_batchnorm, feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - use_reduce=use_reduce) + use_reduce=False) + reduce_first_loss, reduce_last_loss = self.check_network_convergence( + fc_with_batchnorm, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + use_reduce=True) + + for loss in zip(all_reduce_first_loss, reduce_first_loss): + self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) + for loss in zip(all_reduce_last_loss, reduce_last_loss): + self.assertAlmostEquals(loss[0], loss[1], delta=1e-4) def test_batchnorm_fc(self): self.check_batchnorm_fc_convergence(True) self.check_batchnorm_fc_convergence(False) def test_batchnorm_fc_with_new_strategy(self): - # use_cuda, use_reduce - self.check_batchnorm_fc_convergence(True, True) - self.check_batchnorm_fc_convergence(False, True) + self.check_batchnorm_fc_convergence_use_reduce(True) + self.check_batchnorm_fc_convergence_use_reduce(False) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py b/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py index 05715464848d835684dd3cf0e99e5d4dd482e0b6..91b1fd2af7d8aaf85d17965f8b02c35ee3990291 100644 --- 
a/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py +++ b/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py @@ -45,12 +45,12 @@ class TestPyReader(unittest.TestCase): ) else fluid.CPUPlace() executor = fluid.Executor(place) - data_file, feed_queue = fluid.layers.py_reader( + data_file = fluid.layers.py_reader( capacity=self.capacity, dtypes=self.dtypes, lod_levels=self.lod_levels, shapes=self.shapes) - + feed_queue = data_file.queue read_out_data = fluid.layers.read_file(data_file) self.inputs = [] diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py index 9a5b69eea46e74deeba87aefae4afac84c7745f1..9a379bdbaa7e278879117a8cdc2dddb335a10ca1 100644 --- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py +++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py @@ -52,11 +52,13 @@ def simple_fc_net(in_size, batch_size, queue_capacity, use_double_buffer=False): - reader, feed_queue = fluid.layers.py_reader( + reader = fluid.layers.py_reader( capacity=queue_capacity, shapes=[[-1, in_size], [-1, 1]], lod_levels=[0, 0], - dtypes=['float32', 'int64']) + dtypes=['float32', 'int64'], + use_double_buffer=False) + feed_queue = reader.queue reader = fluid.layers.batch(reader, batch_size=batch_size) if use_double_buffer: reader = fluid.layers.double_buffer(reader)