diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index eeda759ff18ccb86ce6a585fe41cb972ea3ae295..e718b32cb6c48d11e73600509a17db107f438708 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -23,7 +23,7 @@ repos: - id: clang-format-with-version-check name: clang-format description: Format files with ClangFormat. - entry: bash ./.clang_format.hook -i + entry: bash ./tools/codestyle/clang_format.hook -i language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$ - repo: local @@ -52,7 +52,7 @@ repos: hooks: - id: copyright_checker name: copyright_checker - entry: python ./.copyright.hook + entry: python ./tools/codestyle/copyright.hook language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$ exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$ diff --git a/benchmark/fluid/Dockerfile b/benchmark/fluid/Dockerfile index b9eaca5ee6b487bb37bb954b3c606c3096d37aeb..707fadb1fae97cefe8a41715cd57d71754abda41 100644 --- a/benchmark/fluid/Dockerfile +++ b/benchmark/fluid/Dockerfile @@ -1,11 +1,18 @@ FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 + +# Use UBUNTU_MIRROR can speed up apt-get speed. +# ARG UBUNTU_MIRROR +# RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' + RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so -RUN pip install -U pip -RUN pip install -U kubernetes paddlepaddle # IMPORTANT: # Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime. +# exmaple: unset http_proxy && unset https_proxy && python fluid_benchmark.py ... + +RUN pip install -U pip +RUN pip install -U kubernetes paddlepaddle RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python' RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python' @@ -14,9 +21,11 @@ RUN pip uninstall -y paddlepaddle && mkdir /workspace ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root +RUN chmod +x /usr/bin/paddle_k8s ADD *.whl / -RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s +RUN pip install /*.whl && rm -f /*.whl ENV LD_LIBRARY_PATH=/usr/local/lib -ADD fluid_benchmark.py recordio_converter.py models/ /workspace/ +ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh /workspace/ +ADD models/ /workspace/models/ diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index acd803ddeee89b43c26ed2da506291f8a10aff7d..ece1102dce987cda994ff086b07f756498ce26e6 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -264,8 +264,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, break else: loss, = exe.run([avg_loss.name], feed=feeder.feed(data)) - if args.update_method == "pserver": - exe.bcast_params() if args.use_reader_op: num_samples += args.batch_size * args.gpus else: @@ -301,9 +299,18 @@ def print_train_time(start_time, end_time, num_samples): (num_samples, train_elapsed, examples_per_sec)) +def print_paddle_envs(): + print('----------- Configuration envs -----------') + for k in os.environ: + if "PADDLE_" in k: + print "ENV %s:%s" % (k, os.environ[k]) + print('------------------------------------------------') + + def main(): args = parse_args() print_arguments(args) + print_paddle_envs() # the unique trainer id, starting from 0, needed by trainer # only diff --git a/benchmark/fluid/kube_gen_job.py b/benchmark/fluid/kube_gen_job.py index f8afa3e9efd9b8780a08d937c9df3670baa735db..dfe8b5cdd58456902fa8ec355e9837dface3f7be 100644 --- a/benchmark/fluid/kube_gen_job.py +++ b/benchmark/fluid/kube_gen_job.py @@ -17,6 +17,7 @@ import copy import argparse import random import os +import copy from kube_templates import pserver, trainer, envs @@ -109,10 +110,9 @@ def gen_job(): envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname}) envs.append({"name": "PADDLE_TRAINERS", "value": str(args.trainers)}) - envs.append({"name": "PSERVERS", "value": str(args.pservers)}) + envs.append({"name": "PADDLE_PSERVERS", "value": str(args.pservers)}) envs.append({"name": "ENTRY", "value": args.entry}) envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)}) - envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)}) # NOTE: these directories below are cluster specific, please modify # this settings before you run on your own cluster. envs.append({ @@ -166,7 +166,7 @@ def gen_job(): tn["spec"]["template"]["spec"]["volumes"] = volumes tn_container["volumeMounts"] = volumeMounts - ps_container["env"] = envs + ps_container["env"] = copy.deepcopy(envs) ps_container["env"].append({ "name": "PADDLE_TRAINING_ROLE", "value": "PSERVER" diff --git a/doc/fluid/api/gen_doc.sh b/doc/fluid/api/gen_doc.sh index acc8b4aa3fb258e5beef2d1e54919d429cf7ea6f..9ce6a9a7c329055a755cdb0a40c8c1c2af09a61c 100755 --- a/doc/fluid/api/gen_doc.sh +++ b/doc/fluid/api/gen_doc.sh @@ -1,7 +1,7 @@ #!/bin/bash python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler metric > layers.rst -for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer +for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler do python gen_doc.py ${module} > ${module}.rst done diff --git a/doc/fluid/api/transpiler.rst b/doc/fluid/api/transpiler.rst new file mode 100644 index 0000000000000000000000000000000000000000..b3535b449eb0e5ac6563256ddac3bf4a27fd8ce6 --- /dev/null +++ b/doc/fluid/api/transpiler.rst @@ -0,0 +1,46 @@ +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + +========== +transpiler +========== + +DistributeTranspiler +-------------------- + +.. autoclass:: paddle.fluid.transpiler.DistributeTranspiler + :members: + :noindex: + +InferenceTranspiler +------------------- + +.. autoclass:: paddle.fluid.transpiler.InferenceTranspiler + :members: + :noindex: + +memory_optimize +--------------- + +.. autofunction:: paddle.fluid.transpiler.memory_optimize + :noindex: + +release_memory +-------------- + +.. autofunction:: paddle.fluid.transpiler.release_memory + :noindex: + +HashName +-------- + +.. autoclass:: paddle.fluid.transpiler.HashName + :members: + :noindex: + +RoundRobin +---------- + +.. autoclass:: paddle.fluid.transpiler.RoundRobin + :members: + :noindex: diff --git a/doc/fluid/howto/inference/build_and_install_lib_cn.rst b/doc/fluid/howto/inference/build_and_install_lib_cn.rst index c8d9992fcc92c25f8c14f71c79bde9f79fd92b1f..84005b54e07cf810649370d2c1f6b6c522434bf6 100644 --- a/doc/fluid/howto/inference/build_and_install_lib_cn.rst +++ b/doc/fluid/howto/inference/build_and_install_lib_cn.rst @@ -13,6 +13,7 @@ cpu_noavx_openblas `fluid.tgz `_ cuda8.0_cudnn5_avx_mkl `fluid.tgz `_ cuda8.0_cudnn7_avx_mkl `fluid.tgz `_ +cuda9.0_cudnn7_avx_mkl `fluid.tgz `_ ====================== ======================================== 从源码编译 diff --git a/paddle/contrib/inference/demo/simple_on_word2vec.cc b/paddle/contrib/inference/demo/simple_on_word2vec.cc index 192a6414260ce06048b8c765402d89882cabc51b..2a4bfc87069b9fd8ece58dde210a6cb8344da536 100644 --- a/paddle/contrib/inference/demo/simple_on_word2vec.cc +++ b/paddle/contrib/inference/demo/simple_on_word2vec.cc @@ -40,10 +40,9 @@ void Main(bool use_gpu) { //# 2. Prepare input. int64_t data[4] = {1, 2, 3, 4}; - PaddleBuf buf{.data = data, .length = sizeof(data)}; PaddleTensor tensor{.name = "", .shape = std::vector({4, 1}), - .data = buf, + .data = PaddleBuf(data, sizeof(data)), .dtype = PaddleDType::INT64}; // For simplicity, we set all the slots with the same data. @@ -55,14 +54,12 @@ void Main(bool use_gpu) { //# 4. Get output. ASSERT_EQ(outputs.size(), 1UL); - LOG(INFO) << "output buffer size: " << outputs.front().data.length; - const size_t num_elements = outputs.front().data.length / sizeof(float); + LOG(INFO) << "output buffer size: " << outputs.front().data.length(); + const size_t num_elements = outputs.front().data.length() / sizeof(float); // The outputs' buffers are in CPU memory. for (size_t i = 0; i < std::min(5UL, num_elements); i++) { - LOG(INFO) << static_cast(outputs.front().data.data)[i]; + LOG(INFO) << static_cast(outputs.front().data.data())[i]; } - // TODO(Superjomn): this is should be free automatically - free(outputs[0].data.data); } } @@ -86,10 +83,9 @@ void MainThreads(int num_threads, bool use_gpu) { for (int batch_id = 0; batch_id < num_batches; ++batch_id) { // 2. Dummy Input Data int64_t data[4] = {1, 2, 3, 4}; - PaddleBuf buf{.data = data, .length = sizeof(data)}; PaddleTensor tensor{.name = "", .shape = std::vector({4, 1}), - .data = buf, + .data = PaddleBuf(data, sizeof(data)), .dtype = PaddleDType::INT64}; std::vector inputs(4, tensor); std::vector outputs; @@ -99,13 +95,13 @@ void MainThreads(int num_threads, bool use_gpu) { // 4. Get output. ASSERT_EQ(outputs.size(), 1UL); LOG(INFO) << "TID: " << tid << ", " - << "output buffer size: " << outputs.front().data.length; - const size_t num_elements = outputs.front().data.length / sizeof(float); + << "output buffer size: " << outputs.front().data.length(); + const size_t num_elements = + outputs.front().data.length() / sizeof(float); // The outputs' buffers are in CPU memory. for (size_t i = 0; i < std::min(5UL, num_elements); i++) { - LOG(INFO) << static_cast(outputs.front().data.data)[i]; + LOG(INFO) << static_cast(outputs.front().data.data())[i]; } - free(outputs[0].data.data); } }); } diff --git a/paddle/contrib/inference/paddle_inference_api.cc b/paddle/contrib/inference/paddle_inference_api.cc index d67e1e7667800d6dd00cb8915b0d6dc7c664970b..dc2842ae0eeb5592b6d4571b70df162886aee7a2 100644 --- a/paddle/contrib/inference/paddle_inference_api.cc +++ b/paddle/contrib/inference/paddle_inference_api.cc @@ -13,3 +13,53 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/contrib/inference/paddle_inference_api.h" + +namespace paddle { + +PaddleBuf::PaddleBuf(PaddleBuf&& other) + : data_(other.data_), + length_(other.length_), + memory_owned_(other.memory_owned_) { + other.memory_owned_ = false; + other.data_ = nullptr; + other.length_ = 0; +} + +PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; } + +PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) { + // only the buffer with external memory can be copied + assert(!other.memory_owned_); + data_ = other.data_; + length_ = other.length_; + memory_owned_ = other.memory_owned_; + return *this; +} + +void PaddleBuf::Resize(size_t length) { + // Only the owned memory can be reset, the external memory can't be changed. + if (length_ == length) return; + assert(memory_owned_); + Free(); + data_ = new char[length]; + length_ = length; + memory_owned_ = true; +} + +void PaddleBuf::Reset(void* data, size_t length) { + Free(); + memory_owned_ = false; + data_ = data; + length_ = length; +} + +void PaddleBuf::Free() { + if (memory_owned_ && data_) { + assert(length_ > 0); + delete static_cast(data_); + data_ = nullptr; + length_ = 0; + } +} + +} // namespace paddle \ No newline at end of file diff --git a/paddle/contrib/inference/paddle_inference_api.h b/paddle/contrib/inference/paddle_inference_api.h index 77e2d77b6b7fe3eeed865c8de0818d059cfa6c6e..bd4530fcf9518cb3bf06179d8f60a1dde38ff7dd 100644 --- a/paddle/contrib/inference/paddle_inference_api.h +++ b/paddle/contrib/inference/paddle_inference_api.h @@ -21,6 +21,7 @@ limitations under the License. */ #pragma once +#include #include #include #include @@ -32,12 +33,38 @@ enum PaddleDType { INT64, }; -struct PaddleBuf { - void* data; // pointer to the data memory. - size_t length; // number of memory bytes. +class PaddleBuf { + public: + PaddleBuf() = default; + PaddleBuf(PaddleBuf&& other); + // Copy only available when memory is managed externally. + explicit PaddleBuf(const PaddleBuf&); + PaddleBuf& operator=(const PaddleBuf&); + // Do not own the memory. + PaddleBuf(void* data, size_t length) + : data_(data), length_(length), memory_owned_{false} {} + // Own memory. + PaddleBuf(size_t length) + : data_(new char[length]), length_(length), memory_owned_(true) {} + // Resize to `length` bytes. + void Resize(size_t length); + // Reset to external memory. + void Reset(void* data, size_t length); + bool empty() const { return length_ == 0; } + void* data() const { return data_; } + size_t length() const { return length_; } + + ~PaddleBuf() { Free(); } + + private: + void Free(); + void* data_{nullptr}; // pointer to the data memory. + size_t length_{0}; // number of memory bytes. + bool memory_owned_{true}; }; struct PaddleTensor { + PaddleTensor() = default; std::string name; // variable name. std::vector shape; // TODO(Superjomn) for LoD support, add a vector> field if needed. @@ -67,8 +94,9 @@ class PaddlePredictor { // Predict an record. // The caller should be responsible for allocating and releasing the memory of - // `inputs`. `inputs` should be alive until Run returns. caller should be - // responsible for releasing the memory of `output_data`. + // `inputs`. `inputs` should be available until Run returns. Caller should be + // responsible for the output tensor's buffer, either allocated or passed from + // outside. virtual bool Run(const std::vector& inputs, std::vector* output_data) = 0; diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc index 5bafc58fa53f7d99de571f66b6224f0f2de66e32..ba2d30314715a57c5ab85e5ae1d8ac0512bbc74f 100644 --- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc +++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc @@ -48,7 +48,7 @@ bool PaddleInferenceAnakinPredictor::Run( auto d_tensor_in_p = executor_.get_in(input.name); float *d_data_p = d_tensor_in_p->mutable_data(); if (cudaMemcpy(d_data_p, - static_cast(input.data.data), + static_cast(input.data.data()), d_tensor_in_p->valid_size() * sizeof(float), cudaMemcpyHostToDevice) != 0) { LOG(ERROR) << "copy data from CPU to GPU error"; @@ -65,8 +65,11 @@ bool PaddleInferenceAnakinPredictor::Run( for (auto &output : *output_data) { auto *tensor = executor_.get_out(output.name); output.shape = tensor->shape(); + if (output.data.length() < tensor->valid_size() * sizeof(float)) { + output.data.Resize(tensor->valid_size() * sizeof(float)); + } // Copy data from GPU -> CPU - if (cudaMemcpy(output.data.data, + if (cudaMemcpy(output.data.data(), tensor->mutable_data(), tensor->valid_size() * sizeof(float), cudaMemcpyDeviceToHost) != 0) { diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc index 1d41a5c73e75723f8614d810eae09ed8cdc8cf2b..f92e9d4190412f5847e353ef1dc0324cad668c9a 100644 --- a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc +++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc @@ -37,28 +37,26 @@ TEST(inference, anakin) { float data[1 * 3 * 224 * 224] = {1.0f}; - PaddleBuf buf{.data = data, .length = sizeof(data)}; PaddleTensor tensor{.name = "input_0", .shape = std::vector({1, 3, 224, 224}), - .data = buf, + .data = PaddleBuf(data, sizeof(data)), .dtype = PaddleDType::FLOAT32}; // For simplicity, we set all the slots with the same data. - std::vector paddle_tensor_feeds(1, tensor); + std::vector paddle_tensor_feeds; + paddle_tensor_feeds.emplace_back(std::move(tensor)); - float data_out[1000]; - - PaddleBuf buf_out{.data = data_out, .length = sizeof(data)}; PaddleTensor tensor_out{.name = "prob_out", .shape = std::vector({1000, 1}), - .data = buf_out, + .data = PaddleBuf(), .dtype = PaddleDType::FLOAT32}; - std::vector outputs(1, tensor_out); + std::vector outputs; + outputs.emplace_back(std::move(tensor_out)); ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs)); - float* data_o = static_cast(outputs[0].data.data); + float* data_o = static_cast(outputs[0].data.data()); for (size_t j = 0; j < 1000; ++j) { LOG(INFO) << "output[" << j << "]: " << data_o[j]; } diff --git a/paddle/contrib/inference/paddle_inference_api_impl.cc b/paddle/contrib/inference/paddle_inference_api_impl.cc index bda2981a14482e2c4a29773d37b074506cc344b1..d9129a704bc289ce1d416474537fc9234a07e5b8 100644 --- a/paddle/contrib/inference/paddle_inference_api_impl.cc +++ b/paddle/contrib/inference/paddle_inference_api_impl.cc @@ -178,8 +178,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. std::memcpy(static_cast(input_ptr), - inputs[i].data.data, - inputs[i].data.length); + inputs[i].data.data(), + inputs[i].data.length()); feeds->push_back(input); } return true; @@ -241,10 +241,11 @@ bool NativePaddlePredictor::GetFetch( } outputs->at(i).shape = shape; - outputs->at(i).data.length = sizeof(float) * data.size(); - outputs->at(i).data.data = malloc(outputs->at(i).data.length); - std::memcpy( - outputs->at(i).data.data, data.data(), outputs->at(i).data.length); + auto &buffer = outputs->at(i).data; + if (buffer.empty() || buffer.length() < sizeof(float) * data.size()) { + buffer.Resize(sizeof(float) * data.size()); + } + std::memcpy(buffer.data(), data.data(), buffer.length()); outputs->at(i).dtype = PaddleDType::FLOAT32; // TODO(panyx0718): support other types? fill tensor name? avoid a copy. } diff --git a/paddle/contrib/inference/test_paddle_inference_api_impl.cc b/paddle/contrib/inference/test_paddle_inference_api_impl.cc index 5d843010e02b09087e6b328428e80fb40eb5bb97..88c4e665a3daed0ed34b23b75d360acbd586401f 100644 --- a/paddle/contrib/inference/test_paddle_inference_api_impl.cc +++ b/paddle/contrib/inference/test_paddle_inference_api_impl.cc @@ -27,13 +27,12 @@ namespace paddle { PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { PaddleTensor pt; - pt.data.data = t->data(); if (t->type() == typeid(int64_t)) { - pt.data.length = t->numel() * sizeof(int64_t); + pt.data.Reset(t->data(), t->numel() * sizeof(int64_t)); pt.dtype = PaddleDType::INT64; } else if (t->type() == typeid(float)) { - pt.data.length = t->numel() * sizeof(float); + pt.data.Reset(t->data(), t->numel() * sizeof(float)); pt.dtype = PaddleDType::FLOAT32; } else { LOG(FATAL) << "unsupported type."; @@ -79,8 +78,8 @@ void MainWord2Vec(bool use_gpu) { std::vector outputs; ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs)); ASSERT_EQ(outputs.size(), 1UL); - size_t len = outputs[0].data.length; - float* data = static_cast(outputs[0].data.data); + size_t len = outputs[0].data.length(); + float* data = static_cast(outputs[0].data.data()); for (size_t j = 0; j < len / sizeof(float); ++j) { ASSERT_LT(data[j], 1.0); ASSERT_GT(data[j], -1.0); @@ -103,8 +102,6 @@ void MainWord2Vec(bool use_gpu) { EXPECT_LT(lod_data[i] - data[i], 1e-3); EXPECT_GT(lod_data[i] - data[i], -1e-3); } - - free(outputs[0].data.data); } void MainImageClassification(bool use_gpu) { @@ -143,13 +140,12 @@ void MainImageClassification(bool use_gpu) { std::vector outputs; ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs)); ASSERT_EQ(outputs.size(), 1UL); - size_t len = outputs[0].data.length; - float* data = static_cast(outputs[0].data.data); + size_t len = outputs[0].data.length(); + float* data = static_cast(outputs[0].data.data()); float* lod_data = output1.data(); for (size_t j = 0; j < len / sizeof(float); ++j) { EXPECT_NEAR(lod_data[j], data[j], 1e-3); } - free(data); } void MainThreadsWord2Vec(bool use_gpu) { @@ -192,8 +188,8 @@ void MainThreadsWord2Vec(bool use_gpu) { // check outputs range ASSERT_EQ(local_outputs.size(), 1UL); - const size_t len = local_outputs[0].data.length; - float* data = static_cast(local_outputs[0].data.data); + const size_t len = local_outputs[0].data.length(); + float* data = static_cast(local_outputs[0].data.data()); for (size_t j = 0; j < len / sizeof(float); ++j) { ASSERT_LT(data[j], 1.0); ASSERT_GT(data[j], -1.0); @@ -205,7 +201,6 @@ void MainThreadsWord2Vec(bool use_gpu) { for (int i = 0; i < refs[tid].numel(); ++i) { EXPECT_NEAR(ref_data[i], data[i], 1e-3); } - free(data); }); } for (int i = 0; i < num_jobs; ++i) { @@ -251,14 +246,13 @@ void MainThreadsImageClassification(bool use_gpu) { // check outputs correctness ASSERT_EQ(local_outputs.size(), 1UL); - const size_t len = local_outputs[0].data.length; - float* data = static_cast(local_outputs[0].data.data); + const size_t len = local_outputs[0].data.length(); + float* data = static_cast(local_outputs[0].data.data()); float* ref_data = refs[tid].data(); EXPECT_EQ(refs[tid].numel(), len / sizeof(float)); for (int i = 0; i < refs[tid].numel(); ++i) { EXPECT_NEAR(ref_data[i], data[i], 1e-3); } - free(data); }); } for (int i = 0; i < num_jobs; ++i) { diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 78356cb1be3bd089c26dde663275e2c8109df951..99dcaf27134e879fa85f57e5a675382442e9edf2 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -57,6 +57,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( for (auto &p : params) { grad_names_.insert(GradVarName(p)); } + balance_vars_.resize(places_.size(), 0); } void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, @@ -140,11 +141,30 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp( checker(op.InputArgumentNames(), recv_vars); } +size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( + const std::vector &var_names) const { + int64_t numel_sum = 0; + for (auto var_name : var_names) { + auto var_desc = all_vars_.at(var_name); + PADDLE_ENFORCE_NOT_NULL(var_desc); + auto dim = framework::make_ddim(var_desc->GetShape()); + int64_t numel = framework::product(dim); + PADDLE_ENFORCE_GT(numel, 0); + numel_sum += numel; + } + + auto smallest = + std::min_element(std::begin(balance_vars_), std::end(balance_vars_)); + size_t dev_id = + static_cast(std::distance(std::begin(balance_vars_), smallest)); + balance_vars_[dev_id] += numel_sum; + return dev_id; +} + std::unique_ptr MultiDevSSAGraphBuilder::Build( const ProgramDesc &program) const { - std::unordered_map all_vars; for (auto *var : program.Block(0).AllVars()) { - all_vars[var->Name()] = var; + all_vars_.emplace(var->Name(), var); } auto graph = new SSAGraph(); @@ -161,35 +181,16 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( auto send_vars = FindDistTrainSendVars(program); auto recv_vars = FindDistTrainRecvVars(program); - std::vector> var_name_on_devices; std::vector> bcast_var_name_set; - var_name_on_devices.resize(places_.size()); bcast_var_name_set.resize(places_.size()); size_t cur_device_id = 0; - std::vector balance_grads(places_.size(), 0); - - auto get_appropriate_dev = [&](std::string &g_name) -> size_t { - auto var_desc = all_vars.at(g_name); - PADDLE_ENFORCE_NOT_NULL(var_desc); - auto dim = framework::make_ddim(var_desc->GetShape()); - int64_t numel = framework::product(dim); - PADDLE_ENFORCE_GE(numel, 0); - auto smallest = - std::min_element(std::begin(balance_grads), std::end(balance_grads)); - size_t dev_id = - static_cast(std::distance(std::begin(balance_grads), smallest)); - balance_grads[dev_id] += numel; - return dev_id; - }; - bool is_forwarding = true; + for (auto *op : program.Block(0).AllOps()) { if (boost::get( op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == static_cast(OpRole::kRPC)) { - // append rpc op if program is distributed trainer main program. - // always use the first device CreateRPCOp(&result, *op); } else if (IsDistTrainOp(*op, send_vars, recv_vars)) { CreateDistTrainOp(&result, *op); @@ -199,15 +200,19 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( BuildStrategy::GradientScaleStrategy::kCustomized) { CreateScaleLossGradOp(&result); } + // This assumes the backward generating code will ensure IsScaleLossOp + // is true only for the op that scale the final scalar loss. + // It also assumes backward op will always follow the forward op in + // the block. is_forwarding = false; } else { - int op_dev_id = GetOpDeviceID(var_name_on_devices, *op); + int op_dev_id = GetOpDeviceID(*op); if (op_dev_id == -1) { // var on all device CreateComputationalOps(&result, *op, places_.size()); } else { CreateComputationalOp(&result, *op, op_dev_id); for (auto &var_name : op->OutputArgumentNames()) { - var_name_on_devices[op_dev_id].emplace(var_name); + var_name_on_devices_.emplace(var_name, op_dev_id); } } if (!is_forwarding && places_.size() > 1) { @@ -230,19 +235,22 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( switch (strategy_.reduce_) { case BuildStrategy::ReduceStrategy::kReduce: - cur_device_id = get_appropriate_dev(g_name); + cur_device_id = GetAppropriateDeviceID({g_name}); CreateReduceOp(&result, g_name, cur_device_id); - var_name_on_devices[cur_device_id].emplace(g_name); + var_name_on_devices_.emplace(g_name, cur_device_id); bcast_var_name_set[cur_device_id].emplace(p_name); break; case BuildStrategy::ReduceStrategy::kAllReduce: - if (IsSparseGradient(all_vars, g_name)) { + if (IsSparseGradient(g_name)) { CreateReduceOp(&result, g_name, 0); CreateBroadcastOp(&result, g_name, 0); } else { InsertAllReduceOp(&result, g_name); } break; + default: + LOG(FATAL) << "Unknown reduce strategy "; + break; } } } catch (boost::bad_get e) { @@ -261,7 +269,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( } /* Dependency graph has been constructed. However, there are still data - harzaeds need to be handled. + hazards need to be handled. */ PolishGraphToSupportDataHazards(&result); @@ -273,11 +281,9 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( return std::unique_ptr(graph); } -bool MultiDevSSAGraphBuilder::IsSparseGradient( - const std::unordered_map &all_vars, - const std::string &og) const { - PADDLE_ENFORCE(all_vars.count(og) != 0); - if (all_vars.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { +bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const { + PADDLE_ENFORCE(all_vars_.count(og) != 0); + if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { return true; } return false; @@ -363,24 +369,23 @@ bool MultiDevSSAGraphBuilder::IsParameterGradientOnce( return is_pg_once; } -int MultiDevSSAGraphBuilder::GetOpDeviceID( - const std::vector> &var_name_on_devices, - const OpDesc &op) const { +int MultiDevSSAGraphBuilder::GetOpDeviceID(const OpDesc &op) const { if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { return -1; } - int var_dev_id = -1; - for (auto &var_name : op.InputArgumentNames()) { - if (var_dev_id != -1) break; - for (size_t i = 0; i < var_name_on_devices.size(); ++i) { - if (var_name_on_devices[i].count(var_name)) { - var_dev_id = static_cast(i); - break; - } + for (auto &varname : op.InputArgumentNames()) { + int dev_id = GetVarDeviceID(varname); + if (dev_id != -1) { + return dev_id; } } - return var_dev_id; + return -1; +} + +int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const { + auto got = var_name_on_devices_.find(varname); + return got == var_name_on_devices_.end() ? -1 : got->second; } void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const { @@ -449,6 +454,8 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result, return var; } +// Find the first occurence of `prev_op_name` and make current `op` depend +// on it. void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op, const std::string &prev_op_name) const { for (auto &prev_op : result->ops_) { @@ -463,16 +470,66 @@ void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op, void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result, const OpDesc &op) const { - CreateComputationalOp(result, op, 0); + int op_dev_id = -1; + if (op.Type() == "split_byref") { + op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]); + if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { + op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames()); + for (auto &varname : op.InputArgumentNames()) { + var_name_on_devices_.emplace(varname, op_dev_id); + } + } + for (auto &varname : op.OutputArgumentNames()) { + var_name_on_devices_.emplace(varname, op_dev_id); + } + } else if (op.Type() == "concat") { + op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]); + } else { + PADDLE_ENFORCE( + "the distribute training related op should be in [split_byref, " + "concat]."); + } + + PADDLE_ENFORCE(op_dev_id != -1, + "can not find right place for distributed op: %s", op.Type()); + + CreateComputationalOp(result, op, op_dev_id); if (op.Type() == "concat") { ConnectOp(result, result->ops_.back().get(), "fetch_barrier"); } } +// Create RPC related op handles that connects its in ops and out ops. void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, const OpDesc &op) const { - result->ops_.emplace_back( - new RPCOpHandle(op, local_scopes_[0], op.Type(), places_[0])); + int op_dev_id = -1; + if (op.Type() == "send") { + op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]); + // the variable name which contains .block means it was splited by + // split_byref op + // so that we can balance the variable blocks to all the pserver instances. + if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce && + op.InputArgumentNames()[0].find(".block") == std::string::npos) { + op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames()); + for (auto &varname : op.InputArgumentNames()) { + var_name_on_devices_.emplace(varname, op_dev_id); + } + } + } else if (op.Type() == "recv") { + op_dev_id = GetAppropriateDeviceID(op.OutputArgumentNames()); + for (auto &varname : op.OutputArgumentNames()) { + var_name_on_devices_.emplace(varname, op_dev_id); + } + } else { + // send_barrier and fetch_barrier op can be scheduled on device 0 + op_dev_id = 0; + } + + PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s", + op.Type()); + + result->ops_.emplace_back(new RPCOpHandle(op, local_scopes_[op_dev_id], + op.Type(), places_[op_dev_id])); if (op.Type() == "send_barrier") { ConnectOp(result, result->ops_.back().get(), "send"); @@ -488,9 +545,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, "send, send_barrier. recv, fetch_barrier]"); } - // TODO(Yancey1989): schedule rpc op on different place may - // increate throughput - CreateOpHandleIOs(result, op, 0); + CreateOpHandleIOs(result, op, op_dev_id); } bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const { diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index 78581755fe4890800636944d6cd89875a852cc19..eb1c07630ab665a90d76b810a421cffb0ce673c2 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -47,10 +47,11 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { #endif std::unique_ptr Build(const ProgramDesc &program) const override; + int GetVarDeviceID(const std::string &varname) const; private: void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op, - size_t place_id) const; + size_t device_id) const; private: std::string loss_var_name_; @@ -96,21 +97,23 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::string &og, std::unordered_set *og_has_been_broadcast) const; - int GetOpDeviceID( - const std::vector> &var_name_on_devices, - const OpDesc &op) const; + int GetOpDeviceID(const OpDesc &op) const; void InsertAllReduceOp(SSAGraph *result, const std::string &og) const; void CreateBroadcastOp(SSAGraph *result, const std::string &p_name, size_t src_dev_id) const; - bool IsSparseGradient( - const std::unordered_map &all_vars, - const std::string &og) const; + bool IsSparseGradient(const std::string &og) const; + + size_t GetAppropriateDeviceID( + const std::vector &var_names) const; private: BuildStrategy strategy_; + mutable std::unordered_map all_vars_; + mutable std::unordered_map var_name_on_devices_; + mutable std::vector balance_vars_; void SetCommunicationContext(OpHandleBase *op_handle, const platform::Place &p) const; diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h index 5fc12a44b51fae26e5a8f5fdba952d3879e82d0f..9eb23c46264f9036f009b0ae9aeeb34ec70c0e53 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ b/paddle/fluid/framework/details/ssa_graph_builder.h @@ -30,6 +30,7 @@ class SSAGraphBuilder { SSAGraphBuilder() {} virtual ~SSAGraphBuilder() {} virtual std::unique_ptr Build(const ProgramDesc &program) const = 0; + virtual int GetVarDeviceID(const std::string &var_name) const { return -1; } DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 6c5098ce85b784a3edcf8f48d2cc828aabd8e161..b1706eb12d080364d04108c7ef4da31e1e7c1deb 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -96,6 +96,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( auto cur_ready_vars = ready_vars.PopAll(1, &timeout); if (timeout) { + std::lock_guard l(exception_mu_); if (exception_) { auto exp = *exception_; exception_.reset(); @@ -199,6 +200,7 @@ void ThreadedSSAGraphExecutor::RunOp( ready_var_q->Extend(op->Outputs()); VLOG(10) << op << " " << op->Name() << "Signal posted"; } catch (platform::EnforceNotMet ex) { + std::lock_guard l(exception_mu_); exception_.reset(new platform::EnforceNotMet(ex)); } catch (...) { LOG(FATAL) << "Unknown exception catched"; diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 4a2075f1cccb3211316567197da56c01d26f35ce..90430be996758364387b552019762d9c2e9dfe45 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -56,6 +56,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { std::vector local_scopes_; std::vector places_; platform::DeviceContextPool fetch_ctxs_; + std::mutex exception_mu_; std::unique_ptr exception_; std::atomic running_ops_; diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index b30a9806eb19ee12d2a70afe3ca806224b0f75d6..ae98fccc9600a2a75f12fa516c982bec0ef13f9f 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -21,7 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" #ifdef PADDLE_WITH_DISTRIBUTE -#include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/operators/distributed/grpc_client.h" #endif #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" @@ -49,8 +49,8 @@ Executor::Executor(const platform::Place& place) : place_(place) {} #ifdef PADDLE_WITH_DISTRIBUTE void Executor::Complete() { - ::paddle::operators::detail::RPCClient::GetInstance< - ::paddle::operators::detail::GRPCClient>() + ::paddle::operators::distributed::RPCClient::GetInstance< + ::paddle::operators::distributed::GRPCClient>() ->SendComplete(); } #endif @@ -321,7 +321,8 @@ std::vector> Executor::Prepare( } void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, - bool create_local_scope, bool create_vars) { + bool create_local_scope, bool create_vars, + bool keep_kids) { Scope* local_scope = scope; if (create_vars) { if (create_local_scope) { @@ -344,12 +345,20 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } } platform::DeviceContextPool::Instance().Get(place_)->Wait(); - if (create_vars && create_local_scope) { + if (local_scope != scope) { scope->DeleteScope(local_scope); } else { - // Delete the local scopes created in operators. - scope->DropKids(); + if (!keep_kids) { + // By default, we should delete all kid scopes after run executor because + // some operators may create local scope when running, such as while_op. + // But when while_op also create a local executor to run it's sub block, + // the sub scopes it created should not be dropped immediately, because + // while_grad_op will use some variables created during while_op run, so + // we need to keep the kids and wait for the outer executor to drop them. + scope->DropKids(); + } } + if (FLAGS_benchmark) { VLOG(2) << "-------------------------------------------------------"; VLOG(2) << "Memory used after deleting local scope: " diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 67a0761dac2a9adcdd0ce2b218c4aa505d688d56..3aa5ffef69cd29681f248e915575c5715ad0d3fa 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -78,7 +78,7 @@ class Executor { void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, bool create_local_scope = true, - bool create_vars = true); + bool create_vars = true, bool keep_kids = false); void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, std::map* feed_targets, diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 9406c6155da860c90739bddac1e81403b094e619..d478865fa8f24c653a4185cabd05747a5410ceaa 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -110,7 +110,6 @@ ParallelExecutor::ParallelExecutor( // Step 3. Convert main_program to SSA form and dependency graph. Also, insert // ncclOp - details::SSAGraphBuilderFactory builder_factory( member_->places_, loss_var_name, params, member_->local_scopes_, build_strategy); @@ -122,9 +121,10 @@ ParallelExecutor::ParallelExecutor( #endif } + builder_ = std::move(builder_factory.Create()); member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, places, - builder_factory.Create()->Build(main_program))); + builder_->Build(main_program))); member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( exec_strategy, member_->local_scopes_, std::move(var_infos), @@ -133,10 +133,22 @@ ParallelExecutor::ParallelExecutor( void ParallelExecutor::BCastParamsToGPUs( const std::unordered_set &vars) const { - auto *main_scope = member_->local_scopes_[0]; + // the the initialize bcast, all vars would be bcast from device(0), otherwise + // bcast from the specified device. + bool initialize = builder_.get() == nullptr ? true : false; for (auto &var : vars) { - auto *main_var = main_scope->FindVar(var); + int var_dev_id = + builder_.get() == nullptr ? -1 : builder_->GetVarDeviceID(var); + if (!initialize && var_dev_id == -1) continue; + + framework::Variable *main_var = nullptr; + if (initialize) { + main_var = member_->local_scopes_[0]->FindVar(var); + } else { + main_var = member_->local_scopes_[var_dev_id]->FindVar(var); + } + if (main_var == nullptr || !main_var->IsType()) { continue; } @@ -151,7 +163,8 @@ void ParallelExecutor::BCastParamsToGPUs( for (size_t i = 0; i < member_->places_.size(); ++i) { auto place = member_->places_[i]; void *buffer; - if (i == 0) { + + if ((initialize && i == 0) || (!initialize && i == var_dev_id)) { buffer = const_cast(main_tensor.data()); } else { auto local_scope = member_->local_scopes_[i]; diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 5247e790649e76567f4527d54499d6e95dac5c27..058f83f07c26224e3180d140630c08a24c40cd80 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -19,12 +19,14 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/details/execution_strategy.h" +#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" + namespace paddle { namespace framework { @@ -68,6 +70,7 @@ class ParallelExecutor { private: ParallelExecutorPrivate *member_; + std::unique_ptr builder_; }; } // namespace framework diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc index b75df33b71311acd0e626e5a13c18469b19ef136..c7f40d43c922a328febd343cea7240fcb09f3d02 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc @@ -27,7 +27,7 @@ void TensorRTSubGraphPass::Run(DataFlowGraph *graph) { SubGraphFuse(graph, node_inside_subgraph_teller_); } -} // analysis -} // inference +} // namespace analysis +} // namespace inference -} // paddle +} // namespace paddle diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 8c08ae34306744f86d71e370fad47233c61ac8f8..4c338c67d34fa229de17019ce97e8b8dc39ea737 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -184,9 +184,9 @@ else() set(DEPS_OPS ${DEPS_OPS} nccl_op) endif() -add_subdirectory(detail) if(WITH_DISTRIBUTE) - + add_subdirectory(distributed) + set(DISTRIBUTE_DEPS "") if(WITH_GRPC) set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) @@ -195,20 +195,11 @@ if(WITH_DISTRIBUTE) endif() set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - op_library(prefetch_op DEPS ${DISTRIBUTE_DEPS}) - set_source_files_properties(prefetch_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - op_library(checkpoint_notify_op DEPS ${DISTRIBUTE_DEPS}) - set_source_files_properties(checkpoint_notify_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - op_library(recv_op DEPS ${DISTRIBUTE_DEPS}) - set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS}) - set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - op_library(send_op DEPS ${DISTRIBUTE_DEPS}) - set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS}) - op_library(fetch_barrier_op DEPS ${DISTRIBUTE_DEPS}) - set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(fetch_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + foreach(dist_op "prefetch_op" "checkpoint_notify_op" "listen_and_serv_op" "send_op" "recv_op" "send_barrier_op" "fetch_barrier_op") + op_library(${dist_op} DEPS ${DISTRIBUTE_DEPS}) + set_source_files_properties(${dist_op}.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + endforeach() + #set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) #cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op # listen_and_serv_op sum_op executor SERIAL) diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/batch_norm_mkldnn_op.cc index 8206cc9890160da756efb13c991020f09b20126a..cc158e57f7140c84f02bc7e091d8eac0d2b672e1 100644 --- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc @@ -21,8 +21,6 @@ namespace operators { using batch_norm_bwd = mkldnn::batch_normalization_backward; using batch_norm_fwd = mkldnn::batch_normalization_forward; -using framework::DataLayout; -using framework::Tensor; using mkldnn::memory; using mkldnn::primitive; using mkldnn::reorder; @@ -31,18 +29,6 @@ using paddle::platform::MKLDNNDeviceContext; using paddle::platform::MKLDNNMemDesc; using platform::to_void_cast; -template -using EigenArrayMap = - Eigen::Map>; -template -using ConstEigenArrayMap = - Eigen::Map>; -template -using EigenVectorArrayMap = Eigen::Map>; -template -using ConstEigenVectorArrayMap = - Eigen::Map>; - namespace { template struct bn_type_traits { diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 625ca2d7c4c70d1098b0fb28380d8d1eb24cb338..52b0bf85c07fee380f9e7ba1c703b56367628644 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -22,22 +22,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using DataLayout = framework::DataLayout; - -template -using EigenArrayMap = - Eigen::Map>; -template -using ConstEigenArrayMap = - Eigen::Map>; -template -using EigenVectorArrayMap = Eigen::Map>; -template -using ConstEigenVectorArrayMap = - Eigen::Map>; - class BatchNormOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h index 9e5fc41598f29336074335f3624a2300ad018d09..5e3d630d6889e445c5e84fa836d2d81bb7266779 100644 --- a/paddle/fluid/operators/batch_norm_op.h +++ b/paddle/fluid/operators/batch_norm_op.h @@ -19,6 +19,22 @@ limitations under the License. */ namespace paddle { namespace operators { +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DataLayout = framework::DataLayout; + +template +using EigenArrayMap = + Eigen::Map>; +template +using ConstEigenArrayMap = + Eigen::Map>; +template +using EigenVectorArrayMap = Eigen::Map>; +template +using ConstEigenVectorArrayMap = + Eigen::Map>; + template class BatchNormKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/bilinear_interp_op.cc b/paddle/fluid/operators/bilinear_interp_op.cc index 2572e813d656353a2187c29da89266733a32f3ce..2dc3399da183fbcf7664066f6f7ce12db3dc6d5e 100644 --- a/paddle/fluid/operators/bilinear_interp_op.cc +++ b/paddle/fluid/operators/bilinear_interp_op.cc @@ -110,6 +110,7 @@ REGISTER_OPERATOR(bilinear_interp, ops::BilinearInterpOp, ops::BilinearInterpOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(bilinear_interp_grad, ops::BilinearInterpOpGrad); -REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::BilinearInterpKernel); +REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::BilinearInterpKernel, + ops::BilinearInterpKernel); REGISTER_OP_CPU_KERNEL(bilinear_interp_grad, ops::BilinearInterpGradKernel); diff --git a/paddle/fluid/operators/bilinear_interp_op.h b/paddle/fluid/operators/bilinear_interp_op.h index 8b03cd5a0635584a45782fe5a4823c37fe4fa8e8..70847cb8c1abe2e94bc844ab8117d1f23fea533b 100644 --- a/paddle/fluid/operators/bilinear_interp_op.h +++ b/paddle/fluid/operators/bilinear_interp_op.h @@ -46,8 +46,10 @@ class BilinearInterpKernel : public framework::OpKernel { int in_chw = channels * in_hw; int out_chw = channels * out_hw; - T ratio_h = (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; - T ratio_w = (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; + float ratio_h = + (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; + float ratio_w = + (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; if (in_h == out_h && in_w == out_w) { memcpy(output, input, input_t->numel() * sizeof(T)); @@ -56,24 +58,24 @@ class BilinearInterpKernel : public framework::OpKernel { for (int i = 0; i < out_h; ++i) { // loop for images int h = ratio_h * i; int hid = (h < in_h - 1) ? 1 : 0; - T h1lambda = ratio_h * i - h; - T h2lambda = 1 - h1lambda; + float h1lambda = ratio_h * i - h; + float h2lambda = 1.f - h1lambda; for (int j = 0; j < out_w; ++j) { int w = ratio_w * j; int wid = (w < in_w - 1) ? 1 : 0; - T w1lambda = ratio_w * j - w; - T w2lambda = 1 - w1lambda; + float w1lambda = ratio_w * j - w; + float w2lambda = 1.f - w1lambda; // calculate four position for bilinear interpolation const T* in_pos = &input[k * in_chw + h * in_w + w]; T* out_pos = &output[k * out_chw + i * out_w + j]; for (int c = 0; c < channels; ++c) { // loop for channels // bilinear interpolation - out_pos[0] = + out_pos[0] = static_cast( h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[wid]) + h1lambda * (w2lambda * in_pos[hid * in_w] + - w1lambda * in_pos[hid * in_w + wid]); + w1lambda * in_pos[hid * in_w + wid])); in_pos += in_hw; out_pos += out_hw; } @@ -117,8 +119,10 @@ class BilinearInterpGradKernel : public framework::OpKernel { int in_chw = channels * in_hw; int out_chw = channels * out_hw; - T ratio_h = (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; - T ratio_w = (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; + float ratio_h = + (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; + float ratio_w = + (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; if (in_h == out_h && in_w == out_w) { memcpy(d_input, d_output, d_input_t->numel() * sizeof(T)); @@ -127,22 +131,24 @@ class BilinearInterpGradKernel : public framework::OpKernel { for (int i = 0; i < out_h; ++i) { // loop for images int h = ratio_h * i; int hid = (h < in_h - 1) ? 1 : 0; - T h1lambda = ratio_h * i - h; - T h2lambda = 1 - h1lambda; + float h1lambda = ratio_h * i - h; + float h2lambda = 1 - h1lambda; for (int j = 0; j < out_w; ++j) { int w = ratio_w * j; int wid = (w < in_w - 1) ? 1 : 0; - T w1lambda = ratio_w * j - w; - T w2lambda = 1 - w1lambda; + float w1lambda = ratio_w * j - w; + float w2lambda = 1 - w1lambda; T* in_pos = &d_input[k * in_chw + h * in_w + w]; const T* out_pos = &d_output[k * out_chw + i * out_w + j]; for (int c = 0; c < channels; ++c) { // loop for channels - in_pos[0] += h2lambda * w2lambda * out_pos[0]; - in_pos[wid] += h2lambda * w1lambda * out_pos[0]; - in_pos[hid * in_w] += h1lambda * w2lambda * out_pos[0]; - in_pos[hid * in_w + wid] += h1lambda * w1lambda * out_pos[0]; + in_pos[0] += static_cast(h2lambda * w2lambda * out_pos[0]); + in_pos[wid] += static_cast(h2lambda * w1lambda * out_pos[0]); + in_pos[hid * in_w] += + static_cast(h1lambda * w2lambda * out_pos[0]); + in_pos[hid * in_w + wid] += + static_cast(h1lambda * w1lambda * out_pos[0]); in_pos += in_hw; out_pos += out_hw; } diff --git a/paddle/fluid/operators/detail/macros.h b/paddle/fluid/operators/detail/macros.h index da1de72dad00db3ffe609e17bd198ef0a56bbfcd..b9e385994efcea0388756e8bd780ebfc719ed08d 100644 --- a/paddle/fluid/operators/detail/macros.h +++ b/paddle/fluid/operators/detail/macros.h @@ -15,13 +15,13 @@ #pragma once #ifdef PADDLE_WITH_GRPC -#include "paddle/fluid/operators/detail/grpc_client.h" -#include "paddle/fluid/operators/detail/grpc_server.h" -#define RPCSERVER_T detail::AsyncGRPCServer -#define RPCCLIENT_T detail::GRPCClient +#include "paddle/fluid/operators/distributed/grpc_client.h" +#include "paddle/fluid/operators/distributed/grpc_server.h" +#define RPCSERVER_T distributed::AsyncGRPCServer +#define RPCCLIENT_T distributed::GRPCClient #else -#include "paddle/fluid/operators/detail/brpc_client.h" -#include "paddle/fluid/operators/detail/brpc_server.h" -#define RPCSERVER_T detail::AsyncBRPCServer -#define RPCCLIENT_T detail::BRPCClient +#include "paddle/fluid/operators/distributed/brpc_client.h" +#include "paddle/fluid/operators/distributed/brpc_server.h" +#define RPCSERVER_T distributed::AsyncBRPCServer +#define RPCCLIENT_T distributed::BRPCClient #endif diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt similarity index 97% rename from paddle/fluid/operators/detail/CMakeLists.txt rename to paddle/fluid/operators/distributed/CMakeLists.txt index abc5aad0430e71928a441c9488dda16dfdd63b9c..312f80e09077f21a47985c1c936c2ac41c292ead 100644 --- a/paddle/fluid/operators/detail/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -1,8 +1,3 @@ -if(NOT WITH_DISTRIBUTE) - return() -endif() - - if(WITH_GRPC) grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor diff --git a/paddle/fluid/operators/detail/brpc_client.cc b/paddle/fluid/operators/distributed/brpc_client.cc similarity index 98% rename from paddle/fluid/operators/detail/brpc_client.cc rename to paddle/fluid/operators/distributed/brpc_client.cc index 9a4e410f1d83e93883438fae116c38eb60787673..b394c678fb6503eb73a1e11e6feb814251e9e940 100644 --- a/paddle/fluid/operators/detail/brpc_client.cc +++ b/paddle/fluid/operators/distributed/brpc_client.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/detail/brpc_client.h" +#include "paddle/fluid/operators/distributed/brpc_client.h" #include "paddle/fluid/framework/threadpool.h" namespace paddle { namespace operators { -namespace detail { +namespace distributed { DEFINE_int32(brpc_channel_num, 24, "Number of channels to send requests connected to one server"); @@ -175,6 +175,6 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { return q; } -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/brpc_client.h b/paddle/fluid/operators/distributed/brpc_client.h similarity index 94% rename from paddle/fluid/operators/detail/brpc_client.h rename to paddle/fluid/operators/distributed/brpc_client.h index 1e953ea431d51a9586bfd0b352c7f27d079ff1a8..34f140687f91d866536f5e2b647c7445a6624736 100644 --- a/paddle/fluid/operators/detail/brpc_client.h +++ b/paddle/fluid/operators/distributed/brpc_client.h @@ -31,13 +31,13 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/detail/rpc_client.h" -#include "paddle/fluid/operators/detail/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN namespace paddle { namespace operators { -namespace detail { +namespace distributed { struct ChannelContext { brpc::Channel channel; @@ -95,6 +95,6 @@ class BRPCClient : public RPCClient { DISABLE_COPY_AND_ASSIGN(BRPCClient); }; -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/brpc_server.cc b/paddle/fluid/operators/distributed/brpc_server.cc similarity index 86% rename from paddle/fluid/operators/detail/brpc_server.cc rename to paddle/fluid/operators/distributed/brpc_server.cc index 2170abe679f9ededff3b53e3139e56f8aad227cb..862167f02084cfe81db1c0936bbfb0415fa85721 100644 --- a/paddle/fluid/operators/detail/brpc_server.cc +++ b/paddle/fluid/operators/distributed/brpc_server.cc @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/detail/brpc_server.h" -#include "paddle/fluid/operators/detail/request_handler.h" +#include "paddle/fluid/operators/distributed/brpc_server.h" +#include "paddle/fluid/operators/distributed/request_handler.h" namespace sendrecv { typedef std::unordered_map + paddle::operators::distributed::RequestHandler*> HandlerMap; class BRPCServiceImpl : public SendRecvService { @@ -27,17 +27,17 @@ class BRPCServiceImpl : public SendRecvService { : request_send_h_(nullptr), request_get_h_(nullptr), request_prefetch_h_(nullptr) { - auto it = rpc_call_map.find(paddle::operators::detail::kRequestSend); + auto it = rpc_call_map.find(paddle::operators::distributed::kRequestSend); if (it != rpc_call_map.end()) { request_send_h_ = it->second; } - it = rpc_call_map.find(paddle::operators::detail::kRequestSend); + it = rpc_call_map.find(paddle::operators::distributed::kRequestSend); if (it != rpc_call_map.end()) { request_get_h_ = it->second; } - it = rpc_call_map.find(paddle::operators::detail::kRequestPrefetch); + it = rpc_call_map.find(paddle::operators::distributed::kRequestPrefetch); if (it != rpc_call_map.end()) { request_prefetch_h_ = it->second; } @@ -88,15 +88,15 @@ class BRPCServiceImpl : public SendRecvService { } private: - paddle::operators::detail::RequestHandler* request_send_h_; - paddle::operators::detail::RequestHandler* request_get_h_; - paddle::operators::detail::RequestHandler* request_prefetch_h_; + paddle::operators::distributed::RequestHandler* request_send_h_; + paddle::operators::distributed::RequestHandler* request_get_h_; + paddle::operators::distributed::RequestHandler* request_prefetch_h_; }; } // namespace sendrecv namespace paddle { namespace operators { -namespace detail { +namespace distributed { void AsyncBRPCServer::StartServer() { // Instance of your service. @@ -139,6 +139,6 @@ void AsyncBRPCServer::WaitServerReady() { VLOG(3) << "AsyncGRPCServer WaitSeverReady"; } -}; // namespace detail +}; // namespace distributed }; // namespace operators }; // namespace paddle diff --git a/paddle/fluid/operators/detail/brpc_server.h b/paddle/fluid/operators/distributed/brpc_server.h similarity index 88% rename from paddle/fluid/operators/detail/brpc_server.h rename to paddle/fluid/operators/distributed/brpc_server.h index 0105c8074a46849031d8fa9c21a5507a982ec3c3..85a7ad0dfe843dad483d43631b69a79d75211ce9 100644 --- a/paddle/fluid/operators/detail/brpc_server.h +++ b/paddle/fluid/operators/distributed/brpc_server.h @@ -19,12 +19,12 @@ limitations under the License. */ #include #include "brpc/server.h" -#include "paddle/fluid/operators/detail/rpc_server.h" -#include "paddle/fluid/operators/detail/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/rpc_server.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" namespace paddle { namespace operators { -namespace detail { +namespace distributed { class AsyncBRPCServer final : public RPCServer { public: @@ -48,6 +48,6 @@ class AsyncBRPCServer final : public RPCServer { int ready_; }; -}; // namespace detail +}; // namespace distributed }; // namespace operators }; // namespace paddle diff --git a/paddle/fluid/operators/detail/bytebuffer_stream.cc b/paddle/fluid/operators/distributed/bytebuffer_stream.cc similarity index 94% rename from paddle/fluid/operators/detail/bytebuffer_stream.cc rename to paddle/fluid/operators/distributed/bytebuffer_stream.cc index a14171563edb0ac9a22b7ae493c965de3efb7823..6e91b447db838c9095432eda22e9e1171e938d31 100644 --- a/paddle/fluid/operators/detail/bytebuffer_stream.cc +++ b/paddle/fluid/operators/distributed/bytebuffer_stream.cc @@ -17,11 +17,11 @@ limitations under the License. */ // file and did some modifications so that we can send gRPC // requests without too much copying of the tensor data. -#include "paddle/fluid/operators/detail/bytebuffer_stream.h" +#include "paddle/fluid/operators/distributed/bytebuffer_stream.h" namespace paddle { namespace operators { -namespace detail { +namespace distributed { GrpcByteBufferSource::GrpcByteBufferSource() {} @@ -83,6 +83,6 @@ google::protobuf::int64 GrpcByteBufferSource::ByteCount() const { return byte_count_; } -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/bytebuffer_stream.h b/paddle/fluid/operators/distributed/bytebuffer_stream.h similarity index 99% rename from paddle/fluid/operators/detail/bytebuffer_stream.h rename to paddle/fluid/operators/distributed/bytebuffer_stream.h index 054dd4ff294414cca55d7e033f2c5403bbb85526..e7de172c79c30761483b5d96f5bad19860208832 100644 --- a/paddle/fluid/operators/detail/bytebuffer_stream.h +++ b/paddle/fluid/operators/distributed/bytebuffer_stream.h @@ -106,7 +106,7 @@ class GrpcBufferReader final namespace paddle { namespace operators { -namespace detail { +namespace distributed { // Source provides a way for a particular RPC implementation to provide // received data to ParseFrom. class Source { @@ -183,6 +183,6 @@ class GrpcByteSource : public Source { char space_[sizeof(Reader)]; }; -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc similarity index 98% rename from paddle/fluid/operators/detail/grpc_client.cc rename to paddle/fluid/operators/distributed/grpc_client.cc index c2e58c65ab18c66dfde229242022d294c0c12742..b6ff54a3e46f2d2eab4166094b29132581ed3257 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -12,19 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/operators/distributed/grpc_client.h" #include #include #include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/detail/request_handler.h" +#include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { -namespace detail { +namespace distributed { void GRPCClient::InitImpl() { InitEventLoop(); } @@ -293,6 +293,6 @@ std::shared_ptr GRPCClient::GetChannel(const std::string& ep) { return ch; } -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h similarity index 97% rename from paddle/fluid/operators/detail/grpc_client.h rename to paddle/fluid/operators/distributed/grpc_client.h index 0c54ec0efefcc889440a257d59f6ab123ab8b06a..ae7b5e068bfc253eadd48cf2df3f070060e18b35 100644 --- a/paddle/fluid/operators/detail/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc_client.h @@ -38,13 +38,13 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/detail/rpc_client.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN namespace paddle { namespace operators { -namespace detail { +namespace distributed { struct VarHandle { std::string ep; @@ -244,6 +244,6 @@ class GRPCClient : public RPCClient { DISABLE_COPY_AND_ASSIGN(GRPCClient); }; -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_serde_test.cc b/paddle/fluid/operators/distributed/grpc_serde_test.cc similarity index 93% rename from paddle/fluid/operators/detail/grpc_serde_test.cc rename to paddle/fluid/operators/distributed/grpc_serde_test.cc index 15892295e6901fe649788c9e34604008fc8cbdfa..3d107b533bcb7bfef3f9b13ec99afbd579a62e52 100644 --- a/paddle/fluid/operators/detail/grpc_serde_test.cc +++ b/paddle/fluid/operators/distributed/grpc_serde_test.cc @@ -21,8 +21,8 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" -#include "paddle/fluid/operators/detail/variable_response.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/printf.h" @@ -50,7 +50,7 @@ void RunSerdeTestSelectedRows(platform::Place place) { for (int i = 0; i < 564; ++i) rows->push_back(i); ::grpc::ByteBuffer msg; - operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg); + operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg); EXPECT_GT(msg.Length(), static_cast(0)); // deserialize @@ -81,10 +81,10 @@ void RunSerdeTestSelectedRows(platform::Place place) { // deserialize zero-copy // framework::Variable var2; - // operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2); + // operators::distributed::DeserializeFromByteBuffer(msg, ctx, &var2); framework::Scope scope; scope.Var("myvar"); - operators::detail::VariableResponse resp(&scope, &ctx); + operators::distributed::VariableResponse resp(&scope, &ctx); EXPECT_EQ(resp.Parse(msg), 0); framework::Variable* var2 = resp.GetVar(); @@ -128,7 +128,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) { math::set_constant(ctx, tensor, 31.9); ::grpc::ByteBuffer msg; - operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg); + operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg); EXPECT_GT(msg.Length(), static_cast(0)); // deserialize @@ -171,7 +171,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) { // deserialize zero-copy framework::Scope scope; scope.Var("myvar"); - operators::detail::VariableResponse resp(&scope, &ctx); + operators::distributed::VariableResponse resp(&scope, &ctx); if (from_type == 0) { EXPECT_EQ(resp.Parse(msg), 0); } else { diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc similarity index 97% rename from paddle/fluid/operators/detail/grpc_server.cc rename to paddle/fluid/operators/distributed/grpc_server.cc index 9f4971dc12cfb56afc18d8f09cc8266842894f2c..b0f27042504248e47b5af38f86c02a5f0765fd63 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc_server.cc @@ -15,13 +15,13 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/detail/grpc_server.h" +#include "paddle/fluid/operators/distributed/grpc_server.h" using ::grpc::ServerAsyncResponseWriter; namespace paddle { namespace operators { -namespace detail { +namespace distributed { enum CallStatus { PROCESS = 0, FINISH }; // reference: @@ -74,7 +74,7 @@ class RequestSend final : public RequestBase { request_.reset(new VariableResponse(request_handler->scope(), request_handler->dev_ctx(), !request_handler->sync_mode())); - int method_id = static_cast(detail::GrpcMethod::kSendVariable); + int method_id = static_cast(distributed::GrpcMethod::kSendVariable); service_->RequestAsyncUnary( method_id, &ctx_, request_.get(), &responder_, cq_, cq_, reinterpret_cast(static_cast(req_id))); @@ -106,7 +106,7 @@ class RequestGet final : public RequestBase { ::grpc::ServerCompletionQueue* cq, RequestHandler* request_handler, int req_id) : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - auto method_id = static_cast(detail::GrpcMethod::kGetVariable); + auto method_id = static_cast(distributed::GrpcMethod::kGetVariable); service_->RequestAsyncUnary( method_id, &ctx_, &request_, &responder_, cq_, cq_, reinterpret_cast(static_cast(req_id))); @@ -150,7 +150,8 @@ class RequestPrefetch final : public RequestBase { local_scope_(nullptr) { request_.reset(new VariableResponse(request_handler->scope(), request_handler->dev_ctx(), true)); - int method_id = static_cast(detail::GrpcMethod::kPrefetchVariable); + int method_id = + static_cast(distributed::GrpcMethod::kPrefetchVariable); service_->RequestAsyncUnary( method_id, &ctx_, request_.get(), &responder_, cq_, cq_, reinterpret_cast(static_cast(req_id))); @@ -399,6 +400,6 @@ void AsyncGRPCServer::HandleRequest( } } -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/distributed/grpc_server.h similarity index 85% rename from paddle/fluid/operators/detail/grpc_server.h rename to paddle/fluid/operators/distributed/grpc_server.h index f1db7590f6f14d5d44acc12453861a446e278cd2..d2524f5e65db6dedab78f45e17380359b58a3d11 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/distributed/grpc_server.h @@ -29,17 +29,17 @@ limitations under the License. */ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/detail/grpc_service.h" -#include "paddle/fluid/operators/detail/request_handler.h" -#include "paddle/fluid/operators/detail/rpc_server.h" -#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/detail/send_recv.pb.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/grpc_service.h" +#include "paddle/fluid/operators/distributed/request_handler.h" +#include "paddle/fluid/operators/distributed/rpc_server.h" +#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { -namespace detail { +namespace distributed { class RequestBase; @@ -84,6 +84,6 @@ class AsyncGRPCServer final : public RPCServer { std::map> rpc_reqs_; }; -}; // namespace detail +}; // namespace distributed }; // namespace operators }; // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_service.h b/paddle/fluid/operators/distributed/grpc_service.h similarity index 87% rename from paddle/fluid/operators/detail/grpc_service.h rename to paddle/fluid/operators/distributed/grpc_service.h index cb745e125a9a5a6755dd98d46ee7aa6ed16da2ea..cdc4e7b79276d6aac55aeac8ac121ca28d2cc1f0 100644 --- a/paddle/fluid/operators/detail/grpc_service.h +++ b/paddle/fluid/operators/distributed/grpc_service.h @@ -23,7 +23,7 @@ #include #include #include -#include "paddle/fluid/operators/detail/variable_response.h" +#include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/platform/profiler.h" @@ -42,24 +42,25 @@ class ServerContext; // Support parsing/unparsing of tensorflow::VariableResponse. // Wire-format is identical to RecvVariableResponse. template <> -class SerializationTraits { +class SerializationTraits { public: static Status Serialize( - const paddle::operators::detail::VariableResponse& msg, + const paddle::operators::distributed::VariableResponse& msg, grpc_byte_buffer** bp, bool* own_buffer) { PADDLE_ENFORCE(false, "SerializationTraits::Serialize not implemented!"); return Status(); } - static Status Deserialize(grpc_byte_buffer* buffer, - paddle::operators::detail::VariableResponse* msg, - int max_message_size = INT_MAX) { + static Status Deserialize( + grpc_byte_buffer* buffer, + paddle::operators::distributed::VariableResponse* msg, + int max_message_size = INT_MAX) { if (buffer == nullptr) { return Status(StatusCode::INTERNAL, "No payload"); } Status result = g_core_codegen_interface->ok(); if (result.ok()) { - paddle::operators::detail::GrpcByteSource source(buffer); + paddle::operators::distributed::GrpcByteSource source(buffer); int ret = msg->Parse(&source); if (ret != 0) { result = Status(StatusCode::INTERNAL, "VariableResponse parse error"); @@ -73,7 +74,7 @@ class SerializationTraits { namespace paddle { namespace operators { -namespace detail { +namespace distributed { enum class GrpcMethod { kSendVariable, @@ -121,6 +122,6 @@ class GrpcService final { }; }; -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/proto_encoder_helper.h b/paddle/fluid/operators/distributed/proto_encoder_helper.h similarity index 98% rename from paddle/fluid/operators/detail/proto_encoder_helper.h rename to paddle/fluid/operators/distributed/proto_encoder_helper.h index d91d054b2507f32d1e948dde33da06a70cabe775..2fab02e32fe18ee04f86a69bb5bae1cbe7c6762c 100644 --- a/paddle/fluid/operators/detail/proto_encoder_helper.h +++ b/paddle/fluid/operators/distributed/proto_encoder_helper.h @@ -26,7 +26,7 @@ limitations under the License. */ namespace paddle { namespace operators { -namespace detail { +namespace distributed { char* EncodeVarint32(char* dst, uint32_t v) { // Operate on characters as unsigneds @@ -144,6 +144,6 @@ class ProtoEncodeHelper { char* limit_; // Just for CHECKs }; -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h similarity index 98% rename from paddle/fluid/operators/detail/request_handler.h rename to paddle/fluid/operators/distributed/request_handler.h index 387a6b119030c8d7a629db9850adcb9b826b35f4..90742a201ad46447d6fbbe2137aa40fabc2f9983 100644 --- a/paddle/fluid/operators/detail/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -31,7 +31,7 @@ namespace paddle { namespace operators { -namespace detail { +namespace distributed { constexpr char kRequestSend[] = "RequestSend"; constexpr char kRequestGet[] = "RequestGet"; @@ -135,6 +135,6 @@ class RequestHandler { RPCServer* rpc_server_; }; -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc similarity index 96% rename from paddle/fluid/operators/detail/request_handler_impl.cc rename to paddle/fluid/operators/distributed/request_handler_impl.cc index 859f6a75781cd325f3efeee59d9a1ae6a30dff88..b6e4e156080a67735d5cb2cbd6194bb4e52ea32d 100644 --- a/paddle/fluid/operators/detail/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -20,13 +20,13 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/detail/request_handler_impl.h" -#include "paddle/fluid/operators/detail/rpc_server.h" +#include "paddle/fluid/operators/distributed/request_handler_impl.h" +#include "paddle/fluid/operators/distributed/rpc_server.h" #include "paddle/fluid/string/printf.h" namespace paddle { namespace operators { -namespace detail { +namespace distributed { bool RequestSendHandler::Handle(const std::string& varname, framework::Scope* scope, @@ -138,6 +138,6 @@ bool RequestCheckpointHandler::Handle(const std::string& varname, return true; } -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h similarity index 96% rename from paddle/fluid/operators/detail/request_handler_impl.h rename to paddle/fluid/operators/distributed/request_handler_impl.h index 689c6893cf86b4d9bc27572e5267ff6fdb1dc6d3..87185500f2ffc3a8578eea339cc7a1e2b0e46631 100644 --- a/paddle/fluid/operators/detail/request_handler_impl.h +++ b/paddle/fluid/operators/distributed/request_handler_impl.h @@ -28,11 +28,11 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/detail/request_handler.h" +#include "paddle/fluid/operators/distributed/request_handler.h" namespace paddle { namespace operators { -namespace detail { +namespace distributed { class RequestSendHandler final : public RequestHandler { public: @@ -81,6 +81,6 @@ class RequestCheckpointHandler final : public RequestHandler { int checkpoint_notify_id; }; -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/rpc_client.cc b/paddle/fluid/operators/distributed/rpc_client.cc similarity index 88% rename from paddle/fluid/operators/detail/rpc_client.cc rename to paddle/fluid/operators/distributed/rpc_client.cc index 9a791403e3d6b99c5d4de5183e83e1af655d7d4c..c71edf977c18e554c502732e9bf4bb4ea99f8f99 100644 --- a/paddle/fluid/operators/detail/rpc_client.cc +++ b/paddle/fluid/operators/distributed/rpc_client.cc @@ -12,15 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/detail/rpc_client.h" +#include "paddle/fluid/operators/distributed/rpc_client.h" namespace paddle { namespace operators { -namespace detail { +namespace distributed { std::once_flag RPCClient::init_flag_; std::unique_ptr RPCClient::rpc_client_(nullptr); -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h similarity index 98% rename from paddle/fluid/operators/detail/rpc_client.h rename to paddle/fluid/operators/distributed/rpc_client.h index 71fdc864a611b05b93c3b54d5190934fddb7fd7a..b824da382625b2a584918c2f2b9bd90ab1c74035 100644 --- a/paddle/fluid/operators/detail/rpc_client.h +++ b/paddle/fluid/operators/distributed/rpc_client.h @@ -22,7 +22,7 @@ namespace paddle { namespace operators { -namespace detail { +namespace distributed { class RPCClient { public: @@ -88,6 +88,6 @@ class RPCClient { static std::once_flag init_flag_; static std::unique_ptr rpc_client_; }; -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc similarity index 96% rename from paddle/fluid/operators/detail/rpc_server.cc rename to paddle/fluid/operators/distributed/rpc_server.cc index cd0fe96e2301ee3304fe9a2967df58b9f7072d8d..fa0cb71b3056de92f65139c5402132fc8cbb7a87 100644 --- a/paddle/fluid/operators/detail/rpc_server.cc +++ b/paddle/fluid/operators/distributed/rpc_server.cc @@ -17,11 +17,11 @@ #include #include -#include "paddle/fluid/operators/detail/rpc_server.h" +#include "paddle/fluid/operators/distributed/rpc_server.h" namespace paddle { namespace operators { -namespace detail { +namespace distributed { void RPCServer::ShutDown() { LOG(INFO) << "RPCServer ShutDown "; @@ -112,6 +112,6 @@ void RPCServer::WaitCond(const std::string& rpc_name) { lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); }); } -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h similarity index 95% rename from paddle/fluid/operators/detail/rpc_server.h rename to paddle/fluid/operators/distributed/rpc_server.h index 2e3342428cb56c34abaca655d5906668cda8f140..cf25e78435bb470b25a46db647ca818571cc83a5 100644 --- a/paddle/fluid/operators/detail/rpc_server.h +++ b/paddle/fluid/operators/distributed/rpc_server.h @@ -19,11 +19,11 @@ #include // NOLINT #include #include -#include "paddle/fluid/operators/detail/request_handler.h" +#include "paddle/fluid/operators/distributed/request_handler.h" namespace paddle { namespace operators { -namespace detail { +namespace distributed { class RPCServer { public: @@ -86,6 +86,6 @@ class RPCServer { friend class RequestHandler; }; -}; // namespace detail +}; // namespace distributed }; // namespace operators }; // namespace paddle diff --git a/paddle/fluid/operators/detail/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc similarity index 87% rename from paddle/fluid/operators/detail/rpc_server_test.cc rename to paddle/fluid/operators/distributed/rpc_server_test.cc index 463a7b80cfac280de5afe91ee85caaaf074cef32..a0693cffabcc561b0adfafc2c49027a890dd5efc 100644 --- a/paddle/fluid/operators/detail/rpc_server_test.cc +++ b/paddle/fluid/operators/distributed/rpc_server_test.cc @@ -22,18 +22,18 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/detail/macros.h" -#include "paddle/fluid/operators/detail/request_handler_impl.h" -#include "paddle/fluid/operators/detail/rpc_client.h" -#include "paddle/fluid/operators/detail/rpc_server.h" +#include "paddle/fluid/operators/distributed/request_handler_impl.h" +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "paddle/fluid/operators/distributed/rpc_server.h" namespace framework = paddle::framework; namespace platform = paddle::platform; -namespace detail = paddle::operators::detail; +namespace distributed = paddle::operators::distributed; USE_OP(lookup_table); -std::unique_ptr g_rpc_service; -std::unique_ptr g_req_handler; +std::unique_ptr g_rpc_service; +std::unique_ptr g_req_handler; framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) { auto root_block = program->MutableBlock(0); @@ -113,19 +113,21 @@ void StartServer() { g_req_handler->SetScope(&scope); g_req_handler->SetExecutor(&exe); - g_rpc_service->RegisterRPC(detail::kRequestPrefetch, g_req_handler.get()); + g_rpc_service->RegisterRPC(distributed::kRequestPrefetch, + g_req_handler.get()); g_req_handler->SetRPCServer(g_rpc_service.get()); std::thread server_thread( - std::bind(&detail::RPCServer::StartServer, g_rpc_service.get())); + std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); server_thread.join(); } TEST(PREFETCH, CPU) { - g_req_handler.reset(new detail::RequestPrefetchHandler(true)); + g_req_handler.reset(new distributed::RequestPrefetchHandler(true)); g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); - detail::RPCClient* client = detail::RPCClient::GetInstance(); + distributed::RPCClient* client = + distributed::RPCClient::GetInstance(); std::thread server_thread(StartServer); g_rpc_service->WaitServerReady(); diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/distributed/send_recv.proto similarity index 100% rename from paddle/fluid/operators/detail/send_recv.proto rename to paddle/fluid/operators/distributed/send_recv.proto diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc similarity index 95% rename from paddle/fluid/operators/detail/sendrecvop_utils.cc rename to paddle/fluid/operators/distributed/sendrecvop_utils.cc index 507b465435609a91ebca97dd70b176c3b79bee02..98129d9f1014c39347e3409533f2bc10092611d2 100644 --- a/paddle/fluid/operators/detail/sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #ifdef PADDLE_WITH_CUDA #include @@ -23,14 +23,14 @@ limitations under the License. */ #include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/zero_copy_stream.h" #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/operators/detail/bytebuffer_stream.h" -#include "paddle/fluid/operators/detail/proto_encoder_helper.h" -#include "paddle/fluid/operators/detail/variable_response.h" +#include "paddle/fluid/operators/distributed/bytebuffer_stream.h" +#include "paddle/fluid/operators/distributed/proto_encoder_helper.h" +#include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { -namespace detail { +namespace distributed { using VarMsg = sendrecv::VariableMessage; @@ -222,11 +222,11 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, const platform::DeviceContext& ctx, const framework::Scope* scope, framework::Variable** var) { - operators::detail::VariableResponse resp(scope, &ctx); + operators::distributed::VariableResponse resp(scope, &ctx); PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!"); *var = resp.GetVar(); } -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h similarity index 92% rename from paddle/fluid/operators/detail/sendrecvop_utils.h rename to paddle/fluid/operators/distributed/sendrecvop_utils.h index bd16bf1dab8d933ffd18b6d6d9e3ce1c7d73029b..fe25e73fa608727ba0bb912a82776b330ec8d83a 100644 --- a/paddle/fluid/operators/detail/sendrecvop_utils.h +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h @@ -25,12 +25,12 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/detail/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" namespace paddle { namespace operators { -namespace detail { +namespace distributed { typedef void (*DestroyCallback)(void*); @@ -61,6 +61,6 @@ inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) { } } -} // namespace detail +} // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc similarity index 96% rename from paddle/fluid/operators/detail/variable_response.cc rename to paddle/fluid/operators/distributed/variable_response.cc index 24cb91a3bb820a0e5d51aaa49154434919080f69..619890b1939be8777b89b94e415a3c2d63376658 100644 --- a/paddle/fluid/operators/detail/variable_response.cc +++ b/paddle/fluid/operators/distributed/variable_response.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/detail/variable_response.h" +#include "paddle/fluid/operators/distributed/variable_response.h" #include #include @@ -22,12 +22,12 @@ #endif #include "paddle/fluid/platform/profiler.h" -#include "paddle/fluid/operators/detail/send_recv.pb.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" namespace paddle { namespace operators { -namespace detail { +namespace distributed { enum WireType { WIRETYPE_VARINT = 0, @@ -158,13 +158,13 @@ bool VariableResponse::CopySelectRowsTensorData( slr->set_height(meta_.slr_height()); auto* tensor = slr->mutable_value(); tensor->Resize(dims); - PADDLE_ENFORCE_EQ( - static_cast(tensor->numel()), - length / framework::SizeOfType( - paddle::operators::detail::ToTypeIndex(meta_.data_type()))); + PADDLE_ENFORCE_EQ(static_cast(tensor->numel()), + length / framework::SizeOfType( + paddle::operators::distributed::ToTypeIndex( + meta_.data_type()))); void* tensor_data = tensor->mutable_data( ctx.GetPlace(), - paddle::operators::detail::ToTypeIndex(meta_.data_type())); + paddle::operators::distributed::ToTypeIndex(meta_.data_type())); if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) { return false; @@ -480,6 +480,6 @@ int VariableResponse::Parse(Source* source) { return 0; } -}; // namespace detail +}; // namespace distributed }; // namespace operators }; // namespace paddle diff --git a/paddle/fluid/operators/detail/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h similarity index 92% rename from paddle/fluid/operators/detail/variable_response.h rename to paddle/fluid/operators/distributed/variable_response.h index 69cfd784f8dd4f129f50c6882061e53e8535b949..1db4a0a522654ff2497b8bd9ee1381b5ab64067a 100644 --- a/paddle/fluid/operators/detail/variable_response.h +++ b/paddle/fluid/operators/distributed/variable_response.h @@ -22,17 +22,17 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/detail/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/zero_copy_stream.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/detail/bytebuffer_stream.h" +#include "paddle/fluid/operators/distributed/bytebuffer_stream.h" namespace paddle { namespace operators { -namespace detail { +namespace distributed { class VariableResponse { public: @@ -99,6 +99,6 @@ class VariableResponse { sendrecv::VariableMessage meta_; }; -}; // namespace detail +}; // namespace distributed }; // namespace operators }; // namespace paddle diff --git a/paddle/fluid/operators/fetch_barrier_op.cc b/paddle/fluid/operators/fetch_barrier_op.cc index 98b051afb551f373009d2bd3df1a8daa64b7e6c7..02beb80fc8a9f451393dcdd54492c4f88f908497 100644 --- a/paddle/fluid/operators/fetch_barrier_op.cc +++ b/paddle/fluid/operators/fetch_barrier_op.cc @@ -42,8 +42,8 @@ class FetchBarrierOp : public framework::OperatorBase { // For profiling platform::RecordEvent record_event(Type(), &ctx); - detail::RPCClient* rpc_client = - detail::RPCClient::GetInstance(); + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance(); rpc_client->Wait(); diff --git a/paddle/fluid/operators/gen_nccl_id_op.cc b/paddle/fluid/operators/gen_nccl_id_op.cc index f824eee4e7d1ef19c9a38fd5d3369265f9c549a0..697c239e59d158428ae9ba9f7feded19637dff28 100644 --- a/paddle/fluid/operators/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/gen_nccl_id_op.cc @@ -22,7 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/operators/detail/macros.h" -#include "paddle/fluid/operators/detail/request_handler_impl.h" +#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/platform/nccl_helper.h" namespace paddle { @@ -60,7 +60,8 @@ class GenNCCLIdOp : public framework::OperatorBase { std::vector endpoint_list = Attr>("endpoint_list"); - detail::RPCClient* client = detail::RPCClient::GetInstance(); + distributed::RPCClient* client = + distributed::RPCClient::GetInstance(); for (auto& ep : endpoint_list) { VLOG(3) << "sending nccl id to " << ep; @@ -80,11 +81,11 @@ class GenNCCLIdOp : public framework::OperatorBase { // NOTE: Can not use unique_ptr here because the default // deleter will call GRPC Server's base class's dtor and // that will cause a wired crash. - detail::RequestSendHandler rpc_h(true); - std::unique_ptr rpc_service( + distributed::RequestSendHandler rpc_h(true); + std::unique_ptr rpc_service( new RPCSERVER_T(endpoint, 1)); - rpc_service->RegisterRPC(detail::kRequestSend, &rpc_h); + rpc_service->RegisterRPC(distributed::kRequestSend, &rpc_h); rpc_h.SetRPCServer(rpc_service.get()); framework::ProgramDesc empty_program; @@ -95,11 +96,11 @@ class GenNCCLIdOp : public framework::OperatorBase { rpc_h.SetExecutor(&executor); std::thread server_thread( - std::bind(&detail::RPCServer::StartServer, rpc_service.get())); + std::bind(&distributed::RPCServer::StartServer, rpc_service.get())); - rpc_service->SetCond(detail::kRequestSend); + rpc_service->SetCond(distributed::kRequestSend); VLOG(3) << "start getting nccl id from trainer 0..."; - rpc_service->WaitBarrier(detail::kRequestSend); + rpc_service->WaitBarrier(distributed::kRequestSend); VLOG(3) << "got nccl id and stop server..."; rpc_service->ShutDown(); VLOG(3) << "rpc server stopped"; diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 3d67b2d2ea2f2a908fffd0bc08cdc515c2d50316..420c4e9e4ed2ef0b4a3b917f013360ad5a5dfa5b 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -21,14 +21,14 @@ limitations under the License. */ #include "paddle/fluid/operators/detail/macros.h" -#include "paddle/fluid/operators/detail/request_handler_impl.h" +#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { -void RunServer(std::shared_ptr service) { +void RunServer(std::shared_ptr service) { service->StartServer(); VLOG(4) << "RunServer thread end"; } @@ -123,12 +123,12 @@ void ListenAndServOp::RunSyncLoop( while (true) { // Get from multiple trainers, we don't care about the order in which // the gradients arrives, just add suffix 0~n and merge the gradient. - rpc_service_->SetCond(detail::kRequestSend); - rpc_service_->WaitBarrier(detail::kRequestSend); + rpc_service_->SetCond(distributed::kRequestSend); + rpc_service_->WaitBarrier(distributed::kRequestSend); if (rpc_service_->IsExit()) { LOG(WARNING) << "get exit!rpc_processor break!"; - rpc_service_->SetCond(detail::kRequestGet); + rpc_service_->SetCond(distributed::kRequestGet); break; } @@ -156,11 +156,11 @@ void ListenAndServOp::RunSyncLoop( recv_scope); VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; - rpc_service_->SetCond(detail::kRequestGet); - rpc_service_->WaitBarrier(detail::kRequestGet); + rpc_service_->SetCond(distributed::kRequestGet); + rpc_service_->WaitBarrier(distributed::kRequestGet); rpc_service_->ResetBarrierCounter(); // reset received sparse vars to avoid reuse it in the next mini-batch - dynamic_cast(request_send_handler_.get()) + dynamic_cast(request_send_handler_.get()) ->ResetSparseVarRecorder(); } // while(true) } @@ -217,14 +217,14 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, } static void FillRequestCtx( - detail::RequestHandler *h, framework::Scope *scope, + distributed::RequestHandler *h, framework::Scope *scope, platform::DeviceContext *dev_ctx, framework::Executor *executor, framework::ProgramDesc *program, std::unordered_map> *prefetch_ctx, std::shared_ptr checkpoint_ctx, - detail::RPCServer *rpc_server) { + distributed::RPCServer *rpc_server) { h->SetScope(scope); h->SetDevCtx(dev_ctx); h->SetExecutor(executor); @@ -255,16 +255,18 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in)); - request_send_handler_.reset(new detail::RequestSendHandler(sync_mode)); - request_get_handler_.reset(new detail::RequestGetHandler(sync_mode)); + request_send_handler_.reset(new distributed::RequestSendHandler(sync_mode)); + request_get_handler_.reset(new distributed::RequestGetHandler(sync_mode)); request_prefetch_handler_.reset( - new detail::RequestPrefetchHandler(sync_mode)); - request_checkpoint_handler_.reset( - new detail::RequestCheckpointHandler(sync_mode, checkpoint_notify_id)); - - rpc_service_->RegisterRPC(detail::kRequestSend, request_send_handler_.get()); - rpc_service_->RegisterRPC(detail::kRequestGet, request_get_handler_.get()); - rpc_service_->RegisterRPC(detail::kRequestPrefetch, + new distributed::RequestPrefetchHandler(sync_mode)); + request_checkpoint_handler_.reset(new distributed::RequestCheckpointHandler( + sync_mode, checkpoint_notify_id)); + + rpc_service_->RegisterRPC(distributed::kRequestSend, + request_send_handler_.get()); + rpc_service_->RegisterRPC(distributed::kRequestGet, + request_get_handler_.get()); + rpc_service_->RegisterRPC(distributed::kRequestPrefetch, request_prefetch_handler_.get()); rpc_service_->RegisterRPC(detail::kRequestCheckpoint, request_checkpoint_handler_.get()); diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h index ca2dafb737a826cd1ac98987a2bf6b2cc910a03e..a3dc6fc6220357ce0db1d7b8106f8a4abe0f3be0 100644 --- a/paddle/fluid/operators/listen_and_serv_op.h +++ b/paddle/fluid/operators/listen_and_serv_op.h @@ -24,8 +24,8 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/detail/request_handler.h" -#include "paddle/fluid/operators/detail/rpc_server.h" +#include "paddle/fluid/operators/distributed/request_handler.h" +#include "paddle/fluid/operators/distributed/rpc_server.h" namespace paddle { namespace operators { @@ -34,7 +34,7 @@ constexpr char kOptimizeBlock[] = "OptimizeBlock"; constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id"; constexpr char kCheckpointBlockId[] = "checkpint_block_id"; -void RunServer(std::shared_ptr service); +void RunServer(std::shared_ptr service); class ListenAndServOp : public framework::OperatorBase { public: @@ -64,11 +64,13 @@ class ListenAndServOp : public framework::OperatorBase { const platform::Place& dev_place) const override; protected: - mutable std::shared_ptr rpc_service_; - mutable std::shared_ptr request_send_handler_; - mutable std::shared_ptr request_get_handler_; - mutable std::shared_ptr request_prefetch_handler_; - mutable std::shared_ptr request_checkpoint_handler_; + mutable std::shared_ptr rpc_service_; + mutable std::shared_ptr request_send_handler_; + mutable std::shared_ptr request_get_handler_; + mutable std::shared_ptr + request_prefetch_handler_; + mutable std::shared_ptr + request_checkpoint_handler_; mutable std::shared_ptr server_thread_; }; diff --git a/paddle/fluid/operators/logical_op.cc b/paddle/fluid/operators/logical_op.cc index db109f5cd053d84718ac85bd4693ecece12ce172..26970db8d2af62bb06fce4eb1a1f21fd41617bd1 100644 --- a/paddle/fluid/operators/logical_op.cc +++ b/paddle/fluid/operators/logical_op.cc @@ -146,6 +146,6 @@ REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$"); REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU, paddle::operators::LogicalNotFunctor); REGISTER_BINARY_LOGICAL_OP(logical_xor, - "$$Out = (X || Y) \\, \\&\\& \\, !(X \\&\\& Y)$$"); + "$$Out = (X || Y) \\&\\& !(X \\&\\& Y)$$"); REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU, paddle::operators::LogicalXorFunctor); diff --git a/paddle/fluid/operators/math/concat.cc b/paddle/fluid/operators/math/concat.cc index 14964fc62af6dd947e49a2054511780a1bb20cf2..55c8a472aca7fe700ef6a3f96bed1496d7b12b80 100644 --- a/paddle/fluid/operators/math/concat.cc +++ b/paddle/fluid/operators/math/concat.cc @@ -93,10 +93,10 @@ class ConcatGradFunctor { auto cpu_place = boost::get(context.GetPlace()); // computation - for (size_t k = 0; k < input_rows; ++k) { + for (int k = 0; k < input_rows; ++k) { const T* src_ptr = input.data() + k * input_cols; int col_idx = 0; - for (int j = 0; j < num; ++j) { + for (size_t j = 0; j < num; ++j) { int col_len = output_cols[j]; auto* out_tensor = outputs->at(j); if (out_tensor != nullptr) { diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index d39154c6f88d6d17c1719eb9a5b048211f4bb52b..c3387be6daa3bd34a6e3410ced23fce5d65f2cf7 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -30,6 +30,7 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; +template struct SetConstant; #define DEFINE_CPU_TRANS(RANK) \ template struct Transpose(); + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance(); for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc index 15dfb5469bf51330b98d6699fb3ce708222212ed..9854a31f5b10f5ecd940c0d41c2c3e468fc17bad 100644 --- a/paddle/fluid/operators/recv_op.cc +++ b/paddle/fluid/operators/recv_op.cc @@ -43,8 +43,8 @@ class RecvOp : public framework::OperatorBase { // For profiling platform::RecordEvent record_event(Type(), &ctx); - detail::RPCClient* rpc_client = - detail::RPCClient::GetInstance(); + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance(); for (size_t i = 0; i < outs.size(); i++) { VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc index c6c975a23ce846464388c72af5d8902144ceb16a..6b4572dcccc21e783f1df0b9bcde11d532ff4ba8 100644 --- a/paddle/fluid/operators/send_barrier_op.cc +++ b/paddle/fluid/operators/send_barrier_op.cc @@ -44,8 +44,8 @@ class SendBarrierOp : public framework::OperatorBase { // For profiling platform::RecordEvent record_event(Type(), &ctx); - detail::RPCClient* rpc_client = - detail::RPCClient::GetInstance(); + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance(); VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode; diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index 84ec36625314572d16e5c537884b6efec420cc60..0cac329aafa8c4c67cae48ba62a48575f5edba92 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -45,8 +45,8 @@ class SendOp : public framework::OperatorBase { // For profiling platform::RecordEvent record_event(Type(), &ctx); - detail::RPCClient* rpc_client = - detail::RPCClient::GetInstance(); + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance(); for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc index 0ea273af9d5a5c8f1ae112232a9187675031b360..647cfc0a0af2be85e2868c6f68cab962c6631a8d 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt_engine_op.cc @@ -14,11 +14,14 @@ #ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/operators/tensorrt_engine_op.h" +#include +#include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/utils/singleton.h" +#include "paddle/fluid/operators/tensorrt_engine_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index 8455d24ddf47382b235edda10cb9b2e8934c5f06..295d6ba0395b68cabab3bd4117cedd912df48f5d 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -16,10 +16,12 @@ #ifdef PADDLE_WITH_CUDA +#include +#include + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/tensorrt/engine.h" -#include "paddle/fluid/inference/tensorrt/engine.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc index 3a2fef48052ae3943abad14bf87c14ca79251c94..358e2d151bb8f990503ea8a51ba5f81e0a1dc816 100644 --- a/paddle/fluid/operators/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc @@ -179,7 +179,6 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { const std::string& z_name, bool x_created, const shape_t& x_shape, const shape_t& y_shape, const shape_t& z_shape) { - LOG(INFO) << "create fc op"; auto* fc = block_desc.AppendOp(); fc->SetType("mul"); diff --git a/paddle/fluid/operators/test_send_nccl_id.cc b/paddle/fluid/operators/test_send_nccl_id.cc index 5015b1005569ba70b147ebb795243e24ab81ea5c..e2b7b6b8e447381229e4ad594b7974bc0aa159d5 100644 --- a/paddle/fluid/operators/test_send_nccl_id.cc +++ b/paddle/fluid/operators/test_send_nccl_id.cc @@ -21,7 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/detail/macros.h" -#include "paddle/fluid/operators/detail/request_handler_impl.h" +#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" @@ -37,11 +37,11 @@ USE_NO_KERNEL_OP(listen_and_serv); namespace f = paddle::framework; namespace p = paddle::platform; namespace m = paddle::operators::math; -namespace detail = paddle::operators::detail; +namespace distributed = paddle::operators::distributed; namespace string = paddle::string; -std::unique_ptr g_rpc_service; -std::unique_ptr g_req_handler; +std::unique_ptr g_rpc_service; +std::unique_ptr g_req_handler; void StartServer() { f::Scope scope; @@ -57,14 +57,14 @@ void StartServer() { g_req_handler->SetProgram(&empty_program); g_req_handler->SetExecutor(&executor); - g_rpc_service->RegisterRPC(detail::kRequestSend, g_req_handler.get()); + g_rpc_service->RegisterRPC(distributed::kRequestSend, g_req_handler.get()); g_req_handler->SetRPCServer(g_rpc_service.get()); std::thread server_thread( - std::bind(&detail::RPCServer::StartServer, g_rpc_service.get())); + std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); - g_rpc_service->SetCond(detail::kRequestSend); - g_rpc_service->WaitBarrier(detail::kRequestSend); + g_rpc_service->SetCond(distributed::kRequestSend); + g_rpc_service->WaitBarrier(distributed::kRequestSend); LOG(INFO) << "got nccl id and stop server..."; g_rpc_service->ShutDown(); @@ -72,7 +72,7 @@ void StartServer() { } TEST(SendNcclId, RPCServer) { - g_req_handler.reset(new detail::RequestSendHandler(true)); + g_req_handler.reset(new distributed::RequestSendHandler(true)); g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); std::thread server_thread(StartServer); @@ -91,7 +91,8 @@ TEST(SendNcclId, RPCServer) { std::string ep = string::Sprintf("127.0.0.1:%d", port); - detail::RPCClient* client = detail::RPCClient::GetInstance(); + distributed::RPCClient* client = + distributed::RPCClient::GetInstance(); LOG(INFO) << "connect to server" << ep; client->AsyncSendVar(ep, dev_ctx, scope, NCCL_ID_VARNAME); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 74036bcb3114df8fc4613bd9f4dc327463397dba..dc02c6632e2a5265daf0c2f9949bdb94beec4232 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -159,6 +159,11 @@ PYBIND11_PLUGIN(core) { new (&instance) LoDTensor(new_offset_lod); }) .def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); }) + // We implement offset based LOD in C++ while we use length based with + // Python API. So we changed set_lod to set_recursive_sequence_lengths to + // avoid misuse. + // The discussion is here: + // https://github.com/PaddlePaddle/Paddle/issues/10855 .def("set_lod", [](LoDTensor &self, const std::vector> &lod) { // the input lod is offset-based level-of-detail info @@ -199,6 +204,7 @@ PYBIND11_PLUGIN(core) { std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); return new_lod; }) + // Set above comments of set_lod. .def("recursive_sequence_lengths", [](LoDTensor &self) -> std::vector> { // output the length-based lod info diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 93b09ed6922b32a5531224acc470daf0d97f95bd..6da3846ac69980daac4f0fb7401b2573c21c89bf 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -97,7 +97,7 @@ struct CastToPyBufferImpl { inline pybind11::buffer_info CastToPyBuffer(const framework::Tensor &tensor) { auto buffer_info = details::CastToPyBufferImpl()(tensor); + uint8_t, platform::float16>()(tensor); return buffer_info; } diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 5af5bc9c4731317075b3912a4749a0b358bdd56e..45af83708ea63fc1b6aa86f1e8423bb44b7388a6 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -44,7 +44,7 @@ import metrics import transpiler from param_attr import ParamAttr, WeightNormParamAttr from data_feeder import DataFeeder -from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace +from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope from transpiler import DistributeTranspiler, InferenceTranspiler, \ memory_optimize, release_memory from concurrency import (Go, make_channel, channel_send, channel_recv, @@ -83,6 +83,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \ 'profiler', 'unique_name', 'recordio_writer', + 'Scope', ] diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index f96a2d282740dc943ae0fac2427e8f8efdaf5252..c859778b3757f638ac531620f241e684522add57 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -79,6 +79,61 @@ class DataToLoDTensorConverter(object): class DataFeeder(object): + """ + DataFeeder converts the data that returned by a reader into a data + structure that can feed into Executor and ParallelExecutor. The reader + usually returns a list of mini-batch data entries. Each data entry in + the list is one sample. Each sample is a list or a tuple with one + feature or multiple features. + + The simple usage shows below: + + .. code-block:: python + + place = fluid.CPUPlace() + img = fluid.layers.data(name='image', shape=[1, 28, 28]) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + feeder = fluid.DataFeeder([img, label], fluid.CPUPlace()) + result = feeder.feed([([0] * 784, [9]), ([1] * 784, [1])]) + + + If you want to feed data into GPU side separately in advance when you + use multi-GPU to train a model, you can use `decorate_reader` function. + + .. code-block:: python + + place=fluid.CUDAPlace(0) + feeder = fluid.DataFeeder(place=place, feed_list=[data, label]) + reader = feeder.decorate_reader( + paddle.batch(flowers.train(), batch_size=16)) + + Args: + feed_list(list): The Variables or Variables'name that will + feed into model. + place(Place): place indicates feed data into CPU or GPU, if you want to + feed data into GPU, please using `fluid.CUDAPlace(i)` (`i` represents + the GPU id), or if you want to feed data into CPU, please using + `fluid.CPUPlace()`. + program(Program): The Program that will feed data into, if program + is None, it will use default_main_program(). Default None. + + Raises: + ValueError: If some Variable is not in this Program. + + Examples: + .. code-block:: python + + # ... + place = fluid.CPUPlace() + feed_list = [ + main_program.global_block().var(var_name) for var_name in feed_vars_name + ] # feed_vars_name is a list of variables' name. + feeder = fluid.DataFeeder(feed_list, place) + for data in reader(): + outs = exe.run(program=main_program, + feed=feeder.feed(data)) + """ + def __init__(self, feed_list, place, program=None): self.feed_dtypes = [] self.feed_names = [] @@ -108,6 +163,16 @@ class DataFeeder(object): self.place = place def feed(self, iterable): + """ + According to feed_list and iterable, converters the input into + a data structure that can feed into Executor and ParallelExecutor. + + Args: + iterable(list|tuple): the input data. + + Returns: + dict: the result of conversion. + """ converter = [] for lod_level, shape, dtype in six.zip( self.feed_lod_level, self.feed_shapes, self.feed_dtypes): @@ -130,6 +195,20 @@ class DataFeeder(object): return ret_dict def feed_parallel(self, iterable, num_places=None): + """ + Takes multiple mini-batches. Each mini-batch will be feed on each + device in advance. + + Args: + iterable(list|tuple): the input data. + num_places(int): the number of devices. Default None. + + Returns: + dict: the result of conversion. + + Notes: + The number of devices and number of mini-batches must be same. + """ if isinstance(self.place, core.CUDAPlace): places = [ core.CUDAPlace(i) @@ -168,6 +247,24 @@ class DataFeeder(object): multi_devices, num_places=None, drop_last=True): + """ + Converter the input data into a data that returned by reader into + multiple mini-batches. Each mini-batch will be feed on each device. + + Args: + reader(fun): the input data. + multi_devices(bool): the number of places. Default None. + num_places(int): the number of places. Default None. + drop_last(bool): the number of places. Default None. + + Returns: + dict: the result of conversion. + + Raises: + ValueError: If drop_last is False and the data batch which cannot + fit for devices. + """ + def __reader_creator__(): if not multi_devices: for item in reader(): diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 33d8f709412b25d29c6618272500dd7b953d6645..159b0ca39eed547e4f3448e7ebf4807299d465b2 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -25,6 +25,13 @@ g_scope = core.Scope() def global_scope(): + """ + Get the global/default scope instance. There are a lot of APIs use + :code:`global_scope` as its default value, e.g., :code:`Executor.run` + + Returns: + Scope: The global/default scope instance. + """ return g_scope @@ -37,6 +44,19 @@ def switch_scope(scope): @contextlib.contextmanager def scope_guard(scope): + """ + Change the global/default scope instance by Python `with` statement. All + variable in runtime will assigned to the new scope. + + Examples: + >>> import paddle.fluid as fluid + >>> new_scope = fluid.Scope() + >>> with fluid.scope_guard(new_scope): + >>> ... + + Args: + scope: The new global/default scope. + """ ex = switch_scope(scope) yield switch_scope(ex) @@ -135,14 +155,18 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name): def fetch_var(name, scope=None, return_numpy=True): """ - Fetch the value of the variable with the given name from the given scope + Fetch the value of the variable with the given name from the + given scope. + Args: name(str): name of the variable. Typically, only persistable variables can be found in the scope used for running the program. scope(core.Scope|None): scope object. It should be the scope where you pass to Executor.run() when running your program. - If None, global_scope() will be used. - return_numpy(bool): whether convert the tensor to numpy.ndarray + If None, global_scope() will be used. Default None. + return_numpy(bool): whether convert the tensor to numpy.ndarray. + Default True. + Returns: LodTensor|numpy.ndarray """ diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index ca118452711bd423f74e67c960e9a0bd9a69edc4..8ad78443fdeaa6a1cf28380f9aa71e21cdad0a35 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -30,8 +30,6 @@ __all__ = [ 'default_startup_program', 'default_main_program', 'program_guard', - 'switch_startup_program', - 'switch_main_program', 'get_var', ] @@ -43,7 +41,8 @@ ZERO_VAR_SUFFIX = core.kZeroVarSuffix() def grad_var_name(var_name): """ - return gradient name for a certain var name + Returns: + str: gradient name for a certain var name """ return var_name + GRAD_VAR_SUFFIX @@ -51,10 +50,12 @@ def grad_var_name(var_name): def convert_np_dtype_to_dtype_(np_dtype): """ Convert the data type in numpy to the data type in Paddle + Args: - np_dtype(np.dtype): the data type in numpy + np_dtype(np.dtype): the data type in numpy. - Returns(core.VarDesc.VarType): the data type in Paddle + Returns: + core.VarDesc.VarType: the data type in Paddle. """ dtype = np.dtype(np_dtype) @@ -120,37 +121,53 @@ def _debug_string_(proto, throw_on_error=True): class Variable(object): """ - Python variable. Every input and output of an operator is a variable. Every - variable belongs to a block. The variable has a name and two variables in - different blocks could have the same name. + In Fluid, every input and output of an operator is a variable. In most + cases, variables are used for holding different kinds of data or training + labels. A variable belongs to a block. All variable has its own name and + two variables in different blocks could have the same name. - There are many kinds of variables. Please reference the framework.proto for - details. + There are many kinds of variables. Each kind of them has its own attributes + and usages. Please reference the framework.proto for details. - Notes: The constructor of Variable should not be invoked directly. Please - use `Block.create_var` to create a variable. - - >>> cur_program = Program() - >>> cur_block = cur_program.current_block() - >>> new_variable = cur_block.create_var( - >>> name="X", shape=[-1, 23, 48], dtype='float32') + Most of a Variable's member variables can be setted to be None. It mean + it is not available or will be specified later. Args: - block(Block): The associated block. It will be passed by - `Block.create_var` automatically. + block(Block): The block that the variable belongs to. type(core.VarDesc.VarType): Variable type. Please reference the framework.proto for details. - shape(tuple|list|None): The shape of variable. -1 means the batch size. + name(str|None): The name of the variable. If setted None, it will be + generated automatically. Default: None + shape(tuple|list|None): The shape of the variable. -1 means the batch size. Some kinds of variable do not contain shape, just set it to None. - dtype(np.dtype|core.VarDesc.VarType|str): The data type of variable. - lod_level(int): The level of lod tensor. 0 means it is not a time + Default: None + dtype(np.dtype|core.VarDesc.VarType|str|None): The data type of variable. + Default: None + lod_level (int|None): The level of lod tensor. 0 means it is not a time series data. - capacity(int): The capacity of Channel variable. Ignored - for other types. - persistable(bool): True if the variable should be saved as check point. - Defaults to False. - stop_gradient(bool): True if the variable will stop to calculate - gradients when backward. Defaults to False. + Default: None + capacity (int|None): The capacity of Channel variable. Ignored for other + types. Default: None + persistable (bool|None): True if the variable is persistable. A persistable + variable will not be deleted after an iteration ending. Defaults: None. + error_clip (BaseErrorClipAttr|None): The error clip attributes of the + corresponding gradient variable. Default: None + stop_gradient (bool): True if the variable will stop to calculate its + gradients when backward. Default: False. + is_data (bool): True if the variable is an input data. Default: False + + Notes: + The constructor of Variable should not be invoked directly. Please + use `Block.create_var` to create a variable. + + Examples: + .. code-block:: python + + cur_program = Program() + cur_block = cur_program.current_block() + new_variable = cur_block.create_var(name="X", + shape=[-1, 23, 48], + dtype='float32') """ def __init__(self, @@ -253,13 +270,14 @@ class Variable(object): Get debug string. Args: - throw_on_error(bool): True if raise an exception when self is not - intialized. + throw_on_error(bool): True if raise an exception when self is + not initialized. with_details(bool): more details about variables and parameters - (e.g. trainable, optimize_attr, ...) will be printed when with_details is True - - Returns(str): The debug string. + (e.g. trainable, optimize_attr, ...) will be printed when + with_details is True. Default False; + Returns: + str: The debug string. """ assert isinstance(throw_on_error, bool) and isinstance(with_details, bool) @@ -276,6 +294,15 @@ class Variable(object): __repr__ = __str__ def set_desc(self, input): + """ + Set the variable description. + + Args: + input(core.VarDesc): The new VarDesc. + + Returns: + None + """ self.desc = input @property @@ -312,6 +339,15 @@ class Variable(object): return self.desc.type() def set_error_clip(self, error_clip): + """ + Set the error_clip. + + Args: + error_clip(BaseErrorClipAttr) : The new error_clip. + + Returns: + None + """ self.error_clip = error_clip @@ -319,8 +355,8 @@ def get_all_op_protos(): """ Get all registered op proto from PaddlePaddle C++ end. - Returns(list): list of OpProto - + Returns: + list: list of OpProto. """ protostrs = core.get_all_op_protos() ret_values = [] @@ -373,9 +409,45 @@ class OpProtoHolder(object): class Operator(object): """ - Python Operator class. The operator represents the build in instructions in a - Block. Users can use the build in instructions to describe their neural - network. + In Fluid, all the operation are represented by Operator, and Operator + is regarded as a build in an instruction of a Block. Users can use the + build in instructions to describe their neural network. + + Args: + block(Block): The block has the current operator. + desc(core.OpDesc): The protobuf description of Operator. + type(str): The type of operator. Default None. + inputs(dict): The input of this Operator. it is a dictionary, for every + element, key is the input parameter name, and value is a list of + variables. Default None. + outputs(dict): The output of this Operator. it is a dictionary, for + every element, key is the input parameter name, and value is a list + of variables. Default None. + attrs(dict): The attributes of this Operator. it is a dictionary, for + every element, key is attribute name, and value is the attribute value. + The attribute type should be as same as the type registered in C++ side. + Default None. + + Returns: + Operator: The initialized Operator. + + Raises: + ValueError: If the passed input, output and attrs doesn't match the + initializing Operator's that registered in C++ side. + + Notes: + The constructor of operator should not be invoked directly. Use + Block.append_op or Block.prepend_op instead. + + Examples: + .. code-block:: python + + cur_program = Program() + cur_block = cur_program.current_block() + # var1 += var2 + var3 + cur_block.append_op(type="sum", + inputs={"X": [var1, var2, var3]}, + outputs={"Out": [var1]}) """ OP_WITHOUT_KERNEL_SET = { 'feed', 'fetch', 'save', 'load', 'recurrent', 'go', @@ -392,31 +464,7 @@ class Operator(object): inputs=None, outputs=None, attrs=None): - """ - Constructor. - Notes: The constructor of operator should not be invoked directly. Use - Block.append_op or Block.prepend_op instead. - - >>> cur_program = Program() - >>> cur_block = cur_program.current_block() - >>> # var1 += var2 + var3 - >>> cur_block.append_op(type="sum", - >>> inputs={"X": [var1, var2, var3]}, - >>> outputs={"Out": [var1]}) - - Args: - block(Block): The block has the current operator. - desc(core.OpDesc): The protobuf description. - type(str): The type of operator. - inputs(dict): The input dictionary. Key is the input parameter name. - Value is a list of variables. - outputs(dict): The output dictionary which has the same format with - inputs. - attrs(dict): The attributes dictionary. Key is attribute name. Value - is the attribute value. The attribute type should be as same as - the type registered in C++ - """ self.block = block self.desc = desc self.attrs = attrs @@ -529,12 +577,14 @@ class Operator(object): def to_string(self, throw_on_error): """ - To debug string. + Get debug string. + Args: - throw_on_error(bool): raise exception when self is not initialized - when throw_on_error is True + throw_on_error(bool): Whether to raise exception if self is not + initialized. - Returns(str): The debug string. + Returns: + str: The debug string. """ protostr = self.desc.serialize_to_string() @@ -552,29 +602,45 @@ class Operator(object): def input(self, name): """ - Get input arguments by the input parameter name - Args: - name(str): The input parameter name + Get the input arguments according to the input parameter name. - Returns(list): return the list of argument names associated with the - specific parameter name. + Args: + name(str): The input parameter name. + Returns: + list: return the list of argument names that associated with \ + the specific parameter name. """ return self.desc.input(name) def rename_input(self, old_name, new_name): + """ + Rename the `old_name` to `new_name`. + + Args: + old_name(str): The old name of the Operator's input. + new_name(str): The new name of the Operator's input. + + Returns: + None + """ self.desc.rename_input(old_name, new_name) def rename_output(self, old_name, new_name): + """ + Rename the `old_name` to `new_name`. + + Args: + old_name(str): The old name of the Operator's output. + new_name(str): The new name of the Operator's output. + + Returns: + None + """ self.desc.rename_output(old_name, new_name) @property def input_names(self): - """ - Get all input parameter names - Returns(list): return a list of input parameter names - - """ return self.desc.input_names() @property @@ -587,33 +653,23 @@ class Operator(object): def output(self, name): """ - Get output arguments by the output parameter name - Args: - name(str): The output parameter name + Get output arguments by the output parameter name. - Returns(list): return the list of argument names associated with the - specific parameter name. + Args: + name(str): The output parameter name. + Returns: + list: return the list of argument names associated with \ + the specific parameter name. """ return self.desc.output(name) @property def output_names(self): - """ - Get all output parameter names - Returns(list): return a list of output parameter names - - """ return self.desc.output_names() @property def idx(self): - """ - Return the array index of current operator. - Returns(int): The array index in block.ops array - Raises: - ValueError: when the operator is not found. - """ for i, op in enumerate(self.block.ops): if op == self: return i @@ -622,27 +678,40 @@ class Operator(object): def has_attr(self, name): """ - operator has the attribute with name or not. + Whether this Operator has the attribute with name or not. + Args: - name(str): the attribute name + name(str): the attribute name. - Returns(bool): True if has this attribute. + Returns: + bool: True if has this attribute. """ return self.desc.has_attr(name) def attr_type(self, name): """ - Get the type of attribute by attribute name - Args: - name(str): the attribute name + Get the type of attribute by attribute's name. - Returns(core.AttrType): the attribute type + Args: + name(str): the attribute name. + Returns: + core.AttrType: the attribute type. """ return self.desc.attr_type(name) def set_attr(self, name, val): + """ + Set the value of attribute by attribute's name. + + Args: + name(str): the attribute name. + val(bool|int|str|float|list): the value of the attribute. + + Raises: + ValueError: If the type of value doesn't match with desc.attr_type(name). + """ self.attrs[name] = val if isinstance(val, Block): self.desc.set_block_attr(name, val.desc) @@ -654,40 +723,39 @@ class Operator(object): @property def attr_names(self): - """ - Get all attribute names - Returns(list): The list of attribute name - - """ return self.desc.attr_names() def attr(self, name): """ - Get attribute by name + Get the attribute by name. + Args: - name(str): the attribute name + name(str): the attribute name. - Returns(bool|int|str|float|list): The attribute value. The return value + Returns: + bool|int|str|float|list: The attribute value. The return value can be any valid attribute type. - """ return self.desc.attr(name) def block_attr(self, name): """ - Get the block attribute by name - Args: - name(str): the attribute name + Get the block attribute by name. - Returns(int): the block index + Args: + name(str): the attribute name. + Returns: + int: the block index. """ return self.desc.block_attr(name) def all_attrs(self): """ - Get the attribute dict - Returns(dict): The Operator's attribute dict + Get the attribute dict. + + Returns: + dict: The Operator's attribute dict. """ attr_names = self.attr_names attr_map = {} @@ -700,6 +768,35 @@ class Operator(object): class Block(object): + """ + In Fluid, a Program is consistence of multi-Block, and Block stores + VarDesc and OpDesc. In a specific Block, a VarDesc have a unique name. + One block could have some child blocks, and child block's name scopes + should inherit the parent's so that OpDesc in child block can reference + a VarDesc that is stored in the parent block. + Please reference the framework.proto for details. + + Args: + program(Program): The Program that the Block belongs to. + idx(int): The block's id in the Program. + + Notes: + The constructor of Block should not be invoked directly. Please + use `Program.create_block()` to create a block. + + Examples: + .. code-block:: python + + cur_program = Program() + cur_block = cur_program.current_block() + var = cur_block.create_var(name="X", + shape=[-1, 23, 48], + dtype='float32') + cur_block.append_op(type="abs", + inputs={"X": [var]}, + outputs={"Out": [var]}) + """ + def __init__(self, program, idx): self.desc = program.desc.block(idx) self.vars = collections.OrderedDict() # var_name --> var @@ -712,15 +809,17 @@ class Block(object): def to_string(self, throw_on_error, with_details=False): """ - To debug string. + Get debug string. + Args: throw_on_error(bool): raise exception when self is not initialized - when throw_on_error is True + when throw_on_error is True. with_details(bool): more details about variables and parameters - (e.g. trainable, optimize_attr, ...) will be printed when with_details is True - - Returns(str): The debug string. + (e.g. trainable, optimize_attr, ...) will be printed when + with_details is True. Default False. + Returns: + str: The debug string. """ assert isinstance(throw_on_error, bool) and isinstance(with_details, bool) @@ -752,6 +851,15 @@ class Block(object): return self.desc.get_forward_block_idx() def set_forward_block_idx(self, idx): + """ + Set the forward block Idx. + + Args: + idx(int): the block index. + + Returns: + None + """ self.desc.set_forward_block_idx(idx) @property @@ -759,6 +867,19 @@ class Block(object): return self.desc.id def var(self, name): + """ + Get a Variable by name from this block. + + Args: + name(str): the Variable's name. + + Raises: + ValueError: The If input's type is not str, or this block + doesn't have a Variable with the giving name. + + Returns: + Variable: the Variable with the giving name. + """ if not isinstance(name, basestring): raise TypeError( "var require string as parameter, but get %s instead." % @@ -769,6 +890,19 @@ class Block(object): return v def var_recursive(self, name): + """ + Get a Variable by name from this block recursively. + + Args: + name(str): the Variable's name. + + Raises: + ValueError: this block and this parent block doesn't + have a Variable with the giving name. + + Returns: + Variable: the Variable with the giving name. + """ frontier = list() visited = set() @@ -815,6 +949,18 @@ class Block(object): def rename_var(self, name, new_name): """ Rename variable in vars and ops' inputs and outputs + + Args: + name(str): the name that need to be renamed. + new_name(str): the name that need to rename to. + + Raises: + ValueError: If this block doesn't have this the giving name, + or the type of the var with the giving name is not Parameter + or Variable. + + Returns: + Variable: the Variable with the giving name. """ if not self.has_var(name): raise ValueError("var %s is not in current block" % name) @@ -878,12 +1024,27 @@ class Block(object): return param def append_op(self, *args, **kwargs): + """ + Appends a new Operator according to the giving arguments. + + Returns: + Operator: the append Operator. + """ op_desc = self.desc.append_op() op = Operator(block=self, desc=op_desc, *args, **kwargs) self.ops.append(op) return op def insert_op(self, index, *args, **kwargs): + """ + Insert a Operator according to the giving arguments. + + Args: + index(int): the place that the operator to insert. + + Returns: + Operator: the insert Operator. + """ self.sync_with_cpp() op_desc = self.desc.insert_op(index) op = Operator(block=self, desc=op_desc, *args, **kwargs) @@ -891,11 +1052,30 @@ class Block(object): return op def remove_op(self, index): + """ + Remove the specific position operator. + + Args: + index(int): the position that the operator to insert. + + Returns: + None + """ self.sync_with_cpp() self.desc.remove_op(index, index + 1) del self.ops[index] def slice_ops(self, start, end): + """ + Return the Operator between start and end. + + Args: + start(int): the start position. + end(int): the end position. + + Returns: + list: the Operators between start and end. + """ return self.ops[start:end] def prepend_op(self, *args, **kwargs): @@ -906,9 +1086,8 @@ class Block(object): def sync_with_cpp(self): """ - Sync from the desc on the c++ end. - - This method is used to synchronize the c++ desc instance generated by backward. + Sync from the desc on the c++ end. This method is used to synchronize + the c++ desc instance generated by backward. """ # sync variables from cpp for var in self.desc.all_vars(): @@ -973,9 +1152,14 @@ class Block(object): def copy_param_info_from(self, other): """ - Copy the information of parameters from the other block + Copy the information of parameters from the other block. + Args: - other(Block): the other block + other(Block): the other block. + + Raises: + ValueError: If type of input is not Block, or the `other` and this + block is not in the same topology. Returns: None @@ -1007,11 +1191,12 @@ class Block(object): def clone_variable(self, var): """ Clone a variable into current block. + Args: var: the variable to be cloned. Returns: - The new variable cloned from 'var' in current block. + Variable: the new variable cloned from 'var' in current block. """ assert isinstance(var, Variable) ret_var = None @@ -1054,23 +1239,18 @@ class Program(object): Notes: we have default_startup_program and default_main_program by default, a pair of them will shared the parameters. The default_startup_program only run once to initialize parameters, - default_main_program run in every minibatch and adjust the weights. - - Args: - None + default_main_program run in every mini batch and adjust the weights. Returns: - Python Program + A empty program. Examples: - .. code-block:: python - - main_program = Program() - startup_program = Program() - with fluid.program_guard(main_program=main_program, startup_program=startup_program): - fluid.layers.data(name="x", shape=[-1, 784], dtype='float32') - fluid.layers.data(name="y", shape=[-1, 1], dtype='int32') - fluid.layers.fc(name="fc", shape=[10], dtype='float32', act="relu") + >>> main_program = fluid.Program() + >>> startup_program = fluid.Program() + >>> with fluid.program_guard(main_program=main_program, startup_program=startup_program): + >>> fluid.layers.data(name="x", shape=[-1, 784], dtype='float32') + >>> fluid.layers.data(name="y", shape=[-1, 1], dtype='int32') + >>> fluid.layers.fc(name="fc", shape=[10], dtype='float32', act="relu") """ @@ -1084,6 +1264,19 @@ class Program(object): @property def op_role(self): + """ + The operator role. In a enum {Forward, Backward, Optimize}. + + Notes: this is a low level API. It is used only for ParallelExecutor to + duplicate or schedule operator to devices. + + For example, the forward operator should be executed on every device. + The backward operator should be executed on every device and the + parameter gradient of backward (use :code:`op_role_var` to get this + variable) operator should be merged to one device. The optimization + operators should be executed on only one device and broadcast the + optimization result, i.e., the new parameter, to every other device. + """ return self._current_role @op_role.setter @@ -1092,6 +1285,13 @@ class Program(object): @property def op_role_var(self): + """ + The auxiliary variables for :code:`op_role` property. + + See Also: :code:`Program.op_role`'s documentation for details. + + Notes: This is a very low-level API. Users should not use it directly. + """ return self._op_role_var @op_role_var.setter @@ -1100,6 +1300,21 @@ class Program(object): @contextlib.contextmanager def optimized_guard(self, var): + """ + A with guard to set :code:`Optimization` :code:`OpRole` and + :code:`OpRoleVar` automatically. + + Notes: This is a very low level API. Users should not use it directly. + + Args: + var(Variable|str): The variable (name) to be optimized. + + Examples: + + >>> p, g = backward(...) + >>> with program.optimized_guard(p): + >>> p = p - 0.001 * g + """ OpRole = core.op_proto_and_checker_maker.OpRole self._current_role = OpRole.Optimize self._op_role_var = [var.name if isinstance(var, Variable) else var] @@ -1108,18 +1323,35 @@ class Program(object): self._current_role = OpRole.Forward def __str__(self): + """ + Get the protobuf debug string of this Program. + + Returns: + (str): The protobuf debug string. + + Raises: + ValueError: If any of required fields is not set. + """ return self.to_string(True) def to_string(self, throw_on_error, with_details=False): """ To debug string. + Args: - throw_on_error(bool): raise exception when self is not initialized - when throw_on_error is True - with_details(bool): more details about variables and parameters - (e.g. trainable, optimize_attr, ...) will be printed when with_details is True + throw_on_error(bool): raise Value error when any of required fields + is not set. - Returns(str): The debug string. + with_details(bool): True if more details about variables and + parameters, e.g., :code:`trainable`, :code:`optimize_attr`, need + to print. + + Returns + (str): The debug string. + + Raises: + ValueError: If any of required fields is not set and throw_on_error is + True. """ assert isinstance(throw_on_error, bool) and isinstance(with_details, @@ -1135,25 +1367,89 @@ class Program(object): return res_str def get_desc(self): + """ + Get the C++ side of `ProgramDesc` object pointer. The C++ object is + exposed by :code:`pybind`. + + Notes: This is a very low level API. Users should not use this API + directly. + """ return self.desc def clone(self, for_test=False): - """Clone the Program object - Args: - for_test(bool): indicate whether clone for test. + """ + Create a new, duplicated program. + - Set for_test to False when we want to clone the program for training. - Set for_test to True when we want to clone the program for testing. + Some operators, e.g., :code:`batch_norm`, behave differently between + training and testing. They have an attribute, :code:`is_test`, to + control this behaviour. This method will change the :code:`is_test` + attribute of them to :code:`True` when :code:`for_test=True`. + + * Set for_test to False when we want to clone the program for training. + * Set for_test to True when we want to clone the program for testing. + + Notes: This API DOES NOT prune any operator. Use + :code:`clone(for_test=True)` before backward and optimization please. Args: - for_test(bool): Some operators, such as batch_norm and drop_out ops, - behave differently in training and testing. If for_test is True, - the is_test attributes in these operators will be set to True for - testing purposes, otherwise, they remain unchanged. + for_test(bool): True if change the :code:`is_test` attribute of + operators to :code:`True`. Returns: - Program: The cloned Program object. - + Program: The new, duplicated Program object. + + Examples: + + 1. To clone a test program, the sample code is: + + >>> import paddle.fluid as fluid + >>> train_program = fluid.Program() + >>> startup_program = fluid.Program() + >>> with fluid.program_guard(train_program, startup_program): + >>> img = fluid.layers.data(name='image', shape=[784]) + >>> hidden = fluid.layers.fc(input=img, size=200, act='relu') + >>> hidden = fluid.layers.dropout(hidden, dropout_prob=0.5) + >>> loss = fluid.layers.cross_entropy( + >>> input=fluid.layers.fc(hidden, size=10, act='softmax'), + >>> label=fluid.layers.data(name='label', shape=[1], dtype='int64')) + >>> + >>> test_program = train_program.clone(for_test=True) + >>> + >>> sgd = fluid.optimizer.SGD(learning_rate=1e-3) + >>> with fluid.program_guard(train_program, startup_program): + >>> sgd.minimize(loss) + + 2. The :code:`clone` method can be avoid if you create program for + training and program for testing individually. + + >>> import paddle.fluid as fluid + >>> + >>> def network(is_test): + >>> img = fluid.layers.data(name='image', shape=[784]) + >>> hidden = fluid.layers.fc(input=img, size=200, act='relu') + >>> hidden = fluid.layers.dropout(hidden, dropout_prob=0.5, is_test=is_test) + >>> loss = fluid.layers.cross_entropy( + >>> input=fluid.layers.fc(hidden, size=10, act='softmax'), + >>> label=fluid.layers.data(name='label', shape=[1], dtype='int64')) + >>> return loss + >>> + >>> train_program = fluid.Program() + >>> startup_program = fluid.Program() + >>> test_program = fluid.Program() + >>> + >>> with fluid.program_guard(train_program, startup_program): + >>> with fluid.unique_name.guard(): + >>> loss = network(is_test=False) + >>> sgd = fluid.optimizer.SGD(learning_rate=1e-3) + >>> sgd.minimize(loss) + >>> + >>> # the test startup program is not used. + >>> with fluid.program_guard(test_program, fluid.Program()): + >>> with fluid.unique_name.guard(): + >>> loss = network(is_test=True) + + The two code snippets above will generate same programs. """ if for_test: p = self.inference_optimize() @@ -1168,6 +1464,21 @@ class Program(object): return p def prune(self, targets): + """ + Prune operators and variables which are not needed to generate + :code:`targets`. + + Notes: This is a very low level API. Users should not use this API + directly. This API is in flux and not stable. + + Args: + targets(list|Variable|Operator): A list of variables or operators + need to be pruned + + Returns: + Program: A new, pruned program. + + """ if not isinstance(targets, list): targets = [targets] targets_idx = [] @@ -1202,6 +1513,17 @@ class Program(object): return res def inference_optimize(self): + """ + This method will create a new program and change the :code:`is_test` + attribute of operators to :code:`True`. All the :code:`Parameter` + information will be lost. + + Notes: This API is a very low level API. Use + :code:`Program.clone(for_test=True)` instead. + + Returns: + Program: The new program. + """ # this is an alternative implement before # core.inference_optimize being fixed. res = Program() @@ -1218,6 +1540,18 @@ class Program(object): @staticmethod def parse_from_string(binary_str): + """ + Deserialize a program desc from protobuf binary string. + + Notes: All information about parameters will be lost after serialization + and deserialization. + + Args: + binary_str(str): The binary prootbuf string. + + Returns: + Program: A deserialized program desc. + """ p = Program() p.desc = core.ProgramDesc(binary_str) p.blocks = [Block(p, i) for i in xrange(p.desc.num_blocks())] @@ -1226,10 +1560,19 @@ class Program(object): @property def random_seed(self): + """ + The default random seed for random operators in Program. Zero means get + the random seed from random device. + + Notes: It must be set before the operators have been added. + """ return self._seed @property def num_blocks(self): + """ + The number of blocks in this program. + """ return self.desc.num_blocks() @random_seed.setter @@ -1242,15 +1585,40 @@ class Program(object): return str(self) def global_block(self): + """ + Get the first block of this program. + """ return self.blocks[0] def block(self, index): + """ + Get the :code:`index` block of this program + Args: + index(int): The index of block to get + + Returns: + Block: The :code:`index` block + """ return self.blocks[index] def current_block(self): + """ + Get the current block. The :code:`current` block is the block to append + operators. + """ return self.blocks[self.current_block_idx] def create_block(self, parent_idx=None): + """ + Create a new block with the :code:`parent_idx` and change the current block + to new block. + + Args: + parent_idx(int): The parent block index. + + Returns: + Block: The new block. + """ new_block_idx = len(self.blocks) parent = self.current_block() if parent_idx is None else self.block( parent_idx) @@ -1260,9 +1628,24 @@ class Program(object): return self.current_block() def rollback(self): + """ + Exit a code block, i.e., roll back to the parent block. + Returns: + None + """ self.current_block_idx = self.current_block().parent_idx def sync_with_cpp(self): + """ + Synchronize Python instance to its binding C++ object instance. + If the program is modified in C++ space, this method should be invoked. + + Notes: This is a very low level API. Users should not invoke it + directly. + + Returns: + None + """ for block_idx in range(len(self.blocks), self.desc.num_blocks()): self.blocks.append(Block(self, block_idx)) for block in self.blocks: @@ -1272,6 +1655,9 @@ class Program(object): """ Copy the information of parameters from other program. + Notes: This is a very low level API. Users should not invoke it + directly. + Args: other(Program): Other program @@ -1291,6 +1677,9 @@ class Program(object): """ Copy the information of data variables from other program. + Notes: This is a very low level API. Users should not invoke it + directly. + Args: other(Program): Other program @@ -1309,12 +1698,41 @@ class Program(object): self.global_block().var(var.name).is_data = True def list_vars(self): + """ + Get all variables from this Program. A iterable object is returned. + + Returns: + iterable: The generator will yield every variable in this program. + """ for each_block in self.blocks: for each_var in each_block.vars.itervalues(): yield each_var class Parameter(Variable): + """ + Parameter is derived from Variable. A parameter is a persistable + Variable, and will be updated by optimizers after each iteration. + The training of a neural network is essentially the updating of + its parameters. + + Relative to a general Variable, a Parameter has several its own + member variables: + + Args: + trainable(bool): True if the parameter need to be updated after + iterations. + optimize_attr(map): Parameter attributes related with optimizing. + Currently, it only contains 'learning_rate'. + Default: {'learning_rate': 1.0} + regularizer(WeightDecayRegularizer): The Regularizer which will + be applied on the parameter. Default: None + gradient_clip_attr(BaseGradientClipAttr): The gradint clip strategy + which will be applied on the parameter. Default: None + do_model_average(bool): True if the model average strategy will + be applied on this parameter. + """ + def __init__(self, block, shape, dtype, **kwargs): if shape is None or dtype is None: raise ValueError("Parameter must set shape and dtype") @@ -1377,8 +1795,15 @@ _startup_program_ = Program() def default_startup_program(): """ - Get default startup program. In startup program, Paddle will initialize - parameters, initialize nccl handle, etc. + Get default/global startup program. + + The layer function in :code:`fluid.layers` will create parameters, readers, + NCCL handles as global variables. The :code:`startup_program` will + initialize them by the operators in startup program. The layer function will + append these initialization operators into startup program. + + This method will return the :code:`default` or the :code:`current` startup + program. Users can use :code:`fluid.program_guard` to switch program. Returns: Program: startup program @@ -1388,7 +1813,15 @@ def default_startup_program(): def default_main_program(): """ - Get default main program. The main program is used for training or testing. + Get default/global main program. The main program is used for training or + testing. + + All layer function in :code:`fluid.layers` will append operators and + variables to the :code:`default_main_program`. + + The :code:`default_main_program` is the default program in a lot of APIs. + For example, the :code:`Executor.run()` will execute the + :code:`default_main_program` when the program is not specified. Returns: Program: main program @@ -1430,20 +1863,34 @@ def switch_startup_program(program): @contextlib.contextmanager def program_guard(main_program, startup_program=None): """ - Switch program with `with` statement + Change the global main program and startup program with `with` statement. + Layer functions in the Python `with` block will append operators and + variables to the new main programs. Examples: - >>> with program_guard(Program()): - >>> data = fluid.layers.data(...) - >>> hidden = fluid.layers.fc(...) + + >>> import paddle.fluid as fluid + >>> main_program = fluid.Program() + >>> startup_program = fluid.Program() + >>> with fluid.program_guard(main_program, startup_program): + >>> data = fluid.layers.data(...) + >>> hidden = fluid.layers.fc(...) + + Notes: The temporary :code:`Program` can be used if the user does not need + to construct either of startup program or main program. + + Examples: + + >>> import paddle.fluid as fluid + >>> main_program = fluid.Program() + >>> # does not care about startup program. Just pass a temporary value. + >>> with fluid.program_guard(main_program, fluid.Program()): + >>> data = ... Args: - main_program(Program): New main program inside `with` statement + main_program(Program): New main program inside `with` statement. startup_program(Program): New startup program inside `with` statement. None means do not change startup program. - - Returns: - None """ if not isinstance(main_program, Program): raise TypeError("main_program should be Program") @@ -1460,7 +1907,8 @@ def program_guard(main_program, startup_program=None): def get_var(name, program=None): """ - Get a variable by name from the global block of a program + Get a variable by name from the global block of a program. + Args: name(str): name of the variable program(Program|None): program object. diff --git a/python/paddle/fluid/lod_tensor.py b/python/paddle/fluid/lod_tensor.py index 61be39c25912604f842ef8a9a6ec5f0d1cf70257..c417ab393fca88d476d2f1fe83d12f99271d6883 100644 --- a/python/paddle/fluid/lod_tensor.py +++ b/python/paddle/fluid/lod_tensor.py @@ -19,33 +19,41 @@ __all__ = ['create_lod_tensor', 'create_random_int_lodtensor'] def create_lod_tensor(data, lod, place): - """Create a lod tensor from a numpy array, a list, or an existing lod tensor. + """ + Create a lod tensor from a numpy array, a list, or an existing lod tensor. Create a lod tensor by doing the following: + 1. Check that the length-based input lod is valid. + 2. Convert the length-based lod to a offset-based LoD. - 3. Copy the data from a numpy array, a list or a existing lod tensor to + + 3. Copy the data from a numpy array, a list or a existing lod tensor to CPU or GPU device (based on input place). + 4. Set the level of detail (LoD) using the offset-based LoD. - Use example: - Suppose we want LoDTensor to hold data for sequences of word, where each word is - represented by an integer. If we want to create a LoDTensor to represent two - sentences, one of 2 words, and one of 3 words. + Examples: - Then 'data' can be a numpy array of integers with shape (5, 1). - 'lod' will be [[2, 3]], indicating the length(# of words) in each sentence. - This length-based input lod [[2, 3]] will be converted to offset-based lod [[0, 2, 5]] - inside the function call. + Suppose we want LoDTensor to hold data for sequences of word, where each + word is represented by an integer. If we want to create a LoDTensor to + represent two sentences, one of 2 words, and one of 3 words. - Please refer to - github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md - for more details regarding LoD. + Then :code:`data` can be a numpy array of integers with shape (5, 1). + :code:`lod` will be [[2, 3]], indicating the length(# of words) in each + sentence. This length-based input lod [[2, 3]] will be converted to + offset-based lod [[0, 2, 5]] inside the function call. + + Please reference :ref:`api_guide_low_level_lod_tensor` for more details + regarding LoD. Args: - data: a numpy array or a LoDTensor or a list holding the data to be copied. - lod: a list of lists indicating the length-based LoD info specified by the user. - place: CPU or GPU place indicating where the data in the new LoDTensor will be stored. + data(numpy.ndarray|list|LoDTensor): a numpy array or a LoDTensor or a + list holding the data to be copied. + lod(list): a list of lists indicating the length-based LoD info + specified by the user. + place(Place): CPU or GPU place indicating where the data in the new + LoDTensor will be stored. Returns: A fluid LoDTensor object with tensor data and lod info. @@ -77,31 +85,38 @@ def create_lod_tensor(data, lod, place): def create_random_int_lodtensor(lod, base_shape, place, low, high): - """Create a LoDTensor containing random integers. + """ + Create a LoDTensor containing random integers. - This function is frequently used in the book examples. So we revised it based on - the new create_lod_tensor API and put it here in the lod_tensor module to simplify - the code. + This function is frequently used in the book examples. So we revised it + based on the new create_lod_tensor API and put it here in the lod_tensor + module to simplify the code. The function does the following: - 1. Calculate the overall shape of the LoDTensor based on the length-based 'lod' input - and the shape of the basic element in 'base_shape'. + + 1. Calculate the overall shape of the LoDTensor based on the length-based + :code:`lod` input and the shape of the basic element in + :code:`base_shape`. + 2. Create a numpy array of this shape. + 3. Create the LoDTensor using create_lod_tensor API. - Suppose we want LoDTensor to hold data for sequences of word, where each word is - represented by an integer. If we want to create a LoDTensor to represent two - sentences, one of 2 words, and one of 3 words. Then 'base_shape' is [1], input - length-based 'lod' is [[2, 3]]. Then the overall shape of the LoDTensor would be - [5, 1], holding 5 words for two sentences. + Suppose we want LoDTensor to hold data for sequences of word, where each + word is represented by an integer. If we want to create a LoDTensor to + represent two sentences, one of 2 words, and one of 3 words. Then + 'base_shape' is [1], input length-based 'lod' is [[2, 3]]. Then the overall + shape of the LoDTensor would be [5, 1], holding 5 words for two sentences. Args: - data: a numpy array or a LoDTensor holding the data to be copied. - lod: a list of lists indicating the length-based LoD info specified by the user. - base_shape: the shape of the basic element to be held by the LoDTensor. - place: CPU or GPU place indicating where the data in the new LoDTensor will be stored. - low: the lower bound of the random integers. - high: the upper bound of the random integers. + lod(list): a list of lists indicating the length-based LoD info + specified by the user. + base_shape(list): the shape of the basic element to be held by the + LoDTensor. + place(Place): CPU or GPU place indicating where the data in the new + LoDTensor will be stored. + low(int): the lower bound of the random integers. + high(int): the upper bound of the random integers. Returns: A fluid LoDTensor object with tensor data and lod info. diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index bb9c6fdc60089fc2b43573a6421a6f9781d2d4a8..572475b483ff0341a97a91b6c5309fcf337dacbe 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -325,14 +325,14 @@ class Auc(MetricBase): """ def __init__(self, name, curve='ROC', num_thresholds=200): - super(MetricBase, self).__init__(name, curve, num_thresholds) + super(Auc, self).__init__(name=name) self._curve = curve self._num_thresholds = num_thresholds self._epsilon = 1e-6 - self.tp_list = np.ndarray((num_thresholds, )) - self.fn_list = np.ndarray((num_thresholds, )) - self.tn_list = np.ndarray((num_thresholds, )) - self.fp_list = np.ndarray((num_thresholds, )) + self.tp_list = np.zeros((num_thresholds, )) + self.fn_list = np.zeros((num_thresholds, )) + self.tn_list = np.zeros((num_thresholds, )) + self.fp_list = np.zeros((num_thresholds, )) def update(self, labels, predictions, axis=1): if not _is_numpy_(labels): @@ -350,12 +350,12 @@ class Auc(MetricBase): tp, fn, tn, fp = 0, 0, 0, 0 for i, lbl in enumerate(labels): if lbl: - if predictions[i, 0] >= thresh: + if predictions[i, 1] >= thresh: tp += 1 else: fn += 1 else: - if predictions[i, 0] >= thresh: + if predictions[i, 1] >= thresh: fp += 1 else: tn += 1 diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index bbedf6fde0872fd32d81c103bf5fe61449b7f57b..9b3f2aebee73e56ee820dc8ff4c9cfabd1456aaa 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -26,16 +26,87 @@ def simple_img_conv_pool(input, filter_size, pool_size, pool_stride, - act, - param_attr=None, + pool_padding=0, pool_type='max', + global_pooling=False, + conv_stride=1, + conv_padding=0, + conv_dilation=1, + conv_groups=1, + param_attr=None, + bias_attr=None, + act=None, use_cudnn=True, use_mkldnn=False): + """ + The simple_img_conv_pool is composed with one Convolution2d and one Pool2d. + + Args: + input (Variable): The input image with [N, C, H, W] format. + num_filters(int): The number of filter. It is as same as the output + feature channel. + filter_size (int|list|tuple): The filter size. If filter_size is a list or + tuple, it must contain two integers, (filter_size_H, filter_size_W). Otherwise, + the filter_size_H = filter_size_W = filter_size. + pool_size (int|list|tuple): The pooling size of Pool2d layer. If pool_size + is a list or tuple, it must contain two integers, (pool_size_H, pool_size_W). + Otherwise, the pool_size_H = pool_size_W = pool_size. + pool_stride (int|list|tuple): The pooling stride of Pool2d layer. If pool_stride + is a list or tuple, it must contain two integers, (pooling_stride_H, pooling_stride_W). + Otherwise, the pooling_stride_H = pooling_stride_W = pool_stride. + pool_padding (int|list|tuple): The padding of Pool2d layer. If pool_padding is a list or + tuple, it must contain two integers, (pool_padding_H, pool_padding_W). + Otherwise, the pool_padding_H = pool_padding_W = pool_padding. Default 0. + pool_type (str): Pooling type can be :math:`max` for max-pooling and :math:`avg` for + average-pooling. Default :math:`max`. + global_pooling (bool): Whether to use the global pooling. If global_pooling = true, + pool_size and pool_padding while be ignored. Default False + conv_stride (int|list|tuple): The stride size of the Conv2d Layer. If stride is a + list or tuple, it must contain two integers, (conv_stride_H, conv_stride_W). Otherwise, + the conv_stride_H = conv_stride_W = conv_stride. Default: conv_stride = 1. + conv_padding (int|list|tuple): The padding size of the Conv2d Layer. If padding is + a list or tuple, it must contain two integers, (conv_padding_H, conv_padding_W). + Otherwise, the conv_padding_H = conv_padding_W = conv_padding. Default: conv_padding = 0. + conv_dilation (int|list|tuple): The dilation size of the Conv2d Layer. If dilation is + a list or tuple, it must contain two integers, (conv_dilation_H, conv_dilation_W). + Otherwise, the conv_dilation_H = conv_dilation_W = conv_dilation. Default: conv_dilation = 1. + conv_groups (int): The groups number of the Conv2d Layer. According to grouped + convolution in Alex Krizhevsky's Deep CNN paper: when group=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. Default: groups=1 + param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None + bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None + act (str): Activation type for Conv2d. Default: None + use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn + library is installed. Default: True + use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled + with mkldnn library. Default: False + + Return: + Variable: The result of input after Convolution2d and Pool2d. + + Examples: + .. code-block:: python + + img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') + conv_pool = fluid.nets.simple_img_conv_pool(input=img, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + """ conv_out = layers.conv2d( input=input, num_filters=num_filters, filter_size=filter_size, + stride=conv_stride, + padding=conv_padding, + dilation=conv_dilation, + groups=conv_groups, param_attr=param_attr, + bias_attr=bias_attr, act=act, use_cudnn=use_cudnn, use_mkldnn=use_mkldnn) @@ -45,6 +116,8 @@ def simple_img_conv_pool(input, pool_size=pool_size, pool_type=pool_type, pool_stride=pool_stride, + pool_padding=pool_padding, + global_pooling=global_pooling, use_cudnn=use_cudnn, use_mkldnn=use_mkldnn) return pool_out @@ -60,11 +133,65 @@ def img_conv_group(input, conv_with_batchnorm=False, conv_batchnorm_drop_rate=0.0, pool_stride=1, - pool_type=None, + pool_type="max", use_cudnn=True, use_mkldnn=False): """ - Image Convolution Group, Used for vgg net. + The Image Convolution Group is composed of Convolution2d, BatchNorm, DropOut, + and Pool2d. According to the input arguments, img_conv_group will do serials of + computation for Input using Convolution2d, BatchNorm, DropOut, and pass the last + result to Pool2d. + + Args: + input (Variable): The input image with [N, C, H, W] format. + conv_num_filter(list|tuple): Indicates the numbers of filter of this group. + pool_size (int|list|tuple): The pooling size of Pool2d Layer. If pool_size + is a list or tuple, it must contain two integers, (pool_size_H, pool_size_W). + Otherwise, the pool_size_H = pool_size_W = pool_size. + conv_padding (int|list|tuple): The padding size of the Conv2d Layer. If padding is + a list or tuple, its length must be equal to the length of conv_num_filter. + Otherwise the conv_padding of all Conv2d Layers are the same. Default 1. + conv_filter_size (int|list|tuple): The filter size. If filter_size is a list or + tuple, its length must be equal to the length of conv_num_filter. + Otherwise the conv_filter_size of all Conv2d Layers are the same. Default 3. + conv_act (str): Activation type for Conv2d Layer that is not followed by BatchNorm. + Default: None. + param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None + conv_with_batchnorm (bool|list): Indicates whether to use BatchNorm after Conv2d Layer. + If conv_with_batchnorm is a list, its length must be equal to the length of + conv_num_filter. Otherwise, conv_with_batchnorm indicates whether all the + Conv2d Layer follows a BatchNorm. Default False. + conv_batchnorm_drop_rate (float|list): Indicates the drop_rate of Dropout Layer + after BatchNorm. If conv_batchnorm_drop_rate is a list, its length must be + equal to the length of conv_num_filter. Otherwise, drop_rate of all Dropout + Layers is conv_batchnorm_drop_rate. Default 0.0. + pool_stride (int|list|tuple): The pooling stride of Pool2d layer. If pool_stride + is a list or tuple, it must contain two integers, (pooling_stride_H, + pooling_stride_W). Otherwise, the pooling_stride_H = pooling_stride_W = pool_stride. + Default 1. + pool_type (str): Pooling type can be :math:`max` for max-pooling and :math:`avg` for + average-pooling. Default :math:`max`. + use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn + library is installed. Default: True + use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled + with mkldnn library. Default: False + + Return: + Variable: The final result after serial computation using Convolution2d, + BatchNorm, DropOut, and Pool2d. + + Examples: + .. code-block:: python + + img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') + conv_pool = fluid.nets.img_conv_group(input=img, + num_channels=3, + conv_padding=1, + conv_num_filter=[3, 3], + conv_filter_size=3, + conv_act="relu", + pool_size=2, + pool_stride=2) """ tmp = input assert isinstance(conv_num_filter, list) or \ @@ -74,6 +201,7 @@ def img_conv_group(input, if not hasattr(obj, '__len__'): return [obj] * len(conv_num_filter) else: + assert len(obj) == len(conv_num_filter) return obj conv_padding = __extend_list__(conv_padding) @@ -119,6 +247,39 @@ def sequence_conv_pool(input, param_attr=None, act="sigmoid", pool_type="max"): + """ + The sequence_conv_pool is composed with Sequence Convolution and Pooling. + + Args: + input (Variable): The input of sequence_conv, which supports variable-time + length input sequence. The underlying of input is a matrix with shape + (T, N), where T is the total time steps in this mini-batch and N is + the input_hidden_size + num_filters(int): The number of filter. + filter_size (int): The filter size. + param_attr (ParamAttr): The parameters to the Sequence_conv Layer. Default: None. + act (str): Activation type for Sequence_conv Layer. Default: "sigmoid". + pool_type (str): Pooling type can be :math:`max` for max-pooling, :math:`average` for + average-pooling, :math:`sum` for sum-pooling, :math:`sqrt` for sqrt-pooling. + Default :math:`max`. + + Return: + Variable: The final result after Sequence Convolution and Pooling. + + Examples: + .. code-block:: python + + input_dim = len(word_dict) + emb_dim = 128 + hid_dim = 512 + data = fluid.layers.data( ame="words", shape=[1], dtype="int64", lod_level=1) + emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim], is_sparse=True) + seq_conv = fluid.nets.sequence_conv_pool(input=emb, + num_filters=hid_dim, + filter_size=3, + act="tanh", + pool_type="sqrt") + """ conv_out = layers.sequence_conv( input=input, num_filters=num_filters, @@ -132,9 +293,9 @@ def sequence_conv_pool(input, def glu(input, dim=-1): """ - The gated linear unit composed by split, sigmoid activation and elementwise - multiplication. Specifically, Split the input into two equal sized parts - :math:`a` and :math:`b` along the given dimension and then compute as + The Gated Linear Units(GLU) composed by split, sigmoid activation and element-wise + multiplication. Specifically, Split the input into two equal sized parts, + :math:`a` and :math:`b`, along the given dimension and then compute as following: .. math:: @@ -147,16 +308,16 @@ def glu(input, dim=-1): Args: input (Variable): The input variable which is a Tensor or LoDTensor. dim (int): The dimension along which to split. If :math:`dim < 0`, the - dimension to split along is :math:`rank(input) + dim`. + dimension to split along is :math:`rank(input) + dim`. Default -1. Returns: - Variable: The Tensor variable with half the size of input. + Variable: Variable with half the size of input. Examples: .. code-block:: python - # x is a Tensor variable with shape [3, 6, 9] - fluid.nets.glu(input=x, dim=1) # shape of output: [3, 3, 9] + data = fluid.layers.data(name="words", shape=[3, 6, 9], dtype="float32") + output = fluid.nets.glu(input=data, dim=1) # shape of output: [3, 3, 9] """ a, b = layers.split(input, num_or_sections=2, dim=dim) @@ -189,40 +350,48 @@ def scaled_dot_product_attention(queries, `_. Args: - queries (Variable): The input variable which should be a 3-D Tensor. keys (Variable): The input variable which should be a 3-D Tensor. values (Variable): The input variable which should be a 3-D Tensor. num_heads (int): Head number to compute the scaled dot product - attention. Default value is 1. + attention. Default: 1. dropout_rate (float): The dropout rate to drop the attention weight. - Default value is 0. + Default: 0.0. Returns: - - Variable: A 3-D Tensor computed by multi-head scaled dot product \ - attention. + Variable: A 3-D Tensor computed by multi-head scaled dot product\ + attention. Raises: - ValueError: If input queries, keys, values are not 3-D Tensors. - NOTE: + NOTES: 1. When num_heads > 1, three linear projections are learned respectively - to map input queries, keys and values into queries', keys' and values'. - queries', keys' and values' have the same shapes with queries, keys - and values. - - 1. When num_heads == 1, scaled_dot_product_attention has no learnable - parameters. + to map input queries, keys and values into queries', keys' and values'. + queries', keys' and values' have the same shapes with queries, keys + and values. + 2. When num_heads == 1, scaled_dot_product_attention has no learnable + parameters. Examples: .. code-block:: python - # Suppose q, k, v are Tensors with the following shape: - # q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10] - - contexts = fluid.nets.scaled_dot_product_attention(q, k, v) + queries = fluid.layers.data(name="queries", + shape=[3, 5, 9], + dtype="float32", + append_batch_size=False) + queries.stop_gradient = False + keys = fluid.layers.data(name="keys", + shape=[3, 6, 9], + dtype="float32", + append_batch_size=False) + keys.stop_gradient = False + values = fluid.layers.data(name="values", + shape=[3, 6, 10], + dtype="float32", + append_batch_size=False) + values.stop_gradient = False + contexts = fluid.nets.scaled_dot_product_attention(queries, keys, values) contexts.shape # [3, 5, 10] """ if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 0fdc9a035292b3390cece6c5821a60b1b281e54d..25cc1355d5a53e44b7f45c1f7d80673abcf567ec 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -27,6 +27,40 @@ BuildStrategy = core.ParallelExecutor.BuildStrategy class ParallelExecutor(object): + """ + ParallelExecutor can run program in parallel. + + Args: + use_cuda (bool): Whether to use CUDA or not. + loss_name (str): The loss name must set in training. Default None. + main_program (Program): The program that need to run, if not provided, + then default_main_program will be used. Default None. + share_vars_from(ParallelExecutor): If provied, it will share variables + from the specified ParallelExecutor. Default None. + num_trainers(int): If greater than 1, NCCL will be initialized with + multiple rank of nodes, each node should have same number of GPUs. + Distributed training will be enabled then. Default 1. + trainer_id(int: Must use together with num_trainers. trainer_id is the + "rank" of current node starts from 0. Default 0. + + Returns: + ParallelExecutor: The initialized ParallelExecutor object. + + Raises: + TypeError: If share_vars_from is provided, but not ParallelExecutor object. + + Examples: + .. code-block:: python + + train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name) + test_exe = fluid.ParallelExecutor(use_cuda=True, + main_program=test_program, + share_vars_from=train_exe) + + train_loss, = train_exe.run([loss.name], feed=feed_dict) + test_loss, = test_exe.run([loss.name], feed=feed_dict) + """ + def __init__(self, use_cuda, loss_name=None, @@ -37,42 +71,6 @@ class ParallelExecutor(object): num_trainers=1, trainer_id=0, **kwargs): - """ - ParallelExecutor can run program in parallel. - - Args: - use_cuda(bool): Whether to use CUDA or not. - loss_name(str, default None): The loss name must set in training. - main_program(Program, default None): The program that need to run, - if not provided, then default_main_program will be used. - share_vars_from(ParallelExecutor, default None): If provied, - it will share variables from the specified ParallelExecutor. - num_trainers(int, default 1): If greater than 1, NCCL will be - initialized with multpile rank of nodes, each node should have - same number of GPUs. Distributed training will be enabled then. - trainer_id(int, default 0): Must use together with num_trainers. - trainer_id is the "rank" of current node starts from 0. - - Returns: - A ParallelExecutor object. - - Raises: - TypeError: If share_vars_from is provided, but not ParallelExecutor - object. - - Examples: - .. code-block:: python - - train_exe = fluid.ParallelExecutor( - use_cuda=True, loss_name=loss.name) - test_exe = fluid.ParallelExecutor( - use_cuda=True, - main_program=test_program, - share_vars_from=train_exe) - - train_loss, = train_exe.run([loss.name], feed=feed_dict) - test_loss, = test_exe.run([loss.name], feed=feed_dict) - """ if len(kwargs) != 0: err_msg = "" for key in kwargs: @@ -131,10 +129,16 @@ class ParallelExecutor(object): main = main_program main = main if main else framework.default_main_program() scope = executor.global_scope() + # FIXME(Yancey1989): it's a temporary approach to determinate the distribute + # train program, call self.bcast_param() at the end of each mini-batch. + self.is_dist = True if "recv" in [ + op.type for op in main.global_block().ops + ] else False if share_vars_from and not isinstance(share_vars_from, ParallelExecutor): raise TypeError("share_vars_from must be ParallelExecutor.") + local_scopes = share_vars_from.executor.local_scopes( ) if share_vars_from else [] @@ -166,12 +170,14 @@ class ParallelExecutor(object): element in the list will be copied to each device directly. For example, if the feed is a dict: + >>> exe = ParallelExecutor() >>> # the image will be splitted into devices. If there is two devices >>> # each device will process an image with shape (24, 1, 28, 28) >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))}) For example, if the feed is a list: + >>> exe = ParallelExecutor() >>> # each device will process each element in the list. >>> # the 1st device will process an image with shape (48, 1, 28, 28) @@ -182,18 +188,40 @@ class ParallelExecutor(object): >>> {"image": numpy.random.random(size=(32, 1, 28, 28))}, >>> ]) - Args: fetch_list(list): The fetched variable names feed(list|dict|None): The feed variables. If the feed is a dict, tensors in that dict will be splitted into each devices. If the feed is a list, each element of the list will be copied - to each device. + to each device. Default None. feed_dict: Alias for feed parameter, for backward compatibility. - This parameter is deprecated. + This parameter has been deprecated. Default None. + + Returns: + List: The fetched result list. + + Raises: + ValueError: If the feed is a list, but its length is not equal the + length of active places, or its element's is not dict. + + NOTES: + 1. If the feed's type is dict, the number of data that feeds to + ParallelExecutor must be bigger than active places. Otherwise, + it will throw exception from C++ side. Special attention should be + paid to check whether the last batch of the dataset is bigger + than active places. + 2. If active places are more than one, the fetch results for each + variable is a list, and each element of this list is the variable of + respective active place. - Returns: fetched result list. + Examples: + .. code-block:: python + pe = fluid.ParallelExecutor(use_cuda=use_cuda, + loss_name=avg_cost.name, + main_program=fluid.default_main_program()) + loss = pe.run(feed=feeder.feed(cur_batch), + fetch_list=[avg_cost.name])) """ if feed is None and feed_dict is not None: feed = feed_dict @@ -238,9 +266,17 @@ class ParallelExecutor(object): fetch_var_name = '@FETCHED_VAR_NAME@' self.executor.run(fetch_list, fetch_var_name) arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() + + if self.is_dist: + self.bcast_params() + return [arr[i] for i in range(len(arr))] def bcast_params(self): + """ + Broadcast the parameters to other devices. It is used during + distributed training. + """ self.executor.bcast_params(set(self.persistable_vars)) @property diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index 1c6970441bccdc1c1221503256c30c83502bd123..0a42b9fca8dba7a11b414990be6c04c93158864f 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -22,6 +22,35 @@ __all__ = [ class ParamAttr(object): + """ + Parameter attributes object. To fine-tuning network training process, user + can set parameter's attributes to control training details. Such as learning rate, + regularization, trainable, do_model_average and the method to initialize param. + + + Args: + name(str): The parameter's name. Default None. + initializer(Initializer): The method to initial this parameter. Default None. + learning_rate(float): The parameter's learning rate. The learning rate when + optimize is :math:`global\_lr * parameter\_lr * scheduler\_factor`. + Default 1.0. + regularizer(WeightDecayRegularizer): Regularization factor. Default None. + trainable(bool): Whether this parameter is trainable. Default True. + gradient_clip(BaseGradientClipAttr): The method to clip this parameter's + gradient. Default None. + do_model_average(bool): Whether this parameter should do model average. + Default False. + + Examples: + .. code-block:: python + + w_param_attrs = fluid.ParamAttr(name="fc_weight", + learning_rate=0.5, + regularizer=fluid.L2Decay(1.0), + trainable=True) + y_predict = fluid.layers.fc(input=x, size=10, param_attr=w_param_attrs) + """ + def __init__(self, name=None, initializer=None, @@ -29,7 +58,7 @@ class ParamAttr(object): regularizer=None, trainable=True, gradient_clip=None, - do_model_average=None): + do_model_average=False): self.name = name self.initializer = initializer self.learning_rate = learning_rate @@ -39,6 +68,16 @@ class ParamAttr(object): self.model_average = do_model_average def set_default_initializer(self, initializer): + """ + Set the default initializer, the initializer should be Constant, + Uniform, Normal, Xavier, MSRA. + + Args: + initializer(Initializer): the initializer to set. + + Returns: + None + """ if initializer is None: if self.initializer is None: raise ValueError("ParamAttr.initializer is not set") @@ -50,13 +89,45 @@ class ParamAttr(object): self.initializer = initializer def set_default_param_initializer(self): + """ + Set the default initializer for the parameter with Xavier. + + Args: + None. + + Returns: + None. + """ self.set_default_initializer(Xavier()) def set_default_bias_initializer(self): + """ + Set the default initializer for the bias with Constant(0.0). + + Args: + None. + + Returns: + None. + """ self.set_default_initializer(Constant(0.0)) @staticmethod def to_attr(arg): + """ + Create ParamAttr[s]. + + Args: + arg: Arguments to initialize ParamAttr[s]. arg's type can be + str, Initializer, float, WeightDecayRegularizer, BaseGradientClipAttr, + bool, ParamAttr, or a list of above type. + + Returns: + ParamAttr[s]: ParamAttr[s] initialized with arg. + + Raises: + arg can not initialize a ParamAttr. + """ if arg is None: return ParamAttr() elif isinstance(arg, list) or isinstance(arg, tuple): @@ -75,6 +146,15 @@ class ParamAttr(object): raise TypeError("{0} cast to ParamAttr".format(type(arg))) def to_kwargs(self, with_initializer=False): + """ + Returns the attributes of this parameter. + + Args: + with_initializer(bool): Whether to add initializer attr. + + Returns: + Parameter attributes(map): The attributes of this parameter. + """ kwargs = { 'name': self.name, 'optimize_attr': { @@ -92,9 +172,27 @@ class ParamAttr(object): class WeightNormParamAttr(ParamAttr): """ - Used for weight normalization. Any field in ParamAttr can also be set here. - Besides, an extra field dim can be set to indicate the dimension except - which to normalize. + Used for weight Norm. Weight Norm is a reparameterization of the weight vectors + in a neural network that decouples the length of those weight vectors from + their direction. Weight Norm has been implemented as discussed in this + paper: `Weight Normalization: A Simple Reparameterization to Accelerate + Training of Deep Neural Networks + `_. + + Args: + dim(list): The parameter's name. Default None. + kwargs: Any field in ParamAttr. Default None. + + Examples: + .. code-block:: python + + data = fluid.layers.data(name="data", shape=[3, 32, 32], dtype="float32") + fc = fluid.layers.fc(input=data, + size=1000, + param_attr=WeightNormParamAttr( + dim=None, + name='weight_norm_param')) + """ # List to record the parameters reparameterized by weight normalization. # If these parameters are treated as Variable rather than Parameter, diff --git a/python/paddle/fluid/recordio_writer.py b/python/paddle/fluid/recordio_writer.py index 8d48e9abef0fb9861284c6302b30efb0e3994989..bd57772713057f12b876942de58ee43527e94834 100644 --- a/python/paddle/fluid/recordio_writer.py +++ b/python/paddle/fluid/recordio_writer.py @@ -36,6 +36,45 @@ def convert_reader_to_recordio_file( compressor=core.RecordIOWriter.Compressor.Snappy, max_num_records=1000, feed_order=None): + """ + Convert a Python Reader to a recordio file. + + Please see :ref:`api_guide_python_reader` and :ref:`api_guide_reader_op` for + details. + + Examples: + + >>> import paddle.fluid as fluid + >>> import paddle.dataset.mnist as mnist + >>> import paddle + >>> + >>> tmp_program = fluid.Program() + >>> with fluid.program_guard(tmp_program): + >>> img = fluid.layers.data(name='img', shape=[784]) + >>> label = fluid.layers.data(name='label', shape=[1], dtype='int64') + >>> feeder = fluid.DataFeeder(feed_list=[img, label], place=fluid.CPUPlace()) + >>> # mnist.recordio will be generated in current directory + >>> fluid.recordio_writer.convert_reader_to_recordio_file( + >>> filename="mnist.recordio", + >>> reader_creator=paddle.batch(mnist.train(), batch_size=32), + >>> feeder=feeder) + + Args: + filename(str): The recordio filename. + reader_creator(callable): The Python Reader Creator. See + :ref:`api_guide_python_reader`. + feeder(DataFeeder): The DataFeeder instance. Used to convert + :code:`reader_creator` to :code: `lod_tensor` + compressor: Must in fluid.core.RecordIOWriter.Compressor.Snappy or + fluid.core.RecordIOWriter.Compressor.NoCompress. Use :code:`Snappy` + by default. + max_num_records(int): Maximum number of records in one chuck. Each record + is each return value from reader function + feed_order(list): The order of variable names that the reader returns + + Returns: + int: the number of record that saved. + """ if feed_order is None: feed_order = feeder.feed_names counter = 0 @@ -58,6 +97,17 @@ def convert_reader_to_recordio_files( compressor=core.RecordIOWriter.Compressor.Snappy, max_num_records=1000, feed_order=None): + """ + convert a python reader to many recordio files. + + This API is basically same as :code:`convert_reader_to_recordio_file`, + instead of it will create many recordio files. Each file contains at + most :code:`batch_per_file` records. + + Please reference + :ref:`api_fluid_recordio_writer_convert_reader_to_recordio_file` for more + details. + """ if feed_order is None: feed_order = feeder.feed_names f_name, f_ext = os.path.splitext(filename) diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py index 87c11e7880e73b911f21dda77c1cc2b4850b3591..b04f25ef874cc6204211a4f5f5991a0ec8c473dd 100644 --- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py +++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py @@ -15,6 +15,7 @@ import unittest import numpy as np from op_test import OpTest +import paddle.fluid.core as core def bilinear_interp_np(input, out_h, out_w, out_size): @@ -45,9 +46,9 @@ def bilinear_interp_np(input, out_h, out_w, out_size): out[:, :, i, j] = h2lambda*(w2lambda*input[:, :, h, w] + w1lambda*input[:, :, h, w+wid]) + \ - h1lambda*(w2lambda*input[:, :, h+hid, w] + - w1lambda*input[:, :, h+hid, w+wid]) - return out.astype("float32") + h1lambda*(w2lambda*input[:, :, h+hid, w] + + w1lambda*input[:, :, h+hid, w+wid]) + return out.astype(input.dtype) class TestBilinearInterpOp(OpTest): @@ -122,5 +123,44 @@ class TestCase6(TestBilinearInterpOp): self.out_size = np.array([65, 129]).astype("int32") +class TestBilinearInterpOpUint8(OpTest): + def setUp(self): + self.out_size = None + self.init_test_case() + self.op_type = "bilinear_interp" + input_np = np.random.randint( + low=0, high=256, size=self.input_shape).astype("uint8") + output_np = bilinear_interp_np(input_np, self.out_h, self.out_w, + self.out_size) + self.inputs = {'X': input_np} + if self.out_size is not None: + self.inputs['OutSize'] = self.out_size + self.attrs = {'out_h': self.out_h, 'out_w': self.out_w} + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output_with_place(place=core.CPUPlace(), atol=1) + + def init_test_case(self): + self.input_shape = [1, 3, 9, 6] + self.out_h = 10 + self.out_w = 9 + + +class TestCase1Uint8(TestBilinearInterpOpUint8): + def init_test_case(self): + self.input_shape = [2, 3, 128, 64] + self.out_h = 120 + self.out_w = 50 + + +class TestCase2Uint8(TestBilinearInterpOpUint8): + def init_test_case(self): + self.input_shape = [4, 1, 7, 8] + self.out_h = 5 + self.out_w = 13 + self.out_size = np.array([6, 15]).astype("int32") + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index b4cb019aea347fcd2207829fe5b5b065159f7882..ad3872ab0d6ff82db12b436b8b729eead6a3e16d 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -33,23 +33,59 @@ __all__ = [ class BeginEpochEvent(object): + """ + The begin of a training epoch. + + Args: + epoch_id(int): The current epoch ID. + """ + def __init__(self, epoch_id): self.epoch = epoch_id class EndEpochEvent(object): + """ + The end of a training epoch. + + Args: + epoch_id(int): The current epoch ID. + """ + def __init__(self, epoch_id): self.epoch = epoch_id class BeginStepEvent(object): + """ + The begin of a training epoch. + + Args: + epoch_id(int): The current epoch ID. + step_id(int): The current step ID. + """ + def __init__(self, epoch_id, step_id): self.epoch = epoch_id self.step = step_id self.fetch_metrics = True + """ + If fetch_metrics is true, the metrics will be fetched at the + EndStepEvent. Default is True. + """ class EndStepEvent(object): + """ + The end of a training step. + + Args: + epoch_id(int): The current epoch ID. + step_id(int): The current step ID. + metrics(list): A list of fetched tensor. The order of this list is same + as the :code:`train_func` returns. + """ + def __init__(self, epoch_id, step_id, metrics): self.epoch = epoch_id self.step = step_id @@ -57,6 +93,27 @@ class EndStepEvent(object): class CheckpointConfig(object): + """ + Parameter object for :code:`fluid.io.save_checkpoint` and + :code:`fluid.Trainer`. Used to configuration how to save checkpoint. + + Args: + checkpoint_dir(str): Directory path to save check point. Default is the + current directory. + + max_num_checkpoints(int): The max number of local check points. + epoch_interval(int): Every number of epoch to save check point. + step_interval(int): Every number of step to save check point. + + Examples: + >>> config = fluid.CheckpointConfig("./checkpoints") + >>> trainer = fluid.Trainer(train_func=train_program, + >>> place=place, + >>> optimizer_func=optimizer_func, + >>> checkpoint_config=config) + >>> trainer.train(...) + """ + def __init__(self, checkpoint_dir=None, max_num_checkpoints=3, @@ -106,11 +163,62 @@ def check_and_get_place(place): class Trainer(object): """ + A trainer wraps MultiGPU/MultiNode training loops and can be used to train a + simple neural network easily. + + This API takes a :code:`train_func`. A :code:`train_func` is a function that + return loss as it first return value. The reset value can be fetched by + EndStepEvent.metrics + + This API also takes a :code:`optimizer_func` that will return an optimizer + instance. + + For example, to train a MLP for MNIST dataset, the sample program is + + >>> import paddle.fluid as fluid + >>> + >>> def mlp(image, layer_sizes=[200, 100], activation="relu", num_classes=10): + >>> hidden = image + >>> for layer_size in layer_sizes: + >>> hidden = fluid.layers.fc(input=hidden, size=layer_size, act=activation) + >>> return fluid.layers.fc(input=hidden, size=num_classes, act="softmax") + >>> + >>> def train_mnist_mlp(): + >>> img = fluid.layers.data(name='image', shape=[784]) + >>> label = fluid.layers.data(name='label', shape=[1], dtype='int64') + >>> prediction = mlp(img) + >>> return fluid.layers.mean(fluid.layers.cross_entropy(prediction, label)) + >>> + >>> def optimizer(): + >>> return fluid.optimizer.Adam() + >>> + >>> trainer = Trainer(train_func=train_mnist_mlp, + >>> optimizer_func=optimizer, + >>> place=fluid.CUDAPlace(0), + >>> parallel=True) + >>> + >>> def train_callback(event): + >>> if isinstance(event, fluid.EndStepEvent): + >>> print "Epoch ID", event.epoch, "Step ID",\ + >>> event.step, "AvgLoss", event.metrics[0] + >>> elif isinstance(event, fluid.EndEpochEvent): + >>> trainer.save_params("./model_{0}".format(event.epoch)) + >>> + >>> trainer.train(num_epochs=100, event_handler=train_callback) + + For more example, please see :ref:`api_guide_high_level_api`. + Args: - train_func(callable): A function which will return loss. The loss must be a scalar. + train_func(callable): A function which will return loss. The loss must be + a scalar tensor. optimizer_func(callable): A function that returns an Optimizer object. - place: The device place of this trainer. + place(CUDAPlace|CPUPlace): The device place of this trainer. If + :code:`parallel=True,` all CUDA Places will be used if :code:`place` + is a :code:`CUDAPlace`. + parallel(bool): True if use multiple devices. + checkpoint_config(CheckpointConfig): Configuration about how to save + checkpoints. """ def __init__(self, @@ -122,9 +230,6 @@ class Trainer(object): checkpoint_config=None): self.__stop = False self.parallel = parallel - # 1. we need to generate a framework.Program by calling - # program_func. Reference: fluid.program_guard in - # test_word2vec.py # config for checkpoint # only chief worker will save variables @@ -138,6 +243,10 @@ class Trainer(object): self.scope = core.Scope() + # 1. we need to generate a framework.Program by calling + # program_func. Reference: fluid.program_guard in + # test_word2vec.py + self.startup_program = framework.Program() self.train_program = framework.Program() @@ -280,17 +389,18 @@ class Trainer(object): def train(self, num_epochs, event_handler, reader=None, feed_order=None): """ - Train the model. + Start the train loop to train the model. Args: - num_epochs: The number of epoch. An epoch will process all data in reader - event_handler: The event handler. A function with type (ev:Event)->void - reader: - feed_order: Feeding order of reader. None will following the defining + num_epochs(int): The number of epoch. An epoch will process all data in reader + event_handler(callable): The event handler. A function with type (ev:Event)->void + reader(callable): A reader creator object. See also + :ref:`api_guide_python_reader` . + feed_order(list): Feeding order of reader. None will following the defining order in program Returns: - + None """ training_role = os.getenv("PADDLE_TRAINING_ROLE", "") if training_role == "PSERVER": @@ -310,16 +420,24 @@ class Trainer(object): Test the model on given test data Args: - reader: The reader that yields test data. - feed_order: Feeding order of reader. None will following the defining - order in program + reader(callable): The reader that yields test data. + feed_order(list): Feeding order of reader. None will following the + defining order in program """ return self._test_by_executor(reader, feed_order, self.train_func_outputs) def save_params(self, param_path): - # reference: save_persistables in io.py + """ + Save all parameters into :code:`param_path`. + + Args: + param_path(str): The path to save parameters. + + Returns: + None + """ with self._prog_and_scope_guard(): exe = executor.Executor(self.place) io.save_persistables(exe, dirname=param_path) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 5bbeeeaed666651300fdd2e29abed6a6062de44f..a3f0a4ffe28f145c33b10117f69f933f4b93be29 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -12,14 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Transpile the program to distributed data-parallelism programs. -The main_program will be transformed to use a remote parameter server -to do parameter optimization. And the optimization graph will be put -into a parameter server program. - -Use different methods to split trainable variables to different -parameter servers. - Steps to transpile trainer: 1. split variable to multiple blocks, aligned by product(dim[1:]) (width). 2. rename splited grad variables to add trainer_id suffix ".trainer_%d". @@ -117,129 +109,41 @@ def slice_variable(var_list, slice_count, min_block_size=8192): return blocks -class DistributeTranspiler: - def _has_distributed_lookup_table(self): - # process lookup_table_op - # 1. check all lookup_table_op is distributed - # 2. check all lookup_table_op share the same table. - distributed_lookup_table_ops = [] - # support only one distributed_lookup_table now - self.table_name = None - for op in self.origin_program.global_block().ops: - if op.type == LOOKUP_TABLE_TYPE: - if op.attrs['is_distributed'] is True: - if self.table_name is None: - self.table_name = op.input("W")[0] - if self.table_name != op.input("W")[0]: - raise RuntimeError("all distributed lookup_table_ops" - " should have only one table") - distributed_lookup_table_ops.append(op) - else: - if self.table_name is not None: - assert op.input("W")[0] != self.table_name - - return len(distributed_lookup_table_ops) > 0 - - def _update_dist_lookup_table_vars(self, param_list, grad_list, - params_grads): - # TODO(wuyi): put find a way to put dist lookup table stuff all together. - # update self.table_param_grad and self.trainer_side_table_grad_list - program = self.origin_program - if self.has_distributed_lookup_table: - param_list = [ - param for param in param_list if param.name != self.table_name - ] - grad_list = [ - grad for grad in grad_list - if grad.name != grad_var_name(self.table_name) - ] - self.table_param_grad = [ - param_grad for param_grad in params_grads - if param_grad[0].name == self.table_name - ][0] - table_grad_var = self.table_param_grad[1] - if self.sync_mode: - self.trainer_side_table_grad_list = [ - program.global_block().create_var( - name="%s.trainer_%d.pserver_%d" % - (table_grad_var.name, self.trainer_id, index), - type=table_grad_var.type, - shape=table_grad_var.shape, - dtype=table_grad_var.dtype) - for index in range(len(self.pserver_endpoints)) - ] - else: - self.trainer_side_table_grad_list = [ - program.global_block().create_var( - name="%s.pserver_%d" % (table_grad_var.name, index), - type=table_grad_var.type, - shape=table_grad_var.shape, - dtype=table_grad_var.dtype) - for index in range(len(self.pserver_endpoints)) - ] - return param_list, grad_list - - def _init_splited_vars(self, slice_var_up): - # update these mappings for further transpile: - # 1. param_var_mapping: param var name -> [splited params vars] - # 2. grad_var_mapping: grad var name -> [splited grads vars] - # 3. grad_param_mapping: grad.blockx -> param.blockx - # 4. param_grad_ep_mapping: ep -> {"params": [], "grads": []} - - param_list = [] - grad_list = [] - param_grad_set = set() - for p, g in self.params_grads: - # skip parameter marked not trainable - if type(p) == Parameter and p.trainable == False: - continue - if p.name not in param_grad_set: - param_list.append(p) - param_grad_set.add(p.name) - if g.name not in param_grad_set: - grad_list.append(g) - param_grad_set.add(g.name) - - param_list, grad_list = self._update_dist_lookup_table_vars( - param_list, grad_list, self.params_grads) - - if slice_var_up: - # when we slice var up into blocks, we will slice the var according to - # pserver services' count. A pserver may have two or more listening ports. - grad_blocks = slice_variable(grad_list, len(self.pserver_endpoints)) - param_blocks = slice_variable(param_list, - len(self.pserver_endpoints)) - else: - # when we do NOT slice var up into blocks, we will always slice params - # grads into one block. - grad_blocks = slice_variable(grad_list, 1) - param_blocks = slice_variable(param_list, 1) - assert (len(grad_blocks) == len(param_blocks)) - - # origin_varname -> [splited_var] - self.param_var_mapping = self._create_vars_from_blocklist( - self.origin_program, param_blocks) - self.grad_var_mapping = self._create_vars_from_blocklist( - self.origin_program, - grad_blocks, - add_trainer_suffix=self.trainer_num > 1) - self.grad_param_mapping = dict() - for g, p in zip(grad_blocks, param_blocks): - g_name, g_bid, _ = g.split(":") - p_name, p_bid, _ = p.split(":") - self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] = \ - self.param_var_mapping[p_name][int(p_bid)] - - # create mapping of endpoint -> split var to create pserver side program - self.param_grad_ep_mapping = dict() - [ - self.param_grad_ep_mapping.update({ - ep: { - "params": [], - "grads": [] - } - }) for ep in self.pserver_endpoints - ] +class DistributeTranspiler(object): + """ + **DistributeTranspiler** + + Convert the fluid program to distributed data-parallelism programs. + + The main_program will be transformed to use a remote parameter server + to do parameter optimization. And the optimization graph will be put + into a parameter server program. + + Examples: + .. code-block:: python + + # Define your model before these codes. + port = os.getenv("PADDLE_PSERVER_PORT", "6174") + pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "") + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) + trainers = int(os.getenv("PADDLE_TRAINERS")) + current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port + trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) + role = os.getenv("PADDLE_TRAINING_ROLE") + + t = distribute_transpiler.DistributeTranspiler() + t.transpile( + trainer_id, pservers=pserver_endpoints, trainers=trainers) + if role == "PSERVER": + pserver_program = t.get_pserver_program(current_endpoint) + pserver_startup_program = t.get_startup_program(current_endpoint, + pserver_program) + elif role == "TRAINER": + trainer_program = t.get_trainer_program() + """ def transpile(self, trainer_id, @@ -250,20 +154,20 @@ class DistributeTranspiler: split_method=RoundRobin, sync_mode=True): """ - :param trainer_id: one unique id for each trainer in a job. - :type trainer_id: int - :param program: program to transpile, default is default_main_program - :type program: Program - :param pservers: parameter server endpoints like "m1:6174,m2:6174" - :type pservers: string - :param trainers: total number of workers/trainers in the job - :type trainers: int - :param split_method: A function to determin how to split variables - to different servers equally. - :type split_method: function - :type sync_mode: boolean default True - :param sync_mode: if sync_mode is set True, it means that dist transpiler - will transpile the program into sync_mode pserver and trainer program. + Run the transpiler. + + Args: + trainer_id (int): id for current trainer worker, if you have + n workers, the id may range from 0 ~ n-1 + program (Program|None): program to transpile, + default is fluid.default_main_program(). + pservers (str): comma separated ip:port string for the pserver + list. + trainers (int): number of trainers in the distributed job. + slice_var_up (bool): Do Tensor slice for pservers, default is True. + split_method (PSDispatcher): RoundRobin or HashName can be used + try to choose the best method to balance loads for pservers. + sync_mode (bool): Do sync training or not, default is True. """ assert (split_method.__bases__[0] == PSDispatcher) if program is None: @@ -390,6 +294,12 @@ class DistributeTranspiler: self._split_table_grad_and_add_send_vars(program, pserver_endpoints) def get_trainer_program(self): + """ + Get transpiled trainer side program. + + Returns: + Program: trainer side program. + """ # remove optimize ops and add a send op to main_program delete_ops(self.origin_program.global_block(), self.optimize_ops) # FIXME(typhoonzero): serialize once will fix error occurs when clone. @@ -398,12 +308,19 @@ class DistributeTranspiler: def get_pserver_program(self, endpoint): """ - Get pserver side program using the endpoint. - TODO(panyx0718): Revisit this assumption. what if #blocks > #pservers. - NOTE: assume blocks of the same variable is not distributed - on the same pserver, only change param/grad varnames for - trainers to fetch. + Get parameter server side program. + + Args: + endpoint (str): current parameter server endpoint. + + Returns: + Program: the program for current parameter server to run. """ + # TODO(panyx0718): Revisit this assumption. what if #blocks > #pservers. + # NOTE: assume blocks of the same variable is not distributed + # on the same pserver, only change param/grad varnames for + # trainers to fetch. + # step1 pserver_program = Program() # step2: Create vars to receive vars at parameter servers. @@ -585,6 +502,14 @@ class DistributeTranspiler: Get startup program for current parameter server. Modify operator input variables if there are variables that were split to several blocks. + + Args: + endpoint (str): current pserver endpoint. + pserver_program (Program): call get_pserver_program first and + pass the result here. + + Returns: + Program: parameter server side startup program. """ s_prog = Program() orig_s_prog = default_startup_program() @@ -636,6 +561,129 @@ class DistributeTranspiler: # ====================== private transpiler functions ===================== + def _has_distributed_lookup_table(self): + # process lookup_table_op + # 1. check all lookup_table_op is distributed + # 2. check all lookup_table_op share the same table. + distributed_lookup_table_ops = [] + # support only one distributed_lookup_table now + self.table_name = None + for op in self.origin_program.global_block().ops: + if op.type == LOOKUP_TABLE_TYPE: + if op.attrs['is_distributed'] is True: + if self.table_name is None: + self.table_name = op.input("W")[0] + if self.table_name != op.input("W")[0]: + raise RuntimeError("all distributed lookup_table_ops" + " should have only one table") + distributed_lookup_table_ops.append(op) + else: + if self.table_name is not None: + assert op.input("W")[0] != self.table_name + + return len(distributed_lookup_table_ops) > 0 + + def _update_dist_lookup_table_vars(self, param_list, grad_list, + params_grads): + # TODO(wuyi): put find a way to put dist lookup table stuff all together. + # update self.table_param_grad and self.trainer_side_table_grad_list + program = self.origin_program + if self.has_distributed_lookup_table: + param_list = [ + param for param in param_list if param.name != self.table_name + ] + grad_list = [ + grad for grad in grad_list + if grad.name != grad_var_name(self.table_name) + ] + self.table_param_grad = [ + param_grad for param_grad in params_grads + if param_grad[0].name == self.table_name + ][0] + table_grad_var = self.table_param_grad[1] + if self.sync_mode: + self.trainer_side_table_grad_list = [ + program.global_block().create_var( + name="%s.trainer_%d.pserver_%d" % + (table_grad_var.name, self.trainer_id, index), + type=table_grad_var.type, + shape=table_grad_var.shape, + dtype=table_grad_var.dtype) + for index in range(len(self.pserver_endpoints)) + ] + else: + self.trainer_side_table_grad_list = [ + program.global_block().create_var( + name="%s.pserver_%d" % (table_grad_var.name, index), + type=table_grad_var.type, + shape=table_grad_var.shape, + dtype=table_grad_var.dtype) + for index in range(len(self.pserver_endpoints)) + ] + return param_list, grad_list + + def _init_splited_vars(self, slice_var_up): + # update these mappings for further transpile: + # 1. param_var_mapping: param var name -> [splited params vars] + # 2. grad_var_mapping: grad var name -> [splited grads vars] + # 3. grad_param_mapping: grad.blockx -> param.blockx + # 4. param_grad_ep_mapping: ep -> {"params": [], "grads": []} + + param_list = [] + grad_list = [] + param_grad_set = set() + for p, g in self.params_grads: + # skip parameter marked not trainable + if type(p) == Parameter and p.trainable == False: + continue + if p.name not in param_grad_set: + param_list.append(p) + param_grad_set.add(p.name) + if g.name not in param_grad_set: + grad_list.append(g) + param_grad_set.add(g.name) + + param_list, grad_list = self._update_dist_lookup_table_vars( + param_list, grad_list, self.params_grads) + + if slice_var_up: + # when we slice var up into blocks, we will slice the var according to + # pserver services' count. A pserver may have two or more listening ports. + grad_blocks = slice_variable(grad_list, len(self.pserver_endpoints)) + param_blocks = slice_variable(param_list, + len(self.pserver_endpoints)) + else: + # when we do NOT slice var up into blocks, we will always slice params + # grads into one block. + grad_blocks = slice_variable(grad_list, 1) + param_blocks = slice_variable(param_list, 1) + assert (len(grad_blocks) == len(param_blocks)) + + # origin_varname -> [splited_var] + self.param_var_mapping = self._create_vars_from_blocklist( + self.origin_program, param_blocks) + self.grad_var_mapping = self._create_vars_from_blocklist( + self.origin_program, + grad_blocks, + add_trainer_suffix=self.trainer_num > 1) + self.grad_param_mapping = dict() + for g, p in zip(grad_blocks, param_blocks): + g_name, g_bid, _ = g.split(":") + p_name, p_bid, _ = p.split(":") + self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] = \ + self.param_var_mapping[p_name][int(p_bid)] + + # create mapping of endpoint -> split var to create pserver side program + self.param_grad_ep_mapping = dict() + [ + self.param_grad_ep_mapping.update({ + ep: { + "params": [], + "grads": [] + } + }) for ep in self.pserver_endpoints + ] + # transpiler function for dis lookup_table def _replace_lookup_table_op_with_prefetch(self, program, pserver_endpoints): diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py index 8bfb554845d9b128f000d6c90cf626416a198eef..999ef43ca0feacbddff5f9db59589ce7097fe77e 100644 --- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -383,6 +383,16 @@ def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0): def release_memory(input_program, skip_opt_set=None): + """ + Modify the input program and insert :code:`delete_op` to early drop not used + variables. The modification will be performed inplace. + + Notes: This is an experimental API and could be removed in next few + releases. Users should not use this API. + + Args: + input_program(Program): The program will be inserted :code:`delete_op`. + """ cfgs = _get_cfgs(input_program) for cfg in cfgs: cfg.release_memory(skip_opt_set=skip_opt_set) diff --git a/python/paddle/fluid/transpiler/ps_dispatcher.py b/python/paddle/fluid/transpiler/ps_dispatcher.py index d6a68677527deb09ace0e3a23cbc093d6d7b4349..dcffadd531719431f27feb464ed58a65c04770ee 100644 --- a/python/paddle/fluid/transpiler/ps_dispatcher.py +++ b/python/paddle/fluid/transpiler/ps_dispatcher.py @@ -33,15 +33,21 @@ class PSDispatcher(object): def dispatch(self, varlist): """ - :param varlist: a list of Variables - :return: a map of pserver endpoint -> varname + Args: + varlist(list): a list of Variables + Returns: + a map of pserver endpoint -> varname """ AssertionError("Interface has not been implemented.") class HashName(PSDispatcher): """ - Hash variable names to several endpoints + Hash variable names to several endpoints using python + "hash()" function. + + Args: + pserver_endpoints (list): list of endpoint(ip:port). """ def __init__(self, pserver_endpoints): @@ -61,7 +67,11 @@ class HashName(PSDispatcher): class RoundRobin(PSDispatcher): """ - Distribute variables to serveral endpoints. + Distribute variables to serveral endpoints using + RondRobin method. + + Args: + pserver_endpoints (list): list of endpoint(ip:port). """ def __init__(self, pserver_endpoints): diff --git a/python/paddle/fluid/unique_name.py b/python/paddle/fluid/unique_name.py index 33c53113ae7e8ed9aeada31f2aed6990b6fea110..776619cd36722e338a9fdd5e13bceeaf3724de2c 100644 --- a/python/paddle/fluid/unique_name.py +++ b/python/paddle/fluid/unique_name.py @@ -16,7 +16,7 @@ import collections import contextlib import sys -__all__ = ['generate', 'switch', 'guard', 'UniqueNameGenerator'] +__all__ = ['generate', 'switch', 'guard'] class UniqueNameGenerator(object): diff --git a/tools/check_ctest_hung.py b/tools/check_ctest_hung.py new file mode 100644 index 0000000000000000000000000000000000000000..7de76c381b29a1ff8dcf2167f0e861dc261aa47b --- /dev/null +++ b/tools/check_ctest_hung.py @@ -0,0 +1,53 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import re + + +def escape(input): + o = input.replace("\n", "") + o = o.replace("\r", "") + return o + + +def main(): + usage = """Usage: +1. Download the Paddle_PR_CI_*.log from TeamCity +2. run: python check_ctest_hung.py Paddle_PR_CI_*.log +3. If there is hung ctest, the result likes: +Diff: set(['test_parallel_executor_crf']) + """ + if len(sys.argv) < 2: + print(usage) + exit(0) + + logfile = sys.argv[1] + started = set() + passed = set() + with open(logfile, "r") as fn: + for l in fn.readlines(): + if l.find("Test ") != -1 and \ + l.find("Passed") != -1: + m = re.search("Test\s+#[0-9]*\:\s([a-z0-9_]+)", escape(l)) + passed.add(m.group(1)) + if l.find("Start ") != -1: + start_parts = escape(l).split(" ") + m = re.search("Start\s+[0-9]+\:\s([a-z0-9_]+)", escape(l)) + started.add(m.group(1)) + print "Diff: ", started - passed + + +if __name__ == "__main__": + main() diff --git a/.clang_format.hook b/tools/codestyle/clang_format.hook similarity index 100% rename from .clang_format.hook rename to tools/codestyle/clang_format.hook diff --git a/.copyright.hook b/tools/codestyle/copyright.hook similarity index 100% rename from .copyright.hook rename to tools/codestyle/copyright.hook diff --git a/tools/codestyle/docstring_checker.py b/tools/codestyle/docstring_checker.py index 54a690462699651d3e14f9b24383df01a9740336..8d4b24a0cf6b743b72dca58fd885f927560964bf 100644 --- a/tools/codestyle/docstring_checker.py +++ b/tools/codestyle/docstring_checker.py @@ -291,6 +291,8 @@ class DocstringChecker(BaseChecker): True if successful otherwise False. """ + if node.name.startswith("__") or node.name.startswith("_"): + return True find = False for t in node.body: if not isinstance(t, astroid.Return): @@ -316,6 +318,8 @@ class DocstringChecker(BaseChecker): Returns: True if successful otherwise False. """ + if node.name.startswith("__") or node.name.startswith("_"): + return True args = [] for arg in node.args.get_children(): if (not isinstance(arg, astroid.AssignName)) \