提交 7ecb7b7d 编写于 作者: S sneaxiy

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into python_data_feeding

...@@ -23,7 +23,7 @@ repos: ...@@ -23,7 +23,7 @@ repos:
- id: clang-format-with-version-check - id: clang-format-with-version-check
name: clang-format name: clang-format
description: Format files with ClangFormat. description: Format files with ClangFormat.
entry: bash ./.clang_format.hook -i entry: bash ./tools/codestyle/clang_format.hook -i
language: system language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$ files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
- repo: local - repo: local
...@@ -52,7 +52,7 @@ repos: ...@@ -52,7 +52,7 @@ repos:
hooks: hooks:
- id: copyright_checker - id: copyright_checker
name: copyright_checker name: copyright_checker
entry: python ./.copyright.hook entry: python ./tools/codestyle/copyright.hook
language: system language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$ files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$ exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
# Use UBUNTU_MIRROR can speed up apt-get speed.
# ARG UBUNTU_MIRROR
# RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv
RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
RUN pip install -U pip
RUN pip install -U kubernetes paddlepaddle
# IMPORTANT: # IMPORTANT:
# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime. # Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
# exmaple: unset http_proxy && unset https_proxy && python fluid_benchmark.py ...
RUN pip install -U pip
RUN pip install -U kubernetes paddlepaddle
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python' RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python'
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python' RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python'
...@@ -14,9 +21,11 @@ RUN pip uninstall -y paddlepaddle && mkdir /workspace ...@@ -14,9 +21,11 @@ RUN pip uninstall -y paddlepaddle && mkdir /workspace
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
RUN chmod +x /usr/bin/paddle_k8s
ADD *.whl / ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s RUN pip install /*.whl && rm -f /*.whl
ENV LD_LIBRARY_PATH=/usr/local/lib ENV LD_LIBRARY_PATH=/usr/local/lib
ADD fluid_benchmark.py recordio_converter.py models/ /workspace/ ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh /workspace/
ADD models/ /workspace/models/
...@@ -264,8 +264,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, ...@@ -264,8 +264,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
break break
else: else:
loss, = exe.run([avg_loss.name], feed=feeder.feed(data)) loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
if args.update_method == "pserver":
exe.bcast_params()
if args.use_reader_op: if args.use_reader_op:
num_samples += args.batch_size * args.gpus num_samples += args.batch_size * args.gpus
else: else:
...@@ -301,9 +299,18 @@ def print_train_time(start_time, end_time, num_samples): ...@@ -301,9 +299,18 @@ def print_train_time(start_time, end_time, num_samples):
(num_samples, train_elapsed, examples_per_sec)) (num_samples, train_elapsed, examples_per_sec))
def print_paddle_envs():
print('----------- Configuration envs -----------')
for k in os.environ:
if "PADDLE_" in k:
print "ENV %s:%s" % (k, os.environ[k])
print('------------------------------------------------')
def main(): def main():
args = parse_args() args = parse_args()
print_arguments(args) print_arguments(args)
print_paddle_envs()
# the unique trainer id, starting from 0, needed by trainer # the unique trainer id, starting from 0, needed by trainer
# only # only
......
...@@ -17,6 +17,7 @@ import copy ...@@ -17,6 +17,7 @@ import copy
import argparse import argparse
import random import random
import os import os
import copy
from kube_templates import pserver, trainer, envs from kube_templates import pserver, trainer, envs
...@@ -109,10 +110,9 @@ def gen_job(): ...@@ -109,10 +110,9 @@ def gen_job():
envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname}) envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname})
envs.append({"name": "PADDLE_TRAINERS", "value": str(args.trainers)}) envs.append({"name": "PADDLE_TRAINERS", "value": str(args.trainers)})
envs.append({"name": "PSERVERS", "value": str(args.pservers)}) envs.append({"name": "PADDLE_PSERVERS", "value": str(args.pservers)})
envs.append({"name": "ENTRY", "value": args.entry}) envs.append({"name": "ENTRY", "value": args.entry})
envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)}) envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
# NOTE: these directories below are cluster specific, please modify # NOTE: these directories below are cluster specific, please modify
# this settings before you run on your own cluster. # this settings before you run on your own cluster.
envs.append({ envs.append({
...@@ -166,7 +166,7 @@ def gen_job(): ...@@ -166,7 +166,7 @@ def gen_job():
tn["spec"]["template"]["spec"]["volumes"] = volumes tn["spec"]["template"]["spec"]["volumes"] = volumes
tn_container["volumeMounts"] = volumeMounts tn_container["volumeMounts"] = volumeMounts
ps_container["env"] = envs ps_container["env"] = copy.deepcopy(envs)
ps_container["env"].append({ ps_container["env"].append({
"name": "PADDLE_TRAINING_ROLE", "name": "PADDLE_TRAINING_ROLE",
"value": "PSERVER" "value": "PSERVER"
......
#!/bin/bash #!/bin/bash
python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler metric > layers.rst python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler metric > layers.rst
for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler
do do
python gen_doc.py ${module} > ${module}.rst python gen_doc.py ${module} > ${module}.rst
done done
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
==========
transpiler
==========
DistributeTranspiler
--------------------
.. autoclass:: paddle.fluid.transpiler.DistributeTranspiler
:members:
:noindex:
InferenceTranspiler
-------------------
.. autoclass:: paddle.fluid.transpiler.InferenceTranspiler
:members:
:noindex:
memory_optimize
---------------
.. autofunction:: paddle.fluid.transpiler.memory_optimize
:noindex:
release_memory
--------------
.. autofunction:: paddle.fluid.transpiler.release_memory
:noindex:
HashName
--------
.. autoclass:: paddle.fluid.transpiler.HashName
:members:
:noindex:
RoundRobin
----------
.. autoclass:: paddle.fluid.transpiler.RoundRobin
:members:
:noindex:
...@@ -13,6 +13,7 @@ cpu_noavx_openblas `fluid.tgz <https://guest:@paddleci.ngrok.io/repository ...@@ -13,6 +13,7 @@ cpu_noavx_openblas `fluid.tgz <https://guest:@paddleci.ngrok.io/repository
cuda7.5_cudnn5_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_ cuda7.5_cudnn5_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
cuda8.0_cudnn5_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_ cuda8.0_cudnn5_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
cuda8.0_cudnn7_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/fluid.tgz>`_ cuda8.0_cudnn7_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/fluid.tgz>`_
cuda9.0_cudnn7_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/fluid.tgz>`_
====================== ======================================== ====================== ========================================
从源码编译 从源码编译
......
...@@ -40,10 +40,9 @@ void Main(bool use_gpu) { ...@@ -40,10 +40,9 @@ void Main(bool use_gpu) {
//# 2. Prepare input. //# 2. Prepare input.
int64_t data[4] = {1, 2, 3, 4}; int64_t data[4] = {1, 2, 3, 4};
PaddleBuf buf{.data = data, .length = sizeof(data)};
PaddleTensor tensor{.name = "", PaddleTensor tensor{.name = "",
.shape = std::vector<int>({4, 1}), .shape = std::vector<int>({4, 1}),
.data = buf, .data = PaddleBuf(data, sizeof(data)),
.dtype = PaddleDType::INT64}; .dtype = PaddleDType::INT64};
// For simplicity, we set all the slots with the same data. // For simplicity, we set all the slots with the same data.
...@@ -55,14 +54,12 @@ void Main(bool use_gpu) { ...@@ -55,14 +54,12 @@ void Main(bool use_gpu) {
//# 4. Get output. //# 4. Get output.
ASSERT_EQ(outputs.size(), 1UL); ASSERT_EQ(outputs.size(), 1UL);
LOG(INFO) << "output buffer size: " << outputs.front().data.length; LOG(INFO) << "output buffer size: " << outputs.front().data.length();
const size_t num_elements = outputs.front().data.length / sizeof(float); const size_t num_elements = outputs.front().data.length() / sizeof(float);
// The outputs' buffers are in CPU memory. // The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) { for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i]; LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
} }
// TODO(Superjomn): this is should be free automatically
free(outputs[0].data.data);
} }
} }
...@@ -86,10 +83,9 @@ void MainThreads(int num_threads, bool use_gpu) { ...@@ -86,10 +83,9 @@ void MainThreads(int num_threads, bool use_gpu) {
for (int batch_id = 0; batch_id < num_batches; ++batch_id) { for (int batch_id = 0; batch_id < num_batches; ++batch_id) {
// 2. Dummy Input Data // 2. Dummy Input Data
int64_t data[4] = {1, 2, 3, 4}; int64_t data[4] = {1, 2, 3, 4};
PaddleBuf buf{.data = data, .length = sizeof(data)};
PaddleTensor tensor{.name = "", PaddleTensor tensor{.name = "",
.shape = std::vector<int>({4, 1}), .shape = std::vector<int>({4, 1}),
.data = buf, .data = PaddleBuf(data, sizeof(data)),
.dtype = PaddleDType::INT64}; .dtype = PaddleDType::INT64};
std::vector<PaddleTensor> inputs(4, tensor); std::vector<PaddleTensor> inputs(4, tensor);
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
...@@ -99,13 +95,13 @@ void MainThreads(int num_threads, bool use_gpu) { ...@@ -99,13 +95,13 @@ void MainThreads(int num_threads, bool use_gpu) {
// 4. Get output. // 4. Get output.
ASSERT_EQ(outputs.size(), 1UL); ASSERT_EQ(outputs.size(), 1UL);
LOG(INFO) << "TID: " << tid << ", " LOG(INFO) << "TID: " << tid << ", "
<< "output buffer size: " << outputs.front().data.length; << "output buffer size: " << outputs.front().data.length();
const size_t num_elements = outputs.front().data.length / sizeof(float); const size_t num_elements =
outputs.front().data.length() / sizeof(float);
// The outputs' buffers are in CPU memory. // The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) { for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i]; LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
} }
free(outputs[0].data.data);
} }
}); });
} }
......
...@@ -13,3 +13,53 @@ See the License for the specific language governing permissions and ...@@ -13,3 +13,53 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/contrib/inference/paddle_inference_api.h" #include "paddle/contrib/inference/paddle_inference_api.h"
namespace paddle {
PaddleBuf::PaddleBuf(PaddleBuf&& other)
: data_(other.data_),
length_(other.length_),
memory_owned_(other.memory_owned_) {
other.memory_owned_ = false;
other.data_ = nullptr;
other.length_ = 0;
}
PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; }
PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
// only the buffer with external memory can be copied
assert(!other.memory_owned_);
data_ = other.data_;
length_ = other.length_;
memory_owned_ = other.memory_owned_;
return *this;
}
void PaddleBuf::Resize(size_t length) {
// Only the owned memory can be reset, the external memory can't be changed.
if (length_ == length) return;
assert(memory_owned_);
Free();
data_ = new char[length];
length_ = length;
memory_owned_ = true;
}
void PaddleBuf::Reset(void* data, size_t length) {
Free();
memory_owned_ = false;
data_ = data;
length_ = length;
}
void PaddleBuf::Free() {
if (memory_owned_ && data_) {
assert(length_ > 0);
delete static_cast<char*>(data_);
data_ = nullptr;
length_ = 0;
}
}
} // namespace paddle
\ No newline at end of file
...@@ -21,6 +21,7 @@ limitations under the License. */ ...@@ -21,6 +21,7 @@ limitations under the License. */
#pragma once #pragma once
#include <cassert>
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -32,12 +33,38 @@ enum PaddleDType { ...@@ -32,12 +33,38 @@ enum PaddleDType {
INT64, INT64,
}; };
struct PaddleBuf { class PaddleBuf {
void* data; // pointer to the data memory. public:
size_t length; // number of memory bytes. PaddleBuf() = default;
PaddleBuf(PaddleBuf&& other);
// Copy only available when memory is managed externally.
explicit PaddleBuf(const PaddleBuf&);
PaddleBuf& operator=(const PaddleBuf&);
// Do not own the memory.
PaddleBuf(void* data, size_t length)
: data_(data), length_(length), memory_owned_{false} {}
// Own memory.
PaddleBuf(size_t length)
: data_(new char[length]), length_(length), memory_owned_(true) {}
// Resize to `length` bytes.
void Resize(size_t length);
// Reset to external memory.
void Reset(void* data, size_t length);
bool empty() const { return length_ == 0; }
void* data() const { return data_; }
size_t length() const { return length_; }
~PaddleBuf() { Free(); }
private:
void Free();
void* data_{nullptr}; // pointer to the data memory.
size_t length_{0}; // number of memory bytes.
bool memory_owned_{true};
}; };
struct PaddleTensor { struct PaddleTensor {
PaddleTensor() = default;
std::string name; // variable name. std::string name; // variable name.
std::vector<int> shape; std::vector<int> shape;
// TODO(Superjomn) for LoD support, add a vector<vector<int>> field if needed. // TODO(Superjomn) for LoD support, add a vector<vector<int>> field if needed.
...@@ -67,8 +94,9 @@ class PaddlePredictor { ...@@ -67,8 +94,9 @@ class PaddlePredictor {
// Predict an record. // Predict an record.
// The caller should be responsible for allocating and releasing the memory of // The caller should be responsible for allocating and releasing the memory of
// `inputs`. `inputs` should be alive until Run returns. caller should be // `inputs`. `inputs` should be available until Run returns. Caller should be
// responsible for releasing the memory of `output_data`. // responsible for the output tensor's buffer, either allocated or passed from
// outside.
virtual bool Run(const std::vector<PaddleTensor>& inputs, virtual bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data) = 0; std::vector<PaddleTensor>* output_data) = 0;
......
...@@ -48,7 +48,7 @@ bool PaddleInferenceAnakinPredictor::Run( ...@@ -48,7 +48,7 @@ bool PaddleInferenceAnakinPredictor::Run(
auto d_tensor_in_p = executor_.get_in(input.name); auto d_tensor_in_p = executor_.get_in(input.name);
float *d_data_p = d_tensor_in_p->mutable_data(); float *d_data_p = d_tensor_in_p->mutable_data();
if (cudaMemcpy(d_data_p, if (cudaMemcpy(d_data_p,
static_cast<float *>(input.data.data), static_cast<float *>(input.data.data()),
d_tensor_in_p->valid_size() * sizeof(float), d_tensor_in_p->valid_size() * sizeof(float),
cudaMemcpyHostToDevice) != 0) { cudaMemcpyHostToDevice) != 0) {
LOG(ERROR) << "copy data from CPU to GPU error"; LOG(ERROR) << "copy data from CPU to GPU error";
...@@ -65,8 +65,11 @@ bool PaddleInferenceAnakinPredictor::Run( ...@@ -65,8 +65,11 @@ bool PaddleInferenceAnakinPredictor::Run(
for (auto &output : *output_data) { for (auto &output : *output_data) {
auto *tensor = executor_.get_out(output.name); auto *tensor = executor_.get_out(output.name);
output.shape = tensor->shape(); output.shape = tensor->shape();
if (output.data.length() < tensor->valid_size() * sizeof(float)) {
output.data.Resize(tensor->valid_size() * sizeof(float));
}
// Copy data from GPU -> CPU // Copy data from GPU -> CPU
if (cudaMemcpy(output.data.data, if (cudaMemcpy(output.data.data(),
tensor->mutable_data(), tensor->mutable_data(),
tensor->valid_size() * sizeof(float), tensor->valid_size() * sizeof(float),
cudaMemcpyDeviceToHost) != 0) { cudaMemcpyDeviceToHost) != 0) {
......
...@@ -37,28 +37,26 @@ TEST(inference, anakin) { ...@@ -37,28 +37,26 @@ TEST(inference, anakin) {
float data[1 * 3 * 224 * 224] = {1.0f}; float data[1 * 3 * 224 * 224] = {1.0f};
PaddleBuf buf{.data = data, .length = sizeof(data)};
PaddleTensor tensor{.name = "input_0", PaddleTensor tensor{.name = "input_0",
.shape = std::vector<int>({1, 3, 224, 224}), .shape = std::vector<int>({1, 3, 224, 224}),
.data = buf, .data = PaddleBuf(data, sizeof(data)),
.dtype = PaddleDType::FLOAT32}; .dtype = PaddleDType::FLOAT32};
// For simplicity, we set all the slots with the same data. // For simplicity, we set all the slots with the same data.
std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor); std::vector<PaddleTensor> paddle_tensor_feeds;
paddle_tensor_feeds.emplace_back(std::move(tensor));
float data_out[1000];
PaddleBuf buf_out{.data = data_out, .length = sizeof(data)};
PaddleTensor tensor_out{.name = "prob_out", PaddleTensor tensor_out{.name = "prob_out",
.shape = std::vector<int>({1000, 1}), .shape = std::vector<int>({1000, 1}),
.data = buf_out, .data = PaddleBuf(),
.dtype = PaddleDType::FLOAT32}; .dtype = PaddleDType::FLOAT32};
std::vector<PaddleTensor> outputs(1, tensor_out); std::vector<PaddleTensor> outputs;
outputs.emplace_back(std::move(tensor_out));
ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs)); ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
float* data_o = static_cast<float*>(outputs[0].data.data); float* data_o = static_cast<float*>(outputs[0].data.data());
for (size_t j = 0; j < 1000; ++j) { for (size_t j = 0; j < 1000; ++j) {
LOG(INFO) << "output[" << j << "]: " << data_o[j]; LOG(INFO) << "output[" << j << "]: " << data_o[j];
} }
......
...@@ -178,8 +178,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs, ...@@ -178,8 +178,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(static_cast<void *>(input_ptr), std::memcpy(static_cast<void *>(input_ptr),
inputs[i].data.data, inputs[i].data.data(),
inputs[i].data.length); inputs[i].data.length());
feeds->push_back(input); feeds->push_back(input);
} }
return true; return true;
...@@ -241,10 +241,11 @@ bool NativePaddlePredictor::GetFetch( ...@@ -241,10 +241,11 @@ bool NativePaddlePredictor::GetFetch(
} }
outputs->at(i).shape = shape; outputs->at(i).shape = shape;
outputs->at(i).data.length = sizeof(float) * data.size(); auto &buffer = outputs->at(i).data;
outputs->at(i).data.data = malloc(outputs->at(i).data.length); if (buffer.empty() || buffer.length() < sizeof(float) * data.size()) {
std::memcpy( buffer.Resize(sizeof(float) * data.size());
outputs->at(i).data.data, data.data(), outputs->at(i).data.length); }
std::memcpy(buffer.data(), data.data(), buffer.length());
outputs->at(i).dtype = PaddleDType::FLOAT32; outputs->at(i).dtype = PaddleDType::FLOAT32;
// TODO(panyx0718): support other types? fill tensor name? avoid a copy. // TODO(panyx0718): support other types? fill tensor name? avoid a copy.
} }
......
...@@ -27,13 +27,12 @@ namespace paddle { ...@@ -27,13 +27,12 @@ namespace paddle {
PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
PaddleTensor pt; PaddleTensor pt;
pt.data.data = t->data<void>();
if (t->type() == typeid(int64_t)) { if (t->type() == typeid(int64_t)) {
pt.data.length = t->numel() * sizeof(int64_t); pt.data.Reset(t->data<void>(), t->numel() * sizeof(int64_t));
pt.dtype = PaddleDType::INT64; pt.dtype = PaddleDType::INT64;
} else if (t->type() == typeid(float)) { } else if (t->type() == typeid(float)) {
pt.data.length = t->numel() * sizeof(float); pt.data.Reset(t->data<void>(), t->numel() * sizeof(float));
pt.dtype = PaddleDType::FLOAT32; pt.dtype = PaddleDType::FLOAT32;
} else { } else {
LOG(FATAL) << "unsupported type."; LOG(FATAL) << "unsupported type.";
...@@ -79,8 +78,8 @@ void MainWord2Vec(bool use_gpu) { ...@@ -79,8 +78,8 @@ void MainWord2Vec(bool use_gpu) {
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs)); ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
ASSERT_EQ(outputs.size(), 1UL); ASSERT_EQ(outputs.size(), 1UL);
size_t len = outputs[0].data.length; size_t len = outputs[0].data.length();
float* data = static_cast<float*>(outputs[0].data.data); float* data = static_cast<float*>(outputs[0].data.data());
for (size_t j = 0; j < len / sizeof(float); ++j) { for (size_t j = 0; j < len / sizeof(float); ++j) {
ASSERT_LT(data[j], 1.0); ASSERT_LT(data[j], 1.0);
ASSERT_GT(data[j], -1.0); ASSERT_GT(data[j], -1.0);
...@@ -103,8 +102,6 @@ void MainWord2Vec(bool use_gpu) { ...@@ -103,8 +102,6 @@ void MainWord2Vec(bool use_gpu) {
EXPECT_LT(lod_data[i] - data[i], 1e-3); EXPECT_LT(lod_data[i] - data[i], 1e-3);
EXPECT_GT(lod_data[i] - data[i], -1e-3); EXPECT_GT(lod_data[i] - data[i], -1e-3);
} }
free(outputs[0].data.data);
} }
void MainImageClassification(bool use_gpu) { void MainImageClassification(bool use_gpu) {
...@@ -143,13 +140,12 @@ void MainImageClassification(bool use_gpu) { ...@@ -143,13 +140,12 @@ void MainImageClassification(bool use_gpu) {
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs)); ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
ASSERT_EQ(outputs.size(), 1UL); ASSERT_EQ(outputs.size(), 1UL);
size_t len = outputs[0].data.length; size_t len = outputs[0].data.length();
float* data = static_cast<float*>(outputs[0].data.data); float* data = static_cast<float*>(outputs[0].data.data());
float* lod_data = output1.data<float>(); float* lod_data = output1.data<float>();
for (size_t j = 0; j < len / sizeof(float); ++j) { for (size_t j = 0; j < len / sizeof(float); ++j) {
EXPECT_NEAR(lod_data[j], data[j], 1e-3); EXPECT_NEAR(lod_data[j], data[j], 1e-3);
} }
free(data);
} }
void MainThreadsWord2Vec(bool use_gpu) { void MainThreadsWord2Vec(bool use_gpu) {
...@@ -192,8 +188,8 @@ void MainThreadsWord2Vec(bool use_gpu) { ...@@ -192,8 +188,8 @@ void MainThreadsWord2Vec(bool use_gpu) {
// check outputs range // check outputs range
ASSERT_EQ(local_outputs.size(), 1UL); ASSERT_EQ(local_outputs.size(), 1UL);
const size_t len = local_outputs[0].data.length; const size_t len = local_outputs[0].data.length();
float* data = static_cast<float*>(local_outputs[0].data.data); float* data = static_cast<float*>(local_outputs[0].data.data());
for (size_t j = 0; j < len / sizeof(float); ++j) { for (size_t j = 0; j < len / sizeof(float); ++j) {
ASSERT_LT(data[j], 1.0); ASSERT_LT(data[j], 1.0);
ASSERT_GT(data[j], -1.0); ASSERT_GT(data[j], -1.0);
...@@ -205,7 +201,6 @@ void MainThreadsWord2Vec(bool use_gpu) { ...@@ -205,7 +201,6 @@ void MainThreadsWord2Vec(bool use_gpu) {
for (int i = 0; i < refs[tid].numel(); ++i) { for (int i = 0; i < refs[tid].numel(); ++i) {
EXPECT_NEAR(ref_data[i], data[i], 1e-3); EXPECT_NEAR(ref_data[i], data[i], 1e-3);
} }
free(data);
}); });
} }
for (int i = 0; i < num_jobs; ++i) { for (int i = 0; i < num_jobs; ++i) {
...@@ -251,14 +246,13 @@ void MainThreadsImageClassification(bool use_gpu) { ...@@ -251,14 +246,13 @@ void MainThreadsImageClassification(bool use_gpu) {
// check outputs correctness // check outputs correctness
ASSERT_EQ(local_outputs.size(), 1UL); ASSERT_EQ(local_outputs.size(), 1UL);
const size_t len = local_outputs[0].data.length; const size_t len = local_outputs[0].data.length();
float* data = static_cast<float*>(local_outputs[0].data.data); float* data = static_cast<float*>(local_outputs[0].data.data());
float* ref_data = refs[tid].data<float>(); float* ref_data = refs[tid].data<float>();
EXPECT_EQ(refs[tid].numel(), len / sizeof(float)); EXPECT_EQ(refs[tid].numel(), len / sizeof(float));
for (int i = 0; i < refs[tid].numel(); ++i) { for (int i = 0; i < refs[tid].numel(); ++i) {
EXPECT_NEAR(ref_data[i], data[i], 1e-3); EXPECT_NEAR(ref_data[i], data[i], 1e-3);
} }
free(data);
}); });
} }
for (int i = 0; i < num_jobs; ++i) { for (int i = 0; i < num_jobs; ++i) {
......
...@@ -57,6 +57,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( ...@@ -57,6 +57,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
for (auto &p : params) { for (auto &p : params) {
grad_names_.insert(GradVarName(p)); grad_names_.insert(GradVarName(p));
} }
balance_vars_.resize(places_.size(), 0);
} }
void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
...@@ -140,11 +141,30 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp( ...@@ -140,11 +141,30 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp(
checker(op.InputArgumentNames(), recv_vars); checker(op.InputArgumentNames(), recv_vars);
} }
size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
const std::vector<std::string> &var_names) const {
int64_t numel_sum = 0;
for (auto var_name : var_names) {
auto var_desc = all_vars_.at(var_name);
PADDLE_ENFORCE_NOT_NULL(var_desc);
auto dim = framework::make_ddim(var_desc->GetShape());
int64_t numel = framework::product(dim);
PADDLE_ENFORCE_GT(numel, 0);
numel_sum += numel;
}
auto smallest =
std::min_element(std::begin(balance_vars_), std::end(balance_vars_));
size_t dev_id =
static_cast<size_t>(std::distance(std::begin(balance_vars_), smallest));
balance_vars_[dev_id] += numel_sum;
return dev_id;
}
std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
const ProgramDesc &program) const { const ProgramDesc &program) const {
std::unordered_map<std::string, VarDesc *> all_vars;
for (auto *var : program.Block(0).AllVars()) { for (auto *var : program.Block(0).AllVars()) {
all_vars[var->Name()] = var; all_vars_.emplace(var->Name(), var);
} }
auto graph = new SSAGraph(); auto graph = new SSAGraph();
...@@ -161,35 +181,16 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( ...@@ -161,35 +181,16 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
auto send_vars = FindDistTrainSendVars(program); auto send_vars = FindDistTrainSendVars(program);
auto recv_vars = FindDistTrainRecvVars(program); auto recv_vars = FindDistTrainRecvVars(program);
std::vector<std::unordered_set<std::string>> var_name_on_devices;
std::vector<std::unordered_set<std::string>> bcast_var_name_set; std::vector<std::unordered_set<std::string>> bcast_var_name_set;
var_name_on_devices.resize(places_.size());
bcast_var_name_set.resize(places_.size()); bcast_var_name_set.resize(places_.size());
size_t cur_device_id = 0; size_t cur_device_id = 0;
std::vector<int64_t> balance_grads(places_.size(), 0);
auto get_appropriate_dev = [&](std::string &g_name) -> size_t {
auto var_desc = all_vars.at(g_name);
PADDLE_ENFORCE_NOT_NULL(var_desc);
auto dim = framework::make_ddim(var_desc->GetShape());
int64_t numel = framework::product(dim);
PADDLE_ENFORCE_GE(numel, 0);
auto smallest =
std::min_element(std::begin(balance_grads), std::end(balance_grads));
size_t dev_id =
static_cast<size_t>(std::distance(std::begin(balance_grads), smallest));
balance_grads[dev_id] += numel;
return dev_id;
};
bool is_forwarding = true; bool is_forwarding = true;
for (auto *op : program.Block(0).AllOps()) { for (auto *op : program.Block(0).AllOps()) {
if (boost::get<int>( if (boost::get<int>(
op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
static_cast<int>(OpRole::kRPC)) { static_cast<int>(OpRole::kRPC)) {
// append rpc op if program is distributed trainer main program.
// always use the first device
CreateRPCOp(&result, *op); CreateRPCOp(&result, *op);
} else if (IsDistTrainOp(*op, send_vars, recv_vars)) { } else if (IsDistTrainOp(*op, send_vars, recv_vars)) {
CreateDistTrainOp(&result, *op); CreateDistTrainOp(&result, *op);
...@@ -201,13 +202,13 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( ...@@ -201,13 +202,13 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
} }
is_forwarding = false; is_forwarding = false;
} else { } else {
int op_dev_id = GetOpDeviceID(var_name_on_devices, *op); int op_dev_id = GetOpDeviceID(*op);
if (op_dev_id == -1) { // var on all device if (op_dev_id == -1) { // var on all device
CreateComputationalOps(&result, *op, places_.size()); CreateComputationalOps(&result, *op, places_.size());
} else { } else {
CreateComputationalOp(&result, *op, op_dev_id); CreateComputationalOp(&result, *op, op_dev_id);
for (auto &var_name : op->OutputArgumentNames()) { for (auto &var_name : op->OutputArgumentNames()) {
var_name_on_devices[op_dev_id].emplace(var_name); var_name_on_devices_.emplace(var_name, op_dev_id);
} }
} }
if (!is_forwarding && places_.size() > 1) { if (!is_forwarding && places_.size() > 1) {
...@@ -230,13 +231,13 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( ...@@ -230,13 +231,13 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
switch (strategy_.reduce_) { switch (strategy_.reduce_) {
case BuildStrategy::ReduceStrategy::kReduce: case BuildStrategy::ReduceStrategy::kReduce:
cur_device_id = get_appropriate_dev(g_name); cur_device_id = GetAppropriateDeviceID({g_name});
CreateReduceOp(&result, g_name, cur_device_id); CreateReduceOp(&result, g_name, cur_device_id);
var_name_on_devices[cur_device_id].emplace(g_name); var_name_on_devices_.emplace(g_name, cur_device_id);
bcast_var_name_set[cur_device_id].emplace(p_name); bcast_var_name_set[cur_device_id].emplace(p_name);
break; break;
case BuildStrategy::ReduceStrategy::kAllReduce: case BuildStrategy::ReduceStrategy::kAllReduce:
if (IsSparseGradient(all_vars, g_name)) { if (IsSparseGradient(g_name)) {
CreateReduceOp(&result, g_name, 0); CreateReduceOp(&result, g_name, 0);
CreateBroadcastOp(&result, g_name, 0); CreateBroadcastOp(&result, g_name, 0);
} else { } else {
...@@ -273,11 +274,9 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( ...@@ -273,11 +274,9 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
return std::unique_ptr<SSAGraph>(graph); return std::unique_ptr<SSAGraph>(graph);
} }
bool MultiDevSSAGraphBuilder::IsSparseGradient( bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const {
const std::unordered_map<std::string, VarDesc *> &all_vars, PADDLE_ENFORCE(all_vars_.count(og) != 0);
const std::string &og) const { if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) {
PADDLE_ENFORCE(all_vars.count(og) != 0);
if (all_vars.at(og)->GetType() == proto::VarType::SELECTED_ROWS) {
return true; return true;
} }
return false; return false;
...@@ -363,24 +362,23 @@ bool MultiDevSSAGraphBuilder::IsParameterGradientOnce( ...@@ -363,24 +362,23 @@ bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
return is_pg_once; return is_pg_once;
} }
int MultiDevSSAGraphBuilder::GetOpDeviceID( int MultiDevSSAGraphBuilder::GetOpDeviceID(const OpDesc &op) const {
const std::vector<std::unordered_set<std::string>> &var_name_on_devices,
const OpDesc &op) const {
if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
return -1; return -1;
} }
int var_dev_id = -1; for (auto &varname : op.InputArgumentNames()) {
for (auto &var_name : op.InputArgumentNames()) { int dev_id = GetVarDeviceID(varname);
if (var_dev_id != -1) break; if (dev_id != -1) {
for (size_t i = 0; i < var_name_on_devices.size(); ++i) { return dev_id;
if (var_name_on_devices[i].count(var_name)) {
var_dev_id = static_cast<int>(i);
break;
}
} }
} }
return var_dev_id; return -1;
}
int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const {
auto got = var_name_on_devices_.find(varname);
return got == var_name_on_devices_.end() ? -1 : got->second;
} }
void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const { void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
...@@ -463,7 +461,30 @@ void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op, ...@@ -463,7 +461,30 @@ void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op,
void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result, void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result,
const OpDesc &op) const { const OpDesc &op) const {
CreateComputationalOp(result, op, 0); int op_dev_id = -1;
if (op.Type() == "split_byref") {
op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames());
for (auto &varname : op.InputArgumentNames()) {
var_name_on_devices_.emplace(varname, op_dev_id);
}
}
for (auto &varname : op.OutputArgumentNames()) {
var_name_on_devices_.emplace(varname, op_dev_id);
}
} else if (op.Type() == "concat") {
op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
} else {
PADDLE_ENFORCE(
"the distribute training related op should be in [split_byref, "
"concat].");
}
PADDLE_ENFORCE(op_dev_id != -1,
"can not find right place for distributed op: %s", op.Type());
CreateComputationalOp(result, op, op_dev_id);
if (op.Type() == "concat") { if (op.Type() == "concat") {
ConnectOp(result, result->ops_.back().get(), "fetch_barrier"); ConnectOp(result, result->ops_.back().get(), "fetch_barrier");
} }
...@@ -471,8 +492,34 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result, ...@@ -471,8 +492,34 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result,
void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result,
const OpDesc &op) const { const OpDesc &op) const {
result->ops_.emplace_back( int op_dev_id = -1;
new RPCOpHandle(op, local_scopes_[0], op.Type(), places_[0])); if (op.Type() == "send") {
op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
// the variable name which contains .block means it was splited by
// split_byref op
// so that we can balance the variable blocks to all the pserver instances.
if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce &&
op.InputArgumentNames()[0].find(".block") == std::string::npos) {
op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames());
for (auto &varname : op.InputArgumentNames()) {
var_name_on_devices_.emplace(varname, op_dev_id);
}
}
} else if (op.Type() == "recv") {
op_dev_id = GetAppropriateDeviceID(op.OutputArgumentNames());
for (auto &varname : op.OutputArgumentNames()) {
var_name_on_devices_.emplace(varname, op_dev_id);
}
} else {
// send_barrier and fetch_barrier op can be scheduled on device 0
op_dev_id = 0;
}
PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s",
op.Type());
result->ops_.emplace_back(new RPCOpHandle(op, local_scopes_[op_dev_id],
op.Type(), places_[op_dev_id]));
if (op.Type() == "send_barrier") { if (op.Type() == "send_barrier") {
ConnectOp(result, result->ops_.back().get(), "send"); ConnectOp(result, result->ops_.back().get(), "send");
...@@ -488,9 +535,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, ...@@ -488,9 +535,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result,
"send, send_barrier. recv, fetch_barrier]"); "send, send_barrier. recv, fetch_barrier]");
} }
// TODO(Yancey1989): schedule rpc op on different place may CreateOpHandleIOs(result, op, op_dev_id);
// increate throughput
CreateOpHandleIOs(result, op, 0);
} }
bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const { bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {
......
...@@ -47,10 +47,11 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { ...@@ -47,10 +47,11 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
#endif #endif
std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override; std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
int GetVarDeviceID(const std::string &varname) const;
private: private:
void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op, void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op,
size_t place_id) const; size_t device_id) const;
private: private:
std::string loss_var_name_; std::string loss_var_name_;
...@@ -96,21 +97,23 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { ...@@ -96,21 +97,23 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
const std::string &og, const std::string &og,
std::unordered_set<std::string> *og_has_been_broadcast) const; std::unordered_set<std::string> *og_has_been_broadcast) const;
int GetOpDeviceID( int GetOpDeviceID(const OpDesc &op) const;
const std::vector<std::unordered_set<std::string>> &var_name_on_devices,
const OpDesc &op) const;
void InsertAllReduceOp(SSAGraph *result, const std::string &og) const; void InsertAllReduceOp(SSAGraph *result, const std::string &og) const;
void CreateBroadcastOp(SSAGraph *result, const std::string &p_name, void CreateBroadcastOp(SSAGraph *result, const std::string &p_name,
size_t src_dev_id) const; size_t src_dev_id) const;
bool IsSparseGradient( bool IsSparseGradient(const std::string &og) const;
const std::unordered_map<std::string, VarDesc *> &all_vars,
const std::string &og) const; size_t GetAppropriateDeviceID(
const std::vector<std::string> &var_names) const;
private: private:
BuildStrategy strategy_; BuildStrategy strategy_;
mutable std::unordered_map<std::string, VarDesc *> all_vars_;
mutable std::unordered_map<std::string, int> var_name_on_devices_;
mutable std::vector<int64_t> balance_vars_;
void SetCommunicationContext(OpHandleBase *op_handle, void SetCommunicationContext(OpHandleBase *op_handle,
const platform::Place &p) const; const platform::Place &p) const;
......
...@@ -30,6 +30,7 @@ class SSAGraphBuilder { ...@@ -30,6 +30,7 @@ class SSAGraphBuilder {
SSAGraphBuilder() {} SSAGraphBuilder() {}
virtual ~SSAGraphBuilder() {} virtual ~SSAGraphBuilder() {}
virtual std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const = 0; virtual std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const = 0;
virtual int GetVarDeviceID(const std::string &var_name) const { return -1; }
DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder);
......
...@@ -21,7 +21,7 @@ limitations under the License. */ ...@@ -21,7 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/reader.h"
#ifdef PADDLE_WITH_DISTRIBUTE #ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/detail/grpc_client.h" #include "paddle/fluid/operators/distributed/grpc_client.h"
#endif #endif
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -49,8 +49,8 @@ Executor::Executor(const platform::Place& place) : place_(place) {} ...@@ -49,8 +49,8 @@ Executor::Executor(const platform::Place& place) : place_(place) {}
#ifdef PADDLE_WITH_DISTRIBUTE #ifdef PADDLE_WITH_DISTRIBUTE
void Executor::Complete() { void Executor::Complete() {
::paddle::operators::detail::RPCClient::GetInstance< ::paddle::operators::distributed::RPCClient::GetInstance<
::paddle::operators::detail::GRPCClient>() ::paddle::operators::distributed::GRPCClient>()
->SendComplete(); ->SendComplete();
} }
#endif #endif
...@@ -321,7 +321,8 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare( ...@@ -321,7 +321,8 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
} }
void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope, bool create_vars) { bool create_local_scope, bool create_vars,
bool keep_kids) {
Scope* local_scope = scope; Scope* local_scope = scope;
if (create_vars) { if (create_vars) {
if (create_local_scope) { if (create_local_scope) {
...@@ -344,12 +345,20 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, ...@@ -344,12 +345,20 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
} }
} }
platform::DeviceContextPool::Instance().Get(place_)->Wait(); platform::DeviceContextPool::Instance().Get(place_)->Wait();
if (create_vars && create_local_scope) { if (local_scope != scope) {
scope->DeleteScope(local_scope); scope->DeleteScope(local_scope);
} else { } else {
// Delete the local scopes created in operators. if (!keep_kids) {
// By default, we should delete all kid scopes after run executor because
// some operators may create local scope when running, such as while_op.
// But when while_op also create a local executor to run it's sub block,
// the sub scopes it created should not be dropped immediately, because
// while_grad_op will use some variables created during while_op run, so
// we need to keep the kids and wait for the outer executor to drop them.
scope->DropKids(); scope->DropKids();
} }
}
if (FLAGS_benchmark) { if (FLAGS_benchmark) {
VLOG(2) << "-------------------------------------------------------"; VLOG(2) << "-------------------------------------------------------";
VLOG(2) << "Memory used after deleting local scope: " VLOG(2) << "Memory used after deleting local scope: "
......
...@@ -78,7 +78,7 @@ class Executor { ...@@ -78,7 +78,7 @@ class Executor {
void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope = true, bool create_local_scope = true,
bool create_vars = true); bool create_vars = true, bool keep_kids = false);
void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
std::map<std::string, const LoDTensor*>* feed_targets, std::map<std::string, const LoDTensor*>* feed_targets,
......
...@@ -110,7 +110,6 @@ ParallelExecutor::ParallelExecutor( ...@@ -110,7 +110,6 @@ ParallelExecutor::ParallelExecutor(
// Step 3. Convert main_program to SSA form and dependency graph. Also, insert // Step 3. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp // ncclOp
details::SSAGraphBuilderFactory builder_factory( details::SSAGraphBuilderFactory builder_factory(
member_->places_, loss_var_name, params, member_->local_scopes_, member_->places_, loss_var_name, params, member_->local_scopes_,
build_strategy); build_strategy);
...@@ -122,9 +121,10 @@ ParallelExecutor::ParallelExecutor( ...@@ -122,9 +121,10 @@ ParallelExecutor::ParallelExecutor(
#endif #endif
} }
builder_ = std::move(builder_factory.Create());
member_->executor_.reset(new details::ThreadedSSAGraphExecutor( member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, places, exec_strategy, member_->local_scopes_, places,
builder_factory.Create()->Build(main_program))); builder_->Build(main_program)));
member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, std::move(var_infos), exec_strategy, member_->local_scopes_, std::move(var_infos),
...@@ -133,10 +133,22 @@ ParallelExecutor::ParallelExecutor( ...@@ -133,10 +133,22 @@ ParallelExecutor::ParallelExecutor(
void ParallelExecutor::BCastParamsToGPUs( void ParallelExecutor::BCastParamsToGPUs(
const std::unordered_set<std::string> &vars) const { const std::unordered_set<std::string> &vars) const {
auto *main_scope = member_->local_scopes_[0]; // the the initialize bcast, all vars would be bcast from device(0), otherwise
// bcast from the specified device.
bool initialize = builder_.get() == nullptr ? true : false;
for (auto &var : vars) { for (auto &var : vars) {
auto *main_var = main_scope->FindVar(var); int var_dev_id =
builder_.get() == nullptr ? -1 : builder_->GetVarDeviceID(var);
if (!initialize && var_dev_id == -1) continue;
framework::Variable *main_var = nullptr;
if (initialize) {
main_var = member_->local_scopes_[0]->FindVar(var);
} else {
main_var = member_->local_scopes_[var_dev_id]->FindVar(var);
}
if (main_var == nullptr || !main_var->IsType<LoDTensor>()) { if (main_var == nullptr || !main_var->IsType<LoDTensor>()) {
continue; continue;
} }
...@@ -151,7 +163,8 @@ void ParallelExecutor::BCastParamsToGPUs( ...@@ -151,7 +163,8 @@ void ParallelExecutor::BCastParamsToGPUs(
for (size_t i = 0; i < member_->places_.size(); ++i) { for (size_t i = 0; i < member_->places_.size(); ++i) {
auto place = member_->places_[i]; auto place = member_->places_[i];
void *buffer; void *buffer;
if (i == 0) {
if ((initialize && i == 0) || (!initialize && i == var_dev_id)) {
buffer = const_cast<void *>(main_tensor.data<void>()); buffer = const_cast<void *>(main_tensor.data<void>());
} else { } else {
auto local_scope = member_->local_scopes_[i]; auto local_scope = member_->local_scopes_[i];
......
...@@ -19,12 +19,14 @@ limitations under the License. */ ...@@ -19,12 +19,14 @@ limitations under the License. */
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/execution_strategy.h"
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -68,6 +70,7 @@ class ParallelExecutor { ...@@ -68,6 +70,7 @@ class ParallelExecutor {
private: private:
ParallelExecutorPrivate *member_; ParallelExecutorPrivate *member_;
std::unique_ptr<details::SSAGraphBuilder> builder_;
}; };
} // namespace framework } // namespace framework
......
...@@ -27,7 +27,7 @@ void TensorRTSubGraphPass::Run(DataFlowGraph *graph) { ...@@ -27,7 +27,7 @@ void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
SubGraphFuse(graph, node_inside_subgraph_teller_); SubGraphFuse(graph, node_inside_subgraph_teller_);
} }
} // analysis } // namespace analysis
} // inference } // namespace inference
} // paddle } // namespace paddle
...@@ -184,8 +184,8 @@ else() ...@@ -184,8 +184,8 @@ else()
set(DEPS_OPS ${DEPS_OPS} nccl_op) set(DEPS_OPS ${DEPS_OPS} nccl_op)
endif() endif()
add_subdirectory(detail)
if(WITH_DISTRIBUTE) if(WITH_DISTRIBUTE)
add_subdirectory(distributed)
set(DISTRIBUTE_DEPS "") set(DISTRIBUTE_DEPS "")
if(WITH_GRPC) if(WITH_GRPC)
...@@ -195,18 +195,11 @@ if(WITH_DISTRIBUTE) ...@@ -195,18 +195,11 @@ if(WITH_DISTRIBUTE)
endif() endif()
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
op_library(prefetch_op DEPS ${DISTRIBUTE_DEPS}) foreach(dist_op "prefetch_op" "listen_and_serv_op" "send_op" "recv_op" "send_barrier_op" "fetch_barrier_op")
set_source_files_properties(prefetch_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(${dist_op} DEPS ${DISTRIBUTE_DEPS})
op_library(recv_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(${dist_op}.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) endforeach()
op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(send_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS})
op_library(fetch_barrier_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(fetch_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
#set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) #set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
#cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op #cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op
# listen_and_serv_op sum_op executor SERIAL) # listen_and_serv_op sum_op executor SERIAL)
......
...@@ -21,8 +21,6 @@ namespace operators { ...@@ -21,8 +21,6 @@ namespace operators {
using batch_norm_bwd = mkldnn::batch_normalization_backward; using batch_norm_bwd = mkldnn::batch_normalization_backward;
using batch_norm_fwd = mkldnn::batch_normalization_forward; using batch_norm_fwd = mkldnn::batch_normalization_forward;
using framework::DataLayout;
using framework::Tensor;
using mkldnn::memory; using mkldnn::memory;
using mkldnn::primitive; using mkldnn::primitive;
using mkldnn::reorder; using mkldnn::reorder;
...@@ -31,18 +29,6 @@ using paddle::platform::MKLDNNDeviceContext; ...@@ -31,18 +29,6 @@ using paddle::platform::MKLDNNDeviceContext;
using paddle::platform::MKLDNNMemDesc; using paddle::platform::MKLDNNMemDesc;
using platform::to_void_cast; using platform::to_void_cast;
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
namespace { namespace {
template <typename T> template <typename T>
struct bn_type_traits { struct bn_type_traits {
......
...@@ -22,22 +22,6 @@ limitations under the License. */ ...@@ -22,22 +22,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using DataLayout = framework::DataLayout;
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
class BatchNormOp : public framework::OperatorWithKernel { class BatchNormOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
......
...@@ -19,6 +19,22 @@ limitations under the License. */ ...@@ -19,6 +19,22 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using DataLayout = framework::DataLayout;
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class BatchNormKernel : public framework::OpKernel<T> { class BatchNormKernel : public framework::OpKernel<T> {
public: public:
......
...@@ -110,6 +110,7 @@ REGISTER_OPERATOR(bilinear_interp, ops::BilinearInterpOp, ...@@ -110,6 +110,7 @@ REGISTER_OPERATOR(bilinear_interp, ops::BilinearInterpOp,
ops::BilinearInterpOpMaker, ops::BilinearInterpOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(bilinear_interp_grad, ops::BilinearInterpOpGrad); REGISTER_OPERATOR(bilinear_interp_grad, ops::BilinearInterpOpGrad);
REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::BilinearInterpKernel<float>); REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::BilinearInterpKernel<float>,
ops::BilinearInterpKernel<uint8_t>);
REGISTER_OP_CPU_KERNEL(bilinear_interp_grad, REGISTER_OP_CPU_KERNEL(bilinear_interp_grad,
ops::BilinearInterpGradKernel<float>); ops::BilinearInterpGradKernel<float>);
...@@ -46,8 +46,10 @@ class BilinearInterpKernel : public framework::OpKernel<T> { ...@@ -46,8 +46,10 @@ class BilinearInterpKernel : public framework::OpKernel<T> {
int in_chw = channels * in_hw; int in_chw = channels * in_hw;
int out_chw = channels * out_hw; int out_chw = channels * out_hw;
T ratio_h = (out_h > 1) ? static_cast<T>(in_h - 1) / (out_h - 1) : 0.f; float ratio_h =
T ratio_w = (out_w > 1) ? static_cast<T>(in_w - 1) / (out_w - 1) : 0.f; (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
float ratio_w =
(out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
if (in_h == out_h && in_w == out_w) { if (in_h == out_h && in_w == out_w) {
memcpy(output, input, input_t->numel() * sizeof(T)); memcpy(output, input, input_t->numel() * sizeof(T));
...@@ -56,24 +58,24 @@ class BilinearInterpKernel : public framework::OpKernel<T> { ...@@ -56,24 +58,24 @@ class BilinearInterpKernel : public framework::OpKernel<T> {
for (int i = 0; i < out_h; ++i) { // loop for images for (int i = 0; i < out_h; ++i) { // loop for images
int h = ratio_h * i; int h = ratio_h * i;
int hid = (h < in_h - 1) ? 1 : 0; int hid = (h < in_h - 1) ? 1 : 0;
T h1lambda = ratio_h * i - h; float h1lambda = ratio_h * i - h;
T h2lambda = 1 - h1lambda; float h2lambda = 1.f - h1lambda;
for (int j = 0; j < out_w; ++j) { for (int j = 0; j < out_w; ++j) {
int w = ratio_w * j; int w = ratio_w * j;
int wid = (w < in_w - 1) ? 1 : 0; int wid = (w < in_w - 1) ? 1 : 0;
T w1lambda = ratio_w * j - w; float w1lambda = ratio_w * j - w;
T w2lambda = 1 - w1lambda; float w2lambda = 1.f - w1lambda;
// calculate four position for bilinear interpolation // calculate four position for bilinear interpolation
const T* in_pos = &input[k * in_chw + h * in_w + w]; const T* in_pos = &input[k * in_chw + h * in_w + w];
T* out_pos = &output[k * out_chw + i * out_w + j]; T* out_pos = &output[k * out_chw + i * out_w + j];
for (int c = 0; c < channels; ++c) { // loop for channels for (int c = 0; c < channels; ++c) { // loop for channels
// bilinear interpolation // bilinear interpolation
out_pos[0] = out_pos[0] = static_cast<T>(
h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[wid]) + h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[wid]) +
h1lambda * (w2lambda * in_pos[hid * in_w] + h1lambda * (w2lambda * in_pos[hid * in_w] +
w1lambda * in_pos[hid * in_w + wid]); w1lambda * in_pos[hid * in_w + wid]));
in_pos += in_hw; in_pos += in_hw;
out_pos += out_hw; out_pos += out_hw;
} }
...@@ -117,8 +119,10 @@ class BilinearInterpGradKernel : public framework::OpKernel<T> { ...@@ -117,8 +119,10 @@ class BilinearInterpGradKernel : public framework::OpKernel<T> {
int in_chw = channels * in_hw; int in_chw = channels * in_hw;
int out_chw = channels * out_hw; int out_chw = channels * out_hw;
T ratio_h = (out_h > 1) ? static_cast<T>(in_h - 1) / (out_h - 1) : 0.f; float ratio_h =
T ratio_w = (out_w > 1) ? static_cast<T>(in_w - 1) / (out_w - 1) : 0.f; (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
float ratio_w =
(out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
if (in_h == out_h && in_w == out_w) { if (in_h == out_h && in_w == out_w) {
memcpy(d_input, d_output, d_input_t->numel() * sizeof(T)); memcpy(d_input, d_output, d_input_t->numel() * sizeof(T));
...@@ -127,22 +131,24 @@ class BilinearInterpGradKernel : public framework::OpKernel<T> { ...@@ -127,22 +131,24 @@ class BilinearInterpGradKernel : public framework::OpKernel<T> {
for (int i = 0; i < out_h; ++i) { // loop for images for (int i = 0; i < out_h; ++i) { // loop for images
int h = ratio_h * i; int h = ratio_h * i;
int hid = (h < in_h - 1) ? 1 : 0; int hid = (h < in_h - 1) ? 1 : 0;
T h1lambda = ratio_h * i - h; float h1lambda = ratio_h * i - h;
T h2lambda = 1 - h1lambda; float h2lambda = 1 - h1lambda;
for (int j = 0; j < out_w; ++j) { for (int j = 0; j < out_w; ++j) {
int w = ratio_w * j; int w = ratio_w * j;
int wid = (w < in_w - 1) ? 1 : 0; int wid = (w < in_w - 1) ? 1 : 0;
T w1lambda = ratio_w * j - w; float w1lambda = ratio_w * j - w;
T w2lambda = 1 - w1lambda; float w2lambda = 1 - w1lambda;
T* in_pos = &d_input[k * in_chw + h * in_w + w]; T* in_pos = &d_input[k * in_chw + h * in_w + w];
const T* out_pos = &d_output[k * out_chw + i * out_w + j]; const T* out_pos = &d_output[k * out_chw + i * out_w + j];
for (int c = 0; c < channels; ++c) { // loop for channels for (int c = 0; c < channels; ++c) { // loop for channels
in_pos[0] += h2lambda * w2lambda * out_pos[0]; in_pos[0] += static_cast<T>(h2lambda * w2lambda * out_pos[0]);
in_pos[wid] += h2lambda * w1lambda * out_pos[0]; in_pos[wid] += static_cast<T>(h2lambda * w1lambda * out_pos[0]);
in_pos[hid * in_w] += h1lambda * w2lambda * out_pos[0]; in_pos[hid * in_w] +=
in_pos[hid * in_w + wid] += h1lambda * w1lambda * out_pos[0]; static_cast<T>(h1lambda * w2lambda * out_pos[0]);
in_pos[hid * in_w + wid] +=
static_cast<T>(h1lambda * w1lambda * out_pos[0]);
in_pos += in_hw; in_pos += in_hw;
out_pos += out_hw; out_pos += out_hw;
} }
......
...@@ -15,13 +15,13 @@ ...@@ -15,13 +15,13 @@
#pragma once #pragma once
#ifdef PADDLE_WITH_GRPC #ifdef PADDLE_WITH_GRPC
#include "paddle/fluid/operators/detail/grpc_client.h" #include "paddle/fluid/operators/distributed/grpc_client.h"
#include "paddle/fluid/operators/detail/grpc_server.h" #include "paddle/fluid/operators/distributed/grpc_server.h"
#define RPCSERVER_T detail::AsyncGRPCServer #define RPCSERVER_T distributed::AsyncGRPCServer
#define RPCCLIENT_T detail::GRPCClient #define RPCCLIENT_T distributed::GRPCClient
#else #else
#include "paddle/fluid/operators/detail/brpc_client.h" #include "paddle/fluid/operators/distributed/brpc_client.h"
#include "paddle/fluid/operators/detail/brpc_server.h" #include "paddle/fluid/operators/distributed/brpc_server.h"
#define RPCSERVER_T detail::AsyncBRPCServer #define RPCSERVER_T distributed::AsyncBRPCServer
#define RPCCLIENT_T detail::BRPCClient #define RPCCLIENT_T distributed::BRPCClient
#endif #endif
if(NOT WITH_DISTRIBUTE)
return()
endif()
if(WITH_GRPC) if(WITH_GRPC)
grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor
......
...@@ -12,12 +12,12 @@ ...@@ -12,12 +12,12 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/detail/brpc_client.h" #include "paddle/fluid/operators/distributed/brpc_client.h"
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
DEFINE_int32(brpc_channel_num, 24, DEFINE_int32(brpc_channel_num, 24,
"Number of channels to send requests connected to one server"); "Number of channels to send requests connected to one server");
...@@ -175,6 +175,6 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { ...@@ -175,6 +175,6 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) {
return q; return q;
} }
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -31,13 +31,13 @@ limitations under the License. */ ...@@ -31,13 +31,13 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/detail/rpc_client.h" #include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
struct ChannelContext { struct ChannelContext {
brpc::Channel channel; brpc::Channel channel;
...@@ -95,6 +95,6 @@ class BRPCClient : public RPCClient { ...@@ -95,6 +95,6 @@ class BRPCClient : public RPCClient {
DISABLE_COPY_AND_ASSIGN(BRPCClient); DISABLE_COPY_AND_ASSIGN(BRPCClient);
}; };
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -12,13 +12,13 @@ ...@@ -12,13 +12,13 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/detail/brpc_server.h" #include "paddle/fluid/operators/distributed/brpc_server.h"
#include "paddle/fluid/operators/detail/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
namespace sendrecv { namespace sendrecv {
typedef std::unordered_map<std::string, typedef std::unordered_map<std::string,
paddle::operators::detail::RequestHandler*> paddle::operators::distributed::RequestHandler*>
HandlerMap; HandlerMap;
class BRPCServiceImpl : public SendRecvService { class BRPCServiceImpl : public SendRecvService {
...@@ -27,17 +27,17 @@ class BRPCServiceImpl : public SendRecvService { ...@@ -27,17 +27,17 @@ class BRPCServiceImpl : public SendRecvService {
: request_send_h_(nullptr), : request_send_h_(nullptr),
request_get_h_(nullptr), request_get_h_(nullptr),
request_prefetch_h_(nullptr) { request_prefetch_h_(nullptr) {
auto it = rpc_call_map.find(paddle::operators::detail::kRequestSend); auto it = rpc_call_map.find(paddle::operators::distributed::kRequestSend);
if (it != rpc_call_map.end()) { if (it != rpc_call_map.end()) {
request_send_h_ = it->second; request_send_h_ = it->second;
} }
it = rpc_call_map.find(paddle::operators::detail::kRequestSend); it = rpc_call_map.find(paddle::operators::distributed::kRequestSend);
if (it != rpc_call_map.end()) { if (it != rpc_call_map.end()) {
request_get_h_ = it->second; request_get_h_ = it->second;
} }
it = rpc_call_map.find(paddle::operators::detail::kRequestPrefetch); it = rpc_call_map.find(paddle::operators::distributed::kRequestPrefetch);
if (it != rpc_call_map.end()) { if (it != rpc_call_map.end()) {
request_prefetch_h_ = it->second; request_prefetch_h_ = it->second;
} }
...@@ -88,15 +88,15 @@ class BRPCServiceImpl : public SendRecvService { ...@@ -88,15 +88,15 @@ class BRPCServiceImpl : public SendRecvService {
} }
private: private:
paddle::operators::detail::RequestHandler* request_send_h_; paddle::operators::distributed::RequestHandler* request_send_h_;
paddle::operators::detail::RequestHandler* request_get_h_; paddle::operators::distributed::RequestHandler* request_get_h_;
paddle::operators::detail::RequestHandler* request_prefetch_h_; paddle::operators::distributed::RequestHandler* request_prefetch_h_;
}; };
} // namespace sendrecv } // namespace sendrecv
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
void AsyncBRPCServer::StartServer() { void AsyncBRPCServer::StartServer() {
// Instance of your service. // Instance of your service.
...@@ -139,6 +139,6 @@ void AsyncBRPCServer::WaitServerReady() { ...@@ -139,6 +139,6 @@ void AsyncBRPCServer::WaitServerReady() {
VLOG(3) << "AsyncGRPCServer WaitSeverReady"; VLOG(3) << "AsyncGRPCServer WaitSeverReady";
} }
}; // namespace detail }; // namespace distributed
}; // namespace operators }; // namespace operators
}; // namespace paddle }; // namespace paddle
...@@ -19,12 +19,12 @@ limitations under the License. */ ...@@ -19,12 +19,12 @@ limitations under the License. */
#include <string> #include <string>
#include "brpc/server.h" #include "brpc/server.h"
#include "paddle/fluid/operators/detail/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
class AsyncBRPCServer final : public RPCServer { class AsyncBRPCServer final : public RPCServer {
public: public:
...@@ -48,6 +48,6 @@ class AsyncBRPCServer final : public RPCServer { ...@@ -48,6 +48,6 @@ class AsyncBRPCServer final : public RPCServer {
int ready_; int ready_;
}; };
}; // namespace detail }; // namespace distributed
}; // namespace operators }; // namespace operators
}; // namespace paddle }; // namespace paddle
...@@ -17,11 +17,11 @@ limitations under the License. */ ...@@ -17,11 +17,11 @@ limitations under the License. */
// file and did some modifications so that we can send gRPC // file and did some modifications so that we can send gRPC
// requests without too much copying of the tensor data. // requests without too much copying of the tensor data.
#include "paddle/fluid/operators/detail/bytebuffer_stream.h" #include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
GrpcByteBufferSource::GrpcByteBufferSource() {} GrpcByteBufferSource::GrpcByteBufferSource() {}
...@@ -83,6 +83,6 @@ google::protobuf::int64 GrpcByteBufferSource::ByteCount() const { ...@@ -83,6 +83,6 @@ google::protobuf::int64 GrpcByteBufferSource::ByteCount() const {
return byte_count_; return byte_count_;
} }
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -106,7 +106,7 @@ class GrpcBufferReader final ...@@ -106,7 +106,7 @@ class GrpcBufferReader final
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
// Source provides a way for a particular RPC implementation to provide // Source provides a way for a particular RPC implementation to provide
// received data to ParseFrom. // received data to ParseFrom.
class Source { class Source {
...@@ -183,6 +183,6 @@ class GrpcByteSource : public Source { ...@@ -183,6 +183,6 @@ class GrpcByteSource : public Source {
char space_[sizeof(Reader)]; char space_[sizeof(Reader)];
}; };
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -12,19 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,19 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/detail/grpc_client.h" #include "paddle/fluid/operators/distributed/grpc_client.h"
#include <sys/time.h> #include <sys/time.h>
#include <limits> #include <limits>
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
void GRPCClient::InitImpl() { InitEventLoop(); } void GRPCClient::InitImpl() { InitEventLoop(); }
...@@ -276,6 +276,6 @@ std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) { ...@@ -276,6 +276,6 @@ std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
return ch; return ch;
} }
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -38,13 +38,13 @@ limitations under the License. */ ...@@ -38,13 +38,13 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/detail/rpc_client.h" #include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
struct VarHandle { struct VarHandle {
std::string ep; std::string ep;
...@@ -226,6 +226,6 @@ class GRPCClient : public RPCClient { ...@@ -226,6 +226,6 @@ class GRPCClient : public RPCClient {
DISABLE_COPY_AND_ASSIGN(GRPCClient); DISABLE_COPY_AND_ASSIGN(GRPCClient);
}; };
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -21,8 +21,8 @@ limitations under the License. */ ...@@ -21,8 +21,8 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/operators/detail/variable_response.h" #include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
...@@ -50,7 +50,7 @@ void RunSerdeTestSelectedRows(platform::Place place) { ...@@ -50,7 +50,7 @@ void RunSerdeTestSelectedRows(platform::Place place) {
for (int i = 0; i < 564; ++i) rows->push_back(i); for (int i = 0; i < 564; ++i) rows->push_back(i);
::grpc::ByteBuffer msg; ::grpc::ByteBuffer msg;
operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg); operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg);
EXPECT_GT(msg.Length(), static_cast<size_t>(0)); EXPECT_GT(msg.Length(), static_cast<size_t>(0));
// deserialize // deserialize
...@@ -81,10 +81,10 @@ void RunSerdeTestSelectedRows(platform::Place place) { ...@@ -81,10 +81,10 @@ void RunSerdeTestSelectedRows(platform::Place place) {
// deserialize zero-copy // deserialize zero-copy
// framework::Variable var2; // framework::Variable var2;
// operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2); // operators::distributed::DeserializeFromByteBuffer(msg, ctx, &var2);
framework::Scope scope; framework::Scope scope;
scope.Var("myvar"); scope.Var("myvar");
operators::detail::VariableResponse resp(&scope, &ctx); operators::distributed::VariableResponse resp(&scope, &ctx);
EXPECT_EQ(resp.Parse(msg), 0); EXPECT_EQ(resp.Parse(msg), 0);
framework::Variable* var2 = resp.GetVar(); framework::Variable* var2 = resp.GetVar();
...@@ -128,7 +128,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) { ...@@ -128,7 +128,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
math::set_constant(ctx, tensor, 31.9); math::set_constant(ctx, tensor, 31.9);
::grpc::ByteBuffer msg; ::grpc::ByteBuffer msg;
operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg); operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg);
EXPECT_GT(msg.Length(), static_cast<size_t>(0)); EXPECT_GT(msg.Length(), static_cast<size_t>(0));
// deserialize // deserialize
...@@ -171,7 +171,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) { ...@@ -171,7 +171,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
// deserialize zero-copy // deserialize zero-copy
framework::Scope scope; framework::Scope scope;
scope.Var("myvar"); scope.Var("myvar");
operators::detail::VariableResponse resp(&scope, &ctx); operators::distributed::VariableResponse resp(&scope, &ctx);
if (from_type == 0) { if (from_type == 0) {
EXPECT_EQ(resp.Parse(msg), 0); EXPECT_EQ(resp.Parse(msg), 0);
} else { } else {
......
...@@ -15,13 +15,13 @@ limitations under the License. */ ...@@ -15,13 +15,13 @@ limitations under the License. */
#include <limits> #include <limits>
#include <string> #include <string>
#include "paddle/fluid/operators/detail/grpc_server.h" #include "paddle/fluid/operators/distributed/grpc_server.h"
using ::grpc::ServerAsyncResponseWriter; using ::grpc::ServerAsyncResponseWriter;
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
enum CallStatus { PROCESS = 0, FINISH }; enum CallStatus { PROCESS = 0, FINISH };
// reference: // reference:
...@@ -74,7 +74,7 @@ class RequestSend final : public RequestBase { ...@@ -74,7 +74,7 @@ class RequestSend final : public RequestBase {
request_.reset(new VariableResponse(request_handler->scope(), request_.reset(new VariableResponse(request_handler->scope(),
request_handler->dev_ctx(), request_handler->dev_ctx(),
!request_handler->sync_mode())); !request_handler->sync_mode()));
int method_id = static_cast<int>(detail::GrpcMethod::kSendVariable); int method_id = static_cast<int>(distributed::GrpcMethod::kSendVariable);
service_->RequestAsyncUnary( service_->RequestAsyncUnary(
method_id, &ctx_, request_.get(), &responder_, cq_, cq_, method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id))); reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
...@@ -106,7 +106,7 @@ class RequestGet final : public RequestBase { ...@@ -106,7 +106,7 @@ class RequestGet final : public RequestBase {
::grpc::ServerCompletionQueue* cq, ::grpc::ServerCompletionQueue* cq,
RequestHandler* request_handler, int req_id) RequestHandler* request_handler, int req_id)
: RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
auto method_id = static_cast<int>(detail::GrpcMethod::kGetVariable); auto method_id = static_cast<int>(distributed::GrpcMethod::kGetVariable);
service_->RequestAsyncUnary( service_->RequestAsyncUnary(
method_id, &ctx_, &request_, &responder_, cq_, cq_, method_id, &ctx_, &request_, &responder_, cq_, cq_,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id))); reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
...@@ -150,7 +150,8 @@ class RequestPrefetch final : public RequestBase { ...@@ -150,7 +150,8 @@ class RequestPrefetch final : public RequestBase {
local_scope_(nullptr) { local_scope_(nullptr) {
request_.reset(new VariableResponse(request_handler->scope(), request_.reset(new VariableResponse(request_handler->scope(),
request_handler->dev_ctx(), true)); request_handler->dev_ctx(), true));
int method_id = static_cast<int>(detail::GrpcMethod::kPrefetchVariable); int method_id =
static_cast<int>(distributed::GrpcMethod::kPrefetchVariable);
service_->RequestAsyncUnary( service_->RequestAsyncUnary(
method_id, &ctx_, request_.get(), &responder_, cq_, cq_, method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id))); reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
...@@ -354,6 +355,6 @@ void AsyncGRPCServer::HandleRequest( ...@@ -354,6 +355,6 @@ void AsyncGRPCServer::HandleRequest(
} }
} }
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -29,17 +29,17 @@ limitations under the License. */ ...@@ -29,17 +29,17 @@ limitations under the License. */
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/detail/grpc_service.h" #include "paddle/fluid/operators/distributed/grpc_service.h"
#include "paddle/fluid/operators/detail/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/operators/detail/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" #include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
class RequestBase; class RequestBase;
...@@ -84,6 +84,6 @@ class AsyncGRPCServer final : public RPCServer { ...@@ -84,6 +84,6 @@ class AsyncGRPCServer final : public RPCServer {
std::map<std::string, std::vector<RequestBase*>> rpc_reqs_; std::map<std::string, std::vector<RequestBase*>> rpc_reqs_;
}; };
}; // namespace detail }; // namespace distributed
}; // namespace operators }; // namespace operators
}; // namespace paddle }; // namespace paddle
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
#include <grpc++/impl/codegen/stub_options.h> #include <grpc++/impl/codegen/stub_options.h>
#include <grpc++/impl/codegen/sync_stream.h> #include <grpc++/impl/codegen/sync_stream.h>
#include <grpc++/support/byte_buffer.h> #include <grpc++/support/byte_buffer.h>
#include "paddle/fluid/operators/detail/variable_response.h" #include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -42,16 +42,17 @@ class ServerContext; ...@@ -42,16 +42,17 @@ class ServerContext;
// Support parsing/unparsing of tensorflow::VariableResponse. // Support parsing/unparsing of tensorflow::VariableResponse.
// Wire-format is identical to RecvVariableResponse. // Wire-format is identical to RecvVariableResponse.
template <> template <>
class SerializationTraits<paddle::operators::detail::VariableResponse> { class SerializationTraits<paddle::operators::distributed::VariableResponse> {
public: public:
static Status Serialize( static Status Serialize(
const paddle::operators::detail::VariableResponse& msg, const paddle::operators::distributed::VariableResponse& msg,
grpc_byte_buffer** bp, bool* own_buffer) { grpc_byte_buffer** bp, bool* own_buffer) {
PADDLE_ENFORCE(false, "SerializationTraits::Serialize not implemented!"); PADDLE_ENFORCE(false, "SerializationTraits::Serialize not implemented!");
return Status(); return Status();
} }
static Status Deserialize(grpc_byte_buffer* buffer, static Status Deserialize(
paddle::operators::detail::VariableResponse* msg, grpc_byte_buffer* buffer,
paddle::operators::distributed::VariableResponse* msg,
int max_message_size = INT_MAX) { int max_message_size = INT_MAX) {
if (buffer == nullptr) { if (buffer == nullptr) {
return Status(StatusCode::INTERNAL, "No payload"); return Status(StatusCode::INTERNAL, "No payload");
...@@ -59,7 +60,7 @@ class SerializationTraits<paddle::operators::detail::VariableResponse> { ...@@ -59,7 +60,7 @@ class SerializationTraits<paddle::operators::detail::VariableResponse> {
Status result = g_core_codegen_interface->ok(); Status result = g_core_codegen_interface->ok();
if (result.ok()) { if (result.ok()) {
paddle::operators::detail::GrpcByteSource source(buffer); paddle::operators::distributed::GrpcByteSource source(buffer);
int ret = msg->Parse(&source); int ret = msg->Parse(&source);
if (ret != 0) { if (ret != 0) {
result = Status(StatusCode::INTERNAL, "VariableResponse parse error"); result = Status(StatusCode::INTERNAL, "VariableResponse parse error");
...@@ -73,7 +74,7 @@ class SerializationTraits<paddle::operators::detail::VariableResponse> { ...@@ -73,7 +74,7 @@ class SerializationTraits<paddle::operators::detail::VariableResponse> {
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
enum class GrpcMethod { enum class GrpcMethod {
kSendVariable, kSendVariable,
...@@ -118,6 +119,6 @@ class GrpcService final { ...@@ -118,6 +119,6 @@ class GrpcService final {
}; };
}; };
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -26,7 +26,7 @@ limitations under the License. */ ...@@ -26,7 +26,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
char* EncodeVarint32(char* dst, uint32_t v) { char* EncodeVarint32(char* dst, uint32_t v) {
// Operate on characters as unsigneds // Operate on characters as unsigneds
...@@ -144,6 +144,6 @@ class ProtoEncodeHelper { ...@@ -144,6 +144,6 @@ class ProtoEncodeHelper {
char* limit_; // Just for CHECKs char* limit_; // Just for CHECKs
}; };
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -31,7 +31,7 @@ ...@@ -31,7 +31,7 @@
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
constexpr char kRequestSend[] = "RequestSend"; constexpr char kRequestSend[] = "RequestSend";
constexpr char kRequestGet[] = "RequestGet"; constexpr char kRequestGet[] = "RequestGet";
...@@ -124,6 +124,6 @@ class RequestHandler { ...@@ -124,6 +124,6 @@ class RequestHandler {
RPCServer* rpc_server_; RPCServer* rpc_server_;
}; };
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -20,12 +20,12 @@ ...@@ -20,12 +20,12 @@
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/detail/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
bool RequestSendHandler::Handle(const std::string& varname, bool RequestSendHandler::Handle(const std::string& varname,
framework::Scope* scope, framework::Scope* scope,
...@@ -119,6 +119,6 @@ bool RequestPrefetchHandler::Handle(const std::string& varname, ...@@ -119,6 +119,6 @@ bool RequestPrefetchHandler::Handle(const std::string& varname,
return true; return true;
} }
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -28,11 +28,11 @@ ...@@ -28,11 +28,11 @@
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/detail/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
class RequestSendHandler final : public RequestHandler { class RequestSendHandler final : public RequestHandler {
public: public:
...@@ -66,6 +66,6 @@ class RequestPrefetchHandler final : public RequestHandler { ...@@ -66,6 +66,6 @@ class RequestPrefetchHandler final : public RequestHandler {
const std::string& out_var_name = "") override; const std::string& out_var_name = "") override;
}; };
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -12,15 +12,15 @@ ...@@ -12,15 +12,15 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/detail/rpc_client.h" #include "paddle/fluid/operators/distributed/rpc_client.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
std::once_flag RPCClient::init_flag_; std::once_flag RPCClient::init_flag_;
std::unique_ptr<RPCClient> RPCClient::rpc_client_(nullptr); std::unique_ptr<RPCClient> RPCClient::rpc_client_(nullptr);
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
class RPCClient { class RPCClient {
public: public:
...@@ -84,6 +84,6 @@ class RPCClient { ...@@ -84,6 +84,6 @@ class RPCClient {
static std::once_flag init_flag_; static std::once_flag init_flag_;
static std::unique_ptr<RPCClient> rpc_client_; static std::unique_ptr<RPCClient> rpc_client_;
}; };
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -17,11 +17,11 @@ ...@@ -17,11 +17,11 @@
#include <limits> #include <limits>
#include <string> #include <string>
#include "paddle/fluid/operators/detail/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
void RPCServer::ShutDown() { void RPCServer::ShutDown() {
LOG(INFO) << "RPCServer ShutDown "; LOG(INFO) << "RPCServer ShutDown ";
...@@ -112,6 +112,6 @@ void RPCServer::WaitCond(const std::string& rpc_name) { ...@@ -112,6 +112,6 @@ void RPCServer::WaitCond(const std::string& rpc_name) {
lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); }); lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); });
} }
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -19,11 +19,11 @@ ...@@ -19,11 +19,11 @@
#include <thread> // NOLINT #include <thread> // NOLINT
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/operators/detail/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
class RPCServer { class RPCServer {
public: public:
...@@ -86,6 +86,6 @@ class RPCServer { ...@@ -86,6 +86,6 @@ class RPCServer {
friend class RequestHandler; friend class RequestHandler;
}; };
}; // namespace detail }; // namespace distributed
}; // namespace operators }; // namespace operators
}; // namespace paddle }; // namespace paddle
...@@ -22,18 +22,18 @@ limitations under the License. */ ...@@ -22,18 +22,18 @@ limitations under the License. */
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/detail/rpc_client.h" #include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/operators/detail/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
namespace framework = paddle::framework; namespace framework = paddle::framework;
namespace platform = paddle::platform; namespace platform = paddle::platform;
namespace detail = paddle::operators::detail; namespace distributed = paddle::operators::distributed;
USE_OP(lookup_table); USE_OP(lookup_table);
std::unique_ptr<detail::RPCServer> g_rpc_service; std::unique_ptr<distributed::RPCServer> g_rpc_service;
std::unique_ptr<detail::RequestHandler> g_req_handler; std::unique_ptr<distributed::RequestHandler> g_req_handler;
framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) { framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) {
auto root_block = program->MutableBlock(0); auto root_block = program->MutableBlock(0);
...@@ -113,19 +113,21 @@ void StartServer() { ...@@ -113,19 +113,21 @@ void StartServer() {
g_req_handler->SetScope(&scope); g_req_handler->SetScope(&scope);
g_req_handler->SetExecutor(&exe); g_req_handler->SetExecutor(&exe);
g_rpc_service->RegisterRPC(detail::kRequestPrefetch, g_req_handler.get()); g_rpc_service->RegisterRPC(distributed::kRequestPrefetch,
g_req_handler.get());
g_req_handler->SetRPCServer(g_rpc_service.get()); g_req_handler->SetRPCServer(g_rpc_service.get());
std::thread server_thread( std::thread server_thread(
std::bind(&detail::RPCServer::StartServer, g_rpc_service.get())); std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get()));
server_thread.join(); server_thread.join();
} }
TEST(PREFETCH, CPU) { TEST(PREFETCH, CPU) {
g_req_handler.reset(new detail::RequestPrefetchHandler(true)); g_req_handler.reset(new distributed::RequestPrefetchHandler(true));
g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1));
detail::RPCClient* client = detail::RPCClient::GetInstance<RPCCLIENT_T>(); distributed::RPCClient* client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();
std::thread server_thread(StartServer); std::thread server_thread(StartServer);
g_rpc_service->WaitServerReady(); g_rpc_service->WaitServerReady();
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/detail/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <nccl.h> #include <nccl.h>
...@@ -23,14 +23,14 @@ limitations under the License. */ ...@@ -23,14 +23,14 @@ limitations under the License. */
#include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h" #include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/detail/bytebuffer_stream.h" #include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
#include "paddle/fluid/operators/detail/proto_encoder_helper.h" #include "paddle/fluid/operators/distributed/proto_encoder_helper.h"
#include "paddle/fluid/operators/detail/variable_response.h" #include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
using VarMsg = sendrecv::VariableMessage; using VarMsg = sendrecv::VariableMessage;
...@@ -222,11 +222,11 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, ...@@ -222,11 +222,11 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
const framework::Scope* scope, const framework::Scope* scope,
framework::Variable** var) { framework::Variable** var) {
operators::detail::VariableResponse resp(scope, &ctx); operators::distributed::VariableResponse resp(scope, &ctx);
PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!"); PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
*var = resp.GetVar(); *var = resp.GetVar();
} }
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -25,12 +25,12 @@ limitations under the License. */ ...@@ -25,12 +25,12 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" #include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
typedef void (*DestroyCallback)(void*); typedef void (*DestroyCallback)(void*);
...@@ -61,6 +61,6 @@ inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) { ...@@ -61,6 +61,6 @@ inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) {
} }
} }
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/detail/variable_response.h" #include "paddle/fluid/operators/distributed/variable_response.h"
#include <string> #include <string>
#include <utility> #include <utility>
...@@ -22,12 +22,12 @@ ...@@ -22,12 +22,12 @@
#endif #endif
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
enum WireType { enum WireType {
WIRETYPE_VARINT = 0, WIRETYPE_VARINT = 0,
...@@ -158,13 +158,13 @@ bool VariableResponse::CopySelectRowsTensorData( ...@@ -158,13 +158,13 @@ bool VariableResponse::CopySelectRowsTensorData(
slr->set_height(meta_.slr_height()); slr->set_height(meta_.slr_height());
auto* tensor = slr->mutable_value(); auto* tensor = slr->mutable_value();
tensor->Resize(dims); tensor->Resize(dims);
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(static_cast<size_t>(tensor->numel()),
static_cast<size_t>(tensor->numel()),
length / framework::SizeOfType( length / framework::SizeOfType(
paddle::operators::detail::ToTypeIndex(meta_.data_type()))); paddle::operators::distributed::ToTypeIndex(
meta_.data_type())));
void* tensor_data = tensor->mutable_data( void* tensor_data = tensor->mutable_data(
ctx.GetPlace(), ctx.GetPlace(),
paddle::operators::detail::ToTypeIndex(meta_.data_type())); paddle::operators::distributed::ToTypeIndex(meta_.data_type()));
if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) { if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) {
return false; return false;
...@@ -480,6 +480,6 @@ int VariableResponse::Parse(Source* source) { ...@@ -480,6 +480,6 @@ int VariableResponse::Parse(Source* source) {
return 0; return 0;
} }
}; // namespace detail }; // namespace distributed
}; // namespace operators }; // namespace operators
}; // namespace paddle }; // namespace paddle
...@@ -22,17 +22,17 @@ ...@@ -22,17 +22,17 @@
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" #include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h" #include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/detail/bytebuffer_stream.h" #include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
class VariableResponse { class VariableResponse {
public: public:
...@@ -99,6 +99,6 @@ class VariableResponse { ...@@ -99,6 +99,6 @@ class VariableResponse {
sendrecv::VariableMessage meta_; sendrecv::VariableMessage meta_;
}; };
}; // namespace detail }; // namespace distributed
}; // namespace operators }; // namespace operators
}; // namespace paddle }; // namespace paddle
...@@ -42,8 +42,8 @@ class FetchBarrierOp : public framework::OperatorBase { ...@@ -42,8 +42,8 @@ class FetchBarrierOp : public framework::OperatorBase {
// For profiling // For profiling
platform::RecordEvent record_event(Type(), &ctx); platform::RecordEvent record_event(Type(), &ctx);
detail::RPCClient* rpc_client = distributed::RPCClient* rpc_client =
detail::RPCClient::GetInstance<RPCCLIENT_T>(); distributed::RPCClient::GetInstance<RPCCLIENT_T>();
rpc_client->Wait(); rpc_client->Wait();
......
...@@ -22,7 +22,7 @@ limitations under the License. */ ...@@ -22,7 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/nccl_helper.h"
namespace paddle { namespace paddle {
...@@ -60,7 +60,8 @@ class GenNCCLIdOp : public framework::OperatorBase { ...@@ -60,7 +60,8 @@ class GenNCCLIdOp : public framework::OperatorBase {
std::vector<std::string> endpoint_list = std::vector<std::string> endpoint_list =
Attr<std::vector<std::string>>("endpoint_list"); Attr<std::vector<std::string>>("endpoint_list");
detail::RPCClient* client = detail::RPCClient::GetInstance<RPCCLIENT_T>(); distributed::RPCClient* client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();
for (auto& ep : endpoint_list) { for (auto& ep : endpoint_list) {
VLOG(3) << "sending nccl id to " << ep; VLOG(3) << "sending nccl id to " << ep;
...@@ -80,11 +81,11 @@ class GenNCCLIdOp : public framework::OperatorBase { ...@@ -80,11 +81,11 @@ class GenNCCLIdOp : public framework::OperatorBase {
// NOTE: Can not use unique_ptr here because the default // NOTE: Can not use unique_ptr here because the default
// deleter will call GRPC Server's base class's dtor and // deleter will call GRPC Server's base class's dtor and
// that will cause a wired crash. // that will cause a wired crash.
detail::RequestSendHandler rpc_h(true); distributed::RequestSendHandler rpc_h(true);
std::unique_ptr<detail::RPCServer> rpc_service( std::unique_ptr<distributed::RPCServer> rpc_service(
new RPCSERVER_T(endpoint, 1)); new RPCSERVER_T(endpoint, 1));
rpc_service->RegisterRPC(detail::kRequestSend, &rpc_h); rpc_service->RegisterRPC(distributed::kRequestSend, &rpc_h);
rpc_h.SetRPCServer(rpc_service.get()); rpc_h.SetRPCServer(rpc_service.get());
framework::ProgramDesc empty_program; framework::ProgramDesc empty_program;
...@@ -95,11 +96,11 @@ class GenNCCLIdOp : public framework::OperatorBase { ...@@ -95,11 +96,11 @@ class GenNCCLIdOp : public framework::OperatorBase {
rpc_h.SetExecutor(&executor); rpc_h.SetExecutor(&executor);
std::thread server_thread( std::thread server_thread(
std::bind(&detail::RPCServer::StartServer, rpc_service.get())); std::bind(&distributed::RPCServer::StartServer, rpc_service.get()));
rpc_service->SetCond(detail::kRequestSend); rpc_service->SetCond(distributed::kRequestSend);
VLOG(3) << "start getting nccl id from trainer 0..."; VLOG(3) << "start getting nccl id from trainer 0...";
rpc_service->WaitBarrier(detail::kRequestSend); rpc_service->WaitBarrier(distributed::kRequestSend);
VLOG(3) << "got nccl id and stop server..."; VLOG(3) << "got nccl id and stop server...";
rpc_service->ShutDown(); rpc_service->ShutDown();
VLOG(3) << "rpc server stopped"; VLOG(3) << "rpc server stopped";
......
...@@ -21,14 +21,14 @@ limitations under the License. */ ...@@ -21,14 +21,14 @@ limitations under the License. */
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/operators/listen_and_serv_op.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
void RunServer(std::shared_ptr<detail::RPCServer> service) { void RunServer(std::shared_ptr<distributed::RPCServer> service) {
service->StartServer(); service->StartServer();
VLOG(4) << "RunServer thread end"; VLOG(4) << "RunServer thread end";
} }
...@@ -121,12 +121,12 @@ void ListenAndServOp::RunSyncLoop( ...@@ -121,12 +121,12 @@ void ListenAndServOp::RunSyncLoop(
while (true) { while (true) {
// Get from multiple trainers, we don't care about the order in which // Get from multiple trainers, we don't care about the order in which
// the gradients arrives, just add suffix 0~n and merge the gradient. // the gradients arrives, just add suffix 0~n and merge the gradient.
rpc_service_->SetCond(detail::kRequestSend); rpc_service_->SetCond(distributed::kRequestSend);
rpc_service_->WaitBarrier(detail::kRequestSend); rpc_service_->WaitBarrier(distributed::kRequestSend);
if (rpc_service_->IsExit()) { if (rpc_service_->IsExit()) {
LOG(WARNING) << "get exit!rpc_processor break!"; LOG(WARNING) << "get exit!rpc_processor break!";
rpc_service_->SetCond(detail::kRequestGet); rpc_service_->SetCond(distributed::kRequestGet);
break; break;
} }
...@@ -154,11 +154,11 @@ void ListenAndServOp::RunSyncLoop( ...@@ -154,11 +154,11 @@ void ListenAndServOp::RunSyncLoop(
recv_scope); recv_scope);
VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)";
rpc_service_->SetCond(detail::kRequestGet); rpc_service_->SetCond(distributed::kRequestGet);
rpc_service_->WaitBarrier(detail::kRequestGet); rpc_service_->WaitBarrier(distributed::kRequestGet);
rpc_service_->ResetBarrierCounter(); rpc_service_->ResetBarrierCounter();
// reset received sparse vars to avoid reuse it in the next mini-batch // reset received sparse vars to avoid reuse it in the next mini-batch
dynamic_cast<detail::RequestSendHandler *>(request_send_handler_.get()) dynamic_cast<distributed::RequestSendHandler *>(request_send_handler_.get())
->ResetSparseVarRecorder(); ->ResetSparseVarRecorder();
} // while(true) } // while(true)
} }
...@@ -215,13 +215,13 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, ...@@ -215,13 +215,13 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
} }
static void FillRequestCtx( static void FillRequestCtx(
detail::RequestHandler *h, framework::Scope *scope, distributed::RequestHandler *h, framework::Scope *scope,
platform::DeviceContext *dev_ctx, framework::Executor *executor, platform::DeviceContext *dev_ctx, framework::Executor *executor,
framework::ProgramDesc *program, framework::ProgramDesc *program,
std::unordered_map<std::string, std::unordered_map<std::string,
std::shared_ptr<framework::ExecutorPrepareContext>> std::shared_ptr<framework::ExecutorPrepareContext>>
*prefetch_ctx, *prefetch_ctx,
detail::RPCServer *rpc_server) { distributed::RPCServer *rpc_server) {
h->SetScope(scope); h->SetScope(scope);
h->SetDevCtx(dev_ctx); h->SetDevCtx(dev_ctx);
h->SetExecutor(executor); h->SetExecutor(executor);
...@@ -249,14 +249,16 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -249,14 +249,16 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in)); rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in));
request_send_handler_.reset(new detail::RequestSendHandler(sync_mode)); request_send_handler_.reset(new distributed::RequestSendHandler(sync_mode));
request_get_handler_.reset(new detail::RequestGetHandler(sync_mode)); request_get_handler_.reset(new distributed::RequestGetHandler(sync_mode));
request_prefetch_handler_.reset( request_prefetch_handler_.reset(
new detail::RequestPrefetchHandler(sync_mode)); new distributed::RequestPrefetchHandler(sync_mode));
rpc_service_->RegisterRPC(detail::kRequestSend, request_send_handler_.get()); rpc_service_->RegisterRPC(distributed::kRequestSend,
rpc_service_->RegisterRPC(detail::kRequestGet, request_get_handler_.get()); request_send_handler_.get());
rpc_service_->RegisterRPC(detail::kRequestPrefetch, rpc_service_->RegisterRPC(distributed::kRequestGet,
request_get_handler_.get());
rpc_service_->RegisterRPC(distributed::kRequestPrefetch,
request_prefetch_handler_.get()); request_prefetch_handler_.get());
auto *optimize_block = Attr<framework::BlockDesc *>(kOptimizeBlock); auto *optimize_block = Attr<framework::BlockDesc *>(kOptimizeBlock);
......
...@@ -24,8 +24,8 @@ limitations under the License. */ ...@@ -24,8 +24,8 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/operators/detail/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -33,7 +33,7 @@ namespace operators { ...@@ -33,7 +33,7 @@ namespace operators {
constexpr char kOptimizeBlock[] = "OptimizeBlock"; constexpr char kOptimizeBlock[] = "OptimizeBlock";
constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id"; constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id";
void RunServer(std::shared_ptr<detail::RPCServer> service); void RunServer(std::shared_ptr<distributed::RPCServer> service);
class ListenAndServOp : public framework::OperatorBase { class ListenAndServOp : public framework::OperatorBase {
public: public:
...@@ -62,10 +62,11 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -62,10 +62,11 @@ class ListenAndServOp : public framework::OperatorBase {
const platform::Place& dev_place) const override; const platform::Place& dev_place) const override;
protected: protected:
mutable std::shared_ptr<detail::RPCServer> rpc_service_; mutable std::shared_ptr<distributed::RPCServer> rpc_service_;
mutable std::shared_ptr<detail::RequestHandler> request_send_handler_; mutable std::shared_ptr<distributed::RequestHandler> request_send_handler_;
mutable std::shared_ptr<detail::RequestHandler> request_get_handler_; mutable std::shared_ptr<distributed::RequestHandler> request_get_handler_;
mutable std::shared_ptr<detail::RequestHandler> request_prefetch_handler_; mutable std::shared_ptr<distributed::RequestHandler>
request_prefetch_handler_;
mutable std::shared_ptr<std::thread> server_thread_; mutable std::shared_ptr<std::thread> server_thread_;
}; };
......
...@@ -146,6 +146,6 @@ REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$"); ...@@ -146,6 +146,6 @@ REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$");
REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU, REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU,
paddle::operators::LogicalNotFunctor); paddle::operators::LogicalNotFunctor);
REGISTER_BINARY_LOGICAL_OP(logical_xor, REGISTER_BINARY_LOGICAL_OP(logical_xor,
"$$Out = (X || Y) \\, \\&\\& \\, !(X \\&\\& Y)$$"); "$$Out = (X || Y) \\&\\& !(X \\&\\& Y)$$");
REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU, REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU,
paddle::operators::LogicalXorFunctor); paddle::operators::LogicalXorFunctor);
...@@ -93,10 +93,10 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> { ...@@ -93,10 +93,10 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> {
auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace()); auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
// computation // computation
for (size_t k = 0; k < input_rows; ++k) { for (int k = 0; k < input_rows; ++k) {
const T* src_ptr = input.data<T>() + k * input_cols; const T* src_ptr = input.data<T>() + k * input_cols;
int col_idx = 0; int col_idx = 0;
for (int j = 0; j < num; ++j) { for (size_t j = 0; j < num; ++j) {
int col_len = output_cols[j]; int col_len = output_cols[j];
auto* out_tensor = outputs->at(j); auto* out_tensor = outputs->at(j);
if (out_tensor != nullptr) { if (out_tensor != nullptr) {
......
...@@ -30,6 +30,7 @@ template struct SetConstant<platform::CPUDeviceContext, double>; ...@@ -30,6 +30,7 @@ template struct SetConstant<platform::CPUDeviceContext, double>;
template struct SetConstant<platform::CPUDeviceContext, int>; template struct SetConstant<platform::CPUDeviceContext, int>;
template struct SetConstant<platform::CPUDeviceContext, int64_t>; template struct SetConstant<platform::CPUDeviceContext, int64_t>;
template struct SetConstant<platform::CPUDeviceContext, bool>; template struct SetConstant<platform::CPUDeviceContext, bool>;
template struct SetConstant<platform::CPUDeviceContext, uint8_t>;
#define DEFINE_CPU_TRANS(RANK) \ #define DEFINE_CPU_TRANS(RANK) \
template struct Transpose<platform::CPUDeviceContext, platform::float16, \ template struct Transpose<platform::CPUDeviceContext, platform::float16, \
......
...@@ -41,8 +41,8 @@ class PrefetchOp : public framework::OperatorBase { ...@@ -41,8 +41,8 @@ class PrefetchOp : public framework::OperatorBase {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place); auto& ctx = *pool.Get(place);
detail::RPCClient* rpc_client = distributed::RPCClient* rpc_client =
detail::RPCClient::GetInstance<RPCCLIENT_T>(); distributed::RPCClient::GetInstance<RPCCLIENT_T>();
for (size_t i = 0; i < ins.size(); i++) { for (size_t i = 0; i < ins.size(); i++) {
if (NeedSend(scope, ins[i])) { if (NeedSend(scope, ins[i])) {
......
...@@ -43,8 +43,8 @@ class RecvOp : public framework::OperatorBase { ...@@ -43,8 +43,8 @@ class RecvOp : public framework::OperatorBase {
// For profiling // For profiling
platform::RecordEvent record_event(Type(), &ctx); platform::RecordEvent record_event(Type(), &ctx);
detail::RPCClient* rpc_client = distributed::RPCClient* rpc_client =
detail::RPCClient::GetInstance<RPCCLIENT_T>(); distributed::RPCClient::GetInstance<RPCCLIENT_T>();
for (size_t i = 0; i < outs.size(); i++) { for (size_t i = 0; i < outs.size(); i++) {
VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
......
...@@ -44,8 +44,8 @@ class SendBarrierOp : public framework::OperatorBase { ...@@ -44,8 +44,8 @@ class SendBarrierOp : public framework::OperatorBase {
// For profiling // For profiling
platform::RecordEvent record_event(Type(), &ctx); platform::RecordEvent record_event(Type(), &ctx);
detail::RPCClient* rpc_client = distributed::RPCClient* rpc_client =
detail::RPCClient::GetInstance<RPCCLIENT_T>(); distributed::RPCClient::GetInstance<RPCCLIENT_T>();
VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode; VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode;
......
...@@ -45,8 +45,8 @@ class SendOp : public framework::OperatorBase { ...@@ -45,8 +45,8 @@ class SendOp : public framework::OperatorBase {
// For profiling // For profiling
platform::RecordEvent record_event(Type(), &ctx); platform::RecordEvent record_event(Type(), &ctx);
detail::RPCClient* rpc_client = distributed::RPCClient* rpc_client =
detail::RPCClient::GetInstance<RPCCLIENT_T>(); distributed::RPCClient::GetInstance<RPCCLIENT_T>();
for (size_t i = 0; i < ins.size(); i++) { for (size_t i = 0; i < ins.size(); i++) {
if (NeedSend(scope, ins[i])) { if (NeedSend(scope, ins[i])) {
......
...@@ -14,11 +14,14 @@ ...@@ -14,11 +14,14 @@
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/operators/tensorrt_engine_op.h" #include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/operators/tensorrt_engine_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -16,10 +16,12 @@ ...@@ -16,10 +16,12 @@
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <string>
#include <vector>
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -179,7 +179,6 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { ...@@ -179,7 +179,6 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
const std::string& z_name, bool x_created, const std::string& z_name, bool x_created,
const shape_t& x_shape, const shape_t& y_shape, const shape_t& x_shape, const shape_t& y_shape,
const shape_t& z_shape) { const shape_t& z_shape) {
LOG(INFO) << "create fc op"; LOG(INFO) << "create fc op";
auto* fc = block_desc.AppendOp(); auto* fc = block_desc.AppendOp();
fc->SetType("mul"); fc->SetType("mul");
......
...@@ -21,7 +21,7 @@ limitations under the License. */ ...@@ -21,7 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/operators/listen_and_serv_op.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
...@@ -37,11 +37,11 @@ USE_NO_KERNEL_OP(listen_and_serv); ...@@ -37,11 +37,11 @@ USE_NO_KERNEL_OP(listen_and_serv);
namespace f = paddle::framework; namespace f = paddle::framework;
namespace p = paddle::platform; namespace p = paddle::platform;
namespace m = paddle::operators::math; namespace m = paddle::operators::math;
namespace detail = paddle::operators::detail; namespace distributed = paddle::operators::distributed;
namespace string = paddle::string; namespace string = paddle::string;
std::unique_ptr<detail::RPCServer> g_rpc_service; std::unique_ptr<distributed::RPCServer> g_rpc_service;
std::unique_ptr<detail::RequestHandler> g_req_handler; std::unique_ptr<distributed::RequestHandler> g_req_handler;
void StartServer() { void StartServer() {
f::Scope scope; f::Scope scope;
...@@ -57,14 +57,14 @@ void StartServer() { ...@@ -57,14 +57,14 @@ void StartServer() {
g_req_handler->SetProgram(&empty_program); g_req_handler->SetProgram(&empty_program);
g_req_handler->SetExecutor(&executor); g_req_handler->SetExecutor(&executor);
g_rpc_service->RegisterRPC(detail::kRequestSend, g_req_handler.get()); g_rpc_service->RegisterRPC(distributed::kRequestSend, g_req_handler.get());
g_req_handler->SetRPCServer(g_rpc_service.get()); g_req_handler->SetRPCServer(g_rpc_service.get());
std::thread server_thread( std::thread server_thread(
std::bind(&detail::RPCServer::StartServer, g_rpc_service.get())); std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get()));
g_rpc_service->SetCond(detail::kRequestSend); g_rpc_service->SetCond(distributed::kRequestSend);
g_rpc_service->WaitBarrier(detail::kRequestSend); g_rpc_service->WaitBarrier(distributed::kRequestSend);
LOG(INFO) << "got nccl id and stop server..."; LOG(INFO) << "got nccl id and stop server...";
g_rpc_service->ShutDown(); g_rpc_service->ShutDown();
...@@ -72,7 +72,7 @@ void StartServer() { ...@@ -72,7 +72,7 @@ void StartServer() {
} }
TEST(SendNcclId, RPCServer) { TEST(SendNcclId, RPCServer) {
g_req_handler.reset(new detail::RequestSendHandler(true)); g_req_handler.reset(new distributed::RequestSendHandler(true));
g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1));
std::thread server_thread(StartServer); std::thread server_thread(StartServer);
...@@ -91,7 +91,8 @@ TEST(SendNcclId, RPCServer) { ...@@ -91,7 +91,8 @@ TEST(SendNcclId, RPCServer) {
std::string ep = string::Sprintf("127.0.0.1:%d", port); std::string ep = string::Sprintf("127.0.0.1:%d", port);
detail::RPCClient* client = detail::RPCClient::GetInstance<RPCCLIENT_T>(); distributed::RPCClient* client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();
LOG(INFO) << "connect to server" << ep; LOG(INFO) << "connect to server" << ep;
client->AsyncSendVar(ep, dev_ctx, scope, NCCL_ID_VARNAME); client->AsyncSendVar(ep, dev_ctx, scope, NCCL_ID_VARNAME);
......
...@@ -159,6 +159,11 @@ PYBIND11_PLUGIN(core) { ...@@ -159,6 +159,11 @@ PYBIND11_PLUGIN(core) {
new (&instance) LoDTensor(new_offset_lod); new (&instance) LoDTensor(new_offset_lod);
}) })
.def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); }) .def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); })
// We implement offset based LOD in C++ while we use length based with
// Python API. So we changed set_lod to set_recursive_sequence_lengths to
// avoid misuse.
// The discussion is here:
// https://github.com/PaddlePaddle/Paddle/issues/10855
.def("set_lod", .def("set_lod",
[](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) { [](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) {
// the input lod is offset-based level-of-detail info // the input lod is offset-based level-of-detail info
...@@ -199,6 +204,7 @@ PYBIND11_PLUGIN(core) { ...@@ -199,6 +204,7 @@ PYBIND11_PLUGIN(core) {
std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
return new_lod; return new_lod;
}) })
// Set above comments of set_lod.
.def("recursive_sequence_lengths", .def("recursive_sequence_lengths",
[](LoDTensor &self) -> std::vector<std::vector<size_t>> { [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
// output the length-based lod info // output the length-based lod info
......
...@@ -97,7 +97,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> { ...@@ -97,7 +97,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
inline pybind11::buffer_info CastToPyBuffer(const framework::Tensor &tensor) { inline pybind11::buffer_info CastToPyBuffer(const framework::Tensor &tensor) {
auto buffer_info = auto buffer_info =
details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool, details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool,
platform::float16>()(tensor); uint8_t, platform::float16>()(tensor);
return buffer_info; return buffer_info;
} }
......
...@@ -44,7 +44,7 @@ import metrics ...@@ -44,7 +44,7 @@ import metrics
import transpiler import transpiler
from param_attr import ParamAttr, WeightNormParamAttr from param_attr import ParamAttr, WeightNormParamAttr
from data_feeder import DataFeeder from data_feeder import DataFeeder
from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope
from transpiler import DistributeTranspiler, InferenceTranspiler, \ from transpiler import DistributeTranspiler, InferenceTranspiler, \
memory_optimize, release_memory memory_optimize, release_memory
from concurrency import (Go, make_channel, channel_send, channel_recv, from concurrency import (Go, make_channel, channel_send, channel_recv,
...@@ -83,6 +83,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \ ...@@ -83,6 +83,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \
'profiler', 'profiler',
'unique_name', 'unique_name',
'recordio_writer', 'recordio_writer',
'Scope',
] ]
......
...@@ -79,6 +79,61 @@ class DataToLoDTensorConverter(object): ...@@ -79,6 +79,61 @@ class DataToLoDTensorConverter(object):
class DataFeeder(object): class DataFeeder(object):
"""
DataFeeder converts the data that returned by a reader into a data
structure that can feed into Executor and ParallelExecutor. The reader
usually returns a list of mini-batch data entries. Each data entry in
the list is one sample. Each sample is a list or a tuple with one
feature or multiple features.
The simple usage shows below:
.. code-block:: python
place = fluid.CPUPlace()
img = fluid.layers.data(name='image', shape=[1, 28, 28])
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
feeder = fluid.DataFeeder([img, label], fluid.CPUPlace())
result = feeder.feed([([0] * 784, [9]), ([1] * 784, [1])])
If you want to feed data into GPU side separately in advance when you
use multi-GPU to train a model, you can use `decorate_reader` function.
.. code-block:: python
place=fluid.CUDAPlace(0)
feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
reader = feeder.decorate_reader(
paddle.batch(flowers.train(), batch_size=16))
Args:
feed_list(list): The Variables or Variables'name that will
feed into model.
place(Place): place indicates feed data into CPU or GPU, if you want to
feed data into GPU, please using `fluid.CUDAPlace(i)` (`i` represents
the GPU id), or if you want to feed data into CPU, please using
`fluid.CPUPlace()`.
program(Program): The Program that will feed data into, if program
is None, it will use default_main_program(). Default None.
Raises:
ValueError: If some Variable is not in this Program.
Examples:
.. code-block:: python
# ...
place = fluid.CPUPlace()
feed_list = [
main_program.global_block().var(var_name) for var_name in feed_vars_name
] # feed_vars_name is a list of variables' name.
feeder = fluid.DataFeeder(feed_list, place)
for data in reader():
outs = exe.run(program=main_program,
feed=feeder.feed(data))
"""
def __init__(self, feed_list, place, program=None): def __init__(self, feed_list, place, program=None):
self.feed_dtypes = [] self.feed_dtypes = []
self.feed_names = [] self.feed_names = []
...@@ -108,6 +163,16 @@ class DataFeeder(object): ...@@ -108,6 +163,16 @@ class DataFeeder(object):
self.place = place self.place = place
def feed(self, iterable): def feed(self, iterable):
"""
According to feed_list and iterable, converters the input into
a data structure that can feed into Executor and ParallelExecutor.
Args:
iterable(list|tuple): the input data.
Returns:
dict: the result of conversion.
"""
converter = [] converter = []
for lod_level, shape, dtype in six.zip( for lod_level, shape, dtype in six.zip(
self.feed_lod_level, self.feed_shapes, self.feed_dtypes): self.feed_lod_level, self.feed_shapes, self.feed_dtypes):
...@@ -130,6 +195,20 @@ class DataFeeder(object): ...@@ -130,6 +195,20 @@ class DataFeeder(object):
return ret_dict return ret_dict
def feed_parallel(self, iterable, num_places=None): def feed_parallel(self, iterable, num_places=None):
"""
Takes multiple mini-batches. Each mini-batch will be feed on each
device in advance.
Args:
iterable(list|tuple): the input data.
num_places(int): the number of devices. Default None.
Returns:
dict: the result of conversion.
Notes:
The number of devices and number of mini-batches must be same.
"""
if isinstance(self.place, core.CUDAPlace): if isinstance(self.place, core.CUDAPlace):
places = [ places = [
core.CUDAPlace(i) core.CUDAPlace(i)
...@@ -168,6 +247,24 @@ class DataFeeder(object): ...@@ -168,6 +247,24 @@ class DataFeeder(object):
multi_devices, multi_devices,
num_places=None, num_places=None,
drop_last=True): drop_last=True):
"""
Converter the input data into a data that returned by reader into
multiple mini-batches. Each mini-batch will be feed on each device.
Args:
reader(fun): the input data.
multi_devices(bool): the number of places. Default None.
num_places(int): the number of places. Default None.
drop_last(bool): the number of places. Default None.
Returns:
dict: the result of conversion.
Raises:
ValueError: If drop_last is False and the data batch which cannot
fit for devices.
"""
def __reader_creator__(): def __reader_creator__():
if not multi_devices: if not multi_devices:
for item in reader(): for item in reader():
......
...@@ -25,6 +25,13 @@ g_scope = core.Scope() ...@@ -25,6 +25,13 @@ g_scope = core.Scope()
def global_scope(): def global_scope():
"""
Get the global/default scope instance. There are a lot of APIs use
:code:`global_scope` as its default value, e.g., :code:`Executor.run`
Returns:
Scope: The global/default scope instance.
"""
return g_scope return g_scope
...@@ -37,6 +44,19 @@ def switch_scope(scope): ...@@ -37,6 +44,19 @@ def switch_scope(scope):
@contextlib.contextmanager @contextlib.contextmanager
def scope_guard(scope): def scope_guard(scope):
"""
Change the global/default scope instance by Python `with` statement. All
variable in runtime will assigned to the new scope.
Examples:
>>> import paddle.fluid as fluid
>>> new_scope = fluid.Scope()
>>> with fluid.scope_guard(new_scope):
>>> ...
Args:
scope: The new global/default scope.
"""
ex = switch_scope(scope) ex = switch_scope(scope)
yield yield
switch_scope(ex) switch_scope(ex)
...@@ -135,14 +155,18 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name): ...@@ -135,14 +155,18 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name):
def fetch_var(name, scope=None, return_numpy=True): def fetch_var(name, scope=None, return_numpy=True):
""" """
Fetch the value of the variable with the given name from the given scope Fetch the value of the variable with the given name from the
given scope.
Args: Args:
name(str): name of the variable. Typically, only persistable variables name(str): name of the variable. Typically, only persistable variables
can be found in the scope used for running the program. can be found in the scope used for running the program.
scope(core.Scope|None): scope object. It should be the scope where scope(core.Scope|None): scope object. It should be the scope where
you pass to Executor.run() when running your program. you pass to Executor.run() when running your program.
If None, global_scope() will be used. If None, global_scope() will be used. Default None.
return_numpy(bool): whether convert the tensor to numpy.ndarray return_numpy(bool): whether convert the tensor to numpy.ndarray.
Default True.
Returns: Returns:
LodTensor|numpy.ndarray LodTensor|numpy.ndarray
""" """
......
...@@ -30,8 +30,6 @@ __all__ = [ ...@@ -30,8 +30,6 @@ __all__ = [
'default_startup_program', 'default_startup_program',
'default_main_program', 'default_main_program',
'program_guard', 'program_guard',
'switch_startup_program',
'switch_main_program',
'get_var', 'get_var',
] ]
...@@ -43,7 +41,8 @@ ZERO_VAR_SUFFIX = core.kZeroVarSuffix() ...@@ -43,7 +41,8 @@ ZERO_VAR_SUFFIX = core.kZeroVarSuffix()
def grad_var_name(var_name): def grad_var_name(var_name):
""" """
return gradient name for a certain var name Returns:
str: gradient name for a certain var name
""" """
return var_name + GRAD_VAR_SUFFIX return var_name + GRAD_VAR_SUFFIX
...@@ -51,10 +50,12 @@ def grad_var_name(var_name): ...@@ -51,10 +50,12 @@ def grad_var_name(var_name):
def convert_np_dtype_to_dtype_(np_dtype): def convert_np_dtype_to_dtype_(np_dtype):
""" """
Convert the data type in numpy to the data type in Paddle Convert the data type in numpy to the data type in Paddle
Args: Args:
np_dtype(np.dtype): the data type in numpy np_dtype(np.dtype): the data type in numpy.
Returns(core.VarDesc.VarType): the data type in Paddle Returns:
core.VarDesc.VarType: the data type in Paddle.
""" """
dtype = np.dtype(np_dtype) dtype = np.dtype(np_dtype)
...@@ -120,37 +121,53 @@ def _debug_string_(proto, throw_on_error=True): ...@@ -120,37 +121,53 @@ def _debug_string_(proto, throw_on_error=True):
class Variable(object): class Variable(object):
""" """
Python variable. Every input and output of an operator is a variable. Every In Fluid, every input and output of an operator is a variable. In most
variable belongs to a block. The variable has a name and two variables in cases, variables are used for holding different kinds of data or training
different blocks could have the same name. labels. A variable belongs to a block. All variable has its own name and
two variables in different blocks could have the same name.
There are many kinds of variables. Please reference the framework.proto for There are many kinds of variables. Each kind of them has its own attributes
details. and usages. Please reference the framework.proto for details.
Notes: The constructor of Variable should not be invoked directly. Please
use `Block.create_var` to create a variable.
>>> cur_program = Program() Most of a Variable's member variables can be setted to be None. It mean
>>> cur_block = cur_program.current_block() it is not available or will be specified later.
>>> new_variable = cur_block.create_var(
>>> name="X", shape=[-1, 23, 48], dtype='float32')
Args: Args:
block(Block): The associated block. It will be passed by block(Block): The block that the variable belongs to.
`Block.create_var` automatically.
type(core.VarDesc.VarType): Variable type. Please reference the type(core.VarDesc.VarType): Variable type. Please reference the
framework.proto for details. framework.proto for details.
shape(tuple|list|None): The shape of variable. -1 means the batch size. name(str|None): The name of the variable. If setted None, it will be
generated automatically. Default: None
shape(tuple|list|None): The shape of the variable. -1 means the batch size.
Some kinds of variable do not contain shape, just set it to None. Some kinds of variable do not contain shape, just set it to None.
dtype(np.dtype|core.VarDesc.VarType|str): The data type of variable. Default: None
lod_level(int): The level of lod tensor. 0 means it is not a time dtype(np.dtype|core.VarDesc.VarType|str|None): The data type of variable.
Default: None
lod_level (int|None): The level of lod tensor. 0 means it is not a time
series data. series data.
capacity(int): The capacity of Channel variable. Ignored Default: None
for other types. capacity (int|None): The capacity of Channel variable. Ignored for other
persistable(bool): True if the variable should be saved as check point. types. Default: None
Defaults to False. persistable (bool|None): True if the variable is persistable. A persistable
stop_gradient(bool): True if the variable will stop to calculate variable will not be deleted after an iteration ending. Defaults: None.
gradients when backward. Defaults to False. error_clip (BaseErrorClipAttr|None): The error clip attributes of the
corresponding gradient variable. Default: None
stop_gradient (bool): True if the variable will stop to calculate its
gradients when backward. Default: False.
is_data (bool): True if the variable is an input data. Default: False
Notes:
The constructor of Variable should not be invoked directly. Please
use `Block.create_var` to create a variable.
Examples:
.. code-block:: python
cur_program = Program()
cur_block = cur_program.current_block()
new_variable = cur_block.create_var(name="X",
shape=[-1, 23, 48],
dtype='float32')
""" """
def __init__(self, def __init__(self,
...@@ -253,13 +270,14 @@ class Variable(object): ...@@ -253,13 +270,14 @@ class Variable(object):
Get debug string. Get debug string.
Args: Args:
throw_on_error(bool): True if raise an exception when self is not throw_on_error(bool): True if raise an exception when self is
intialized. not initialized.
with_details(bool): more details about variables and parameters with_details(bool): more details about variables and parameters
(e.g. trainable, optimize_attr, ...) will be printed when with_details is True (e.g. trainable, optimize_attr, ...) will be printed when
with_details is True. Default False;
Returns(str): The debug string.
Returns:
str: The debug string.
""" """
assert isinstance(throw_on_error, bool) and isinstance(with_details, assert isinstance(throw_on_error, bool) and isinstance(with_details,
bool) bool)
...@@ -276,6 +294,15 @@ class Variable(object): ...@@ -276,6 +294,15 @@ class Variable(object):
__repr__ = __str__ __repr__ = __str__
def set_desc(self, input): def set_desc(self, input):
"""
Set the variable description.
Args:
input(core.VarDesc): The new VarDesc.
Returns:
None
"""
self.desc = input self.desc = input
@property @property
...@@ -312,6 +339,15 @@ class Variable(object): ...@@ -312,6 +339,15 @@ class Variable(object):
return self.desc.type() return self.desc.type()
def set_error_clip(self, error_clip): def set_error_clip(self, error_clip):
"""
Set the error_clip.
Args:
error_clip(BaseErrorClipAttr) : The new error_clip.
Returns:
None
"""
self.error_clip = error_clip self.error_clip = error_clip
...@@ -319,8 +355,8 @@ def get_all_op_protos(): ...@@ -319,8 +355,8 @@ def get_all_op_protos():
""" """
Get all registered op proto from PaddlePaddle C++ end. Get all registered op proto from PaddlePaddle C++ end.
Returns(list): list of OpProto Returns:
list: list of OpProto.
""" """
protostrs = core.get_all_op_protos() protostrs = core.get_all_op_protos()
ret_values = [] ret_values = []
...@@ -373,9 +409,45 @@ class OpProtoHolder(object): ...@@ -373,9 +409,45 @@ class OpProtoHolder(object):
class Operator(object): class Operator(object):
""" """
Python Operator class. The operator represents the build in instructions in a In Fluid, all the operation are represented by Operator, and Operator
Block. Users can use the build in instructions to describe their neural is regarded as a build in an instruction of a Block. Users can use the
network. build in instructions to describe their neural network.
Args:
block(Block): The block has the current operator.
desc(core.OpDesc): The protobuf description of Operator.
type(str): The type of operator. Default None.
inputs(dict): The input of this Operator. it is a dictionary, for every
element, key is the input parameter name, and value is a list of
variables. Default None.
outputs(dict): The output of this Operator. it is a dictionary, for
every element, key is the input parameter name, and value is a list
of variables. Default None.
attrs(dict): The attributes of this Operator. it is a dictionary, for
every element, key is attribute name, and value is the attribute value.
The attribute type should be as same as the type registered in C++ side.
Default None.
Returns:
Operator: The initialized Operator.
Raises:
ValueError: If the passed input, output and attrs doesn't match the
initializing Operator's that registered in C++ side.
Notes:
The constructor of operator should not be invoked directly. Use
Block.append_op or Block.prepend_op instead.
Examples:
.. code-block:: python
cur_program = Program()
cur_block = cur_program.current_block()
# var1 += var2 + var3
cur_block.append_op(type="sum",
inputs={"X": [var1, var2, var3]},
outputs={"Out": [var1]})
""" """
OP_WITHOUT_KERNEL_SET = { OP_WITHOUT_KERNEL_SET = {
'feed', 'fetch', 'save', 'load', 'recurrent', 'go', 'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
...@@ -392,31 +464,7 @@ class Operator(object): ...@@ -392,31 +464,7 @@ class Operator(object):
inputs=None, inputs=None,
outputs=None, outputs=None,
attrs=None): attrs=None):
"""
Constructor.
Notes: The constructor of operator should not be invoked directly. Use
Block.append_op or Block.prepend_op instead.
>>> cur_program = Program()
>>> cur_block = cur_program.current_block()
>>> # var1 += var2 + var3
>>> cur_block.append_op(type="sum",
>>> inputs={"X": [var1, var2, var3]},
>>> outputs={"Out": [var1]})
Args:
block(Block): The block has the current operator.
desc(core.OpDesc): The protobuf description.
type(str): The type of operator.
inputs(dict): The input dictionary. Key is the input parameter name.
Value is a list of variables.
outputs(dict): The output dictionary which has the same format with
inputs.
attrs(dict): The attributes dictionary. Key is attribute name. Value
is the attribute value. The attribute type should be as same as
the type registered in C++
"""
self.block = block self.block = block
self.desc = desc self.desc = desc
self.attrs = attrs self.attrs = attrs
...@@ -529,12 +577,14 @@ class Operator(object): ...@@ -529,12 +577,14 @@ class Operator(object):
def to_string(self, throw_on_error): def to_string(self, throw_on_error):
""" """
To debug string. Get debug string.
Args: Args:
throw_on_error(bool): raise exception when self is not initialized throw_on_error(bool): Whether to raise exception if self is not
when throw_on_error is True initialized.
Returns(str): The debug string. Returns:
str: The debug string.
""" """
protostr = self.desc.serialize_to_string() protostr = self.desc.serialize_to_string()
...@@ -552,29 +602,45 @@ class Operator(object): ...@@ -552,29 +602,45 @@ class Operator(object):
def input(self, name): def input(self, name):
""" """
Get input arguments by the input parameter name Get the input arguments according to the input parameter name.
Args:
name(str): The input parameter name
Returns(list): return the list of argument names associated with the Args:
specific parameter name. name(str): The input parameter name.
Returns:
list: return the list of argument names that associated with \
the specific parameter name.
""" """
return self.desc.input(name) return self.desc.input(name)
def rename_input(self, old_name, new_name): def rename_input(self, old_name, new_name):
"""
Rename the `old_name` to `new_name`.
Args:
old_name(str): The old name of the Operator's input.
new_name(str): The new name of the Operator's input.
Returns:
None
"""
self.desc.rename_input(old_name, new_name) self.desc.rename_input(old_name, new_name)
def rename_output(self, old_name, new_name): def rename_output(self, old_name, new_name):
"""
Rename the `old_name` to `new_name`.
Args:
old_name(str): The old name of the Operator's output.
new_name(str): The new name of the Operator's output.
Returns:
None
"""
self.desc.rename_output(old_name, new_name) self.desc.rename_output(old_name, new_name)
@property @property
def input_names(self): def input_names(self):
"""
Get all input parameter names
Returns(list): return a list of input parameter names
"""
return self.desc.input_names() return self.desc.input_names()
@property @property
...@@ -587,33 +653,23 @@ class Operator(object): ...@@ -587,33 +653,23 @@ class Operator(object):
def output(self, name): def output(self, name):
""" """
Get output arguments by the output parameter name Get output arguments by the output parameter name.
Args:
name(str): The output parameter name
Returns(list): return the list of argument names associated with the Args:
specific parameter name. name(str): The output parameter name.
Returns:
list: return the list of argument names associated with \
the specific parameter name.
""" """
return self.desc.output(name) return self.desc.output(name)
@property @property
def output_names(self): def output_names(self):
"""
Get all output parameter names
Returns(list): return a list of output parameter names
"""
return self.desc.output_names() return self.desc.output_names()
@property @property
def idx(self): def idx(self):
"""
Return the array index of current operator.
Returns(int): The array index in block.ops array
Raises:
ValueError: when the operator is not found.
"""
for i, op in enumerate(self.block.ops): for i, op in enumerate(self.block.ops):
if op == self: if op == self:
return i return i
...@@ -622,27 +678,40 @@ class Operator(object): ...@@ -622,27 +678,40 @@ class Operator(object):
def has_attr(self, name): def has_attr(self, name):
""" """
operator has the attribute with name or not. Whether this Operator has the attribute with name or not.
Args: Args:
name(str): the attribute name name(str): the attribute name.
Returns(bool): True if has this attribute. Returns:
bool: True if has this attribute.
""" """
return self.desc.has_attr(name) return self.desc.has_attr(name)
def attr_type(self, name): def attr_type(self, name):
""" """
Get the type of attribute by attribute name Get the type of attribute by attribute's name.
Args:
name(str): the attribute name
Returns(core.AttrType): the attribute type Args:
name(str): the attribute name.
Returns:
core.AttrType: the attribute type.
""" """
return self.desc.attr_type(name) return self.desc.attr_type(name)
def set_attr(self, name, val): def set_attr(self, name, val):
"""
Set the value of attribute by attribute's name.
Args:
name(str): the attribute name.
val(bool|int|str|float|list): the value of the attribute.
Raises:
ValueError: If the type of value doesn't match with desc.attr_type(name).
"""
self.attrs[name] = val self.attrs[name] = val
if isinstance(val, Block): if isinstance(val, Block):
self.desc.set_block_attr(name, val.desc) self.desc.set_block_attr(name, val.desc)
...@@ -654,40 +723,39 @@ class Operator(object): ...@@ -654,40 +723,39 @@ class Operator(object):
@property @property
def attr_names(self): def attr_names(self):
"""
Get all attribute names
Returns(list): The list of attribute name
"""
return self.desc.attr_names() return self.desc.attr_names()
def attr(self, name): def attr(self, name):
""" """
Get attribute by name Get the attribute by name.
Args: Args:
name(str): the attribute name name(str): the attribute name.
Returns(bool|int|str|float|list): The attribute value. The return value Returns:
bool|int|str|float|list: The attribute value. The return value
can be any valid attribute type. can be any valid attribute type.
""" """
return self.desc.attr(name) return self.desc.attr(name)
def block_attr(self, name): def block_attr(self, name):
""" """
Get the block attribute by name Get the block attribute by name.
Args:
name(str): the attribute name
Returns(int): the block index Args:
name(str): the attribute name.
Returns:
int: the block index.
""" """
return self.desc.block_attr(name) return self.desc.block_attr(name)
def all_attrs(self): def all_attrs(self):
""" """
Get the attribute dict Get the attribute dict.
Returns(dict): The Operator's attribute dict
Returns:
dict: The Operator's attribute dict.
""" """
attr_names = self.attr_names attr_names = self.attr_names
attr_map = {} attr_map = {}
...@@ -700,6 +768,35 @@ class Operator(object): ...@@ -700,6 +768,35 @@ class Operator(object):
class Block(object): class Block(object):
"""
In Fluid, a Program is consistence of multi-Block, and Block stores
VarDesc and OpDesc. In a specific Block, a VarDesc have a unique name.
One block could have some child blocks, and child block's name scopes
should inherit the parent's so that OpDesc in child block can reference
a VarDesc that is stored in the parent block.
Please reference the framework.proto for details.
Args:
program(Program): The Program that the Block belongs to.
idx(int): The block's id in the Program.
Notes:
The constructor of Block should not be invoked directly. Please
use `Program.create_block()` to create a block.
Examples:
.. code-block:: python
cur_program = Program()
cur_block = cur_program.current_block()
var = cur_block.create_var(name="X",
shape=[-1, 23, 48],
dtype='float32')
cur_block.append_op(type="abs",
inputs={"X": [var]},
outputs={"Out": [var]})
"""
def __init__(self, program, idx): def __init__(self, program, idx):
self.desc = program.desc.block(idx) self.desc = program.desc.block(idx)
self.vars = collections.OrderedDict() # var_name --> var self.vars = collections.OrderedDict() # var_name --> var
...@@ -712,15 +809,17 @@ class Block(object): ...@@ -712,15 +809,17 @@ class Block(object):
def to_string(self, throw_on_error, with_details=False): def to_string(self, throw_on_error, with_details=False):
""" """
To debug string. Get debug string.
Args: Args:
throw_on_error(bool): raise exception when self is not initialized throw_on_error(bool): raise exception when self is not initialized
when throw_on_error is True when throw_on_error is True.
with_details(bool): more details about variables and parameters with_details(bool): more details about variables and parameters
(e.g. trainable, optimize_attr, ...) will be printed when with_details is True (e.g. trainable, optimize_attr, ...) will be printed when
with_details is True. Default False.
Returns(str): The debug string.
Returns:
str: The debug string.
""" """
assert isinstance(throw_on_error, bool) and isinstance(with_details, assert isinstance(throw_on_error, bool) and isinstance(with_details,
bool) bool)
...@@ -752,6 +851,15 @@ class Block(object): ...@@ -752,6 +851,15 @@ class Block(object):
return self.desc.get_forward_block_idx() return self.desc.get_forward_block_idx()
def set_forward_block_idx(self, idx): def set_forward_block_idx(self, idx):
"""
Set the forward block Idx.
Args:
idx(int): the block index.
Returns:
None
"""
self.desc.set_forward_block_idx(idx) self.desc.set_forward_block_idx(idx)
@property @property
...@@ -759,6 +867,19 @@ class Block(object): ...@@ -759,6 +867,19 @@ class Block(object):
return self.desc.id return self.desc.id
def var(self, name): def var(self, name):
"""
Get a Variable by name from this block.
Args:
name(str): the Variable's name.
Raises:
ValueError: The If input's type is not str, or this block
doesn't have a Variable with the giving name.
Returns:
Variable: the Variable with the giving name.
"""
if not isinstance(name, basestring): if not isinstance(name, basestring):
raise TypeError( raise TypeError(
"var require string as parameter, but get %s instead." % "var require string as parameter, but get %s instead." %
...@@ -769,6 +890,19 @@ class Block(object): ...@@ -769,6 +890,19 @@ class Block(object):
return v return v
def var_recursive(self, name): def var_recursive(self, name):
"""
Get a Variable by name from this block recursively.
Args:
name(str): the Variable's name.
Raises:
ValueError: this block and this parent block doesn't
have a Variable with the giving name.
Returns:
Variable: the Variable with the giving name.
"""
frontier = list() frontier = list()
visited = set() visited = set()
...@@ -815,6 +949,18 @@ class Block(object): ...@@ -815,6 +949,18 @@ class Block(object):
def rename_var(self, name, new_name): def rename_var(self, name, new_name):
""" """
Rename variable in vars and ops' inputs and outputs Rename variable in vars and ops' inputs and outputs
Args:
name(str): the name that need to be renamed.
new_name(str): the name that need to rename to.
Raises:
ValueError: If this block doesn't have this the giving name,
or the type of the var with the giving name is not Parameter
or Variable.
Returns:
Variable: the Variable with the giving name.
""" """
if not self.has_var(name): if not self.has_var(name):
raise ValueError("var %s is not in current block" % name) raise ValueError("var %s is not in current block" % name)
...@@ -878,12 +1024,27 @@ class Block(object): ...@@ -878,12 +1024,27 @@ class Block(object):
return param return param
def append_op(self, *args, **kwargs): def append_op(self, *args, **kwargs):
"""
Appends a new Operator according to the giving arguments.
Returns:
Operator: the append Operator.
"""
op_desc = self.desc.append_op() op_desc = self.desc.append_op()
op = Operator(block=self, desc=op_desc, *args, **kwargs) op = Operator(block=self, desc=op_desc, *args, **kwargs)
self.ops.append(op) self.ops.append(op)
return op return op
def insert_op(self, index, *args, **kwargs): def insert_op(self, index, *args, **kwargs):
"""
Insert a Operator according to the giving arguments.
Args:
index(int): the place that the operator to insert.
Returns:
Operator: the insert Operator.
"""
self.sync_with_cpp() self.sync_with_cpp()
op_desc = self.desc.insert_op(index) op_desc = self.desc.insert_op(index)
op = Operator(block=self, desc=op_desc, *args, **kwargs) op = Operator(block=self, desc=op_desc, *args, **kwargs)
...@@ -891,11 +1052,30 @@ class Block(object): ...@@ -891,11 +1052,30 @@ class Block(object):
return op return op
def remove_op(self, index): def remove_op(self, index):
"""
Remove the specific position operator.
Args:
index(int): the position that the operator to insert.
Returns:
None
"""
self.sync_with_cpp() self.sync_with_cpp()
self.desc.remove_op(index, index + 1) self.desc.remove_op(index, index + 1)
del self.ops[index] del self.ops[index]
def slice_ops(self, start, end): def slice_ops(self, start, end):
"""
Return the Operator between start and end.
Args:
start(int): the start position.
end(int): the end position.
Returns:
list: the Operators between start and end.
"""
return self.ops[start:end] return self.ops[start:end]
def prepend_op(self, *args, **kwargs): def prepend_op(self, *args, **kwargs):
...@@ -906,9 +1086,8 @@ class Block(object): ...@@ -906,9 +1086,8 @@ class Block(object):
def sync_with_cpp(self): def sync_with_cpp(self):
""" """
Sync from the desc on the c++ end. Sync from the desc on the c++ end. This method is used to synchronize
the c++ desc instance generated by backward.
This method is used to synchronize the c++ desc instance generated by backward.
""" """
# sync variables from cpp # sync variables from cpp
for var in self.desc.all_vars(): for var in self.desc.all_vars():
...@@ -973,9 +1152,14 @@ class Block(object): ...@@ -973,9 +1152,14 @@ class Block(object):
def copy_param_info_from(self, other): def copy_param_info_from(self, other):
""" """
Copy the information of parameters from the other block Copy the information of parameters from the other block.
Args: Args:
other(Block): the other block other(Block): the other block.
Raises:
ValueError: If type of input is not Block, or the `other` and this
block is not in the same topology.
Returns: Returns:
None None
...@@ -1007,11 +1191,12 @@ class Block(object): ...@@ -1007,11 +1191,12 @@ class Block(object):
def clone_variable(self, var): def clone_variable(self, var):
""" """
Clone a variable into current block. Clone a variable into current block.
Args: Args:
var: the variable to be cloned. var: the variable to be cloned.
Returns: Returns:
The new variable cloned from 'var' in current block. Variable: the new variable cloned from 'var' in current block.
""" """
assert isinstance(var, Variable) assert isinstance(var, Variable)
ret_var = None ret_var = None
...@@ -1051,23 +1236,18 @@ class Program(object): ...@@ -1051,23 +1236,18 @@ class Program(object):
Notes: we have default_startup_program and default_main_program Notes: we have default_startup_program and default_main_program
by default, a pair of them will shared the parameters. by default, a pair of them will shared the parameters.
The default_startup_program only run once to initialize parameters, The default_startup_program only run once to initialize parameters,
default_main_program run in every minibatch and adjust the weights. default_main_program run in every mini batch and adjust the weights.
Args:
None
Returns: Returns:
Python Program A empty program.
Examples: Examples:
.. code-block:: python >>> main_program = fluid.Program()
>>> startup_program = fluid.Program()
main_program = Program() >>> with fluid.program_guard(main_program=main_program, startup_program=startup_program):
startup_program = Program() >>> fluid.layers.data(name="x", shape=[-1, 784], dtype='float32')
with fluid.program_guard(main_program=main_program, startup_program=startup_program): >>> fluid.layers.data(name="y", shape=[-1, 1], dtype='int32')
fluid.layers.data(name="x", shape=[-1, 784], dtype='float32') >>> fluid.layers.fc(name="fc", shape=[10], dtype='float32', act="relu")
fluid.layers.data(name="y", shape=[-1, 1], dtype='int32')
fluid.layers.fc(name="fc", shape=[10], dtype='float32', act="relu")
""" """
...@@ -1081,6 +1261,19 @@ class Program(object): ...@@ -1081,6 +1261,19 @@ class Program(object):
@property @property
def op_role(self): def op_role(self):
"""
The operator role. In a enum {Forward, Backward, Optimize}.
Notes: this is a low level API. It is used only for ParallelExecutor to
duplicate or schedule operator to devices.
For example, the forward operator should be executed on every device.
The backward operator should be executed on every device and the
parameter gradient of backward (use :code:`op_role_var` to get this
variable) operator should be merged to one device. The optimization
operators should be executed on only one device and broadcast the
optimization result, i.e., the new parameter, to every other device.
"""
return self._current_role return self._current_role
@op_role.setter @op_role.setter
...@@ -1089,6 +1282,13 @@ class Program(object): ...@@ -1089,6 +1282,13 @@ class Program(object):
@property @property
def op_role_var(self): def op_role_var(self):
"""
The auxiliary variables for :code:`op_role` property.
See Also: :code:`Program.op_role`'s documentation for details.
Notes: This is a very low-level API. Users should not use it directly.
"""
return self._op_role_var return self._op_role_var
@op_role_var.setter @op_role_var.setter
...@@ -1097,6 +1297,21 @@ class Program(object): ...@@ -1097,6 +1297,21 @@ class Program(object):
@contextlib.contextmanager @contextlib.contextmanager
def optimized_guard(self, var): def optimized_guard(self, var):
"""
A with guard to set :code:`Optimization` :code:`OpRole` and
:code:`OpRoleVar` automatically.
Notes: This is a very low level API. Users should not use it directly.
Args:
var(Variable|str): The variable (name) to be optimized.
Examples:
>>> p, g = backward(...)
>>> with program.optimized_guard(p):
>>> p = p - 0.001 * g
"""
OpRole = core.op_proto_and_checker_maker.OpRole OpRole = core.op_proto_and_checker_maker.OpRole
self._current_role = OpRole.Optimize self._current_role = OpRole.Optimize
self._op_role_var = [var.name if isinstance(var, Variable) else var] self._op_role_var = [var.name if isinstance(var, Variable) else var]
...@@ -1105,18 +1320,35 @@ class Program(object): ...@@ -1105,18 +1320,35 @@ class Program(object):
self._current_role = OpRole.Forward self._current_role = OpRole.Forward
def __str__(self): def __str__(self):
"""
Get the protobuf debug string of this Program.
Returns:
(str): The protobuf debug string.
Raises:
ValueError: If any of required fields is not set.
"""
return self.to_string(True) return self.to_string(True)
def to_string(self, throw_on_error, with_details=False): def to_string(self, throw_on_error, with_details=False):
""" """
To debug string. To debug string.
Args: Args:
throw_on_error(bool): raise exception when self is not initialized throw_on_error(bool): raise Value error when any of required fields
when throw_on_error is True is not set.
with_details(bool): more details about variables and parameters
(e.g. trainable, optimize_attr, ...) will be printed when with_details is True
Returns(str): The debug string. with_details(bool): True if more details about variables and
parameters, e.g., :code:`trainable`, :code:`optimize_attr`, need
to print.
Returns
(str): The debug string.
Raises:
ValueError: If any of required fields is not set and throw_on_error is
True.
""" """
assert isinstance(throw_on_error, bool) and isinstance(with_details, assert isinstance(throw_on_error, bool) and isinstance(with_details,
...@@ -1132,25 +1364,89 @@ class Program(object): ...@@ -1132,25 +1364,89 @@ class Program(object):
return res_str return res_str
def get_desc(self): def get_desc(self):
"""
Get the C++ side of `ProgramDesc` object pointer. The C++ object is
exposed by :code:`pybind`.
Notes: This is a very low level API. Users should not use this API
directly.
"""
return self.desc return self.desc
def clone(self, for_test=False): def clone(self, for_test=False):
"""Clone the Program object """
Args: Create a new, duplicated program.
for_test(bool): indicate whether clone for test.
Set for_test to False when we want to clone the program for training.
Set for_test to True when we want to clone the program for testing. Some operators, e.g., :code:`batch_norm`, behave differently between
training and testing. They have an attribute, :code:`is_test`, to
control this behaviour. This method will change the :code:`is_test`
attribute of them to :code:`True` when :code:`for_test=True`.
* Set for_test to False when we want to clone the program for training.
* Set for_test to True when we want to clone the program for testing.
Notes: This API DOES NOT prune any operator. Use
:code:`clone(for_test=True)` before backward and optimization please.
Args: Args:
for_test(bool): Some operators, such as batch_norm and drop_out ops, for_test(bool): True if change the :code:`is_test` attribute of
behave differently in training and testing. If for_test is True, operators to :code:`True`.
the is_test attributes in these operators will be set to True for
testing purposes, otherwise, they remain unchanged.
Returns: Returns:
Program: The cloned Program object. Program: The new, duplicated Program object.
Examples:
1. To clone a test program, the sample code is:
>>> import paddle.fluid as fluid
>>> train_program = fluid.Program()
>>> startup_program = fluid.Program()
>>> with fluid.program_guard(train_program, startup_program):
>>> img = fluid.layers.data(name='image', shape=[784])
>>> hidden = fluid.layers.fc(input=img, size=200, act='relu')
>>> hidden = fluid.layers.dropout(hidden, dropout_prob=0.5)
>>> loss = fluid.layers.cross_entropy(
>>> input=fluid.layers.fc(hidden, size=10, act='softmax'),
>>> label=fluid.layers.data(name='label', shape=[1], dtype='int64'))
>>>
>>> test_program = train_program.clone(for_test=True)
>>>
>>> sgd = fluid.optimizer.SGD(learning_rate=1e-3)
>>> with fluid.program_guard(train_program, startup_program):
>>> sgd.minimize(loss)
2. The :code:`clone` method can be avoid if you create program for
training and program for testing individually.
>>> import paddle.fluid as fluid
>>>
>>> def network(is_test):
>>> img = fluid.layers.data(name='image', shape=[784])
>>> hidden = fluid.layers.fc(input=img, size=200, act='relu')
>>> hidden = fluid.layers.dropout(hidden, dropout_prob=0.5, is_test=is_test)
>>> loss = fluid.layers.cross_entropy(
>>> input=fluid.layers.fc(hidden, size=10, act='softmax'),
>>> label=fluid.layers.data(name='label', shape=[1], dtype='int64'))
>>> return loss
>>>
>>> train_program = fluid.Program()
>>> startup_program = fluid.Program()
>>> test_program = fluid.Program()
>>>
>>> with fluid.program_guard(train_program, startup_program):
>>> with fluid.unique_name.guard():
>>> loss = network(is_test=False)
>>> sgd = fluid.optimizer.SGD(learning_rate=1e-3)
>>> sgd.minimize(loss)
>>>
>>> # the test startup program is not used.
>>> with fluid.program_guard(test_program, fluid.Program()):
>>> with fluid.unique_name.guard():
>>> loss = network(is_test=True)
The two code snippets above will generate same programs.
""" """
if for_test: if for_test:
p = self.inference_optimize() p = self.inference_optimize()
...@@ -1165,6 +1461,21 @@ class Program(object): ...@@ -1165,6 +1461,21 @@ class Program(object):
return p return p
def prune(self, targets): def prune(self, targets):
"""
Prune operators and variables which are not needed to generate
:code:`targets`.
Notes: This is a very low level API. Users should not use this API
directly. This API is in flux and not stable.
Args:
targets(list|Variable|Operator): A list of variables or operators
need to be pruned
Returns:
Program: A new, pruned program.
"""
if not isinstance(targets, list): if not isinstance(targets, list):
targets = [targets] targets = [targets]
targets_idx = [] targets_idx = []
...@@ -1199,6 +1510,17 @@ class Program(object): ...@@ -1199,6 +1510,17 @@ class Program(object):
return res return res
def inference_optimize(self): def inference_optimize(self):
"""
This method will create a new program and change the :code:`is_test`
attribute of operators to :code:`True`. All the :code:`Parameter`
information will be lost.
Notes: This API is a very low level API. Use
:code:`Program.clone(for_test=True)` instead.
Returns:
Program: The new program.
"""
# this is an alternative implement before # this is an alternative implement before
# core.inference_optimize being fixed. # core.inference_optimize being fixed.
res = Program() res = Program()
...@@ -1215,6 +1537,18 @@ class Program(object): ...@@ -1215,6 +1537,18 @@ class Program(object):
@staticmethod @staticmethod
def parse_from_string(binary_str): def parse_from_string(binary_str):
"""
Deserialize a program desc from protobuf binary string.
Notes: All information about parameters will be lost after serialization
and deserialization.
Args:
binary_str(str): The binary prootbuf string.
Returns:
Program: A deserialized program desc.
"""
p = Program() p = Program()
p.desc = core.ProgramDesc(binary_str) p.desc = core.ProgramDesc(binary_str)
p.blocks = [Block(p, i) for i in xrange(p.desc.num_blocks())] p.blocks = [Block(p, i) for i in xrange(p.desc.num_blocks())]
...@@ -1223,10 +1557,19 @@ class Program(object): ...@@ -1223,10 +1557,19 @@ class Program(object):
@property @property
def random_seed(self): def random_seed(self):
"""
The default random seed for random operators in Program. Zero means get
the random seed from random device.
Notes: It must be set before the operators have been added.
"""
return self._seed return self._seed
@property @property
def num_blocks(self): def num_blocks(self):
"""
The number of blocks in this program.
"""
return self.desc.num_blocks() return self.desc.num_blocks()
@random_seed.setter @random_seed.setter
...@@ -1239,15 +1582,40 @@ class Program(object): ...@@ -1239,15 +1582,40 @@ class Program(object):
return str(self) return str(self)
def global_block(self): def global_block(self):
"""
Get the first block of this program.
"""
return self.blocks[0] return self.blocks[0]
def block(self, index): def block(self, index):
"""
Get the :code:`index` block of this program
Args:
index(int): The index of block to get
Returns:
Block: The :code:`index` block
"""
return self.blocks[index] return self.blocks[index]
def current_block(self): def current_block(self):
"""
Get the current block. The :code:`current` block is the block to append
operators.
"""
return self.blocks[self.current_block_idx] return self.blocks[self.current_block_idx]
def create_block(self, parent_idx=None): def create_block(self, parent_idx=None):
"""
Create a new block with the :code:`parent_idx` and change the current block
to new block.
Args:
parent_idx(int): The parent block index.
Returns:
Block: The new block.
"""
new_block_idx = len(self.blocks) new_block_idx = len(self.blocks)
parent = self.current_block() if parent_idx is None else self.block( parent = self.current_block() if parent_idx is None else self.block(
parent_idx) parent_idx)
...@@ -1257,9 +1625,24 @@ class Program(object): ...@@ -1257,9 +1625,24 @@ class Program(object):
return self.current_block() return self.current_block()
def rollback(self): def rollback(self):
"""
Exit a code block, i.e., roll back to the parent block.
Returns:
None
"""
self.current_block_idx = self.current_block().parent_idx self.current_block_idx = self.current_block().parent_idx
def sync_with_cpp(self): def sync_with_cpp(self):
"""
Synchronize Python instance to its binding C++ object instance.
If the program is modified in C++ space, this method should be invoked.
Notes: This is a very low level API. Users should not invoke it
directly.
Returns:
None
"""
for block_idx in range(len(self.blocks), self.desc.num_blocks()): for block_idx in range(len(self.blocks), self.desc.num_blocks()):
self.blocks.append(Block(self, block_idx)) self.blocks.append(Block(self, block_idx))
for block in self.blocks: for block in self.blocks:
...@@ -1269,6 +1652,9 @@ class Program(object): ...@@ -1269,6 +1652,9 @@ class Program(object):
""" """
Copy the information of parameters from other program. Copy the information of parameters from other program.
Notes: This is a very low level API. Users should not invoke it
directly.
Args: Args:
other(Program): Other program other(Program): Other program
...@@ -1288,6 +1674,9 @@ class Program(object): ...@@ -1288,6 +1674,9 @@ class Program(object):
""" """
Copy the information of data variables from other program. Copy the information of data variables from other program.
Notes: This is a very low level API. Users should not invoke it
directly.
Args: Args:
other(Program): Other program other(Program): Other program
...@@ -1306,12 +1695,41 @@ class Program(object): ...@@ -1306,12 +1695,41 @@ class Program(object):
self.global_block().var(var.name).is_data = True self.global_block().var(var.name).is_data = True
def list_vars(self): def list_vars(self):
"""
Get all variables from this Program. A iterable object is returned.
Returns:
iterable: The generator will yield every variable in this program.
"""
for each_block in self.blocks: for each_block in self.blocks:
for each_var in each_block.vars.itervalues(): for each_var in each_block.vars.itervalues():
yield each_var yield each_var
class Parameter(Variable): class Parameter(Variable):
"""
Parameter is derived from Variable. A parameter is a persistable
Variable, and will be updated by optimizers after each iteration.
The training of a neural network is essentially the updating of
its parameters.
Relative to a general Variable, a Parameter has several its own
member variables:
Args:
trainable(bool): True if the parameter need to be updated after
iterations.
optimize_attr(map): Parameter attributes related with optimizing.
Currently, it only contains 'learning_rate'.
Default: {'learning_rate': 1.0}
regularizer(WeightDecayRegularizer): The Regularizer which will
be applied on the parameter. Default: None
gradient_clip_attr(BaseGradientClipAttr): The gradint clip strategy
which will be applied on the parameter. Default: None
do_model_average(bool): True if the model average strategy will
be applied on this parameter.
"""
def __init__(self, block, shape, dtype, **kwargs): def __init__(self, block, shape, dtype, **kwargs):
if shape is None or dtype is None: if shape is None or dtype is None:
raise ValueError("Parameter must set shape and dtype") raise ValueError("Parameter must set shape and dtype")
...@@ -1374,8 +1792,15 @@ _startup_program_ = Program() ...@@ -1374,8 +1792,15 @@ _startup_program_ = Program()
def default_startup_program(): def default_startup_program():
""" """
Get default startup program. In startup program, Paddle will initialize Get default/global startup program.
parameters, initialize nccl handle, etc.
The layer function in :code:`fluid.layers` will create parameters, readers,
NCCL handles as global variables. The :code:`startup_program` will
initialize them by the operators in startup program. The layer function will
append these initialization operators into startup program.
This method will return the :code:`default` or the :code:`current` startup
program. Users can use :code:`fluid.program_guard` to switch program.
Returns: Returns:
Program: startup program Program: startup program
...@@ -1385,7 +1810,15 @@ def default_startup_program(): ...@@ -1385,7 +1810,15 @@ def default_startup_program():
def default_main_program(): def default_main_program():
""" """
Get default main program. The main program is used for training or testing. Get default/global main program. The main program is used for training or
testing.
All layer function in :code:`fluid.layers` will append operators and
variables to the :code:`default_main_program`.
The :code:`default_main_program` is the default program in a lot of APIs.
For example, the :code:`Executor.run()` will execute the
:code:`default_main_program` when the program is not specified.
Returns: Returns:
Program: main program Program: main program
...@@ -1427,20 +1860,34 @@ def switch_startup_program(program): ...@@ -1427,20 +1860,34 @@ def switch_startup_program(program):
@contextlib.contextmanager @contextlib.contextmanager
def program_guard(main_program, startup_program=None): def program_guard(main_program, startup_program=None):
""" """
Switch program with `with` statement Change the global main program and startup program with `with` statement.
Layer functions in the Python `with` block will append operators and
variables to the new main programs.
Examples: Examples:
>>> with program_guard(Program()):
>>> import paddle.fluid as fluid
>>> main_program = fluid.Program()
>>> startup_program = fluid.Program()
>>> with fluid.program_guard(main_program, startup_program):
>>> data = fluid.layers.data(...) >>> data = fluid.layers.data(...)
>>> hidden = fluid.layers.fc(...) >>> hidden = fluid.layers.fc(...)
Notes: The temporary :code:`Program` can be used if the user does not need
to construct either of startup program or main program.
Examples:
>>> import paddle.fluid as fluid
>>> main_program = fluid.Program()
>>> # does not care about startup program. Just pass a temporary value.
>>> with fluid.program_guard(main_program, fluid.Program()):
>>> data = ...
Args: Args:
main_program(Program): New main program inside `with` statement main_program(Program): New main program inside `with` statement.
startup_program(Program): New startup program inside `with` statement. startup_program(Program): New startup program inside `with` statement.
None means do not change startup program. None means do not change startup program.
Returns:
None
""" """
if not isinstance(main_program, Program): if not isinstance(main_program, Program):
raise TypeError("main_program should be Program") raise TypeError("main_program should be Program")
...@@ -1457,7 +1904,8 @@ def program_guard(main_program, startup_program=None): ...@@ -1457,7 +1904,8 @@ def program_guard(main_program, startup_program=None):
def get_var(name, program=None): def get_var(name, program=None):
""" """
Get a variable by name from the global block of a program Get a variable by name from the global block of a program.
Args: Args:
name(str): name of the variable name(str): name of the variable
program(Program|None): program object. program(Program|None): program object.
......
...@@ -19,33 +19,41 @@ __all__ = ['create_lod_tensor', 'create_random_int_lodtensor'] ...@@ -19,33 +19,41 @@ __all__ = ['create_lod_tensor', 'create_random_int_lodtensor']
def create_lod_tensor(data, lod, place): def create_lod_tensor(data, lod, place):
"""Create a lod tensor from a numpy array, a list, or an existing lod tensor. """
Create a lod tensor from a numpy array, a list, or an existing lod tensor.
Create a lod tensor by doing the following: Create a lod tensor by doing the following:
1. Check that the length-based input lod is valid. 1. Check that the length-based input lod is valid.
2. Convert the length-based lod to a offset-based LoD. 2. Convert the length-based lod to a offset-based LoD.
3. Copy the data from a numpy array, a list or a existing lod tensor to 3. Copy the data from a numpy array, a list or a existing lod tensor to
CPU or GPU device (based on input place). CPU or GPU device (based on input place).
4. Set the level of detail (LoD) using the offset-based LoD. 4. Set the level of detail (LoD) using the offset-based LoD.
Use example: Examples:
Suppose we want LoDTensor to hold data for sequences of word, where each word is
represented by an integer. If we want to create a LoDTensor to represent two
sentences, one of 2 words, and one of 3 words.
Then 'data' can be a numpy array of integers with shape (5, 1). Suppose we want LoDTensor to hold data for sequences of word, where each
'lod' will be [[2, 3]], indicating the length(# of words) in each sentence. word is represented by an integer. If we want to create a LoDTensor to
This length-based input lod [[2, 3]] will be converted to offset-based lod [[0, 2, 5]] represent two sentences, one of 2 words, and one of 3 words.
inside the function call.
Please refer to Then :code:`data` can be a numpy array of integers with shape (5, 1).
github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md :code:`lod` will be [[2, 3]], indicating the length(# of words) in each
for more details regarding LoD. sentence. This length-based input lod [[2, 3]] will be converted to
offset-based lod [[0, 2, 5]] inside the function call.
Please reference :ref:`api_guide_low_level_lod_tensor` for more details
regarding LoD.
Args: Args:
data: a numpy array or a LoDTensor or a list holding the data to be copied. data(numpy.ndarray|list|LoDTensor): a numpy array or a LoDTensor or a
lod: a list of lists indicating the length-based LoD info specified by the user. list holding the data to be copied.
place: CPU or GPU place indicating where the data in the new LoDTensor will be stored. lod(list): a list of lists indicating the length-based LoD info
specified by the user.
place(Place): CPU or GPU place indicating where the data in the new
LoDTensor will be stored.
Returns: Returns:
A fluid LoDTensor object with tensor data and lod info. A fluid LoDTensor object with tensor data and lod info.
...@@ -77,31 +85,38 @@ def create_lod_tensor(data, lod, place): ...@@ -77,31 +85,38 @@ def create_lod_tensor(data, lod, place):
def create_random_int_lodtensor(lod, base_shape, place, low, high): def create_random_int_lodtensor(lod, base_shape, place, low, high):
"""Create a LoDTensor containing random integers. """
Create a LoDTensor containing random integers.
This function is frequently used in the book examples. So we revised it based on This function is frequently used in the book examples. So we revised it
the new create_lod_tensor API and put it here in the lod_tensor module to simplify based on the new create_lod_tensor API and put it here in the lod_tensor
the code. module to simplify the code.
The function does the following: The function does the following:
1. Calculate the overall shape of the LoDTensor based on the length-based 'lod' input
and the shape of the basic element in 'base_shape'. 1. Calculate the overall shape of the LoDTensor based on the length-based
:code:`lod` input and the shape of the basic element in
:code:`base_shape`.
2. Create a numpy array of this shape. 2. Create a numpy array of this shape.
3. Create the LoDTensor using create_lod_tensor API. 3. Create the LoDTensor using create_lod_tensor API.
Suppose we want LoDTensor to hold data for sequences of word, where each word is Suppose we want LoDTensor to hold data for sequences of word, where each
represented by an integer. If we want to create a LoDTensor to represent two word is represented by an integer. If we want to create a LoDTensor to
sentences, one of 2 words, and one of 3 words. Then 'base_shape' is [1], input represent two sentences, one of 2 words, and one of 3 words. Then
length-based 'lod' is [[2, 3]]. Then the overall shape of the LoDTensor would be 'base_shape' is [1], input length-based 'lod' is [[2, 3]]. Then the overall
[5, 1], holding 5 words for two sentences. shape of the LoDTensor would be [5, 1], holding 5 words for two sentences.
Args: Args:
data: a numpy array or a LoDTensor holding the data to be copied. lod(list): a list of lists indicating the length-based LoD info
lod: a list of lists indicating the length-based LoD info specified by the user. specified by the user.
base_shape: the shape of the basic element to be held by the LoDTensor. base_shape(list): the shape of the basic element to be held by the
place: CPU or GPU place indicating where the data in the new LoDTensor will be stored. LoDTensor.
low: the lower bound of the random integers. place(Place): CPU or GPU place indicating where the data in the new
high: the upper bound of the random integers. LoDTensor will be stored.
low(int): the lower bound of the random integers.
high(int): the upper bound of the random integers.
Returns: Returns:
A fluid LoDTensor object with tensor data and lod info. A fluid LoDTensor object with tensor data and lod info.
......
...@@ -325,14 +325,14 @@ class Auc(MetricBase): ...@@ -325,14 +325,14 @@ class Auc(MetricBase):
""" """
def __init__(self, name, curve='ROC', num_thresholds=200): def __init__(self, name, curve='ROC', num_thresholds=200):
super(MetricBase, self).__init__(name, curve, num_thresholds) super(Auc, self).__init__(name=name)
self._curve = curve self._curve = curve
self._num_thresholds = num_thresholds self._num_thresholds = num_thresholds
self._epsilon = 1e-6 self._epsilon = 1e-6
self.tp_list = np.ndarray((num_thresholds, )) self.tp_list = np.zeros((num_thresholds, ))
self.fn_list = np.ndarray((num_thresholds, )) self.fn_list = np.zeros((num_thresholds, ))
self.tn_list = np.ndarray((num_thresholds, )) self.tn_list = np.zeros((num_thresholds, ))
self.fp_list = np.ndarray((num_thresholds, )) self.fp_list = np.zeros((num_thresholds, ))
def update(self, labels, predictions, axis=1): def update(self, labels, predictions, axis=1):
if not _is_numpy_(labels): if not _is_numpy_(labels):
...@@ -350,12 +350,12 @@ class Auc(MetricBase): ...@@ -350,12 +350,12 @@ class Auc(MetricBase):
tp, fn, tn, fp = 0, 0, 0, 0 tp, fn, tn, fp = 0, 0, 0, 0
for i, lbl in enumerate(labels): for i, lbl in enumerate(labels):
if lbl: if lbl:
if predictions[i, 0] >= thresh: if predictions[i, 1] >= thresh:
tp += 1 tp += 1
else: else:
fn += 1 fn += 1
else: else:
if predictions[i, 0] >= thresh: if predictions[i, 1] >= thresh:
fp += 1 fp += 1
else: else:
tn += 1 tn += 1
......
...@@ -26,16 +26,87 @@ def simple_img_conv_pool(input, ...@@ -26,16 +26,87 @@ def simple_img_conv_pool(input,
filter_size, filter_size,
pool_size, pool_size,
pool_stride, pool_stride,
act, pool_padding=0,
param_attr=None,
pool_type='max', pool_type='max',
global_pooling=False,
conv_stride=1,
conv_padding=0,
conv_dilation=1,
conv_groups=1,
param_attr=None,
bias_attr=None,
act=None,
use_cudnn=True, use_cudnn=True,
use_mkldnn=False): use_mkldnn=False):
"""
The simple_img_conv_pool is composed with one Convolution2d and one Pool2d.
Args:
input (Variable): The input image with [N, C, H, W] format.
num_filters(int): The number of filter. It is as same as the output
feature channel.
filter_size (int|list|tuple): The filter size. If filter_size is a list or
tuple, it must contain two integers, (filter_size_H, filter_size_W). Otherwise,
the filter_size_H = filter_size_W = filter_size.
pool_size (int|list|tuple): The pooling size of Pool2d layer. If pool_size
is a list or tuple, it must contain two integers, (pool_size_H, pool_size_W).
Otherwise, the pool_size_H = pool_size_W = pool_size.
pool_stride (int|list|tuple): The pooling stride of Pool2d layer. If pool_stride
is a list or tuple, it must contain two integers, (pooling_stride_H, pooling_stride_W).
Otherwise, the pooling_stride_H = pooling_stride_W = pool_stride.
pool_padding (int|list|tuple): The padding of Pool2d layer. If pool_padding is a list or
tuple, it must contain two integers, (pool_padding_H, pool_padding_W).
Otherwise, the pool_padding_H = pool_padding_W = pool_padding. Default 0.
pool_type (str): Pooling type can be :math:`max` for max-pooling and :math:`avg` for
average-pooling. Default :math:`max`.
global_pooling (bool): Whether to use the global pooling. If global_pooling = true,
pool_size and pool_padding while be ignored. Default False
conv_stride (int|list|tuple): The stride size of the Conv2d Layer. If stride is a
list or tuple, it must contain two integers, (conv_stride_H, conv_stride_W). Otherwise,
the conv_stride_H = conv_stride_W = conv_stride. Default: conv_stride = 1.
conv_padding (int|list|tuple): The padding size of the Conv2d Layer. If padding is
a list or tuple, it must contain two integers, (conv_padding_H, conv_padding_W).
Otherwise, the conv_padding_H = conv_padding_W = conv_padding. Default: conv_padding = 0.
conv_dilation (int|list|tuple): The dilation size of the Conv2d Layer. If dilation is
a list or tuple, it must contain two integers, (conv_dilation_H, conv_dilation_W).
Otherwise, the conv_dilation_H = conv_dilation_W = conv_dilation. Default: conv_dilation = 1.
conv_groups (int): The groups number of the Conv2d Layer. According to grouped
convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
the first half of the filters is only connected to the first half
of the input channels, while the second half of the filters is only
connected to the second half of the input channels. Default: groups=1
param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None
bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None
act (str): Activation type for Conv2d. Default: None
use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True
use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled
with mkldnn library. Default: False
Return:
Variable: The result of input after Convolution2d and Pool2d.
Examples:
.. code-block:: python
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
conv_pool = fluid.nets.simple_img_conv_pool(input=img,
filter_size=5,
num_filters=20,
pool_size=2,
pool_stride=2,
act="relu")
"""
conv_out = layers.conv2d( conv_out = layers.conv2d(
input=input, input=input,
num_filters=num_filters, num_filters=num_filters,
filter_size=filter_size, filter_size=filter_size,
stride=conv_stride,
padding=conv_padding,
dilation=conv_dilation,
groups=conv_groups,
param_attr=param_attr, param_attr=param_attr,
bias_attr=bias_attr,
act=act, act=act,
use_cudnn=use_cudnn, use_cudnn=use_cudnn,
use_mkldnn=use_mkldnn) use_mkldnn=use_mkldnn)
...@@ -45,6 +116,8 @@ def simple_img_conv_pool(input, ...@@ -45,6 +116,8 @@ def simple_img_conv_pool(input,
pool_size=pool_size, pool_size=pool_size,
pool_type=pool_type, pool_type=pool_type,
pool_stride=pool_stride, pool_stride=pool_stride,
pool_padding=pool_padding,
global_pooling=global_pooling,
use_cudnn=use_cudnn, use_cudnn=use_cudnn,
use_mkldnn=use_mkldnn) use_mkldnn=use_mkldnn)
return pool_out return pool_out
...@@ -60,11 +133,65 @@ def img_conv_group(input, ...@@ -60,11 +133,65 @@ def img_conv_group(input,
conv_with_batchnorm=False, conv_with_batchnorm=False,
conv_batchnorm_drop_rate=0.0, conv_batchnorm_drop_rate=0.0,
pool_stride=1, pool_stride=1,
pool_type=None, pool_type="max",
use_cudnn=True, use_cudnn=True,
use_mkldnn=False): use_mkldnn=False):
""" """
Image Convolution Group, Used for vgg net. The Image Convolution Group is composed of Convolution2d, BatchNorm, DropOut,
and Pool2d. According to the input arguments, img_conv_group will do serials of
computation for Input using Convolution2d, BatchNorm, DropOut, and pass the last
result to Pool2d.
Args:
input (Variable): The input image with [N, C, H, W] format.
conv_num_filter(list|tuple): Indicates the numbers of filter of this group.
pool_size (int|list|tuple): The pooling size of Pool2d Layer. If pool_size
is a list or tuple, it must contain two integers, (pool_size_H, pool_size_W).
Otherwise, the pool_size_H = pool_size_W = pool_size.
conv_padding (int|list|tuple): The padding size of the Conv2d Layer. If padding is
a list or tuple, its length must be equal to the length of conv_num_filter.
Otherwise the conv_padding of all Conv2d Layers are the same. Default 1.
conv_filter_size (int|list|tuple): The filter size. If filter_size is a list or
tuple, its length must be equal to the length of conv_num_filter.
Otherwise the conv_filter_size of all Conv2d Layers are the same. Default 3.
conv_act (str): Activation type for Conv2d Layer that is not followed by BatchNorm.
Default: None.
param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None
conv_with_batchnorm (bool|list): Indicates whether to use BatchNorm after Conv2d Layer.
If conv_with_batchnorm is a list, its length must be equal to the length of
conv_num_filter. Otherwise, conv_with_batchnorm indicates whether all the
Conv2d Layer follows a BatchNorm. Default False.
conv_batchnorm_drop_rate (float|list): Indicates the drop_rate of Dropout Layer
after BatchNorm. If conv_batchnorm_drop_rate is a list, its length must be
equal to the length of conv_num_filter. Otherwise, drop_rate of all Dropout
Layers is conv_batchnorm_drop_rate. Default 0.0.
pool_stride (int|list|tuple): The pooling stride of Pool2d layer. If pool_stride
is a list or tuple, it must contain two integers, (pooling_stride_H,
pooling_stride_W). Otherwise, the pooling_stride_H = pooling_stride_W = pool_stride.
Default 1.
pool_type (str): Pooling type can be :math:`max` for max-pooling and :math:`avg` for
average-pooling. Default :math:`max`.
use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True
use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled
with mkldnn library. Default: False
Return:
Variable: The final result after serial computation using Convolution2d,
BatchNorm, DropOut, and Pool2d.
Examples:
.. code-block:: python
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
conv_pool = fluid.nets.img_conv_group(input=img,
num_channels=3,
conv_padding=1,
conv_num_filter=[3, 3],
conv_filter_size=3,
conv_act="relu",
pool_size=2,
pool_stride=2)
""" """
tmp = input tmp = input
assert isinstance(conv_num_filter, list) or \ assert isinstance(conv_num_filter, list) or \
...@@ -74,6 +201,7 @@ def img_conv_group(input, ...@@ -74,6 +201,7 @@ def img_conv_group(input,
if not hasattr(obj, '__len__'): if not hasattr(obj, '__len__'):
return [obj] * len(conv_num_filter) return [obj] * len(conv_num_filter)
else: else:
assert len(obj) == len(conv_num_filter)
return obj return obj
conv_padding = __extend_list__(conv_padding) conv_padding = __extend_list__(conv_padding)
...@@ -119,6 +247,39 @@ def sequence_conv_pool(input, ...@@ -119,6 +247,39 @@ def sequence_conv_pool(input,
param_attr=None, param_attr=None,
act="sigmoid", act="sigmoid",
pool_type="max"): pool_type="max"):
"""
The sequence_conv_pool is composed with Sequence Convolution and Pooling.
Args:
input (Variable): The input of sequence_conv, which supports variable-time
length input sequence. The underlying of input is a matrix with shape
(T, N), where T is the total time steps in this mini-batch and N is
the input_hidden_size
num_filters(int): The number of filter.
filter_size (int): The filter size.
param_attr (ParamAttr): The parameters to the Sequence_conv Layer. Default: None.
act (str): Activation type for Sequence_conv Layer. Default: "sigmoid".
pool_type (str): Pooling type can be :math:`max` for max-pooling, :math:`average` for
average-pooling, :math:`sum` for sum-pooling, :math:`sqrt` for sqrt-pooling.
Default :math:`max`.
Return:
Variable: The final result after Sequence Convolution and Pooling.
Examples:
.. code-block:: python
input_dim = len(word_dict)
emb_dim = 128
hid_dim = 512
data = fluid.layers.data( ame="words", shape=[1], dtype="int64", lod_level=1)
emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim], is_sparse=True)
seq_conv = fluid.nets.sequence_conv_pool(input=emb,
num_filters=hid_dim,
filter_size=3,
act="tanh",
pool_type="sqrt")
"""
conv_out = layers.sequence_conv( conv_out = layers.sequence_conv(
input=input, input=input,
num_filters=num_filters, num_filters=num_filters,
...@@ -132,9 +293,9 @@ def sequence_conv_pool(input, ...@@ -132,9 +293,9 @@ def sequence_conv_pool(input,
def glu(input, dim=-1): def glu(input, dim=-1):
""" """
The gated linear unit composed by split, sigmoid activation and elementwise The Gated Linear Units(GLU) composed by split, sigmoid activation and element-wise
multiplication. Specifically, Split the input into two equal sized parts multiplication. Specifically, Split the input into two equal sized parts,
:math:`a` and :math:`b` along the given dimension and then compute as :math:`a` and :math:`b`, along the given dimension and then compute as
following: following:
.. math:: .. math::
...@@ -147,16 +308,16 @@ def glu(input, dim=-1): ...@@ -147,16 +308,16 @@ def glu(input, dim=-1):
Args: Args:
input (Variable): The input variable which is a Tensor or LoDTensor. input (Variable): The input variable which is a Tensor or LoDTensor.
dim (int): The dimension along which to split. If :math:`dim < 0`, the dim (int): The dimension along which to split. If :math:`dim < 0`, the
dimension to split along is :math:`rank(input) + dim`. dimension to split along is :math:`rank(input) + dim`. Default -1.
Returns: Returns:
Variable: The Tensor variable with half the size of input. Variable: Variable with half the size of input.
Examples: Examples:
.. code-block:: python .. code-block:: python
# x is a Tensor variable with shape [3, 6, 9] data = fluid.layers.data(name="words", shape=[3, 6, 9], dtype="float32")
fluid.nets.glu(input=x, dim=1) # shape of output: [3, 3, 9] output = fluid.nets.glu(input=data, dim=1) # shape of output: [3, 3, 9]
""" """
a, b = layers.split(input, num_or_sections=2, dim=dim) a, b = layers.split(input, num_or_sections=2, dim=dim)
...@@ -189,40 +350,48 @@ def scaled_dot_product_attention(queries, ...@@ -189,40 +350,48 @@ def scaled_dot_product_attention(queries,
<https://arxiv.org/pdf/1706.03762.pdf>`_. <https://arxiv.org/pdf/1706.03762.pdf>`_.
Args: Args:
queries (Variable): The input variable which should be a 3-D Tensor. queries (Variable): The input variable which should be a 3-D Tensor.
keys (Variable): The input variable which should be a 3-D Tensor. keys (Variable): The input variable which should be a 3-D Tensor.
values (Variable): The input variable which should be a 3-D Tensor. values (Variable): The input variable which should be a 3-D Tensor.
num_heads (int): Head number to compute the scaled dot product num_heads (int): Head number to compute the scaled dot product
attention. Default value is 1. attention. Default: 1.
dropout_rate (float): The dropout rate to drop the attention weight. dropout_rate (float): The dropout rate to drop the attention weight.
Default value is 0. Default: 0.0.
Returns: Returns:
Variable: A 3-D Tensor computed by multi-head scaled dot product\
Variable: A 3-D Tensor computed by multi-head scaled dot product \
attention. attention.
Raises: Raises:
ValueError: If input queries, keys, values are not 3-D Tensors. ValueError: If input queries, keys, values are not 3-D Tensors.
NOTE: NOTES:
1. When num_heads > 1, three linear projections are learned respectively 1. When num_heads > 1, three linear projections are learned respectively
to map input queries, keys and values into queries', keys' and values'. to map input queries, keys and values into queries', keys' and values'.
queries', keys' and values' have the same shapes with queries, keys queries', keys' and values' have the same shapes with queries, keys
and values. and values.
2. When num_heads == 1, scaled_dot_product_attention has no learnable
1. When num_heads == 1, scaled_dot_product_attention has no learnable
parameters. parameters.
Examples: Examples:
.. code-block:: python .. code-block:: python
# Suppose q, k, v are Tensors with the following shape: queries = fluid.layers.data(name="queries",
# q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10] shape=[3, 5, 9],
dtype="float32",
contexts = fluid.nets.scaled_dot_product_attention(q, k, v) append_batch_size=False)
queries.stop_gradient = False
keys = fluid.layers.data(name="keys",
shape=[3, 6, 9],
dtype="float32",
append_batch_size=False)
keys.stop_gradient = False
values = fluid.layers.data(name="values",
shape=[3, 6, 10],
dtype="float32",
append_batch_size=False)
values.stop_gradient = False
contexts = fluid.nets.scaled_dot_product_attention(queries, keys, values)
contexts.shape # [3, 5, 10] contexts.shape # [3, 5, 10]
""" """
if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
......
...@@ -27,52 +27,50 @@ BuildStrategy = core.ParallelExecutor.BuildStrategy ...@@ -27,52 +27,50 @@ BuildStrategy = core.ParallelExecutor.BuildStrategy
class ParallelExecutor(object): class ParallelExecutor(object):
def __init__(self,
use_cuda,
loss_name=None,
main_program=None,
share_vars_from=None,
exec_strategy=None,
build_strategy=None,
num_trainers=1,
trainer_id=0,
**kwargs):
""" """
ParallelExecutor can run program in parallel. ParallelExecutor can run program in parallel.
Args: Args:
use_cuda(bool): Whether to use CUDA or not. use_cuda (bool): Whether to use CUDA or not.
loss_name(str, default None): The loss name must set in training. loss_name (str): The loss name must set in training. Default None.
main_program(Program, default None): The program that need to run, main_program (Program): The program that need to run, if not provided,
if not provided, then default_main_program will be used. then default_main_program will be used. Default None.
share_vars_from(ParallelExecutor, default None): If provied, share_vars_from(ParallelExecutor): If provied, it will share variables
it will share variables from the specified ParallelExecutor. from the specified ParallelExecutor. Default None.
num_trainers(int, default 1): If greater than 1, NCCL will be num_trainers(int): If greater than 1, NCCL will be initialized with
initialized with multpile rank of nodes, each node should have multiple rank of nodes, each node should have same number of GPUs.
same number of GPUs. Distributed training will be enabled then. Distributed training will be enabled then. Default 1.
trainer_id(int, default 0): Must use together with num_trainers. trainer_id(int: Must use together with num_trainers. trainer_id is the
trainer_id is the "rank" of current node starts from 0. "rank" of current node starts from 0. Default 0.
Returns: Returns:
A ParallelExecutor object. ParallelExecutor: The initialized ParallelExecutor object.
Raises: Raises:
TypeError: If share_vars_from is provided, but not ParallelExecutor TypeError: If share_vars_from is provided, but not ParallelExecutor object.
object.
Examples: Examples:
.. code-block:: python .. code-block:: python
train_exe = fluid.ParallelExecutor( train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name)
use_cuda=True, loss_name=loss.name) test_exe = fluid.ParallelExecutor(use_cuda=True,
test_exe = fluid.ParallelExecutor(
use_cuda=True,
main_program=test_program, main_program=test_program,
share_vars_from=train_exe) share_vars_from=train_exe)
train_loss, = train_exe.run([loss.name], feed=feed_dict) train_loss, = train_exe.run([loss.name], feed=feed_dict)
test_loss, = test_exe.run([loss.name], feed=feed_dict) test_loss, = test_exe.run([loss.name], feed=feed_dict)
""" """
def __init__(self,
use_cuda,
loss_name=None,
main_program=None,
share_vars_from=None,
exec_strategy=None,
build_strategy=None,
num_trainers=1,
trainer_id=0,
**kwargs):
if len(kwargs) != 0: if len(kwargs) != 0:
err_msg = "" err_msg = ""
for key in kwargs: for key in kwargs:
...@@ -131,10 +129,16 @@ class ParallelExecutor(object): ...@@ -131,10 +129,16 @@ class ParallelExecutor(object):
main = main_program main = main_program
main = main if main else framework.default_main_program() main = main if main else framework.default_main_program()
scope = executor.global_scope() scope = executor.global_scope()
# FIXME(Yancey1989): it's a temporary approach to determinate the distribute
# train program, call self.bcast_param() at the end of each mini-batch.
self.is_dist = True if "recv" in [
op.type for op in main.global_block().ops
] else False
if share_vars_from and not isinstance(share_vars_from, if share_vars_from and not isinstance(share_vars_from,
ParallelExecutor): ParallelExecutor):
raise TypeError("share_vars_from must be ParallelExecutor.") raise TypeError("share_vars_from must be ParallelExecutor.")
local_scopes = share_vars_from.executor.local_scopes( local_scopes = share_vars_from.executor.local_scopes(
) if share_vars_from else [] ) if share_vars_from else []
...@@ -166,12 +170,14 @@ class ParallelExecutor(object): ...@@ -166,12 +170,14 @@ class ParallelExecutor(object):
element in the list will be copied to each device directly. element in the list will be copied to each device directly.
For example, if the feed is a dict: For example, if the feed is a dict:
>>> exe = ParallelExecutor() >>> exe = ParallelExecutor()
>>> # the image will be splitted into devices. If there is two devices >>> # the image will be splitted into devices. If there is two devices
>>> # each device will process an image with shape (24, 1, 28, 28) >>> # each device will process an image with shape (24, 1, 28, 28)
>>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))}) >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))})
For example, if the feed is a list: For example, if the feed is a list:
>>> exe = ParallelExecutor() >>> exe = ParallelExecutor()
>>> # each device will process each element in the list. >>> # each device will process each element in the list.
>>> # the 1st device will process an image with shape (48, 1, 28, 28) >>> # the 1st device will process an image with shape (48, 1, 28, 28)
...@@ -182,18 +188,40 @@ class ParallelExecutor(object): ...@@ -182,18 +188,40 @@ class ParallelExecutor(object):
>>> {"image": numpy.random.random(size=(32, 1, 28, 28))}, >>> {"image": numpy.random.random(size=(32, 1, 28, 28))},
>>> ]) >>> ])
Args: Args:
fetch_list(list): The fetched variable names fetch_list(list): The fetched variable names
feed(list|dict|None): The feed variables. If the feed is a dict, feed(list|dict|None): The feed variables. If the feed is a dict,
tensors in that dict will be splitted into each devices. If tensors in that dict will be splitted into each devices. If
the feed is a list, each element of the list will be copied the feed is a list, each element of the list will be copied
to each device. to each device. Default None.
feed_dict: Alias for feed parameter, for backward compatibility. feed_dict: Alias for feed parameter, for backward compatibility.
This parameter is deprecated. This parameter has been deprecated. Default None.
Returns: fetched result list. Returns:
List: The fetched result list.
Raises:
ValueError: If the feed is a list, but its length is not equal the
length of active places, or its element's is not dict.
NOTES:
1. If the feed's type is dict, the number of data that feeds to
ParallelExecutor must be bigger than active places. Otherwise,
it will throw exception from C++ side. Special attention should be
paid to check whether the last batch of the dataset is bigger
than active places.
2. If active places are more than one, the fetch results for each
variable is a list, and each element of this list is the variable of
respective active place.
Examples:
.. code-block:: python
pe = fluid.ParallelExecutor(use_cuda=use_cuda,
loss_name=avg_cost.name,
main_program=fluid.default_main_program())
loss = pe.run(feed=feeder.feed(cur_batch),
fetch_list=[avg_cost.name]))
""" """
if feed is None and feed_dict is not None: if feed is None and feed_dict is not None:
feed = feed_dict feed = feed_dict
...@@ -238,9 +266,17 @@ class ParallelExecutor(object): ...@@ -238,9 +266,17 @@ class ParallelExecutor(object):
fetch_var_name = '@FETCHED_VAR_NAME@' fetch_var_name = '@FETCHED_VAR_NAME@'
self.executor.run(fetch_list, fetch_var_name) self.executor.run(fetch_list, fetch_var_name)
arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
if self.is_dist:
self.bcast_params()
return [arr[i] for i in range(len(arr))] return [arr[i] for i in range(len(arr))]
def bcast_params(self): def bcast_params(self):
"""
Broadcast the parameters to other devices. It is used during
distributed training.
"""
self.executor.bcast_params(set(self.persistable_vars)) self.executor.bcast_params(set(self.persistable_vars))
@property @property
......
...@@ -22,6 +22,35 @@ __all__ = [ ...@@ -22,6 +22,35 @@ __all__ = [
class ParamAttr(object): class ParamAttr(object):
"""
Parameter attributes object. To fine-tuning network training process, user
can set parameter's attributes to control training details. Such as learning rate,
regularization, trainable, do_model_average and the method to initialize param.
Args:
name(str): The parameter's name. Default None.
initializer(Initializer): The method to initial this parameter. Default None.
learning_rate(float): The parameter's learning rate. The learning rate when
optimize is :math:`global\_lr * parameter\_lr * scheduler\_factor`.
Default 1.0.
regularizer(WeightDecayRegularizer): Regularization factor. Default None.
trainable(bool): Whether this parameter is trainable. Default True.
gradient_clip(BaseGradientClipAttr): The method to clip this parameter's
gradient. Default None.
do_model_average(bool): Whether this parameter should do model average.
Default False.
Examples:
.. code-block:: python
w_param_attrs = fluid.ParamAttr(name="fc_weight",
learning_rate=0.5,
regularizer=fluid.L2Decay(1.0),
trainable=True)
y_predict = fluid.layers.fc(input=x, size=10, param_attr=w_param_attrs)
"""
def __init__(self, def __init__(self,
name=None, name=None,
initializer=None, initializer=None,
...@@ -29,7 +58,7 @@ class ParamAttr(object): ...@@ -29,7 +58,7 @@ class ParamAttr(object):
regularizer=None, regularizer=None,
trainable=True, trainable=True,
gradient_clip=None, gradient_clip=None,
do_model_average=None): do_model_average=False):
self.name = name self.name = name
self.initializer = initializer self.initializer = initializer
self.learning_rate = learning_rate self.learning_rate = learning_rate
...@@ -39,6 +68,16 @@ class ParamAttr(object): ...@@ -39,6 +68,16 @@ class ParamAttr(object):
self.model_average = do_model_average self.model_average = do_model_average
def set_default_initializer(self, initializer): def set_default_initializer(self, initializer):
"""
Set the default initializer, the initializer should be Constant,
Uniform, Normal, Xavier, MSRA.
Args:
initializer(Initializer): the initializer to set.
Returns:
None
"""
if initializer is None: if initializer is None:
if self.initializer is None: if self.initializer is None:
raise ValueError("ParamAttr.initializer is not set") raise ValueError("ParamAttr.initializer is not set")
...@@ -50,13 +89,45 @@ class ParamAttr(object): ...@@ -50,13 +89,45 @@ class ParamAttr(object):
self.initializer = initializer self.initializer = initializer
def set_default_param_initializer(self): def set_default_param_initializer(self):
"""
Set the default initializer for the parameter with Xavier.
Args:
None.
Returns:
None.
"""
self.set_default_initializer(Xavier()) self.set_default_initializer(Xavier())
def set_default_bias_initializer(self): def set_default_bias_initializer(self):
"""
Set the default initializer for the bias with Constant(0.0).
Args:
None.
Returns:
None.
"""
self.set_default_initializer(Constant(0.0)) self.set_default_initializer(Constant(0.0))
@staticmethod @staticmethod
def to_attr(arg): def to_attr(arg):
"""
Create ParamAttr[s].
Args:
arg: Arguments to initialize ParamAttr[s]. arg's type can be
str, Initializer, float, WeightDecayRegularizer, BaseGradientClipAttr,
bool, ParamAttr, or a list of above type.
Returns:
ParamAttr[s]: ParamAttr[s] initialized with arg.
Raises:
arg can not initialize a ParamAttr.
"""
if arg is None: if arg is None:
return ParamAttr() return ParamAttr()
elif isinstance(arg, list) or isinstance(arg, tuple): elif isinstance(arg, list) or isinstance(arg, tuple):
...@@ -75,6 +146,15 @@ class ParamAttr(object): ...@@ -75,6 +146,15 @@ class ParamAttr(object):
raise TypeError("{0} cast to ParamAttr".format(type(arg))) raise TypeError("{0} cast to ParamAttr".format(type(arg)))
def to_kwargs(self, with_initializer=False): def to_kwargs(self, with_initializer=False):
"""
Returns the attributes of this parameter.
Args:
with_initializer(bool): Whether to add initializer attr.
Returns:
Parameter attributes(map): The attributes of this parameter.
"""
kwargs = { kwargs = {
'name': self.name, 'name': self.name,
'optimize_attr': { 'optimize_attr': {
...@@ -92,9 +172,27 @@ class ParamAttr(object): ...@@ -92,9 +172,27 @@ class ParamAttr(object):
class WeightNormParamAttr(ParamAttr): class WeightNormParamAttr(ParamAttr):
""" """
Used for weight normalization. Any field in ParamAttr can also be set here. Used for weight Norm. Weight Norm is a reparameterization of the weight vectors
Besides, an extra field dim can be set to indicate the dimension except in a neural network that decouples the length of those weight vectors from
which to normalize. their direction. Weight Norm has been implemented as discussed in this
paper: `Weight Normalization: A Simple Reparameterization to Accelerate
Training of Deep Neural Networks
<https://arxiv.org/pdf/1602.07868.pdf>`_.
Args:
dim(list): The parameter's name. Default None.
kwargs: Any field in ParamAttr. Default None.
Examples:
.. code-block:: python
data = fluid.layers.data(name="data", shape=[3, 32, 32], dtype="float32")
fc = fluid.layers.fc(input=data,
size=1000,
param_attr=WeightNormParamAttr(
dim=None,
name='weight_norm_param'))
""" """
# List to record the parameters reparameterized by weight normalization. # List to record the parameters reparameterized by weight normalization.
# If these parameters are treated as Variable rather than Parameter, # If these parameters are treated as Variable rather than Parameter,
......
...@@ -36,6 +36,45 @@ def convert_reader_to_recordio_file( ...@@ -36,6 +36,45 @@ def convert_reader_to_recordio_file(
compressor=core.RecordIOWriter.Compressor.Snappy, compressor=core.RecordIOWriter.Compressor.Snappy,
max_num_records=1000, max_num_records=1000,
feed_order=None): feed_order=None):
"""
Convert a Python Reader to a recordio file.
Please see :ref:`api_guide_python_reader` and :ref:`api_guide_reader_op` for
details.
Examples:
>>> import paddle.fluid as fluid
>>> import paddle.dataset.mnist as mnist
>>> import paddle
>>>
>>> tmp_program = fluid.Program()
>>> with fluid.program_guard(tmp_program):
>>> img = fluid.layers.data(name='img', shape=[784])
>>> label = fluid.layers.data(name='label', shape=[1], dtype='int64')
>>> feeder = fluid.DataFeeder(feed_list=[img, label], place=fluid.CPUPlace())
>>> # mnist.recordio will be generated in current directory
>>> fluid.recordio_writer.convert_reader_to_recordio_file(
>>> filename="mnist.recordio",
>>> reader_creator=paddle.batch(mnist.train(), batch_size=32),
>>> feeder=feeder)
Args:
filename(str): The recordio filename.
reader_creator(callable): The Python Reader Creator. See
:ref:`api_guide_python_reader`.
feeder(DataFeeder): The DataFeeder instance. Used to convert
:code:`reader_creator` to :code: `lod_tensor`
compressor: Must in fluid.core.RecordIOWriter.Compressor.Snappy or
fluid.core.RecordIOWriter.Compressor.NoCompress. Use :code:`Snappy`
by default.
max_num_records(int): Maximum number of records in one chuck. Each record
is each return value from reader function
feed_order(list): The order of variable names that the reader returns
Returns:
int: the number of record that saved.
"""
if feed_order is None: if feed_order is None:
feed_order = feeder.feed_names feed_order = feeder.feed_names
counter = 0 counter = 0
...@@ -58,6 +97,17 @@ def convert_reader_to_recordio_files( ...@@ -58,6 +97,17 @@ def convert_reader_to_recordio_files(
compressor=core.RecordIOWriter.Compressor.Snappy, compressor=core.RecordIOWriter.Compressor.Snappy,
max_num_records=1000, max_num_records=1000,
feed_order=None): feed_order=None):
"""
convert a python reader to many recordio files.
This API is basically same as :code:`convert_reader_to_recordio_file`,
instead of it will create many recordio files. Each file contains at
most :code:`batch_per_file` records.
Please reference
:ref:`api_fluid_recordio_writer_convert_reader_to_recordio_file` for more
details.
"""
if feed_order is None: if feed_order is None:
feed_order = feeder.feed_names feed_order = feeder.feed_names
f_name, f_ext = os.path.splitext(filename) f_name, f_ext = os.path.splitext(filename)
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
import unittest import unittest
import numpy as np import numpy as np
from op_test import OpTest from op_test import OpTest
import paddle.fluid.core as core
def bilinear_interp_np(input, out_h, out_w, out_size): def bilinear_interp_np(input, out_h, out_w, out_size):
...@@ -47,7 +48,7 @@ def bilinear_interp_np(input, out_h, out_w, out_size): ...@@ -47,7 +48,7 @@ def bilinear_interp_np(input, out_h, out_w, out_size):
w1lambda*input[:, :, h, w+wid]) + \ w1lambda*input[:, :, h, w+wid]) + \
h1lambda*(w2lambda*input[:, :, h+hid, w] + h1lambda*(w2lambda*input[:, :, h+hid, w] +
w1lambda*input[:, :, h+hid, w+wid]) w1lambda*input[:, :, h+hid, w+wid])
return out.astype("float32") return out.astype(input.dtype)
class TestBilinearInterpOp(OpTest): class TestBilinearInterpOp(OpTest):
...@@ -122,5 +123,44 @@ class TestCase6(TestBilinearInterpOp): ...@@ -122,5 +123,44 @@ class TestCase6(TestBilinearInterpOp):
self.out_size = np.array([65, 129]).astype("int32") self.out_size = np.array([65, 129]).astype("int32")
class TestBilinearInterpOpUint8(OpTest):
def setUp(self):
self.out_size = None
self.init_test_case()
self.op_type = "bilinear_interp"
input_np = np.random.randint(
low=0, high=256, size=self.input_shape).astype("uint8")
output_np = bilinear_interp_np(input_np, self.out_h, self.out_w,
self.out_size)
self.inputs = {'X': input_np}
if self.out_size is not None:
self.inputs['OutSize'] = self.out_size
self.attrs = {'out_h': self.out_h, 'out_w': self.out_w}
self.outputs = {'Out': output_np}
def test_check_output(self):
self.check_output_with_place(place=core.CPUPlace(), atol=1)
def init_test_case(self):
self.input_shape = [1, 3, 9, 6]
self.out_h = 10
self.out_w = 9
class TestCase1Uint8(TestBilinearInterpOpUint8):
def init_test_case(self):
self.input_shape = [2, 3, 128, 64]
self.out_h = 120
self.out_w = 50
class TestCase2Uint8(TestBilinearInterpOpUint8):
def init_test_case(self):
self.input_shape = [4, 1, 7, 8]
self.out_h = 5
self.out_w = 13
self.out_size = np.array([6, 15]).astype("int32")
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -33,23 +33,59 @@ __all__ = [ ...@@ -33,23 +33,59 @@ __all__ = [
class BeginEpochEvent(object): class BeginEpochEvent(object):
"""
The begin of a training epoch.
Args:
epoch_id(int): The current epoch ID.
"""
def __init__(self, epoch_id): def __init__(self, epoch_id):
self.epoch = epoch_id self.epoch = epoch_id
class EndEpochEvent(object): class EndEpochEvent(object):
"""
The end of a training epoch.
Args:
epoch_id(int): The current epoch ID.
"""
def __init__(self, epoch_id): def __init__(self, epoch_id):
self.epoch = epoch_id self.epoch = epoch_id
class BeginStepEvent(object): class BeginStepEvent(object):
"""
The begin of a training epoch.
Args:
epoch_id(int): The current epoch ID.
step_id(int): The current step ID.
"""
def __init__(self, epoch_id, step_id): def __init__(self, epoch_id, step_id):
self.epoch = epoch_id self.epoch = epoch_id
self.step = step_id self.step = step_id
self.fetch_metrics = True self.fetch_metrics = True
"""
If fetch_metrics is true, the metrics will be fetched at the
EndStepEvent. Default is True.
"""
class EndStepEvent(object): class EndStepEvent(object):
"""
The end of a training step.
Args:
epoch_id(int): The current epoch ID.
step_id(int): The current step ID.
metrics(list): A list of fetched tensor. The order of this list is same
as the :code:`train_func` returns.
"""
def __init__(self, epoch_id, step_id, metrics): def __init__(self, epoch_id, step_id, metrics):
self.epoch = epoch_id self.epoch = epoch_id
self.step = step_id self.step = step_id
...@@ -57,6 +93,27 @@ class EndStepEvent(object): ...@@ -57,6 +93,27 @@ class EndStepEvent(object):
class CheckpointConfig(object): class CheckpointConfig(object):
"""
Parameter object for :code:`fluid.io.save_checkpoint` and
:code:`fluid.Trainer`. Used to configuration how to save checkpoint.
Args:
checkpoint_dir(str): Directory path to save check point. Default is the
current directory.
max_num_checkpoints(int): The max number of local check points.
epoch_interval(int): Every number of epoch to save check point.
step_interval(int): Every number of step to save check point.
Examples:
>>> config = fluid.CheckpointConfig("./checkpoints")
>>> trainer = fluid.Trainer(train_func=train_program,
>>> place=place,
>>> optimizer_func=optimizer_func,
>>> checkpoint_config=config)
>>> trainer.train(...)
"""
def __init__(self, def __init__(self,
checkpoint_dir=None, checkpoint_dir=None,
max_num_checkpoints=3, max_num_checkpoints=3,
...@@ -113,11 +170,62 @@ def check_and_get_place(place): ...@@ -113,11 +170,62 @@ def check_and_get_place(place):
class Trainer(object): class Trainer(object):
""" """
A trainer wraps MultiGPU/MultiNode training loops and can be used to train a
simple neural network easily.
This API takes a :code:`train_func`. A :code:`train_func` is a function that
return loss as it first return value. The reset value can be fetched by
EndStepEvent.metrics
This API also takes a :code:`optimizer_func` that will return an optimizer
instance.
For example, to train a MLP for MNIST dataset, the sample program is
>>> import paddle.fluid as fluid
>>>
>>> def mlp(image, layer_sizes=[200, 100], activation="relu", num_classes=10):
>>> hidden = image
>>> for layer_size in layer_sizes:
>>> hidden = fluid.layers.fc(input=hidden, size=layer_size, act=activation)
>>> return fluid.layers.fc(input=hidden, size=num_classes, act="softmax")
>>>
>>> def train_mnist_mlp():
>>> img = fluid.layers.data(name='image', shape=[784])
>>> label = fluid.layers.data(name='label', shape=[1], dtype='int64')
>>> prediction = mlp(img)
>>> return fluid.layers.mean(fluid.layers.cross_entropy(prediction, label))
>>>
>>> def optimizer():
>>> return fluid.optimizer.Adam()
>>>
>>> trainer = Trainer(train_func=train_mnist_mlp,
>>> optimizer_func=optimizer,
>>> place=fluid.CUDAPlace(0),
>>> parallel=True)
>>>
>>> def train_callback(event):
>>> if isinstance(event, fluid.EndStepEvent):
>>> print "Epoch ID", event.epoch, "Step ID",\
>>> event.step, "AvgLoss", event.metrics[0]
>>> elif isinstance(event, fluid.EndEpochEvent):
>>> trainer.save_params("./model_{0}".format(event.epoch))
>>>
>>> trainer.train(num_epochs=100, event_handler=train_callback)
For more example, please see :ref:`api_guide_high_level_api`.
Args: Args:
train_func(callable): A function which will return loss. The loss must be a scalar. train_func(callable): A function which will return loss. The loss must be
a scalar tensor.
optimizer_func(callable): A function that returns an Optimizer object. optimizer_func(callable): A function that returns an Optimizer object.
place: The device place of this trainer. place(CUDAPlace|CPUPlace): The device place of this trainer. If
:code:`parallel=True,` all CUDA Places will be used if :code:`place`
is a :code:`CUDAPlace`.
parallel(bool): True if use multiple devices.
checkpoint_config(CheckpointConfig): Configuration about how to save
checkpoints.
""" """
def __init__(self, def __init__(self,
...@@ -129,9 +237,6 @@ class Trainer(object): ...@@ -129,9 +237,6 @@ class Trainer(object):
checkpoint_config=None): checkpoint_config=None):
self.__stop = False self.__stop = False
self.parallel = parallel self.parallel = parallel
# 1. we need to generate a framework.Program by calling
# program_func. Reference: fluid.program_guard in
# test_word2vec.py
# config for checkpoint # config for checkpoint
# only chief worker will save variables # only chief worker will save variables
...@@ -145,6 +250,10 @@ class Trainer(object): ...@@ -145,6 +250,10 @@ class Trainer(object):
self.scope = core.Scope() self.scope = core.Scope()
# 1. we need to generate a framework.Program by calling
# program_func. Reference: fluid.program_guard in
# test_word2vec.py
self.startup_program = framework.Program() self.startup_program = framework.Program()
self.train_program = framework.Program() self.train_program = framework.Program()
...@@ -277,17 +386,18 @@ class Trainer(object): ...@@ -277,17 +386,18 @@ class Trainer(object):
def train(self, num_epochs, event_handler, reader=None, feed_order=None): def train(self, num_epochs, event_handler, reader=None, feed_order=None):
""" """
Train the model. Start the train loop to train the model.
Args: Args:
num_epochs: The number of epoch. An epoch will process all data in reader num_epochs(int): The number of epoch. An epoch will process all data in reader
event_handler: The event handler. A function with type (ev:Event)->void event_handler(callable): The event handler. A function with type (ev:Event)->void
reader: reader(callable): A reader creator object. See also
feed_order: Feeding order of reader. None will following the defining :ref:`api_guide_python_reader` .
feed_order(list): Feeding order of reader. None will following the defining
order in program order in program
Returns: Returns:
None
""" """
training_role = os.getenv("PADDLE_TRAINING_ROLE", "") training_role = os.getenv("PADDLE_TRAINING_ROLE", "")
if training_role == "PSERVER": if training_role == "PSERVER":
...@@ -307,16 +417,24 @@ class Trainer(object): ...@@ -307,16 +417,24 @@ class Trainer(object):
Test the model on given test data Test the model on given test data
Args: Args:
reader: The reader that yields test data. reader(callable): The reader that yields test data.
feed_order: Feeding order of reader. None will following the defining feed_order(list): Feeding order of reader. None will following the
order in program defining order in program
""" """
return self._test_by_executor(reader, feed_order, return self._test_by_executor(reader, feed_order,
self.train_func_outputs) self.train_func_outputs)
def save_params(self, param_path): def save_params(self, param_path):
# reference: save_persistables in io.py """
Save all parameters into :code:`param_path`.
Args:
param_path(str): The path to save parameters.
Returns:
None
"""
with self._prog_and_scope_guard(): with self._prog_and_scope_guard():
exe = executor.Executor(self.place) exe = executor.Executor(self.place)
io.save_persistables(exe, dirname=param_path) io.save_persistables(exe, dirname=param_path)
......
...@@ -12,14 +12,6 @@ ...@@ -12,14 +12,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" """
Transpile the program to distributed data-parallelism programs.
The main_program will be transformed to use a remote parameter server
to do parameter optimization. And the optimization graph will be put
into a parameter server program.
Use different methods to split trainable variables to different
parameter servers.
Steps to transpile trainer: Steps to transpile trainer:
1. split variable to multiple blocks, aligned by product(dim[1:]) (width). 1. split variable to multiple blocks, aligned by product(dim[1:]) (width).
2. rename splited grad variables to add trainer_id suffix ".trainer_%d". 2. rename splited grad variables to add trainer_id suffix ".trainer_%d".
...@@ -117,129 +109,41 @@ def slice_variable(var_list, slice_count, min_block_size=8192): ...@@ -117,129 +109,41 @@ def slice_variable(var_list, slice_count, min_block_size=8192):
return blocks return blocks
class DistributeTranspiler: class DistributeTranspiler(object):
def _has_distributed_lookup_table(self): """
# process lookup_table_op **DistributeTranspiler**
# 1. check all lookup_table_op is distributed
# 2. check all lookup_table_op share the same table. Convert the fluid program to distributed data-parallelism programs.
distributed_lookup_table_ops = []
# support only one distributed_lookup_table now The main_program will be transformed to use a remote parameter server
self.table_name = None to do parameter optimization. And the optimization graph will be put
for op in self.origin_program.global_block().ops: into a parameter server program.
if op.type == LOOKUP_TABLE_TYPE:
if op.attrs['is_distributed'] is True: Examples:
if self.table_name is None: .. code-block:: python
self.table_name = op.input("W")[0]
if self.table_name != op.input("W")[0]: # Define your model before these codes.
raise RuntimeError("all distributed lookup_table_ops" port = os.getenv("PADDLE_PSERVER_PORT", "6174")
" should have only one table") pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
distributed_lookup_table_ops.append(op) eplist = []
else: for ip in pserver_ips.split(","):
if self.table_name is not None: eplist.append(':'.join([ip, port]))
assert op.input("W")[0] != self.table_name pserver_endpoints = ",".join(eplist)
trainers = int(os.getenv("PADDLE_TRAINERS"))
return len(distributed_lookup_table_ops) > 0 current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
def _update_dist_lookup_table_vars(self, param_list, grad_list, role = os.getenv("PADDLE_TRAINING_ROLE")
params_grads):
# TODO(wuyi): put find a way to put dist lookup table stuff all together. t = distribute_transpiler.DistributeTranspiler()
# update self.table_param_grad and self.trainer_side_table_grad_list t.transpile(
program = self.origin_program trainer_id, pservers=pserver_endpoints, trainers=trainers)
if self.has_distributed_lookup_table: if role == "PSERVER":
param_list = [ pserver_program = t.get_pserver_program(current_endpoint)
param for param in param_list if param.name != self.table_name pserver_startup_program = t.get_startup_program(current_endpoint,
] pserver_program)
grad_list = [ elif role == "TRAINER":
grad for grad in grad_list trainer_program = t.get_trainer_program()
if grad.name != grad_var_name(self.table_name) """
]
self.table_param_grad = [
param_grad for param_grad in params_grads
if param_grad[0].name == self.table_name
][0]
table_grad_var = self.table_param_grad[1]
if self.sync_mode:
self.trainer_side_table_grad_list = [
program.global_block().create_var(
name="%s.trainer_%d.pserver_%d" %
(table_grad_var.name, self.trainer_id, index),
type=table_grad_var.type,
shape=table_grad_var.shape,
dtype=table_grad_var.dtype)
for index in range(len(self.pserver_endpoints))
]
else:
self.trainer_side_table_grad_list = [
program.global_block().create_var(
name="%s.pserver_%d" % (table_grad_var.name, index),
type=table_grad_var.type,
shape=table_grad_var.shape,
dtype=table_grad_var.dtype)
for index in range(len(self.pserver_endpoints))
]
return param_list, grad_list
def _init_splited_vars(self, slice_var_up):
# update these mappings for further transpile:
# 1. param_var_mapping: param var name -> [splited params vars]
# 2. grad_var_mapping: grad var name -> [splited grads vars]
# 3. grad_param_mapping: grad.blockx -> param.blockx
# 4. param_grad_ep_mapping: ep -> {"params": [], "grads": []}
param_list = []
grad_list = []
param_grad_set = set()
for p, g in self.params_grads:
# skip parameter marked not trainable
if type(p) == Parameter and p.trainable == False:
continue
if p.name not in param_grad_set:
param_list.append(p)
param_grad_set.add(p.name)
if g.name not in param_grad_set:
grad_list.append(g)
param_grad_set.add(g.name)
param_list, grad_list = self._update_dist_lookup_table_vars(
param_list, grad_list, self.params_grads)
if slice_var_up:
# when we slice var up into blocks, we will slice the var according to
# pserver services' count. A pserver may have two or more listening ports.
grad_blocks = slice_variable(grad_list, len(self.pserver_endpoints))
param_blocks = slice_variable(param_list,
len(self.pserver_endpoints))
else:
# when we do NOT slice var up into blocks, we will always slice params
# grads into one block.
grad_blocks = slice_variable(grad_list, 1)
param_blocks = slice_variable(param_list, 1)
assert (len(grad_blocks) == len(param_blocks))
# origin_varname -> [splited_var]
self.param_var_mapping = self._create_vars_from_blocklist(
self.origin_program, param_blocks)
self.grad_var_mapping = self._create_vars_from_blocklist(
self.origin_program,
grad_blocks,
add_trainer_suffix=self.trainer_num > 1)
self.grad_param_mapping = dict()
for g, p in zip(grad_blocks, param_blocks):
g_name, g_bid, _ = g.split(":")
p_name, p_bid, _ = p.split(":")
self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] = \
self.param_var_mapping[p_name][int(p_bid)]
# create mapping of endpoint -> split var to create pserver side program
self.param_grad_ep_mapping = dict()
[
self.param_grad_ep_mapping.update({
ep: {
"params": [],
"grads": []
}
}) for ep in self.pserver_endpoints
]
def transpile(self, def transpile(self,
trainer_id, trainer_id,
...@@ -250,20 +154,20 @@ class DistributeTranspiler: ...@@ -250,20 +154,20 @@ class DistributeTranspiler:
split_method=RoundRobin, split_method=RoundRobin,
sync_mode=True): sync_mode=True):
""" """
:param trainer_id: one unique id for each trainer in a job. Run the transpiler.
:type trainer_id: int
:param program: program to transpile, default is default_main_program Args:
:type program: Program trainer_id (int): id for current trainer worker, if you have
:param pservers: parameter server endpoints like "m1:6174,m2:6174" n workers, the id may range from 0 ~ n-1
:type pservers: string program (Program|None): program to transpile,
:param trainers: total number of workers/trainers in the job default is fluid.default_main_program().
:type trainers: int pservers (str): comma separated ip:port string for the pserver
:param split_method: A function to determin how to split variables list.
to different servers equally. trainers (int): number of trainers in the distributed job.
:type split_method: function slice_var_up (bool): Do Tensor slice for pservers, default is True.
:type sync_mode: boolean default True split_method (PSDispatcher): RoundRobin or HashName can be used
:param sync_mode: if sync_mode is set True, it means that dist transpiler try to choose the best method to balance loads for pservers.
will transpile the program into sync_mode pserver and trainer program. sync_mode (bool): Do sync training or not, default is True.
""" """
assert (split_method.__bases__[0] == PSDispatcher) assert (split_method.__bases__[0] == PSDispatcher)
if program is None: if program is None:
...@@ -390,6 +294,12 @@ class DistributeTranspiler: ...@@ -390,6 +294,12 @@ class DistributeTranspiler:
self._split_table_grad_and_add_send_vars(program, pserver_endpoints) self._split_table_grad_and_add_send_vars(program, pserver_endpoints)
def get_trainer_program(self): def get_trainer_program(self):
"""
Get transpiled trainer side program.
Returns:
Program: trainer side program.
"""
# remove optimize ops and add a send op to main_program # remove optimize ops and add a send op to main_program
delete_ops(self.origin_program.global_block(), self.optimize_ops) delete_ops(self.origin_program.global_block(), self.optimize_ops)
# FIXME(typhoonzero): serialize once will fix error occurs when clone. # FIXME(typhoonzero): serialize once will fix error occurs when clone.
...@@ -398,12 +308,19 @@ class DistributeTranspiler: ...@@ -398,12 +308,19 @@ class DistributeTranspiler:
def get_pserver_program(self, endpoint): def get_pserver_program(self, endpoint):
""" """
Get pserver side program using the endpoint. Get parameter server side program.
TODO(panyx0718): Revisit this assumption. what if #blocks > #pservers.
NOTE: assume blocks of the same variable is not distributed Args:
on the same pserver, only change param/grad varnames for endpoint (str): current parameter server endpoint.
trainers to fetch.
Returns:
Program: the program for current parameter server to run.
""" """
# TODO(panyx0718): Revisit this assumption. what if #blocks > #pservers.
# NOTE: assume blocks of the same variable is not distributed
# on the same pserver, only change param/grad varnames for
# trainers to fetch.
# step1 # step1
pserver_program = Program() pserver_program = Program()
# step2: Create vars to receive vars at parameter servers. # step2: Create vars to receive vars at parameter servers.
...@@ -582,6 +499,14 @@ class DistributeTranspiler: ...@@ -582,6 +499,14 @@ class DistributeTranspiler:
Get startup program for current parameter server. Get startup program for current parameter server.
Modify operator input variables if there are variables that Modify operator input variables if there are variables that
were split to several blocks. were split to several blocks.
Args:
endpoint (str): current pserver endpoint.
pserver_program (Program): call get_pserver_program first and
pass the result here.
Returns:
Program: parameter server side startup program.
""" """
s_prog = Program() s_prog = Program()
orig_s_prog = default_startup_program() orig_s_prog = default_startup_program()
...@@ -633,6 +558,129 @@ class DistributeTranspiler: ...@@ -633,6 +558,129 @@ class DistributeTranspiler:
# ====================== private transpiler functions ===================== # ====================== private transpiler functions =====================
def _has_distributed_lookup_table(self):
# process lookup_table_op
# 1. check all lookup_table_op is distributed
# 2. check all lookup_table_op share the same table.
distributed_lookup_table_ops = []
# support only one distributed_lookup_table now
self.table_name = None
for op in self.origin_program.global_block().ops:
if op.type == LOOKUP_TABLE_TYPE:
if op.attrs['is_distributed'] is True:
if self.table_name is None:
self.table_name = op.input("W")[0]
if self.table_name != op.input("W")[0]:
raise RuntimeError("all distributed lookup_table_ops"
" should have only one table")
distributed_lookup_table_ops.append(op)
else:
if self.table_name is not None:
assert op.input("W")[0] != self.table_name
return len(distributed_lookup_table_ops) > 0
def _update_dist_lookup_table_vars(self, param_list, grad_list,
params_grads):
# TODO(wuyi): put find a way to put dist lookup table stuff all together.
# update self.table_param_grad and self.trainer_side_table_grad_list
program = self.origin_program
if self.has_distributed_lookup_table:
param_list = [
param for param in param_list if param.name != self.table_name
]
grad_list = [
grad for grad in grad_list
if grad.name != grad_var_name(self.table_name)
]
self.table_param_grad = [
param_grad for param_grad in params_grads
if param_grad[0].name == self.table_name
][0]
table_grad_var = self.table_param_grad[1]
if self.sync_mode:
self.trainer_side_table_grad_list = [
program.global_block().create_var(
name="%s.trainer_%d.pserver_%d" %
(table_grad_var.name, self.trainer_id, index),
type=table_grad_var.type,
shape=table_grad_var.shape,
dtype=table_grad_var.dtype)
for index in range(len(self.pserver_endpoints))
]
else:
self.trainer_side_table_grad_list = [
program.global_block().create_var(
name="%s.pserver_%d" % (table_grad_var.name, index),
type=table_grad_var.type,
shape=table_grad_var.shape,
dtype=table_grad_var.dtype)
for index in range(len(self.pserver_endpoints))
]
return param_list, grad_list
def _init_splited_vars(self, slice_var_up):
# update these mappings for further transpile:
# 1. param_var_mapping: param var name -> [splited params vars]
# 2. grad_var_mapping: grad var name -> [splited grads vars]
# 3. grad_param_mapping: grad.blockx -> param.blockx
# 4. param_grad_ep_mapping: ep -> {"params": [], "grads": []}
param_list = []
grad_list = []
param_grad_set = set()
for p, g in self.params_grads:
# skip parameter marked not trainable
if type(p) == Parameter and p.trainable == False:
continue
if p.name not in param_grad_set:
param_list.append(p)
param_grad_set.add(p.name)
if g.name not in param_grad_set:
grad_list.append(g)
param_grad_set.add(g.name)
param_list, grad_list = self._update_dist_lookup_table_vars(
param_list, grad_list, self.params_grads)
if slice_var_up:
# when we slice var up into blocks, we will slice the var according to
# pserver services' count. A pserver may have two or more listening ports.
grad_blocks = slice_variable(grad_list, len(self.pserver_endpoints))
param_blocks = slice_variable(param_list,
len(self.pserver_endpoints))
else:
# when we do NOT slice var up into blocks, we will always slice params
# grads into one block.
grad_blocks = slice_variable(grad_list, 1)
param_blocks = slice_variable(param_list, 1)
assert (len(grad_blocks) == len(param_blocks))
# origin_varname -> [splited_var]
self.param_var_mapping = self._create_vars_from_blocklist(
self.origin_program, param_blocks)
self.grad_var_mapping = self._create_vars_from_blocklist(
self.origin_program,
grad_blocks,
add_trainer_suffix=self.trainer_num > 1)
self.grad_param_mapping = dict()
for g, p in zip(grad_blocks, param_blocks):
g_name, g_bid, _ = g.split(":")
p_name, p_bid, _ = p.split(":")
self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] = \
self.param_var_mapping[p_name][int(p_bid)]
# create mapping of endpoint -> split var to create pserver side program
self.param_grad_ep_mapping = dict()
[
self.param_grad_ep_mapping.update({
ep: {
"params": [],
"grads": []
}
}) for ep in self.pserver_endpoints
]
# transpiler function for dis lookup_table # transpiler function for dis lookup_table
def _replace_lookup_table_op_with_prefetch(self, program, def _replace_lookup_table_op_with_prefetch(self, program,
pserver_endpoints): pserver_endpoints):
......
...@@ -383,6 +383,16 @@ def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0): ...@@ -383,6 +383,16 @@ def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0):
def release_memory(input_program, skip_opt_set=None): def release_memory(input_program, skip_opt_set=None):
"""
Modify the input program and insert :code:`delete_op` to early drop not used
variables. The modification will be performed inplace.
Notes: This is an experimental API and could be removed in next few
releases. Users should not use this API.
Args:
input_program(Program): The program will be inserted :code:`delete_op`.
"""
cfgs = _get_cfgs(input_program) cfgs = _get_cfgs(input_program)
for cfg in cfgs: for cfg in cfgs:
cfg.release_memory(skip_opt_set=skip_opt_set) cfg.release_memory(skip_opt_set=skip_opt_set)
...@@ -33,15 +33,21 @@ class PSDispatcher(object): ...@@ -33,15 +33,21 @@ class PSDispatcher(object):
def dispatch(self, varlist): def dispatch(self, varlist):
""" """
:param varlist: a list of Variables Args:
:return: a map of pserver endpoint -> varname varlist(list): a list of Variables
Returns:
a map of pserver endpoint -> varname
""" """
AssertionError("Interface has not been implemented.") AssertionError("Interface has not been implemented.")
class HashName(PSDispatcher): class HashName(PSDispatcher):
""" """
Hash variable names to several endpoints Hash variable names to several endpoints using python
"hash()" function.
Args:
pserver_endpoints (list): list of endpoint(ip:port).
""" """
def __init__(self, pserver_endpoints): def __init__(self, pserver_endpoints):
...@@ -61,7 +67,11 @@ class HashName(PSDispatcher): ...@@ -61,7 +67,11 @@ class HashName(PSDispatcher):
class RoundRobin(PSDispatcher): class RoundRobin(PSDispatcher):
""" """
Distribute variables to serveral endpoints. Distribute variables to serveral endpoints using
RondRobin<https://en.wikipedia.org/wiki/Round-robin_scheduling> method.
Args:
pserver_endpoints (list): list of endpoint(ip:port).
""" """
def __init__(self, pserver_endpoints): def __init__(self, pserver_endpoints):
......
...@@ -16,7 +16,7 @@ import collections ...@@ -16,7 +16,7 @@ import collections
import contextlib import contextlib
import sys import sys
__all__ = ['generate', 'switch', 'guard', 'UniqueNameGenerator'] __all__ = ['generate', 'switch', 'guard']
class UniqueNameGenerator(object): class UniqueNameGenerator(object):
......
...@@ -291,6 +291,8 @@ class DocstringChecker(BaseChecker): ...@@ -291,6 +291,8 @@ class DocstringChecker(BaseChecker):
True if successful otherwise False. True if successful otherwise False.
""" """
if node.name.startswith("__") or node.name.startswith("_"):
return True
find = False find = False
for t in node.body: for t in node.body:
if not isinstance(t, astroid.Return): if not isinstance(t, astroid.Return):
...@@ -316,6 +318,8 @@ class DocstringChecker(BaseChecker): ...@@ -316,6 +318,8 @@ class DocstringChecker(BaseChecker):
Returns: Returns:
True if successful otherwise False. True if successful otherwise False.
""" """
if node.name.startswith("__") or node.name.startswith("_"):
return True
args = [] args = []
for arg in node.args.get_children(): for arg in node.args.get_children():
if (not isinstance(arg, astroid.AssignName)) \ if (not isinstance(arg, astroid.AssignName)) \
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册