提交 9dccca96 编写于 作者: F fengjiayi

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into some_small_fixes

...@@ -23,7 +23,7 @@ repos: ...@@ -23,7 +23,7 @@ repos:
- id: clang-format-with-version-check - id: clang-format-with-version-check
name: clang-format name: clang-format
description: Format files with ClangFormat. description: Format files with ClangFormat.
entry: bash ./.clang_format.hook -i entry: bash ./tools/codestyle/clang_format.hook -i
language: system language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$ files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
- repo: local - repo: local
...@@ -52,7 +52,7 @@ repos: ...@@ -52,7 +52,7 @@ repos:
hooks: hooks:
- id: copyright_checker - id: copyright_checker
name: copyright_checker name: copyright_checker
entry: python ./.copyright.hook entry: python ./tools/codestyle/copyright.hook
language: system language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$ files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$ exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
# Use UBUNTU_MIRROR can speed up apt-get speed.
# ARG UBUNTU_MIRROR
# RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv
RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
RUN pip install -U pip
RUN pip install -U kubernetes paddlepaddle
# IMPORTANT: # IMPORTANT:
# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime. # Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
# exmaple: unset http_proxy && unset https_proxy && python fluid_benchmark.py ...
RUN pip install -U pip
RUN pip install -U kubernetes paddlepaddle
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python' RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python'
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python' RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python'
...@@ -14,9 +21,11 @@ RUN pip uninstall -y paddlepaddle && mkdir /workspace ...@@ -14,9 +21,11 @@ RUN pip uninstall -y paddlepaddle && mkdir /workspace
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
RUN chmod +x /usr/bin/paddle_k8s
ADD *.whl / ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s RUN pip install /*.whl && rm -f /*.whl
ENV LD_LIBRARY_PATH=/usr/local/lib ENV LD_LIBRARY_PATH=/usr/local/lib
ADD fluid_benchmark.py recordio_converter.py models/ /workspace/ ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh /workspace/
ADD models/ /workspace/models/
...@@ -97,7 +97,7 @@ def dist_transpile(trainer_id, args): ...@@ -97,7 +97,7 @@ def dist_transpile(trainer_id, args):
return train_program, fluid.default_startup_program() return train_program, fluid.default_startup_program()
else: else:
raise ValueError( raise ValueError(
'TRAINING_ROLE environment variable must be either TRAINER or PSERVER' 'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
) )
...@@ -264,8 +264,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, ...@@ -264,8 +264,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
break break
else: else:
loss, = exe.run([avg_loss.name], feed=feeder.feed(data)) loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
if args.update_method == "pserver":
exe.bcast_params()
if args.use_reader_op: if args.use_reader_op:
num_samples += args.batch_size * args.gpus num_samples += args.batch_size * args.gpus
else: else:
...@@ -301,9 +299,18 @@ def print_train_time(start_time, end_time, num_samples): ...@@ -301,9 +299,18 @@ def print_train_time(start_time, end_time, num_samples):
(num_samples, train_elapsed, examples_per_sec)) (num_samples, train_elapsed, examples_per_sec))
def print_paddle_envs():
print('----------- Configuration envs -----------')
for k in os.environ:
if "PADDLE_" in k:
print "ENV %s:%s" % (k, os.environ[k])
print('------------------------------------------------')
def main(): def main():
args = parse_args() args = parse_args()
print_arguments(args) print_arguments(args)
print_paddle_envs()
# the unique trainer id, starting from 0, needed by trainer # the unique trainer id, starting from 0, needed by trainer
# only # only
......
...@@ -17,6 +17,7 @@ import copy ...@@ -17,6 +17,7 @@ import copy
import argparse import argparse
import random import random
import os import os
import copy
from kube_templates import pserver, trainer, envs from kube_templates import pserver, trainer, envs
...@@ -108,10 +109,9 @@ def gen_job(): ...@@ -108,10 +109,9 @@ def gen_job():
tn_container["ports"][0]["containerPort"] = spreadport tn_container["ports"][0]["containerPort"] = spreadport
envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname}) envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname})
envs.append({"name": "TRAINERS", "value": str(args.trainers)}) envs.append({"name": "PADDLE_TRAINERS", "value": str(args.trainers)})
envs.append({"name": "PSERVERS", "value": str(args.pservers)}) envs.append({"name": "PADDLE_PSERVERS", "value": str(args.pservers)})
envs.append({"name": "ENTRY", "value": args.entry}) envs.append({"name": "ENTRY", "value": args.entry})
envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)})
envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)}) envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
# NOTE: these directories below are cluster specific, please modify # NOTE: these directories below are cluster specific, please modify
# this settings before you run on your own cluster. # this settings before you run on your own cluster.
...@@ -166,17 +166,23 @@ def gen_job(): ...@@ -166,17 +166,23 @@ def gen_job():
tn["spec"]["template"]["spec"]["volumes"] = volumes tn["spec"]["template"]["spec"]["volumes"] = volumes
tn_container["volumeMounts"] = volumeMounts tn_container["volumeMounts"] = volumeMounts
ps_container["env"] = envs ps_container["env"] = copy.deepcopy(envs)
ps_container["env"].append({"name": "TRAINING_ROLE", "value": "PSERVER"}) ps_container["env"].append({
"name": "PADDLE_TRAINING_ROLE",
"value": "PSERVER"
})
tn_container["env"] = envs tn_container["env"] = envs
if args.disttype == "pserver": if args.disttype == "pserver":
tn_container["env"].append({ tn_container["env"].append({
"name": "TRAINING_ROLE", "name": "PADDLE_TRAINING_ROLE",
"value": "TRAINER" "value": "TRAINER"
}) })
elif args.disttype == "nccl2" or args.disttype == "local": elif args.disttype == "nccl2" or args.disttype == "local":
# NCCL2 have no training role, set to plain WORKER # NCCL2 have no training role, set to plain WORKER
tn_container["env"].append({"name": "TRAINING_ROLE", "value": "WORKER"}) tn_container["env"].append({
"name": "PADDLE_TRAINING_ROLE",
"value": "WORKER"
})
os.mkdir(args.jobname) os.mkdir(args.jobname)
if args.disttype == "pserver": if args.disttype == "pserver":
......
...@@ -45,7 +45,8 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML") ...@@ -45,7 +45,8 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
ELSE() ELSE()
MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN") MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
ENDIF() ENDIF()
SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-unused-result") SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result")
SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}") SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}") SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
ExternalProject_Add( ExternalProject_Add(
......
#!/bin/bash #!/bin/bash
python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler metric > layers.rst python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler metric > layers.rst
for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler
do do
python gen_doc.py ${module} > ${module}.rst python gen_doc.py ${module} > ${module}.rst
done done
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
==========
transpiler
==========
DistributeTranspiler
--------------------
.. autoclass:: paddle.fluid.transpiler.DistributeTranspiler
:members:
:noindex:
InferenceTranspiler
-------------------
.. autoclass:: paddle.fluid.transpiler.InferenceTranspiler
:members:
:noindex:
memory_optimize
---------------
.. autofunction:: paddle.fluid.transpiler.memory_optimize
:noindex:
release_memory
--------------
.. autofunction:: paddle.fluid.transpiler.release_memory
:noindex:
HashName
--------
.. autoclass:: paddle.fluid.transpiler.HashName
:members:
:noindex:
RoundRobin
----------
.. autoclass:: paddle.fluid.transpiler.RoundRobin
:members:
:noindex:
...@@ -168,13 +168,13 @@ cd /paddle/python/paddle/fluid/tests/book ...@@ -168,13 +168,13 @@ cd /paddle/python/paddle/fluid/tests/book
第二步,启动Parameter Server: 第二步,启动Parameter Server:
```bash ```bash
PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.2 TRAINERS=2 POD_IP=192.168.1.2 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=PSERVER python test_fit_a_line.py PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.2 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=192.168.1.2 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=PSERVER python test_fit_a_line.py
``` ```
执行命令后请等待出现提示: ```Server listening on 192.168.1.2:6174 ```, 表示Paramter Server已经正常启动。 执行命令后请等待出现提示: ```Server listening on 192.168.1.2:6174 ```, 表示Paramter Server已经正常启动。
第三步,启动Trainer: 第三步,启动Trainer:
```bash ```bash
PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.3 TRAINERS=2 POD_IP=192.168.1.3 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=TRAINER python test_fit_a_line.py PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.3 PADDLE_TRAINERS=2 PADDLE_CURRENT_IPP=192.168.1.3 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=TRAINER python test_fit_a_line.py
``` ```
由于我们定义的Trainer的数量是2个,因此需要在另外一个计算节点上再启动一个Trainer。 由于我们定义的Trainer的数量是2个,因此需要在另外一个计算节点上再启动一个Trainer。
......
...@@ -114,8 +114,8 @@ def gen_train_list(file_pattern, trainers, trainer_id): ...@@ -114,8 +114,8 @@ def gen_train_list(file_pattern, trainers, trainer_id):
ret_list.append(f) ret_list.append(f)
return ret_list return ret_list
trainers = int(os.getenv("TRAINERS")) trainers = int(os.getenv("PADDLE_TRAINERS"))
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
data_file = fluid.layers.io.open_files( data_file = fluid.layers.io.open_files(
filenames=gen_train_list("./mnist-[0-9]*.recordio", 2, 0), filenames=gen_train_list("./mnist-[0-9]*.recordio", 2, 0),
thread_num=1, thread_num=1,
......
...@@ -13,6 +13,7 @@ cpu_noavx_openblas `fluid.tgz <https://guest:@paddleci.ngrok.io/repository ...@@ -13,6 +13,7 @@ cpu_noavx_openblas `fluid.tgz <https://guest:@paddleci.ngrok.io/repository
cuda7.5_cudnn5_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_ cuda7.5_cudnn5_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
cuda8.0_cudnn5_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_ cuda8.0_cudnn5_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
cuda8.0_cudnn7_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/fluid.tgz>`_ cuda8.0_cudnn7_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/fluid.tgz>`_
cuda9.0_cudnn7_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/fluid.tgz>`_
====================== ======================================== ====================== ========================================
从源码编译 从源码编译
......
...@@ -40,10 +40,9 @@ void Main(bool use_gpu) { ...@@ -40,10 +40,9 @@ void Main(bool use_gpu) {
//# 2. Prepare input. //# 2. Prepare input.
int64_t data[4] = {1, 2, 3, 4}; int64_t data[4] = {1, 2, 3, 4};
PaddleBuf buf{.data = data, .length = sizeof(data)};
PaddleTensor tensor{.name = "", PaddleTensor tensor{.name = "",
.shape = std::vector<int>({4, 1}), .shape = std::vector<int>({4, 1}),
.data = buf, .data = PaddleBuf(data, sizeof(data)),
.dtype = PaddleDType::INT64}; .dtype = PaddleDType::INT64};
// For simplicity, we set all the slots with the same data. // For simplicity, we set all the slots with the same data.
...@@ -55,14 +54,12 @@ void Main(bool use_gpu) { ...@@ -55,14 +54,12 @@ void Main(bool use_gpu) {
//# 4. Get output. //# 4. Get output.
ASSERT_EQ(outputs.size(), 1UL); ASSERT_EQ(outputs.size(), 1UL);
LOG(INFO) << "output buffer size: " << outputs.front().data.length; LOG(INFO) << "output buffer size: " << outputs.front().data.length();
const size_t num_elements = outputs.front().data.length / sizeof(float); const size_t num_elements = outputs.front().data.length() / sizeof(float);
// The outputs' buffers are in CPU memory. // The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) { for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i]; LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
} }
// TODO(Superjomn): this is should be free automatically
free(outputs[0].data.data);
} }
} }
...@@ -86,10 +83,9 @@ void MainThreads(int num_threads, bool use_gpu) { ...@@ -86,10 +83,9 @@ void MainThreads(int num_threads, bool use_gpu) {
for (int batch_id = 0; batch_id < num_batches; ++batch_id) { for (int batch_id = 0; batch_id < num_batches; ++batch_id) {
// 2. Dummy Input Data // 2. Dummy Input Data
int64_t data[4] = {1, 2, 3, 4}; int64_t data[4] = {1, 2, 3, 4};
PaddleBuf buf{.data = data, .length = sizeof(data)};
PaddleTensor tensor{.name = "", PaddleTensor tensor{.name = "",
.shape = std::vector<int>({4, 1}), .shape = std::vector<int>({4, 1}),
.data = buf, .data = PaddleBuf(data, sizeof(data)),
.dtype = PaddleDType::INT64}; .dtype = PaddleDType::INT64};
std::vector<PaddleTensor> inputs(4, tensor); std::vector<PaddleTensor> inputs(4, tensor);
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
...@@ -99,13 +95,13 @@ void MainThreads(int num_threads, bool use_gpu) { ...@@ -99,13 +95,13 @@ void MainThreads(int num_threads, bool use_gpu) {
// 4. Get output. // 4. Get output.
ASSERT_EQ(outputs.size(), 1UL); ASSERT_EQ(outputs.size(), 1UL);
LOG(INFO) << "TID: " << tid << ", " LOG(INFO) << "TID: " << tid << ", "
<< "output buffer size: " << outputs.front().data.length; << "output buffer size: " << outputs.front().data.length();
const size_t num_elements = outputs.front().data.length / sizeof(float); const size_t num_elements =
outputs.front().data.length() / sizeof(float);
// The outputs' buffers are in CPU memory. // The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) { for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i]; LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
} }
free(outputs[0].data.data);
} }
}); });
} }
......
# Inference High-level APIs
This document describes the high-level inference APIs one can use to easily deploy a Paddle model for an application.
The APIs are described in `paddle_inference_api.h`, just one header file, and two libaries `libpaddle_fluid.so` and `libpaddle_fluid_api.so` are needed.
## PaddleTensor
We provide the `PaddleTensor` data structure is to give a general tensor interface.
The definition is
```c++
struct PaddleTensor {
std::string name; // variable name.
std::vector<int> shape;
PaddleBuf data; // blob of data.
PaddleDType dtype;
};
```
The data is stored in a continuous memory `PaddleBuf`, and tensor's data type is specified by a `PaddleDType`.
The `name` field is used to specify the name of input variable,
that is important when there are multiple inputs and need to distiuish which variable to set.
## engine
The inference APIs has two different underlying implementation, currently there are two valid engines:
- the native engine, which is consists of the native operators and framework,
- the Anakin engine, which is a Anakin library embeded.
The native engine takes a native Paddle model as input, and supports any model that trained by Paddle,
but the Anakin engine can only take the Anakin model as input(user need to manully transform the format first) and currently not all Paddle models are supported.
```c++
enum class PaddleEngineKind {
kNative = 0, // Use the native Fluid facility.
kAnakin, // Use Anakin for inference.
};
```
## PaddlePredictor and how to create one
The main interface is `PaddlePredictor`, there are following methods
- `bool Run(const std::vector<PaddleTensor>& inputs, std::vector<PaddleTensor>* output_data)`
- take inputs and output `output_data`
- `Clone` to clone a predictor from an existing one, with model parameter shared.
There is a factory method to help create a predictor, and the user takes the ownership of this object.
```c++
template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
```
By specifying the engine kind and config, one can get an specific implementation.
## Reference
- [paddle_inference_api.h](./paddle_inference_api.h)
- [demos](./demo)
...@@ -13,3 +13,53 @@ See the License for the specific language governing permissions and ...@@ -13,3 +13,53 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/contrib/inference/paddle_inference_api.h" #include "paddle/contrib/inference/paddle_inference_api.h"
namespace paddle {
PaddleBuf::PaddleBuf(PaddleBuf&& other)
: data_(other.data_),
length_(other.length_),
memory_owned_(other.memory_owned_) {
other.memory_owned_ = false;
other.data_ = nullptr;
other.length_ = 0;
}
PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; }
PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
// only the buffer with external memory can be copied
assert(!other.memory_owned_);
data_ = other.data_;
length_ = other.length_;
memory_owned_ = other.memory_owned_;
return *this;
}
void PaddleBuf::Resize(size_t length) {
// Only the owned memory can be reset, the external memory can't be changed.
if (length_ == length) return;
assert(memory_owned_);
Free();
data_ = new char[length];
length_ = length;
memory_owned_ = true;
}
void PaddleBuf::Reset(void* data, size_t length) {
Free();
memory_owned_ = false;
data_ = data;
length_ = length;
}
void PaddleBuf::Free() {
if (memory_owned_ && data_) {
assert(length_ > 0);
delete static_cast<char*>(data_);
data_ = nullptr;
length_ = 0;
}
}
} // namespace paddle
\ No newline at end of file
...@@ -21,6 +21,7 @@ limitations under the License. */ ...@@ -21,6 +21,7 @@ limitations under the License. */
#pragma once #pragma once
#include <cassert>
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -32,12 +33,38 @@ enum PaddleDType { ...@@ -32,12 +33,38 @@ enum PaddleDType {
INT64, INT64,
}; };
struct PaddleBuf { class PaddleBuf {
void* data; // pointer to the data memory. public:
size_t length; // number of memory bytes. PaddleBuf() = default;
PaddleBuf(PaddleBuf&& other);
// Copy only available when memory is managed externally.
explicit PaddleBuf(const PaddleBuf&);
PaddleBuf& operator=(const PaddleBuf&);
// Do not own the memory.
PaddleBuf(void* data, size_t length)
: data_(data), length_(length), memory_owned_{false} {}
// Own memory.
PaddleBuf(size_t length)
: data_(new char[length]), length_(length), memory_owned_(true) {}
// Resize to `length` bytes.
void Resize(size_t length);
// Reset to external memory.
void Reset(void* data, size_t length);
bool empty() const { return length_ == 0; }
void* data() const { return data_; }
size_t length() const { return length_; }
~PaddleBuf() { Free(); }
private:
void Free();
void* data_{nullptr}; // pointer to the data memory.
size_t length_{0}; // number of memory bytes.
bool memory_owned_{true};
}; };
struct PaddleTensor { struct PaddleTensor {
PaddleTensor() = default;
std::string name; // variable name. std::string name; // variable name.
std::vector<int> shape; std::vector<int> shape;
// TODO(Superjomn) for LoD support, add a vector<vector<int>> field if needed. // TODO(Superjomn) for LoD support, add a vector<vector<int>> field if needed.
...@@ -67,8 +94,9 @@ class PaddlePredictor { ...@@ -67,8 +94,9 @@ class PaddlePredictor {
// Predict an record. // Predict an record.
// The caller should be responsible for allocating and releasing the memory of // The caller should be responsible for allocating and releasing the memory of
// `inputs`. `inputs` should be alive until Run returns. caller should be // `inputs`. `inputs` should be available until Run returns. Caller should be
// responsible for releasing the memory of `output_data`. // responsible for the output tensor's buffer, either allocated or passed from
// outside.
virtual bool Run(const std::vector<PaddleTensor>& inputs, virtual bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data) = 0; std::vector<PaddleTensor>* output_data) = 0;
...@@ -81,8 +109,7 @@ class PaddlePredictor { ...@@ -81,8 +109,7 @@ class PaddlePredictor {
// The common configs for all the predictors. // The common configs for all the predictors.
struct Config { struct Config {
std::string model_dir; // path to the model directory. std::string model_dir; // path to the model directory.
bool enable_engine{false}; // Enable to execute (part of) the model on
}; };
}; };
......
...@@ -48,7 +48,7 @@ bool PaddleInferenceAnakinPredictor::Run( ...@@ -48,7 +48,7 @@ bool PaddleInferenceAnakinPredictor::Run(
auto d_tensor_in_p = executor_.get_in(input.name); auto d_tensor_in_p = executor_.get_in(input.name);
float *d_data_p = d_tensor_in_p->mutable_data(); float *d_data_p = d_tensor_in_p->mutable_data();
if (cudaMemcpy(d_data_p, if (cudaMemcpy(d_data_p,
static_cast<float *>(input.data.data), static_cast<float *>(input.data.data()),
d_tensor_in_p->valid_size() * sizeof(float), d_tensor_in_p->valid_size() * sizeof(float),
cudaMemcpyHostToDevice) != 0) { cudaMemcpyHostToDevice) != 0) {
LOG(ERROR) << "copy data from CPU to GPU error"; LOG(ERROR) << "copy data from CPU to GPU error";
...@@ -65,8 +65,11 @@ bool PaddleInferenceAnakinPredictor::Run( ...@@ -65,8 +65,11 @@ bool PaddleInferenceAnakinPredictor::Run(
for (auto &output : *output_data) { for (auto &output : *output_data) {
auto *tensor = executor_.get_out(output.name); auto *tensor = executor_.get_out(output.name);
output.shape = tensor->shape(); output.shape = tensor->shape();
if (output.data.length() < tensor->valid_size() * sizeof(float)) {
output.data.Resize(tensor->valid_size() * sizeof(float));
}
// Copy data from GPU -> CPU // Copy data from GPU -> CPU
if (cudaMemcpy(output.data.data, if (cudaMemcpy(output.data.data(),
tensor->mutable_data(), tensor->mutable_data(),
tensor->valid_size() * sizeof(float), tensor->valid_size() * sizeof(float),
cudaMemcpyDeviceToHost) != 0) { cudaMemcpyDeviceToHost) != 0) {
......
...@@ -37,28 +37,26 @@ TEST(inference, anakin) { ...@@ -37,28 +37,26 @@ TEST(inference, anakin) {
float data[1 * 3 * 224 * 224] = {1.0f}; float data[1 * 3 * 224 * 224] = {1.0f};
PaddleBuf buf{.data = data, .length = sizeof(data)};
PaddleTensor tensor{.name = "input_0", PaddleTensor tensor{.name = "input_0",
.shape = std::vector<int>({1, 3, 224, 224}), .shape = std::vector<int>({1, 3, 224, 224}),
.data = buf, .data = PaddleBuf(data, sizeof(data)),
.dtype = PaddleDType::FLOAT32}; .dtype = PaddleDType::FLOAT32};
// For simplicity, we set all the slots with the same data. // For simplicity, we set all the slots with the same data.
std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor); std::vector<PaddleTensor> paddle_tensor_feeds;
paddle_tensor_feeds.emplace_back(std::move(tensor));
float data_out[1000];
PaddleBuf buf_out{.data = data_out, .length = sizeof(data)};
PaddleTensor tensor_out{.name = "prob_out", PaddleTensor tensor_out{.name = "prob_out",
.shape = std::vector<int>({1000, 1}), .shape = std::vector<int>({1000, 1}),
.data = buf_out, .data = PaddleBuf(),
.dtype = PaddleDType::FLOAT32}; .dtype = PaddleDType::FLOAT32};
std::vector<PaddleTensor> outputs(1, tensor_out); std::vector<PaddleTensor> outputs;
outputs.emplace_back(std::move(tensor_out));
ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs)); ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
float* data_o = static_cast<float*>(outputs[0].data.data); float* data_o = static_cast<float*>(outputs[0].data.data());
for (size_t j = 0; j < 1000; ++j) { for (size_t j = 0; j < 1000; ++j) {
LOG(INFO) << "output[" << j << "]: " << data_o[j]; LOG(INFO) << "output[" << j << "]: " << data_o[j];
} }
......
...@@ -178,8 +178,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs, ...@@ -178,8 +178,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(static_cast<void *>(input_ptr), std::memcpy(static_cast<void *>(input_ptr),
inputs[i].data.data, inputs[i].data.data(),
inputs[i].data.length); inputs[i].data.length());
feeds->push_back(input); feeds->push_back(input);
} }
return true; return true;
...@@ -241,10 +241,11 @@ bool NativePaddlePredictor::GetFetch( ...@@ -241,10 +241,11 @@ bool NativePaddlePredictor::GetFetch(
} }
outputs->at(i).shape = shape; outputs->at(i).shape = shape;
outputs->at(i).data.length = sizeof(float) * data.size(); auto &buffer = outputs->at(i).data;
outputs->at(i).data.data = malloc(outputs->at(i).data.length); if (buffer.empty() || buffer.length() < sizeof(float) * data.size()) {
std::memcpy( buffer.Resize(sizeof(float) * data.size());
outputs->at(i).data.data, data.data(), outputs->at(i).data.length); }
std::memcpy(buffer.data(), data.data(), buffer.length());
outputs->at(i).dtype = PaddleDType::FLOAT32; outputs->at(i).dtype = PaddleDType::FLOAT32;
// TODO(panyx0718): support other types? fill tensor name? avoid a copy. // TODO(panyx0718): support other types? fill tensor name? avoid a copy.
} }
......
...@@ -27,13 +27,12 @@ namespace paddle { ...@@ -27,13 +27,12 @@ namespace paddle {
PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
PaddleTensor pt; PaddleTensor pt;
pt.data.data = t->data<void>();
if (t->type() == typeid(int64_t)) { if (t->type() == typeid(int64_t)) {
pt.data.length = t->numel() * sizeof(int64_t); pt.data.Reset(t->data<void>(), t->numel() * sizeof(int64_t));
pt.dtype = PaddleDType::INT64; pt.dtype = PaddleDType::INT64;
} else if (t->type() == typeid(float)) { } else if (t->type() == typeid(float)) {
pt.data.length = t->numel() * sizeof(float); pt.data.Reset(t->data<void>(), t->numel() * sizeof(float));
pt.dtype = PaddleDType::FLOAT32; pt.dtype = PaddleDType::FLOAT32;
} else { } else {
LOG(FATAL) << "unsupported type."; LOG(FATAL) << "unsupported type.";
...@@ -79,8 +78,8 @@ void MainWord2Vec(bool use_gpu) { ...@@ -79,8 +78,8 @@ void MainWord2Vec(bool use_gpu) {
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs)); ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
ASSERT_EQ(outputs.size(), 1UL); ASSERT_EQ(outputs.size(), 1UL);
size_t len = outputs[0].data.length; size_t len = outputs[0].data.length();
float* data = static_cast<float*>(outputs[0].data.data); float* data = static_cast<float*>(outputs[0].data.data());
for (size_t j = 0; j < len / sizeof(float); ++j) { for (size_t j = 0; j < len / sizeof(float); ++j) {
ASSERT_LT(data[j], 1.0); ASSERT_LT(data[j], 1.0);
ASSERT_GT(data[j], -1.0); ASSERT_GT(data[j], -1.0);
...@@ -103,8 +102,6 @@ void MainWord2Vec(bool use_gpu) { ...@@ -103,8 +102,6 @@ void MainWord2Vec(bool use_gpu) {
EXPECT_LT(lod_data[i] - data[i], 1e-3); EXPECT_LT(lod_data[i] - data[i], 1e-3);
EXPECT_GT(lod_data[i] - data[i], -1e-3); EXPECT_GT(lod_data[i] - data[i], -1e-3);
} }
free(outputs[0].data.data);
} }
void MainImageClassification(bool use_gpu) { void MainImageClassification(bool use_gpu) {
...@@ -143,13 +140,12 @@ void MainImageClassification(bool use_gpu) { ...@@ -143,13 +140,12 @@ void MainImageClassification(bool use_gpu) {
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs)); ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
ASSERT_EQ(outputs.size(), 1UL); ASSERT_EQ(outputs.size(), 1UL);
size_t len = outputs[0].data.length; size_t len = outputs[0].data.length();
float* data = static_cast<float*>(outputs[0].data.data); float* data = static_cast<float*>(outputs[0].data.data());
float* lod_data = output1.data<float>(); float* lod_data = output1.data<float>();
for (size_t j = 0; j < len / sizeof(float); ++j) { for (size_t j = 0; j < len / sizeof(float); ++j) {
EXPECT_NEAR(lod_data[j], data[j], 1e-3); EXPECT_NEAR(lod_data[j], data[j], 1e-3);
} }
free(data);
} }
void MainThreadsWord2Vec(bool use_gpu) { void MainThreadsWord2Vec(bool use_gpu) {
...@@ -192,8 +188,8 @@ void MainThreadsWord2Vec(bool use_gpu) { ...@@ -192,8 +188,8 @@ void MainThreadsWord2Vec(bool use_gpu) {
// check outputs range // check outputs range
ASSERT_EQ(local_outputs.size(), 1UL); ASSERT_EQ(local_outputs.size(), 1UL);
const size_t len = local_outputs[0].data.length; const size_t len = local_outputs[0].data.length();
float* data = static_cast<float*>(local_outputs[0].data.data); float* data = static_cast<float*>(local_outputs[0].data.data());
for (size_t j = 0; j < len / sizeof(float); ++j) { for (size_t j = 0; j < len / sizeof(float); ++j) {
ASSERT_LT(data[j], 1.0); ASSERT_LT(data[j], 1.0);
ASSERT_GT(data[j], -1.0); ASSERT_GT(data[j], -1.0);
...@@ -205,7 +201,6 @@ void MainThreadsWord2Vec(bool use_gpu) { ...@@ -205,7 +201,6 @@ void MainThreadsWord2Vec(bool use_gpu) {
for (int i = 0; i < refs[tid].numel(); ++i) { for (int i = 0; i < refs[tid].numel(); ++i) {
EXPECT_NEAR(ref_data[i], data[i], 1e-3); EXPECT_NEAR(ref_data[i], data[i], 1e-3);
} }
free(data);
}); });
} }
for (int i = 0; i < num_jobs; ++i) { for (int i = 0; i < num_jobs; ++i) {
...@@ -251,14 +246,13 @@ void MainThreadsImageClassification(bool use_gpu) { ...@@ -251,14 +246,13 @@ void MainThreadsImageClassification(bool use_gpu) {
// check outputs correctness // check outputs correctness
ASSERT_EQ(local_outputs.size(), 1UL); ASSERT_EQ(local_outputs.size(), 1UL);
const size_t len = local_outputs[0].data.length; const size_t len = local_outputs[0].data.length();
float* data = static_cast<float*>(local_outputs[0].data.data); float* data = static_cast<float*>(local_outputs[0].data.data());
float* ref_data = refs[tid].data<float>(); float* ref_data = refs[tid].data<float>();
EXPECT_EQ(refs[tid].numel(), len / sizeof(float)); EXPECT_EQ(refs[tid].numel(), len / sizeof(float));
for (int i = 0; i < refs[tid].numel(); ++i) { for (int i = 0; i < refs[tid].numel(); ++i) {
EXPECT_NEAR(ref_data[i], data[i], 1e-3); EXPECT_NEAR(ref_data[i], data[i], 1e-3);
} }
free(data);
}); });
} }
for (int i = 0; i < num_jobs; ++i) { for (int i = 0; i < num_jobs; ++i) {
......
...@@ -57,6 +57,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( ...@@ -57,6 +57,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
for (auto &p : params) { for (auto &p : params) {
grad_names_.insert(GradVarName(p)); grad_names_.insert(GradVarName(p));
} }
balance_vars_.resize(places_.size(), 0);
} }
void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
...@@ -140,11 +141,30 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp( ...@@ -140,11 +141,30 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp(
checker(op.InputArgumentNames(), recv_vars); checker(op.InputArgumentNames(), recv_vars);
} }
size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
const std::vector<std::string> &var_names) const {
int64_t numel_sum = 0;
for (auto var_name : var_names) {
auto var_desc = all_vars_.at(var_name);
PADDLE_ENFORCE_NOT_NULL(var_desc);
auto dim = framework::make_ddim(var_desc->GetShape());
int64_t numel = framework::product(dim);
PADDLE_ENFORCE_GT(numel, 0);
numel_sum += numel;
}
auto smallest =
std::min_element(std::begin(balance_vars_), std::end(balance_vars_));
size_t dev_id =
static_cast<size_t>(std::distance(std::begin(balance_vars_), smallest));
balance_vars_[dev_id] += numel_sum;
return dev_id;
}
std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
const ProgramDesc &program) const { const ProgramDesc &program) const {
std::unordered_map<std::string, VarDesc *> all_vars;
for (auto *var : program.Block(0).AllVars()) { for (auto *var : program.Block(0).AllVars()) {
all_vars[var->Name()] = var; all_vars_.emplace(var->Name(), var);
} }
auto graph = new SSAGraph(); auto graph = new SSAGraph();
...@@ -161,35 +181,16 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( ...@@ -161,35 +181,16 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
auto send_vars = FindDistTrainSendVars(program); auto send_vars = FindDistTrainSendVars(program);
auto recv_vars = FindDistTrainRecvVars(program); auto recv_vars = FindDistTrainRecvVars(program);
std::vector<std::unordered_set<std::string>> var_name_on_devices;
std::vector<std::unordered_set<std::string>> bcast_var_name_set; std::vector<std::unordered_set<std::string>> bcast_var_name_set;
var_name_on_devices.resize(places_.size());
bcast_var_name_set.resize(places_.size()); bcast_var_name_set.resize(places_.size());
size_t cur_device_id = 0; size_t cur_device_id = 0;
std::vector<int64_t> balance_grads(places_.size(), 0);
auto get_appropriate_dev = [&](std::string &g_name) -> size_t {
auto var_desc = all_vars.at(g_name);
PADDLE_ENFORCE_NOT_NULL(var_desc);
auto dim = framework::make_ddim(var_desc->GetShape());
int64_t numel = framework::product(dim);
PADDLE_ENFORCE_GE(numel, 0);
auto smallest =
std::min_element(std::begin(balance_grads), std::end(balance_grads));
size_t dev_id =
static_cast<size_t>(std::distance(std::begin(balance_grads), smallest));
balance_grads[dev_id] += numel;
return dev_id;
};
bool is_forwarding = true; bool is_forwarding = true;
for (auto *op : program.Block(0).AllOps()) { for (auto *op : program.Block(0).AllOps()) {
if (boost::get<int>( if (boost::get<int>(
op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
static_cast<int>(OpRole::kRPC)) { static_cast<int>(OpRole::kRPC)) {
// append rpc op if program is distributed trainer main program.
// always use the first device
CreateRPCOp(&result, *op); CreateRPCOp(&result, *op);
} else if (IsDistTrainOp(*op, send_vars, recv_vars)) { } else if (IsDistTrainOp(*op, send_vars, recv_vars)) {
CreateDistTrainOp(&result, *op); CreateDistTrainOp(&result, *op);
...@@ -199,15 +200,19 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( ...@@ -199,15 +200,19 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
BuildStrategy::GradientScaleStrategy::kCustomized) { BuildStrategy::GradientScaleStrategy::kCustomized) {
CreateScaleLossGradOp(&result); CreateScaleLossGradOp(&result);
} }
// This assumes the backward generating code will ensure IsScaleLossOp
// is true only for the op that scale the final scalar loss.
// It also assumes backward op will always follow the forward op in
// the block.
is_forwarding = false; is_forwarding = false;
} else { } else {
int op_dev_id = GetOpDeviceID(var_name_on_devices, *op); int op_dev_id = GetOpDeviceID(*op);
if (op_dev_id == -1) { // var on all device if (op_dev_id == -1) { // var on all device
CreateComputationalOps(&result, *op, places_.size()); CreateComputationalOps(&result, *op, places_.size());
} else { } else {
CreateComputationalOp(&result, *op, op_dev_id); CreateComputationalOp(&result, *op, op_dev_id);
for (auto &var_name : op->OutputArgumentNames()) { for (auto &var_name : op->OutputArgumentNames()) {
var_name_on_devices[op_dev_id].emplace(var_name); var_name_on_devices_.emplace(var_name, op_dev_id);
} }
} }
if (!is_forwarding && places_.size() > 1) { if (!is_forwarding && places_.size() > 1) {
...@@ -230,19 +235,22 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( ...@@ -230,19 +235,22 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
switch (strategy_.reduce_) { switch (strategy_.reduce_) {
case BuildStrategy::ReduceStrategy::kReduce: case BuildStrategy::ReduceStrategy::kReduce:
cur_device_id = get_appropriate_dev(g_name); cur_device_id = GetAppropriateDeviceID({g_name});
CreateReduceOp(&result, g_name, cur_device_id); CreateReduceOp(&result, g_name, cur_device_id);
var_name_on_devices[cur_device_id].emplace(g_name); var_name_on_devices_.emplace(g_name, cur_device_id);
bcast_var_name_set[cur_device_id].emplace(p_name); bcast_var_name_set[cur_device_id].emplace(p_name);
break; break;
case BuildStrategy::ReduceStrategy::kAllReduce: case BuildStrategy::ReduceStrategy::kAllReduce:
if (IsSparseGradient(all_vars, g_name)) { if (IsSparseGradient(g_name)) {
CreateReduceOp(&result, g_name, 0); CreateReduceOp(&result, g_name, 0);
CreateBroadcastOp(&result, g_name, 0); CreateBroadcastOp(&result, g_name, 0);
} else { } else {
InsertAllReduceOp(&result, g_name); InsertAllReduceOp(&result, g_name);
} }
break; break;
default:
LOG(FATAL) << "Unknown reduce strategy ";
break;
} }
} }
} catch (boost::bad_get e) { } catch (boost::bad_get e) {
...@@ -261,7 +269,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( ...@@ -261,7 +269,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
} }
/* /*
Dependency graph has been constructed. However, there are still data Dependency graph has been constructed. However, there are still data
harzaeds need to be handled. hazards need to be handled.
*/ */
PolishGraphToSupportDataHazards(&result); PolishGraphToSupportDataHazards(&result);
...@@ -273,11 +281,9 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( ...@@ -273,11 +281,9 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
return std::unique_ptr<SSAGraph>(graph); return std::unique_ptr<SSAGraph>(graph);
} }
bool MultiDevSSAGraphBuilder::IsSparseGradient( bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const {
const std::unordered_map<std::string, VarDesc *> &all_vars, PADDLE_ENFORCE(all_vars_.count(og) != 0);
const std::string &og) const { if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) {
PADDLE_ENFORCE(all_vars.count(og) != 0);
if (all_vars.at(og)->GetType() == proto::VarType::SELECTED_ROWS) {
return true; return true;
} }
return false; return false;
...@@ -363,24 +369,23 @@ bool MultiDevSSAGraphBuilder::IsParameterGradientOnce( ...@@ -363,24 +369,23 @@ bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
return is_pg_once; return is_pg_once;
} }
int MultiDevSSAGraphBuilder::GetOpDeviceID( int MultiDevSSAGraphBuilder::GetOpDeviceID(const OpDesc &op) const {
const std::vector<std::unordered_set<std::string>> &var_name_on_devices,
const OpDesc &op) const {
if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
return -1; return -1;
} }
int var_dev_id = -1; for (auto &varname : op.InputArgumentNames()) {
for (auto &var_name : op.InputArgumentNames()) { int dev_id = GetVarDeviceID(varname);
if (var_dev_id != -1) break; if (dev_id != -1) {
for (size_t i = 0; i < var_name_on_devices.size(); ++i) { return dev_id;
if (var_name_on_devices[i].count(var_name)) {
var_dev_id = static_cast<int>(i);
break;
}
} }
} }
return var_dev_id; return -1;
}
int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const {
auto got = var_name_on_devices_.find(varname);
return got == var_name_on_devices_.end() ? -1 : got->second;
} }
void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const { void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
...@@ -449,6 +454,8 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result, ...@@ -449,6 +454,8 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result,
return var; return var;
} }
// Find the first occurence of `prev_op_name` and make current `op` depend
// on it.
void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op, void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op,
const std::string &prev_op_name) const { const std::string &prev_op_name) const {
for (auto &prev_op : result->ops_) { for (auto &prev_op : result->ops_) {
...@@ -463,16 +470,66 @@ void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op, ...@@ -463,16 +470,66 @@ void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op,
void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result, void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result,
const OpDesc &op) const { const OpDesc &op) const {
CreateComputationalOp(result, op, 0); int op_dev_id = -1;
if (op.Type() == "split_byref") {
op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames());
for (auto &varname : op.InputArgumentNames()) {
var_name_on_devices_.emplace(varname, op_dev_id);
}
}
for (auto &varname : op.OutputArgumentNames()) {
var_name_on_devices_.emplace(varname, op_dev_id);
}
} else if (op.Type() == "concat") {
op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
} else {
PADDLE_ENFORCE(
"the distribute training related op should be in [split_byref, "
"concat].");
}
PADDLE_ENFORCE(op_dev_id != -1,
"can not find right place for distributed op: %s", op.Type());
CreateComputationalOp(result, op, op_dev_id);
if (op.Type() == "concat") { if (op.Type() == "concat") {
ConnectOp(result, result->ops_.back().get(), "fetch_barrier"); ConnectOp(result, result->ops_.back().get(), "fetch_barrier");
} }
} }
// Create RPC related op handles that connects its in ops and out ops.
void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result,
const OpDesc &op) const { const OpDesc &op) const {
result->ops_.emplace_back( int op_dev_id = -1;
new RPCOpHandle(op, local_scopes_[0], op.Type(), places_[0])); if (op.Type() == "send") {
op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
// the variable name which contains .block means it was splited by
// split_byref op
// so that we can balance the variable blocks to all the pserver instances.
if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce &&
op.InputArgumentNames()[0].find(".block") == std::string::npos) {
op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames());
for (auto &varname : op.InputArgumentNames()) {
var_name_on_devices_.emplace(varname, op_dev_id);
}
}
} else if (op.Type() == "recv") {
op_dev_id = GetAppropriateDeviceID(op.OutputArgumentNames());
for (auto &varname : op.OutputArgumentNames()) {
var_name_on_devices_.emplace(varname, op_dev_id);
}
} else {
// send_barrier and fetch_barrier op can be scheduled on device 0
op_dev_id = 0;
}
PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s",
op.Type());
result->ops_.emplace_back(new RPCOpHandle(op, local_scopes_[op_dev_id],
op.Type(), places_[op_dev_id]));
if (op.Type() == "send_barrier") { if (op.Type() == "send_barrier") {
ConnectOp(result, result->ops_.back().get(), "send"); ConnectOp(result, result->ops_.back().get(), "send");
...@@ -488,9 +545,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, ...@@ -488,9 +545,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result,
"send, send_barrier. recv, fetch_barrier]"); "send, send_barrier. recv, fetch_barrier]");
} }
// TODO(Yancey1989): schedule rpc op on different place may CreateOpHandleIOs(result, op, op_dev_id);
// increate throughput
CreateOpHandleIOs(result, op, 0);
} }
bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const { bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {
......
...@@ -47,10 +47,11 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { ...@@ -47,10 +47,11 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
#endif #endif
std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override; std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
int GetVarDeviceID(const std::string &varname) const;
private: private:
void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op, void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op,
size_t place_id) const; size_t device_id) const;
private: private:
std::string loss_var_name_; std::string loss_var_name_;
...@@ -96,21 +97,23 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { ...@@ -96,21 +97,23 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
const std::string &og, const std::string &og,
std::unordered_set<std::string> *og_has_been_broadcast) const; std::unordered_set<std::string> *og_has_been_broadcast) const;
int GetOpDeviceID( int GetOpDeviceID(const OpDesc &op) const;
const std::vector<std::unordered_set<std::string>> &var_name_on_devices,
const OpDesc &op) const;
void InsertAllReduceOp(SSAGraph *result, const std::string &og) const; void InsertAllReduceOp(SSAGraph *result, const std::string &og) const;
void CreateBroadcastOp(SSAGraph *result, const std::string &p_name, void CreateBroadcastOp(SSAGraph *result, const std::string &p_name,
size_t src_dev_id) const; size_t src_dev_id) const;
bool IsSparseGradient( bool IsSparseGradient(const std::string &og) const;
const std::unordered_map<std::string, VarDesc *> &all_vars,
const std::string &og) const; size_t GetAppropriateDeviceID(
const std::vector<std::string> &var_names) const;
private: private:
BuildStrategy strategy_; BuildStrategy strategy_;
mutable std::unordered_map<std::string, VarDesc *> all_vars_;
mutable std::unordered_map<std::string, int> var_name_on_devices_;
mutable std::vector<int64_t> balance_vars_;
void SetCommunicationContext(OpHandleBase *op_handle, void SetCommunicationContext(OpHandleBase *op_handle,
const platform::Place &p) const; const platform::Place &p) const;
......
...@@ -30,6 +30,7 @@ class SSAGraphBuilder { ...@@ -30,6 +30,7 @@ class SSAGraphBuilder {
SSAGraphBuilder() {} SSAGraphBuilder() {}
virtual ~SSAGraphBuilder() {} virtual ~SSAGraphBuilder() {}
virtual std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const = 0; virtual std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const = 0;
virtual int GetVarDeviceID(const std::string &var_name) const { return -1; }
DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder);
......
...@@ -96,6 +96,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( ...@@ -96,6 +96,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
auto cur_ready_vars = ready_vars.PopAll(1, &timeout); auto cur_ready_vars = ready_vars.PopAll(1, &timeout);
if (timeout) { if (timeout) {
std::lock_guard<std::mutex> l(exception_mu_);
if (exception_) { if (exception_) {
auto exp = *exception_; auto exp = *exception_;
exception_.reset(); exception_.reset();
...@@ -199,6 +200,7 @@ void ThreadedSSAGraphExecutor::RunOp( ...@@ -199,6 +200,7 @@ void ThreadedSSAGraphExecutor::RunOp(
ready_var_q->Extend(op->Outputs()); ready_var_q->Extend(op->Outputs());
VLOG(10) << op << " " << op->Name() << "Signal posted"; VLOG(10) << op << " " << op->Name() << "Signal posted";
} catch (platform::EnforceNotMet ex) { } catch (platform::EnforceNotMet ex) {
std::lock_guard<std::mutex> l(exception_mu_);
exception_.reset(new platform::EnforceNotMet(ex)); exception_.reset(new platform::EnforceNotMet(ex));
} catch (...) { } catch (...) {
LOG(FATAL) << "Unknown exception catched"; LOG(FATAL) << "Unknown exception catched";
......
...@@ -56,6 +56,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { ...@@ -56,6 +56,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
std::vector<Scope *> local_scopes_; std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_; std::vector<platform::Place> places_;
platform::DeviceContextPool fetch_ctxs_; platform::DeviceContextPool fetch_ctxs_;
std::mutex exception_mu_;
std::unique_ptr<platform::EnforceNotMet> exception_; std::unique_ptr<platform::EnforceNotMet> exception_;
std::atomic<int> running_ops_; std::atomic<int> running_ops_;
......
...@@ -21,7 +21,7 @@ limitations under the License. */ ...@@ -21,7 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/reader.h"
#ifdef PADDLE_WITH_DISTRIBUTE #ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/detail/grpc_client.h" #include "paddle/fluid/operators/distributed/grpc_client.h"
#endif #endif
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -49,8 +49,8 @@ Executor::Executor(const platform::Place& place) : place_(place) {} ...@@ -49,8 +49,8 @@ Executor::Executor(const platform::Place& place) : place_(place) {}
#ifdef PADDLE_WITH_DISTRIBUTE #ifdef PADDLE_WITH_DISTRIBUTE
void Executor::Complete() { void Executor::Complete() {
::paddle::operators::detail::RPCClient::GetInstance< ::paddle::operators::distributed::RPCClient::GetInstance<
::paddle::operators::detail::GRPCClient>() ::paddle::operators::distributed::GRPCClient>()
->SendComplete(); ->SendComplete();
} }
#endif #endif
...@@ -321,7 +321,8 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare( ...@@ -321,7 +321,8 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
} }
void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope, bool create_vars) { bool create_local_scope, bool create_vars,
bool keep_kids) {
Scope* local_scope = scope; Scope* local_scope = scope;
if (create_vars) { if (create_vars) {
if (create_local_scope) { if (create_local_scope) {
...@@ -344,12 +345,20 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, ...@@ -344,12 +345,20 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
} }
} }
platform::DeviceContextPool::Instance().Get(place_)->Wait(); platform::DeviceContextPool::Instance().Get(place_)->Wait();
if (create_vars && create_local_scope) { if (local_scope != scope) {
scope->DeleteScope(local_scope); scope->DeleteScope(local_scope);
} else { } else {
// Delete the local scopes created in operators. if (!keep_kids) {
scope->DropKids(); // By default, we should delete all kid scopes after run executor because
// some operators may create local scope when running, such as while_op.
// But when while_op also create a local executor to run it's sub block,
// the sub scopes it created should not be dropped immediately, because
// while_grad_op will use some variables created during while_op run, so
// we need to keep the kids and wait for the outer executor to drop them.
scope->DropKids();
}
} }
if (FLAGS_benchmark) { if (FLAGS_benchmark) {
VLOG(2) << "-------------------------------------------------------"; VLOG(2) << "-------------------------------------------------------";
VLOG(2) << "Memory used after deleting local scope: " VLOG(2) << "Memory used after deleting local scope: "
......
...@@ -78,7 +78,7 @@ class Executor { ...@@ -78,7 +78,7 @@ class Executor {
void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope = true, bool create_local_scope = true,
bool create_vars = true); bool create_vars = true, bool keep_kids = false);
void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
std::map<std::string, const LoDTensor*>* feed_targets, std::map<std::string, const LoDTensor*>* feed_targets,
......
...@@ -110,7 +110,6 @@ ParallelExecutor::ParallelExecutor( ...@@ -110,7 +110,6 @@ ParallelExecutor::ParallelExecutor(
// Step 3. Convert main_program to SSA form and dependency graph. Also, insert // Step 3. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp // ncclOp
details::SSAGraphBuilderFactory builder_factory( details::SSAGraphBuilderFactory builder_factory(
member_->places_, loss_var_name, params, member_->local_scopes_, member_->places_, loss_var_name, params, member_->local_scopes_,
build_strategy); build_strategy);
...@@ -122,9 +121,10 @@ ParallelExecutor::ParallelExecutor( ...@@ -122,9 +121,10 @@ ParallelExecutor::ParallelExecutor(
#endif #endif
} }
builder_ = std::move(builder_factory.Create());
member_->executor_.reset(new details::ThreadedSSAGraphExecutor( member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, places, exec_strategy, member_->local_scopes_, places,
builder_factory.Create()->Build(main_program))); builder_->Build(main_program)));
member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, std::move(var_infos), exec_strategy, member_->local_scopes_, std::move(var_infos),
...@@ -133,10 +133,22 @@ ParallelExecutor::ParallelExecutor( ...@@ -133,10 +133,22 @@ ParallelExecutor::ParallelExecutor(
void ParallelExecutor::BCastParamsToGPUs( void ParallelExecutor::BCastParamsToGPUs(
const std::unordered_set<std::string> &vars) const { const std::unordered_set<std::string> &vars) const {
auto *main_scope = member_->local_scopes_[0]; // the the initialize bcast, all vars would be bcast from device(0), otherwise
// bcast from the specified device.
bool initialize = builder_.get() == nullptr ? true : false;
for (auto &var : vars) { for (auto &var : vars) {
auto *main_var = main_scope->FindVar(var); int var_dev_id =
builder_.get() == nullptr ? -1 : builder_->GetVarDeviceID(var);
if (!initialize && var_dev_id == -1) continue;
framework::Variable *main_var = nullptr;
if (initialize) {
main_var = member_->local_scopes_[0]->FindVar(var);
} else {
main_var = member_->local_scopes_[var_dev_id]->FindVar(var);
}
if (main_var == nullptr || !main_var->IsType<LoDTensor>()) { if (main_var == nullptr || !main_var->IsType<LoDTensor>()) {
continue; continue;
} }
...@@ -151,7 +163,8 @@ void ParallelExecutor::BCastParamsToGPUs( ...@@ -151,7 +163,8 @@ void ParallelExecutor::BCastParamsToGPUs(
for (size_t i = 0; i < member_->places_.size(); ++i) { for (size_t i = 0; i < member_->places_.size(); ++i) {
auto place = member_->places_[i]; auto place = member_->places_[i];
void *buffer; void *buffer;
if (i == 0) {
if ((initialize && i == 0) || (!initialize && i == var_dev_id)) {
buffer = const_cast<void *>(main_tensor.data<void>()); buffer = const_cast<void *>(main_tensor.data<void>());
} else { } else {
auto local_scope = member_->local_scopes_[i]; auto local_scope = member_->local_scopes_[i];
......
...@@ -19,12 +19,14 @@ limitations under the License. */ ...@@ -19,12 +19,14 @@ limitations under the License. */
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/execution_strategy.h"
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -68,6 +70,7 @@ class ParallelExecutor { ...@@ -68,6 +70,7 @@ class ParallelExecutor {
private: private:
ParallelExecutorPrivate *member_; ParallelExecutorPrivate *member_;
std::unique_ptr<details::SSAGraphBuilder> builder_;
}; };
} // namespace framework } // namespace framework
......
...@@ -27,7 +27,7 @@ void TensorRTSubGraphPass::Run(DataFlowGraph *graph) { ...@@ -27,7 +27,7 @@ void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
SubGraphFuse(graph, node_inside_subgraph_teller_); SubGraphFuse(graph, node_inside_subgraph_teller_);
} }
} // analysis } // namespace analysis
} // inference } // namespace inference
} // paddle } // namespace paddle
...@@ -184,9 +184,9 @@ else() ...@@ -184,9 +184,9 @@ else()
set(DEPS_OPS ${DEPS_OPS} nccl_op) set(DEPS_OPS ${DEPS_OPS} nccl_op)
endif() endif()
add_subdirectory(detail)
if(WITH_DISTRIBUTE) if(WITH_DISTRIBUTE)
add_subdirectory(distributed)
set(DISTRIBUTE_DEPS "") set(DISTRIBUTE_DEPS "")
if(WITH_GRPC) if(WITH_GRPC)
set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
...@@ -195,18 +195,11 @@ if(WITH_DISTRIBUTE) ...@@ -195,18 +195,11 @@ if(WITH_DISTRIBUTE)
endif() endif()
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
op_library(prefetch_op DEPS ${DISTRIBUTE_DEPS}) foreach(dist_op "prefetch_op" "listen_and_serv_op" "send_op" "recv_op" "send_barrier_op" "fetch_barrier_op")
set_source_files_properties(prefetch_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(${dist_op} DEPS ${DISTRIBUTE_DEPS})
op_library(recv_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(${dist_op}.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) endforeach()
op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(send_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS})
op_library(fetch_barrier_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(fetch_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
#set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) #set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
#cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op #cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op
# listen_and_serv_op sum_op executor SERIAL) # listen_and_serv_op sum_op executor SERIAL)
......
...@@ -143,7 +143,7 @@ $$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ ...@@ -143,7 +143,7 @@ $$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
__attribute__((unused)) constexpr char TanhShrinkDoc[] = R"DOC( __attribute__((unused)) constexpr char TanhShrinkDoc[] = R"DOC(
TanhShrink Activation Operator. TanhShrink Activation Operator.
$$out = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ $$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
)DOC"; )DOC";
...@@ -385,7 +385,7 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -385,7 +385,7 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC( AddComment(R"DOC(
STanh Activation Operator. STanh Activation Operator.
$$out = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$ $$out = b * \\frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$
)DOC"); )DOC");
} }
......
...@@ -21,8 +21,6 @@ namespace operators { ...@@ -21,8 +21,6 @@ namespace operators {
using batch_norm_bwd = mkldnn::batch_normalization_backward; using batch_norm_bwd = mkldnn::batch_normalization_backward;
using batch_norm_fwd = mkldnn::batch_normalization_forward; using batch_norm_fwd = mkldnn::batch_normalization_forward;
using framework::DataLayout;
using framework::Tensor;
using mkldnn::memory; using mkldnn::memory;
using mkldnn::primitive; using mkldnn::primitive;
using mkldnn::reorder; using mkldnn::reorder;
...@@ -31,18 +29,6 @@ using paddle::platform::MKLDNNDeviceContext; ...@@ -31,18 +29,6 @@ using paddle::platform::MKLDNNDeviceContext;
using paddle::platform::MKLDNNMemDesc; using paddle::platform::MKLDNNMemDesc;
using platform::to_void_cast; using platform::to_void_cast;
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
namespace { namespace {
template <typename T> template <typename T>
struct bn_type_traits { struct bn_type_traits {
......
...@@ -22,22 +22,6 @@ limitations under the License. */ ...@@ -22,22 +22,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using DataLayout = framework::DataLayout;
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
class BatchNormOp : public framework::OperatorWithKernel { class BatchNormOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
......
...@@ -19,6 +19,22 @@ limitations under the License. */ ...@@ -19,6 +19,22 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using DataLayout = framework::DataLayout;
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class BatchNormKernel : public framework::OpKernel<T> { class BatchNormKernel : public framework::OpKernel<T> {
public: public:
......
...@@ -110,6 +110,7 @@ REGISTER_OPERATOR(bilinear_interp, ops::BilinearInterpOp, ...@@ -110,6 +110,7 @@ REGISTER_OPERATOR(bilinear_interp, ops::BilinearInterpOp,
ops::BilinearInterpOpMaker, ops::BilinearInterpOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(bilinear_interp_grad, ops::BilinearInterpOpGrad); REGISTER_OPERATOR(bilinear_interp_grad, ops::BilinearInterpOpGrad);
REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::BilinearInterpKernel<float>); REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::BilinearInterpKernel<float>,
ops::BilinearInterpKernel<uint8_t>);
REGISTER_OP_CPU_KERNEL(bilinear_interp_grad, REGISTER_OP_CPU_KERNEL(bilinear_interp_grad,
ops::BilinearInterpGradKernel<float>); ops::BilinearInterpGradKernel<float>);
...@@ -46,8 +46,10 @@ class BilinearInterpKernel : public framework::OpKernel<T> { ...@@ -46,8 +46,10 @@ class BilinearInterpKernel : public framework::OpKernel<T> {
int in_chw = channels * in_hw; int in_chw = channels * in_hw;
int out_chw = channels * out_hw; int out_chw = channels * out_hw;
T ratio_h = (out_h > 1) ? static_cast<T>(in_h - 1) / (out_h - 1) : 0.f; float ratio_h =
T ratio_w = (out_w > 1) ? static_cast<T>(in_w - 1) / (out_w - 1) : 0.f; (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
float ratio_w =
(out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
if (in_h == out_h && in_w == out_w) { if (in_h == out_h && in_w == out_w) {
memcpy(output, input, input_t->numel() * sizeof(T)); memcpy(output, input, input_t->numel() * sizeof(T));
...@@ -56,24 +58,24 @@ class BilinearInterpKernel : public framework::OpKernel<T> { ...@@ -56,24 +58,24 @@ class BilinearInterpKernel : public framework::OpKernel<T> {
for (int i = 0; i < out_h; ++i) { // loop for images for (int i = 0; i < out_h; ++i) { // loop for images
int h = ratio_h * i; int h = ratio_h * i;
int hid = (h < in_h - 1) ? 1 : 0; int hid = (h < in_h - 1) ? 1 : 0;
T h1lambda = ratio_h * i - h; float h1lambda = ratio_h * i - h;
T h2lambda = 1 - h1lambda; float h2lambda = 1.f - h1lambda;
for (int j = 0; j < out_w; ++j) { for (int j = 0; j < out_w; ++j) {
int w = ratio_w * j; int w = ratio_w * j;
int wid = (w < in_w - 1) ? 1 : 0; int wid = (w < in_w - 1) ? 1 : 0;
T w1lambda = ratio_w * j - w; float w1lambda = ratio_w * j - w;
T w2lambda = 1 - w1lambda; float w2lambda = 1.f - w1lambda;
// calculate four position for bilinear interpolation // calculate four position for bilinear interpolation
const T* in_pos = &input[k * in_chw + h * in_w + w]; const T* in_pos = &input[k * in_chw + h * in_w + w];
T* out_pos = &output[k * out_chw + i * out_w + j]; T* out_pos = &output[k * out_chw + i * out_w + j];
for (int c = 0; c < channels; ++c) { // loop for channels for (int c = 0; c < channels; ++c) { // loop for channels
// bilinear interpolation // bilinear interpolation
out_pos[0] = out_pos[0] = static_cast<T>(
h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[wid]) + h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[wid]) +
h1lambda * (w2lambda * in_pos[hid * in_w] + h1lambda * (w2lambda * in_pos[hid * in_w] +
w1lambda * in_pos[hid * in_w + wid]); w1lambda * in_pos[hid * in_w + wid]));
in_pos += in_hw; in_pos += in_hw;
out_pos += out_hw; out_pos += out_hw;
} }
...@@ -117,8 +119,10 @@ class BilinearInterpGradKernel : public framework::OpKernel<T> { ...@@ -117,8 +119,10 @@ class BilinearInterpGradKernel : public framework::OpKernel<T> {
int in_chw = channels * in_hw; int in_chw = channels * in_hw;
int out_chw = channels * out_hw; int out_chw = channels * out_hw;
T ratio_h = (out_h > 1) ? static_cast<T>(in_h - 1) / (out_h - 1) : 0.f; float ratio_h =
T ratio_w = (out_w > 1) ? static_cast<T>(in_w - 1) / (out_w - 1) : 0.f; (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
float ratio_w =
(out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
if (in_h == out_h && in_w == out_w) { if (in_h == out_h && in_w == out_w) {
memcpy(d_input, d_output, d_input_t->numel() * sizeof(T)); memcpy(d_input, d_output, d_input_t->numel() * sizeof(T));
...@@ -127,22 +131,24 @@ class BilinearInterpGradKernel : public framework::OpKernel<T> { ...@@ -127,22 +131,24 @@ class BilinearInterpGradKernel : public framework::OpKernel<T> {
for (int i = 0; i < out_h; ++i) { // loop for images for (int i = 0; i < out_h; ++i) { // loop for images
int h = ratio_h * i; int h = ratio_h * i;
int hid = (h < in_h - 1) ? 1 : 0; int hid = (h < in_h - 1) ? 1 : 0;
T h1lambda = ratio_h * i - h; float h1lambda = ratio_h * i - h;
T h2lambda = 1 - h1lambda; float h2lambda = 1 - h1lambda;
for (int j = 0; j < out_w; ++j) { for (int j = 0; j < out_w; ++j) {
int w = ratio_w * j; int w = ratio_w * j;
int wid = (w < in_w - 1) ? 1 : 0; int wid = (w < in_w - 1) ? 1 : 0;
T w1lambda = ratio_w * j - w; float w1lambda = ratio_w * j - w;
T w2lambda = 1 - w1lambda; float w2lambda = 1 - w1lambda;
T* in_pos = &d_input[k * in_chw + h * in_w + w]; T* in_pos = &d_input[k * in_chw + h * in_w + w];
const T* out_pos = &d_output[k * out_chw + i * out_w + j]; const T* out_pos = &d_output[k * out_chw + i * out_w + j];
for (int c = 0; c < channels; ++c) { // loop for channels for (int c = 0; c < channels; ++c) { // loop for channels
in_pos[0] += h2lambda * w2lambda * out_pos[0]; in_pos[0] += static_cast<T>(h2lambda * w2lambda * out_pos[0]);
in_pos[wid] += h2lambda * w1lambda * out_pos[0]; in_pos[wid] += static_cast<T>(h2lambda * w1lambda * out_pos[0]);
in_pos[hid * in_w] += h1lambda * w2lambda * out_pos[0]; in_pos[hid * in_w] +=
in_pos[hid * in_w + wid] += h1lambda * w1lambda * out_pos[0]; static_cast<T>(h1lambda * w2lambda * out_pos[0]);
in_pos[hid * in_w + wid] +=
static_cast<T>(h1lambda * w1lambda * out_pos[0]);
in_pos += in_hw; in_pos += in_hw;
out_pos += out_hw; out_pos += out_hw;
} }
......
...@@ -15,13 +15,13 @@ ...@@ -15,13 +15,13 @@
#pragma once #pragma once
#ifdef PADDLE_WITH_GRPC #ifdef PADDLE_WITH_GRPC
#include "paddle/fluid/operators/detail/grpc_client.h" #include "paddle/fluid/operators/distributed/grpc_client.h"
#include "paddle/fluid/operators/detail/grpc_server.h" #include "paddle/fluid/operators/distributed/grpc_server.h"
#define RPCSERVER_T detail::AsyncGRPCServer #define RPCSERVER_T distributed::AsyncGRPCServer
#define RPCCLIENT_T detail::GRPCClient #define RPCCLIENT_T distributed::GRPCClient
#else #else
#include "paddle/fluid/operators/detail/brpc_client.h" #include "paddle/fluid/operators/distributed/brpc_client.h"
#include "paddle/fluid/operators/detail/brpc_server.h" #include "paddle/fluid/operators/distributed/brpc_server.h"
#define RPCSERVER_T detail::AsyncBRPCServer #define RPCSERVER_T distributed::AsyncBRPCServer
#define RPCCLIENT_T detail::BRPCClient #define RPCCLIENT_T distributed::BRPCClient
#endif #endif
...@@ -175,12 +175,12 @@ class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -175,12 +175,12 @@ class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC( AddComment(R"DOC(
Detection mAP evaluate operator. Detection mAP evaluate operator.
The general steps are as follows. First, calculate the true positive and The general steps are as follows. First, calculate the true positive and
false positive according to the input of detection and labels, then false positive according to the input of detection and labels, then
calculate the mAP evaluate value. calculate the mAP evaluate value.
Supporting '11 point' and 'integral' mAP algorithm. Please get more information Supporting '11 point' and 'integral' mAP algorithm. Please get more information
from the following articles: from the following articles:
https://sanchom.wordpress.com/tag/average-precision/ https://sanchom.wordpress.com/tag/average-precision/
https://arxiv.org/abs/1512.02325 https://arxiv.org/abs/1512.02325
)DOC"); )DOC");
} }
......
if(NOT WITH_DISTRIBUTE)
return()
endif()
if(WITH_GRPC) if(WITH_GRPC)
grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor
......
...@@ -12,12 +12,12 @@ ...@@ -12,12 +12,12 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/detail/brpc_client.h" #include "paddle/fluid/operators/distributed/brpc_client.h"
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
DEFINE_int32(brpc_channel_num, 24, DEFINE_int32(brpc_channel_num, 24,
"Number of channels to send requests connected to one server"); "Number of channels to send requests connected to one server");
...@@ -175,6 +175,6 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { ...@@ -175,6 +175,6 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) {
return q; return q;
} }
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -31,13 +31,13 @@ limitations under the License. */ ...@@ -31,13 +31,13 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/detail/rpc_client.h" #include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
struct ChannelContext { struct ChannelContext {
brpc::Channel channel; brpc::Channel channel;
...@@ -95,6 +95,6 @@ class BRPCClient : public RPCClient { ...@@ -95,6 +95,6 @@ class BRPCClient : public RPCClient {
DISABLE_COPY_AND_ASSIGN(BRPCClient); DISABLE_COPY_AND_ASSIGN(BRPCClient);
}; };
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -12,13 +12,13 @@ ...@@ -12,13 +12,13 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/detail/brpc_server.h" #include "paddle/fluid/operators/distributed/brpc_server.h"
#include "paddle/fluid/operators/detail/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
namespace sendrecv { namespace sendrecv {
typedef std::unordered_map<std::string, typedef std::unordered_map<std::string,
paddle::operators::detail::RequestHandler*> paddle::operators::distributed::RequestHandler*>
HandlerMap; HandlerMap;
class BRPCServiceImpl : public SendRecvService { class BRPCServiceImpl : public SendRecvService {
...@@ -27,17 +27,17 @@ class BRPCServiceImpl : public SendRecvService { ...@@ -27,17 +27,17 @@ class BRPCServiceImpl : public SendRecvService {
: request_send_h_(nullptr), : request_send_h_(nullptr),
request_get_h_(nullptr), request_get_h_(nullptr),
request_prefetch_h_(nullptr) { request_prefetch_h_(nullptr) {
auto it = rpc_call_map.find(paddle::operators::detail::kRequestSend); auto it = rpc_call_map.find(paddle::operators::distributed::kRequestSend);
if (it != rpc_call_map.end()) { if (it != rpc_call_map.end()) {
request_send_h_ = it->second; request_send_h_ = it->second;
} }
it = rpc_call_map.find(paddle::operators::detail::kRequestSend); it = rpc_call_map.find(paddle::operators::distributed::kRequestSend);
if (it != rpc_call_map.end()) { if (it != rpc_call_map.end()) {
request_get_h_ = it->second; request_get_h_ = it->second;
} }
it = rpc_call_map.find(paddle::operators::detail::kRequestPrefetch); it = rpc_call_map.find(paddle::operators::distributed::kRequestPrefetch);
if (it != rpc_call_map.end()) { if (it != rpc_call_map.end()) {
request_prefetch_h_ = it->second; request_prefetch_h_ = it->second;
} }
...@@ -88,15 +88,15 @@ class BRPCServiceImpl : public SendRecvService { ...@@ -88,15 +88,15 @@ class BRPCServiceImpl : public SendRecvService {
} }
private: private:
paddle::operators::detail::RequestHandler* request_send_h_; paddle::operators::distributed::RequestHandler* request_send_h_;
paddle::operators::detail::RequestHandler* request_get_h_; paddle::operators::distributed::RequestHandler* request_get_h_;
paddle::operators::detail::RequestHandler* request_prefetch_h_; paddle::operators::distributed::RequestHandler* request_prefetch_h_;
}; };
} // namespace sendrecv } // namespace sendrecv
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
void AsyncBRPCServer::StartServer() { void AsyncBRPCServer::StartServer() {
// Instance of your service. // Instance of your service.
...@@ -139,6 +139,6 @@ void AsyncBRPCServer::WaitServerReady() { ...@@ -139,6 +139,6 @@ void AsyncBRPCServer::WaitServerReady() {
VLOG(3) << "AsyncGRPCServer WaitSeverReady"; VLOG(3) << "AsyncGRPCServer WaitSeverReady";
} }
}; // namespace detail }; // namespace distributed
}; // namespace operators }; // namespace operators
}; // namespace paddle }; // namespace paddle
...@@ -19,12 +19,12 @@ limitations under the License. */ ...@@ -19,12 +19,12 @@ limitations under the License. */
#include <string> #include <string>
#include "brpc/server.h" #include "brpc/server.h"
#include "paddle/fluid/operators/detail/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
class AsyncBRPCServer final : public RPCServer { class AsyncBRPCServer final : public RPCServer {
public: public:
...@@ -48,6 +48,6 @@ class AsyncBRPCServer final : public RPCServer { ...@@ -48,6 +48,6 @@ class AsyncBRPCServer final : public RPCServer {
int ready_; int ready_;
}; };
}; // namespace detail }; // namespace distributed
}; // namespace operators }; // namespace operators
}; // namespace paddle }; // namespace paddle
...@@ -17,11 +17,11 @@ limitations under the License. */ ...@@ -17,11 +17,11 @@ limitations under the License. */
// file and did some modifications so that we can send gRPC // file and did some modifications so that we can send gRPC
// requests without too much copying of the tensor data. // requests without too much copying of the tensor data.
#include "paddle/fluid/operators/detail/bytebuffer_stream.h" #include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
GrpcByteBufferSource::GrpcByteBufferSource() {} GrpcByteBufferSource::GrpcByteBufferSource() {}
...@@ -83,6 +83,6 @@ google::protobuf::int64 GrpcByteBufferSource::ByteCount() const { ...@@ -83,6 +83,6 @@ google::protobuf::int64 GrpcByteBufferSource::ByteCount() const {
return byte_count_; return byte_count_;
} }
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -106,7 +106,7 @@ class GrpcBufferReader final ...@@ -106,7 +106,7 @@ class GrpcBufferReader final
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
// Source provides a way for a particular RPC implementation to provide // Source provides a way for a particular RPC implementation to provide
// received data to ParseFrom. // received data to ParseFrom.
class Source { class Source {
...@@ -183,6 +183,6 @@ class GrpcByteSource : public Source { ...@@ -183,6 +183,6 @@ class GrpcByteSource : public Source {
char space_[sizeof(Reader)]; char space_[sizeof(Reader)];
}; };
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -12,19 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,19 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/detail/grpc_client.h" #include "paddle/fluid/operators/distributed/grpc_client.h"
#include <sys/time.h> #include <sys/time.h>
#include <limits> #include <limits>
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
void GRPCClient::InitImpl() { InitEventLoop(); } void GRPCClient::InitImpl() { InitEventLoop(); }
...@@ -276,6 +276,6 @@ std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) { ...@@ -276,6 +276,6 @@ std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
return ch; return ch;
} }
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -38,13 +38,13 @@ limitations under the License. */ ...@@ -38,13 +38,13 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/detail/rpc_client.h" #include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
struct VarHandle { struct VarHandle {
std::string ep; std::string ep;
...@@ -226,6 +226,6 @@ class GRPCClient : public RPCClient { ...@@ -226,6 +226,6 @@ class GRPCClient : public RPCClient {
DISABLE_COPY_AND_ASSIGN(GRPCClient); DISABLE_COPY_AND_ASSIGN(GRPCClient);
}; };
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -21,8 +21,8 @@ limitations under the License. */ ...@@ -21,8 +21,8 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/operators/detail/variable_response.h" #include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
...@@ -50,7 +50,7 @@ void RunSerdeTestSelectedRows(platform::Place place) { ...@@ -50,7 +50,7 @@ void RunSerdeTestSelectedRows(platform::Place place) {
for (int i = 0; i < 564; ++i) rows->push_back(i); for (int i = 0; i < 564; ++i) rows->push_back(i);
::grpc::ByteBuffer msg; ::grpc::ByteBuffer msg;
operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg); operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg);
EXPECT_GT(msg.Length(), static_cast<size_t>(0)); EXPECT_GT(msg.Length(), static_cast<size_t>(0));
// deserialize // deserialize
...@@ -81,10 +81,10 @@ void RunSerdeTestSelectedRows(platform::Place place) { ...@@ -81,10 +81,10 @@ void RunSerdeTestSelectedRows(platform::Place place) {
// deserialize zero-copy // deserialize zero-copy
// framework::Variable var2; // framework::Variable var2;
// operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2); // operators::distributed::DeserializeFromByteBuffer(msg, ctx, &var2);
framework::Scope scope; framework::Scope scope;
scope.Var("myvar"); scope.Var("myvar");
operators::detail::VariableResponse resp(&scope, &ctx); operators::distributed::VariableResponse resp(&scope, &ctx);
EXPECT_EQ(resp.Parse(msg), 0); EXPECT_EQ(resp.Parse(msg), 0);
framework::Variable* var2 = resp.GetVar(); framework::Variable* var2 = resp.GetVar();
...@@ -128,7 +128,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) { ...@@ -128,7 +128,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
math::set_constant(ctx, tensor, 31.9); math::set_constant(ctx, tensor, 31.9);
::grpc::ByteBuffer msg; ::grpc::ByteBuffer msg;
operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg); operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg);
EXPECT_GT(msg.Length(), static_cast<size_t>(0)); EXPECT_GT(msg.Length(), static_cast<size_t>(0));
// deserialize // deserialize
...@@ -171,7 +171,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) { ...@@ -171,7 +171,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
// deserialize zero-copy // deserialize zero-copy
framework::Scope scope; framework::Scope scope;
scope.Var("myvar"); scope.Var("myvar");
operators::detail::VariableResponse resp(&scope, &ctx); operators::distributed::VariableResponse resp(&scope, &ctx);
if (from_type == 0) { if (from_type == 0) {
EXPECT_EQ(resp.Parse(msg), 0); EXPECT_EQ(resp.Parse(msg), 0);
} else { } else {
......
...@@ -15,13 +15,13 @@ limitations under the License. */ ...@@ -15,13 +15,13 @@ limitations under the License. */
#include <limits> #include <limits>
#include <string> #include <string>
#include "paddle/fluid/operators/detail/grpc_server.h" #include "paddle/fluid/operators/distributed/grpc_server.h"
using ::grpc::ServerAsyncResponseWriter; using ::grpc::ServerAsyncResponseWriter;
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
enum CallStatus { PROCESS = 0, FINISH }; enum CallStatus { PROCESS = 0, FINISH };
// reference: // reference:
...@@ -74,7 +74,7 @@ class RequestSend final : public RequestBase { ...@@ -74,7 +74,7 @@ class RequestSend final : public RequestBase {
request_.reset(new VariableResponse(request_handler->scope(), request_.reset(new VariableResponse(request_handler->scope(),
request_handler->dev_ctx(), request_handler->dev_ctx(),
!request_handler->sync_mode())); !request_handler->sync_mode()));
int method_id = static_cast<int>(detail::GrpcMethod::kSendVariable); int method_id = static_cast<int>(distributed::GrpcMethod::kSendVariable);
service_->RequestAsyncUnary( service_->RequestAsyncUnary(
method_id, &ctx_, request_.get(), &responder_, cq_, cq_, method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id))); reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
...@@ -106,7 +106,7 @@ class RequestGet final : public RequestBase { ...@@ -106,7 +106,7 @@ class RequestGet final : public RequestBase {
::grpc::ServerCompletionQueue* cq, ::grpc::ServerCompletionQueue* cq,
RequestHandler* request_handler, int req_id) RequestHandler* request_handler, int req_id)
: RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
auto method_id = static_cast<int>(detail::GrpcMethod::kGetVariable); auto method_id = static_cast<int>(distributed::GrpcMethod::kGetVariable);
service_->RequestAsyncUnary( service_->RequestAsyncUnary(
method_id, &ctx_, &request_, &responder_, cq_, cq_, method_id, &ctx_, &request_, &responder_, cq_, cq_,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id))); reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
...@@ -150,7 +150,8 @@ class RequestPrefetch final : public RequestBase { ...@@ -150,7 +150,8 @@ class RequestPrefetch final : public RequestBase {
local_scope_(nullptr) { local_scope_(nullptr) {
request_.reset(new VariableResponse(request_handler->scope(), request_.reset(new VariableResponse(request_handler->scope(),
request_handler->dev_ctx(), true)); request_handler->dev_ctx(), true));
int method_id = static_cast<int>(detail::GrpcMethod::kPrefetchVariable); int method_id =
static_cast<int>(distributed::GrpcMethod::kPrefetchVariable);
service_->RequestAsyncUnary( service_->RequestAsyncUnary(
method_id, &ctx_, request_.get(), &responder_, cq_, cq_, method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id))); reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
...@@ -354,6 +355,6 @@ void AsyncGRPCServer::HandleRequest( ...@@ -354,6 +355,6 @@ void AsyncGRPCServer::HandleRequest(
} }
} }
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -29,17 +29,17 @@ limitations under the License. */ ...@@ -29,17 +29,17 @@ limitations under the License. */
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/detail/grpc_service.h" #include "paddle/fluid/operators/distributed/grpc_service.h"
#include "paddle/fluid/operators/detail/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/operators/detail/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" #include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
class RequestBase; class RequestBase;
...@@ -84,6 +84,6 @@ class AsyncGRPCServer final : public RPCServer { ...@@ -84,6 +84,6 @@ class AsyncGRPCServer final : public RPCServer {
std::map<std::string, std::vector<RequestBase*>> rpc_reqs_; std::map<std::string, std::vector<RequestBase*>> rpc_reqs_;
}; };
}; // namespace detail }; // namespace distributed
}; // namespace operators }; // namespace operators
}; // namespace paddle }; // namespace paddle
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
#include <grpc++/impl/codegen/stub_options.h> #include <grpc++/impl/codegen/stub_options.h>
#include <grpc++/impl/codegen/sync_stream.h> #include <grpc++/impl/codegen/sync_stream.h>
#include <grpc++/support/byte_buffer.h> #include <grpc++/support/byte_buffer.h>
#include "paddle/fluid/operators/detail/variable_response.h" #include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -42,24 +42,25 @@ class ServerContext; ...@@ -42,24 +42,25 @@ class ServerContext;
// Support parsing/unparsing of tensorflow::VariableResponse. // Support parsing/unparsing of tensorflow::VariableResponse.
// Wire-format is identical to RecvVariableResponse. // Wire-format is identical to RecvVariableResponse.
template <> template <>
class SerializationTraits<paddle::operators::detail::VariableResponse> { class SerializationTraits<paddle::operators::distributed::VariableResponse> {
public: public:
static Status Serialize( static Status Serialize(
const paddle::operators::detail::VariableResponse& msg, const paddle::operators::distributed::VariableResponse& msg,
grpc_byte_buffer** bp, bool* own_buffer) { grpc_byte_buffer** bp, bool* own_buffer) {
PADDLE_ENFORCE(false, "SerializationTraits::Serialize not implemented!"); PADDLE_ENFORCE(false, "SerializationTraits::Serialize not implemented!");
return Status(); return Status();
} }
static Status Deserialize(grpc_byte_buffer* buffer, static Status Deserialize(
paddle::operators::detail::VariableResponse* msg, grpc_byte_buffer* buffer,
int max_message_size = INT_MAX) { paddle::operators::distributed::VariableResponse* msg,
int max_message_size = INT_MAX) {
if (buffer == nullptr) { if (buffer == nullptr) {
return Status(StatusCode::INTERNAL, "No payload"); return Status(StatusCode::INTERNAL, "No payload");
} }
Status result = g_core_codegen_interface->ok(); Status result = g_core_codegen_interface->ok();
if (result.ok()) { if (result.ok()) {
paddle::operators::detail::GrpcByteSource source(buffer); paddle::operators::distributed::GrpcByteSource source(buffer);
int ret = msg->Parse(&source); int ret = msg->Parse(&source);
if (ret != 0) { if (ret != 0) {
result = Status(StatusCode::INTERNAL, "VariableResponse parse error"); result = Status(StatusCode::INTERNAL, "VariableResponse parse error");
...@@ -73,7 +74,7 @@ class SerializationTraits<paddle::operators::detail::VariableResponse> { ...@@ -73,7 +74,7 @@ class SerializationTraits<paddle::operators::detail::VariableResponse> {
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
enum class GrpcMethod { enum class GrpcMethod {
kSendVariable, kSendVariable,
...@@ -118,6 +119,6 @@ class GrpcService final { ...@@ -118,6 +119,6 @@ class GrpcService final {
}; };
}; };
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -26,7 +26,7 @@ limitations under the License. */ ...@@ -26,7 +26,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
char* EncodeVarint32(char* dst, uint32_t v) { char* EncodeVarint32(char* dst, uint32_t v) {
// Operate on characters as unsigneds // Operate on characters as unsigneds
...@@ -144,6 +144,6 @@ class ProtoEncodeHelper { ...@@ -144,6 +144,6 @@ class ProtoEncodeHelper {
char* limit_; // Just for CHECKs char* limit_; // Just for CHECKs
}; };
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -31,7 +31,7 @@ ...@@ -31,7 +31,7 @@
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
constexpr char kRequestSend[] = "RequestSend"; constexpr char kRequestSend[] = "RequestSend";
constexpr char kRequestGet[] = "RequestGet"; constexpr char kRequestGet[] = "RequestGet";
...@@ -124,6 +124,6 @@ class RequestHandler { ...@@ -124,6 +124,6 @@ class RequestHandler {
RPCServer* rpc_server_; RPCServer* rpc_server_;
}; };
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -20,12 +20,12 @@ ...@@ -20,12 +20,12 @@
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/detail/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
bool RequestSendHandler::Handle(const std::string& varname, bool RequestSendHandler::Handle(const std::string& varname,
framework::Scope* scope, framework::Scope* scope,
...@@ -119,6 +119,6 @@ bool RequestPrefetchHandler::Handle(const std::string& varname, ...@@ -119,6 +119,6 @@ bool RequestPrefetchHandler::Handle(const std::string& varname,
return true; return true;
} }
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -28,11 +28,11 @@ ...@@ -28,11 +28,11 @@
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/detail/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
class RequestSendHandler final : public RequestHandler { class RequestSendHandler final : public RequestHandler {
public: public:
...@@ -66,6 +66,6 @@ class RequestPrefetchHandler final : public RequestHandler { ...@@ -66,6 +66,6 @@ class RequestPrefetchHandler final : public RequestHandler {
const std::string& out_var_name = "") override; const std::string& out_var_name = "") override;
}; };
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -12,15 +12,15 @@ ...@@ -12,15 +12,15 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/detail/rpc_client.h" #include "paddle/fluid/operators/distributed/rpc_client.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
std::once_flag RPCClient::init_flag_; std::once_flag RPCClient::init_flag_;
std::unique_ptr<RPCClient> RPCClient::rpc_client_(nullptr); std::unique_ptr<RPCClient> RPCClient::rpc_client_(nullptr);
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
class RPCClient { class RPCClient {
public: public:
...@@ -84,6 +84,6 @@ class RPCClient { ...@@ -84,6 +84,6 @@ class RPCClient {
static std::once_flag init_flag_; static std::once_flag init_flag_;
static std::unique_ptr<RPCClient> rpc_client_; static std::unique_ptr<RPCClient> rpc_client_;
}; };
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -17,11 +17,11 @@ ...@@ -17,11 +17,11 @@
#include <limits> #include <limits>
#include <string> #include <string>
#include "paddle/fluid/operators/detail/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
void RPCServer::ShutDown() { void RPCServer::ShutDown() {
LOG(INFO) << "RPCServer ShutDown "; LOG(INFO) << "RPCServer ShutDown ";
...@@ -112,6 +112,6 @@ void RPCServer::WaitCond(const std::string& rpc_name) { ...@@ -112,6 +112,6 @@ void RPCServer::WaitCond(const std::string& rpc_name) {
lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); }); lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); });
} }
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -19,11 +19,11 @@ ...@@ -19,11 +19,11 @@
#include <thread> // NOLINT #include <thread> // NOLINT
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/operators/detail/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
class RPCServer { class RPCServer {
public: public:
...@@ -86,6 +86,6 @@ class RPCServer { ...@@ -86,6 +86,6 @@ class RPCServer {
friend class RequestHandler; friend class RequestHandler;
}; };
}; // namespace detail }; // namespace distributed
}; // namespace operators }; // namespace operators
}; // namespace paddle }; // namespace paddle
...@@ -22,18 +22,18 @@ limitations under the License. */ ...@@ -22,18 +22,18 @@ limitations under the License. */
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/detail/rpc_client.h" #include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/operators/detail/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
namespace framework = paddle::framework; namespace framework = paddle::framework;
namespace platform = paddle::platform; namespace platform = paddle::platform;
namespace detail = paddle::operators::detail; namespace distributed = paddle::operators::distributed;
USE_OP(lookup_table); USE_OP(lookup_table);
std::unique_ptr<detail::RPCServer> g_rpc_service; std::unique_ptr<distributed::RPCServer> g_rpc_service;
std::unique_ptr<detail::RequestHandler> g_req_handler; std::unique_ptr<distributed::RequestHandler> g_req_handler;
framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) { framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) {
auto root_block = program->MutableBlock(0); auto root_block = program->MutableBlock(0);
...@@ -113,19 +113,21 @@ void StartServer() { ...@@ -113,19 +113,21 @@ void StartServer() {
g_req_handler->SetScope(&scope); g_req_handler->SetScope(&scope);
g_req_handler->SetExecutor(&exe); g_req_handler->SetExecutor(&exe);
g_rpc_service->RegisterRPC(detail::kRequestPrefetch, g_req_handler.get()); g_rpc_service->RegisterRPC(distributed::kRequestPrefetch,
g_req_handler.get());
g_req_handler->SetRPCServer(g_rpc_service.get()); g_req_handler->SetRPCServer(g_rpc_service.get());
std::thread server_thread( std::thread server_thread(
std::bind(&detail::RPCServer::StartServer, g_rpc_service.get())); std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get()));
server_thread.join(); server_thread.join();
} }
TEST(PREFETCH, CPU) { TEST(PREFETCH, CPU) {
g_req_handler.reset(new detail::RequestPrefetchHandler(true)); g_req_handler.reset(new distributed::RequestPrefetchHandler(true));
g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1));
detail::RPCClient* client = detail::RPCClient::GetInstance<RPCCLIENT_T>(); distributed::RPCClient* client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();
std::thread server_thread(StartServer); std::thread server_thread(StartServer);
g_rpc_service->WaitServerReady(); g_rpc_service->WaitServerReady();
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/detail/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <nccl.h> #include <nccl.h>
...@@ -23,14 +23,14 @@ limitations under the License. */ ...@@ -23,14 +23,14 @@ limitations under the License. */
#include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h" #include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/detail/bytebuffer_stream.h" #include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
#include "paddle/fluid/operators/detail/proto_encoder_helper.h" #include "paddle/fluid/operators/distributed/proto_encoder_helper.h"
#include "paddle/fluid/operators/detail/variable_response.h" #include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
using VarMsg = sendrecv::VariableMessage; using VarMsg = sendrecv::VariableMessage;
...@@ -222,11 +222,11 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, ...@@ -222,11 +222,11 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
const framework::Scope* scope, const framework::Scope* scope,
framework::Variable** var) { framework::Variable** var) {
operators::detail::VariableResponse resp(scope, &ctx); operators::distributed::VariableResponse resp(scope, &ctx);
PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!"); PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
*var = resp.GetVar(); *var = resp.GetVar();
} }
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -25,12 +25,12 @@ limitations under the License. */ ...@@ -25,12 +25,12 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" #include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
typedef void (*DestroyCallback)(void*); typedef void (*DestroyCallback)(void*);
...@@ -61,6 +61,6 @@ inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) { ...@@ -61,6 +61,6 @@ inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) {
} }
} }
} // namespace detail } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/detail/variable_response.h" #include "paddle/fluid/operators/distributed/variable_response.h"
#include <string> #include <string>
#include <utility> #include <utility>
...@@ -22,12 +22,12 @@ ...@@ -22,12 +22,12 @@
#endif #endif
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
enum WireType { enum WireType {
WIRETYPE_VARINT = 0, WIRETYPE_VARINT = 0,
...@@ -158,13 +158,13 @@ bool VariableResponse::CopySelectRowsTensorData( ...@@ -158,13 +158,13 @@ bool VariableResponse::CopySelectRowsTensorData(
slr->set_height(meta_.slr_height()); slr->set_height(meta_.slr_height());
auto* tensor = slr->mutable_value(); auto* tensor = slr->mutable_value();
tensor->Resize(dims); tensor->Resize(dims);
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(static_cast<size_t>(tensor->numel()),
static_cast<size_t>(tensor->numel()), length / framework::SizeOfType(
length / framework::SizeOfType( paddle::operators::distributed::ToTypeIndex(
paddle::operators::detail::ToTypeIndex(meta_.data_type()))); meta_.data_type())));
void* tensor_data = tensor->mutable_data( void* tensor_data = tensor->mutable_data(
ctx.GetPlace(), ctx.GetPlace(),
paddle::operators::detail::ToTypeIndex(meta_.data_type())); paddle::operators::distributed::ToTypeIndex(meta_.data_type()));
if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) { if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) {
return false; return false;
...@@ -480,6 +480,6 @@ int VariableResponse::Parse(Source* source) { ...@@ -480,6 +480,6 @@ int VariableResponse::Parse(Source* source) {
return 0; return 0;
} }
}; // namespace detail }; // namespace distributed
}; // namespace operators }; // namespace operators
}; // namespace paddle }; // namespace paddle
...@@ -22,17 +22,17 @@ ...@@ -22,17 +22,17 @@
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" #include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h" #include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/detail/bytebuffer_stream.h" #include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace distributed {
class VariableResponse { class VariableResponse {
public: public:
...@@ -99,6 +99,6 @@ class VariableResponse { ...@@ -99,6 +99,6 @@ class VariableResponse {
sendrecv::VariableMessage meta_; sendrecv::VariableMessage meta_;
}; };
}; // namespace detail }; // namespace distributed
}; // namespace operators }; // namespace operators
}; // namespace paddle }; // namespace paddle
...@@ -42,8 +42,8 @@ class FetchBarrierOp : public framework::OperatorBase { ...@@ -42,8 +42,8 @@ class FetchBarrierOp : public framework::OperatorBase {
// For profiling // For profiling
platform::RecordEvent record_event(Type(), &ctx); platform::RecordEvent record_event(Type(), &ctx);
detail::RPCClient* rpc_client = distributed::RPCClient* rpc_client =
detail::RPCClient::GetInstance<RPCCLIENT_T>(); distributed::RPCClient::GetInstance<RPCCLIENT_T>();
rpc_client->Wait(); rpc_client->Wait();
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include "paddle/fluid/operators/mean_op.h"
namespace paddle {
namespace operators {
using framework::DataLayout;
template <typename T>
class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
float mean = context.Attr<float>("mean");
float std = context.Attr<float>("std");
auto* tensor = context.Output<framework::Tensor>("Out");
T* data = tensor->mutable_data<T>(context.GetPlace());
unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
std::minstd_rand engine;
if (seed == 0) {
seed = std::random_device()();
}
engine.seed(seed);
std::normal_distribution<T> dist(mean, std);
int64_t size = tensor->numel();
for (int64_t i = 0; i < size; ++i) {
data[i] = dist(engine);
}
// The format of output is set as the mkldnn's format
// TODO(@mozga-intel) The format of matrix sets inside the another layers.
tensor->set_layout(DataLayout::kMKLDNN);
tensor->set_format(mkldnn::memory::format::oihw);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(gaussian_random, MKLDNN, ::paddle::platform::CPUPlace,
ops::GaussianMKLDNNKernel<float>);
...@@ -15,6 +15,10 @@ limitations under the License. */ ...@@ -15,6 +15,10 @@ limitations under the License. */
#include <random> #include <random>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -62,9 +66,20 @@ class GaussianRandomOp : public framework::OperatorWithKernel { ...@@ -62,9 +66,20 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
framework::LibraryType library{framework::LibraryType::kPlain};
framework::DataLayout layout{framework::DataLayout::kAnyLayout};
#ifdef PADDLE_WITH_MKLDNN
if (library == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN;
}
#endif
return framework::OpKernelType( return framework::OpKernelType(
static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")), static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
ctx.device_context()); ctx.device_context(), layout, library);
} }
}; };
...@@ -95,7 +110,9 @@ class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -95,7 +110,9 @@ class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
"(int, default 5(FP32)) " "(int, default 5(FP32)) "
"Output data type.") "Output data type.")
.SetDefault(framework::proto::VarType::FP32); .SetDefault(framework::proto::VarType::FP32);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
GaussianRandom Operator. GaussianRandom Operator.
......
...@@ -22,7 +22,7 @@ limitations under the License. */ ...@@ -22,7 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/nccl_helper.h"
namespace paddle { namespace paddle {
...@@ -60,7 +60,8 @@ class GenNCCLIdOp : public framework::OperatorBase { ...@@ -60,7 +60,8 @@ class GenNCCLIdOp : public framework::OperatorBase {
std::vector<std::string> endpoint_list = std::vector<std::string> endpoint_list =
Attr<std::vector<std::string>>("endpoint_list"); Attr<std::vector<std::string>>("endpoint_list");
detail::RPCClient* client = detail::RPCClient::GetInstance<RPCCLIENT_T>(); distributed::RPCClient* client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();
for (auto& ep : endpoint_list) { for (auto& ep : endpoint_list) {
VLOG(3) << "sending nccl id to " << ep; VLOG(3) << "sending nccl id to " << ep;
...@@ -80,11 +81,11 @@ class GenNCCLIdOp : public framework::OperatorBase { ...@@ -80,11 +81,11 @@ class GenNCCLIdOp : public framework::OperatorBase {
// NOTE: Can not use unique_ptr here because the default // NOTE: Can not use unique_ptr here because the default
// deleter will call GRPC Server's base class's dtor and // deleter will call GRPC Server's base class's dtor and
// that will cause a wired crash. // that will cause a wired crash.
detail::RequestSendHandler rpc_h(true); distributed::RequestSendHandler rpc_h(true);
std::unique_ptr<detail::RPCServer> rpc_service( std::unique_ptr<distributed::RPCServer> rpc_service(
new RPCSERVER_T(endpoint, 1)); new RPCSERVER_T(endpoint, 1));
rpc_service->RegisterRPC(detail::kRequestSend, &rpc_h); rpc_service->RegisterRPC(distributed::kRequestSend, &rpc_h);
rpc_h.SetRPCServer(rpc_service.get()); rpc_h.SetRPCServer(rpc_service.get());
framework::ProgramDesc empty_program; framework::ProgramDesc empty_program;
...@@ -95,11 +96,11 @@ class GenNCCLIdOp : public framework::OperatorBase { ...@@ -95,11 +96,11 @@ class GenNCCLIdOp : public framework::OperatorBase {
rpc_h.SetExecutor(&executor); rpc_h.SetExecutor(&executor);
std::thread server_thread( std::thread server_thread(
std::bind(&detail::RPCServer::StartServer, rpc_service.get())); std::bind(&distributed::RPCServer::StartServer, rpc_service.get()));
rpc_service->SetCond(detail::kRequestSend); rpc_service->SetCond(distributed::kRequestSend);
VLOG(3) << "start getting nccl id from trainer 0..."; VLOG(3) << "start getting nccl id from trainer 0...";
rpc_service->WaitBarrier(detail::kRequestSend); rpc_service->WaitBarrier(distributed::kRequestSend);
VLOG(3) << "got nccl id and stop server..."; VLOG(3) << "got nccl id and stop server...";
rpc_service->ShutDown(); rpc_service->ShutDown();
VLOG(3) << "rpc server stopped"; VLOG(3) << "rpc server stopped";
......
...@@ -21,14 +21,14 @@ limitations under the License. */ ...@@ -21,14 +21,14 @@ limitations under the License. */
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/operators/listen_and_serv_op.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
void RunServer(std::shared_ptr<detail::RPCServer> service) { void RunServer(std::shared_ptr<distributed::RPCServer> service) {
service->StartServer(); service->StartServer();
VLOG(4) << "RunServer thread end"; VLOG(4) << "RunServer thread end";
} }
...@@ -121,12 +121,12 @@ void ListenAndServOp::RunSyncLoop( ...@@ -121,12 +121,12 @@ void ListenAndServOp::RunSyncLoop(
while (true) { while (true) {
// Get from multiple trainers, we don't care about the order in which // Get from multiple trainers, we don't care about the order in which
// the gradients arrives, just add suffix 0~n and merge the gradient. // the gradients arrives, just add suffix 0~n and merge the gradient.
rpc_service_->SetCond(detail::kRequestSend); rpc_service_->SetCond(distributed::kRequestSend);
rpc_service_->WaitBarrier(detail::kRequestSend); rpc_service_->WaitBarrier(distributed::kRequestSend);
if (rpc_service_->IsExit()) { if (rpc_service_->IsExit()) {
LOG(WARNING) << "get exit!rpc_processor break!"; LOG(WARNING) << "get exit!rpc_processor break!";
rpc_service_->SetCond(detail::kRequestGet); rpc_service_->SetCond(distributed::kRequestGet);
break; break;
} }
...@@ -154,11 +154,11 @@ void ListenAndServOp::RunSyncLoop( ...@@ -154,11 +154,11 @@ void ListenAndServOp::RunSyncLoop(
recv_scope); recv_scope);
VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)";
rpc_service_->SetCond(detail::kRequestGet); rpc_service_->SetCond(distributed::kRequestGet);
rpc_service_->WaitBarrier(detail::kRequestGet); rpc_service_->WaitBarrier(distributed::kRequestGet);
rpc_service_->ResetBarrierCounter(); rpc_service_->ResetBarrierCounter();
// reset received sparse vars to avoid reuse it in the next mini-batch // reset received sparse vars to avoid reuse it in the next mini-batch
dynamic_cast<detail::RequestSendHandler *>(request_send_handler_.get()) dynamic_cast<distributed::RequestSendHandler *>(request_send_handler_.get())
->ResetSparseVarRecorder(); ->ResetSparseVarRecorder();
} // while(true) } // while(true)
} }
...@@ -215,13 +215,13 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, ...@@ -215,13 +215,13 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
} }
static void FillRequestCtx( static void FillRequestCtx(
detail::RequestHandler *h, framework::Scope *scope, distributed::RequestHandler *h, framework::Scope *scope,
platform::DeviceContext *dev_ctx, framework::Executor *executor, platform::DeviceContext *dev_ctx, framework::Executor *executor,
framework::ProgramDesc *program, framework::ProgramDesc *program,
std::unordered_map<std::string, std::unordered_map<std::string,
std::shared_ptr<framework::ExecutorPrepareContext>> std::shared_ptr<framework::ExecutorPrepareContext>>
*prefetch_ctx, *prefetch_ctx,
detail::RPCServer *rpc_server) { distributed::RPCServer *rpc_server) {
h->SetScope(scope); h->SetScope(scope);
h->SetDevCtx(dev_ctx); h->SetDevCtx(dev_ctx);
h->SetExecutor(executor); h->SetExecutor(executor);
...@@ -249,14 +249,16 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -249,14 +249,16 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in)); rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in));
request_send_handler_.reset(new detail::RequestSendHandler(sync_mode)); request_send_handler_.reset(new distributed::RequestSendHandler(sync_mode));
request_get_handler_.reset(new detail::RequestGetHandler(sync_mode)); request_get_handler_.reset(new distributed::RequestGetHandler(sync_mode));
request_prefetch_handler_.reset( request_prefetch_handler_.reset(
new detail::RequestPrefetchHandler(sync_mode)); new distributed::RequestPrefetchHandler(sync_mode));
rpc_service_->RegisterRPC(detail::kRequestSend, request_send_handler_.get()); rpc_service_->RegisterRPC(distributed::kRequestSend,
rpc_service_->RegisterRPC(detail::kRequestGet, request_get_handler_.get()); request_send_handler_.get());
rpc_service_->RegisterRPC(detail::kRequestPrefetch, rpc_service_->RegisterRPC(distributed::kRequestGet,
request_get_handler_.get());
rpc_service_->RegisterRPC(distributed::kRequestPrefetch,
request_prefetch_handler_.get()); request_prefetch_handler_.get());
auto *optimize_block = Attr<framework::BlockDesc *>(kOptimizeBlock); auto *optimize_block = Attr<framework::BlockDesc *>(kOptimizeBlock);
......
...@@ -24,8 +24,8 @@ limitations under the License. */ ...@@ -24,8 +24,8 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/operators/detail/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -33,7 +33,7 @@ namespace operators { ...@@ -33,7 +33,7 @@ namespace operators {
constexpr char kOptimizeBlock[] = "OptimizeBlock"; constexpr char kOptimizeBlock[] = "OptimizeBlock";
constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id"; constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id";
void RunServer(std::shared_ptr<detail::RPCServer> service); void RunServer(std::shared_ptr<distributed::RPCServer> service);
class ListenAndServOp : public framework::OperatorBase { class ListenAndServOp : public framework::OperatorBase {
public: public:
...@@ -62,10 +62,11 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -62,10 +62,11 @@ class ListenAndServOp : public framework::OperatorBase {
const platform::Place& dev_place) const override; const platform::Place& dev_place) const override;
protected: protected:
mutable std::shared_ptr<detail::RPCServer> rpc_service_; mutable std::shared_ptr<distributed::RPCServer> rpc_service_;
mutable std::shared_ptr<detail::RequestHandler> request_send_handler_; mutable std::shared_ptr<distributed::RequestHandler> request_send_handler_;
mutable std::shared_ptr<detail::RequestHandler> request_get_handler_; mutable std::shared_ptr<distributed::RequestHandler> request_get_handler_;
mutable std::shared_ptr<detail::RequestHandler> request_prefetch_handler_; mutable std::shared_ptr<distributed::RequestHandler>
request_prefetch_handler_;
mutable std::shared_ptr<std::thread> server_thread_; mutable std::shared_ptr<std::thread> server_thread_;
}; };
......
...@@ -146,6 +146,6 @@ REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$"); ...@@ -146,6 +146,6 @@ REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$");
REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU, REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU,
paddle::operators::LogicalNotFunctor); paddle::operators::LogicalNotFunctor);
REGISTER_BINARY_LOGICAL_OP(logical_xor, REGISTER_BINARY_LOGICAL_OP(logical_xor,
"$$Out = (X || Y) \\, \\&\\& \\, !(X \\&\\& Y)$$"); "$$Out = (X || Y) \\&\\& !(X \\&\\& Y)$$");
REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU, REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU,
paddle::operators::LogicalXorFunctor); paddle::operators::LogicalXorFunctor);
...@@ -93,10 +93,10 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> { ...@@ -93,10 +93,10 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> {
auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace()); auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
// computation // computation
for (size_t k = 0; k < input_rows; ++k) { for (int k = 0; k < input_rows; ++k) {
const T* src_ptr = input.data<T>() + k * input_cols; const T* src_ptr = input.data<T>() + k * input_cols;
int col_idx = 0; int col_idx = 0;
for (int j = 0; j < num; ++j) { for (size_t j = 0; j < num; ++j) {
int col_len = output_cols[j]; int col_len = output_cols[j];
auto* out_tensor = outputs->at(j); auto* out_tensor = outputs->at(j);
if (out_tensor != nullptr) { if (out_tensor != nullptr) {
......
...@@ -22,43 +22,24 @@ namespace paddle { ...@@ -22,43 +22,24 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
template <typename T>
__device__ T upper_bound(const T* first, T count, T val) {
const T* orig = first;
const T* it = nullptr;
T step = 0;
while (count > 0) {
it = first;
step = count / 2;
it += step;
if (!(val < *it)) {
first = ++it;
count -= step + 1;
} else {
count = step;
}
}
return first - orig;
}
template <typename T> template <typename T>
__global__ void KernelConcat(T** inputs, const int* input_cols, int col_size, __global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
const int output_rows, const int output_cols, const int output_rows, const int output_cols,
T* output) { T* output) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
int segment = upper_bound<int>(input_cols, col_size, tid_x) - 1; int curr_segment = 0;
int curr_offset = input_cols[0];
int curr_offset = input_cols[segment];
int curr_segment = segment;
for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
T curr_col_offset; int curr_col_offset = input_cols[curr_segment + 1];
while ((curr_col_offset = input_cols[curr_segment + 1]) <= tid_x) { while (curr_col_offset <= tid_x) {
curr_offset = curr_col_offset; curr_offset = curr_col_offset;
++curr_segment; ++curr_segment;
curr_col_offset = input_cols[curr_segment + 1];
} }
int local_col = tid_x - curr_offset; int local_col = tid_x - curr_offset;
int segment_width = curr_col_offset - curr_offset; int segment_width = curr_col_offset - curr_offset;
T* input_ptr = inputs[curr_segment]; T* input_ptr = inputs[curr_segment];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y; int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y)
...@@ -89,14 +70,14 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row, ...@@ -89,14 +70,14 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row,
const int in_col, const int* out_cols, const int in_col, const int* out_cols,
int out_cols_size, T** outputs_data) { int out_cols_size, T** outputs_data) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
int segment = upper_bound<int>(out_cols, out_cols_size, tid_x) - 1; int curr_segment = 0;
int curr_offset = out_cols[segment]; int curr_offset = out_cols[0];
int curr_segment = segment;
for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
T curr_col_offset; int curr_col_offset = out_cols[curr_segment + 1];
while ((curr_col_offset = out_cols[curr_segment + 1]) <= tid_x) { while (curr_col_offset <= tid_x) {
curr_offset = curr_col_offset; curr_offset = curr_col_offset;
++curr_segment; ++curr_segment;
curr_col_offset = out_cols[curr_segment + 1];
} }
int local_col = tid_x - curr_offset; int local_col = tid_x - curr_offset;
...@@ -228,7 +209,7 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> { ...@@ -228,7 +209,7 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
outputs_cols[0] = 0; outputs_cols[0] = 0;
for (int i = 0; i < o_num; ++i) { for (int i = 0; i < o_num; ++i) {
int t_col = outputs->at(i)->numel() / out_row; int t_col = ref_inputs.at(i)->numel() / out_row;
if (sameShape) { if (sameShape) {
if (t_col != out0_col) sameShape = false; if (t_col != out0_col) sameShape = false;
} }
......
...@@ -30,6 +30,7 @@ template struct SetConstant<platform::CPUDeviceContext, double>; ...@@ -30,6 +30,7 @@ template struct SetConstant<platform::CPUDeviceContext, double>;
template struct SetConstant<platform::CPUDeviceContext, int>; template struct SetConstant<platform::CPUDeviceContext, int>;
template struct SetConstant<platform::CPUDeviceContext, int64_t>; template struct SetConstant<platform::CPUDeviceContext, int64_t>;
template struct SetConstant<platform::CPUDeviceContext, bool>; template struct SetConstant<platform::CPUDeviceContext, bool>;
template struct SetConstant<platform::CPUDeviceContext, uint8_t>;
#define DEFINE_CPU_TRANS(RANK) \ #define DEFINE_CPU_TRANS(RANK) \
template struct Transpose<platform::CPUDeviceContext, platform::float16, \ template struct Transpose<platform::CPUDeviceContext, platform::float16, \
......
...@@ -41,8 +41,8 @@ class PrefetchOp : public framework::OperatorBase { ...@@ -41,8 +41,8 @@ class PrefetchOp : public framework::OperatorBase {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place); auto& ctx = *pool.Get(place);
detail::RPCClient* rpc_client = distributed::RPCClient* rpc_client =
detail::RPCClient::GetInstance<RPCCLIENT_T>(); distributed::RPCClient::GetInstance<RPCCLIENT_T>();
for (size_t i = 0; i < ins.size(); i++) { for (size_t i = 0; i < ins.size(); i++) {
if (NeedSend(scope, ins[i])) { if (NeedSend(scope, ins[i])) {
......
...@@ -43,8 +43,8 @@ class RecvOp : public framework::OperatorBase { ...@@ -43,8 +43,8 @@ class RecvOp : public framework::OperatorBase {
// For profiling // For profiling
platform::RecordEvent record_event(Type(), &ctx); platform::RecordEvent record_event(Type(), &ctx);
detail::RPCClient* rpc_client = distributed::RPCClient* rpc_client =
detail::RPCClient::GetInstance<RPCCLIENT_T>(); distributed::RPCClient::GetInstance<RPCCLIENT_T>();
for (size_t i = 0; i < outs.size(); i++) { for (size_t i = 0; i < outs.size(); i++) {
VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
......
...@@ -44,8 +44,8 @@ class SendBarrierOp : public framework::OperatorBase { ...@@ -44,8 +44,8 @@ class SendBarrierOp : public framework::OperatorBase {
// For profiling // For profiling
platform::RecordEvent record_event(Type(), &ctx); platform::RecordEvent record_event(Type(), &ctx);
detail::RPCClient* rpc_client = distributed::RPCClient* rpc_client =
detail::RPCClient::GetInstance<RPCCLIENT_T>(); distributed::RPCClient::GetInstance<RPCCLIENT_T>();
VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode; VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode;
......
...@@ -45,8 +45,8 @@ class SendOp : public framework::OperatorBase { ...@@ -45,8 +45,8 @@ class SendOp : public framework::OperatorBase {
// For profiling // For profiling
platform::RecordEvent record_event(Type(), &ctx); platform::RecordEvent record_event(Type(), &ctx);
detail::RPCClient* rpc_client = distributed::RPCClient* rpc_client =
detail::RPCClient::GetInstance<RPCCLIENT_T>(); distributed::RPCClient::GetInstance<RPCCLIENT_T>();
for (size_t i = 0; i < ins.size(); i++) { for (size_t i = 0; i < ins.size(); i++) {
if (NeedSend(scope, ins[i])) { if (NeedSend(scope, ins[i])) {
......
...@@ -14,11 +14,14 @@ ...@@ -14,11 +14,14 @@
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/operators/tensorrt_engine_op.h" #include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/operators/tensorrt_engine_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -16,10 +16,12 @@ ...@@ -16,10 +16,12 @@
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <string>
#include <vector>
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -179,7 +179,6 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { ...@@ -179,7 +179,6 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
const std::string& z_name, bool x_created, const std::string& z_name, bool x_created,
const shape_t& x_shape, const shape_t& y_shape, const shape_t& x_shape, const shape_t& y_shape,
const shape_t& z_shape) { const shape_t& z_shape) {
LOG(INFO) << "create fc op"; LOG(INFO) << "create fc op";
auto* fc = block_desc.AppendOp(); auto* fc = block_desc.AppendOp();
fc->SetType("mul"); fc->SetType("mul");
......
...@@ -21,7 +21,7 @@ limitations under the License. */ ...@@ -21,7 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/operators/listen_and_serv_op.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
...@@ -37,11 +37,11 @@ USE_NO_KERNEL_OP(listen_and_serv); ...@@ -37,11 +37,11 @@ USE_NO_KERNEL_OP(listen_and_serv);
namespace f = paddle::framework; namespace f = paddle::framework;
namespace p = paddle::platform; namespace p = paddle::platform;
namespace m = paddle::operators::math; namespace m = paddle::operators::math;
namespace detail = paddle::operators::detail; namespace distributed = paddle::operators::distributed;
namespace string = paddle::string; namespace string = paddle::string;
std::unique_ptr<detail::RPCServer> g_rpc_service; std::unique_ptr<distributed::RPCServer> g_rpc_service;
std::unique_ptr<detail::RequestHandler> g_req_handler; std::unique_ptr<distributed::RequestHandler> g_req_handler;
void StartServer() { void StartServer() {
f::Scope scope; f::Scope scope;
...@@ -57,14 +57,14 @@ void StartServer() { ...@@ -57,14 +57,14 @@ void StartServer() {
g_req_handler->SetProgram(&empty_program); g_req_handler->SetProgram(&empty_program);
g_req_handler->SetExecutor(&executor); g_req_handler->SetExecutor(&executor);
g_rpc_service->RegisterRPC(detail::kRequestSend, g_req_handler.get()); g_rpc_service->RegisterRPC(distributed::kRequestSend, g_req_handler.get());
g_req_handler->SetRPCServer(g_rpc_service.get()); g_req_handler->SetRPCServer(g_rpc_service.get());
std::thread server_thread( std::thread server_thread(
std::bind(&detail::RPCServer::StartServer, g_rpc_service.get())); std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get()));
g_rpc_service->SetCond(detail::kRequestSend); g_rpc_service->SetCond(distributed::kRequestSend);
g_rpc_service->WaitBarrier(detail::kRequestSend); g_rpc_service->WaitBarrier(distributed::kRequestSend);
LOG(INFO) << "got nccl id and stop server..."; LOG(INFO) << "got nccl id and stop server...";
g_rpc_service->ShutDown(); g_rpc_service->ShutDown();
...@@ -72,7 +72,7 @@ void StartServer() { ...@@ -72,7 +72,7 @@ void StartServer() {
} }
TEST(SendNcclId, RPCServer) { TEST(SendNcclId, RPCServer) {
g_req_handler.reset(new detail::RequestSendHandler(true)); g_req_handler.reset(new distributed::RequestSendHandler(true));
g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1));
std::thread server_thread(StartServer); std::thread server_thread(StartServer);
...@@ -91,7 +91,8 @@ TEST(SendNcclId, RPCServer) { ...@@ -91,7 +91,8 @@ TEST(SendNcclId, RPCServer) {
std::string ep = string::Sprintf("127.0.0.1:%d", port); std::string ep = string::Sprintf("127.0.0.1:%d", port);
detail::RPCClient* client = detail::RPCClient::GetInstance<RPCCLIENT_T>(); distributed::RPCClient* client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();
LOG(INFO) << "connect to server" << ep; LOG(INFO) << "connect to server" << ep;
client->AsyncSendVar(ep, dev_ctx, scope, NCCL_ID_VARNAME); client->AsyncSendVar(ep, dev_ctx, scope, NCCL_ID_VARNAME);
......
...@@ -159,6 +159,11 @@ PYBIND11_PLUGIN(core) { ...@@ -159,6 +159,11 @@ PYBIND11_PLUGIN(core) {
new (&instance) LoDTensor(new_offset_lod); new (&instance) LoDTensor(new_offset_lod);
}) })
.def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); }) .def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); })
// We implement offset based LOD in C++ while we use length based with
// Python API. So we changed set_lod to set_recursive_sequence_lengths to
// avoid misuse.
// The discussion is here:
// https://github.com/PaddlePaddle/Paddle/issues/10855
.def("set_lod", .def("set_lod",
[](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) { [](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) {
// the input lod is offset-based level-of-detail info // the input lod is offset-based level-of-detail info
...@@ -199,6 +204,7 @@ PYBIND11_PLUGIN(core) { ...@@ -199,6 +204,7 @@ PYBIND11_PLUGIN(core) {
std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
return new_lod; return new_lod;
}) })
// Set above comments of set_lod.
.def("recursive_sequence_lengths", .def("recursive_sequence_lengths",
[](LoDTensor &self) -> std::vector<std::vector<size_t>> { [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
// output the length-based lod info // output the length-based lod info
......
...@@ -97,7 +97,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> { ...@@ -97,7 +97,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
inline pybind11::buffer_info CastToPyBuffer(const framework::Tensor &tensor) { inline pybind11::buffer_info CastToPyBuffer(const framework::Tensor &tensor) {
auto buffer_info = auto buffer_info =
details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool, details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool,
platform::float16>()(tensor); uint8_t, platform::float16>()(tensor);
return buffer_info; return buffer_info;
} }
......
...@@ -44,7 +44,7 @@ import metrics ...@@ -44,7 +44,7 @@ import metrics
import transpiler import transpiler
from param_attr import ParamAttr, WeightNormParamAttr from param_attr import ParamAttr, WeightNormParamAttr
from data_feeder import DataFeeder from data_feeder import DataFeeder
from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope
from transpiler import DistributeTranspiler, InferenceTranspiler, \ from transpiler import DistributeTranspiler, InferenceTranspiler, \
memory_optimize, release_memory memory_optimize, release_memory
from concurrency import (Go, make_channel, channel_send, channel_recv, from concurrency import (Go, make_channel, channel_send, channel_recv,
...@@ -83,6 +83,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \ ...@@ -83,6 +83,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \
'profiler', 'profiler',
'unique_name', 'unique_name',
'recordio_writer', 'recordio_writer',
'Scope',
] ]
......
...@@ -36,6 +36,25 @@ def _is_number_or_matrix_(var): ...@@ -36,6 +36,25 @@ def _is_number_or_matrix_(var):
class WeightedAverage(object): class WeightedAverage(object):
"""
Calculate weighted average.
The average calculating is accomplished via Python totally.
They do not change Paddle's Program, nor do anything to
modify NN model's configuration. They are completely
wrappers of Python functions.
Examples:
.. code-block:: python
avg = fluid.average.WeightedAverage()
avg.add(value=2.0, weight=1)
avg.add(value=4.0, weight=2)
avg.eval()
# The result is 3.333333333.
# For (2.0 * 1 + 4.0 * 2) / (1 + 2) = 3.333333333
"""
def __init__(self): def __init__(self):
warnings.warn( warnings.warn(
"The %s is deprecated, please use fluid.metrics.Accuracy instead." % "The %s is deprecated, please use fluid.metrics.Accuracy instead." %
......
...@@ -147,7 +147,7 @@ def _addup_repetitive_outputs_(op_descs): ...@@ -147,7 +147,7 @@ def _addup_repetitive_outputs_(op_descs):
else: else:
if len(renamed_vars[var_name]) == 1: if len(renamed_vars[var_name]) == 1:
new_name = var_name + "@RENAME@" + \ new_name = var_name + "@RENAME@" + \
str(var_rename_count[var_name]) str(var_rename_count[var_name])
var_rename_count[var_name] += 1 var_rename_count[var_name] += 1
# rename original var_name # rename original var_name
renamed_vars[var_name][0] = new_name renamed_vars[var_name][0] = new_name
...@@ -155,7 +155,7 @@ def _addup_repetitive_outputs_(op_descs): ...@@ -155,7 +155,7 @@ def _addup_repetitive_outputs_(op_descs):
_rename_arg_(pending_sum_ops, var_name, new_name) _rename_arg_(pending_sum_ops, var_name, new_name)
new_name = var_name + "@RENAME@" + \ new_name = var_name + "@RENAME@" + \
str(var_rename_count[var_name]) str(var_rename_count[var_name])
var_rename_count[var_name] += 1 var_rename_count[var_name] += 1
op_desc.rename_output(var_name, new_name) op_desc.rename_output(var_name, new_name)
renamed_vars[var_name].append(new_name) renamed_vars[var_name].append(new_name)
...@@ -434,18 +434,65 @@ def _get_stop_gradients_(program): ...@@ -434,18 +434,65 @@ def _get_stop_gradients_(program):
def append_backward(loss, parameter_list=None, no_grad_set=None, def append_backward(loss, parameter_list=None, no_grad_set=None,
callbacks=None): callbacks=None):
""" """
Append backward part to main_program Append backward part to main_program.
Args: A complete neural network training is made up of forward and backward
loss(Variable): The variable generated by cost function. propagation. However, when we configure a network, we only need to
parameter_list(list[string]): Parameters that need to be updated by specify its forwrd part. The backward part is generated automatically
optimizer. If None, it means all parameters need to be updated. according to the forward part by this function.
no_grad_set(set): Variables that have no gradients in Block 0.
All variables with `step_gradient=True` from all blocks will be
automatically added.
Return: In most cases, users do not need to invoke this function manually. It
(list[(Variable,Variable)]): list of (parameter, gradient) pair. will be automatically invoked by the optimizer's `minimize` function.
Args:
loss(Variable): The loss variable of the network.
parameter_list(list[string]|None): Names of parameters that need
to be updated by optimizers.
If it is None, all parameters
will be updated.
Default: None
no_grad_set(set|None): Variables in the Block 0 whose gradients
should be ignored. All variables with
`step_gradient=True` from all blocks will
be automatically added into this set.
Default: None
callbacks(list[callable object]|None): The callbacks are used for
doing some custom jobs during
backward part building. All
callable objects in it will
be invoked once each time a
new gradient operator is added
into the program. The callable
object must has two input
parameters: 'block' and 'context'.
The 'block' is the block which
the new gradient operator will
be added to. The 'context' is a
map, whose keys are gradient
variable names and values are
corresponding original variables.
In addition to this, the 'context'
has another special key-value pair:
the key is string '__current_op_desc__'
and the value is the op_desc of the
gradient operator who has just
triggered the callable object.
Returns:
list[(Variable,Variable)]: Pairs of parameter and its
corresponding gradients. The key is the parameter and the
value is gradient variable.
Raises:
AssertionError: If `loss` is not an instance of Variable.
Examples:
.. code-block:: python
# network configuration code
# ...
avg_loss = fluid.layers.mean(loss)
param_grad_list = fluid.backward.append_backward(loss=avg_loss)
""" """
assert isinstance(loss, framework.Variable) assert isinstance(loss, framework.Variable)
......
...@@ -24,8 +24,6 @@ __all__ = [ ...@@ -24,8 +24,6 @@ __all__ = [
'GradientClipByValue', 'GradientClipByValue',
'GradientClipByNorm', 'GradientClipByNorm',
'GradientClipByGlobalNorm', 'GradientClipByGlobalNorm',
'append_gradient_clip_ops',
'error_clip_callback',
] ]
...@@ -38,6 +36,25 @@ class BaseErrorClipAttr(object): ...@@ -38,6 +36,25 @@ class BaseErrorClipAttr(object):
class ErrorClipByValue(BaseErrorClipAttr): class ErrorClipByValue(BaseErrorClipAttr):
"""
Clips tensor values to the range [min, max].
Given a tensor t, this operation clips its value to min and max inplace.
- Any values less than min are set to min.
- Any values greater than max are set to max.
Args:
max (float): The maximum value to clip by.
min (float, optional): The minimum value to clip by. if not set by user, \
will be set to -max by framework.
Examples:
.. code-block:: python
var = fluid.framework.Variable(..., error_clip=ErrorClipByValue(max=5.0), ...)
"""
def __init__(self, max, min=None): def __init__(self, max, min=None):
max = float(max) max = float(max)
if min is None: if min is None:
...@@ -99,6 +116,31 @@ class NullGradientClipAttr(BaseGradientClipAttr): ...@@ -99,6 +116,31 @@ class NullGradientClipAttr(BaseGradientClipAttr):
class GradientClipByValue(BaseGradientClipAttr): class GradientClipByValue(BaseGradientClipAttr):
"""
Clips gradient values to the range [min, max].
Given a tensor t, this operation clips its value to min and max inplace.
- Any values less than min are set to min.
- Any values greater than max are set to max.
Args:
max (float): The maximum value to clip by.
min (float, optional): The minimum value to clip by. if not set by user, \
will be set to -max by framework.
Examples:
.. code-block:: python
w_param_attrs = ParamAttr(name=None,
initializer=UniformInitializer(low=-1.0, high=1.0, seed=0),
learning_rate=1.0,
regularizer=L1Decay(1.0),
trainable=True,
clip=GradientClipByValue(-1.0, 1.0))
y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs)
"""
def __init__(self, max, min=None): def __init__(self, max, min=None):
max = float(max) max = float(max)
if min is None: if min is None:
...@@ -120,6 +162,37 @@ class GradientClipByValue(BaseGradientClipAttr): ...@@ -120,6 +162,37 @@ class GradientClipByValue(BaseGradientClipAttr):
class GradientClipByNorm(BaseGradientClipAttr): class GradientClipByNorm(BaseGradientClipAttr):
"""
Clips tensor values to a maximum L2-norm.
This operator limits the L2 norm of the input :math:`X` within :math:`max\_norm`.
If the L2 norm of :math:`X` is less than or equal to :math:`max\_norm`, :math:`Out`
will be the same as :math:`X`. If the L2 norm of :math:`X` is greater than
:math:`max\_norm`, :math:`X` will be linearly scaled to make the L2 norm of
:math:`Out` equal to :math:`max\_norm`, as shown in the following formula:
.. math::
Out = \\frac{max\_norm * X}{norm(X)},
where :math:`norm(X)` represents the L2 norm of :math:`X`.
Args:
clip_norm (float): The maximum norm value
Examples:
.. code-block:: python
w_param_attrs = ParamAttr(name=None,
initializer=UniformInitializer(low=-1.0, high=1.0, seed=0),
learning_rate=1.0,
regularizer=L1Decay(1.0),
trainable=True,
clip=GradientClipByNorm(clip_norm=2.0))
y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs)
"""
def __init__(self, clip_norm): def __init__(self, clip_norm):
self.clip_norm = clip_norm self.clip_norm = clip_norm
...@@ -135,6 +208,44 @@ class GradientClipByNorm(BaseGradientClipAttr): ...@@ -135,6 +208,44 @@ class GradientClipByNorm(BaseGradientClipAttr):
class GradientClipByGlobalNorm(BaseGradientClipAttr): class GradientClipByGlobalNorm(BaseGradientClipAttr):
"""
Clips values of multiple tensors by the ratio of the sum of their norms.
Given a list of tensors t_list, and a clipping ratio clip_norm, this
operation returns a list of clipped tensors list_clipped and the global
norm (global_norm) of all tensors in t_list.
To perform the clipping, the values :math:`t\_list[i]` are set to:
.. math::
t\_list[i] = t\_list[i] * \\frac{clip\_norm}{\max(global\_norm, clip\_norm)}
where:
.. math::
global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}
If :math:`clip\_norm > global\_norm` then the entries in t_list remain as they are,
otherwise they're all shrunk by the global ratio.
Args:
clip_norm (float): The maximum norm value
group_name (str, optional): The group name for this clip.
Examples:
.. code-block:: python
p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip)
with fluid.program_guard(main_program=prog_clip):
fluid.clip.set_gradient_clip(
fluid.clip.GradientClipByGlobalNorm(clip_norm=2.0))
p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip)
"""
def __init__(self, clip_norm, group_name="default_group"): def __init__(self, clip_norm, group_name="default_group"):
if not isinstance(group_name, basestring): if not isinstance(group_name, basestring):
raise TypeError("'group_name' must be a basestring.") raise TypeError("'group_name' must be a basestring.")
...@@ -183,15 +294,16 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): ...@@ -183,15 +294,16 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
def set_gradient_clip(clip, param_list=None, program=None): def set_gradient_clip(clip, param_list=None, program=None):
""" """
To specify parameters that require gradient clip. To specify parameters that require gradient clip.
Args:
clip(BaseGradientClipAttr): An instance of some derived class of BaseGradientClipAttr, Args:
which describes the type and detailed attributes of required gradient clip. clip(BaseGradientClipAttr): An instance of some derived class of BaseGradientClipAttr,
param_list(list, None by default): Parameters that require gradient clip. which describes the type and detailed attributes of required gradient clip.
It can be a list of parameter or a list of parameter's name. param_list(list(Variable)): Parameters that require gradient clip.
When it's None, all parameters in the program will be included. It can be a list of parameter or a list of parameter's name.
program(Program, None by default): The program where parameters are. When it's None, all parameters in the program will be included.
Will be the default main program when assigned with None. program(Program): The program where parameters are.
Will be the default main program when assigned with None.
""" """
if not isinstance(clip, BaseGradientClipAttr): if not isinstance(clip, BaseGradientClipAttr):
raise TypeError( raise TypeError(
......
...@@ -29,6 +29,13 @@ class DataToLoDTensorConverter(object): ...@@ -29,6 +29,13 @@ class DataToLoDTensorConverter(object):
self.place = place self.place = place
self.lod_level = lod_level self.lod_level = lod_level
self.shape = shape self.shape = shape
negtive_count = 0
for s in self.shape:
if s < 0:
negtive_count += 1
if negtive_count > 1:
self.shape = None
break
if dtype == core.VarDesc.VarType.FP32: if dtype == core.VarDesc.VarType.FP32:
self.dtype = 'float32' self.dtype = 'float32'
elif dtype == core.VarDesc.VarType.INT64: elif dtype == core.VarDesc.VarType.INT64:
...@@ -61,7 +68,9 @@ class DataToLoDTensorConverter(object): ...@@ -61,7 +68,9 @@ class DataToLoDTensorConverter(object):
self._feed_impl_(each_data, lod[1:], lod_level - 1) self._feed_impl_(each_data, lod[1:], lod_level - 1)
def done(self): def done(self):
arr = numpy.array(self.data, dtype=self.dtype).reshape(self.shape) arr = numpy.array(self.data, dtype=self.dtype)
if self.shape:
arr = arr.reshape(self.shape)
t = core.LoDTensor() t = core.LoDTensor()
t.set(arr, self.place) t.set(arr, self.place)
if self.lod_level > 0: if self.lod_level > 0:
...@@ -70,6 +79,61 @@ class DataToLoDTensorConverter(object): ...@@ -70,6 +79,61 @@ class DataToLoDTensorConverter(object):
class DataFeeder(object): class DataFeeder(object):
"""
DataFeeder converts the data that returned by a reader into a data
structure that can feed into Executor and ParallelExecutor. The reader
usually returns a list of mini-batch data entries. Each data entry in
the list is one sample. Each sample is a list or a tuple with one
feature or multiple features.
The simple usage shows below:
.. code-block:: python
place = fluid.CPUPlace()
img = fluid.layers.data(name='image', shape=[1, 28, 28])
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
feeder = fluid.DataFeeder([img, label], fluid.CPUPlace())
result = feeder.feed([([0] * 784, [9]), ([1] * 784, [1])])
If you want to feed data into GPU side separately in advance when you
use multi-GPU to train a model, you can use `decorate_reader` function.
.. code-block:: python
place=fluid.CUDAPlace(0)
feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
reader = feeder.decorate_reader(
paddle.batch(flowers.train(), batch_size=16))
Args:
feed_list(list): The Variables or Variables'name that will
feed into model.
place(Place): place indicates feed data into CPU or GPU, if you want to
feed data into GPU, please using `fluid.CUDAPlace(i)` (`i` represents
the GPU id), or if you want to feed data into CPU, please using
`fluid.CPUPlace()`.
program(Program): The Program that will feed data into, if program
is None, it will use default_main_program(). Default None.
Raises:
ValueError: If some Variable is not in this Program.
Examples:
.. code-block:: python
# ...
place = fluid.CPUPlace()
feed_list = [
main_program.global_block().var(var_name) for var_name in feed_vars_name
] # feed_vars_name is a list of variables' name.
feeder = fluid.DataFeeder(feed_list, place)
for data in reader():
outs = exe.run(program=main_program,
feed=feeder.feed(data))
"""
def __init__(self, feed_list, place, program=None): def __init__(self, feed_list, place, program=None):
self.feed_dtypes = [] self.feed_dtypes = []
self.feed_names = [] self.feed_names = []
...@@ -99,6 +163,16 @@ class DataFeeder(object): ...@@ -99,6 +163,16 @@ class DataFeeder(object):
self.place = place self.place = place
def feed(self, iterable): def feed(self, iterable):
"""
According to feed_list and iterable, converters the input into
a data structure that can feed into Executor and ParallelExecutor.
Args:
iterable(list|tuple): the input data.
Returns:
dict: the result of conversion.
"""
converter = [] converter = []
for lod_level, shape, dtype in six.zip( for lod_level, shape, dtype in six.zip(
self.feed_lod_level, self.feed_shapes, self.feed_dtypes): self.feed_lod_level, self.feed_shapes, self.feed_dtypes):
...@@ -121,6 +195,20 @@ class DataFeeder(object): ...@@ -121,6 +195,20 @@ class DataFeeder(object):
return ret_dict return ret_dict
def feed_parallel(self, iterable, num_places=None): def feed_parallel(self, iterable, num_places=None):
"""
Takes multiple mini-batches. Each mini-batch will be feed on each
device in advance.
Args:
iterable(list|tuple): the input data.
num_places(int): the number of devices. Default None.
Returns:
dict: the result of conversion.
Notes:
The number of devices and number of mini-batches must be same.
"""
if isinstance(self.place, core.CUDAPlace): if isinstance(self.place, core.CUDAPlace):
places = [ places = [
core.CUDAPlace(i) core.CUDAPlace(i)
...@@ -159,6 +247,24 @@ class DataFeeder(object): ...@@ -159,6 +247,24 @@ class DataFeeder(object):
multi_devices, multi_devices,
num_places=None, num_places=None,
drop_last=True): drop_last=True):
"""
Converter the input data into a data that returned by reader into
multiple mini-batches. Each mini-batch will be feed on each device.
Args:
reader(fun): the input data.
multi_devices(bool): the number of places. Default None.
num_places(int): the number of places. Default None.
drop_last(bool): the number of places. Default None.
Returns:
dict: the result of conversion.
Raises:
ValueError: If drop_last is False and the data batch which cannot
fit for devices.
"""
def __reader_creator__(): def __reader_creator__():
if not multi_devices: if not multi_devices:
for item in reader(): for item in reader():
......
...@@ -41,7 +41,12 @@ def _clone_var_(block, var): ...@@ -41,7 +41,12 @@ def _clone_var_(block, var):
class Evaluator(object): class Evaluator(object):
""" """
Base Class for all evaluators Warning: better to use the fluid.metrics.* things, more
flexible support via pure Python and Operator, and decoupled
with executor. Short doc are intended to urge new user
start from Metrics.
Base Class for all evaluators.
Args: Args:
name(str): The name of evaluator. such as, "accuracy". Used for generate name(str): The name of evaluator. such as, "accuracy". Used for generate
...@@ -69,6 +74,10 @@ class Evaluator(object): ...@@ -69,6 +74,10 @@ class Evaluator(object):
def reset(self, executor, reset_program=None): def reset(self, executor, reset_program=None):
""" """
reset metric states at the begin of each pass/user specified batch reset metric states at the begin of each pass/user specified batch
Args:
executor(Executor|ParallelExecutor): a executor for executing the reset_program
reset_program(Program): a single Program for reset process
""" """
if reset_program is None: if reset_program is None:
reset_program = Program() reset_program = Program()
...@@ -85,15 +94,16 @@ class Evaluator(object): ...@@ -85,15 +94,16 @@ class Evaluator(object):
def eval(self, executor, eval_program=None): def eval(self, executor, eval_program=None):
""" """
Evaluate the statistics merged by multiple mini-batches. Evaluate the statistics merged by multiple mini-batches.
Args:
executor(Executor|ParallelExecutor): a executor for executing the eval_program
eval_program(Program): a single Program for eval process
""" """
raise NotImplementedError() raise NotImplementedError()
def create_state(self, suffix, dtype, shape): def _create_state(self, suffix, dtype, shape):
""" """
Create state variable. Create state variable.
NOTE: It is not a public API.
Args: Args:
suffix(str): the state suffix. suffix(str): the state suffix.
dtype(str|core.VarDesc.VarType): the state data type dtype(str|core.VarDesc.VarType): the state data type
...@@ -113,9 +123,35 @@ class Evaluator(object): ...@@ -113,9 +123,35 @@ class Evaluator(object):
class ChunkEvaluator(Evaluator): class ChunkEvaluator(Evaluator):
""" """
Warning: This would be deprecated in the future. Please use fluid.metrics.ChunkEvaluator
instead.
Accumulate counter numbers output by chunk_eval from mini-batches and Accumulate counter numbers output by chunk_eval from mini-batches and
compute the precision recall and F1-score using the accumulated counter compute the precision recall and F1-score using the accumulated counter
numbers. numbers.
For some basics of chunking, please refer to
'Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>'.
Args:
input (Variable): prediction output of the network.
label (Variable): label of the test data set.
chunk_scheme (str): can be IOB/IOE/IOBES and IO. See the chunk_eval op for details.
num_chunk_types (int): the number of chunk type.
excluded_chunk_types (list): A list including chunk type ids, indicating chunk types that are not counted.
Returns:
tuple: tuple containing: precision, recall, f1_score
Examples:
.. code-block:: python
exe = fluid.executor(place)
evaluator = fluid.Evaluator.ChunkEvaluator(input, label)
for epoch in PASS_NUM:
evaluator.reset(exe)
for data in batches:
loss = exe.run(fetch_list=[cost])
distance, instance_error = distance_evaluator.eval(exe)
""" """
def __init__( def __init__(
...@@ -130,11 +166,11 @@ class ChunkEvaluator(Evaluator): ...@@ -130,11 +166,11 @@ class ChunkEvaluator(Evaluator):
if main_program.current_block().idx != 0: if main_program.current_block().idx != 0:
raise ValueError("You can only invoke Evaluator in root block") raise ValueError("You can only invoke Evaluator in root block")
self.num_infer_chunks = self.create_state( self.num_infer_chunks = self._create_state(
dtype='int64', shape=[1], suffix='num_infer_chunks') dtype='int64', shape=[1], suffix='num_infer_chunks')
self.num_label_chunks = self.create_state( self.num_label_chunks = self._create_state(
dtype='int64', shape=[1], suffix='num_label_chunks') dtype='int64', shape=[1], suffix='num_label_chunks')
self.num_correct_chunks = self.create_state( self.num_correct_chunks = self._create_state(
dtype='int64', shape=[1], suffix='num_correct_chunks') dtype='int64', shape=[1], suffix='num_correct_chunks')
precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks = layers.chunk_eval( precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks = layers.chunk_eval(
input=input, input=input,
...@@ -178,6 +214,8 @@ class ChunkEvaluator(Evaluator): ...@@ -178,6 +214,8 @@ class ChunkEvaluator(Evaluator):
class EditDistance(Evaluator): class EditDistance(Evaluator):
""" """
Warning: This would be deprecated in the future. Please use fluid.metrics.EditDistance
instead.
Accumulate edit distance sum and sequence number from mini-batches and Accumulate edit distance sum and sequence number from mini-batches and
compute the average edit_distance and instance error of all batches. compute the average edit_distance and instance error of all batches.
...@@ -188,15 +226,16 @@ class EditDistance(Evaluator): ...@@ -188,15 +226,16 @@ class EditDistance(Evaluator):
ignored_tokens(list of int): Tokens that should be removed before ignored_tokens(list of int): Tokens that should be removed before
calculating edit distance. calculating edit distance.
Example: Examples:
.. code-block:: python
exe = fluid.executor(place) exe = fluid.executor(place)
distance_evaluator = fluid.Evaluator.EditDistance(input, label) distance_evaluator = fluid.Evaluator.EditDistance(input, label)
for epoch in PASS_NUM: for epoch in PASS_NUM:
distance_evaluator.reset(exe) distance_evaluator.reset(exe)
for data in batches: for data in batches:
loss = exe.run(fetch_list=[cost]) loss = exe.run(fetch_list=[cost])
distance, instance_error = distance_evaluator.eval(exe) distance, instance_error = distance_evaluator.eval(exe)
In the above example: In the above example:
'distance' is the average of the edit distance in a pass. 'distance' is the average of the edit distance in a pass.
...@@ -210,11 +249,11 @@ class EditDistance(Evaluator): ...@@ -210,11 +249,11 @@ class EditDistance(Evaluator):
if main_program.current_block().idx != 0: if main_program.current_block().idx != 0:
raise ValueError("You can only invoke Evaluator in root block") raise ValueError("You can only invoke Evaluator in root block")
self.total_distance = self.create_state( self.total_distance = self._create_state(
dtype='float32', shape=[1], suffix='total_distance') dtype='float32', shape=[1], suffix='total_distance')
self.seq_num = self.create_state( self.seq_num = self._create_state(
dtype='int64', shape=[1], suffix='seq_num') dtype='int64', shape=[1], suffix='seq_num')
self.instance_error = self.create_state( self.instance_error = self._create_state(
dtype='int64', shape=[1], suffix='instance_error') dtype='int64', shape=[1], suffix='instance_error')
distances, seq_num = layers.edit_distance( distances, seq_num = layers.edit_distance(
input=input, label=label, ignored_tokens=ignored_tokens) input=input, label=label, ignored_tokens=ignored_tokens)
...@@ -256,9 +295,10 @@ class EditDistance(Evaluator): ...@@ -256,9 +295,10 @@ class EditDistance(Evaluator):
class DetectionMAP(Evaluator): class DetectionMAP(Evaluator):
""" """
Warning: This would be deprecated in the future. Please use fluid.metrics.DetectionMAP
instead.
Calculate the detection mean average precision (mAP). Calculate the detection mean average precision (mAP).
TODO (Dang Qingqing): update the following doc.
The general steps are as follows: The general steps are as follows:
1. calculate the true positive and false positive according to the input 1. calculate the true positive and false positive according to the input
of detection and labels. of detection and labels.
...@@ -293,17 +333,18 @@ class DetectionMAP(Evaluator): ...@@ -293,17 +333,18 @@ class DetectionMAP(Evaluator):
- 11point: the 11-point interpolated average precision. - 11point: the 11-point interpolated average precision.
- integral: the natural integral of the precision-recall curve. - integral: the natural integral of the precision-recall curve.
Example: Examples:
.. code-block:: python
exe = fluid.executor(place) exe = fluid.executor(place)
map_evaluator = fluid.Evaluator.DetectionMAP(input, map_evaluator = fluid.Evaluator.DetectionMAP(input,
gt_label, gt_box, gt_difficult) gt_label, gt_box, gt_difficult)
cur_map, accum_map = map_evaluator.get_map_var() cur_map, accum_map = map_evaluator.get_map_var()
fetch = [cost, cur_map, accum_map] fetch = [cost, cur_map, accum_map]
for epoch in PASS_NUM: for epoch in PASS_NUM:
map_evaluator.reset(exe) map_evaluator.reset(exe)
for data in batches: for data in batches:
loss, cur_map_v, accum_map_v = exe.run(fetch_list=fetch) loss, cur_map_v, accum_map_v = exe.run(fetch_list=fetch)
In the above example: In the above example:
...@@ -340,9 +381,10 @@ class DetectionMAP(Evaluator): ...@@ -340,9 +381,10 @@ class DetectionMAP(Evaluator):
evaluate_difficult=evaluate_difficult, evaluate_difficult=evaluate_difficult,
ap_version=ap_version) ap_version=ap_version)
self.create_state(dtype='int32', shape=None, suffix='accum_pos_count') self._create_state(dtype='int32', shape=None, suffix='accum_pos_count')
self.create_state(dtype='float32', shape=None, suffix='accum_true_pos') self._create_state(dtype='float32', shape=None, suffix='accum_true_pos')
self.create_state(dtype='float32', shape=None, suffix='accum_false_pos') self._create_state(
dtype='float32', shape=None, suffix='accum_false_pos')
self.has_state = None self.has_state = None
var = self.helper.create_variable( var = self.helper.create_variable(
......
...@@ -18,17 +18,24 @@ from framework import Program, default_main_program, Variable ...@@ -18,17 +18,24 @@ from framework import Program, default_main_program, Variable
from . import core from . import core
__all__ = [ __all__ = [
'Executor', 'global_scope', 'scope_guard', 'switch_scope', 'fetch_var' 'Executor', 'global_scope', 'scope_guard', '_switch_scope', 'fetch_var'
] ]
g_scope = core.Scope() g_scope = core.Scope()
def global_scope(): def global_scope():
"""
Get the global/default scope instance. There are a lot of APIs use
:code:`global_scope` as its default value, e.g., :code:`Executor.run`
Returns:
Scope: The global/default scope instance.
"""
return g_scope return g_scope
def switch_scope(scope): def _switch_scope(scope):
global g_scope global g_scope
ex = g_scope ex = g_scope
g_scope = scope g_scope = scope
...@@ -37,12 +44,40 @@ def switch_scope(scope): ...@@ -37,12 +44,40 @@ def switch_scope(scope):
@contextlib.contextmanager @contextlib.contextmanager
def scope_guard(scope): def scope_guard(scope):
ex = switch_scope(scope) """
Change the global/default scope instance by Python `with` statement. All
variable in runtime will assigned to the new scope.
Examples:
>>> import paddle.fluid as fluid
>>> new_scope = fluid.Scope()
>>> with fluid.scope_guard(new_scope):
>>> ...
Args:
scope: The new global/default scope.
"""
ex = _switch_scope(scope)
yield yield
switch_scope(ex) _switch_scope(ex)
def as_numpy(tensor): def as_numpy(tensor):
"""
Convert a Tensor to a numpy.ndarray, its only support Tensor without LoD information.
For higher dimensional sequence data, please use LoDTensor directly.
Examples:
>>> import paddle.fluid as fluid
>>> outs = executor.run(...)
>>> np_outs = map(lambda x: as_numpy(x), outs)
>>> ...
Args:
tensor(Variable): a instance of Tensor
Returns:
numpy.ndarray
"""
if isinstance(tensor, list): if isinstance(tensor, list):
return [as_numpy(t) for t in tensor] return [as_numpy(t) for t in tensor]
assert isinstance(tensor, core.LoDTensor) assert isinstance(tensor, core.LoDTensor)
...@@ -135,14 +170,18 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name): ...@@ -135,14 +170,18 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name):
def fetch_var(name, scope=None, return_numpy=True): def fetch_var(name, scope=None, return_numpy=True):
""" """
Fetch the value of the variable with the given name from the given scope Fetch the value of the variable with the given name from the
given scope.
Args: Args:
name(str): name of the variable. Typically, only persistable variables name(str): name of the variable. Typically, only persistable variables
can be found in the scope used for running the program. can be found in the scope used for running the program.
scope(core.Scope|None): scope object. It should be the scope where scope(core.Scope|None): scope object. It should be the scope where
you pass to Executor.run() when running your program. you pass to Executor.run() when running your program.
If None, global_scope() will be used. If None, global_scope() will be used. Default None.
return_numpy(bool): whether convert the tensor to numpy.ndarray return_numpy(bool): whether convert the tensor to numpy.ndarray.
Default True.
Returns: Returns:
LodTensor|numpy.ndarray LodTensor|numpy.ndarray
""" """
...@@ -162,7 +201,7 @@ def fetch_var(name, scope=None, return_numpy=True): ...@@ -162,7 +201,7 @@ def fetch_var(name, scope=None, return_numpy=True):
return tensor return tensor
def get_program_cache_key(feed, fetch_list): def _get_program_cache_key(feed, fetch_list):
feed_var_names = feed.keys() feed_var_names = feed.keys()
def to_name_str(var): def to_name_str(var):
...@@ -181,6 +220,25 @@ def get_program_cache_key(feed, fetch_list): ...@@ -181,6 +220,25 @@ def get_program_cache_key(feed, fetch_list):
class Executor(object): class Executor(object):
"""
An Executor in Python, only support the single-GPU running. For multi-cards, please refer to
ParallelExecutor.
Python executor takes a program, add feed operators and fetch operators to this program according
to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
the variables(or names) that user want to get after program run. Note: the executor will run all
operators in the program but not only the operators dependent by the fetch_list.
It store the global variables into the global scope, and create a local scope for the temporary
variables. The local scope contents will be discarded after every minibatch forward/backward finished.
But the global scope variables will be persistent through different runs.
All of ops in program will be running in sequence.
Args:
place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device
Note: For debugging complicated network in parallel-GPUs, you can test it on the executor.
They has the exactly same arguments, and expected the same results.
"""
def __init__(self, place): def __init__(self, place):
self.place = place self.place = place
p = core.Place() p = core.Place()
...@@ -189,6 +247,23 @@ class Executor(object): ...@@ -189,6 +247,23 @@ class Executor(object):
self.program_caches = dict() self.program_caches = dict()
def as_lodtensor(self, data): def as_lodtensor(self, data):
"""
Convert numpy.ndarray to Tensor, its only support Tensor without LoD information.
For higher dimensional sequence data, please use LoDTensor directly.
Examples:
>>> import paddle.fluid as fluid
>>> exe = fluid.executor(fluid.CPUPlace())
>>> data = np.array(size=(100, 200, 300))
>>> np_outs = map(lambda x: exe.as_lodtensor(x), data)
>>> ...
Args:
data(numpy.ndarray): a instance of array
Returns:
LoDTensor
"""
if isinstance(data, list): if isinstance(data, list):
raise RuntimeError("Some of your feed data hold LoD information. \ raise RuntimeError("Some of your feed data hold LoD information. \
They can not be completely cast from a list of Python \ They can not be completely cast from a list of Python \
...@@ -280,23 +355,47 @@ class Executor(object): ...@@ -280,23 +355,47 @@ class Executor(object):
scope=None, scope=None,
return_numpy=True, return_numpy=True,
use_program_cache=False): use_program_cache=False):
""" Run program by this Executor. Feed data by feed map, fetch result by fetch_list. """
Run program by this Executor. Feed data by feed map, fetch result by fetch_list.
Python executor takes a program, add feed operators and fetch operators to this program according Python executor takes a program, add feed operators and fetch operators to this program according
to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
the variables(or names) that user want to get after program run. Note: the executor will run all the variables(or names) that user want to get after program run.
Note: the executor will run all
operators in the program but not only the operators dependent by the fetch_list operators in the program but not only the operators dependent by the fetch_list
:param program: the program that need to run, if not provied, then default_main_program will be used. Args:
:param feed: feed variable map, e.g. {"image": ImageData, "label": LableData} program(Program): the program that need to run, if not provied, then default_main_program will be used.
:param fetch_list: a list of variable or variable names that user want to get, run will return them according feed(dict): feed variable map, e.g. {"image": ImageData, "label": LableData}
to this list. fetch_list(list): a list of variable or variable names that user want to get, run will return them according to this list.
:param feed_var_name: the name for the input variable of feed Operator. feed_var_name(str): the name for the input variable of feed Operator.
:param fetch_var_name: the name for the output variable of feed Operator. fetch_var_name(str): the name for the output variable of fetch Operator.
:param scope: the scope used to run this program, you can switch it to different scope. default is global_scope scope(Scope): the scope used to run this program, you can switch it to different scope. default is global_scope
:param return_numpy: if convert the fetched tensor to numpy return_numpy(bool): if convert the fetched tensor to numpy
:param use_program_cache: set use_program_cache to true if program not changed compare to the last step. use_program_cache(bool): set use_program_cache to true if program not changed compare to the last step.
:return: result according to fetch_list.
Returns:
list(numpy.array): fetch result according to fetch_list.
Examples:
>>> data = layers.data(name='X', shape=[1], dtype='float32')
>>> hidden = layers.fc(input=data, size=10)
>>> layers.assign(hidden, out)
>>> loss = layers.mean(out)
>>> adam = fluid.optimizer.Adam()
>>> adam.minimize(loss)
>>> cpu = core.CPUPlace()
>>> exe = Executor(cpu)
>>> exe.run(default_startup_program())
>>> x = numpy.random.random(size=(10, 1)).astype('float32')
>>> outs = exe.run(
>>> feed={'X': x},
>>> fetch_list=[loss.name])
""" """
if feed is None: if feed is None:
feed = {} feed = {}
...@@ -317,7 +416,7 @@ class Executor(object): ...@@ -317,7 +416,7 @@ class Executor(object):
if scope is None: if scope is None:
scope = global_scope() scope = global_scope()
cache_key = get_program_cache_key(feed, fetch_list) cache_key = _get_program_cache_key(feed, fetch_list)
if use_program_cache: if use_program_cache:
cached_program = self._get_program_cache(cache_key) cached_program = self._get_program_cache(cache_key)
if cached_program is None: if cached_program is None:
......
此差异已折叠。
...@@ -27,13 +27,30 @@ __all__ = ['Inferencer', ] ...@@ -27,13 +27,30 @@ __all__ = ['Inferencer', ]
class Inferencer(object): class Inferencer(object):
"""
Inferencer High Level API.
Args:
infer_func (Python func): Infer function that will return predict Variable
param_path (str): The path where the inference model is saved by fluid.io.save_params
place (Place): place to do the inference
parallel (bool): use parallel_executor to run the inference, it will use multi CPU/GPU.
Examples:
.. code-block:: python
def inference_program():
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
return y_predict
place = fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path="/tmp/model", place=place)
"""
def __init__(self, infer_func, param_path, place=None, parallel=False): def __init__(self, infer_func, param_path, place=None, parallel=False):
"""
:param infer_func: a function that will return predict Variable
:param param_path: the path where the inference model is saved by fluid.io.save_params
:param place: place to do the inference
:param parallel: use parallel_executor to run the inference, it will use multi CPU/GPU.
"""
self.param_path = param_path self.param_path = param_path
self.scope = core.Scope() self.scope = core.Scope()
self.parallel = parallel self.parallel = parallel
...@@ -60,9 +77,20 @@ class Inferencer(object): ...@@ -60,9 +77,20 @@ class Inferencer(object):
def infer(self, inputs, return_numpy=True): def infer(self, inputs, return_numpy=True):
""" """
:param inputs: a map of {"input_name": input_var} that will be feed into the inference program Do Inference for Inputs
to get the predict value
:return: the predict value of the inference model Args:
inputs (map): a map of {"input_name": input_var} that will be feed into the inference program
return_numpy (bool): transform return value into numpy or not
Returns:
Tensor or Numpy: the predict value of the inference model for the inputs
Examples:
.. code-block:: python
tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32")
results = inferencer.infer({'x': tensor_x})
""" """
if not isinstance(inputs, dict): if not isinstance(inputs, dict):
raise ValueError( raise ValueError(
......
此差异已折叠。
此差异已折叠。
...@@ -28,8 +28,8 @@ import math_op_patch ...@@ -28,8 +28,8 @@ import math_op_patch
from math_op_patch import * from math_op_patch import *
import detection import detection
from detection import * from detection import *
import metric import metric_op
from metric import * from metric_op import *
from learning_rate_scheduler import * from learning_rate_scheduler import *
__all__ = [] __all__ = []
...@@ -41,5 +41,5 @@ __all__ += control_flow.__all__ ...@@ -41,5 +41,5 @@ __all__ += control_flow.__all__
__all__ += ops.__all__ __all__ += ops.__all__
__all__ += device.__all__ __all__ += device.__all__
__all__ += detection.__all__ __all__ += detection.__all__
__all__ += metric.__all__ __all__ += metric_op.__all__
__all__ += learning_rate_scheduler.__all__ __all__ += learning_rate_scheduler.__all__
...@@ -185,12 +185,14 @@ def Print(input, ...@@ -185,12 +185,14 @@ def Print(input,
Returns: Returns:
Variable: Output tensor, same data with input tensor. Variable: Output tensor, same data with input tensor.
Examples: Examples:
.. code-block:: python .. code-block:: python
value = some_layer(...) value = some_layer(...)
Print(value, summarize=10, Print(value, summarize=10,
message="The content of some_layer: ") message="The content of some_layer: ")
''' '''
helper = LayerHelper('print', **locals()) helper = LayerHelper('print', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype()) out = helper.create_tmp_variable(dtype=helper.input_dtype())
...@@ -1201,6 +1203,31 @@ class ConditionalBlockGuard(BlockGuard): ...@@ -1201,6 +1203,31 @@ class ConditionalBlockGuard(BlockGuard):
class ConditionalBlock(object): class ConditionalBlock(object):
'''
**ConditionalBlock**
ConditionalBlock is an operator that bind a block to a specific condition,
if the condition matches, the corresponding block will be executed.
Args:
inputs (Variable): bool conditions.
is_scalar_condition (bool): whether the branch is controled by a scalar.
name(str): name of this ConditionalBlock.
Examples:
.. code-block:: python
cond = layers.less_than(x=label, y=limit)
true_image, false_image = layers.split_lod_tensor(
input=image, mask=cond)
true_cond = layers.ConditionalBlock([true_image])
with true_cond.block():
...
with false_cond.block():
...
'''
def __init__(self, inputs, is_scalar_condition=False, name=None): def __init__(self, inputs, is_scalar_condition=False, name=None):
for each_input in inputs: for each_input in inputs:
if not isinstance(each_input, Variable): if not isinstance(each_input, Variable):
......
...@@ -16,7 +16,7 @@ All layers just related to the detection neural network. ...@@ -16,7 +16,7 @@ All layers just related to the detection neural network.
""" """
from layer_function_generator import generate_layer_fn from layer_function_generator import generate_layer_fn
from layer_function_generator import autodoc from layer_function_generator import autodoc, templatedoc
from ..layer_helper import LayerHelper from ..layer_helper import LayerHelper
import tensor import tensor
import nn import nn
...@@ -155,7 +155,7 @@ def detection_output(loc, ...@@ -155,7 +155,7 @@ def detection_output(loc,
return nmsed_outs return nmsed_outs
@autodoc() @templatedoc()
def detection_map(detect_res, def detection_map(detect_res,
label, label,
class_num, class_num,
...@@ -166,6 +166,47 @@ def detection_map(detect_res, ...@@ -166,6 +166,47 @@ def detection_map(detect_res,
input_states=None, input_states=None,
out_states=None, out_states=None,
ap_version='integral'): ap_version='integral'):
"""
${comment}
Args:
detect_res: ${detect_res_comment}
label: ${label_comment}
class_num: ${class_num_comment}
background_label: ${background_label_comment}
overlap_threshold: ${overlap_threshold_comment}
evaluate_difficult: ${evaluate_difficult_comment}
has_state: ${has_state_comment}
input_states: If not None, It contains 3 elements:
1. pos_count ${pos_count_comment}.
2. true_pos ${true_pos_comment}.
3. false_pos ${false_pos_comment}.
out_states: If not None, it contains 3 elements.
1. accum_pos_count ${accum_pos_count_comment}.
2. accum_true_pos ${accum_true_pos_comment}.
3. accum_false_pos ${accum_false_pos_comment}.
ap_version: ${ap_type_comment}
Returns:
${map_comment}
Examples:
.. code-block:: python
detect_res = fluid.layers.data(
name='detect_res',
shape=[10, 6],
append_batch_size=False,
dtype='float32')
label = fluid.layers.data(
name='label',
shape=[10, 6],
append_batch_size=False,
dtype='float32')
map_out = fluid.layers.detection_map(detect_res, label, 21)
"""
helper = LayerHelper("detection_map", **locals()) helper = LayerHelper("detection_map", **locals())
def __create_var(type): def __create_var(type):
......
...@@ -126,7 +126,7 @@ def auc(input, label, curve='ROC', num_thresholds=200): ...@@ -126,7 +126,7 @@ def auc(input, label, curve='ROC', num_thresholds=200):
topk_out, topk_indices = nn.topk(input, k=k) topk_out, topk_indices = nn.topk(input, k=k)
auc_out = helper.create_tmp_variable(dtype="float32") auc_out = helper.create_tmp_variable(dtype="float32")
helper.append_op( helper.append_op(
type="accuracy", type="auc",
inputs={ inputs={
"Out": [topk_out], "Out": [topk_out],
"Indices": [topk_indices], "Indices": [topk_indices],
......
...@@ -94,6 +94,7 @@ __all__ = [ ...@@ -94,6 +94,7 @@ __all__ = [
'mean_iou', 'mean_iou',
'relu', 'relu',
'log', 'log',
'crop',
] ]
...@@ -2675,18 +2676,35 @@ def sequence_expand(x, y, ref_level=-1, name=None): ...@@ -2675,18 +2676,35 @@ def sequence_expand(x, y, ref_level=-1, name=None):
def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0): def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0):
''' '''
**beam search**
This function implements the beam search algorithm. This function implements the beam search algorithm.
Beam search is a classical algorithm for selecting candidate words
in a machine translation task.
Refer to `Beam search <https://en.wikipedia.org/wiki/Beam_search>`_
for more details.
Args: Args:
pre_ids (Variable): ${pre_ids_comment} pre_ids (Variable): ids in previous step.
ids (Variable): ${ids_comment} ids (Variable): a LoDTensor of shape of [None,k]
scores (Variable): ${scores_comment} scores (Variable): a LoDTensor that has the same shape and LoD with `ids`
beam_size (int): ${beam_size_comment} beam_size (int): beam size for beam search
end_id (int): ${end_id_comment} end_id (int): the token id which indicates the end of a sequence
level (int): ${level_comment} level (int): the level of LoDTensor
Returns: Returns:
tuple: a tuple of beam_search output variables: selected_ids, selected_scores tuple: a tuple of beam_search output variables: `selected_ids`, `selected_scores`
Examples:
.. code-block:: python
# current_score is a Tensor of shape (num_batch_size, embed_size), which
# consists score of each candidate word.
topk_scores, topk_indices = pd.topk(current_score, k=50)
selected_ids, selected_scores = pd.beam_search(
pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0)
''' '''
helper = LayerHelper('beam_search', **locals()) helper = LayerHelper('beam_search', **locals())
score_type = scores.dtype score_type = scores.dtype
...@@ -4996,3 +5014,101 @@ def mean_iou(input, label, num_classes): ...@@ -4996,3 +5014,101 @@ def mean_iou(input, label, num_classes):
}, },
attrs={"num_classes": num_classes}) attrs={"num_classes": num_classes})
return out_mean_iou, out_wrong, out_correct return out_mean_iou, out_wrong, out_correct
def crop(x, shape=None, offsets=None, name=None):
"""
Crop input into output, as specified by offsets and shape.
.. code-block:: text
* Case 1:
Given
X = [[0, 1, 2, 0, 0]
[0, 3, 4, 0, 0]
[0, 0, 0, 0, 0]],
and
shape = [2, 2],
offsets = [0, 1],
output is:
Out = [[1, 2],
[3, 4]].
* Case 2:
Given
X = [[0, 1, 2, 5, 0]
[0, 3, 4, 6, 0]
[0, 0, 0, 0, 0]],
and shape is tensor
shape = [[0, 0, 0]
[0, 0, 0]]
and
offsets = [0, 1],
output is:
Out = [[1, 2, 5],
[3, 4, 6]].
Args:
x (Variable): The input tensor variable.
shape (Variable|list/tuple of integer): The output shape is specified
by `shape`, which can a Variable or a list/tupe of integer.
If a tensor Variable, it's rank must be the same as `x`. This way
is suitable for the case that the output shape may be changed each
iteration. If a list/tupe of integer, it's length must be the same
as the rank of `x`
offsets (Variable|list/tuple of integer|None): Specifies the copping
offsets at each dimension. It can be a Variable or or a list/tupe
of integer. If a tensor Variable, it's rank must be the same as `x`.
This way is suitable for the case that the offsets may be changed
each iteration. If a list/tupe of integer, it's length must be the
same as the rank of `x`. If None, the offsets are 0 at each
dimension.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: The cropped tensor variable.
Raises:
ValueError: If shape is not a list, tuple or Variable.
Examples:
.. code-block:: python
x = fluid.layers.data(name="x", shape=[3, 5], dtype="float32")
y = fluid.layers.data(name="y", shape=[2, 3], dtype="float32")
crop = fluid.layers.crop(x, shape=y)
# or
z = fluid.layers.data(name="z", shape=[3, 5], dtype="float32")
crop = fluid.layers.crop(z, shape=[2, 3])
"""
helper = LayerHelper('crop', **locals())
if not (isinstance(shape, list) or isinstance(shape, tuple) or \
isinstance(shape, Variable)):
raise ValueError("The shape should be a list, tuple or Variable.")
if offsets is None:
offsets = [0] * len(x.shape)
out = helper.create_tmp_variable(x.dtype)
ipts = {'X': x}
attrs = {}
if isinstance(shape, Variable):
ipts['Y'] = shape
else:
attrs['shape'] = shape
if isinstance(offsets, Variable):
ipts['Offsets'] = offsets
else:
attrs['offsets'] = offsets
helper.append_op(
type='crop',
inputs=ipts,
outputs={'Out': out},
attrs=None if len(attrs) == 0 else attrs)
return out
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册