提交 9dccca96 编写于 作者: F fengjiayi

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into some_small_fixes

......@@ -23,7 +23,7 @@ repos:
- id: clang-format-with-version-check
name: clang-format
description: Format files with ClangFormat.
entry: bash ./.clang_format.hook -i
entry: bash ./tools/codestyle/clang_format.hook -i
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
- repo: local
......@@ -52,7 +52,7 @@ repos:
hooks:
- id: copyright_checker
name: copyright_checker
entry: python ./.copyright.hook
entry: python ./tools/codestyle/copyright.hook
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
# Use UBUNTU_MIRROR can speed up apt-get speed.
# ARG UBUNTU_MIRROR
# RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv
RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
RUN pip install -U pip
RUN pip install -U kubernetes paddlepaddle
# IMPORTANT:
# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
# exmaple: unset http_proxy && unset https_proxy && python fluid_benchmark.py ...
RUN pip install -U pip
RUN pip install -U kubernetes paddlepaddle
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python'
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python'
......@@ -14,9 +21,11 @@ RUN pip uninstall -y paddlepaddle && mkdir /workspace
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
RUN chmod +x /usr/bin/paddle_k8s
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s
RUN pip install /*.whl && rm -f /*.whl
ENV LD_LIBRARY_PATH=/usr/local/lib
ADD fluid_benchmark.py recordio_converter.py models/ /workspace/
ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh /workspace/
ADD models/ /workspace/models/
......@@ -97,7 +97,7 @@ def dist_transpile(trainer_id, args):
return train_program, fluid.default_startup_program()
else:
raise ValueError(
'TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
)
......@@ -264,8 +264,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
break
else:
loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
if args.update_method == "pserver":
exe.bcast_params()
if args.use_reader_op:
num_samples += args.batch_size * args.gpus
else:
......@@ -301,9 +299,18 @@ def print_train_time(start_time, end_time, num_samples):
(num_samples, train_elapsed, examples_per_sec))
def print_paddle_envs():
print('----------- Configuration envs -----------')
for k in os.environ:
if "PADDLE_" in k:
print "ENV %s:%s" % (k, os.environ[k])
print('------------------------------------------------')
def main():
args = parse_args()
print_arguments(args)
print_paddle_envs()
# the unique trainer id, starting from 0, needed by trainer
# only
......
......@@ -17,6 +17,7 @@ import copy
import argparse
import random
import os
import copy
from kube_templates import pserver, trainer, envs
......@@ -108,10 +109,9 @@ def gen_job():
tn_container["ports"][0]["containerPort"] = spreadport
envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname})
envs.append({"name": "TRAINERS", "value": str(args.trainers)})
envs.append({"name": "PSERVERS", "value": str(args.pservers)})
envs.append({"name": "PADDLE_TRAINERS", "value": str(args.trainers)})
envs.append({"name": "PADDLE_PSERVERS", "value": str(args.pservers)})
envs.append({"name": "ENTRY", "value": args.entry})
envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)})
envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
# NOTE: these directories below are cluster specific, please modify
# this settings before you run on your own cluster.
......@@ -166,17 +166,23 @@ def gen_job():
tn["spec"]["template"]["spec"]["volumes"] = volumes
tn_container["volumeMounts"] = volumeMounts
ps_container["env"] = envs
ps_container["env"].append({"name": "TRAINING_ROLE", "value": "PSERVER"})
ps_container["env"] = copy.deepcopy(envs)
ps_container["env"].append({
"name": "PADDLE_TRAINING_ROLE",
"value": "PSERVER"
})
tn_container["env"] = envs
if args.disttype == "pserver":
tn_container["env"].append({
"name": "TRAINING_ROLE",
"name": "PADDLE_TRAINING_ROLE",
"value": "TRAINER"
})
elif args.disttype == "nccl2" or args.disttype == "local":
# NCCL2 have no training role, set to plain WORKER
tn_container["env"].append({"name": "TRAINING_ROLE", "value": "WORKER"})
tn_container["env"].append({
"name": "PADDLE_TRAINING_ROLE",
"value": "WORKER"
})
os.mkdir(args.jobname)
if args.disttype == "pserver":
......
......@@ -45,7 +45,8 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
ELSE()
MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
ENDIF()
SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-unused-result")
SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result")
SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
ExternalProject_Add(
......
#!/bin/bash
python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler metric > layers.rst
for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer
for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler
do
python gen_doc.py ${module} > ${module}.rst
done
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
==========
transpiler
==========
DistributeTranspiler
--------------------
.. autoclass:: paddle.fluid.transpiler.DistributeTranspiler
:members:
:noindex:
InferenceTranspiler
-------------------
.. autoclass:: paddle.fluid.transpiler.InferenceTranspiler
:members:
:noindex:
memory_optimize
---------------
.. autofunction:: paddle.fluid.transpiler.memory_optimize
:noindex:
release_memory
--------------
.. autofunction:: paddle.fluid.transpiler.release_memory
:noindex:
HashName
--------
.. autoclass:: paddle.fluid.transpiler.HashName
:members:
:noindex:
RoundRobin
----------
.. autoclass:: paddle.fluid.transpiler.RoundRobin
:members:
:noindex:
......@@ -168,13 +168,13 @@ cd /paddle/python/paddle/fluid/tests/book
第二步,启动Parameter Server:
```bash
PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.2 TRAINERS=2 POD_IP=192.168.1.2 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=PSERVER python test_fit_a_line.py
PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.2 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=192.168.1.2 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=PSERVER python test_fit_a_line.py
```
执行命令后请等待出现提示: ```Server listening on 192.168.1.2:6174 ```, 表示Paramter Server已经正常启动。
第三步,启动Trainer:
```bash
PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.3 TRAINERS=2 POD_IP=192.168.1.3 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=TRAINER python test_fit_a_line.py
PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.3 PADDLE_TRAINERS=2 PADDLE_CURRENT_IPP=192.168.1.3 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=TRAINER python test_fit_a_line.py
```
由于我们定义的Trainer的数量是2个,因此需要在另外一个计算节点上再启动一个Trainer。
......
......@@ -114,8 +114,8 @@ def gen_train_list(file_pattern, trainers, trainer_id):
ret_list.append(f)
return ret_list
trainers = int(os.getenv("TRAINERS"))
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
trainers = int(os.getenv("PADDLE_TRAINERS"))
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
data_file = fluid.layers.io.open_files(
filenames=gen_train_list("./mnist-[0-9]*.recordio", 2, 0),
thread_num=1,
......
......@@ -13,6 +13,7 @@ cpu_noavx_openblas `fluid.tgz <https://guest:@paddleci.ngrok.io/repository
cuda7.5_cudnn5_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
cuda8.0_cudnn5_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
cuda8.0_cudnn7_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/fluid.tgz>`_
cuda9.0_cudnn7_avx_mkl `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/fluid.tgz>`_
====================== ========================================
从源码编译
......
......@@ -40,10 +40,9 @@ void Main(bool use_gpu) {
//# 2. Prepare input.
int64_t data[4] = {1, 2, 3, 4};
PaddleBuf buf{.data = data, .length = sizeof(data)};
PaddleTensor tensor{.name = "",
.shape = std::vector<int>({4, 1}),
.data = buf,
.data = PaddleBuf(data, sizeof(data)),
.dtype = PaddleDType::INT64};
// For simplicity, we set all the slots with the same data.
......@@ -55,14 +54,12 @@ void Main(bool use_gpu) {
//# 4. Get output.
ASSERT_EQ(outputs.size(), 1UL);
LOG(INFO) << "output buffer size: " << outputs.front().data.length;
const size_t num_elements = outputs.front().data.length / sizeof(float);
LOG(INFO) << "output buffer size: " << outputs.front().data.length();
const size_t num_elements = outputs.front().data.length() / sizeof(float);
// The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i];
LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
}
// TODO(Superjomn): this is should be free automatically
free(outputs[0].data.data);
}
}
......@@ -86,10 +83,9 @@ void MainThreads(int num_threads, bool use_gpu) {
for (int batch_id = 0; batch_id < num_batches; ++batch_id) {
// 2. Dummy Input Data
int64_t data[4] = {1, 2, 3, 4};
PaddleBuf buf{.data = data, .length = sizeof(data)};
PaddleTensor tensor{.name = "",
.shape = std::vector<int>({4, 1}),
.data = buf,
.data = PaddleBuf(data, sizeof(data)),
.dtype = PaddleDType::INT64};
std::vector<PaddleTensor> inputs(4, tensor);
std::vector<PaddleTensor> outputs;
......@@ -99,13 +95,13 @@ void MainThreads(int num_threads, bool use_gpu) {
// 4. Get output.
ASSERT_EQ(outputs.size(), 1UL);
LOG(INFO) << "TID: " << tid << ", "
<< "output buffer size: " << outputs.front().data.length;
const size_t num_elements = outputs.front().data.length / sizeof(float);
<< "output buffer size: " << outputs.front().data.length();
const size_t num_elements =
outputs.front().data.length() / sizeof(float);
// The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i];
LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
}
free(outputs[0].data.data);
}
});
}
......
# Inference High-level APIs
This document describes the high-level inference APIs one can use to easily deploy a Paddle model for an application.
The APIs are described in `paddle_inference_api.h`, just one header file, and two libaries `libpaddle_fluid.so` and `libpaddle_fluid_api.so` are needed.
## PaddleTensor
We provide the `PaddleTensor` data structure is to give a general tensor interface.
The definition is
```c++
struct PaddleTensor {
std::string name; // variable name.
std::vector<int> shape;
PaddleBuf data; // blob of data.
PaddleDType dtype;
};
```
The data is stored in a continuous memory `PaddleBuf`, and tensor's data type is specified by a `PaddleDType`.
The `name` field is used to specify the name of input variable,
that is important when there are multiple inputs and need to distiuish which variable to set.
## engine
The inference APIs has two different underlying implementation, currently there are two valid engines:
- the native engine, which is consists of the native operators and framework,
- the Anakin engine, which is a Anakin library embeded.
The native engine takes a native Paddle model as input, and supports any model that trained by Paddle,
but the Anakin engine can only take the Anakin model as input(user need to manully transform the format first) and currently not all Paddle models are supported.
```c++
enum class PaddleEngineKind {
kNative = 0, // Use the native Fluid facility.
kAnakin, // Use Anakin for inference.
};
```
## PaddlePredictor and how to create one
The main interface is `PaddlePredictor`, there are following methods
- `bool Run(const std::vector<PaddleTensor>& inputs, std::vector<PaddleTensor>* output_data)`
- take inputs and output `output_data`
- `Clone` to clone a predictor from an existing one, with model parameter shared.
There is a factory method to help create a predictor, and the user takes the ownership of this object.
```c++
template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
```
By specifying the engine kind and config, one can get an specific implementation.
## Reference
- [paddle_inference_api.h](./paddle_inference_api.h)
- [demos](./demo)
......@@ -13,3 +13,53 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/contrib/inference/paddle_inference_api.h"
namespace paddle {
PaddleBuf::PaddleBuf(PaddleBuf&& other)
: data_(other.data_),
length_(other.length_),
memory_owned_(other.memory_owned_) {
other.memory_owned_ = false;
other.data_ = nullptr;
other.length_ = 0;
}
PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; }
PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
// only the buffer with external memory can be copied
assert(!other.memory_owned_);
data_ = other.data_;
length_ = other.length_;
memory_owned_ = other.memory_owned_;
return *this;
}
void PaddleBuf::Resize(size_t length) {
// Only the owned memory can be reset, the external memory can't be changed.
if (length_ == length) return;
assert(memory_owned_);
Free();
data_ = new char[length];
length_ = length;
memory_owned_ = true;
}
void PaddleBuf::Reset(void* data, size_t length) {
Free();
memory_owned_ = false;
data_ = data;
length_ = length;
}
void PaddleBuf::Free() {
if (memory_owned_ && data_) {
assert(length_ > 0);
delete static_cast<char*>(data_);
data_ = nullptr;
length_ = 0;
}
}
} // namespace paddle
\ No newline at end of file
......@@ -21,6 +21,7 @@ limitations under the License. */
#pragma once
#include <cassert>
#include <memory>
#include <string>
#include <vector>
......@@ -32,12 +33,38 @@ enum PaddleDType {
INT64,
};
struct PaddleBuf {
void* data; // pointer to the data memory.
size_t length; // number of memory bytes.
class PaddleBuf {
public:
PaddleBuf() = default;
PaddleBuf(PaddleBuf&& other);
// Copy only available when memory is managed externally.
explicit PaddleBuf(const PaddleBuf&);
PaddleBuf& operator=(const PaddleBuf&);
// Do not own the memory.
PaddleBuf(void* data, size_t length)
: data_(data), length_(length), memory_owned_{false} {}
// Own memory.
PaddleBuf(size_t length)
: data_(new char[length]), length_(length), memory_owned_(true) {}
// Resize to `length` bytes.
void Resize(size_t length);
// Reset to external memory.
void Reset(void* data, size_t length);
bool empty() const { return length_ == 0; }
void* data() const { return data_; }
size_t length() const { return length_; }
~PaddleBuf() { Free(); }
private:
void Free();
void* data_{nullptr}; // pointer to the data memory.
size_t length_{0}; // number of memory bytes.
bool memory_owned_{true};
};
struct PaddleTensor {
PaddleTensor() = default;
std::string name; // variable name.
std::vector<int> shape;
// TODO(Superjomn) for LoD support, add a vector<vector<int>> field if needed.
......@@ -67,8 +94,9 @@ class PaddlePredictor {
// Predict an record.
// The caller should be responsible for allocating and releasing the memory of
// `inputs`. `inputs` should be alive until Run returns. caller should be
// responsible for releasing the memory of `output_data`.
// `inputs`. `inputs` should be available until Run returns. Caller should be
// responsible for the output tensor's buffer, either allocated or passed from
// outside.
virtual bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data) = 0;
......@@ -81,8 +109,7 @@ class PaddlePredictor {
// The common configs for all the predictors.
struct Config {
std::string model_dir; // path to the model directory.
bool enable_engine{false}; // Enable to execute (part of) the model on
std::string model_dir; // path to the model directory.
};
};
......
......@@ -48,7 +48,7 @@ bool PaddleInferenceAnakinPredictor::Run(
auto d_tensor_in_p = executor_.get_in(input.name);
float *d_data_p = d_tensor_in_p->mutable_data();
if (cudaMemcpy(d_data_p,
static_cast<float *>(input.data.data),
static_cast<float *>(input.data.data()),
d_tensor_in_p->valid_size() * sizeof(float),
cudaMemcpyHostToDevice) != 0) {
LOG(ERROR) << "copy data from CPU to GPU error";
......@@ -65,8 +65,11 @@ bool PaddleInferenceAnakinPredictor::Run(
for (auto &output : *output_data) {
auto *tensor = executor_.get_out(output.name);
output.shape = tensor->shape();
if (output.data.length() < tensor->valid_size() * sizeof(float)) {
output.data.Resize(tensor->valid_size() * sizeof(float));
}
// Copy data from GPU -> CPU
if (cudaMemcpy(output.data.data,
if (cudaMemcpy(output.data.data(),
tensor->mutable_data(),
tensor->valid_size() * sizeof(float),
cudaMemcpyDeviceToHost) != 0) {
......
......@@ -37,28 +37,26 @@ TEST(inference, anakin) {
float data[1 * 3 * 224 * 224] = {1.0f};
PaddleBuf buf{.data = data, .length = sizeof(data)};
PaddleTensor tensor{.name = "input_0",
.shape = std::vector<int>({1, 3, 224, 224}),
.data = buf,
.data = PaddleBuf(data, sizeof(data)),
.dtype = PaddleDType::FLOAT32};
// For simplicity, we set all the slots with the same data.
std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
std::vector<PaddleTensor> paddle_tensor_feeds;
paddle_tensor_feeds.emplace_back(std::move(tensor));
float data_out[1000];
PaddleBuf buf_out{.data = data_out, .length = sizeof(data)};
PaddleTensor tensor_out{.name = "prob_out",
.shape = std::vector<int>({1000, 1}),
.data = buf_out,
.data = PaddleBuf(),
.dtype = PaddleDType::FLOAT32};
std::vector<PaddleTensor> outputs(1, tensor_out);
std::vector<PaddleTensor> outputs;
outputs.emplace_back(std::move(tensor_out));
ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
float* data_o = static_cast<float*>(outputs[0].data.data);
float* data_o = static_cast<float*>(outputs[0].data.data());
for (size_t j = 0; j < 1000; ++j) {
LOG(INFO) << "output[" << j << "]: " << data_o[j];
}
......
......@@ -178,8 +178,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(static_cast<void *>(input_ptr),
inputs[i].data.data,
inputs[i].data.length);
inputs[i].data.data(),
inputs[i].data.length());
feeds->push_back(input);
}
return true;
......@@ -241,10 +241,11 @@ bool NativePaddlePredictor::GetFetch(
}
outputs->at(i).shape = shape;
outputs->at(i).data.length = sizeof(float) * data.size();
outputs->at(i).data.data = malloc(outputs->at(i).data.length);
std::memcpy(
outputs->at(i).data.data, data.data(), outputs->at(i).data.length);
auto &buffer = outputs->at(i).data;
if (buffer.empty() || buffer.length() < sizeof(float) * data.size()) {
buffer.Resize(sizeof(float) * data.size());
}
std::memcpy(buffer.data(), data.data(), buffer.length());
outputs->at(i).dtype = PaddleDType::FLOAT32;
// TODO(panyx0718): support other types? fill tensor name? avoid a copy.
}
......
......@@ -27,13 +27,12 @@ namespace paddle {
PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
PaddleTensor pt;
pt.data.data = t->data<void>();
if (t->type() == typeid(int64_t)) {
pt.data.length = t->numel() * sizeof(int64_t);
pt.data.Reset(t->data<void>(), t->numel() * sizeof(int64_t));
pt.dtype = PaddleDType::INT64;
} else if (t->type() == typeid(float)) {
pt.data.length = t->numel() * sizeof(float);
pt.data.Reset(t->data<void>(), t->numel() * sizeof(float));
pt.dtype = PaddleDType::FLOAT32;
} else {
LOG(FATAL) << "unsupported type.";
......@@ -79,8 +78,8 @@ void MainWord2Vec(bool use_gpu) {
std::vector<PaddleTensor> outputs;
ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
ASSERT_EQ(outputs.size(), 1UL);
size_t len = outputs[0].data.length;
float* data = static_cast<float*>(outputs[0].data.data);
size_t len = outputs[0].data.length();
float* data = static_cast<float*>(outputs[0].data.data());
for (size_t j = 0; j < len / sizeof(float); ++j) {
ASSERT_LT(data[j], 1.0);
ASSERT_GT(data[j], -1.0);
......@@ -103,8 +102,6 @@ void MainWord2Vec(bool use_gpu) {
EXPECT_LT(lod_data[i] - data[i], 1e-3);
EXPECT_GT(lod_data[i] - data[i], -1e-3);
}
free(outputs[0].data.data);
}
void MainImageClassification(bool use_gpu) {
......@@ -143,13 +140,12 @@ void MainImageClassification(bool use_gpu) {
std::vector<PaddleTensor> outputs;
ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
ASSERT_EQ(outputs.size(), 1UL);
size_t len = outputs[0].data.length;
float* data = static_cast<float*>(outputs[0].data.data);
size_t len = outputs[0].data.length();
float* data = static_cast<float*>(outputs[0].data.data());
float* lod_data = output1.data<float>();
for (size_t j = 0; j < len / sizeof(float); ++j) {
EXPECT_NEAR(lod_data[j], data[j], 1e-3);
}
free(data);
}
void MainThreadsWord2Vec(bool use_gpu) {
......@@ -192,8 +188,8 @@ void MainThreadsWord2Vec(bool use_gpu) {
// check outputs range
ASSERT_EQ(local_outputs.size(), 1UL);
const size_t len = local_outputs[0].data.length;
float* data = static_cast<float*>(local_outputs[0].data.data);
const size_t len = local_outputs[0].data.length();
float* data = static_cast<float*>(local_outputs[0].data.data());
for (size_t j = 0; j < len / sizeof(float); ++j) {
ASSERT_LT(data[j], 1.0);
ASSERT_GT(data[j], -1.0);
......@@ -205,7 +201,6 @@ void MainThreadsWord2Vec(bool use_gpu) {
for (int i = 0; i < refs[tid].numel(); ++i) {
EXPECT_NEAR(ref_data[i], data[i], 1e-3);
}
free(data);
});
}
for (int i = 0; i < num_jobs; ++i) {
......@@ -251,14 +246,13 @@ void MainThreadsImageClassification(bool use_gpu) {
// check outputs correctness
ASSERT_EQ(local_outputs.size(), 1UL);
const size_t len = local_outputs[0].data.length;
float* data = static_cast<float*>(local_outputs[0].data.data);
const size_t len = local_outputs[0].data.length();
float* data = static_cast<float*>(local_outputs[0].data.data());
float* ref_data = refs[tid].data<float>();
EXPECT_EQ(refs[tid].numel(), len / sizeof(float));
for (int i = 0; i < refs[tid].numel(); ++i) {
EXPECT_NEAR(ref_data[i], data[i], 1e-3);
}
free(data);
});
}
for (int i = 0; i < num_jobs; ++i) {
......
......@@ -57,6 +57,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
for (auto &p : params) {
grad_names_.insert(GradVarName(p));
}
balance_vars_.resize(places_.size(), 0);
}
void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
......@@ -140,11 +141,30 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp(
checker(op.InputArgumentNames(), recv_vars);
}
size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
const std::vector<std::string> &var_names) const {
int64_t numel_sum = 0;
for (auto var_name : var_names) {
auto var_desc = all_vars_.at(var_name);
PADDLE_ENFORCE_NOT_NULL(var_desc);
auto dim = framework::make_ddim(var_desc->GetShape());
int64_t numel = framework::product(dim);
PADDLE_ENFORCE_GT(numel, 0);
numel_sum += numel;
}
auto smallest =
std::min_element(std::begin(balance_vars_), std::end(balance_vars_));
size_t dev_id =
static_cast<size_t>(std::distance(std::begin(balance_vars_), smallest));
balance_vars_[dev_id] += numel_sum;
return dev_id;
}
std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
const ProgramDesc &program) const {
std::unordered_map<std::string, VarDesc *> all_vars;
for (auto *var : program.Block(0).AllVars()) {
all_vars[var->Name()] = var;
all_vars_.emplace(var->Name(), var);
}
auto graph = new SSAGraph();
......@@ -161,35 +181,16 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
auto send_vars = FindDistTrainSendVars(program);
auto recv_vars = FindDistTrainRecvVars(program);
std::vector<std::unordered_set<std::string>> var_name_on_devices;
std::vector<std::unordered_set<std::string>> bcast_var_name_set;
var_name_on_devices.resize(places_.size());
bcast_var_name_set.resize(places_.size());
size_t cur_device_id = 0;
std::vector<int64_t> balance_grads(places_.size(), 0);
auto get_appropriate_dev = [&](std::string &g_name) -> size_t {
auto var_desc = all_vars.at(g_name);
PADDLE_ENFORCE_NOT_NULL(var_desc);
auto dim = framework::make_ddim(var_desc->GetShape());
int64_t numel = framework::product(dim);
PADDLE_ENFORCE_GE(numel, 0);
auto smallest =
std::min_element(std::begin(balance_grads), std::end(balance_grads));
size_t dev_id =
static_cast<size_t>(std::distance(std::begin(balance_grads), smallest));
balance_grads[dev_id] += numel;
return dev_id;
};
bool is_forwarding = true;
for (auto *op : program.Block(0).AllOps()) {
if (boost::get<int>(
op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
static_cast<int>(OpRole::kRPC)) {
// append rpc op if program is distributed trainer main program.
// always use the first device
CreateRPCOp(&result, *op);
} else if (IsDistTrainOp(*op, send_vars, recv_vars)) {
CreateDistTrainOp(&result, *op);
......@@ -199,15 +200,19 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
BuildStrategy::GradientScaleStrategy::kCustomized) {
CreateScaleLossGradOp(&result);
}
// This assumes the backward generating code will ensure IsScaleLossOp
// is true only for the op that scale the final scalar loss.
// It also assumes backward op will always follow the forward op in
// the block.
is_forwarding = false;
} else {
int op_dev_id = GetOpDeviceID(var_name_on_devices, *op);
int op_dev_id = GetOpDeviceID(*op);
if (op_dev_id == -1) { // var on all device
CreateComputationalOps(&result, *op, places_.size());
} else {
CreateComputationalOp(&result, *op, op_dev_id);
for (auto &var_name : op->OutputArgumentNames()) {
var_name_on_devices[op_dev_id].emplace(var_name);
var_name_on_devices_.emplace(var_name, op_dev_id);
}
}
if (!is_forwarding && places_.size() > 1) {
......@@ -230,19 +235,22 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
switch (strategy_.reduce_) {
case BuildStrategy::ReduceStrategy::kReduce:
cur_device_id = get_appropriate_dev(g_name);
cur_device_id = GetAppropriateDeviceID({g_name});
CreateReduceOp(&result, g_name, cur_device_id);
var_name_on_devices[cur_device_id].emplace(g_name);
var_name_on_devices_.emplace(g_name, cur_device_id);
bcast_var_name_set[cur_device_id].emplace(p_name);
break;
case BuildStrategy::ReduceStrategy::kAllReduce:
if (IsSparseGradient(all_vars, g_name)) {
if (IsSparseGradient(g_name)) {
CreateReduceOp(&result, g_name, 0);
CreateBroadcastOp(&result, g_name, 0);
} else {
InsertAllReduceOp(&result, g_name);
}
break;
default:
LOG(FATAL) << "Unknown reduce strategy ";
break;
}
}
} catch (boost::bad_get e) {
......@@ -261,7 +269,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
}
/*
Dependency graph has been constructed. However, there are still data
harzaeds need to be handled.
hazards need to be handled.
*/
PolishGraphToSupportDataHazards(&result);
......@@ -273,11 +281,9 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
return std::unique_ptr<SSAGraph>(graph);
}
bool MultiDevSSAGraphBuilder::IsSparseGradient(
const std::unordered_map<std::string, VarDesc *> &all_vars,
const std::string &og) const {
PADDLE_ENFORCE(all_vars.count(og) != 0);
if (all_vars.at(og)->GetType() == proto::VarType::SELECTED_ROWS) {
bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const {
PADDLE_ENFORCE(all_vars_.count(og) != 0);
if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) {
return true;
}
return false;
......@@ -363,24 +369,23 @@ bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
return is_pg_once;
}
int MultiDevSSAGraphBuilder::GetOpDeviceID(
const std::vector<std::unordered_set<std::string>> &var_name_on_devices,
const OpDesc &op) const {
int MultiDevSSAGraphBuilder::GetOpDeviceID(const OpDesc &op) const {
if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
return -1;
}
int var_dev_id = -1;
for (auto &var_name : op.InputArgumentNames()) {
if (var_dev_id != -1) break;
for (size_t i = 0; i < var_name_on_devices.size(); ++i) {
if (var_name_on_devices[i].count(var_name)) {
var_dev_id = static_cast<int>(i);
break;
}
for (auto &varname : op.InputArgumentNames()) {
int dev_id = GetVarDeviceID(varname);
if (dev_id != -1) {
return dev_id;
}
}
return var_dev_id;
return -1;
}
int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const {
auto got = var_name_on_devices_.find(varname);
return got == var_name_on_devices_.end() ? -1 : got->second;
}
void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
......@@ -449,6 +454,8 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result,
return var;
}
// Find the first occurence of `prev_op_name` and make current `op` depend
// on it.
void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op,
const std::string &prev_op_name) const {
for (auto &prev_op : result->ops_) {
......@@ -463,16 +470,66 @@ void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op,
void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result,
const OpDesc &op) const {
CreateComputationalOp(result, op, 0);
int op_dev_id = -1;
if (op.Type() == "split_byref") {
op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames());
for (auto &varname : op.InputArgumentNames()) {
var_name_on_devices_.emplace(varname, op_dev_id);
}
}
for (auto &varname : op.OutputArgumentNames()) {
var_name_on_devices_.emplace(varname, op_dev_id);
}
} else if (op.Type() == "concat") {
op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
} else {
PADDLE_ENFORCE(
"the distribute training related op should be in [split_byref, "
"concat].");
}
PADDLE_ENFORCE(op_dev_id != -1,
"can not find right place for distributed op: %s", op.Type());
CreateComputationalOp(result, op, op_dev_id);
if (op.Type() == "concat") {
ConnectOp(result, result->ops_.back().get(), "fetch_barrier");
}
}
// Create RPC related op handles that connects its in ops and out ops.
void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result,
const OpDesc &op) const {
result->ops_.emplace_back(
new RPCOpHandle(op, local_scopes_[0], op.Type(), places_[0]));
int op_dev_id = -1;
if (op.Type() == "send") {
op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
// the variable name which contains .block means it was splited by
// split_byref op
// so that we can balance the variable blocks to all the pserver instances.
if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce &&
op.InputArgumentNames()[0].find(".block") == std::string::npos) {
op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames());
for (auto &varname : op.InputArgumentNames()) {
var_name_on_devices_.emplace(varname, op_dev_id);
}
}
} else if (op.Type() == "recv") {
op_dev_id = GetAppropriateDeviceID(op.OutputArgumentNames());
for (auto &varname : op.OutputArgumentNames()) {
var_name_on_devices_.emplace(varname, op_dev_id);
}
} else {
// send_barrier and fetch_barrier op can be scheduled on device 0
op_dev_id = 0;
}
PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s",
op.Type());
result->ops_.emplace_back(new RPCOpHandle(op, local_scopes_[op_dev_id],
op.Type(), places_[op_dev_id]));
if (op.Type() == "send_barrier") {
ConnectOp(result, result->ops_.back().get(), "send");
......@@ -488,9 +545,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result,
"send, send_barrier. recv, fetch_barrier]");
}
// TODO(Yancey1989): schedule rpc op on different place may
// increate throughput
CreateOpHandleIOs(result, op, 0);
CreateOpHandleIOs(result, op, op_dev_id);
}
bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {
......
......@@ -47,10 +47,11 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
#endif
std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
int GetVarDeviceID(const std::string &varname) const;
private:
void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op,
size_t place_id) const;
size_t device_id) const;
private:
std::string loss_var_name_;
......@@ -96,21 +97,23 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
const std::string &og,
std::unordered_set<std::string> *og_has_been_broadcast) const;
int GetOpDeviceID(
const std::vector<std::unordered_set<std::string>> &var_name_on_devices,
const OpDesc &op) const;
int GetOpDeviceID(const OpDesc &op) const;
void InsertAllReduceOp(SSAGraph *result, const std::string &og) const;
void CreateBroadcastOp(SSAGraph *result, const std::string &p_name,
size_t src_dev_id) const;
bool IsSparseGradient(
const std::unordered_map<std::string, VarDesc *> &all_vars,
const std::string &og) const;
bool IsSparseGradient(const std::string &og) const;
size_t GetAppropriateDeviceID(
const std::vector<std::string> &var_names) const;
private:
BuildStrategy strategy_;
mutable std::unordered_map<std::string, VarDesc *> all_vars_;
mutable std::unordered_map<std::string, int> var_name_on_devices_;
mutable std::vector<int64_t> balance_vars_;
void SetCommunicationContext(OpHandleBase *op_handle,
const platform::Place &p) const;
......
......@@ -30,6 +30,7 @@ class SSAGraphBuilder {
SSAGraphBuilder() {}
virtual ~SSAGraphBuilder() {}
virtual std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const = 0;
virtual int GetVarDeviceID(const std::string &var_name) const { return -1; }
DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder);
......
......@@ -96,6 +96,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
auto cur_ready_vars = ready_vars.PopAll(1, &timeout);
if (timeout) {
std::lock_guard<std::mutex> l(exception_mu_);
if (exception_) {
auto exp = *exception_;
exception_.reset();
......@@ -199,6 +200,7 @@ void ThreadedSSAGraphExecutor::RunOp(
ready_var_q->Extend(op->Outputs());
VLOG(10) << op << " " << op->Name() << "Signal posted";
} catch (platform::EnforceNotMet ex) {
std::lock_guard<std::mutex> l(exception_mu_);
exception_.reset(new platform::EnforceNotMet(ex));
} catch (...) {
LOG(FATAL) << "Unknown exception catched";
......
......@@ -56,6 +56,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_;
platform::DeviceContextPool fetch_ctxs_;
std::mutex exception_mu_;
std::unique_ptr<platform::EnforceNotMet> exception_;
std::atomic<int> running_ops_;
......
......@@ -21,7 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/detail/grpc_client.h"
#include "paddle/fluid/operators/distributed/grpc_client.h"
#endif
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -49,8 +49,8 @@ Executor::Executor(const platform::Place& place) : place_(place) {}
#ifdef PADDLE_WITH_DISTRIBUTE
void Executor::Complete() {
::paddle::operators::detail::RPCClient::GetInstance<
::paddle::operators::detail::GRPCClient>()
::paddle::operators::distributed::RPCClient::GetInstance<
::paddle::operators::distributed::GRPCClient>()
->SendComplete();
}
#endif
......@@ -321,7 +321,8 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
}
void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope, bool create_vars) {
bool create_local_scope, bool create_vars,
bool keep_kids) {
Scope* local_scope = scope;
if (create_vars) {
if (create_local_scope) {
......@@ -344,12 +345,20 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
}
}
platform::DeviceContextPool::Instance().Get(place_)->Wait();
if (create_vars && create_local_scope) {
if (local_scope != scope) {
scope->DeleteScope(local_scope);
} else {
// Delete the local scopes created in operators.
scope->DropKids();
if (!keep_kids) {
// By default, we should delete all kid scopes after run executor because
// some operators may create local scope when running, such as while_op.
// But when while_op also create a local executor to run it's sub block,
// the sub scopes it created should not be dropped immediately, because
// while_grad_op will use some variables created during while_op run, so
// we need to keep the kids and wait for the outer executor to drop them.
scope->DropKids();
}
}
if (FLAGS_benchmark) {
VLOG(2) << "-------------------------------------------------------";
VLOG(2) << "Memory used after deleting local scope: "
......
......@@ -78,7 +78,7 @@ class Executor {
void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope = true,
bool create_vars = true);
bool create_vars = true, bool keep_kids = false);
void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
std::map<std::string, const LoDTensor*>* feed_targets,
......
......@@ -110,7 +110,6 @@ ParallelExecutor::ParallelExecutor(
// Step 3. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp
details::SSAGraphBuilderFactory builder_factory(
member_->places_, loss_var_name, params, member_->local_scopes_,
build_strategy);
......@@ -122,9 +121,10 @@ ParallelExecutor::ParallelExecutor(
#endif
}
builder_ = std::move(builder_factory.Create());
member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, places,
builder_factory.Create()->Build(main_program)));
builder_->Build(main_program)));
member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, std::move(var_infos),
......@@ -133,10 +133,22 @@ ParallelExecutor::ParallelExecutor(
void ParallelExecutor::BCastParamsToGPUs(
const std::unordered_set<std::string> &vars) const {
auto *main_scope = member_->local_scopes_[0];
// the the initialize bcast, all vars would be bcast from device(0), otherwise
// bcast from the specified device.
bool initialize = builder_.get() == nullptr ? true : false;
for (auto &var : vars) {
auto *main_var = main_scope->FindVar(var);
int var_dev_id =
builder_.get() == nullptr ? -1 : builder_->GetVarDeviceID(var);
if (!initialize && var_dev_id == -1) continue;
framework::Variable *main_var = nullptr;
if (initialize) {
main_var = member_->local_scopes_[0]->FindVar(var);
} else {
main_var = member_->local_scopes_[var_dev_id]->FindVar(var);
}
if (main_var == nullptr || !main_var->IsType<LoDTensor>()) {
continue;
}
......@@ -151,7 +163,8 @@ void ParallelExecutor::BCastParamsToGPUs(
for (size_t i = 0; i < member_->places_.size(); ++i) {
auto place = member_->places_[i];
void *buffer;
if (i == 0) {
if ((initialize && i == 0) || (!initialize && i == var_dev_id)) {
buffer = const_cast<void *>(main_tensor.data<void>());
} else {
auto local_scope = member_->local_scopes_[i];
......
......@@ -19,12 +19,14 @@ limitations under the License. */
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/execution_strategy.h"
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
......@@ -68,6 +70,7 @@ class ParallelExecutor {
private:
ParallelExecutorPrivate *member_;
std::unique_ptr<details::SSAGraphBuilder> builder_;
};
} // namespace framework
......
......@@ -27,7 +27,7 @@ void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
SubGraphFuse(graph, node_inside_subgraph_teller_);
}
} // analysis
} // inference
} // namespace analysis
} // namespace inference
} // paddle
} // namespace paddle
......@@ -184,9 +184,9 @@ else()
set(DEPS_OPS ${DEPS_OPS} nccl_op)
endif()
add_subdirectory(detail)
if(WITH_DISTRIBUTE)
add_subdirectory(distributed)
set(DISTRIBUTE_DEPS "")
if(WITH_GRPC)
set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
......@@ -195,18 +195,11 @@ if(WITH_DISTRIBUTE)
endif()
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
op_library(prefetch_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(prefetch_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(recv_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(send_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS})
op_library(fetch_barrier_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(fetch_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
foreach(dist_op "prefetch_op" "listen_and_serv_op" "send_op" "recv_op" "send_barrier_op" "fetch_barrier_op")
op_library(${dist_op} DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(${dist_op}.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
endforeach()
#set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
#cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op
# listen_and_serv_op sum_op executor SERIAL)
......
......@@ -143,7 +143,7 @@ $$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
__attribute__((unused)) constexpr char TanhShrinkDoc[] = R"DOC(
TanhShrink Activation Operator.
$$out = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
$$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
)DOC";
......@@ -385,7 +385,7 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC(
STanh Activation Operator.
$$out = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$
$$out = b * \\frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$
)DOC");
}
......
......@@ -21,8 +21,6 @@ namespace operators {
using batch_norm_bwd = mkldnn::batch_normalization_backward;
using batch_norm_fwd = mkldnn::batch_normalization_forward;
using framework::DataLayout;
using framework::Tensor;
using mkldnn::memory;
using mkldnn::primitive;
using mkldnn::reorder;
......@@ -31,18 +29,6 @@ using paddle::platform::MKLDNNDeviceContext;
using paddle::platform::MKLDNNMemDesc;
using platform::to_void_cast;
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
namespace {
template <typename T>
struct bn_type_traits {
......
......@@ -22,22 +22,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using DataLayout = framework::DataLayout;
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
class BatchNormOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
......
......@@ -19,6 +19,22 @@ limitations under the License. */
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using DataLayout = framework::DataLayout;
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename DeviceContext, typename T>
class BatchNormKernel : public framework::OpKernel<T> {
public:
......
......@@ -110,6 +110,7 @@ REGISTER_OPERATOR(bilinear_interp, ops::BilinearInterpOp,
ops::BilinearInterpOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(bilinear_interp_grad, ops::BilinearInterpOpGrad);
REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::BilinearInterpKernel<float>);
REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::BilinearInterpKernel<float>,
ops::BilinearInterpKernel<uint8_t>);
REGISTER_OP_CPU_KERNEL(bilinear_interp_grad,
ops::BilinearInterpGradKernel<float>);
......@@ -46,8 +46,10 @@ class BilinearInterpKernel : public framework::OpKernel<T> {
int in_chw = channels * in_hw;
int out_chw = channels * out_hw;
T ratio_h = (out_h > 1) ? static_cast<T>(in_h - 1) / (out_h - 1) : 0.f;
T ratio_w = (out_w > 1) ? static_cast<T>(in_w - 1) / (out_w - 1) : 0.f;
float ratio_h =
(out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
float ratio_w =
(out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
if (in_h == out_h && in_w == out_w) {
memcpy(output, input, input_t->numel() * sizeof(T));
......@@ -56,24 +58,24 @@ class BilinearInterpKernel : public framework::OpKernel<T> {
for (int i = 0; i < out_h; ++i) { // loop for images
int h = ratio_h * i;
int hid = (h < in_h - 1) ? 1 : 0;
T h1lambda = ratio_h * i - h;
T h2lambda = 1 - h1lambda;
float h1lambda = ratio_h * i - h;
float h2lambda = 1.f - h1lambda;
for (int j = 0; j < out_w; ++j) {
int w = ratio_w * j;
int wid = (w < in_w - 1) ? 1 : 0;
T w1lambda = ratio_w * j - w;
T w2lambda = 1 - w1lambda;
float w1lambda = ratio_w * j - w;
float w2lambda = 1.f - w1lambda;
// calculate four position for bilinear interpolation
const T* in_pos = &input[k * in_chw + h * in_w + w];
T* out_pos = &output[k * out_chw + i * out_w + j];
for (int c = 0; c < channels; ++c) { // loop for channels
// bilinear interpolation
out_pos[0] =
out_pos[0] = static_cast<T>(
h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[wid]) +
h1lambda * (w2lambda * in_pos[hid * in_w] +
w1lambda * in_pos[hid * in_w + wid]);
w1lambda * in_pos[hid * in_w + wid]));
in_pos += in_hw;
out_pos += out_hw;
}
......@@ -117,8 +119,10 @@ class BilinearInterpGradKernel : public framework::OpKernel<T> {
int in_chw = channels * in_hw;
int out_chw = channels * out_hw;
T ratio_h = (out_h > 1) ? static_cast<T>(in_h - 1) / (out_h - 1) : 0.f;
T ratio_w = (out_w > 1) ? static_cast<T>(in_w - 1) / (out_w - 1) : 0.f;
float ratio_h =
(out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
float ratio_w =
(out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
if (in_h == out_h && in_w == out_w) {
memcpy(d_input, d_output, d_input_t->numel() * sizeof(T));
......@@ -127,22 +131,24 @@ class BilinearInterpGradKernel : public framework::OpKernel<T> {
for (int i = 0; i < out_h; ++i) { // loop for images
int h = ratio_h * i;
int hid = (h < in_h - 1) ? 1 : 0;
T h1lambda = ratio_h * i - h;
T h2lambda = 1 - h1lambda;
float h1lambda = ratio_h * i - h;
float h2lambda = 1 - h1lambda;
for (int j = 0; j < out_w; ++j) {
int w = ratio_w * j;
int wid = (w < in_w - 1) ? 1 : 0;
T w1lambda = ratio_w * j - w;
T w2lambda = 1 - w1lambda;
float w1lambda = ratio_w * j - w;
float w2lambda = 1 - w1lambda;
T* in_pos = &d_input[k * in_chw + h * in_w + w];
const T* out_pos = &d_output[k * out_chw + i * out_w + j];
for (int c = 0; c < channels; ++c) { // loop for channels
in_pos[0] += h2lambda * w2lambda * out_pos[0];
in_pos[wid] += h2lambda * w1lambda * out_pos[0];
in_pos[hid * in_w] += h1lambda * w2lambda * out_pos[0];
in_pos[hid * in_w + wid] += h1lambda * w1lambda * out_pos[0];
in_pos[0] += static_cast<T>(h2lambda * w2lambda * out_pos[0]);
in_pos[wid] += static_cast<T>(h2lambda * w1lambda * out_pos[0]);
in_pos[hid * in_w] +=
static_cast<T>(h1lambda * w2lambda * out_pos[0]);
in_pos[hid * in_w + wid] +=
static_cast<T>(h1lambda * w1lambda * out_pos[0]);
in_pos += in_hw;
out_pos += out_hw;
}
......
......@@ -15,13 +15,13 @@
#pragma once
#ifdef PADDLE_WITH_GRPC
#include "paddle/fluid/operators/detail/grpc_client.h"
#include "paddle/fluid/operators/detail/grpc_server.h"
#define RPCSERVER_T detail::AsyncGRPCServer
#define RPCCLIENT_T detail::GRPCClient
#include "paddle/fluid/operators/distributed/grpc_client.h"
#include "paddle/fluid/operators/distributed/grpc_server.h"
#define RPCSERVER_T distributed::AsyncGRPCServer
#define RPCCLIENT_T distributed::GRPCClient
#else
#include "paddle/fluid/operators/detail/brpc_client.h"
#include "paddle/fluid/operators/detail/brpc_server.h"
#define RPCSERVER_T detail::AsyncBRPCServer
#define RPCCLIENT_T detail::BRPCClient
#include "paddle/fluid/operators/distributed/brpc_client.h"
#include "paddle/fluid/operators/distributed/brpc_server.h"
#define RPCSERVER_T distributed::AsyncBRPCServer
#define RPCCLIENT_T distributed::BRPCClient
#endif
......@@ -175,12 +175,12 @@ class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC(
Detection mAP evaluate operator.
The general steps are as follows. First, calculate the true positive and
false positive according to the input of detection and labels, then
calculate the mAP evaluate value.
Supporting '11 point' and 'integral' mAP algorithm. Please get more information
from the following articles:
https://sanchom.wordpress.com/tag/average-precision/
https://arxiv.org/abs/1512.02325
false positive according to the input of detection and labels, then
calculate the mAP evaluate value.
Supporting '11 point' and 'integral' mAP algorithm. Please get more information
from the following articles:
https://sanchom.wordpress.com/tag/average-precision/
https://arxiv.org/abs/1512.02325
)DOC");
}
......
if(NOT WITH_DISTRIBUTE)
return()
endif()
if(WITH_GRPC)
grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor
......
......@@ -12,12 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/detail/brpc_client.h"
#include "paddle/fluid/operators/distributed/brpc_client.h"
#include "paddle/fluid/framework/threadpool.h"
namespace paddle {
namespace operators {
namespace detail {
namespace distributed {
DEFINE_int32(brpc_channel_num, 24,
"Number of channels to send requests connected to one server");
......@@ -175,6 +175,6 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) {
return q;
}
} // namespace detail
} // namespace distributed
} // namespace operators
} // namespace paddle
......@@ -31,13 +31,13 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/detail/rpc_client.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h"
#include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
namespace paddle {
namespace operators {
namespace detail {
namespace distributed {
struct ChannelContext {
brpc::Channel channel;
......@@ -95,6 +95,6 @@ class BRPCClient : public RPCClient {
DISABLE_COPY_AND_ASSIGN(BRPCClient);
};
} // namespace detail
} // namespace distributed
} // namespace operators
} // namespace paddle
......@@ -12,13 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/detail/brpc_server.h"
#include "paddle/fluid/operators/detail/request_handler.h"
#include "paddle/fluid/operators/distributed/brpc_server.h"
#include "paddle/fluid/operators/distributed/request_handler.h"
namespace sendrecv {
typedef std::unordered_map<std::string,
paddle::operators::detail::RequestHandler*>
paddle::operators::distributed::RequestHandler*>
HandlerMap;
class BRPCServiceImpl : public SendRecvService {
......@@ -27,17 +27,17 @@ class BRPCServiceImpl : public SendRecvService {
: request_send_h_(nullptr),
request_get_h_(nullptr),
request_prefetch_h_(nullptr) {
auto it = rpc_call_map.find(paddle::operators::detail::kRequestSend);
auto it = rpc_call_map.find(paddle::operators::distributed::kRequestSend);
if (it != rpc_call_map.end()) {
request_send_h_ = it->second;
}
it = rpc_call_map.find(paddle::operators::detail::kRequestSend);
it = rpc_call_map.find(paddle::operators::distributed::kRequestSend);
if (it != rpc_call_map.end()) {
request_get_h_ = it->second;
}
it = rpc_call_map.find(paddle::operators::detail::kRequestPrefetch);
it = rpc_call_map.find(paddle::operators::distributed::kRequestPrefetch);
if (it != rpc_call_map.end()) {
request_prefetch_h_ = it->second;
}
......@@ -88,15 +88,15 @@ class BRPCServiceImpl : public SendRecvService {
}
private:
paddle::operators::detail::RequestHandler* request_send_h_;
paddle::operators::detail::RequestHandler* request_get_h_;
paddle::operators::detail::RequestHandler* request_prefetch_h_;
paddle::operators::distributed::RequestHandler* request_send_h_;
paddle::operators::distributed::RequestHandler* request_get_h_;
paddle::operators::distributed::RequestHandler* request_prefetch_h_;
};
} // namespace sendrecv
namespace paddle {
namespace operators {
namespace detail {
namespace distributed {
void AsyncBRPCServer::StartServer() {
// Instance of your service.
......@@ -139,6 +139,6 @@ void AsyncBRPCServer::WaitServerReady() {
VLOG(3) << "AsyncGRPCServer WaitSeverReady";
}
}; // namespace detail
}; // namespace distributed
}; // namespace operators
}; // namespace paddle
......@@ -19,12 +19,12 @@ limitations under the License. */
#include <string>
#include "brpc/server.h"
#include "paddle/fluid/operators/detail/rpc_server.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h"
#include "paddle/fluid/operators/distributed/rpc_server.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
namespace paddle {
namespace operators {
namespace detail {
namespace distributed {
class AsyncBRPCServer final : public RPCServer {
public:
......@@ -48,6 +48,6 @@ class AsyncBRPCServer final : public RPCServer {
int ready_;
};
}; // namespace detail
}; // namespace distributed
}; // namespace operators
}; // namespace paddle
......@@ -17,11 +17,11 @@ limitations under the License. */
// file and did some modifications so that we can send gRPC
// requests without too much copying of the tensor data.
#include "paddle/fluid/operators/detail/bytebuffer_stream.h"
#include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
namespace paddle {
namespace operators {
namespace detail {
namespace distributed {
GrpcByteBufferSource::GrpcByteBufferSource() {}
......@@ -83,6 +83,6 @@ google::protobuf::int64 GrpcByteBufferSource::ByteCount() const {
return byte_count_;
}
} // namespace detail
} // namespace distributed
} // namespace operators
} // namespace paddle
......@@ -106,7 +106,7 @@ class GrpcBufferReader final
namespace paddle {
namespace operators {
namespace detail {
namespace distributed {
// Source provides a way for a particular RPC implementation to provide
// received data to ParseFrom.
class Source {
......@@ -183,6 +183,6 @@ class GrpcByteSource : public Source {
char space_[sizeof(Reader)];
};
} // namespace detail
} // namespace distributed
} // namespace operators
} // namespace paddle
......@@ -12,19 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detail/grpc_client.h"
#include "paddle/fluid/operators/distributed/grpc_client.h"
#include <sys/time.h>
#include <limits>
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/request_handler.h"
#include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
namespace detail {
namespace distributed {
void GRPCClient::InitImpl() { InitEventLoop(); }
......@@ -276,6 +276,6 @@ std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
return ch;
}
} // namespace detail
} // namespace distributed
} // namespace operators
} // namespace paddle
......@@ -38,13 +38,13 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/detail/rpc_client.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
namespace paddle {
namespace operators {
namespace detail {
namespace distributed {
struct VarHandle {
std::string ep;
......@@ -226,6 +226,6 @@ class GRPCClient : public RPCClient {
DISABLE_COPY_AND_ASSIGN(GRPCClient);
};
} // namespace detail
} // namespace distributed
} // namespace operators
} // namespace paddle
......@@ -21,8 +21,8 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include "paddle/fluid/operators/detail/variable_response.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/printf.h"
......@@ -50,7 +50,7 @@ void RunSerdeTestSelectedRows(platform::Place place) {
for (int i = 0; i < 564; ++i) rows->push_back(i);
::grpc::ByteBuffer msg;
operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg);
operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg);
EXPECT_GT(msg.Length(), static_cast<size_t>(0));
// deserialize
......@@ -81,10 +81,10 @@ void RunSerdeTestSelectedRows(platform::Place place) {
// deserialize zero-copy
// framework::Variable var2;
// operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2);
// operators::distributed::DeserializeFromByteBuffer(msg, ctx, &var2);
framework::Scope scope;
scope.Var("myvar");
operators::detail::VariableResponse resp(&scope, &ctx);
operators::distributed::VariableResponse resp(&scope, &ctx);
EXPECT_EQ(resp.Parse(msg), 0);
framework::Variable* var2 = resp.GetVar();
......@@ -128,7 +128,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
math::set_constant(ctx, tensor, 31.9);
::grpc::ByteBuffer msg;
operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg);
operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg);
EXPECT_GT(msg.Length(), static_cast<size_t>(0));
// deserialize
......@@ -171,7 +171,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
// deserialize zero-copy
framework::Scope scope;
scope.Var("myvar");
operators::detail::VariableResponse resp(&scope, &ctx);
operators::distributed::VariableResponse resp(&scope, &ctx);
if (from_type == 0) {
EXPECT_EQ(resp.Parse(msg), 0);
} else {
......
......@@ -15,13 +15,13 @@ limitations under the License. */
#include <limits>
#include <string>
#include "paddle/fluid/operators/detail/grpc_server.h"
#include "paddle/fluid/operators/distributed/grpc_server.h"
using ::grpc::ServerAsyncResponseWriter;
namespace paddle {
namespace operators {
namespace detail {
namespace distributed {
enum CallStatus { PROCESS = 0, FINISH };
// reference:
......@@ -74,7 +74,7 @@ class RequestSend final : public RequestBase {
request_.reset(new VariableResponse(request_handler->scope(),
request_handler->dev_ctx(),
!request_handler->sync_mode()));
int method_id = static_cast<int>(detail::GrpcMethod::kSendVariable);
int method_id = static_cast<int>(distributed::GrpcMethod::kSendVariable);
service_->RequestAsyncUnary(
method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
......@@ -106,7 +106,7 @@ class RequestGet final : public RequestBase {
::grpc::ServerCompletionQueue* cq,
RequestHandler* request_handler, int req_id)
: RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
auto method_id = static_cast<int>(detail::GrpcMethod::kGetVariable);
auto method_id = static_cast<int>(distributed::GrpcMethod::kGetVariable);
service_->RequestAsyncUnary(
method_id, &ctx_, &request_, &responder_, cq_, cq_,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
......@@ -150,7 +150,8 @@ class RequestPrefetch final : public RequestBase {
local_scope_(nullptr) {
request_.reset(new VariableResponse(request_handler->scope(),
request_handler->dev_ctx(), true));
int method_id = static_cast<int>(detail::GrpcMethod::kPrefetchVariable);
int method_id =
static_cast<int>(distributed::GrpcMethod::kPrefetchVariable);
service_->RequestAsyncUnary(
method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
......@@ -354,6 +355,6 @@ void AsyncGRPCServer::HandleRequest(
}
}
} // namespace detail
} // namespace distributed
} // namespace operators
} // namespace paddle
......@@ -29,17 +29,17 @@ limitations under the License. */
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/detail/grpc_service.h"
#include "paddle/fluid/operators/detail/request_handler.h"
#include "paddle/fluid/operators/detail/rpc_server.h"
#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/grpc_service.h"
#include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/operators/distributed/rpc_server.h"
#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
namespace detail {
namespace distributed {
class RequestBase;
......@@ -84,6 +84,6 @@ class AsyncGRPCServer final : public RPCServer {
std::map<std::string, std::vector<RequestBase*>> rpc_reqs_;
};
}; // namespace detail
}; // namespace distributed
}; // namespace operators
}; // namespace paddle
......@@ -23,7 +23,7 @@
#include <grpc++/impl/codegen/stub_options.h>
#include <grpc++/impl/codegen/sync_stream.h>
#include <grpc++/support/byte_buffer.h>
#include "paddle/fluid/operators/detail/variable_response.h"
#include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -42,24 +42,25 @@ class ServerContext;
// Support parsing/unparsing of tensorflow::VariableResponse.
// Wire-format is identical to RecvVariableResponse.
template <>
class SerializationTraits<paddle::operators::detail::VariableResponse> {
class SerializationTraits<paddle::operators::distributed::VariableResponse> {
public:
static Status Serialize(
const paddle::operators::detail::VariableResponse& msg,
const paddle::operators::distributed::VariableResponse& msg,
grpc_byte_buffer** bp, bool* own_buffer) {
PADDLE_ENFORCE(false, "SerializationTraits::Serialize not implemented!");
return Status();
}
static Status Deserialize(grpc_byte_buffer* buffer,
paddle::operators::detail::VariableResponse* msg,
int max_message_size = INT_MAX) {
static Status Deserialize(
grpc_byte_buffer* buffer,
paddle::operators::distributed::VariableResponse* msg,
int max_message_size = INT_MAX) {
if (buffer == nullptr) {
return Status(StatusCode::INTERNAL, "No payload");
}
Status result = g_core_codegen_interface->ok();
if (result.ok()) {
paddle::operators::detail::GrpcByteSource source(buffer);
paddle::operators::distributed::GrpcByteSource source(buffer);
int ret = msg->Parse(&source);
if (ret != 0) {
result = Status(StatusCode::INTERNAL, "VariableResponse parse error");
......@@ -73,7 +74,7 @@ class SerializationTraits<paddle::operators::detail::VariableResponse> {
namespace paddle {
namespace operators {
namespace detail {
namespace distributed {
enum class GrpcMethod {
kSendVariable,
......@@ -118,6 +119,6 @@ class GrpcService final {
};
};
} // namespace detail
} // namespace distributed
} // namespace operators
} // namespace paddle
......@@ -26,7 +26,7 @@ limitations under the License. */
namespace paddle {
namespace operators {
namespace detail {
namespace distributed {
char* EncodeVarint32(char* dst, uint32_t v) {
// Operate on characters as unsigneds
......@@ -144,6 +144,6 @@ class ProtoEncodeHelper {
char* limit_; // Just for CHECKs
};
} // namespace detail
} // namespace distributed
} // namespace operators
} // namespace paddle
......@@ -31,7 +31,7 @@
namespace paddle {
namespace operators {
namespace detail {
namespace distributed {
constexpr char kRequestSend[] = "RequestSend";
constexpr char kRequestGet[] = "RequestGet";
......@@ -124,6 +124,6 @@ class RequestHandler {
RPCServer* rpc_server_;
};
} // namespace detail
} // namespace distributed
} // namespace operators
} // namespace paddle
......@@ -20,12 +20,12 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/detail/request_handler_impl.h"
#include "paddle/fluid/operators/detail/rpc_server.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/distributed/rpc_server.h"
namespace paddle {
namespace operators {
namespace detail {
namespace distributed {
bool RequestSendHandler::Handle(const std::string& varname,
framework::Scope* scope,
......@@ -119,6 +119,6 @@ bool RequestPrefetchHandler::Handle(const std::string& varname,
return true;
}
} // namespace detail
} // namespace distributed
} // namespace operators
} // namespace paddle
......@@ -28,11 +28,11 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/detail/request_handler.h"
#include "paddle/fluid/operators/distributed/request_handler.h"
namespace paddle {
namespace operators {
namespace detail {
namespace distributed {
class RequestSendHandler final : public RequestHandler {
public:
......@@ -66,6 +66,6 @@ class RequestPrefetchHandler final : public RequestHandler {
const std::string& out_var_name = "") override;
};
} // namespace detail
} // namespace distributed
} // namespace operators
} // namespace paddle
......@@ -12,15 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/detail/rpc_client.h"
#include "paddle/fluid/operators/distributed/rpc_client.h"
namespace paddle {
namespace operators {
namespace detail {
namespace distributed {
std::once_flag RPCClient::init_flag_;
std::unique_ptr<RPCClient> RPCClient::rpc_client_(nullptr);
} // namespace detail
} // namespace distributed
} // namespace operators
} // namespace paddle
......@@ -22,7 +22,7 @@
namespace paddle {
namespace operators {
namespace detail {
namespace distributed {
class RPCClient {
public:
......@@ -84,6 +84,6 @@ class RPCClient {
static std::once_flag init_flag_;
static std::unique_ptr<RPCClient> rpc_client_;
};
} // namespace detail
} // namespace distributed
} // namespace operators
} // namespace paddle
......@@ -17,11 +17,11 @@
#include <limits>
#include <string>
#include "paddle/fluid/operators/detail/rpc_server.h"
#include "paddle/fluid/operators/distributed/rpc_server.h"
namespace paddle {
namespace operators {
namespace detail {
namespace distributed {
void RPCServer::ShutDown() {
LOG(INFO) << "RPCServer ShutDown ";
......@@ -112,6 +112,6 @@ void RPCServer::WaitCond(const std::string& rpc_name) {
lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); });
}
} // namespace detail
} // namespace distributed
} // namespace operators
} // namespace paddle
......@@ -19,11 +19,11 @@
#include <thread> // NOLINT
#include <utility>
#include <vector>
#include "paddle/fluid/operators/detail/request_handler.h"
#include "paddle/fluid/operators/distributed/request_handler.h"
namespace paddle {
namespace operators {
namespace detail {
namespace distributed {
class RPCServer {
public:
......@@ -86,6 +86,6 @@ class RPCServer {
friend class RequestHandler;
};
}; // namespace detail
}; // namespace distributed
}; // namespace operators
}; // namespace paddle
......@@ -22,18 +22,18 @@ limitations under the License. */
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/detail/request_handler_impl.h"
#include "paddle/fluid/operators/detail/rpc_client.h"
#include "paddle/fluid/operators/detail/rpc_server.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/operators/distributed/rpc_server.h"
namespace framework = paddle::framework;
namespace platform = paddle::platform;
namespace detail = paddle::operators::detail;
namespace distributed = paddle::operators::distributed;
USE_OP(lookup_table);
std::unique_ptr<detail::RPCServer> g_rpc_service;
std::unique_ptr<detail::RequestHandler> g_req_handler;
std::unique_ptr<distributed::RPCServer> g_rpc_service;
std::unique_ptr<distributed::RequestHandler> g_req_handler;
framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) {
auto root_block = program->MutableBlock(0);
......@@ -113,19 +113,21 @@ void StartServer() {
g_req_handler->SetScope(&scope);
g_req_handler->SetExecutor(&exe);
g_rpc_service->RegisterRPC(detail::kRequestPrefetch, g_req_handler.get());
g_rpc_service->RegisterRPC(distributed::kRequestPrefetch,
g_req_handler.get());
g_req_handler->SetRPCServer(g_rpc_service.get());
std::thread server_thread(
std::bind(&detail::RPCServer::StartServer, g_rpc_service.get()));
std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get()));
server_thread.join();
}
TEST(PREFETCH, CPU) {
g_req_handler.reset(new detail::RequestPrefetchHandler(true));
g_req_handler.reset(new distributed::RequestPrefetchHandler(true));
g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1));
detail::RPCClient* client = detail::RPCClient::GetInstance<RPCCLIENT_T>();
distributed::RPCClient* client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();
std::thread server_thread(StartServer);
g_rpc_service->WaitServerReady();
......
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#ifdef PADDLE_WITH_CUDA
#include <nccl.h>
......@@ -23,14 +23,14 @@ limitations under the License. */
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/detail/bytebuffer_stream.h"
#include "paddle/fluid/operators/detail/proto_encoder_helper.h"
#include "paddle/fluid/operators/detail/variable_response.h"
#include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
#include "paddle/fluid/operators/distributed/proto_encoder_helper.h"
#include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
namespace detail {
namespace distributed {
using VarMsg = sendrecv::VariableMessage;
......@@ -222,11 +222,11 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx,
const framework::Scope* scope,
framework::Variable** var) {
operators::detail::VariableResponse resp(scope, &ctx);
operators::distributed::VariableResponse resp(scope, &ctx);
PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
*var = resp.GetVar();
}
} // namespace detail
} // namespace distributed
} // namespace operators
} // namespace paddle
......@@ -25,12 +25,12 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h"
#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
namespace paddle {
namespace operators {
namespace detail {
namespace distributed {
typedef void (*DestroyCallback)(void*);
......@@ -61,6 +61,6 @@ inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) {
}
}
} // namespace detail
} // namespace distributed
} // namespace operators
} // namespace paddle
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/detail/variable_response.h"
#include "paddle/fluid/operators/distributed/variable_response.h"
#include <string>
#include <utility>
......@@ -22,12 +22,12 @@
#endif
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
namespace paddle {
namespace operators {
namespace detail {
namespace distributed {
enum WireType {
WIRETYPE_VARINT = 0,
......@@ -158,13 +158,13 @@ bool VariableResponse::CopySelectRowsTensorData(
slr->set_height(meta_.slr_height());
auto* tensor = slr->mutable_value();
tensor->Resize(dims);
PADDLE_ENFORCE_EQ(
static_cast<size_t>(tensor->numel()),
length / framework::SizeOfType(
paddle::operators::detail::ToTypeIndex(meta_.data_type())));
PADDLE_ENFORCE_EQ(static_cast<size_t>(tensor->numel()),
length / framework::SizeOfType(
paddle::operators::distributed::ToTypeIndex(
meta_.data_type())));
void* tensor_data = tensor->mutable_data(
ctx.GetPlace(),
paddle::operators::detail::ToTypeIndex(meta_.data_type()));
paddle::operators::distributed::ToTypeIndex(meta_.data_type()));
if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) {
return false;
......@@ -480,6 +480,6 @@ int VariableResponse::Parse(Source* source) {
return 0;
}
}; // namespace detail
}; // namespace distributed
}; // namespace operators
}; // namespace paddle
......@@ -22,17 +22,17 @@
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h"
#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/detail/bytebuffer_stream.h"
#include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
namespace paddle {
namespace operators {
namespace detail {
namespace distributed {
class VariableResponse {
public:
......@@ -99,6 +99,6 @@ class VariableResponse {
sendrecv::VariableMessage meta_;
};
}; // namespace detail
}; // namespace distributed
}; // namespace operators
}; // namespace paddle
......@@ -42,8 +42,8 @@ class FetchBarrierOp : public framework::OperatorBase {
// For profiling
platform::RecordEvent record_event(Type(), &ctx);
detail::RPCClient* rpc_client =
detail::RPCClient::GetInstance<RPCCLIENT_T>();
distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();
rpc_client->Wait();
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include "paddle/fluid/operators/mean_op.h"
namespace paddle {
namespace operators {
using framework::DataLayout;
template <typename T>
class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
float mean = context.Attr<float>("mean");
float std = context.Attr<float>("std");
auto* tensor = context.Output<framework::Tensor>("Out");
T* data = tensor->mutable_data<T>(context.GetPlace());
unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
std::minstd_rand engine;
if (seed == 0) {
seed = std::random_device()();
}
engine.seed(seed);
std::normal_distribution<T> dist(mean, std);
int64_t size = tensor->numel();
for (int64_t i = 0; i < size; ++i) {
data[i] = dist(engine);
}
// The format of output is set as the mkldnn's format
// TODO(@mozga-intel) The format of matrix sets inside the another layers.
tensor->set_layout(DataLayout::kMKLDNN);
tensor->set_format(mkldnn::memory::format::oihw);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(gaussian_random, MKLDNN, ::paddle::platform::CPUPlace,
ops::GaussianMKLDNNKernel<float>);
......@@ -15,6 +15,10 @@ limitations under the License. */
#include <random>
#include "paddle/fluid/framework/op_registry.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
......@@ -62,9 +66,20 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
framework::LibraryType library{framework::LibraryType::kPlain};
framework::DataLayout layout{framework::DataLayout::kAnyLayout};
#ifdef PADDLE_WITH_MKLDNN
if (library == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN;
}
#endif
return framework::OpKernelType(
static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
ctx.device_context());
ctx.device_context(), layout, library);
}
};
......@@ -95,7 +110,9 @@ class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
"(int, default 5(FP32)) "
"Output data type.")
.SetDefault(framework::proto::VarType::FP32);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC(
GaussianRandom Operator.
......
......@@ -22,7 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/detail/request_handler_impl.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/platform/nccl_helper.h"
namespace paddle {
......@@ -60,7 +60,8 @@ class GenNCCLIdOp : public framework::OperatorBase {
std::vector<std::string> endpoint_list =
Attr<std::vector<std::string>>("endpoint_list");
detail::RPCClient* client = detail::RPCClient::GetInstance<RPCCLIENT_T>();
distributed::RPCClient* client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();
for (auto& ep : endpoint_list) {
VLOG(3) << "sending nccl id to " << ep;
......@@ -80,11 +81,11 @@ class GenNCCLIdOp : public framework::OperatorBase {
// NOTE: Can not use unique_ptr here because the default
// deleter will call GRPC Server's base class's dtor and
// that will cause a wired crash.
detail::RequestSendHandler rpc_h(true);
std::unique_ptr<detail::RPCServer> rpc_service(
distributed::RequestSendHandler rpc_h(true);
std::unique_ptr<distributed::RPCServer> rpc_service(
new RPCSERVER_T(endpoint, 1));
rpc_service->RegisterRPC(detail::kRequestSend, &rpc_h);
rpc_service->RegisterRPC(distributed::kRequestSend, &rpc_h);
rpc_h.SetRPCServer(rpc_service.get());
framework::ProgramDesc empty_program;
......@@ -95,11 +96,11 @@ class GenNCCLIdOp : public framework::OperatorBase {
rpc_h.SetExecutor(&executor);
std::thread server_thread(
std::bind(&detail::RPCServer::StartServer, rpc_service.get()));
std::bind(&distributed::RPCServer::StartServer, rpc_service.get()));
rpc_service->SetCond(detail::kRequestSend);
rpc_service->SetCond(distributed::kRequestSend);
VLOG(3) << "start getting nccl id from trainer 0...";
rpc_service->WaitBarrier(detail::kRequestSend);
rpc_service->WaitBarrier(distributed::kRequestSend);
VLOG(3) << "got nccl id and stop server...";
rpc_service->ShutDown();
VLOG(3) << "rpc server stopped";
......
......@@ -21,14 +21,14 @@ limitations under the License. */
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/detail/request_handler_impl.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/listen_and_serv_op.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
void RunServer(std::shared_ptr<detail::RPCServer> service) {
void RunServer(std::shared_ptr<distributed::RPCServer> service) {
service->StartServer();
VLOG(4) << "RunServer thread end";
}
......@@ -121,12 +121,12 @@ void ListenAndServOp::RunSyncLoop(
while (true) {
// Get from multiple trainers, we don't care about the order in which
// the gradients arrives, just add suffix 0~n and merge the gradient.
rpc_service_->SetCond(detail::kRequestSend);
rpc_service_->WaitBarrier(detail::kRequestSend);
rpc_service_->SetCond(distributed::kRequestSend);
rpc_service_->WaitBarrier(distributed::kRequestSend);
if (rpc_service_->IsExit()) {
LOG(WARNING) << "get exit!rpc_processor break!";
rpc_service_->SetCond(detail::kRequestGet);
rpc_service_->SetCond(distributed::kRequestGet);
break;
}
......@@ -154,11 +154,11 @@ void ListenAndServOp::RunSyncLoop(
recv_scope);
VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)";
rpc_service_->SetCond(detail::kRequestGet);
rpc_service_->WaitBarrier(detail::kRequestGet);
rpc_service_->SetCond(distributed::kRequestGet);
rpc_service_->WaitBarrier(distributed::kRequestGet);
rpc_service_->ResetBarrierCounter();
// reset received sparse vars to avoid reuse it in the next mini-batch
dynamic_cast<detail::RequestSendHandler *>(request_send_handler_.get())
dynamic_cast<distributed::RequestSendHandler *>(request_send_handler_.get())
->ResetSparseVarRecorder();
} // while(true)
}
......@@ -215,13 +215,13 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
}
static void FillRequestCtx(
detail::RequestHandler *h, framework::Scope *scope,
distributed::RequestHandler *h, framework::Scope *scope,
platform::DeviceContext *dev_ctx, framework::Executor *executor,
framework::ProgramDesc *program,
std::unordered_map<std::string,
std::shared_ptr<framework::ExecutorPrepareContext>>
*prefetch_ctx,
detail::RPCServer *rpc_server) {
distributed::RPCServer *rpc_server) {
h->SetScope(scope);
h->SetDevCtx(dev_ctx);
h->SetExecutor(executor);
......@@ -249,14 +249,16 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in));
request_send_handler_.reset(new detail::RequestSendHandler(sync_mode));
request_get_handler_.reset(new detail::RequestGetHandler(sync_mode));
request_send_handler_.reset(new distributed::RequestSendHandler(sync_mode));
request_get_handler_.reset(new distributed::RequestGetHandler(sync_mode));
request_prefetch_handler_.reset(
new detail::RequestPrefetchHandler(sync_mode));
new distributed::RequestPrefetchHandler(sync_mode));
rpc_service_->RegisterRPC(detail::kRequestSend, request_send_handler_.get());
rpc_service_->RegisterRPC(detail::kRequestGet, request_get_handler_.get());
rpc_service_->RegisterRPC(detail::kRequestPrefetch,
rpc_service_->RegisterRPC(distributed::kRequestSend,
request_send_handler_.get());
rpc_service_->RegisterRPC(distributed::kRequestGet,
request_get_handler_.get());
rpc_service_->RegisterRPC(distributed::kRequestPrefetch,
request_prefetch_handler_.get());
auto *optimize_block = Attr<framework::BlockDesc *>(kOptimizeBlock);
......
......@@ -24,8 +24,8 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/request_handler.h"
#include "paddle/fluid/operators/detail/rpc_server.h"
#include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/operators/distributed/rpc_server.h"
namespace paddle {
namespace operators {
......@@ -33,7 +33,7 @@ namespace operators {
constexpr char kOptimizeBlock[] = "OptimizeBlock";
constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id";
void RunServer(std::shared_ptr<detail::RPCServer> service);
void RunServer(std::shared_ptr<distributed::RPCServer> service);
class ListenAndServOp : public framework::OperatorBase {
public:
......@@ -62,10 +62,11 @@ class ListenAndServOp : public framework::OperatorBase {
const platform::Place& dev_place) const override;
protected:
mutable std::shared_ptr<detail::RPCServer> rpc_service_;
mutable std::shared_ptr<detail::RequestHandler> request_send_handler_;
mutable std::shared_ptr<detail::RequestHandler> request_get_handler_;
mutable std::shared_ptr<detail::RequestHandler> request_prefetch_handler_;
mutable std::shared_ptr<distributed::RPCServer> rpc_service_;
mutable std::shared_ptr<distributed::RequestHandler> request_send_handler_;
mutable std::shared_ptr<distributed::RequestHandler> request_get_handler_;
mutable std::shared_ptr<distributed::RequestHandler>
request_prefetch_handler_;
mutable std::shared_ptr<std::thread> server_thread_;
};
......
......@@ -146,6 +146,6 @@ REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$");
REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU,
paddle::operators::LogicalNotFunctor);
REGISTER_BINARY_LOGICAL_OP(logical_xor,
"$$Out = (X || Y) \\, \\&\\& \\, !(X \\&\\& Y)$$");
"$$Out = (X || Y) \\&\\& !(X \\&\\& Y)$$");
REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU,
paddle::operators::LogicalXorFunctor);
......@@ -93,10 +93,10 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> {
auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
// computation
for (size_t k = 0; k < input_rows; ++k) {
for (int k = 0; k < input_rows; ++k) {
const T* src_ptr = input.data<T>() + k * input_cols;
int col_idx = 0;
for (int j = 0; j < num; ++j) {
for (size_t j = 0; j < num; ++j) {
int col_len = output_cols[j];
auto* out_tensor = outputs->at(j);
if (out_tensor != nullptr) {
......
......@@ -22,43 +22,24 @@ namespace paddle {
namespace operators {
namespace math {
template <typename T>
__device__ T upper_bound(const T* first, T count, T val) {
const T* orig = first;
const T* it = nullptr;
T step = 0;
while (count > 0) {
it = first;
step = count / 2;
it += step;
if (!(val < *it)) {
first = ++it;
count -= step + 1;
} else {
count = step;
}
}
return first - orig;
}
template <typename T>
__global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
const int output_rows, const int output_cols,
T* output) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
int segment = upper_bound<int>(input_cols, col_size, tid_x) - 1;
int curr_offset = input_cols[segment];
int curr_segment = segment;
int curr_segment = 0;
int curr_offset = input_cols[0];
for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
T curr_col_offset;
while ((curr_col_offset = input_cols[curr_segment + 1]) <= tid_x) {
int curr_col_offset = input_cols[curr_segment + 1];
while (curr_col_offset <= tid_x) {
curr_offset = curr_col_offset;
++curr_segment;
curr_col_offset = input_cols[curr_segment + 1];
}
int local_col = tid_x - curr_offset;
int segment_width = curr_col_offset - curr_offset;
T* input_ptr = inputs[curr_segment];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y)
......@@ -89,14 +70,14 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row,
const int in_col, const int* out_cols,
int out_cols_size, T** outputs_data) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
int segment = upper_bound<int>(out_cols, out_cols_size, tid_x) - 1;
int curr_offset = out_cols[segment];
int curr_segment = segment;
int curr_segment = 0;
int curr_offset = out_cols[0];
for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
T curr_col_offset;
while ((curr_col_offset = out_cols[curr_segment + 1]) <= tid_x) {
int curr_col_offset = out_cols[curr_segment + 1];
while (curr_col_offset <= tid_x) {
curr_offset = curr_col_offset;
++curr_segment;
curr_col_offset = out_cols[curr_segment + 1];
}
int local_col = tid_x - curr_offset;
......@@ -228,7 +209,7 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
outputs_cols[0] = 0;
for (int i = 0; i < o_num; ++i) {
int t_col = outputs->at(i)->numel() / out_row;
int t_col = ref_inputs.at(i)->numel() / out_row;
if (sameShape) {
if (t_col != out0_col) sameShape = false;
}
......
......@@ -30,6 +30,7 @@ template struct SetConstant<platform::CPUDeviceContext, double>;
template struct SetConstant<platform::CPUDeviceContext, int>;
template struct SetConstant<platform::CPUDeviceContext, int64_t>;
template struct SetConstant<platform::CPUDeviceContext, bool>;
template struct SetConstant<platform::CPUDeviceContext, uint8_t>;
#define DEFINE_CPU_TRANS(RANK) \
template struct Transpose<platform::CPUDeviceContext, platform::float16, \
......
......@@ -41,8 +41,8 @@ class PrefetchOp : public framework::OperatorBase {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
detail::RPCClient* rpc_client =
detail::RPCClient::GetInstance<RPCCLIENT_T>();
distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();
for (size_t i = 0; i < ins.size(); i++) {
if (NeedSend(scope, ins[i])) {
......
......@@ -43,8 +43,8 @@ class RecvOp : public framework::OperatorBase {
// For profiling
platform::RecordEvent record_event(Type(), &ctx);
detail::RPCClient* rpc_client =
detail::RPCClient::GetInstance<RPCCLIENT_T>();
distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();
for (size_t i = 0; i < outs.size(); i++) {
VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
......
......@@ -44,8 +44,8 @@ class SendBarrierOp : public framework::OperatorBase {
// For profiling
platform::RecordEvent record_event(Type(), &ctx);
detail::RPCClient* rpc_client =
detail::RPCClient::GetInstance<RPCCLIENT_T>();
distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();
VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode;
......
......@@ -45,8 +45,8 @@ class SendOp : public framework::OperatorBase {
// For profiling
platform::RecordEvent record_event(Type(), &ctx);
detail::RPCClient* rpc_client =
detail::RPCClient::GetInstance<RPCCLIENT_T>();
distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();
for (size_t i = 0; i < ins.size(); i++) {
if (NeedSend(scope, ins[i])) {
......
......@@ -14,11 +14,14 @@
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/operators/tensorrt_engine_op.h"
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/operators/tensorrt_engine_op.h"
namespace paddle {
namespace operators {
......
......@@ -16,10 +16,12 @@
#ifdef PADDLE_WITH_CUDA
#include <string>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
namespace paddle {
namespace operators {
......
......@@ -179,7 +179,6 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
const std::string& z_name, bool x_created,
const shape_t& x_shape, const shape_t& y_shape,
const shape_t& z_shape) {
LOG(INFO) << "create fc op";
auto* fc = block_desc.AppendOp();
fc->SetType("mul");
......
......@@ -21,7 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/detail/request_handler_impl.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/listen_and_serv_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
......@@ -37,11 +37,11 @@ USE_NO_KERNEL_OP(listen_and_serv);
namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;
namespace detail = paddle::operators::detail;
namespace distributed = paddle::operators::distributed;
namespace string = paddle::string;
std::unique_ptr<detail::RPCServer> g_rpc_service;
std::unique_ptr<detail::RequestHandler> g_req_handler;
std::unique_ptr<distributed::RPCServer> g_rpc_service;
std::unique_ptr<distributed::RequestHandler> g_req_handler;
void StartServer() {
f::Scope scope;
......@@ -57,14 +57,14 @@ void StartServer() {
g_req_handler->SetProgram(&empty_program);
g_req_handler->SetExecutor(&executor);
g_rpc_service->RegisterRPC(detail::kRequestSend, g_req_handler.get());
g_rpc_service->RegisterRPC(distributed::kRequestSend, g_req_handler.get());
g_req_handler->SetRPCServer(g_rpc_service.get());
std::thread server_thread(
std::bind(&detail::RPCServer::StartServer, g_rpc_service.get()));
std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get()));
g_rpc_service->SetCond(detail::kRequestSend);
g_rpc_service->WaitBarrier(detail::kRequestSend);
g_rpc_service->SetCond(distributed::kRequestSend);
g_rpc_service->WaitBarrier(distributed::kRequestSend);
LOG(INFO) << "got nccl id and stop server...";
g_rpc_service->ShutDown();
......@@ -72,7 +72,7 @@ void StartServer() {
}
TEST(SendNcclId, RPCServer) {
g_req_handler.reset(new detail::RequestSendHandler(true));
g_req_handler.reset(new distributed::RequestSendHandler(true));
g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1));
std::thread server_thread(StartServer);
......@@ -91,7 +91,8 @@ TEST(SendNcclId, RPCServer) {
std::string ep = string::Sprintf("127.0.0.1:%d", port);
detail::RPCClient* client = detail::RPCClient::GetInstance<RPCCLIENT_T>();
distributed::RPCClient* client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();
LOG(INFO) << "connect to server" << ep;
client->AsyncSendVar(ep, dev_ctx, scope, NCCL_ID_VARNAME);
......
......@@ -159,6 +159,11 @@ PYBIND11_PLUGIN(core) {
new (&instance) LoDTensor(new_offset_lod);
})
.def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); })
// We implement offset based LOD in C++ while we use length based with
// Python API. So we changed set_lod to set_recursive_sequence_lengths to
// avoid misuse.
// The discussion is here:
// https://github.com/PaddlePaddle/Paddle/issues/10855
.def("set_lod",
[](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) {
// the input lod is offset-based level-of-detail info
......@@ -199,6 +204,7 @@ PYBIND11_PLUGIN(core) {
std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
return new_lod;
})
// Set above comments of set_lod.
.def("recursive_sequence_lengths",
[](LoDTensor &self) -> std::vector<std::vector<size_t>> {
// output the length-based lod info
......
......@@ -97,7 +97,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
inline pybind11::buffer_info CastToPyBuffer(const framework::Tensor &tensor) {
auto buffer_info =
details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool,
platform::float16>()(tensor);
uint8_t, platform::float16>()(tensor);
return buffer_info;
}
......
......@@ -44,7 +44,7 @@ import metrics
import transpiler
from param_attr import ParamAttr, WeightNormParamAttr
from data_feeder import DataFeeder
from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace
from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope
from transpiler import DistributeTranspiler, InferenceTranspiler, \
memory_optimize, release_memory
from concurrency import (Go, make_channel, channel_send, channel_recv,
......@@ -83,6 +83,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \
'profiler',
'unique_name',
'recordio_writer',
'Scope',
]
......
......@@ -36,6 +36,25 @@ def _is_number_or_matrix_(var):
class WeightedAverage(object):
"""
Calculate weighted average.
The average calculating is accomplished via Python totally.
They do not change Paddle's Program, nor do anything to
modify NN model's configuration. They are completely
wrappers of Python functions.
Examples:
.. code-block:: python
avg = fluid.average.WeightedAverage()
avg.add(value=2.0, weight=1)
avg.add(value=4.0, weight=2)
avg.eval()
# The result is 3.333333333.
# For (2.0 * 1 + 4.0 * 2) / (1 + 2) = 3.333333333
"""
def __init__(self):
warnings.warn(
"The %s is deprecated, please use fluid.metrics.Accuracy instead." %
......
......@@ -147,7 +147,7 @@ def _addup_repetitive_outputs_(op_descs):
else:
if len(renamed_vars[var_name]) == 1:
new_name = var_name + "@RENAME@" + \
str(var_rename_count[var_name])
str(var_rename_count[var_name])
var_rename_count[var_name] += 1
# rename original var_name
renamed_vars[var_name][0] = new_name
......@@ -155,7 +155,7 @@ def _addup_repetitive_outputs_(op_descs):
_rename_arg_(pending_sum_ops, var_name, new_name)
new_name = var_name + "@RENAME@" + \
str(var_rename_count[var_name])
str(var_rename_count[var_name])
var_rename_count[var_name] += 1
op_desc.rename_output(var_name, new_name)
renamed_vars[var_name].append(new_name)
......@@ -434,18 +434,65 @@ def _get_stop_gradients_(program):
def append_backward(loss, parameter_list=None, no_grad_set=None,
callbacks=None):
"""
Append backward part to main_program
Append backward part to main_program.
Args:
loss(Variable): The variable generated by cost function.
parameter_list(list[string]): Parameters that need to be updated by
optimizer. If None, it means all parameters need to be updated.
no_grad_set(set): Variables that have no gradients in Block 0.
All variables with `step_gradient=True` from all blocks will be
automatically added.
A complete neural network training is made up of forward and backward
propagation. However, when we configure a network, we only need to
specify its forwrd part. The backward part is generated automatically
according to the forward part by this function.
Return:
(list[(Variable,Variable)]): list of (parameter, gradient) pair.
In most cases, users do not need to invoke this function manually. It
will be automatically invoked by the optimizer's `minimize` function.
Args:
loss(Variable): The loss variable of the network.
parameter_list(list[string]|None): Names of parameters that need
to be updated by optimizers.
If it is None, all parameters
will be updated.
Default: None
no_grad_set(set|None): Variables in the Block 0 whose gradients
should be ignored. All variables with
`step_gradient=True` from all blocks will
be automatically added into this set.
Default: None
callbacks(list[callable object]|None): The callbacks are used for
doing some custom jobs during
backward part building. All
callable objects in it will
be invoked once each time a
new gradient operator is added
into the program. The callable
object must has two input
parameters: 'block' and 'context'.
The 'block' is the block which
the new gradient operator will
be added to. The 'context' is a
map, whose keys are gradient
variable names and values are
corresponding original variables.
In addition to this, the 'context'
has another special key-value pair:
the key is string '__current_op_desc__'
and the value is the op_desc of the
gradient operator who has just
triggered the callable object.
Returns:
list[(Variable,Variable)]: Pairs of parameter and its
corresponding gradients. The key is the parameter and the
value is gradient variable.
Raises:
AssertionError: If `loss` is not an instance of Variable.
Examples:
.. code-block:: python
# network configuration code
# ...
avg_loss = fluid.layers.mean(loss)
param_grad_list = fluid.backward.append_backward(loss=avg_loss)
"""
assert isinstance(loss, framework.Variable)
......
......@@ -24,8 +24,6 @@ __all__ = [
'GradientClipByValue',
'GradientClipByNorm',
'GradientClipByGlobalNorm',
'append_gradient_clip_ops',
'error_clip_callback',
]
......@@ -38,6 +36,25 @@ class BaseErrorClipAttr(object):
class ErrorClipByValue(BaseErrorClipAttr):
"""
Clips tensor values to the range [min, max].
Given a tensor t, this operation clips its value to min and max inplace.
- Any values less than min are set to min.
- Any values greater than max are set to max.
Args:
max (float): The maximum value to clip by.
min (float, optional): The minimum value to clip by. if not set by user, \
will be set to -max by framework.
Examples:
.. code-block:: python
var = fluid.framework.Variable(..., error_clip=ErrorClipByValue(max=5.0), ...)
"""
def __init__(self, max, min=None):
max = float(max)
if min is None:
......@@ -99,6 +116,31 @@ class NullGradientClipAttr(BaseGradientClipAttr):
class GradientClipByValue(BaseGradientClipAttr):
"""
Clips gradient values to the range [min, max].
Given a tensor t, this operation clips its value to min and max inplace.
- Any values less than min are set to min.
- Any values greater than max are set to max.
Args:
max (float): The maximum value to clip by.
min (float, optional): The minimum value to clip by. if not set by user, \
will be set to -max by framework.
Examples:
.. code-block:: python
w_param_attrs = ParamAttr(name=None,
initializer=UniformInitializer(low=-1.0, high=1.0, seed=0),
learning_rate=1.0,
regularizer=L1Decay(1.0),
trainable=True,
clip=GradientClipByValue(-1.0, 1.0))
y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs)
"""
def __init__(self, max, min=None):
max = float(max)
if min is None:
......@@ -120,6 +162,37 @@ class GradientClipByValue(BaseGradientClipAttr):
class GradientClipByNorm(BaseGradientClipAttr):
"""
Clips tensor values to a maximum L2-norm.
This operator limits the L2 norm of the input :math:`X` within :math:`max\_norm`.
If the L2 norm of :math:`X` is less than or equal to :math:`max\_norm`, :math:`Out`
will be the same as :math:`X`. If the L2 norm of :math:`X` is greater than
:math:`max\_norm`, :math:`X` will be linearly scaled to make the L2 norm of
:math:`Out` equal to :math:`max\_norm`, as shown in the following formula:
.. math::
Out = \\frac{max\_norm * X}{norm(X)},
where :math:`norm(X)` represents the L2 norm of :math:`X`.
Args:
clip_norm (float): The maximum norm value
Examples:
.. code-block:: python
w_param_attrs = ParamAttr(name=None,
initializer=UniformInitializer(low=-1.0, high=1.0, seed=0),
learning_rate=1.0,
regularizer=L1Decay(1.0),
trainable=True,
clip=GradientClipByNorm(clip_norm=2.0))
y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs)
"""
def __init__(self, clip_norm):
self.clip_norm = clip_norm
......@@ -135,6 +208,44 @@ class GradientClipByNorm(BaseGradientClipAttr):
class GradientClipByGlobalNorm(BaseGradientClipAttr):
"""
Clips values of multiple tensors by the ratio of the sum of their norms.
Given a list of tensors t_list, and a clipping ratio clip_norm, this
operation returns a list of clipped tensors list_clipped and the global
norm (global_norm) of all tensors in t_list.
To perform the clipping, the values :math:`t\_list[i]` are set to:
.. math::
t\_list[i] = t\_list[i] * \\frac{clip\_norm}{\max(global\_norm, clip\_norm)}
where:
.. math::
global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}
If :math:`clip\_norm > global\_norm` then the entries in t_list remain as they are,
otherwise they're all shrunk by the global ratio.
Args:
clip_norm (float): The maximum norm value
group_name (str, optional): The group name for this clip.
Examples:
.. code-block:: python
p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip)
with fluid.program_guard(main_program=prog_clip):
fluid.clip.set_gradient_clip(
fluid.clip.GradientClipByGlobalNorm(clip_norm=2.0))
p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip)
"""
def __init__(self, clip_norm, group_name="default_group"):
if not isinstance(group_name, basestring):
raise TypeError("'group_name' must be a basestring.")
......@@ -183,15 +294,16 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
def set_gradient_clip(clip, param_list=None, program=None):
"""
To specify parameters that require gradient clip.
Args:
clip(BaseGradientClipAttr): An instance of some derived class of BaseGradientClipAttr,
which describes the type and detailed attributes of required gradient clip.
param_list(list, None by default): Parameters that require gradient clip.
It can be a list of parameter or a list of parameter's name.
When it's None, all parameters in the program will be included.
program(Program, None by default): The program where parameters are.
Will be the default main program when assigned with None.
To specify parameters that require gradient clip.
Args:
clip(BaseGradientClipAttr): An instance of some derived class of BaseGradientClipAttr,
which describes the type and detailed attributes of required gradient clip.
param_list(list(Variable)): Parameters that require gradient clip.
It can be a list of parameter or a list of parameter's name.
When it's None, all parameters in the program will be included.
program(Program): The program where parameters are.
Will be the default main program when assigned with None.
"""
if not isinstance(clip, BaseGradientClipAttr):
raise TypeError(
......
......@@ -29,6 +29,13 @@ class DataToLoDTensorConverter(object):
self.place = place
self.lod_level = lod_level
self.shape = shape
negtive_count = 0
for s in self.shape:
if s < 0:
negtive_count += 1
if negtive_count > 1:
self.shape = None
break
if dtype == core.VarDesc.VarType.FP32:
self.dtype = 'float32'
elif dtype == core.VarDesc.VarType.INT64:
......@@ -61,7 +68,9 @@ class DataToLoDTensorConverter(object):
self._feed_impl_(each_data, lod[1:], lod_level - 1)
def done(self):
arr = numpy.array(self.data, dtype=self.dtype).reshape(self.shape)
arr = numpy.array(self.data, dtype=self.dtype)
if self.shape:
arr = arr.reshape(self.shape)
t = core.LoDTensor()
t.set(arr, self.place)
if self.lod_level > 0:
......@@ -70,6 +79,61 @@ class DataToLoDTensorConverter(object):
class DataFeeder(object):
"""
DataFeeder converts the data that returned by a reader into a data
structure that can feed into Executor and ParallelExecutor. The reader
usually returns a list of mini-batch data entries. Each data entry in
the list is one sample. Each sample is a list or a tuple with one
feature or multiple features.
The simple usage shows below:
.. code-block:: python
place = fluid.CPUPlace()
img = fluid.layers.data(name='image', shape=[1, 28, 28])
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
feeder = fluid.DataFeeder([img, label], fluid.CPUPlace())
result = feeder.feed([([0] * 784, [9]), ([1] * 784, [1])])
If you want to feed data into GPU side separately in advance when you
use multi-GPU to train a model, you can use `decorate_reader` function.
.. code-block:: python
place=fluid.CUDAPlace(0)
feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
reader = feeder.decorate_reader(
paddle.batch(flowers.train(), batch_size=16))
Args:
feed_list(list): The Variables or Variables'name that will
feed into model.
place(Place): place indicates feed data into CPU or GPU, if you want to
feed data into GPU, please using `fluid.CUDAPlace(i)` (`i` represents
the GPU id), or if you want to feed data into CPU, please using
`fluid.CPUPlace()`.
program(Program): The Program that will feed data into, if program
is None, it will use default_main_program(). Default None.
Raises:
ValueError: If some Variable is not in this Program.
Examples:
.. code-block:: python
# ...
place = fluid.CPUPlace()
feed_list = [
main_program.global_block().var(var_name) for var_name in feed_vars_name
] # feed_vars_name is a list of variables' name.
feeder = fluid.DataFeeder(feed_list, place)
for data in reader():
outs = exe.run(program=main_program,
feed=feeder.feed(data))
"""
def __init__(self, feed_list, place, program=None):
self.feed_dtypes = []
self.feed_names = []
......@@ -99,6 +163,16 @@ class DataFeeder(object):
self.place = place
def feed(self, iterable):
"""
According to feed_list and iterable, converters the input into
a data structure that can feed into Executor and ParallelExecutor.
Args:
iterable(list|tuple): the input data.
Returns:
dict: the result of conversion.
"""
converter = []
for lod_level, shape, dtype in six.zip(
self.feed_lod_level, self.feed_shapes, self.feed_dtypes):
......@@ -121,6 +195,20 @@ class DataFeeder(object):
return ret_dict
def feed_parallel(self, iterable, num_places=None):
"""
Takes multiple mini-batches. Each mini-batch will be feed on each
device in advance.
Args:
iterable(list|tuple): the input data.
num_places(int): the number of devices. Default None.
Returns:
dict: the result of conversion.
Notes:
The number of devices and number of mini-batches must be same.
"""
if isinstance(self.place, core.CUDAPlace):
places = [
core.CUDAPlace(i)
......@@ -159,6 +247,24 @@ class DataFeeder(object):
multi_devices,
num_places=None,
drop_last=True):
"""
Converter the input data into a data that returned by reader into
multiple mini-batches. Each mini-batch will be feed on each device.
Args:
reader(fun): the input data.
multi_devices(bool): the number of places. Default None.
num_places(int): the number of places. Default None.
drop_last(bool): the number of places. Default None.
Returns:
dict: the result of conversion.
Raises:
ValueError: If drop_last is False and the data batch which cannot
fit for devices.
"""
def __reader_creator__():
if not multi_devices:
for item in reader():
......
......@@ -41,7 +41,12 @@ def _clone_var_(block, var):
class Evaluator(object):
"""
Base Class for all evaluators
Warning: better to use the fluid.metrics.* things, more
flexible support via pure Python and Operator, and decoupled
with executor. Short doc are intended to urge new user
start from Metrics.
Base Class for all evaluators.
Args:
name(str): The name of evaluator. such as, "accuracy". Used for generate
......@@ -69,6 +74,10 @@ class Evaluator(object):
def reset(self, executor, reset_program=None):
"""
reset metric states at the begin of each pass/user specified batch
Args:
executor(Executor|ParallelExecutor): a executor for executing the reset_program
reset_program(Program): a single Program for reset process
"""
if reset_program is None:
reset_program = Program()
......@@ -85,15 +94,16 @@ class Evaluator(object):
def eval(self, executor, eval_program=None):
"""
Evaluate the statistics merged by multiple mini-batches.
Args:
executor(Executor|ParallelExecutor): a executor for executing the eval_program
eval_program(Program): a single Program for eval process
"""
raise NotImplementedError()
def create_state(self, suffix, dtype, shape):
def _create_state(self, suffix, dtype, shape):
"""
Create state variable.
NOTE: It is not a public API.
Args:
suffix(str): the state suffix.
dtype(str|core.VarDesc.VarType): the state data type
......@@ -113,9 +123,35 @@ class Evaluator(object):
class ChunkEvaluator(Evaluator):
"""
Warning: This would be deprecated in the future. Please use fluid.metrics.ChunkEvaluator
instead.
Accumulate counter numbers output by chunk_eval from mini-batches and
compute the precision recall and F1-score using the accumulated counter
numbers.
For some basics of chunking, please refer to
'Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>'.
Args:
input (Variable): prediction output of the network.
label (Variable): label of the test data set.
chunk_scheme (str): can be IOB/IOE/IOBES and IO. See the chunk_eval op for details.
num_chunk_types (int): the number of chunk type.
excluded_chunk_types (list): A list including chunk type ids, indicating chunk types that are not counted.
Returns:
tuple: tuple containing: precision, recall, f1_score
Examples:
.. code-block:: python
exe = fluid.executor(place)
evaluator = fluid.Evaluator.ChunkEvaluator(input, label)
for epoch in PASS_NUM:
evaluator.reset(exe)
for data in batches:
loss = exe.run(fetch_list=[cost])
distance, instance_error = distance_evaluator.eval(exe)
"""
def __init__(
......@@ -130,11 +166,11 @@ class ChunkEvaluator(Evaluator):
if main_program.current_block().idx != 0:
raise ValueError("You can only invoke Evaluator in root block")
self.num_infer_chunks = self.create_state(
self.num_infer_chunks = self._create_state(
dtype='int64', shape=[1], suffix='num_infer_chunks')
self.num_label_chunks = self.create_state(
self.num_label_chunks = self._create_state(
dtype='int64', shape=[1], suffix='num_label_chunks')
self.num_correct_chunks = self.create_state(
self.num_correct_chunks = self._create_state(
dtype='int64', shape=[1], suffix='num_correct_chunks')
precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks = layers.chunk_eval(
input=input,
......@@ -178,6 +214,8 @@ class ChunkEvaluator(Evaluator):
class EditDistance(Evaluator):
"""
Warning: This would be deprecated in the future. Please use fluid.metrics.EditDistance
instead.
Accumulate edit distance sum and sequence number from mini-batches and
compute the average edit_distance and instance error of all batches.
......@@ -188,15 +226,16 @@ class EditDistance(Evaluator):
ignored_tokens(list of int): Tokens that should be removed before
calculating edit distance.
Example:
Examples:
.. code-block:: python
exe = fluid.executor(place)
distance_evaluator = fluid.Evaluator.EditDistance(input, label)
for epoch in PASS_NUM:
distance_evaluator.reset(exe)
for data in batches:
loss = exe.run(fetch_list=[cost])
distance, instance_error = distance_evaluator.eval(exe)
exe = fluid.executor(place)
distance_evaluator = fluid.Evaluator.EditDistance(input, label)
for epoch in PASS_NUM:
distance_evaluator.reset(exe)
for data in batches:
loss = exe.run(fetch_list=[cost])
distance, instance_error = distance_evaluator.eval(exe)
In the above example:
'distance' is the average of the edit distance in a pass.
......@@ -210,11 +249,11 @@ class EditDistance(Evaluator):
if main_program.current_block().idx != 0:
raise ValueError("You can only invoke Evaluator in root block")
self.total_distance = self.create_state(
self.total_distance = self._create_state(
dtype='float32', shape=[1], suffix='total_distance')
self.seq_num = self.create_state(
self.seq_num = self._create_state(
dtype='int64', shape=[1], suffix='seq_num')
self.instance_error = self.create_state(
self.instance_error = self._create_state(
dtype='int64', shape=[1], suffix='instance_error')
distances, seq_num = layers.edit_distance(
input=input, label=label, ignored_tokens=ignored_tokens)
......@@ -256,9 +295,10 @@ class EditDistance(Evaluator):
class DetectionMAP(Evaluator):
"""
Warning: This would be deprecated in the future. Please use fluid.metrics.DetectionMAP
instead.
Calculate the detection mean average precision (mAP).
TODO (Dang Qingqing): update the following doc.
The general steps are as follows:
1. calculate the true positive and false positive according to the input
of detection and labels.
......@@ -293,17 +333,18 @@ class DetectionMAP(Evaluator):
- 11point: the 11-point interpolated average precision.
- integral: the natural integral of the precision-recall curve.
Example:
Examples:
.. code-block:: python
exe = fluid.executor(place)
map_evaluator = fluid.Evaluator.DetectionMAP(input,
gt_label, gt_box, gt_difficult)
cur_map, accum_map = map_evaluator.get_map_var()
fetch = [cost, cur_map, accum_map]
for epoch in PASS_NUM:
map_evaluator.reset(exe)
for data in batches:
loss, cur_map_v, accum_map_v = exe.run(fetch_list=fetch)
exe = fluid.executor(place)
map_evaluator = fluid.Evaluator.DetectionMAP(input,
gt_label, gt_box, gt_difficult)
cur_map, accum_map = map_evaluator.get_map_var()
fetch = [cost, cur_map, accum_map]
for epoch in PASS_NUM:
map_evaluator.reset(exe)
for data in batches:
loss, cur_map_v, accum_map_v = exe.run(fetch_list=fetch)
In the above example:
......@@ -340,9 +381,10 @@ class DetectionMAP(Evaluator):
evaluate_difficult=evaluate_difficult,
ap_version=ap_version)
self.create_state(dtype='int32', shape=None, suffix='accum_pos_count')
self.create_state(dtype='float32', shape=None, suffix='accum_true_pos')
self.create_state(dtype='float32', shape=None, suffix='accum_false_pos')
self._create_state(dtype='int32', shape=None, suffix='accum_pos_count')
self._create_state(dtype='float32', shape=None, suffix='accum_true_pos')
self._create_state(
dtype='float32', shape=None, suffix='accum_false_pos')
self.has_state = None
var = self.helper.create_variable(
......
......@@ -18,17 +18,24 @@ from framework import Program, default_main_program, Variable
from . import core
__all__ = [
'Executor', 'global_scope', 'scope_guard', 'switch_scope', 'fetch_var'
'Executor', 'global_scope', 'scope_guard', '_switch_scope', 'fetch_var'
]
g_scope = core.Scope()
def global_scope():
"""
Get the global/default scope instance. There are a lot of APIs use
:code:`global_scope` as its default value, e.g., :code:`Executor.run`
Returns:
Scope: The global/default scope instance.
"""
return g_scope
def switch_scope(scope):
def _switch_scope(scope):
global g_scope
ex = g_scope
g_scope = scope
......@@ -37,12 +44,40 @@ def switch_scope(scope):
@contextlib.contextmanager
def scope_guard(scope):
ex = switch_scope(scope)
"""
Change the global/default scope instance by Python `with` statement. All
variable in runtime will assigned to the new scope.
Examples:
>>> import paddle.fluid as fluid
>>> new_scope = fluid.Scope()
>>> with fluid.scope_guard(new_scope):
>>> ...
Args:
scope: The new global/default scope.
"""
ex = _switch_scope(scope)
yield
switch_scope(ex)
_switch_scope(ex)
def as_numpy(tensor):
"""
Convert a Tensor to a numpy.ndarray, its only support Tensor without LoD information.
For higher dimensional sequence data, please use LoDTensor directly.
Examples:
>>> import paddle.fluid as fluid
>>> outs = executor.run(...)
>>> np_outs = map(lambda x: as_numpy(x), outs)
>>> ...
Args:
tensor(Variable): a instance of Tensor
Returns:
numpy.ndarray
"""
if isinstance(tensor, list):
return [as_numpy(t) for t in tensor]
assert isinstance(tensor, core.LoDTensor)
......@@ -135,14 +170,18 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name):
def fetch_var(name, scope=None, return_numpy=True):
"""
Fetch the value of the variable with the given name from the given scope
Fetch the value of the variable with the given name from the
given scope.
Args:
name(str): name of the variable. Typically, only persistable variables
can be found in the scope used for running the program.
scope(core.Scope|None): scope object. It should be the scope where
you pass to Executor.run() when running your program.
If None, global_scope() will be used.
return_numpy(bool): whether convert the tensor to numpy.ndarray
If None, global_scope() will be used. Default None.
return_numpy(bool): whether convert the tensor to numpy.ndarray.
Default True.
Returns:
LodTensor|numpy.ndarray
"""
......@@ -162,7 +201,7 @@ def fetch_var(name, scope=None, return_numpy=True):
return tensor
def get_program_cache_key(feed, fetch_list):
def _get_program_cache_key(feed, fetch_list):
feed_var_names = feed.keys()
def to_name_str(var):
......@@ -181,6 +220,25 @@ def get_program_cache_key(feed, fetch_list):
class Executor(object):
"""
An Executor in Python, only support the single-GPU running. For multi-cards, please refer to
ParallelExecutor.
Python executor takes a program, add feed operators and fetch operators to this program according
to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
the variables(or names) that user want to get after program run. Note: the executor will run all
operators in the program but not only the operators dependent by the fetch_list.
It store the global variables into the global scope, and create a local scope for the temporary
variables. The local scope contents will be discarded after every minibatch forward/backward finished.
But the global scope variables will be persistent through different runs.
All of ops in program will be running in sequence.
Args:
place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device
Note: For debugging complicated network in parallel-GPUs, you can test it on the executor.
They has the exactly same arguments, and expected the same results.
"""
def __init__(self, place):
self.place = place
p = core.Place()
......@@ -189,6 +247,23 @@ class Executor(object):
self.program_caches = dict()
def as_lodtensor(self, data):
"""
Convert numpy.ndarray to Tensor, its only support Tensor without LoD information.
For higher dimensional sequence data, please use LoDTensor directly.
Examples:
>>> import paddle.fluid as fluid
>>> exe = fluid.executor(fluid.CPUPlace())
>>> data = np.array(size=(100, 200, 300))
>>> np_outs = map(lambda x: exe.as_lodtensor(x), data)
>>> ...
Args:
data(numpy.ndarray): a instance of array
Returns:
LoDTensor
"""
if isinstance(data, list):
raise RuntimeError("Some of your feed data hold LoD information. \
They can not be completely cast from a list of Python \
......@@ -280,23 +355,47 @@ class Executor(object):
scope=None,
return_numpy=True,
use_program_cache=False):
""" Run program by this Executor. Feed data by feed map, fetch result by fetch_list.
"""
Run program by this Executor. Feed data by feed map, fetch result by fetch_list.
Python executor takes a program, add feed operators and fetch operators to this program according
to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
the variables(or names) that user want to get after program run. Note: the executor will run all
the variables(or names) that user want to get after program run.
Note: the executor will run all
operators in the program but not only the operators dependent by the fetch_list
:param program: the program that need to run, if not provied, then default_main_program will be used.
:param feed: feed variable map, e.g. {"image": ImageData, "label": LableData}
:param fetch_list: a list of variable or variable names that user want to get, run will return them according
to this list.
:param feed_var_name: the name for the input variable of feed Operator.
:param fetch_var_name: the name for the output variable of feed Operator.
:param scope: the scope used to run this program, you can switch it to different scope. default is global_scope
:param return_numpy: if convert the fetched tensor to numpy
:param use_program_cache: set use_program_cache to true if program not changed compare to the last step.
:return: result according to fetch_list.
Args:
program(Program): the program that need to run, if not provied, then default_main_program will be used.
feed(dict): feed variable map, e.g. {"image": ImageData, "label": LableData}
fetch_list(list): a list of variable or variable names that user want to get, run will return them according to this list.
feed_var_name(str): the name for the input variable of feed Operator.
fetch_var_name(str): the name for the output variable of fetch Operator.
scope(Scope): the scope used to run this program, you can switch it to different scope. default is global_scope
return_numpy(bool): if convert the fetched tensor to numpy
use_program_cache(bool): set use_program_cache to true if program not changed compare to the last step.
Returns:
list(numpy.array): fetch result according to fetch_list.
Examples:
>>> data = layers.data(name='X', shape=[1], dtype='float32')
>>> hidden = layers.fc(input=data, size=10)
>>> layers.assign(hidden, out)
>>> loss = layers.mean(out)
>>> adam = fluid.optimizer.Adam()
>>> adam.minimize(loss)
>>> cpu = core.CPUPlace()
>>> exe = Executor(cpu)
>>> exe.run(default_startup_program())
>>> x = numpy.random.random(size=(10, 1)).astype('float32')
>>> outs = exe.run(
>>> feed={'X': x},
>>> fetch_list=[loss.name])
"""
if feed is None:
feed = {}
......@@ -317,7 +416,7 @@ class Executor(object):
if scope is None:
scope = global_scope()
cache_key = get_program_cache_key(feed, fetch_list)
cache_key = _get_program_cache_key(feed, fetch_list)
if use_program_cache:
cached_program = self._get_program_cache(cache_key)
if cached_program is None:
......
此差异已折叠。
......@@ -27,13 +27,30 @@ __all__ = ['Inferencer', ]
class Inferencer(object):
"""
Inferencer High Level API.
Args:
infer_func (Python func): Infer function that will return predict Variable
param_path (str): The path where the inference model is saved by fluid.io.save_params
place (Place): place to do the inference
parallel (bool): use parallel_executor to run the inference, it will use multi CPU/GPU.
Examples:
.. code-block:: python
def inference_program():
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
return y_predict
place = fluid.CPUPlace()
inferencer = fluid.Inferencer(
infer_func=inference_program, param_path="/tmp/model", place=place)
"""
def __init__(self, infer_func, param_path, place=None, parallel=False):
"""
:param infer_func: a function that will return predict Variable
:param param_path: the path where the inference model is saved by fluid.io.save_params
:param place: place to do the inference
:param parallel: use parallel_executor to run the inference, it will use multi CPU/GPU.
"""
self.param_path = param_path
self.scope = core.Scope()
self.parallel = parallel
......@@ -60,9 +77,20 @@ class Inferencer(object):
def infer(self, inputs, return_numpy=True):
"""
:param inputs: a map of {"input_name": input_var} that will be feed into the inference program
to get the predict value
:return: the predict value of the inference model
Do Inference for Inputs
Args:
inputs (map): a map of {"input_name": input_var} that will be feed into the inference program
return_numpy (bool): transform return value into numpy or not
Returns:
Tensor or Numpy: the predict value of the inference model for the inputs
Examples:
.. code-block:: python
tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32")
results = inferencer.infer({'x': tensor_x})
"""
if not isinstance(inputs, dict):
raise ValueError(
......
此差异已折叠。
此差异已折叠。
......@@ -28,8 +28,8 @@ import math_op_patch
from math_op_patch import *
import detection
from detection import *
import metric
from metric import *
import metric_op
from metric_op import *
from learning_rate_scheduler import *
__all__ = []
......@@ -41,5 +41,5 @@ __all__ += control_flow.__all__
__all__ += ops.__all__
__all__ += device.__all__
__all__ += detection.__all__
__all__ += metric.__all__
__all__ += metric_op.__all__
__all__ += learning_rate_scheduler.__all__
......@@ -185,12 +185,14 @@ def Print(input,
Returns:
Variable: Output tensor, same data with input tensor.
Examples:
.. code-block:: python
value = some_layer(...)
Print(value, summarize=10,
message="The content of some_layer: ")
value = some_layer(...)
Print(value, summarize=10,
message="The content of some_layer: ")
'''
helper = LayerHelper('print', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype())
......@@ -1201,6 +1203,31 @@ class ConditionalBlockGuard(BlockGuard):
class ConditionalBlock(object):
'''
**ConditionalBlock**
ConditionalBlock is an operator that bind a block to a specific condition,
if the condition matches, the corresponding block will be executed.
Args:
inputs (Variable): bool conditions.
is_scalar_condition (bool): whether the branch is controled by a scalar.
name(str): name of this ConditionalBlock.
Examples:
.. code-block:: python
cond = layers.less_than(x=label, y=limit)
true_image, false_image = layers.split_lod_tensor(
input=image, mask=cond)
true_cond = layers.ConditionalBlock([true_image])
with true_cond.block():
...
with false_cond.block():
...
'''
def __init__(self, inputs, is_scalar_condition=False, name=None):
for each_input in inputs:
if not isinstance(each_input, Variable):
......
......@@ -16,7 +16,7 @@ All layers just related to the detection neural network.
"""
from layer_function_generator import generate_layer_fn
from layer_function_generator import autodoc
from layer_function_generator import autodoc, templatedoc
from ..layer_helper import LayerHelper
import tensor
import nn
......@@ -155,7 +155,7 @@ def detection_output(loc,
return nmsed_outs
@autodoc()
@templatedoc()
def detection_map(detect_res,
label,
class_num,
......@@ -166,6 +166,47 @@ def detection_map(detect_res,
input_states=None,
out_states=None,
ap_version='integral'):
"""
${comment}
Args:
detect_res: ${detect_res_comment}
label: ${label_comment}
class_num: ${class_num_comment}
background_label: ${background_label_comment}
overlap_threshold: ${overlap_threshold_comment}
evaluate_difficult: ${evaluate_difficult_comment}
has_state: ${has_state_comment}
input_states: If not None, It contains 3 elements:
1. pos_count ${pos_count_comment}.
2. true_pos ${true_pos_comment}.
3. false_pos ${false_pos_comment}.
out_states: If not None, it contains 3 elements.
1. accum_pos_count ${accum_pos_count_comment}.
2. accum_true_pos ${accum_true_pos_comment}.
3. accum_false_pos ${accum_false_pos_comment}.
ap_version: ${ap_type_comment}
Returns:
${map_comment}
Examples:
.. code-block:: python
detect_res = fluid.layers.data(
name='detect_res',
shape=[10, 6],
append_batch_size=False,
dtype='float32')
label = fluid.layers.data(
name='label',
shape=[10, 6],
append_batch_size=False,
dtype='float32')
map_out = fluid.layers.detection_map(detect_res, label, 21)
"""
helper = LayerHelper("detection_map", **locals())
def __create_var(type):
......
......@@ -126,7 +126,7 @@ def auc(input, label, curve='ROC', num_thresholds=200):
topk_out, topk_indices = nn.topk(input, k=k)
auc_out = helper.create_tmp_variable(dtype="float32")
helper.append_op(
type="accuracy",
type="auc",
inputs={
"Out": [topk_out],
"Indices": [topk_indices],
......
......@@ -94,6 +94,7 @@ __all__ = [
'mean_iou',
'relu',
'log',
'crop',
]
......@@ -2675,18 +2676,35 @@ def sequence_expand(x, y, ref_level=-1, name=None):
def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0):
'''
**beam search**
This function implements the beam search algorithm.
Beam search is a classical algorithm for selecting candidate words
in a machine translation task.
Refer to `Beam search <https://en.wikipedia.org/wiki/Beam_search>`_
for more details.
Args:
pre_ids (Variable): ${pre_ids_comment}
ids (Variable): ${ids_comment}
scores (Variable): ${scores_comment}
beam_size (int): ${beam_size_comment}
end_id (int): ${end_id_comment}
level (int): ${level_comment}
pre_ids (Variable): ids in previous step.
ids (Variable): a LoDTensor of shape of [None,k]
scores (Variable): a LoDTensor that has the same shape and LoD with `ids`
beam_size (int): beam size for beam search
end_id (int): the token id which indicates the end of a sequence
level (int): the level of LoDTensor
Returns:
tuple: a tuple of beam_search output variables: selected_ids, selected_scores
tuple: a tuple of beam_search output variables: `selected_ids`, `selected_scores`
Examples:
.. code-block:: python
# current_score is a Tensor of shape (num_batch_size, embed_size), which
# consists score of each candidate word.
topk_scores, topk_indices = pd.topk(current_score, k=50)
selected_ids, selected_scores = pd.beam_search(
pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0)
'''
helper = LayerHelper('beam_search', **locals())
score_type = scores.dtype
......@@ -4996,3 +5014,101 @@ def mean_iou(input, label, num_classes):
},
attrs={"num_classes": num_classes})
return out_mean_iou, out_wrong, out_correct
def crop(x, shape=None, offsets=None, name=None):
"""
Crop input into output, as specified by offsets and shape.
.. code-block:: text
* Case 1:
Given
X = [[0, 1, 2, 0, 0]
[0, 3, 4, 0, 0]
[0, 0, 0, 0, 0]],
and
shape = [2, 2],
offsets = [0, 1],
output is:
Out = [[1, 2],
[3, 4]].
* Case 2:
Given
X = [[0, 1, 2, 5, 0]
[0, 3, 4, 6, 0]
[0, 0, 0, 0, 0]],
and shape is tensor
shape = [[0, 0, 0]
[0, 0, 0]]
and
offsets = [0, 1],
output is:
Out = [[1, 2, 5],
[3, 4, 6]].
Args:
x (Variable): The input tensor variable.
shape (Variable|list/tuple of integer): The output shape is specified
by `shape`, which can a Variable or a list/tupe of integer.
If a tensor Variable, it's rank must be the same as `x`. This way
is suitable for the case that the output shape may be changed each
iteration. If a list/tupe of integer, it's length must be the same
as the rank of `x`
offsets (Variable|list/tuple of integer|None): Specifies the copping
offsets at each dimension. It can be a Variable or or a list/tupe
of integer. If a tensor Variable, it's rank must be the same as `x`.
This way is suitable for the case that the offsets may be changed
each iteration. If a list/tupe of integer, it's length must be the
same as the rank of `x`. If None, the offsets are 0 at each
dimension.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: The cropped tensor variable.
Raises:
ValueError: If shape is not a list, tuple or Variable.
Examples:
.. code-block:: python
x = fluid.layers.data(name="x", shape=[3, 5], dtype="float32")
y = fluid.layers.data(name="y", shape=[2, 3], dtype="float32")
crop = fluid.layers.crop(x, shape=y)
# or
z = fluid.layers.data(name="z", shape=[3, 5], dtype="float32")
crop = fluid.layers.crop(z, shape=[2, 3])
"""
helper = LayerHelper('crop', **locals())
if not (isinstance(shape, list) or isinstance(shape, tuple) or \
isinstance(shape, Variable)):
raise ValueError("The shape should be a list, tuple or Variable.")
if offsets is None:
offsets = [0] * len(x.shape)
out = helper.create_tmp_variable(x.dtype)
ipts = {'X': x}
attrs = {}
if isinstance(shape, Variable):
ipts['Y'] = shape
else:
attrs['shape'] = shape
if isinstance(offsets, Variable):
ipts['Offsets'] = offsets
else:
attrs['offsets'] = offsets
helper.append_op(
type='crop',
inputs=ipts,
outputs={'Out': out},
attrs=None if len(attrs) == 0 else attrs)
return out
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册