Commit a459764d authored by chengduoZH

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_fetchop

@@ -80,6 +80,8 @@ parser.add_argument(
     type=str,
     default="",
     help="Comma-separated list of hostname:port pairs")
+parser.add_argument(
+    "--profile", action='store_true', help="If set, profile a few steps.")
 # Flags for defining the tf.train.Server
 parser.add_argument(
@@ -183,8 +185,8 @@ def main():
         start_time = time.time()
         num_samples = 0
         train_pass_acc.reset()
-        for batch_id, data in enumerate(train_reader()):
-            ts = time.time()
+
+        def run_step(batch_id, data):
             img_data = np.array(
                 map(lambda x: x[0].reshape(data_shape), data)).astype(
                     "float32")
@@ -196,14 +198,28 @@ def main():
                 feed={"pixel": img_data,
                       "label": y_data},
                 fetch_list=[avg_cost, batch_acc, batch_size])
+            return loss, acc, b_size
+
+        if args.profile and args.task_index == 0:
+            # warmup.
+            for batch_id, data in enumerate(train_reader()):
+                if batch_id > 5: break
+                run_step(batch_id, data)
+            with profiler.profiler('All', 'total', '/tmp/profile_vgg'):
+                for batch_id, data in enumerate(train_reader()):
+                    if batch_id > 5: break
+                    run_step(batch_id, data)
+
+        for batch_id, data in enumerate(train_reader()):
+            ts = time.time()
+            loss, acc, b_size = run_step(batch_id, data)
             iters += 1
             num_samples += len(data)
             train_pass_acc.add(value=acc, weight=b_size)
             print(
-                "Task:%d Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
-                "Speed = %.2f img/s " % (args.task_index, pass_id, iters,
-                                         loss, acc,
-                                         len(data) / (time.time() - ts))
+                "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
+                "Speed = %.2f img/s" % (pass_id, iters, loss, acc,
                                        len(data) / (time.time() - ts))
             )  # The accuracy is the accumulation of batches, but not the current batch.
         pass_elapsed = time.time() - start_time
......
../../v2/build_and_install/paddleci.png
\ No newline at end of file
@@ -125,12 +125,12 @@ Compile Time -> IR -> Runtime
 ## Operator/OpWithKernel/OpKernel
-![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/49caf1fb70820fb4a6c217634317c9306f361f36/op_op_with_kern_class_diagram.dot)
+![class_diagram](https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/op_op_with_kern_class_diagram.dot)
 ---
 ## Operator
-![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/dd598e8f1976f5759f58af5e5ef94738a6b2e661/op.dot)
+![class_diagram](https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/op.dot)
 * `Operator` is the fundamental building block of the user interface.
 * Operator stores input/output variable names and attributes.
@@ -141,7 +141,7 @@ Compile Time -> IR -> Runtime
 ## OpWithKernel/Kernel
-![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/9d7f4eba185cf41c8e2fbfb40ae21890dbddcd39/op_with_kernel.dot)
+![class_diagram](https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/op_with_kernel.dot)
 * `OpWithKernel` inherits `Operator`.
 * `OpWithKernel` contains a Kernel map.
......
digraph sample {
graph [rankdir=TD]; node [shape=record];
op [label="{Operator| InferShape()=0\lRun()=0\l | map<string, string[]> inputs_\lmap<string, string[]> outputs_ \l AttributeMap attrs_\l}"];
}
\ No newline at end of file
digraph sample {
graph [rankdir=TD]; node [shape=record];
op [label="{Operator| InferShape()=0\lRun()=0\l | map<string, string[]> inputs_\lmap<string, string[]> outputs_ \l AttributeMap attrs_\l}"];
op_with_kern [label="{OpWithKernel | InferShape()=0\lRun()\l | map<OpKernelKey,OpKernel>kernels_ }"]
op_kernel [label="{OpKernel | Compute()=0}"]
op_kernel_key [label="{OpKernelKey| Place place\n...}"]
op -> op_with_kern [dir=back, arrowtail=onormal]
op_with_kern -> op_kernel [arrowhead=vee, label="contains many"]
{
rank=same;
op_with_kern
op_kernel
}
op_kernel -> op_kernel_key [style=invis]
{
rank=same;
op_kernel
op_kernel_key
}
op_with_kern -> op_kernel_key [arrowhead=vee, label ="\nas map key"]
mul_op [label="MulOp"]
op_with_kern -> mul_op [dir=back, arrowtail=onormal]
mul_kernel [label="template <typename Place>\lclass MulOpKernel\l"]
op_kernel -> mul_kernel [dir=back, arrowtail=onormal]
mul_op -> mul_kernel [arrowhead=vee, label="register many"]
{
rank=same;
mul_op;
mul_kernel;
}
}
\ No newline at end of file
digraph sample {
graph [rankdir=TD]; node [shape=record];
op [label="{Operator}"];
op_with_kern [label="{OpWithKernel | InferShape()=0\lRun()\l | map<OpKernelKey,OpKernel>kernels_ }"]
op_kernel [label="{OpKernel | Compute()=0}"]
op_kernel_key [label="{OpKernelKey| Place place\n...}"]
op -> op_with_kern [dir=back, arrowtail=onormal]
op_with_kern -> op_kernel [arrowhead=vee, label="contains many"]
{
rank=same;
op_with_kern
op_kernel
}
op_kernel -> op_kernel_key [style=invis]
{
rank=same;
op_kernel
op_kernel_key
}
op_with_kern -> op_kernel_key [arrowhead=vee, label ="\nas map key"]
}
\ No newline at end of file
@@ -142,7 +142,7 @@ gated_unit
 -----------
 .. autoclass:: paddle.v2.layer.gated_unit
     :noindex:
 Recurrent Layer Group
 =====================
@@ -354,7 +354,7 @@ dropout
 --------
 .. autoclass:: paddle.v2.layer.dropout
     :noindex:
 dot_prod
 ---------
 .. autoclass:: paddle.v2.layer.dot_prod
@@ -460,6 +460,11 @@ multi_binary_label_cross_entropy_cost
 .. autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost
     :noindex:
+classification_cost
+-------------------
+.. autoclass:: paddle.v2.layer.classification_cost
+    :noindex:
+
 huber_regression_cost
 -------------------------
 .. autoclass:: paddle.v2.layer.huber_regression_cost
@@ -534,7 +539,7 @@ detection_output
 ----------------
 .. autoclass:: paddle.v2.layer.detection_output
     :noindex:
 Check Layer
 ============
......
@@ -41,7 +41,7 @@ Training docker image needs to package the paddle pserver and paddle trainer run
 - Generating the initialization arguments for `Paddle PServer` and `Paddle Training` processes.

 Since the paddlepaddle official docker image already has the runtimes we need, we'll take it as the base image and pack some additional scripts for the processes mentioned above to build our training image. For more detail, please refer to the following link:
-- https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/src/k8s_train/Dockerfile
+- https://github.com/PaddlePaddle/Paddle/tree/develop/doc/v2/howto/cluster/multi_cluster/src/k8s_train/Dockerfile

 ```bash
@@ -62,7 +62,7 @@ represent the Docker Image which built in this step.
 ### Prepare Training Data

 We can download and split the training job by creating a Kubernetes Job, or customize the image
-by editing [k8s_train](./src/k8s_train/).
+by editing [k8s_train](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/v2/howto/cluster/multi_cluster/src/k8s_train).

 Before creating a Job, we need to bind a [persistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes) by the different type of
 the different file system, the generated dataset would be saved on this volume.
......
@@ -348,8 +348,12 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
       }
     }
   }
+  platform::DeviceContextPool::Instance().Get(place_)->Wait();
   if (create_vars && create_local_scope) {
     scope->DeleteScope(local_scope);
+  } else {
+    // Delete the local scopes created in operators.
+    scope->DropKids();
   }
   if (FLAGS_benchmark) {
     VLOG(2) << "-------------------------------------------------------";
......
-nv_test(test_tensorrt_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES})
-nv_test(test_tensorrt_activation_op SRCS test_activation_op.cc ${ENGINE_FILE} activation_op.cc
+nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES})
+nv_test(test_trt_activation_op SRCS test_activation_op.cc ${ENGINE_FILE} activation_op.cc
         DEPS ${FLUID_CORE_MODULES} activation_op)
+nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/inference/tensorrt/convert/io_converter.h"
#include <cuda.h>
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace inference {
namespace tensorrt {

using platform::is_gpu_place;
using platform::is_cpu_place;

class DefaultInputConverter : public EngineInputConverter {
 public:
  DefaultInputConverter() {}
  // NOTE out is GPU memory.
  virtual void operator()(const LoDTensor& in, void* out,
                          size_t max_size) override {
    PADDLE_ENFORCE(out != nullptr);
    PADDLE_ENFORCE_LE(in.memory_size(), max_size);
    const auto& place = in.place();
    if (is_cpu_place(place)) {
      PADDLE_ENFORCE(stream_ != nullptr);
      PADDLE_ENFORCE_EQ(0,
                        cudaMemcpyAsync(out, in.data<float>(), in.memory_size(),
                                        cudaMemcpyHostToDevice, *stream_));
    } else if (is_gpu_place(place)) {
      // GPU-resident input: copy device-to-device into the engine buffer.
      PADDLE_ENFORCE_EQ(0,
                        cudaMemcpyAsync(out, in.data<float>(), in.memory_size(),
                                        cudaMemcpyDeviceToDevice, *stream_));
    } else {
      PADDLE_THROW("Unknown device for converter");
    }
    cudaStreamSynchronize(*stream_);
  }
};

REGISTER_TENSORRT_INPUT_CONVERTER(default, DefaultInputConverter);

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <unordered_map>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/utils/singleton.h"

namespace paddle {
namespace inference {
namespace tensorrt {

using framework::LoDTensor;

/*
 * Convert Input from Fluid to an Engine.
 * TensorRT's ITensor follows row major, NCHW. Fluid is also row major, so in
 * most cases just need to copy the data.
 */
class EngineInputConverter {
 public:
  EngineInputConverter() {}

  virtual void operator()(const LoDTensor& in, void* out, size_t max_size) {}

  void SetStream(cudaStream_t* stream) { stream_ = stream; }

  static void Run(const std::string& in_op_type, const LoDTensor& in, void* out,
                  size_t max_size, cudaStream_t* stream) {
    PADDLE_ENFORCE(stream != nullptr);
    auto* converter = Registry<EngineInputConverter>::Lookup(
        in_op_type, "default" /* default_type */);
    PADDLE_ENFORCE_NOT_NULL(converter);
    converter->SetStream(stream);
    (*converter)(in, out, max_size);
  }

  virtual ~EngineInputConverter() {}

 protected:
  cudaStream_t* stream_{nullptr};
};

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle

#define REGISTER_TENSORRT_INPUT_CONVERTER(in_op_type__, Converter__)     \
  struct trt_input_##in_op_type__##_converter {                          \
    trt_input_##in_op_type__##_converter() {                             \
      ::paddle::inference::Registry<EngineInputConverter>::Register<     \
          Converter__>(#in_op_type__);                                   \
    }                                                                    \
  };                                                                     \
  trt_input_##in_op_type__##_converter trt_input_##in_op_type__##_converter__;
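The macro and `EngineInputConverter::Run` together form the public surface of this header: converters self-register by op type at static-initialization time, and callers dispatch by name with a `"default"` fallback. A minimal client sketch, assuming it sits inside `namespace paddle::inference::tensorrt`; the `foo` op type and `FooInputConverter` are hypothetical names, not part of this commit:

```cpp
// Hypothetical converter for the inputs of a "foo" op; mirrors
// DefaultInputConverter but could, e.g., transpose the layout first.
class FooInputConverter : public EngineInputConverter {
 public:
  void operator()(const LoDTensor& in, void* out, size_t max_size) override {
    // ... copy `in` into the engine buffer `out` (at most max_size bytes),
    // using stream_ set by EngineInputConverter::Run ...
  }
};
REGISTER_TENSORRT_INPUT_CONVERTER(foo, FooInputConverter);

// Caller side: dispatch by op type; unknown types fall back to "default".
void FeedEngineInput(const LoDTensor& input, void* engine_buf, size_t buf_size,
                     cudaStream_t* stream) {
  EngineInputConverter::Run("foo", input, engine_buf, buf_size, stream);
}
```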
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/inference/utils/singleton.h"

 namespace paddle {
 namespace inference {
@@ -32,34 +33,23 @@ class OpConverter {
   OpConverter() {}
   virtual void operator()(const framework::OpDesc& op) {}

-  void Execute(const framework::OpDesc& op, TensorRTEngine* engine) {
+  void Run(const framework::OpDesc& op, TensorRTEngine* engine) {
     std::string type = op.Type();
-    auto it = converters_.find(type);
-    PADDLE_ENFORCE(it != converters_.end(), "no OpConverter for optype [%s]",
-                   type);
-    it->second->SetEngine(engine);
-    (*it->second)(op);
-  }
-
-  static OpConverter& Global() {
-    static auto* x = new OpConverter;
-    return *x;
-  }
-
-  template <typename T>
-  void Register(const std::string& key) {
-    converters_[key] = new T;
+    auto* it = Registry<OpConverter>::Lookup(type);
+    PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", type);
+    it->SetEngine(engine);
+    (*it)(op);
   }

   // convert fluid op to tensorrt layer
   void ConvertOp(const framework::OpDesc& op, TensorRTEngine* engine) {
-    OpConverter::Global().Execute(op, engine);
+    OpConverter::Run(op, engine);
   }

   // convert fluid block to tensorrt network
   void ConvertBlock(const framework::BlockDesc& block, TensorRTEngine* engine) {
     for (auto op : block.AllOps()) {
-      OpConverter::Global().Execute(*op, engine);
+      OpConverter::Run(*op, engine);
     }
   }
@@ -78,12 +68,12 @@ class OpConverter {
   framework::Scope* scope_{nullptr};
 };

 #define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__)       \
   struct trt_##op_type__##_converter {                          \
     trt_##op_type__##_converter() {                             \
-      OpConverter::Global().Register<Converter__>(#op_type__);  \
+      Registry<OpConverter>::Register<Converter__>(#op_type__); \
     }                                                           \
   };                                                            \
   trt_##op_type__##_converter trt_##op_type__##_converter__;

 }  // namespace tensorrt
......
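For orientation, this is how a concrete converter plugs into the reworked registry. `ReluOpConverter` below is a hypothetical outline (the commit's own relu converter lives in activation_op.cc); only the base class and the macro come from this header:

```cpp
// Hypothetical outline of a concrete converter for the fluid "relu" op.
class ReluOpConverter : public OpConverter {
 public:
  void operator()(const framework::OpDesc& op) override {
    // ... look up the op's inputs in the engine, add the corresponding
    // TensorRT activation layer, and declare the outputs ...
  }
};
// Static registration: constructs a trt_relu_converter whose constructor
// calls Registry<OpConverter>::Register<ReluOpConverter>("relu").
REGISTER_TRT_OP_CONVERTER(relu, ReluOpConverter);

// Converting a block then dispatches per op type through Registry::Lookup:
//   OpConverter converter;
//   converter.ConvertBlock(block_desc, &engine);
```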
@@ -26,7 +26,7 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
-void compare(float input, float expect) {
+void Compare(float input, float expect) {
   framework::Scope scope;
   platform::CUDAPlace place;
   platform::CUDADeviceContext ctx(place);
@@ -85,8 +85,8 @@ void compare(float input, float expect) {
 }
 TEST(OpConverter, ConvertRelu) {
-  compare(1, 1);   // relu(1) = 1
-  compare(-5, 0);  // relu(-5) = 0
+  Compare(1, 1);   // relu(1) = 1
+  Compare(-5, 0);  // relu(-5) = 0
 }
 }  // namespace tensorrt
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/tensorrt/convert/io_converter.h"

#include <gtest/gtest.h>

namespace paddle {
namespace inference {
namespace tensorrt {

class EngineInputConverterTester : public ::testing::Test {
 public:
  void SetUp() override { tensor.Resize({10, 10}); }

  framework::LoDTensor tensor;
};

TEST_F(EngineInputConverterTester, DefaultCPU) {
  void* buffer;
  tensor.mutable_data<float>(platform::CPUPlace());
  ASSERT_EQ(cudaMalloc(&buffer, tensor.memory_size()), 0);

  cudaStream_t stream;
  cudaStreamCreate(&stream);  // the converter copies on a valid stream
  EngineInputConverter::Run("test", tensor, buffer, tensor.memory_size(),
                            &stream);
}

TEST_F(EngineInputConverterTester, DefaultGPU) {
  void* buffer;
  tensor.mutable_data<float>(platform::CUDAPlace());
  ASSERT_EQ(cudaMalloc(&buffer, tensor.memory_size()), 0);

  cudaStream_t stream;
  cudaStreamCreate(&stream);  // the converter copies on a valid stream
  EngineInputConverter::Run("test", tensor, buffer, tensor.memory_size(),
                            &stream);
}

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
@@ -20,7 +20,7 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
-TEST(BlockConverter, ConvertBlock) {
+TEST(OpConverter, ConvertBlock) {
   framework::ProgramDesc prog;
   auto* block = prog.MutableBlock(0);
   auto* mul_op = block->AppendOp();
......
@@ -24,6 +24,11 @@ function(inference_test TARGET_NAME)
   endforeach()
 endfunction(inference_test)
+####################
+# Inference tests here depend on fluid/tests/book. If users want to run
+# individual test with ctest, they need to run tests in fluid/tests/book
+# first to generate saved model.
+####################
 # This unittest is buggy!
 #inference_test(fit_a_line)
 inference_test(image_classification ARGS vgg resnet)
@@ -31,5 +36,5 @@ inference_test(label_semantic_roles)
 inference_test(recognize_digits ARGS mlp conv)
 inference_test(recommender_system)
 #inference_test(rnn_encoder_decoder)
-inference_test(understand_sentiment ARGS conv)
+#inference_test(understand_sentiment ARGS conv)
 inference_test(word2vec)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <string>
#include <unordered_map>
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace inference {

// NOTE not thread-safe.
template <typename T>
struct Singleton {
  static T& Global() {
    static T* x = new T;
    return *x;
  }

  Singleton() = delete;
  Singleton& operator=(const Singleton&) = delete;
};

/*
 * A registry for any type.
 * NOTE not thread-safe.
 */
template <typename ItemParent>
struct Registry {
  static Registry& Global() {
    static auto* x = new Registry<ItemParent>;
    return *x;
  }

  template <typename ItemChild>
  static void Register(const std::string& name) {
    PADDLE_ENFORCE_EQ(items_.count(name), 0);
    items_[name] = new ItemChild;
  }

  static ItemParent* Lookup(const std::string& name,
                            const std::string& default_name = "") {
    auto it = items_.find(name);
    if (it == items_.end()) {
      if (default_name == "")
        return nullptr;
      else
        return items_.find(default_name)->second;
    }
    return it->second;
  }

  ~Registry() {
    for (auto& item : items_) {
      delete item.second;
    }
  }

 private:
  Registry() = default;
  static std::unordered_map<std::string, ItemParent*> items_;
};

template <typename ItemParent>
std::unordered_map<std::string, ItemParent*> Registry<ItemParent>::items_;

}  // namespace inference
}  // namespace paddle
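A self-contained usage sketch of `Registry`, with hypothetical `Converter`/`MulConverter` types standing in for the real TensorRT converters registered above:

```cpp
#include <iostream>
#include "paddle/fluid/inference/utils/singleton.h"

using paddle::inference::Registry;

// Hypothetical hierarchy keyed by name.
struct Converter {
  virtual ~Converter() {}
  virtual const char* Name() const { return "base"; }
};
struct MulConverter : public Converter {
  const char* Name() const override { return "mul"; }
};

int main() {
  // Normally done once at static-initialization time by a REGISTER_* macro.
  Registry<Converter>::Register<MulConverter>("mul");

  // Hit: returns the registered instance.
  std::cout << Registry<Converter>::Lookup("mul")->Name() << "\n";  // mul
  // Miss without a default: nullptr.
  std::cout << (Registry<Converter>::Lookup("conv2d") == nullptr) << "\n";  // 1
  // Miss with a default: falls back to the "mul" entry.
  std::cout << Registry<Converter>::Lookup("conv2d", "mul")->Name() << "\n";
  return 0;
}
```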
@@ -366,7 +366,8 @@ REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
 REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvOpKernel<float>,
-                   paddle::operators::CUDNNConvOpKernel<double>);
+                   paddle::operators::CUDNNConvOpKernel<double>,
+                   paddle::operators::CUDNNConvOpKernel<plat::float16>);
 REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvGradOpKernel<float>,
                    paddle::operators::CUDNNConvGradOpKernel<double>);
@@ -187,7 +187,8 @@ class GemmConvKernel : public framework::OpKernel<T> {
         // gemm
         Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
         Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-        blas.MatMul(filter_slice, col_matrix, &out_slice);
+        blas.MatMul(filter_slice, false, col_matrix, false, T(1.0), &out_slice,
+                    T(0.0));
       }
     }
   }
@@ -304,7 +305,8 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
             col_matrix.ShareDataWith(in_grad_slice);
             col_matrix.Resize(col_matrix_shape);
           }
-          blas.MatMul(filter_slice, true, out_grad_slice, false, &col_matrix);
+          blas.MatMul(filter_slice, true, out_grad_slice, false, T(1.0),
+                      &col_matrix, T(0.0));
           if (is_expand && data_dim == 2U) {
             col2im(dev_ctx, col, dilations, strides,
@@ -351,8 +353,8 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
           // gemm
           Tensor filter_grad_slice =
               filter_grad_.Slice(g * out_step, (g + 1) * out_step);
-          blas.MatMul(out_grad_slice, false, col_matrix, true,
-                      &filter_grad_slice);
+          blas.MatMul(out_grad_slice, false, col_matrix, true, T(1.0),
+                      &filter_grad_slice, T(1.0));
         }
       }
     }
......
@@ -135,7 +135,8 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
       // col_matrix = filter * input_batch
       // of shape (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
-      blas.MatMul(filter, true, input_batch, false, &col_matrix);
+      blas.MatMul(filter, true, input_batch, false, static_cast<T>(1.0),
+                  &col_matrix, static_cast<T>(0.0));
       if (data_dim == 2U) {
         // col2im: col_matrix -> dy
@@ -267,7 +268,8 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
         // or
         // (m, c * k_d * k_h * k_w) * (c * k_d * k_h * k_w, d * h * w) -> (m,
         // d, h, w)
-        blas.MatMul(filter, false, col_matrix, false, &input_grad_batch);
+        blas.MatMul(filter, false, col_matrix, false, static_cast<T>(1.0),
+                    &input_grad_batch, static_cast<T>(0.0));
       }
       if (filter_grad) {
         // input batch
@@ -277,7 +279,8 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
         // or
         // (m, d * h * w) * (d * h * w, c * k_d * k_h * k_w) -> (m, c * k_d *
         // k_h * k_w)
-        blas.MatMul(in_batch, false, col_matrix, true, &filter_grad_);
+        blas.MatMul(in_batch, false, col_matrix, true, static_cast<T>(1.0),
+                    &filter_grad_, static_cast<T>(1.0));
       }
     }
   }
......
@@ -69,6 +69,10 @@ message VariableMessage {
   bytes rows = 9;
   // Look up table block execution output variable name.
   string out_varname = 10;
+  // If true, the ps server will start profiling, the ps
+  // server stops profiling and generates a profile to /tmp/profile_ps_*
+  // when profile switches from true to false.
+  bool profile = 11;
 }
 message VoidMessage {}
@@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/detail/bytebuffer_stream.h"
 #include "paddle/fluid/operators/detail/proto_encoder_helper.h"
 #include "paddle/fluid/operators/detail/variable_response.h"
+#include "paddle/fluid/platform/profiler.h"

 namespace paddle {
 namespace operators {
@@ -45,6 +46,13 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
   void* payload = nullptr;
   size_t payload_size;
   ProtoEncodeHelper e(static_cast<char*>(buf), 1024);
+  // Note: normally the profiler is enabled in 1 trainer, hence only
+  // 1 trainer returns true for ShouldSendProfileState(). It tells PS
+  // servers the trainer's profiling state so that PS can follow the
+  // trainer.
+  if (platform::ShouldSendProfileState()) {
+    e.WriteBool(VarMsg::kProfileFieldNumber, platform::IsProfileEnabled());
+  }
   e.WriteString(VarMsg::kVarnameFieldNumber, name);
   if (var->IsType<framework::LoDTensor>()) {
     e.WriteUint64(VarMsg::kTypeFieldNumber, 0);
......
@@ -17,6 +17,7 @@
 #include <string>
 #include <utility>
 #include <vector>
+#include "paddle/fluid/platform/profiler.h"

 #include "paddle/fluid/operators/detail/send_recv.pb.h"
 #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
@@ -427,7 +428,26 @@ int VariableResponse::Parse(Source* source) {
         meta_.set_out_varname(temp);
         break;
       }
+      case sendrecv::VariableMessage::kProfileFieldNumber: {
+        bool profiling;
+        if (!input.ReadRaw(reinterpret_cast<void*>(&profiling), 1)) {
+          return tag;
+        }
+        meta_.set_profile(profiling);
+        int64_t listener_id = platform::ListenerId();
+        if (listener_id <= 0) {
+          break;
+        }
+        if (profiling && !platform::IsProfileEnabled()) {
+          platform::EnableProfiler(platform::ProfilerState::kCPU);
+        } else if (!profiling && platform::IsProfileEnabled()) {
+          // TODO(panyx0718): Should we allow to customize file dir.
+          platform::DisableProfiler(
+              platform::EventSortingKey::kDefault,
+              string::Sprintf("/tmp/profile_ps_%lld", listener_id));
+        }
+        break;
+      }
       default: {
         // Unknown tag, return unknown error.
         return -1;
......
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <vector>

 #include "paddle/fluid/operators/listen_and_serv_op.h"
+#include "paddle/fluid/platform/profiler.h"

 namespace paddle {
 namespace operators {
@@ -294,6 +295,8 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
 void ListenAndServOp::RunImpl(const framework::Scope &scope,
                               const platform::Place &dev_place) const {
+  // Mark this as PS that it should decide profiling by listening from trainer.
+  platform::SetProfileListener();
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   auto &dev_ctx = *pool.Get(dev_place);
   framework::Scope &recv_scope = scope.NewScope();
......
@@ -46,8 +46,7 @@ class LoDResetKernel : public framework::OpKernel<T> {
     auto* lod = lod_t->data<int>();
     if (platform::is_gpu_place(ctx.GetPlace())) {
       framework::Tensor lod_cpu;
-      framework::TensorCopy(*lod_t, platform::CPUPlace(),
-                            ctx.device_context(), &lod_cpu);
+      framework::TensorCopySync(*lod_t, platform::CPUPlace(), &lod_cpu);
       lod = lod_cpu.data<int>();
     }
     level0 = std::vector<int>(lod, lod + lod_t->numel());
......
@@ -69,8 +69,8 @@ void testConcat() {
   }
   if (paddle::platform::is_gpu_place(Place())) {
-    paddle::framework::TensorCopy(input_a_cpu, Place(), *context, &input_a);
-    paddle::framework::TensorCopy(input_b_cpu, Place(), *context, &input_b);
+    paddle::framework::TensorCopySync(input_a_cpu, Place(), &input_a);
+    paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b);
   }
   std::vector<paddle::framework::Tensor> input;
@@ -86,8 +86,8 @@ void testConcat() {
   int* out_ptr;
   if (paddle::platform::is_gpu_place(Place())) {
-    paddle::framework::TensorCopy(out, paddle::platform::CPUPlace(), *context,
-                                  &out_cpu);
+    paddle::framework::TensorCopySync(out, paddle::platform::CPUPlace(),
+                                      &out_cpu);
     out_ptr = out_cpu.data<int>();
   } else {
     out_ptr = out.data<int>();
@@ -142,8 +142,8 @@ void testConcat() {
   }
   if (paddle::platform::is_gpu_place(Place())) {
-    paddle::framework::TensorCopy(input_a_cpu, Place(), *context, &input_a);
-    paddle::framework::TensorCopy(input_b_cpu, Place(), *context, &input_b);
+    paddle::framework::TensorCopySync(input_a_cpu, Place(), &input_a);
+    paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b);
   }
   input.clear();
@@ -157,8 +157,8 @@ void testConcat() {
   PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
   if (paddle::platform::is_gpu_place(Place())) {
-    paddle::framework::TensorCopy(out, paddle::platform::CPUPlace(), *context,
-                                  &out_cpu);
+    paddle::framework::TensorCopySync(out, paddle::platform::CPUPlace(),
+                                      &out_cpu);
     out_ptr = out_cpu.data<int>();
   } else {
     out_ptr = out.data<int>();
@@ -215,8 +215,8 @@ void testConcat() {
   }
   if (paddle::platform::is_gpu_place(Place())) {
-    paddle::framework::TensorCopy(input_a_cpu, Place(), *context, &input_a);
-    paddle::framework::TensorCopy(input_b_cpu, Place(), *context, &input_b);
+    paddle::framework::TensorCopySync(input_a_cpu, Place(), &input_a);
+    paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b);
   }
   input.clear();
@@ -230,8 +230,8 @@ void testConcat() {
   PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
   if (paddle::platform::is_gpu_place(Place())) {
-    paddle::framework::TensorCopy(out, paddle::platform::CPUPlace(), *context,
-                                  &out_cpu);
+    paddle::framework::TensorCopySync(out, paddle::platform::CPUPlace(),
+                                      &out_cpu);
     out_ptr = out_cpu.data<int>();
   } else {
     out_ptr = out.data<int>();
@@ -290,8 +290,8 @@ void testConcat() {
   }
   if (paddle::platform::is_gpu_place(Place())) {
-    paddle::framework::TensorCopy(input_a_cpu, Place(), *context, &input_a);
-    paddle::framework::TensorCopy(input_b_cpu, Place(), *context, &input_b);
+    paddle::framework::TensorCopySync(input_a_cpu, Place(), &input_a);
+    paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b);
   }
   input.clear();
@@ -305,8 +305,8 @@ void testConcat() {
   PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
   if (paddle::platform::is_gpu_place(Place())) {
-    paddle::framework::TensorCopy(out, paddle::platform::CPUPlace(), *context,
-                                  &out_cpu);
+    paddle::framework::TensorCopySync(out, paddle::platform::CPUPlace(),
+                                      &out_cpu);
     out_ptr = out_cpu.data<int>();
   } else {
     out_ptr = out.data<int>();
......
@@ -41,7 +41,7 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
   if (paddle::platform::is_cpu_place(*place)) {
     seq = cpu_seq;
   } else {
-    TensorCopy(cpu_seq, *place, *context, &seq);
+    TensorCopySync(cpu_seq, *place, &seq);
     seq.set_lod(lod);
   }
@@ -64,7 +64,7 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
   if (paddle::platform::is_cpu_place(*place)) {
     cpu_seq_back = seq_back;
   } else {
-    TensorCopy(seq_back, paddle::platform::CPUPlace(), *context, &cpu_seq_back);
+    TensorCopySync(seq_back, paddle::platform::CPUPlace(), &cpu_seq_back);
     cpu_seq_back.set_lod(lod);
   }
......
@@ -33,7 +33,7 @@ class MultiplexGPUKernel : public framework::OpKernel<T> {
     auto cols = ins[0]->numel() / rows;
     // copy index to cpu
     Tensor index_t_cpu;
-    TensorCopy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
+    TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu);
     auto* index = index_t_cpu.data<int32_t>();
     auto stream = ctx.cuda_device_context().stream();
     platform::CUDAPlace place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
@@ -69,7 +69,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel<T> {
     auto cols = ins[0]->numel() / rows;
     // copy index to cpu
     Tensor index_t_cpu;
-    TensorCopy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
+    TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu);
     auto* index = index_t_cpu.data<int32_t>();
     auto stream = ctx.cuda_device_context().stream();
......
@@ -174,7 +174,8 @@ REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace,
 REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace,
                    ops::PoolCUDNNOpKernel<float>,
-                   ops::PoolCUDNNOpKernel<double>);
+                   ops::PoolCUDNNOpKernel<double>,
+                   ops::PoolCUDNNOpKernel<plat::float16>);
 REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace,
                    ops::PoolCUDNNGradOpKernel<float>,
                    ops::PoolCUDNNGradOpKernel<double>);
@@ -66,13 +66,11 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> {
     if (platform::is_gpu_place(ctx.GetPlace())) {
       offset_cpu.mutable_data<T>(offset->dims(), platform::CPUPlace());
-      framework::TensorCopy(*offset, platform::CPUPlace(), ctx.device_context(),
-                            &offset_cpu);
+      framework::TensorCopySync(*offset, platform::CPUPlace(), &offset_cpu);
       offset_data = offset_cpu.data<int64_t>();

       length_cpu.mutable_data<T>(length->dims(), platform::CPUPlace());
-      framework::TensorCopy(*length, platform::CPUPlace(), ctx.device_context(),
-                            &length_cpu);
+      framework::TensorCopySync(*length, platform::CPUPlace(), &length_cpu);
       length_data = length_cpu.data<int64_t>();
     }
@@ -127,13 +125,11 @@ class SequenceSliceGradOpKernel : public framework::OpKernel<T> {
     if (platform::is_gpu_place(ctx.GetPlace())) {
      offset_cpu.mutable_data<T>(offset->dims(), platform::CPUPlace());
-      framework::TensorCopy(*offset, platform::CPUPlace(), ctx.device_context(),
-                            &offset_cpu);
+      framework::TensorCopySync(*offset, platform::CPUPlace(), &offset_cpu);
       offset_data = offset_cpu.data<int64_t>();

       length_cpu.mutable_data<T>(length->dims(), platform::CPUPlace());
-      framework::TensorCopy(*length, platform::CPUPlace(), ctx.device_context(),
-                            &length_cpu);
+      framework::TensorCopySync(*length, platform::CPUPlace(), &length_cpu);
       length_data = length_cpu.data<int64_t>();
     }
......
@@ -63,6 +63,7 @@ __device__ T reduceSum(T val, int tid, int len) {
     val += platform::CudaShuffleDownSync(mask, val, offset);
   if (tid < warpSize) shm[tid] = 0;
+  __syncthreads();
   if (tid % warpSize == 0) {
     shm[tid / warpSize] = val;
......
@@ -13,12 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/platform/profiler.h"
 #include <sys/time.h>
 #include <time.h>
 #include <algorithm>
 #include <iomanip>
+#include <limits>
 #include <map>
 #include <mutex>  // NOLINT
+#include <random>
 #include <string>
 #ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
@@ -33,6 +36,9 @@ namespace platform {
 struct EventList;

+static int64_t profiler_lister_id = 0;
+static bool should_send_profile_state = false;
+
 // The profiler state, the initial value is ProfilerState::kDisabled
 static ProfilerState g_state = ProfilerState::kDisabled;
 // The thread local event list only can be accessed by the specific thread
@@ -219,13 +225,12 @@ void EnableProfiler(ProfilerState state) {
   PADDLE_ENFORCE(state != ProfilerState::kDisabled,
                  "Can't enbale profling, since the input state is ",
                  "ProfilerState::kDisabled");
-  PADDLE_ENFORCE(g_state == ProfilerState::kDisabled,
-                 "The profiling state should be disabled when calling ",
-                 "EnableProfiler.");
-  g_state = state;
-  if (g_state == ProfilerState::kAll) {
-    GetDeviceTracer()->Enable();
+  if (state == g_state) {
+    return;
   }
+  g_state = state;
+  should_send_profile_state = true;
+  GetDeviceTracer()->Enable();
 #ifdef PADDLE_WITH_CUDA
   if (g_state == ProfilerState::kCUDA) {
     // Generate some dummy events first to reduce the startup overhead.
@@ -435,8 +440,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
 void DisableProfiler(EventSortingKey sorted_key,
                      const std::string& profile_path) {
-  PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
-                 "Can't disable profiling, since it's not starting.");
+  if (g_state == ProfilerState::kDisabled) return;
   // Mark the profiling stop.
   Mark("_stop_profiler_", nullptr);
@@ -444,12 +448,25 @@ void DisableProfiler(EventSortingKey sorted_key,
   ParseEvents(all_events, sorted_key);
   ResetProfiler();
   DeviceTracer* tracer = GetDeviceTracer();
-  if (g_state == ProfilerState::kAll && tracer && tracer->IsEnabled()) {
+  if (tracer->IsEnabled()) {
     tracer->Disable();
     tracer->GenProfile(profile_path);
   }
   g_state = ProfilerState::kDisabled;
+  should_send_profile_state = true;
 }
+
+bool IsProfileEnabled() { return g_state != ProfilerState::kDisabled; }
+
+bool ShouldSendProfileState() { return should_send_profile_state; }
+
+void SetProfileListener() {
+  std::mt19937 rng;
+  rng.seed(std::random_device()());
+  std::uniform_int_distribution<std::mt19937::result_type> dist6(
+      1, std::numeric_limits<int64_t>::max());
+  profiler_lister_id = dist6(rng);
+}
+
+int64_t ListenerId() { return profiler_lister_id; }

 }  // namespace platform
 }  // namespace paddle
@@ -114,5 +114,13 @@ void ResetProfiler();
 void DisableProfiler(EventSortingKey sorted_key,
                      const std::string& profile_path);

+// Test if the profiler is currently enabled.
+bool IsProfileEnabled();
+// Whether the trainer should send profiling state to PS.
+bool ShouldSendProfileState();
+// Mark current process as PS by assigning a listener id.
+void SetProfileListener();
+int64_t ListenerId();
+
 }  // namespace platform
 }  // namespace paddle
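A compact sketch of how the new profiler hooks are meant to be driven, based only on the functions declared in this change; the two wrapper functions are illustrative scaffolding, not Paddle APIs:

```cpp
#include <string>
#include "paddle/fluid/platform/profiler.h"

namespace plat = paddle::platform;

// Trainer side: toggling the profiler also sets should_send_profile_state,
// so the next serialized VariableMessage carries the new state (profile = 11).
void ProfileAFewSteps() {
  plat::EnableProfiler(plat::ProfilerState::kCPU);
  // ... run a few iterations; ShouldSendProfileState() is now true, so
  // SerializeToByteBuffer() piggybacks IsProfileEnabled() to the PS ...
  plat::DisableProfiler(plat::EventSortingKey::kDefault,
                        "/tmp/profile_trainer");
}

// PS side: ListenAndServOp calls SetProfileListener() once; afterwards
// VariableResponse::Parse() mirrors the trainer's state and, on the
// true -> false edge, writes /tmp/profile_ps_<ListenerId()>.
void MarkProcessAsPServer() { plat::SetProfileListener(); }
```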
@@ -40,16 +40,14 @@ import backward
 import regularizer
 import average
 import metrics
+import transpiler
 from param_attr import ParamAttr, WeightNormParamAttr
 from data_feeder import DataFeeder
 from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace
-from distribute_transpiler import DistributeTranspiler
-from distribute_transpiler_simple import SimpleDistributeTranspiler
+from transpiler import DistributeTranspiler, SimpleDistributeTranspiler, InferenceTranspiler, memory_optimize, release_memory
 from concurrency import (Go, make_channel, channel_send, channel_recv,
                          channel_close, Select)
-from inference_transpiler import InferenceTranspiler
 import clip
-from memory_optimization_transpiler import memory_optimize, release_memory
 import profiler
 import unique_name
 import recordio_writer
@@ -58,10 +56,11 @@ from parallel_executor import ParallelExecutor
 Tensor = LoDTensor
 __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ +\
-    trainer.__all__ + inferencer.__all__ + [
+    trainer.__all__ + inferencer.__all__ + transpiler.__all__ + [
     'io',
     'initializer',
     'layers',
+    'transpiler',
     'nets',
     'optimizer',
     'learning_rate_decay',
@@ -76,11 +75,6 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ +\
     'WeightNormParamAttr',
     'DataFeeder',
     'clip',
-    'SimpleDistributeTranspiler',
-    'DistributeTranspiler',
-    'InferenceTranspiler',
-    'memory_optimize',
-    'release_memory',
     'profiler',
     'unique_name',
     'recordio_writer',
......
@@ -1042,13 +1042,14 @@ class Program(object):
         Returns(Program):
             The cloned Program object.
         """
-        p = Program()
         if for_test:
-            p.desc = core.inference_optimize(self.desc)
+            p = self.inference_optimize()
         else:
+            p = Program()
             p.desc = core.ProgramDesc(self.desc)
-        p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
-        p.sync_with_cpp()
+            p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
+            p.sync_with_cpp()
+
         p.copy_param_info_from(self)
         return p
@@ -1061,7 +1062,7 @@ class Program(object):
             if isinstance(t, Variable):
                 # After transpiler processing, the op that output this
                 # variable maybe has been changed, so t.op is not reliable
                 # and we need to find the current op that generates this
                 # variable here.
                 t.op = None
                 global_block = self.global_block()
@@ -1087,8 +1088,16 @@ class Program(object):
         return res

     def inference_optimize(self):
+        # this is an alternative implementation until
+        # core.inference_optimize is fixed.
         res = Program()
-        res.desc = core.inference_optimize(self.desc)
+        res.desc = core.ProgramDesc(self.desc)
+        for i in xrange(res.desc.num_blocks()):
+            block = res.desc.block(i)
+            for j in xrange(block.op_size()):
+                op = block.op(j)
+                if op.has_attr('is_test'):
+                    op.set_attr('is_test', True)
         res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())]
         res.sync_with_cpp()
         return res
......
@@ -251,7 +251,7 @@ class EditDistance(MetricBase):
         self.instance_error += seq_num - seq_right_count
         self.total_distance += total_distance

-    def eval():
+    def eval(self):
         if self.seq_num == 0:
             raise ValueError(
                 "There is no data in EditDistance Metric. Please check layers.edit_distance output has been added to EditDistance."
@@ -280,6 +280,7 @@ class DetectionMAP(MetricBase):
         super(DetectionMAP, self).__init__(name)
         # the current map value
         self.value = .0
+        self.weight = .0

     def update(self, value, weight):
         if not _is_number_or_matrix_(value):
@@ -340,8 +341,8 @@ class Auc(MetricBase):
             raise ValueError("The 'predictions' must be a numpy ndarray.")
         kepsilon = 1e-7  # to account for floating point imprecisions
-        thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
-                      for i in range(num_thresholds - 2)]
+        thresholds = [(i + 1) * 1.0 / (self._num_thresholds - 1)
+                      for i in range(self._num_thresholds - 2)]
         thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
         # calculate TP, FN, TN, FP count
@@ -358,19 +359,20 @@ class Auc(MetricBase):
                     fp += 1
                 else:
                     tn += 1
-            tp_list[idx_thresh] += tp
-            fn_list[idx_thresh] += fn
-            tn_list[idx_thresh] += tn
-            fp_list[idx_thresh] += fp
+            self.tp_list[idx_thresh] += tp
+            self.fn_list[idx_thresh] += fn
+            self.tn_list[idx_thresh] += tn
+            self.fp_list[idx_thresh] += fp

     def eval(self):
         epsilon = self._epsilon
         num_thresholds = self._num_thresholds
-        tpr = (tp_list.astype("float32") + epsilon) / (
-            tp_list + fn_list + epsilon)
-        fpr = fp_list.astype("float32") / (fp_list + tn_list + epsilon)
-        rec = (tp_list.astype("float32") + epsilon) / (
-            tp_list + fp_list + epsilon)
+        tpr = (self.tp_list.astype("float32") + epsilon) / (
+            self.tp_list + self.fn_list + epsilon)
+        fpr = self.fp_list.astype("float32") / (
+            self.fp_list + self.tn_list + epsilon)
+        rec = (self.tp_list.astype("float32") + epsilon) / (
+            self.tp_list + self.fp_list + epsilon)
         x = fpr[:num_thresholds - 1] - fpr[1:]
         y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import numpy
WORD_DICT, VERB_DICT, LABEL_DICT = paddle.dataset.conll05.get_dict()
WORD_DICT_LEN = len(WORD_DICT)
LABEL_DICT_LEN = len(LABEL_DICT)
PRED_DICT_LEN = len(VERB_DICT)
MARK_DICT_LEN = 2
def lstm_net(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark):
WORD_DIM = 32
MARK_DIM = 5
HIDDEN_DIM = 512
DEPTH = 8
EMBEDDING_NAME = 'emb'
# Data definitions
word = fluid.layers.data(
name='word_data', shape=[1], dtype='int64', lod_level=1)
predicate = fluid.layers.data(
name='verb_data', shape=[1], dtype='int64', lod_level=1)
ctx_n2 = fluid.layers.data(
name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
ctx_n1 = fluid.layers.data(
name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
ctx_0 = fluid.layers.data(
name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
ctx_p1 = fluid.layers.data(
name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
ctx_p2 = fluid.layers.data(
name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
mark = fluid.layers.data(
name='mark_data', shape=[1], dtype='int64', lod_level=1)
# 8 features
predicate_embedding = fluid.layers.embedding(
input=predicate,
size=[PRED_DICT_LEN, WORD_DIM],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr='vemb')
mark_embedding = fluid.layers.embedding(
input=mark,
size=[MARK_DICT_LEN, MARK_DIM],
dtype='float32',
is_sparse=IS_SPARSE)
word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
emb_layers = [
fluid.layers.embedding(
size=[WORD_DICT_LEN, WORD_DIM],
input=x,
param_attr=fluid.ParamAttr(
name=EMBEDDING_NAME, trainable=False)) for x in word_input
]
emb_layers.append(predicate_embedding)
emb_layers.append(mark_embedding)
hidden_0_layers = [
fluid.layers.fc(input=emb, size=HIDDEN_DIM, act='tanh')
for emb in emb_layers
]
hidden_0 = fluid.layers.sums(input=hidden_0_layers)
lstm_0 = fluid.layers.dynamic_lstm(
input=hidden_0,
size=HIDDEN_DIM,
candidate_activation='relu',
gate_activation='sigmoid',
cell_activation='sigmoid')
# stack L-LSTM and R-LSTM with direct edges
input_tmp = [hidden_0, lstm_0]
for i in range(1, DEPTH):
mix_hidden = fluid.layers.sums(input=[
fluid.layers.fc(input=input_tmp[0], size=HIDDEN_DIM, act='tanh'),
fluid.layers.fc(input=input_tmp[1], size=HIDDEN_DIM, act='tanh')
])
lstm = fluid.layers.dynamic_lstm(
input=mix_hidden,
size=HIDDEN_DIM,
candidate_activation='relu',
gate_activation='sigmoid',
cell_activation='sigmoid',
is_reverse=((i % 2) == 1))
input_tmp = [mix_hidden, lstm]
feature_out = fluid.layers.sums(input=[
fluid.layers.fc(input=input_tmp[0], size=LABEL_DICT_LEN, act='tanh'),
fluid.layers.fc(input=input_tmp[1], size=LABEL_DICT_LEN, act='tanh')
])
return feature_out
def inference_network():
    predict = lstm_net()
    crf_decode = fluid.layers.crf_decoding(
        input=predict, param_attr=fluid.ParamAttr(name='crfw'))
    return crf_decode
def train_network():
MIX_HIDDEN_LR = 1e-3
    predict = lstm_net()
target = fluid.layers.data(
name='target', shape=[1], dtype='int64', lod_level=1)
crf_cost = fluid.layers.linear_chain_crf(
input=predict,
label=target,
param_attr=fluid.ParamAttr(
name='crfw', learning_rate=MIX_HIDDEN_LR))
avg_cost = fluid.layers.mean(crf_cost)
return avg_cost
def train(use_cuda, save_path):
BATCH_SIZE = 128
EPOCH_NUM = 1
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.conll05.train(), buf_size=8192),
batch_size=BATCH_SIZE)
test_reader = paddle.batch(
paddle.dataset.conll05.test(), batch_size=BATCH_SIZE)
def event_handler(event):
if isinstance(event, fluid.EndIteration):
if (event.batch_id % 10) == 0:
avg_cost = trainer.test(reader=test_reader)
print('BatchID {0:04}, Loss {1:2.2}'.format(event.batch_id + 1,
avg_cost))
if avg_cost > 0.01: # Low threshold for speeding up CI
trainer.save_params(save_path)
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.exponential_decay(
learning_rate=0.01,
decay_steps=100000,
decay_rate=0.5,
staircase=True))
trainer = fluid.Trainer(train_network, optimizer=sgd_optimizer, place=place)
trainer.train(train_reader, EPOCH_NUM, event_handler=event_handler)
def infer(use_cuda, save_path):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    inferencer = fluid.Inferencer(
        inference_network, param_path=save_path, place=place)
def create_random_lodtensor(lod, place, low, high):
        data = numpy.random.random_integers(low, high,
                                            [lod[-1], 1]).astype("int64")
res = fluid.LoDTensor()
res.set(data, place)
res.set_lod([lod])
return res
# Create an input example
lod = [0, 4, 10]
word = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1)
pred = create_random_lodtensor(lod, place, low=0, high=PRED_DICT_LEN - 1)
ctx_n2 = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1)
ctx_n1 = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1)
ctx_0 = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1)
ctx_p1 = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1)
ctx_p2 = create_random_lodtensor(lod, place, low=0, high=WORD_DICT_LEN - 1)
mark = create_random_lodtensor(lod, place, low=0, high=MARK_DICT_LEN - 1)
results = inferencer.infer({
'word_data': word,
'verb_data': pred,
'ctx_n2_data': ctx_n2,
'ctx_n1_data': ctx_n1,
'ctx_0_data': ctx_0,
'ctx_p1_data': ctx_p1,
'ctx_p2_data': ctx_p2,
'mark_data': mark
})
print("infer results: ", results)
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "label_semantic_roles.inference.model"
train(use_cuda, save_path)
infer(use_cuda, save_path)
if __name__ == '__main__':
for use_cuda in (False, True):
main(use_cuda=use_cuda)
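A note on the lod argument used by create_random_lodtensor above: a level-0 LoD is a list of offsets into the rows of the tensor, so lod = [0, 4, 10] describes two sequences occupying rows 0..3 and 4..9. A quick sketch:

lod = [0, 4, 10]
lengths = [lod[i + 1] - lod[i] for i in range(len(lod) - 1)]
print(lengths)  # [4, 6] -> two sequences, 4 and 6 tokens long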
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import argparse
import paddle.fluid as fluid
import paddle
import sys
import numpy
import unittest
import math
import os
import paddle.v2.dataset as dataset
BATCH_SIZE = 64
def inference_program():
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
conv_pool_1 = fluid.nets.simple_img_conv_pool(
input=img,
filter_size=5,
num_filters=20,
pool_size=2,
pool_stride=2,
act="relu")
conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
conv_pool_2 = fluid.nets.simple_img_conv_pool(
input=conv_pool_1,
filter_size=5,
num_filters=50,
pool_size=2,
pool_stride=2,
act="relu")
prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
return prediction
def train_program():
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
predict = inference_program()
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(cost)
acc = fluid.layers.accuracy(input=predict, label=label)
return avg_cost, acc
def train(use_cuda, save_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.Adam(learning_rate=0.001)
trainer = fluid.Trainer(train_program, place=place, optimizer=optimizer)
def event_handler(event):
if isinstance(event, fluid.EndIteration):
avg_cost, acc = event.values
print("avg_cost: %s" % avg_cost)
print("acc : %s" % acc)
if (event.batch_id + 1) % 10 == 0:
test_metrics = trainer.test(reader=dataset.mnist.test())
avg_cost_set = test_metrics[0]
acc_set = test_metrics[1]
# get test acc and loss
acc = numpy.array(acc_set).mean()
avg_cost = numpy.array(avg_cost_set).mean()
if float(acc) > 0.2: # Smaller value to increase CI speed
trainer.save_params(save_dirname)
else:
print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
event.batch_id + 1, float(avg_cost), float(acc)))
if math.isnan(float(avg_cost)):
sys.exit("got NaN loss, training failed.")
trainer.train(
reader=dataset.mnist.train(), num_pass=100, event_handler=event_handler)
def infer(use_cuda, save_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
inference_program, param_path=save_dirname, place=place)
batch_size = 1
tensor_img = numpy.random.uniform(-1.0, 1.0,
[batch_size, 1, 28, 28]).astype("float32")
results = inferencer.infer({'img': tensor_img})
print("infer results: ", results[0])
def main(use_cuda):
save_dirname = "recognize_digits_conv.inference.model"
# call train() with is_local argument to run distributed train
train(use_cuda=use_cuda, save_dirname=save_dirname)
infer(use_cuda=use_cuda, save_dirname=save_dirname)
if __name__ == '__main__':
for use_cuda in (False, True):
main(use_cuda=use_cuda)
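A quick sanity check of the feature-map sizes implied by the two conv/pool stages above, assuming each simple_img_conv_pool stage is an unpadded stride-1 convolution followed by 2x2 pooling with stride 2 (the behavior suggested by its arguments):

def conv_out(size, filter_size):          # valid convolution, stride 1
    return size - filter_size + 1

def pool_out(size, pool_size, stride):
    return (size - pool_size) // stride + 1

s = 28                                    # MNIST input is 1 x 28 x 28
s = pool_out(conv_out(s, 5), 2, 2)        # stage 1 -> 12 x 12, 20 channels
s = pool_out(conv_out(s, 5), 2, 2)        # stage 2 -> 4 x 4, 50 channels
print(s)  # 4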
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import argparse
import paddle.fluid as fluid
import paddle
import sys
import numpy
import unittest
import math
import os
import paddle.v2.dataset as dataset
BATCH_SIZE = 64
def inference_program():
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
hidden = fluid.layers.fc(input=img, size=200, act='tanh')
hidden = fluid.layers.fc(input=hidden, size=200, act='tanh')
prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
return prediction
def train_program():
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
predict = inference_program()
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(cost)
acc = fluid.layers.accuracy(input=predict, label=label)
return avg_cost, acc
def train(use_cuda, save_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
optimizer = fluid.optimizer.Adam(learning_rate=0.001)
trainer = fluid.Trainer(train_program, place=place, optimizer=optimizer)
def event_handler(event):
if isinstance(event, fluid.EndIteration):
avg_cost, acc = event.values
print("avg_cost: %s" % avg_cost)
print("acc : %s" % acc)
if (event.batch_id + 1) % 10 == 0:
test_metrics = trainer.test(reader=dataset.mnist.test())
avg_cost_set = test_metrics[0]
acc_set = test_metrics[1]
# get test acc and loss
acc = numpy.array(acc_set).mean()
avg_cost = numpy.array(avg_cost_set).mean()
if float(acc) > 0.2: # Smaller value to increase CI speed
trainer.save_params(save_dirname)
else:
print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
event.batch_id + 1, float(avg_cost), float(acc)))
if math.isnan(float(avg_cost)):
sys.exit("got NaN loss, training failed.")
trainer.train(
reader=dataset.mnist.train(), num_pass=100, event_handler=event_handler)
def infer(use_cuda, save_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
inference_program, param_path=save_dirname, place=place)
batch_size = 1
tensor_img = numpy.random.uniform(-1.0, 1.0,
[batch_size, 1, 28, 28]).astype("float32")
results = inferencer.infer({'img': tensor_img})
print("infer results: ", results[0])
def main(use_cuda):
save_dirname = "recognize_digits_mlp.inference.model"
# call train() with is_local argument to run distributed train
train(use_cuda=use_cuda, save_dirname=save_dirname)
infer(use_cuda=use_cuda, save_dirname=save_dirname)
if __name__ == '__main__':
for use_cuda in (False, True):
main(use_cuda=use_cuda)
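For comparison with the conv version, a back-of-the-envelope parameter count for this MLP, counting one weight matrix plus one bias vector per fc layer (the 784 comes from flattening the 1 x 28 x 28 input):

layers = [(784, 200), (200, 200), (200, 10)]
total = sum(n_in * n_out + n_out for n_in, n_out in layers)
print(total)  # 199210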
@@ -70,9 +70,11 @@ def conv3d_forward_naive(input, filter, group, conv_param):
 class TestConv3dOp(OpTest):
     def setUp(self):
+        self.op_type = "conv3d"
         self.use_cudnn = False
+        self.dtype = np.float32
+        self.init_kernel_type()
         self.init_group()
-        self.init_op_type()
         self.init_dilation()
         self.init_test_case()
@@ -80,20 +82,24 @@ class TestConv3dOp(OpTest):
             'stride': self.stride,
             'pad': self.pad,
             'dilations': self.dilations,
+            'use_cudnn': self.use_cudnn,
             'data_format': 'AnyLayout'  # TODO(dzhwinter): should be fixed later
         }
-        input = np.random.random(self.input_size).astype("float32")
-        filter = np.random.random(self.filter_size).astype("float32")
+        input = np.random.random(self.input_size).astype(self.dtype)
+        filter = np.random.random(self.filter_size).astype(self.dtype)
         output = conv3d_forward_naive(input, filter, self.groups,
-                                      conv3d_param).astype("float32")
-        self.inputs = {'Input': input, 'Filter': filter}
+                                      conv3d_param).astype(self.dtype)
+        self.inputs = {
+            'Input': OpTest.np_dtype_to_fluid_dtype(input),
+            'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
+        }
         self.attrs = {
             'strides': self.stride,
             'paddings': self.pad,
             'groups': self.groups,
-            'dilations': self.dilations
+            'dilations': self.dilations,
+            'use_cudnn': self.use_cudnn
         }
         self.outputs = {'Output': output}
@@ -108,6 +114,8 @@ class TestConv3dOp(OpTest):
         self.check_output()

     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
@@ -120,6 +128,8 @@ class TestConv3dOp(OpTest):
                 set(['Input', 'Filter']), 'Output', max_relative_error=0.03)

     def test_check_grad_no_filter(self):
+        if self.dtype == np.float16:
+            return
         if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
@@ -135,6 +145,8 @@ class TestConv3dOp(OpTest):
                 no_grad_set=set(['Filter']))

     def test_check_grad_no_input(self):
+        if self.dtype == np.float16:
+            return
         if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
@@ -163,8 +175,8 @@ class TestConv3dOp(OpTest):
     def init_group(self):
         self.groups = 1

-    def init_op_type(self):
-        self.op_type = "conv3d"
+    def init_kernel_type(self):
+        pass

 class TestCase1(TestConv3dOp):
@@ -235,34 +247,90 @@ class TestWithDilation(TestConv3dOp):
         self.groups = 3

+#----------------Conv3dCUDNN----------------
 class TestCUDNN(TestConv3dOp):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "conv3d"

+class TestFP16CUDNN(TestConv3dOp):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=2e-2)

 class TestWithGroup1CUDNN(TestWithGroup1):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "conv3d"

+class TestFP16WithGroup1CUDNN(TestWithGroup1):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=2e-2)

 class TestWithGroup2CUDNN(TestWithGroup2):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "conv3d"

+class TestFP16WithGroup2CUDNN(TestWithGroup2):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=2e-2)

 class TestWith1x1CUDNN(TestWith1x1):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "conv3d"

+class TestFP16With1x1CUDNN(TestWith1x1):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=2e-2)

 class TestWithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "conv3d"

+class TestFP16WithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=2e-2)

 # FIXME(typhoonzero): find a way to determine if
...
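The refactor above replaces each subclass's init_op_type with a single init_kernel_type hook: op_type is now fixed once in the base setUp, and subclasses only flip the kernel-selection knobs (use_cudnn, dtype). A minimal standalone sketch of the pattern, with names mirroring the diff rather than the real OpTest base class:

import unittest

class BaseConvTest(unittest.TestCase):
    def setUp(self):
        self.op_type = "conv3d"     # fixed once in the base class
        self.use_cudnn = False
        self.dtype = "float32"
        self.init_kernel_type()     # the only hook subclasses override

    def init_kernel_type(self):
        pass                        # default: plain (non-cuDNN) FP32 kernel

class CUDNNFP16Test(BaseConvTest):
    def init_kernel_type(self):
        self.use_cudnn = True
        self.dtype = "float16"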
@@ -18,7 +18,7 @@ import unittest
 import paddle.fluid.layers as layers
 import paddle.fluid.optimizer as optimizer
 from paddle.fluid.framework import Program, program_guard
-from paddle.fluid.memory_optimization_transpiler import memory_optimize
+from paddle.fluid.transpiler import memory_optimize

 class TestControlFlowGraph(unittest.TestCase):
...
@@ -748,7 +748,7 @@ class TestFetchOp(unittest.TestCase):
         data = fluid.layers.data(
             name='image', shape=[3, 224, 224], dtype='float32')
         label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-        out = lenet(data, class_dim=102)
+        out = Lenet(data, class_dim=102)
         loss = fluid.layers.cross_entropy(input=out, label=label)
         loss = fluid.layers.mean(loss)
...
@@ -90,20 +90,22 @@ def avg_pool3D_forward_naive(x,
 class TestPool3d_Op(OpTest):
     def setUp(self):
+        self.op_type = "pool3d"
         self.use_cudnn = False
+        self.dtype = np.float32
         self.init_test_case()
         self.init_global_pool()
-        self.init_op_type()
+        self.init_kernel_type()
         self.init_pool_type()
         self.init_ceil_mode()
         if self.global_pool:
             self.paddings = [0 for _ in range(len(self.paddings))]
-        input = np.random.random(self.shape).astype("float32")
+        input = np.random.random(self.shape).astype(self.dtype)
         output = self.pool3D_forward_naive(input, self.ksize, self.strides,
                                            self.paddings, self.global_pool,
-                                           self.ceil_mode).astype("float32")
-        self.inputs = {'X': input}
+                                           self.ceil_mode).astype(self.dtype)
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)}

         self.attrs = {
             'strides': self.strides,
@@ -116,7 +118,7 @@ class TestPool3d_Op(OpTest):
             'data_format': 'AnyLayout'  # TODO(dzhwinter): should be fixed later
         }

-        self.outputs = {'Out': output.astype('float32')}
+        self.outputs = {'Out': output}

     def testcudnn(self):
         return core.is_compiled_with_cuda() and self.use_cudnn
@@ -129,6 +131,8 @@ class TestPool3d_Op(OpTest):
         self.check_output()

     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         if self.testcudnn() and self.pool_type != "max":
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
@@ -142,8 +146,8 @@ class TestPool3d_Op(OpTest):
         self.strides = [1, 1, 1]
         self.paddings = [0, 0, 0]

-    def init_op_type(self):
-        self.op_type = "pool3d"
+    def init_kernel_type(self):
+        pass

     def init_pool_type(self):
         self.pool_type = "avg"
@@ -158,15 +162,11 @@ class TestPool3d_Op(OpTest):
 class TestCase1(TestPool3d_Op):
     def init_test_case(self):
-        self.op_type = "pool3d"
         self.shape = [2, 3, 7, 7, 7]
         self.ksize = [3, 3, 3]
         self.strides = [1, 1, 1]
         self.paddings = [0, 0, 0]

-    def init_op_type(self):
-        self.op_type = "pool3d"
-
     def init_pool_type(self):
         self.pool_type = "avg"
         self.pool3D_forward_naive = avg_pool3D_forward_naive
@@ -182,9 +182,6 @@ class TestCase2(TestPool3d_Op):
         self.strides = [1, 1, 1]
         self.paddings = [1, 1, 1]

-    def init_op_type(self):
-        self.op_type = "pool3d"
-
     def init_pool_type(self):
         self.pool_type = "avg"
         self.pool3D_forward_naive = avg_pool3D_forward_naive
@@ -194,27 +191,18 @@ class TestCase2(TestPool3d_Op):
 class TestCase3(TestPool3d_Op):
-    def init_op_type(self):
-        self.op_type = "pool3d"
-
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool3D_forward_naive = max_pool3D_forward_naive

 class TestCase4(TestCase1):
-    def init_op_type(self):
-        self.op_type = "pool3d"
-
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool3D_forward_naive = max_pool3D_forward_naive

 class TestCase5(TestCase2):
-    def init_op_type(self):
-        self.op_type = "pool3d"
-
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool3D_forward_naive = max_pool3D_forward_naive
@@ -222,39 +210,105 @@ class TestCase5(TestCase2):
 #--------------------test pool3d--------------------
 class TestCUDNNCase1(TestPool3d_Op):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "pool3d"

+class TestFP16CUDNNCase1(TestPool3d_Op):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)

 class TestCUDNNCase2(TestCase1):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "pool3d"

+class TestFP16CUDNNCase2(TestCase1):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)

 class TestCUDNNCase3(TestCase2):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "pool3d"

+class TestFP16CUDNNCase3(TestCase2):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)

 class TestCUDNNCase4(TestCase3):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "pool3d"

+class TestFP16CUDNNCase4(TestCase3):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)

 class TestCUDNNCase5(TestCase4):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "pool3d"

+class TestFP16CUDNNCase5(TestCase4):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)

 class TestCUDNNCase6(TestCase5):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "pool3d"

+class TestFP16CUDNNCase6(TestCase5):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)

 class TestCeilModeCase1(TestCUDNNCase1):
...
@@ -14,7 +14,7 @@
 import math
 import unittest
-from paddle.fluid.distribute_transpiler import split_dense_variable
+from paddle.fluid.transpiler.distribute_transpiler import split_dense_variable
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import random
...
@@ -19,10 +19,11 @@ import executor
 import data_feeder
 import contextlib
 import io
+import transpiler

 # The module name "optimizer" collides with the optimizer parameter of
 # Trainer.__init__, so import it as opt_module.
 import optimizer as opt_module
-import distribute_transpiler
+from transpiler import distribute_transpiler

 __all__ = [
     'Trainer',
@@ -172,9 +173,9 @@ class Trainer(object):
     def save_params(self, param_path):
         # reference: save_persistables in io.py
-        exe = executor.Executor(self.place)
-        io.save_persistables(
-            exe, dirname=param_path, main_program=self.startup_program)
+        with self._prog_and_scope_guard():
+            exe = executor.Executor(self.place)
+            io.save_persistables(exe, dirname=param_path)

     @staticmethod
     def _check_and_get_place(place):
...
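The save_params change runs the executor inside the trainer's program/scope guard, so save_persistables picks up the persistables of the trainer's own program rather than the startup program. A toy sketch of the context-manager guard pattern (scope_guard here is a hypothetical stand-in for Trainer._prog_and_scope_guard):

from contextlib import contextmanager

@contextmanager
def scope_guard(name):
    print("enter", name)  # the real guard swaps in the trainer's program/scope
    yield
    print("exit", name)   # ...and restores the previous ones afterwards

with scope_guard("trainer"):
    print("create executor and save persistables here")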
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from distribute_transpiler import DistributeTranspiler
from inference_transpiler import InferenceTranspiler
from memory_optimization_transpiler import memory_optimize, release_memory
from distribute_transpiler_simple import SimpleDistributeTranspiler
__all__ = [
"DistributeTranspiler", "InferenceTranspiler", "SimpleDistributeTranspiler",
"memory_optimize", "release_memory"
]
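With this package in place, call sites switch to the new import paths, as the test diffs above show; for example:

from paddle.fluid.transpiler import memory_optimize, release_memory
from paddle.fluid.transpiler import DistributeTranspiler, InferenceTranspiler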
@@ -17,9 +17,8 @@ from __future__ import print_function
 import math

 import distributed_splitter as splitter
-import framework
-from framework import Program, default_main_program, Variable, Parameter
-from . import core
+from .. import core
+from ..framework import Program, default_main_program, Variable, Parameter

 LOOKUP_TABLE_TYPE = "lookup_table"
 LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad"
@@ -135,6 +134,16 @@ def split_dense_variable(var_list,
     return blocks

+def delete_ops(block, ops):
+    try:
+        start = list(block.ops).index(ops[0])
+        end = list(block.ops).index(ops[-1])
+        [block.remove_op(start) for _ in xrange(end - start + 1)]
+    except Exception, e:
+        raise e
+    block.program.sync_with_cpp()
+
+
 class DistributeTranspiler:
     def transpile(self,
                   trainer_id,
@@ -317,7 +326,7 @@ class DistributeTranspiler:
     def get_trainer_program(self):
         # remove optimize ops and add a send op to main_program
-        self.delete_ops(self.origin_program.global_block(), self.optimize_ops)
+        delete_ops(self.origin_program.global_block(), self.optimize_ops)
         # FIXME(typhoonzero): serialize once will fix error occurs when clone.
         self.origin_program.__str__()
         return self.origin_program
@@ -601,7 +610,7 @@ class DistributeTranspiler:
                 attrs={"axis": 0})
             # delete lookup_table_op
-            self.delete_ops(program.global_block(), [op])
+            delete_ops(program.global_block(), [op])
             # break for loop
             break
@@ -1164,12 +1173,3 @@ class DistributeTranspiler:
            in_name.startswith("beta2_pow_acc"):
             return True
         return False
-
-    def delete_ops(self, block, ops):
-        try:
-            start = list(block.ops).index(ops[0])
-            end = list(block.ops).index(ops[-1])
-            [block.remove_op(start) for _ in xrange(end - start + 1)]
-        except Exception, e:
-            raise e
-        block.program.sync_with_cpp()
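delete_ops becomes a module-level helper here so both get_trainer_program and the lookup-table path above can use it. Its removal idiom is easy to misread; in plain-list terms it deletes a contiguous span by repeatedly removing at the start index:

ops = list("abcdef")
to_delete = ["c", "d"]
start = ops.index(to_delete[0])
end = ops.index(to_delete[-1])
for _ in range(end - start + 1):
    del ops[start]          # each deletion shifts the remaining items left
print(ops)  # ['a', 'b', 'e', 'f']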
@@ -12,10 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import framework
-from framework import Program, default_main_program, Parameter, Variable
-import optimizer
-from layer_helper import LayerHelper
+from ..framework import Program, default_main_program, Parameter, Variable
+from ..layer_helper import LayerHelper

 def hash_name_to_server(params_grads, pserver_endpoints):
...
@@ -13,9 +13,9 @@
 # limitations under the License.

 import numpy as np
-from framework import Program
-from executor import global_scope
-from . import core
+from .. import core
+from ..framework import Program
+from ..executor import global_scope

 class InferenceTranspiler:
...
@@ -13,11 +13,9 @@
 # limitations under the License.

 from collections import defaultdict
-import framework
-from framework import Program, default_main_program, Parameter, Variable
-import backward
-from backward import _rename_arg_
-from . import core
+from .. import core
+from ..framework import Program, default_main_program, Parameter, Variable
+from ..backward import _rename_arg_

 dtype_to_size = {
     core.VarDesc.VarType.FP16: 2,
...
@@ -68,7 +68,8 @@ packages=['paddle',
           'paddle.fluid',
           'paddle.fluid.proto',
           'paddle.fluid.proto.profiler',
-          'paddle.fluid.layers']
+          'paddle.fluid.layers',
+          'paddle.fluid.transpiler']

 if '${WITH_FLUID_ONLY}'== 'OFF':
     packages+=['paddle.proto',
...
@@ -22,7 +22,11 @@ import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
-    '--profile_path', type=str, default='', help='Input profile file name.')
+    '--profile_path',
+    type=str,
+    default='',
+    help='Input profile file name. If there are multiple files, the format '
+    'should be trainer1=file1,trainer2=file2,ps=file3')
 parser.add_argument(
     '--timeline_path', type=str, default='', help='Output timeline file name.')
 args = parser.parse_args()
@@ -108,8 +112,8 @@ class _ChromeTraceFormatter(object):

 class Timeline(object):
-    def __init__(self, profile_pb):
-        self._profile_pb = profile_pb
+    def __init__(self, profile_dict):
+        self._profile_dict = profile_dict
         self._pid = 0
         self._devices = dict()
         self._chrome_trace = _ChromeTraceFormatter()
@@ -120,35 +124,37 @@ class Timeline(object):
         return cur_pid

     def _allocate_pids(self):
-        for event in self._profile_pb.events:
-            if event.type == profiler_pb2.Event.CPU:
-                if (event.device_id, "CPU") not in self._devices:
-                    pid = self._allocate_pid()
-                    self._devices[(event.device_id, "CPU")] = pid
-                    self._chrome_trace.emit_pid("cpu:block:%d" %
-                                                (event.device_id), pid)
-            elif event.type == profiler_pb2.Event.GPUKernel:
-                if (event.device_id, "GPUKernel") not in self._devices:
-                    pid = self._allocate_pid()
-                    self._devices[(event.device_id, "GPUKernel")] = pid
-                    self._chrome_trace.emit_pid("gpu:%d" % (event.device_id),
-                                                pid)
+        for k, profile_pb in self._profile_dict.iteritems():
+            for event in profile_pb.events:
+                if event.type == profiler_pb2.Event.CPU:
+                    if (k, event.device_id, "CPU") not in self._devices:
+                        pid = self._allocate_pid()
+                        self._devices[(k, event.device_id, "CPU")] = pid
+                        self._chrome_trace.emit_pid("%s:cpu:block:%d" %
+                                                    (k, event.device_id), pid)
+                elif event.type == profiler_pb2.Event.GPUKernel:
+                    if (k, event.device_id, "GPUKernel") not in self._devices:
+                        pid = self._allocate_pid()
+                        self._devices[(k, event.device_id, "GPUKernel")] = pid
+                        self._chrome_trace.emit_pid("%s:gpu:%d" %
+                                                    (k, event.device_id), pid)

     def _allocate_events(self):
-        for event in self._profile_pb.events:
-            if event.type == profiler_pb2.Event.CPU:
-                type = "CPU"
-            elif event.type == profiler_pb2.Event.GPUKernel:
-                type = "GPUKernel"
-            pid = self._devices[(event.device_id, type)]
-            args = {'name': event.name}
-            if event.memcopy.bytes > 0:
-                args = {'mem_bytes': event.memcopy.bytes}
-            # TODO(panyx0718): Chrome tracing only handles ms. However, some
-            # ops take micro-seconds. Hence, we keep the ns here.
-            self._chrome_trace.emit_region(
-                event.start_ns, (event.end_ns - event.start_ns) / 1.0, pid,
-                event.sub_device_id, 'Op', event.name, args)
+        for k, profile_pb in self._profile_dict.iteritems():
+            for event in profile_pb.events:
+                if event.type == profiler_pb2.Event.CPU:
+                    type = "CPU"
+                elif event.type == profiler_pb2.Event.GPUKernel:
+                    type = "GPUKernel"
+                pid = self._devices[(k, event.device_id, type)]
+                args = {'name': event.name}
+                if event.memcopy.bytes > 0:
+                    args = {'mem_bytes': event.memcopy.bytes}
+                # TODO(panyx0718): Chrome tracing only handles ms. However,
+                # some ops take micro-seconds. Hence, we keep the ns here.
+                self._chrome_trace.emit_region(
+                    event.start_ns, (event.end_ns - event.start_ns) / 1.0,
+                    pid, event.sub_device_id, 'Op', event.name, args)

     def generate_chrome_trace(self):
         self._allocate_pids()
@@ -163,11 +169,23 @@ timeline_path = '/tmp/timeline'
 if args.timeline_path:
     timeline_path = args.timeline_path

-with open(profile_path, 'r') as f:
-    profile_s = f.read()
-    profile_pb = profiler_pb2.Profile()
-    profile_pb.ParseFromString(profile_s)
-
-tl = Timeline(profile_pb)
+profile_paths = profile_path.split(',')
+profile_dict = dict()
+if len(profile_paths) == 1:
+    with open(profile_path, 'r') as f:
+        profile_s = f.read()
+        profile_pb = profiler_pb2.Profile()
+        profile_pb.ParseFromString(profile_s)
+    profile_dict['trainer'] = profile_pb
+else:
+    for profile_path in profile_paths:
+        k, v = profile_path.split('=')
+        with open(v, 'r') as f:
+            profile_s = f.read()
+            profile_pb = profiler_pb2.Profile()
+            profile_pb.ParseFromString(profile_s)
+        profile_dict[k] = profile_pb
+
+tl = Timeline(profile_dict)
 with open(timeline_path, 'w') as f:
     f.write(tl.generate_chrome_trace())
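With the new format, one timeline can merge profiles from several processes. A tiny sketch of how the name=file pairs land in profile_dict (the paths are hypothetical):

profile_path = "trainer0=/tmp/profile_t0,ps=/tmp/profile_ps"
profile_dict = {}
for item in profile_path.split(','):
    k, v = item.split('=')
    profile_dict[k] = v  # timeline.py reads v and parses it into a Profile pb
print(profile_dict)  # {'trainer0': '/tmp/profile_t0', 'ps': '/tmp/profile_ps'}

Each key then prefixes the pid labels (for example "trainer0:gpu:0"), so events from different processes stay distinguishable in chrome://tracing.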