提交 577c19b2 编写于 作者: Y yuyang18

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/support_op_role

...@@ -19,8 +19,9 @@ ...@@ -19,8 +19,9 @@
---------------- ----------------
PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安装编译依赖的步骤,可选的不同编译环境Docker镜像 PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安装编译依赖的步骤,可选的不同编译环境Docker镜像
可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ 找到。或者 可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ 找到,您也可以
参考下述可选步骤,从源码中构建用于编译PaddlePaddle的Docker镜像。 在 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/tools/manylinux1/>`_ 找到 paddle_manylinux_devel
镜像的编译以及使用方法。或者参考下述可选步骤,从源码中构建用于编译PaddlePaddle的Docker镜像。
如果您选择不使用Docker镜像,则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。 如果您选择不使用Docker镜像,则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。
......
...@@ -22,6 +22,8 @@ How To Build ...@@ -22,6 +22,8 @@ How To Build
You need to use Docker to build PaddlePaddle You need to use Docker to build PaddlePaddle
to avoid installing dependencies by yourself. We have several pre-built to avoid installing dependencies by yourself. We have several pre-built
Docker images `here <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ , Docker images `here <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ ,
you can also find out how to build and use the paddle_manylinux_devel Docker image
`here <https://github.com/PaddlePaddle/Paddle/tree/develop/tools/manylinux1/>`_ .
Or you can build your own image from source as the optional step below: Or you can build your own image from source as the optional step below:
.. code-block:: bash .. code-block:: bash
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include <glog/logging.h> #include <glog/logging.h>
#include <sstream> #include <sstream>
#include <string>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <string>
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
namespace paddle { namespace paddle {
...@@ -58,8 +59,8 @@ class EngineBase { ...@@ -58,8 +59,8 @@ class EngineBase {
struct Buffer { struct Buffer {
void* buffer{nullptr}; // buffer should be allocated only once. void* buffer{nullptr}; // buffer should be allocated only once.
int max_size; // buffer allocated space. size_t max_size; // buffer allocated space.
int size; // data size. size_t size; // data size.
DeviceType device{DeviceType::UNK}; // tells which device this buffer is on. DeviceType device{DeviceType::UNK}; // tells which device this buffer is on.
}; };
......
nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto) nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto)
nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine) nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
add_subdirectory(convert) add_subdirectory(convert)
nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc op_converter.h DEPS ${FLUID_CORE_MODULES}) nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES})
nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc io_converter.cc
DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine) DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine)
nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor) nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
...@@ -23,26 +23,42 @@ namespace tensorrt { ...@@ -23,26 +23,42 @@ namespace tensorrt {
using platform::is_gpu_place; using platform::is_gpu_place;
using platform::is_cpu_place; using platform::is_cpu_place;
class DefaultInputConverter : public EngineInputConverter { class DefaultIOConverter : public EngineIOConverter {
public: public:
DefaultInputConverter() {} DefaultIOConverter() {}
// NOTE out is GPU memory. // NOTE out is GPU memory.
virtual void operator()(const LoDTensor& in, void* out, virtual void operator()(const LoDTensor& in, void* out,
size_t max_size) override { size_t max_size) override {
PADDLE_ENFORCE(out != nullptr); PADDLE_ENFORCE(out != nullptr);
PADDLE_ENFORCE_LE(in.memory_size(), max_size); PADDLE_ENFORCE(stream_ != nullptr);
const auto& place = in.place(); const auto& place = in.place();
size_t size = in.memory_size();
PADDLE_ENFORCE_LE(size, max_size);
if (is_cpu_place(place)) { if (is_cpu_place(place)) {
PADDLE_ENFORCE(stream_ != nullptr); PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out, in.data<float>(), size,
PADDLE_ENFORCE_EQ(0,
cudaMemcpyAsync(out, in.data<float>(), in.memory_size(),
cudaMemcpyHostToDevice, *stream_)); cudaMemcpyHostToDevice, *stream_));
} else if (is_gpu_place(place)) { } else if (is_gpu_place(place)) {
PADDLE_ENFORCE_EQ(0, PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out, in.data<float>(), size,
cudaMemcpyAsync(out, in.data<float>(), in.memory_size(), cudaMemcpyDeviceToDevice, *stream_));
cudaMemcpyHostToHost, *stream_)); } else {
PADDLE_THROW("Unknown device for converter");
}
cudaStreamSynchronize(*stream_);
}
// NOTE in is GPU memory.
virtual void operator()(const void* in, LoDTensor* out,
size_t max_size) override {
PADDLE_ENFORCE(in != nullptr);
PADDLE_ENFORCE(stream_ != nullptr);
const auto& place = out->place();
size_t size = out->memory_size();
PADDLE_ENFORCE_LE(size, max_size);
if (is_cpu_place(place)) {
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out->data<float>(), in, size,
cudaMemcpyDeviceToHost, *stream_));
} else if (is_gpu_place(place)) {
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out->data<float>(), in, size,
cudaMemcpyDeviceToDevice, *stream_));
} else { } else {
PADDLE_THROW("Unknown device for converter"); PADDLE_THROW("Unknown device for converter");
} }
...@@ -50,7 +66,8 @@ class DefaultInputConverter : public EngineInputConverter { ...@@ -50,7 +66,8 @@ class DefaultInputConverter : public EngineInputConverter {
} }
}; };
REGISTER_TENSORRT_INPUT_CONVERTER(default, DefaultInputConverter); // fluid LodTensor <-> tensorrt ITensor
REGISTER_TENSORRT_IO_CONVERTER(default, DefaultIOConverter);
} // namespace tensorrt } // namespace tensorrt
} // namespace inference } // namespace inference
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <string>
#include <unordered_map> #include <unordered_map>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/inference/utils/singleton.h"
...@@ -25,43 +26,57 @@ namespace tensorrt { ...@@ -25,43 +26,57 @@ namespace tensorrt {
using framework::LoDTensor; using framework::LoDTensor;
/* /*
* Convert Input from Fluid to an Engine. * Convert Input from Fluid to TensorRT Engine.
* TensorRT's ITensor follows row major, NCHW. Fluid is also row major, so in * Convert Output from TensorRT Engine to Fluid.
* most cases just need to copy the data. *
* Note that TensorRT's ITensor follows row major, NCHW. Fluid is also row
* major,
* so in the default case just need to copy the data.
*/ */
class EngineInputConverter { class EngineIOConverter {
public: public:
EngineInputConverter() {} EngineIOConverter() {}
virtual void operator()(const LoDTensor& in, void* out, size_t max_size) {} virtual void operator()(const LoDTensor& in, void* out, size_t max_size) {}
virtual void operator()(const void* in, LoDTensor* out, size_t max_size) {}
void SetStream(cudaStream_t* stream) { stream_ = stream; } void SetStream(cudaStream_t* stream) { stream_ = stream; }
static void Run(const std::string& in_op_type, const LoDTensor& in, void* out, static void ConvertInput(const std::string& op_type, const LoDTensor& in,
size_t max_size, cudaStream_t* stream) { void* out, size_t max_size, cudaStream_t* stream) {
PADDLE_ENFORCE(stream != nullptr); PADDLE_ENFORCE(stream != nullptr);
auto* converter = Registry<EngineInputConverter>::Lookup( auto* converter = Registry<EngineIOConverter>::Lookup(
in_op_type, "default" /* default_type */); op_type, "default" /* default_type */);
PADDLE_ENFORCE_NOT_NULL(converter); PADDLE_ENFORCE_NOT_NULL(converter);
converter->SetStream(stream); converter->SetStream(stream);
(*converter)(in, out, max_size); (*converter)(in, out, max_size);
} }
virtual ~EngineInputConverter() {} static void ConvertOutput(const std::string& op_type, const void* in,
LoDTensor* out, size_t max_size,
cudaStream_t* stream) {
PADDLE_ENFORCE(stream != nullptr);
auto* converter = Registry<EngineIOConverter>::Lookup(
op_type, "default" /* default_type */);
PADDLE_ENFORCE_NOT_NULL(converter);
converter->SetStream(stream);
(*converter)(in, out, max_size);
}
virtual ~EngineIOConverter() {}
protected: protected:
cudaStream_t* stream_{nullptr}; cudaStream_t* stream_{nullptr};
}; };
/*
 * Registers an IO converter class under an op type name.
 *
 * op_type__   : bare identifier; it is stringified (#op_type__) to form the
 *               key passed to Registry<EngineIOConverter>::Register, and it is
 *               token-pasted into the helper struct/variable names.
 * Converter__ : the converter class handed to Register<> as template argument.
 *
 * The expansion defines a helper struct whose constructor performs the
 * registration, followed by one object of that struct, so the registration
 * runs when that object is constructed.
 * NOTE(review): Registry and EngineIOConverter are referenced unqualified, so
 * the macro presumably has to be used where both names are visible - confirm.
 */
#define REGISTER_TENSORRT_IO_CONVERTER(op_type__, Converter__) \
struct trt_io_##op_type__##_converter { \
trt_io_##op_type__##_converter() { \
Registry<EngineIOConverter>::Register<Converter__>(#op_type__); \
} \
}; \
trt_io_##op_type__##_converter trt_io_##op_type__##_converter__;
} // namespace tensorrt } // namespace tensorrt
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
#define REGISTER_TENSORRT_INPUT_CONVERTER(in_op_type__, Converter__) \
struct trt_input_##in_op_type__##_converter { \
trt_input_##in_op_type__##_converter() { \
::paddle::inference::Registry<EngineInputConverter>::Register< \
Converter__>(#in_op_type__); \
} \
}; \
trt_input_##in_op_type__##_converter trt_input_##in_op_type__##_converter__;
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/tensorrt/convert/io_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
...@@ -26,7 +27,7 @@ namespace paddle { ...@@ -26,7 +27,7 @@ namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
void Compare(float input, float expect) { void Compare(const std::string op_type, float input, float expect) {
framework::Scope scope; framework::Scope scope;
platform::CUDAPlace place; platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place); platform::CUDADeviceContext ctx(place);
...@@ -35,6 +36,7 @@ void Compare(float input, float expect) { ...@@ -35,6 +36,7 @@ void Compare(float input, float expect) {
auto x_var = scope.Var("X"); auto x_var = scope.Var("X");
auto x_tensor = x_var->GetMutable<framework::LoDTensor>(); auto x_tensor = x_var->GetMutable<framework::LoDTensor>();
x_tensor->Resize({1, 1}); x_tensor->Resize({1, 1});
x_tensor->mutable_data<float>(place);
std::vector<float> init; std::vector<float> init;
init.push_back(input); init.push_back(input);
framework::TensorFromVector(init, ctx, x_tensor); framework::TensorFromVector(init, ctx, x_tensor);
...@@ -45,14 +47,15 @@ void Compare(float input, float expect) { ...@@ -45,14 +47,15 @@ void Compare(float input, float expect) {
out_tensor->mutable_data<float>(place); out_tensor->mutable_data<float>(place);
framework::OpDesc op_desc; framework::OpDesc op_desc;
op_desc.SetType("relu"); op_desc.SetType(op_type);
op_desc.SetInput("X", {"X"}); op_desc.SetInput("X", {"X"});
op_desc.SetOutput("Out", {"Out"}); op_desc.SetOutput("Out", {"Out"});
auto relu_op = framework::OpRegistry::CreateOp(*op_desc.Proto()); auto op = framework::OpRegistry::CreateOp(*op_desc.Proto());
// run fluid op // run fluid op
relu_op->Run(scope, place); op->Run(scope, place);
// get fluid output
std::vector<float> out1; std::vector<float> out1;
framework::TensorToVector(*out_tensor, ctx, &out1); framework::TensorToVector(*out_tensor, ctx, &out1);
...@@ -63,21 +66,28 @@ void Compare(float input, float expect) { ...@@ -63,21 +66,28 @@ void Compare(float input, float expect) {
engine->InitNetwork(); engine->InitNetwork();
engine->DeclareInput("X", nvinfer1::DataType::kFLOAT, engine->DeclareInput("X", nvinfer1::DataType::kFLOAT,
nvinfer1::DimsCHW{1, 1, 1}); nvinfer1::DimsCHW{1, 1, 1});
// convert op
OpConverter op_converter; OpConverter op_converter;
op_converter.ConvertOp(*op_desc.Proto(), engine); op_converter.ConvertOp(*op_desc.Proto(), engine);
engine->DeclareOutput("Out"); engine->DeclareOutput("Out");
engine->FreezeNetwork(); engine->FreezeNetwork();
engine->SetInputFromCPU("X", &input, 1 * sizeof(float));
// run tensorrt op // convert LoDTensor to ITensor
size_t size = x_tensor->memory_size();
EngineIOConverter::ConvertInput(op_type, *x_tensor,
engine->buffer("X").buffer, size, &stream);
// run tensorrt op
engine->Execute(1); engine->Execute(1);
// convert ITensor to LoDTensor
float out2; EngineIOConverter::ConvertOutput(op_type, engine->buffer("Out").buffer,
engine->GetOutputInCPU("Out", &out2, 1 * sizeof(float)); out_tensor, size, &stream);
// get tensorrt output
ASSERT_EQ(out1[0], out2); std::vector<float> out2;
framework::TensorToVector(*out_tensor, ctx, &out2);
// compare
ASSERT_EQ(out1[0], out2[0]);
ASSERT_EQ(out1[0], expect); ASSERT_EQ(out1[0], expect);
delete engine; delete engine;
...@@ -85,8 +95,8 @@ void Compare(float input, float expect) { ...@@ -85,8 +95,8 @@ void Compare(float input, float expect) {
} }
TEST(OpConverter, ConvertRelu) { TEST(OpConverter, ConvertRelu) {
Compare(1, 1); // relu(1) = 1 Compare("relu", 1, 1); // relu(1) = 1
Compare(-5, 0); // relu(-5) = 0 Compare("relu", -5, 0); // relu(-5) = 0
} }
} // namespace tensorrt } // namespace tensorrt
......
...@@ -12,40 +12,63 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,40 +12,63 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/tensorrt/convert/io_converter.h" #include "paddle/fluid/inference/tensorrt/convert/io_converter.h"
#include <gtest/gtest.h>
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
class EngineInputConverterTester : public ::testing::Test { void IOConverterTester(const platform::DeviceContext& ctx) {
public: cudaStream_t stream;
void SetUp() override { tensor.Resize({10, 10}); } ASSERT_EQ(0, cudaStreamCreate(&stream));
framework::LoDTensor tensor; // init fluid in_tensor
}; framework::LoDTensor in_tensor;
in_tensor.Resize({10, 10});
auto place = ctx.GetPlace();
in_tensor.mutable_data<float>(place);
std::vector<float> init;
for (int64_t i = 0; i < 10 * 10; ++i) {
init.push_back(i);
}
framework::TensorFromVector(init, ctx, &in_tensor);
TEST_F(EngineInputConverterTester, DefaultCPU) { // init tensorrt buffer
void* buffer; void* buffer;
tensor.mutable_data<float>(platform::CPUPlace()); size_t size = in_tensor.memory_size();
ASSERT_EQ(cudaMalloc(&buffer, tensor.memory_size()), 0); ASSERT_EQ(cudaMalloc(&buffer, size), 0);
cudaStream_t stream; // convert fluid in_tensor to tensorrt buffer
EngineInputConverter::Run("test", tensor, buffer, tensor.memory_size(), EngineIOConverter::ConvertInput("test", in_tensor, buffer, size, &stream);
&stream);
// convert tensorrt buffer to fluid out_tensor
framework::LoDTensor out_tensor;
out_tensor.Resize({10, 10});
out_tensor.mutable_data<float>(place);
EngineIOConverter::ConvertOutput("test", buffer, &out_tensor, size, &stream);
// compare in_tensor and out_tensor
std::vector<float> result;
framework::TensorToVector(out_tensor, ctx, &result);
EXPECT_EQ(init.size(), result.size());
for (size_t i = 0; i < init.size(); i++) {
EXPECT_EQ(init[i], result[i]);
}
cudaStreamDestroy(stream);
} }
TEST_F(EngineInputConverterTester, DefaultGPU) { TEST(EngineIOConverterTester, DefaultCPU) {
void* buffer; platform::CPUPlace place;
tensor.mutable_data<float>(platform::CUDAPlace()); platform::CPUDeviceContext ctx(place);
ASSERT_EQ(cudaMalloc(&buffer, tensor.memory_size()), 0); IOConverterTester(ctx);
}
cudaStream_t stream; TEST(EngineIOConverterTester, DefaultGPU) {
EngineInputConverter::Run("test", tensor, buffer, tensor.memory_size(), platform::CUDAPlace place;
&stream); platform::CUDADeviceContext ctx(place);
IOConverterTester(ctx);
} }
} // namespace tensorrt } // namespace tensorrt
......
...@@ -170,7 +170,7 @@ def train(word_dict, ...@@ -170,7 +170,7 @@ def train(word_dict,
assert save_dirname is None assert save_dirname is None
adagrad = fluid.optimizer.Adagrad(learning_rate=0.002) adagrad = fluid.optimizer.Adagrad(learning_rate=0.002)
optimize_ops, params_grads = adagrad.minimize(cost) adagrad.minimize(cost)
train_data = paddle.batch( train_data = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
......
...@@ -33,7 +33,7 @@ def train(use_cuda, save_dirname, is_local): ...@@ -33,7 +33,7 @@ def train(use_cuda, save_dirname, is_local):
avg_cost = fluid.layers.mean(cost) avg_cost = fluid.layers.mean(cost)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) sgd_optimizer.minimize(avg_cost)
BATCH_SIZE = 20 BATCH_SIZE = 20
......
...@@ -125,7 +125,7 @@ def train(net_type, use_cuda, save_dirname, is_local): ...@@ -125,7 +125,7 @@ def train(net_type, use_cuda, save_dirname, is_local):
test_program = fluid.default_main_program().clone(for_test=True) test_program = fluid.default_main_program().clone(for_test=True)
optimizer = fluid.optimizer.Adam(learning_rate=0.001) optimizer = fluid.optimizer.Adam(learning_rate=0.001)
optimize_ops, params_grads = optimizer.minimize(avg_cost) optimizer.minimize(avg_cost)
BATCH_SIZE = 128 BATCH_SIZE = 128
PASS_NUM = 1 PASS_NUM = 1
......
...@@ -175,7 +175,7 @@ def train(use_cuda, save_dirname=None, is_local=True): ...@@ -175,7 +175,7 @@ def train(use_cuda, save_dirname=None, is_local=True):
decay_steps=100000, decay_steps=100000,
decay_rate=0.5, decay_rate=0.5,
staircase=True)) staircase=True))
optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) sgd_optimizer.minimize(avg_cost)
# TODO(qiao) # TODO(qiao)
# add dependency track and move this config before optimizer # add dependency track and move this config before optimizer
......
...@@ -185,7 +185,7 @@ def train_main(use_cuda, is_sparse, is_local=True): ...@@ -185,7 +185,7 @@ def train_main(use_cuda, is_sparse, is_local=True):
learning_rate=1e-4, learning_rate=1e-4,
regularization=fluid.regularizer.L2DecayRegularizer( regularization=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.1)) regularization_coeff=0.1))
optimize_ops, params_grads = optimizer.minimize(avg_cost) optimizer.minimize(avg_cost)
train_data = paddle.batch( train_data = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
......
...@@ -95,7 +95,7 @@ def train(nn_type, ...@@ -95,7 +95,7 @@ def train(nn_type,
test_program = fluid.default_main_program().clone(for_test=True) test_program = fluid.default_main_program().clone(for_test=True)
optimizer = fluid.optimizer.Adam(learning_rate=0.001) optimizer = fluid.optimizer.Adam(learning_rate=0.001)
optimize_ops, params_grads = optimizer.minimize(avg_loss) optimizer.minimize(avg_loss)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
......
...@@ -160,7 +160,7 @@ def train(use_cuda, save_dirname, is_local=True): ...@@ -160,7 +160,7 @@ def train(use_cuda, save_dirname, is_local=True):
test_program = fluid.default_main_program().clone(for_test=True) test_program = fluid.default_main_program().clone(for_test=True)
sgd_optimizer = SGDOptimizer(learning_rate=0.2) sgd_optimizer = SGDOptimizer(learning_rate=0.2)
optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) sgd_optimizer.minimize(avg_cost)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
......
...@@ -101,7 +101,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True): ...@@ -101,7 +101,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
avg_cost = fluid.layers.mean(pd()) avg_cost = fluid.layers.mean(pd())
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) sgd_optimizer.minimize(avg_cost)
train_reader = paddle.batch( train_reader = paddle.batch(
paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.layers as layers
from paddle.fluid.transpiler.distribute_transpiler import delete_ops
import numpy
class TestDistTranspiler(unittest.TestCase):
def setUp(self):
self.trainer_id = 0
self.trainers = 2
self.pservers = 2
self.pserver_eps = "127.0.0.1:6174,127.0.0.1:6175"
self.current_pserver_ep = "127.0.0.1:6174"
def net_conf(self):
x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
y_predict = fluid.layers.fc(input=x,
size=1000,
act=None,
param_attr=fluid.ParamAttr(name='fc_w'))
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
return optimize_ops, params_grads
def test_transpiler(self):
trainer = self.get_trainer()
pserver, startup = self.get_pserver(self.current_pserver_ep)
self.assertEqual([op.type for op in trainer.global_block().ops],
self.get_expect_trainer_ops())
self.assertEqual(len(pserver.blocks), 3)
# block0: listen_and_serv
self.assertEqual([op.type for op in pserver.blocks[0].ops],
["listen_and_serv"])
# block2: optimize pass
self.assertEqual([op.type for op in pserver.blocks[1].ops],
["sum", "scale", "sgd"])
# confirm startup program
self.assertEqual([op.type for op in startup.global_block().ops], [
"fill_constant", "fill_constant", "uniform_random", "uniform_random"
])
# the variable #fc_w will be split into two blocks
fc_w_var = startup.global_block().var("fc_w.block1")
self.assertEqual(fc_w_var.shape, (500, 1000))
def get_main_program(self):
main = fluid.Program()
with fluid.program_guard(main):
self.net_conf()
return main
def get_expect_trainer_ops(self):
trainer = fluid.Program()
with fluid.program_guard(trainer):
optimize_ops, params_grads = self.net_conf()
delete_ops(trainer.global_block(), optimize_ops)
return [op.type for op in trainer.global_block().ops
] + ["split_byref", "send", "concat"]
def get_trainer(self):
return self._transpiler_instance().get_trainer_program()
def get_pserver(self, ep):
t = self._transpiler_instance()
pserver = t.get_pserver_program(ep)
startup = t.get_startup_program(ep, pserver)
return pserver, startup
def _transpiler_instance(self):
main = self.get_main_program()
t = fluid.DistributeTranspiler()
t.transpile(
self.trainer_id,
program=main,
pservers=self.pserver_eps,
trainers=self.trainers)
return t
if __name__ == "__main__":
unittest.main()
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from distribute_transpiler import DistributeTranspiler from distribute_transpiler import DistributeTranspiler
from inference_transpiler import InferenceTranspiler from inference_transpiler import InferenceTranspiler
from memory_optimization_transpiler import memory_optimize, release_memory from memory_optimization_transpiler import memory_optimize, release_memory
......
...@@ -17,7 +17,7 @@ from __future__ import print_function ...@@ -17,7 +17,7 @@ from __future__ import print_function
import math import math
import distributed_splitter as splitter import distributed_splitter as splitter
from .. import core from .. import core, framework
from ..framework import Program, default_main_program, \ from ..framework import Program, default_main_program, \
default_startup_program, \ default_startup_program, \
Variable, Parameter, grad_var_name Variable, Parameter, grad_var_name
...@@ -417,7 +417,7 @@ class DistributeTranspiler: ...@@ -417,7 +417,7 @@ class DistributeTranspiler:
def __append_optimize_op__(op, block, grad_to_block_id): def __append_optimize_op__(op, block, grad_to_block_id):
if self._is_opt_op(op): if self._is_opt_op(op):
self._append_pserver_ops(block, op, endpoint, grad_to_block_id, self._append_pserver_ops(block, op, endpoint, grad_to_block_id,
default_main_program()) self.origin_program)
else: else:
self._append_pserver_non_opt_ops(block, op) self._append_pserver_non_opt_ops(block, op)
......
...@@ -28,3 +28,38 @@ git clone https://github.com/paddlepaddle/paddle ...@@ -28,3 +28,38 @@ git clone https://github.com/paddlepaddle/paddle
cd paddle/tools/manylinux1 cd paddle/tools/manylinux1
REPO=[yourrepo] ./build_all.sh REPO=[yourrepo] ./build_all.sh
``` ```
## Build PaddlePaddle for the different Python ABIs
Choose one of the following Python ABIs and set the corresponding environment variables.
- cp27-cp27m
```bash
export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:}
export PATH=/opt/python/cp27-cp27m/bin/:${PATH}
export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python
-DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so"
```
- cp27-cp27mu
```bash
export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:}
export PATH=/opt/python/cp27-cp27mu/bin/:${PATH}
export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
-DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"
```
Then add `PYTHON_FLAGS` to your cmake flags:
```bash
cmake .. \
${PYTHON_FLAGS} \
-DWITH_GPU=OFF \
...
```
You can find more details about the cmake flags [here](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/build_from_source_en.html#appendix-build-options).
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册