提交 c806eeff 编写于 作者: K Kavya Srinet

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into doc

# Advbox
Advbox is a Python toolbox to create adversarial examples that fool neural networks. It requires Python and paddle.
## How to use
1. train a model and save it's parameters. (like fluid_mnist.py)
2. load the parameters which is trained in step1, then reconstruct the model.(like mnist_tutorial_fgsm.py)
3. use advbox to generate the adversarial sample.
# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
A set of tools for generating adversarial example on paddle platform
"""
"""
The base model of the model.
"""
from abc import ABCMeta, abstractmethod
class Attack(object):
"""
Abstract base class for adversarial attacks. `Attack` represent an adversarial attack
which search an adversarial example. subclass should implement the _apply() method.
Args:
model(Model): an instance of the class advbox.base.Model.
"""
__metaclass__ = ABCMeta
def __init__(self, model):
self.model = model
def __call__(self, image_label):
"""
Generate the adversarial sample.
Args:
image_label(list): The image and label tuple list with one element.
"""
adv_img = self._apply(image_label)
return adv_img
@abstractmethod
def _apply(self, image_label):
"""
Search an adversarial example.
Args:
image_batch(list): The image and label tuple list with one element.
"""
raise NotImplementedError
"""
This module provide the attack method for FGSM's implement.
"""
from __future__ import division
import numpy as np
from collections import Iterable
from .base import Attack
class GradientSignAttack(Attack):
"""
This attack was originally implemented by Goodfellow et al. (2015) with the
infinity norm (and is known as the "Fast Gradient Sign Method"). This is therefore called
the Fast Gradient Method.
Paper link: https://arxiv.org/abs/1412.6572
"""
def _apply(self, image_label, epsilons=1000):
assert len(image_label) == 1
pre_label = np.argmax(self.model.predict(image_label))
min_, max_ = self.model.bounds()
gradient = self.model.gradient(image_label)
gradient_sign = np.sign(gradient) * (max_ - min_)
if not isinstance(epsilons, Iterable):
epsilons = np.linspace(0, 1, num=epsilons + 1)
for epsilon in epsilons:
adv_img = image_label[0][0].reshape(
gradient_sign.shape) + epsilon * gradient_sign
adv_img = np.clip(adv_img, min_, max_)
adv_label = np.argmax(self.model.predict([(adv_img, 0)]))
if pre_label != adv_label:
return adv_img
FGSM = GradientSignAttack
# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Paddle model for target of attack
"""
"""
The base model of the model.
"""
from abc import ABCMeta
import abc
abstractmethod = abc.abstractmethod
class Model(object):
"""
Base class of model to provide attack.
Args:
bounds(tuple): The lower and upper bound for the image pixel.
channel_axis(int): The index of the axis that represents the color channel.
preprocess(tuple): Two element tuple used to preprocess the input. First
substract the first element, then divide the second element.
"""
__metaclass__ = ABCMeta
def __init__(self, bounds, channel_axis, preprocess=None):
assert len(bounds) == 2
assert channel_axis in [0, 1, 2, 3]
if preprocess is None:
preprocess = (0, 1)
self._bounds = bounds
self._channel_axis = channel_axis
self._preprocess = preprocess
def bounds(self):
"""
Return the upper and lower bounds of the model.
"""
return self._bounds
def channel_axis(self):
"""
Return the channel axis of the model.
"""
return self._channel_axis
def _process_input(self, input_):
res = input_
sub, div = self._preprocess
if sub != 0:
res = input_ - sub
assert div != 0
if div != 1:
res /= div
return res
@abstractmethod
def predict(self, image_batch):
"""
Calculate the prediction of the image batch.
Args:
image_batch(numpy.ndarray): image batch of shape (batch_size, height, width, channels).
Return:
numpy.ndarray: predictions of the images with shape (batch_size, num_of_classes).
"""
raise NotImplementedError
@abstractmethod
def num_classes(self):
"""
Determine the number of the classes
Return:
int: the number of the classes
"""
raise NotImplementedError
@abstractmethod
def gradient(self, image_batch):
"""
Calculate the gradient of the cross-entropy loss w.r.t the image.
Args:
image_batch(list): The image and label tuple list.
Return:
numpy.ndarray: gradient of the cross-entropy loss w.r.t the image with
the shape (height, width, channel).
"""
raise NotImplementedError
from __future__ import absolute_import
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
from paddle.v2.fluid.framework import program_guard
from .base import Model
class PaddleModel(Model):
"""
Create a PaddleModel instance.
When you need to generate a adversarial sample, you should construct an instance of PaddleModel.
Args:
program(paddle.v2.fluid.framework.Program): The program of the model which generate the adversarial sample.
input_name(string): The name of the input.
logits_name(string): The name of the logits.
predict_name(string): The name of the predict.
cost_name(string): The name of the loss in the program.
"""
def __init__(self,
program,
input_name,
logits_name,
predict_name,
cost_name,
bounds,
channel_axis=3,
preprocess=None):
super(PaddleModel, self).__init__(
bounds=bounds, channel_axis=channel_axis, preprocess=preprocess)
if preprocess is None:
preprocess = (0, 1)
self._program = program
self._place = fluid.CPUPlace()
self._exe = fluid.Executor(self._place)
self._input_name = input_name
self._logits_name = logits_name
self._predict_name = predict_name
self._cost_name = cost_name
# gradient
loss = self._program.block(0).var(self._cost_name)
param_grads = fluid.backward.append_backward(
loss, parameter_list=[self._input_name])
self._gradient = dict(param_grads)[self._input_name]
def predict(self, image_batch):
"""
Predict the label of the image_batch.
Args:
image_batch(list): The image and label tuple list.
Return:
numpy.ndarray: predictions of the images with shape (batch_size, num_of_classes).
"""
feeder = fluid.DataFeeder(
feed_list=[self._input_name, self._logits_name],
place=self._place,
program=self._program)
predict_var = self._program.block(0).var(self._predict_name)
predict = self._exe.run(self._program,
feed=feeder.feed(image_batch),
fetch_list=[predict_var])
return predict
def num_classes(self):
"""
Calculate the number of classes of the output label.
Return:
int: the number of classes
"""
predict_var = self._program.block(0).var(self._predict_name)
assert len(predict_var.shape) == 2
return predict_var.shape[1]
def gradient(self, image_batch):
"""
Calculate the gradient of the loss w.r.t the input.
Args:
image_batch(list): The image and label tuple list.
Return:
list: The list of the gradient of the image.
"""
feeder = fluid.DataFeeder(
feed_list=[self._input_name, self._logits_name],
place=self._place,
program=self._program)
grad, = self._exe.run(self._program,
feed=feeder.feed(image_batch),
fetch_list=[self._gradient])
return grad
"""
CNN on mnist data using fluid api of paddlepaddle
"""
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
def mnist_cnn_model(img):
"""
Mnist cnn model
Args:
img(Varaible): the input image to be recognized
Returns:
Variable: the label prediction
"""
conv_pool_1 = fluid.nets.simple_img_conv_pool(
input=img,
num_filters=20,
filter_size=5,
pool_size=2,
pool_stride=2,
act='relu')
conv_pool_2 = fluid.nets.simple_img_conv_pool(
input=conv_pool_1,
num_filters=50,
filter_size=5,
pool_size=2,
pool_stride=2,
act='relu')
logits = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
return logits
def main():
"""
Train the cnn model on mnist datasets
"""
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
logits = mnist_cnn_model(img)
cost = fluid.layers.cross_entropy(input=logits, label=label)
avg_cost = fluid.layers.mean(x=cost)
optimizer = fluid.optimizer.Adam(learning_rate=0.01)
optimizer.minimize(avg_cost)
accuracy = fluid.evaluator.Accuracy(input=logits, label=label)
BATCH_SIZE = 50
PASS_NUM = 3
ACC_THRESHOLD = 0.98
LOSS_THRESHOLD = 10.0
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.mnist.train(), buf_size=500),
batch_size=BATCH_SIZE)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
exe.run(fluid.default_startup_program())
for pass_id in range(PASS_NUM):
accuracy.reset(exe)
for data in train_reader():
loss, acc = exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost] + accuracy.metrics)
pass_acc = accuracy.eval(exe)
print("pass_id=" + str(pass_id) + " acc=" + str(acc) + " pass_acc="
+ str(pass_acc))
if loss < LOSS_THRESHOLD and pass_acc > ACC_THRESHOLD:
break
pass_acc = accuracy.eval(exe)
print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc))
fluid.io.save_params(
exe, dirname='./mnist', main_program=fluid.default_main_program())
print('train mnist done')
if __name__ == '__main__':
main()
"""
FGSM demos on mnist using advbox tool.
"""
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import matplotlib.pyplot as plt
import numpy as np
from advbox.models.paddle import PaddleModel
from advbox.attacks.gradientsign import GradientSignAttack
def cnn_model(img):
"""
Mnist cnn model
Args:
img(Varaible): the input image to be recognized
Returns:
Variable: the label prediction
"""
#conv1 = fluid.nets.conv2d()
conv_pool_1 = fluid.nets.simple_img_conv_pool(
input=img,
num_filters=20,
filter_size=5,
pool_size=2,
pool_stride=2,
act='relu')
conv_pool_2 = fluid.nets.simple_img_conv_pool(
input=conv_pool_1,
num_filters=50,
filter_size=5,
pool_size=2,
pool_stride=2,
act='relu')
logits = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
return logits
def main():
"""
Advbox demo which demonstrate how to use advbox.
"""
IMG_NAME = 'img'
LABEL_NAME = 'label'
img = fluid.layers.data(name=IMG_NAME, shape=[1, 28, 28], dtype='float32')
# gradient should flow
img.stop_gradient = False
label = fluid.layers.data(name=LABEL_NAME, shape=[1], dtype='int64')
logits = cnn_model(img)
cost = fluid.layers.cross_entropy(input=logits, label=label)
avg_cost = fluid.layers.mean(x=cost)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
BATCH_SIZE = 1
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.mnist.train(), buf_size=500),
batch_size=BATCH_SIZE)
feeder = fluid.DataFeeder(
feed_list=[IMG_NAME, LABEL_NAME],
place=place,
program=fluid.default_main_program())
fluid.io.load_params(
exe, "./mnist/", main_program=fluid.default_main_program())
# advbox demo
m = PaddleModel(fluid.default_main_program(), IMG_NAME, LABEL_NAME,
logits.name, avg_cost.name, (-1, 1))
att = GradientSignAttack(m)
for data in train_reader():
# fgsm attack
adv_img = att(data)
plt.imshow(n[0][0], cmap='Greys_r')
plt.show()
#np.save('adv_img', adv_img)
break
if __name__ == '__main__':
main()
......@@ -38,6 +38,16 @@ elementwise_add
.. autofunction:: paddle.v2.fluid.layers.elementwise_add
:noindex:
elementwise_sub
---------------
.. autofunction:: paddle.v2.fluid.layers.elementwise_sub
:noindex:
elementwise_mul
---------------
.. autofunction:: paddle.v2.fluid.layers.elementwise_mul
:noindex:
elementwise_div
---------------
.. autofunction:: paddle.v2.fluid.layers.elementwise_div
......
......@@ -202,8 +202,8 @@ This `OpDesc` value is in the `ops` field of the `BlockDesc` value representing
During the generation of the Protobuf message, the Block should store VarDesc (the Protobuf message which describes Variable) and OpDesc (the Protobuf message which describes Operator).
VarDesc in a block should have its name scope to avoid local variables affect parent block's name scope.
Child block's name scopes should inherit the parent's so that OpDesc in child block can reference a VarDesc that stored in parent block. For example:
VarDesc in a block should have its name scope to avoid local variables affecting parent block's name scope.
Child block's name scopes should inherit the parent's so that OpDesc in child block can reference a VarDesc that is stored in the parent block. For example:
```python
a = pd.Variable(shape=[20, 20])
......
......@@ -5,28 +5,28 @@
In a lecture from Andrew Ng, he attributes the recent sucess of AI due to a combination of these:
- availability of Big Data
- supercomputing power to process this Big Data over very large neural networks
- modern algorithms
- Availability of Big Data
- Supercomputing power to process this Big Data over very large neural networks
- Modern algorithms
Following graph shows the details:
![](images/deep_learning.png)
Larger model usually brings better performance. However, GPU memory is certain limited. For example, the memory size of a GTX TITAN X is only 12GB. To train complex and large model, we have to take care of memory using. Besides, memory optimization is also necessary in both online/mobile inference.
Larger model usually bring better performance. However, GPU memory is limited. For example, the memory size of a GTX TITAN X is only 12GB. To train complex and large models, we have to take care of memory usage. Besides, memory optimization is also necessary in both online/mobile inference.
## Solution
### Basic Strategy
There are some basic strategies to make memory optimization, including in-place operation and memory sharing.
There are some basic strategies to improve memory usage, including in-place operations and memory sharing.
#### In-place Operation
In a relu activation operator:
$y = \max(x, 0)$
If the variable x is not used in any other operator, we can make an in-place operation. In other words, the memory block of variable y and variable x are the same. In-place operation will save 50% memory occupancy immediately.
If the variable x is not used in any other operator, we can make an in-place operation. In other words, the memory block of variable y and variable x will be the same. In-place operations will save 50% memory occupancy immediately.
#### Memory Sharing
......@@ -40,18 +40,18 @@ d = op2(a)
e = op3(d, f)
```
In this case, variable a is no longer used, and op2 does not support in-place operation. After op2 finished, we can put the memory of variable a to a memory pool. Then, variable e can share the memory of variable a from the pool.
In this case, variable a is no longer used, and op2 does not support in-place operation. After op2 finishes, we can put the memory of variable a to a memory pool. Then, variable e can share the memory of variable a from the pool.
### Live Variable Analysis
It's not enough to only have some basic strategies. The prerequisite of memory optimization is to know if a variable is still "live" after an operation.
It's not enough to only have some basic strategies. The pre-requisite of memory optimization is to know if a variable is still "live" after an operation.
In our design, the neural network topology is defined as a program. Luckily, [live variable analysis](https://en.wikipedia.org/wiki/Live_variable_analysis) is a classic problem in compilers which can be used in many stages, such as register allocation.
In compilers, the front end of the compilers translates programs into an intermediate language with an unbounded number of temporaries. This program must run on a machine with a bounded number of registers. Two temporaries a and b can fit into the same register, if a and b are never "in use" at the same time. Thus, many temporaries can fit in few registers; if they don't all fit, the excess temporaries can be kept in memory.
In compilers, the front end of the compiler translates programs into an intermediate language with an unbounded number of temporary variables. This program must run on a machine with a bounded number of registers. Two temporary variables a and b can fit into the same register, if a and b are never "in use" at the same time. Thus, many temporary variables can fit in few registers; if they don't all fit, the excess tempory variables can be kept in memory.
Therefore, the compiler needs to analyze the intermediate-representation program to determine which temporaries are in use at the same time. We say a variable is "live" if it holds a value that may be needed in the future, so this analysis is called liveness analysis.
Therefore, the compiler needs to analyze the intermediate-representation program to determine which temporary variables are in use at the same time. We say a variable is "live" if it holds a value that may be needed in the future, so this analysis is called liveness analysis.
We can leran these techniques from compilers. There are mainly two stages to make live variable analysis:
......@@ -60,7 +60,7 @@ We can leran these techniques from compilers. There are mainly two stages to mak
#### Control Flow Graph
To preform analyses on a program, it is often useful to make a control flow graph. A [control flow graph](https://en.wikipedia.org/wiki/Control_flow_graph) (CFG) in computer science is a representation, using graph notation, of all paths that might be traversed through a program during its execution. Each statement in the program is a node in the flow graph; if statemment x can be followed by statement y, there is an egde from x to y.
To perform analysis on a program, it is often useful to make a control flow graph. A [control flow graph](https://en.wikipedia.org/wiki/Control_flow_graph) (CFG) in computer science is a representation, using graph notation, of all paths that might be traversed through a program during its execution. Each statement in the program is a node in the flow graph; if statemment x can be followed by statement y, there is an egde from x to y.
Following is the flow graph for a simple loop.
......@@ -68,18 +68,18 @@ Following is the flow graph for a simple loop.
#### Dataflow Analysis
liveness of variable "flows" around the edges of the control flow graph; determining the live range of each variable is an example of a dataflow problem. [Dataflow analysis](https://en.wikipedia.org/wiki/Data-flow_analysis) is a technique for gathering information about the possible set of values calculated at various points in a computer program.
Liveness of variable "flows" around the edges of the control flow graph; determining the live range of each variable is an example of a dataflow problem. [Dataflow analysis](https://en.wikipedia.org/wiki/Data-flow_analysis) is a technique for gathering information about the possible set of values calculated at various points in a computer program.
A simple way to perform data-flow analysis of programs is to set up dataflow equations for each node of the control flow graph and solve them by repeatedly calculating the output from the input locally at each node until the whole system stabilizes.
- Flow Graph Terminology
A flow graph node has out-edges that lead to sucessor nodes, and in-edges that come from presucessor nodes. The set *pred[n]* is all the predecessors of node n, and *succ[n]* is the set of sucessors.
A flow graph node has out-edges that lead to sucessor nodes, and in-edges that come from predecessor nodes. The set *pred[n]* is all the predecessors of node n, and *succ[n]* is the set of sucessors.
In former control flow graph, the out-edges of node 5 are 5 --> 6 and 5 --> 2, and *succ[5]* = {2, 6}. The in-edges of 2 are 5 --> 2 and 1 --> 2, and *pred[2]* = {1, 5}.
- Uses and Defs
An assignmemt to a variable or temporary defines that variable. An occurence of a variable on the right-hand side of an assginment(or in other expressions) uses the variable. We can speak the *def* of a variable as the set of graph nodes that define it; or the *def* of a graph node as the set of variables that it defines; and the similarly for the *use* of a variable or graph node. In former control flow graph, *def(3)* = {c}, *use(3)* = {b, c}.
An assignmemt to a variable or temporary defines that variable. An occurence of a variable on the right-hand side of an assginment(or in other expressions) uses the variable. We can define the *def* of a variable as the set of graph nodes that define it; or the *def* of a graph node as the set of variables that it defines; and the similarly for the *use* of a variable or graph node. In former control flow graph, *def(3)* = {c}, *use(3)* = {b, c}.
- Liveness
......@@ -168,9 +168,9 @@ class ControlFlowGraph(object):
return self._program
```
#### make dataflow analysis
#### Make dataflow analysis
We follow guide from compilers and try to solve the dataflow equation to get liveness of every variable. If the live-in of an operator node is different from the live-out, then we can make memory sharing.
We follow the guide from compilers and try to solve the dataflow equation to get liveness of every variable. If the live-in of an operator node is different from the live-out, then we can make memory sharing.
For example:
......
# Design Doc: The Keys of Operator Kernel Type
## Problem
An operator can have different kernel implementations, and each operator will have a map to store the related kernels. Fluid uses `OpKernelType` as a key to identify a unique Kernel. Before an operator runs, an certain kernel must be chosen by a key of `OpKernelType`. Currently, `OpKernelType` is defined as follows:
An operator can have different kernel implementations, and each operator will have a map to store the related kernels. Fluid uses `OpKernelType` as a key to identify a unique kernel. Before an operator runs, a certain type of kernel must be chosen via a key of `OpKernelType`. Currently, `OpKernelType` is defined as follows:
```cpp
struct OpKernelType {
......@@ -10,13 +10,13 @@ struct OpKernelType {
```
For more details, please refer to [codes](https://github.com/PaddlePaddle/Paddle/blob/2d5ec16bc8a09fb8e0f62c89b116b0cd1d333907/paddle/framework/operator.h#L348-L374) in github.
It contains two keys, `Place` and `DataType`. And these two keys will be hashed to a unique key to represent a certain type of kernel. However, these two keys are not enough. We need a more complete representation of `OpKernelType`.
It contains two keys, `Place` and `DataType`. And these two keys will be hashed to a unique key to represent a certain type of kernel. However, these two keys do not provide enough information. We need a more complete representation of `OpKernelType`.
We often implement a kernel of an operator with some computing library in certain device(place). Please remind that computing library and device are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices.
We often implement a kernel of an operator with some computing library on certain device(place). Please note that computing library and device do not have a one-to-one correspondence. A device can have a lot of computing libraries and a computing library can also support different devices.
For example, Eigen library can support Nvidia GPU/AMD GPU/CPU. And MKLDNN library can support Intel CPU/Intel FPGA. Both `Place` and `Library` should be a key of `OpKernelType`.
For example, Eigen library supports Nvidia GPU/AMD GPU/CPU and MKLDNN library supports Intel CPU/Intel FPGA. Both `Place` and `Library` should be a key of `OpKernelType`.
It's obvious that different DataTypes, like fp64/fp32/int8 will have different kernels. But the data layout of a Tensor will also lead to different implementation. Please refer to the batch norm operator [kernels](https://github.com/PaddlePaddle/Paddle/blob/a948fac4d0ad7e0412d373b8aabeb711c2899563/paddle/operators/batch_norm_op.cc#L180-L209). Data Layout should also be taken into consideration.
Different DataTypes, such as fp64/fp32/int8, will obviously have different kernels. But different data layout of a Tensor will also lead to different implementations. Please refer to the batch norm operator [kernels](https://github.com/PaddlePaddle/Paddle/blob/a948fac4d0ad7e0412d373b8aabeb711c2899563/paddle/operators/batch_norm_op.cc#L180-L209) as an example. Data layout should also be taken into consideration.
## Solution
......@@ -31,17 +31,17 @@ struct OpKernelType {
};
```
Following is the details:
The details are as follows:
### Place
`Place` is defined as follows:
`Place` is defined as:
```cpp
typedef boost::variant<CUDAPlace, ROCmPlace, FPGAPlace, CPUPlace> Place;
```
`Place` is to represent the device memory where data is locating.
`Place` represents the device memory where data is located.
### Library
......@@ -52,10 +52,10 @@ One operator kernel is usually implemented based on one library. `Library` is de
enum Library { Plain, MKLDNN, CUDNN };
```
We use `Plain` enumerator to represent default library. Since most operators in Fluid are implemented based on `Eigen` library, we take `Eigen` library as the `Plain` enumerator.
A library usually has a corresponding `DeviceContext` which contains some handles needed by computation. Fluid now have two default DeviceContexts in CPU and CUDA, `CPUDeviceContext` and `CUDADeviceContext`. `CPUDeviceContext` contains a Eigen library handle and `CDUADeviceContext` contains a Eigen library handle and cuBLAS handle.
We use `Plain` enumerator to represent default library. Since most operators in Fluid are implemented based on the `Eigen` library, we take `Eigen` library as the `Plain` enumerator.
A library usually has a corresponding `DeviceContext` which contains some handles needed for computation. Fluid now has two default DeviceContexts for CPU and CUDA, namely, `CPUDeviceContext` and `CUDADeviceContext`. `CPUDeviceContext` contains an Eigen library handle and `CDUADeviceContext` contains an Eigen library handle and a cuBLAS handle.
If we want to support new Library, a new enumerator need to be added to `Library` and a new corresponding `LibraryDeviceContext` will be created.
If we want to support new library, a new enumerator need to be added to `Library` and a corresponding new `LibraryDeviceContext` need to be created.
### DataType
......@@ -67,15 +67,15 @@ If we want to support new Library, a new enumerator need to be added to `Library
Actually, a Tensor is a view of a block of memory. Besides a pointer to the memory, we also have to get some other descriptions of this block of memory, such as shape(ddim), stride, and layout.
Different layout leads to different implementation of operator kernel. There are mainly 4 principles we have to follow to support layout in our fluid framework.
Different layout leads to different implementation of the operator kernel. There are mainly 4 principles we have to follow to support layout in our Fluid framework.
- We take layout as a data member of Tensor. Layout is actually a enum variable. If fluid is built with MKLDNN, then, the memory format in MKLDNN will be added into this enum variable too.
- We take layout as a data member of Tensor. Layout is actually a enum variable. If Fluid is built with MKLDNN, then the memory format in MKLDNN will also be added into this enum variable.
- Users have to set layout for input data. And some operators like fill_constant/random, also have to set layout of generating data. Of course, we can have some default layout, like NCHW.
- Users have to set layout for input data. And some operators like fill_constant/random, also have to set layout for generating data. Of course, we can have some default layout, like NCHW.
- The inference of Layout is at run-time, not compile-time.
- The inference of Layout is at run-time, not at compile-time.
- Every operator have to implement different kernels for different layouts. Let's take MKLDNN as an example, if we want to implement a MKLDNN convolution operator, we have to realize all the kernels for different layout, list at [here](http://01org.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro to do registering kernels for MKLDNN operators.
- Every operator has to implement different kernels for different layouts. Let's take MKLDNN as an example. If we want to implement an MKLDNN convolution operator, we have to implement all the kernels for different layouts, which are listed [here](http://01org.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro to register kernels for MKLDNN operators.
`Layout` is also defined as a enum variable:
......
......@@ -279,6 +279,26 @@ class LayerHelper(object):
return tmp
```
### Return value of layer functions
The layer will return a Variable, which is also the output of an operator. However, outputs of a layer function have more attributes than an operator. There are parameter variables, and their gradient variables need to return. To return them is useful. For example,
1. Users can debug the network by printing parameter gradients.
2. Users can append attributes to a parameter, such as, `param.stop_gradient=True` will make a parameter stop generate the gradient. We can fix the parameter value during training by using this attribute.
However, it is good to return a Variable for layers, since all layers and operators use Variables as their parameters. We can just append a `param` field and a `grad` field for layer function since the Python is dynamic typing.
The sample usage is
```python
data = fluid.layers.data(...)
hidden = fluid.layers.fc(data, ...)
...
executor.run(fetch_list=[hidden.param, hidden.param.grad], ...)
```
## Optimizer
[Optimizer Design Doc](./optimizer.md)
## Background
PaddlePaddle divides the description of neural network computation graph into two stages: compile time and runtime.
PaddlePaddle divides the description of neural network computation into two stages: compile time and runtime. At compile time, the neural network computation is described as a `ProgramDesc` whereas at runtime an `Executor` interprets the `ProgramDesc` to compute the operations.
PaddlePaddle use proto message to describe compile time graph because
PaddlePaddle use proto message to describe compile time program because
1. Computation graph should be able to be saved to a file.
1. In distributed training, the graph will be serialized and send to multiple workers.
1. The computation program description must be serializable and saved in a file.
1. During distributed training, the sreialized program will be sent to multiple workers. It should also be possible to break the program into different components, each of which can be executed on different workers.
The computation graph is constructed by Data Node and Operation Node. The concept to represent them is in the table below.
The computation `Program` consists of nested `Blocks`. Each `Block` will consist of data(i.e. `Variable`) and `Operations`. The concept to represent them is in the table below.
| |compile time|runtime|
|---|---|---|
......
......@@ -26,16 +26,16 @@ sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(avg_cost)
```
- Variables: `x`, `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/framework.py#L93)
- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/layers.py)
- Variables: `x`, `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/framework.py#)
- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid/layers)
- Every Layer has one or more operators and variables/parameters
- All the operators are defined at [`paddle/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators). Other worth-looking files:
- Base class: [`paddle/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h)
- Operator Registration: [`paddle/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h)
- Operator Lookup: [`paddle/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_info.h)
- Optimizer: `fluid.optimizer.SGD`. It does the following
- Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/backward.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/backward.cc)]
- Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/optimizer.py), [C++](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/optimizer)]
- Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/backward.py)]
- Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/optimizer.py)]
# Run Time
......
......@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string.h> // for strdup
#include <algorithm>
#include <string>
......@@ -60,7 +61,9 @@ void InitDevices() {
}
void InitGLOG(const std::string &prog_name) {
google::InitGoogleLogging(prog_name.c_str());
// glog will not hold the ARGV[0] inside.
// Use strdup to alloc a new string.
google::InitGoogleLogging(strdup(prog_name.c_str()));
google::InstallFailureSignalHandler();
}
......
......@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <algorithm>
......@@ -21,6 +22,10 @@ limitations under the License. */
#include "paddle/framework/shape_inference.h"
#include "paddle/framework/var_type.h"
DEFINE_bool(op_sync, false,
"Default cuda is asynchronous device, set to True will"
"force op run in synchronous mode.");
namespace paddle {
namespace framework {
......@@ -542,8 +547,14 @@ void OperatorWithKernel::Run(const Scope& scope,
auto kernel_iter = kernels.find(expected_kernel_key);
kernel_iter->second->Compute(ExecutionContext(
*this, new_scope, *pool.Get(expected_kernel_key.place_)));
auto* new_dev_ctx = pool.Get(expected_kernel_key.place_);
kernel_iter->second->Compute(
ExecutionContext(*this, new_scope, *new_dev_ctx));
/*For profiling/benchmark only*/
if (FLAGS_op_sync) {
new_dev_ctx->Wait();
}
}
proto::DataType OperatorWithKernel::IndicateDataType(
......
grpc_library(sendrecvop_grpc SRCS recv_impl.cc send_impl.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
grpc_library(sendrecvop_grpc SRCS sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "grpc_client.h"
namespace paddle {
namespace operators {
namespace detail {
bool RPCClient::AsyncSendVariable(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& var_name,
int64_t time_out) {
sendrecv::VariableMessage req;
auto* var = scope.FindVar(var_name);
SerializeToMessage(var_name, var, ctx, &req);
// varhandle
VarHandle var_h;
var_h.ep = ep;
var_h.scope = &scope;
var_h.name = var_name;
var_h.ctx = &ctx;
// stub context
auto ch = GetChannel(ep);
SendProcessor* s = new SendProcessor(ch);
s->Prepare(var_h, time_out);
s->response_call_back_ = NULL;
auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, (void*)s);
req_count_++;
return true;
}
void ProcGetResponse(const VarHandle& var_h,
const sendrecv::VariableMessage& ret_msg) {
auto* outvar = var_h.scope->FindVar(var_h.name);
std::istringstream iss(ret_msg.serialized());
DeserializeFromMessage(ret_msg, *var_h.ctx, outvar);
}
bool RPCClient::AsyncGetVariable(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& var_name,
int64_t time_out) {
sendrecv::VariableMessage req;
req.set_varname(var_name);
auto* var = scope.FindVar(var_name);
SerializeToMessage(var_name, var, ctx, &req);
// varhandle
VarHandle var_h;
var_h.ep = ep;
var_h.scope = &scope;
var_h.name = var_name;
var_h.ctx = &ctx;
// stub context
auto ch = GetChannel(ep);
GetProcessor* s = new GetProcessor(ch);
s->Prepare(var_h, time_out);
s->response_call_back_ = ProcGetResponse;
auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, (void*)s);
req_count_++;
return true;
}
bool RPCClient::wait() {
bool ok = true;
while (true) {
if (req_count_ <= 0) {
break;
}
if (!Proceed()) {
LOG(ERROR) << "Get meets CompletionQueue error";
return false;
}
}
return ok;
}
bool RPCClient::Proceed() {
void* tag = NULL;
bool ok = false;
// request counts.
if (!cq_.Next(&tag, &ok)) {
return false;
}
req_count_--;
GPR_ASSERT(ok);
PADDLE_ENFORCE(tag);
// TODO(gongwb): add more retries.
ClientBase* c = static_cast<ClientBase*>(tag);
if (!c->status_.ok()) {
delete c;
return true;
}
c->Process();
delete c;
return true;
}
std::shared_ptr<grpc::Channel> RPCClient::GetChannel(const std::string& ep) {
auto it = channels_.find(ep);
if (it != channels_.end()) {
return it->second;
}
auto ch = std::shared_ptr<grpc::Channel>(
grpc::CreateChannel(ep, grpc::InsecureChannelCredentials()));
channels_[ep] = ch;
return ch;
}
} // namespace detail
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <grpc++/grpc++.h>
#include <grpc/support/log.h>
#include <time.h>
#include <chrono>
#include <ctime>
#include <functional>
#include <iostream>
#include <map>
#include <string>
#include <vector>
#include "paddle/framework/data_type.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/scope.h"
#include "paddle/framework/selected_rows.h"
#include "paddle/operators/detail/sendrecvop_utils.h"
#include "paddle/operators/detail/simple_block_queue.h"
namespace paddle {
namespace operators {
namespace detail {
struct VarHandle {
std::string ep;
const platform::DeviceContext* ctx;
const framework::Scope* scope;
std::string name;
std::string String() const {
std::ostringstream s;
s << "name:[" << name << "] ep:[" << ep << "]";
return s.str();
}
};
void ProcGetResponse(const VarHandle& var_h,
const sendrecv::VariableMessage& msg);
class ClientBase {
public:
explicit ClientBase(std::shared_ptr<grpc::Channel> ch) {
stub_ = sendrecv::SendRecvService::NewStub(ch);
context_ = NULL;
}
virtual ~ClientBase() {}
virtual void Prepare(const VarHandle& var_info, int64_t time_out) {
context_.reset(new grpc::ClientContext());
var_h_ = var_info;
std::chrono::system_clock::time_point deadline =
std::chrono::system_clock::now() + std::chrono::milliseconds(time_out);
context_->set_deadline(deadline);
}
virtual void Process() = 0;
std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
std::unique_ptr<grpc::ClientContext> context_;
grpc::Status status_;
VarHandle var_h_;
};
typedef std::function<void(const VarHandle&, const sendrecv::VoidMessage&)>
RequestSendCallBack;
class SendProcessor : public ClientBase {
public:
explicit SendProcessor(std::shared_ptr<grpc::Channel> ch) : ClientBase(ch) {}
virtual ~SendProcessor() {}
virtual void Process() {
if (response_call_back_) {
response_call_back_(var_h_, reply_);
}
}
sendrecv::VoidMessage reply_;
RequestSendCallBack response_call_back_ = NULL;
};
typedef std::function<void(const VarHandle&, const sendrecv::VariableMessage&)>
RequestGetCallBack;
class GetProcessor : public ClientBase {
public:
explicit GetProcessor(std::shared_ptr<grpc::Channel> ch) : ClientBase(ch) {}
virtual ~GetProcessor() {}
virtual void Process() {
if (response_call_back_) {
response_call_back_(var_h_, reply_);
}
}
sendrecv::VariableMessage reply_;
RequestGetCallBack response_call_back_ = ProcGetResponse;
};
class RPCClient {
public:
bool AsyncSendVariable(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& var_name,
int64_t time_out = 600 * 1000);
bool AsyncGetVariable(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& var_name,
int64_t time_out = 600 * 1000);
bool wait();
private:
bool Proceed();
std::shared_ptr<grpc::Channel> GetChannel(const std::string& ep);
private:
grpc::CompletionQueue cq_;
std::map<std::string, std::shared_ptr<grpc::Channel>> channels_;
int64_t req_count_ = 0;
};
} // namespace detail
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/detail/grpc_server.h"
using grpc::ServerAsyncResponseWriter;
namespace paddle {
namespace operators {
namespace detail {
enum CallStatus { PROCESS = 0, FINISH };
// reference:
// https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server
class RequestBase {
public:
explicit RequestBase(sendrecv::SendRecvService::AsyncService* service,
grpc::ServerCompletionQueue* cq)
: service_(service), cq_(cq), status_(PROCESS) {}
virtual ~RequestBase() {}
virtual void Process() { assert(false); }
CallStatus Status() { return status_; }
void SetStatus(CallStatus status) { status_ = status; }
protected:
grpc::ServerContext ctx_;
sendrecv::SendRecvService::AsyncService* service_;
grpc::ServerCompletionQueue* cq_;
CallStatus status_;
};
typedef std::pair<std::string, sendrecv::VariableMessage> MessageWithName;
class RequestSend final : public RequestBase {
public:
explicit RequestSend(sendrecv::SendRecvService::AsyncService* service,
grpc::ServerCompletionQueue* cq,
SimpleBlockQueue<MessageWithName>* queue)
: RequestBase(service, cq), queue_(queue), responder_(&ctx_) {
service_->RequestSendVariable(&ctx_, &request_, &responder_, cq_, cq_,
this);
}
virtual ~RequestSend() {}
virtual void Process() {
MessageWithName msg_with_name =
std::make_pair(request_.varname(), std::move(request_));
queue_->Push(std::move(msg_with_name));
// TODO(gongwb): check var's info.
responder_.Finish(reply_, grpc::Status::OK, this);
}
protected:
sendrecv::VariableMessage request_;
sendrecv::VoidMessage reply_;
SimpleBlockQueue<MessageWithName>* queue_;
ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
};
class RequestGet final : public RequestBase {
public:
explicit RequestGet(sendrecv::SendRecvService::AsyncService* service,
grpc::ServerCompletionQueue* cq, framework::Scope* scope)
: RequestBase(service, cq), responder_(&ctx_), scope_(scope) {
service_->RequestGetVariable(&ctx_, &request_, &responder_, cq_, cq_, this);
}
virtual ~RequestGet() {}
virtual void Process() {
// proc request.
std::string var_name = request_.varname();
auto* var = scope_->FindVar(var_name);
SerializeToMessage(var_name, var, platform::CPUDeviceContext(), &reply_);
// TODO(gongwb): check var's info.
responder_.Finish(reply_, grpc::Status::OK, this);
}
protected:
sendrecv::VariableMessage request_;
sendrecv::VariableMessage reply_;
ServerAsyncResponseWriter<sendrecv::VariableMessage> responder_;
framework::Scope* scope_;
};
void AsyncGRPCServer::RunSyncUpdate() {
grpc::ServerBuilder builder;
builder.AddListeningPort(address_, grpc::InsecureServerCredentials());
builder.RegisterService(&service_);
cq_send_ = builder.AddCompletionQueue();
cq_get_ = builder.AddCompletionQueue();
server_ = builder.BuildAndStart();
LOG(INFO) << "Server listening on " << address_ << std::endl;
std::function<void()> send_register =
std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this);
std::function<void()> get_register =
std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this);
t_send_.reset(
new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, false,
cq_send_.get(), "cq_send", send_register)));
t_get_.reset(
new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, true,
cq_get_.get(), "cq_get", get_register)));
// wait server
server_->Wait();
t_send_->join();
t_get_->join();
}
void AsyncGRPCServer::ShutdownQueue() {
std::unique_lock<std::mutex> lock(cq_mutex_);
cq_send_->Shutdown();
cq_get_->Shutdown();
is_shut_down_ = true;
}
// This URL explains why shutdown is complicate:
// https://stackoverflow.com/questions/35708348/grpc-what-is-the-recommended-way-to-shut-down-an-asynchronous-server-in-c
void AsyncGRPCServer::ShutDown() {
server_->Shutdown();
ShutdownQueue();
}
void AsyncGRPCServer::TryToRegisterNewSendOne() {
std::unique_lock<std::mutex> lock(cq_mutex_);
if (is_shut_down_) {
return;
}
RequestSend* send =
new RequestSend(&service_, cq_send_.get(), &var_recv_queue_);
VLOG(4) << "create RequestSend status:" << send->Status();
}
void AsyncGRPCServer::TryToRegisterNewGetOne() {
std::unique_lock<std::mutex> lock(cq_mutex_);
if (is_shut_down_) {
return;
}
RequestGet* get = new RequestGet(&service_, cq_get_.get(), scope_);
VLOG(4) << "create Requestget status:" << get->Status();
}
void AsyncGRPCServer::SetFinishOrDelete(RequestBase*& last) {
std::unique_lock<std::mutex> lock(cq_mutex_);
if (is_shut_down_) {
delete last;
last = NULL;
return;
}
last->SetStatus(FINISH);
return;
}
void AsyncGRPCServer::HandleRequest(bool wait, grpc::ServerCompletionQueue* cq,
std::string cq_name,
std::function<void()> TryToRegisterNewOne) {
TryToRegisterNewOne();
void* tag = NULL;
bool ok = false;
while (true) {
if (!cq->Next(&tag, &ok)) {
LOG(INFO) << cq_name << " get CompletionQueue shutdown!";
break;
}
if (wait && !done_) {
Wait();
}
RequestBase* base = (RequestBase*)tag;
if (!ok) {
VLOG(4) << cq_name << " recv no regular event";
TryToRegisterNewOne();
delete base;
continue;
}
switch (base->Status()) {
case PROCESS: {
VLOG(4) << cq_name << " status:" << base->Status();
TryToRegisterNewOne();
base->Process();
SetFinishOrDelete(base);
break;
}
case FINISH: {
VLOG(4) << cq_name << " status:" << base->Status();
delete base;
break;
}
default: { assert(false); }
}
}
}
void AsyncGRPCServer::Wait() {
std::unique_lock<std::mutex> lock(this->mutex_);
condition_.wait(lock, [=] { return this->done_ == true; });
}
void AsyncGRPCServer::Reset() {
std::lock_guard<std::mutex> lock(this->mutex_);
done_ = false;
}
void AsyncGRPCServer::Done() {
{
std::lock_guard<std::mutex> lock(this->mutex_);
done_ = true;
}
condition_.notify_all();
}
} // namespace detail
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/scope.h"
#include "paddle/framework/selected_rows.h"
#include "paddle/framework/var_type.h"
#include "paddle/operators/detail/simple_block_queue.h"
#include "paddle/operators/detail/send_recv.grpc.pb.h"
#include "paddle/operators/detail/send_recv.pb.h"
#include <grpc++/grpc++.h>
#include <grpc/support/log.h>
#include <thread>
#include "paddle/operators/detail/sendrecvop_utils.h"
namespace paddle {
namespace operators {
namespace detail {
typedef std::pair<std::string, sendrecv::VariableMessage> MessageWithName;
class RequestBase;
class AsyncGRPCServer final : public sendrecv::SendRecvService::Service {
public:
explicit AsyncGRPCServer(std::string address) { address_ = address; }
void RunSyncUpdate();
void Reset();
void Done();
void SetScope(framework::Scope *scope) { scope_ = scope; }
const MessageWithName Get() { return this->var_recv_queue_.Pop(); }
void Push(const MessageWithName &msg) { this->var_recv_queue_.Push(msg); }
void ShutDown();
protected:
void Wait();
void HandleRequest(bool wait, grpc::ServerCompletionQueue *cq,
std::string cq_name,
std::function<void()> TryToRegisterNewOne);
void TryToRegisterNewSendOne();
void TryToRegisterNewGetOne();
void SetFinishOrDelete(RequestBase *&last);
void ShutdownQueue();
private:
std::mutex cq_mutex_;
volatile bool is_shut_down_ = false;
std::unique_ptr<grpc::ServerCompletionQueue> cq_send_;
std::unique_ptr<grpc::ServerCompletionQueue> cq_get_;
sendrecv::SendRecvService::AsyncService service_;
std::unique_ptr<grpc::Server> server_;
std::string address_;
framework::Scope *scope_;
// received variable from RPC, operators fetch variable from this queue.
SimpleBlockQueue<MessageWithName> var_recv_queue_;
// condition of the sub program
std::mutex mutex_;
volatile mutable bool done_;
std::condition_variable condition_;
std::unique_ptr<std::thread> t_send_;
std::unique_ptr<std::thread> t_get_;
};
}; // namespace detail
}; // namespace operators
}; // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "send_recv_impl.h"
namespace paddle {
namespace operators {
namespace detail {
Status SendRecvServerImpl::SendVariable(ServerContext *context,
const VariableMessage *in_var,
VoidMessage *out_var) {
MessageWithName msg_with_name =
std::make_pair(in_var->varname(), std::move(*in_var));
var_recv_queue_.Push(std::move(msg_with_name));
return Status::OK;
}
Status SendRecvServerImpl::GetVariable(ServerContext *context,
const VariableMessage *in_var,
VariableMessage *out_var) {
std::string get_var_name = in_var->varname();
auto *var = scope_->FindVar(get_var_name);
SerializeToMessage(get_var_name, var, platform::CPUDeviceContext(), out_var);
return Status::OK;
}
Status SendRecvServerImpl::Wait(ServerContext *context,
const VoidMessage *in_var,
VoidMessage *out_var) {
{
std::unique_lock<std::mutex> lock(this->mutex_);
condition_.wait(lock, [=] { return this->done_ == true; });
}
return Status::OK;
}
void SendRecvServerImpl::Reset() {
std::lock_guard<std::mutex> lock(this->mutex_);
done_ = false;
}
void SendRecvServerImpl::Done() {
{
std::lock_guard<std::mutex> lock(this->mutex_);
done_ = true;
}
condition_.notify_all();
}
} // namespace detail
} // namespace operators
} // namespace paddle
......@@ -21,8 +21,6 @@ service SendRecvService {
rpc SendVariable(VariableMessage) returns (VoidMessage) {}
// Argument VariableMessage for GetVariable should only contain varname.
rpc GetVariable(VariableMessage) returns (VariableMessage) {}
// wait for one execution of the program
rpc Wait(VoidMessage) returns (VoidMessage) {}
}
// VariableMessage is serialized paddle variable message.
......
......@@ -12,87 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/scope.h"
#include "paddle/framework/selected_rows.h"
#include "paddle/framework/var_type.h"
#include "paddle/operators/detail/simple_block_queue.h"
#include "paddle/operators/detail/send_recv.grpc.pb.h"
#include "paddle/operators/detail/send_recv.pb.h"
#include <grpc++/grpc++.h>
using grpc::Channel;
using grpc::Server;
using grpc::ServerContext;
using grpc::ServerReader;
using grpc::ServerBuilder;
using grpc::ClientContext;
using grpc::ClientReader;
using grpc::ClientReaderWriter;
using grpc::ClientWriter;
using grpc::Status;
using sendrecv::SendRecvService;
using sendrecv::VariableMessage;
using sendrecv::VoidMessage;
#include "paddle/operators/detail/sendrecvop_utils.h"
namespace paddle {
namespace operators {
namespace detail {
typedef std::pair<std::string, sendrecv::VariableMessage> MessageWithName;
class SendRecvServerImpl final : public SendRecvService::Service {
public:
explicit SendRecvServerImpl() {}
Status SendVariable(ServerContext *context, const VariableMessage *in_var,
VoidMessage *out_var) override;
Status GetVariable(ServerContext *context, const VariableMessage *in_var,
VariableMessage *out_var) override;
Status Wait(ServerContext *context, const VoidMessage *in_var,
VoidMessage *out_var) override;
void Reset();
void Done();
void SetScope(framework::Scope *scope) { scope_ = scope; };
const MessageWithName Get() { return this->var_recv_queue_.Pop(); }
void Push(const MessageWithName &msg) { this->var_recv_queue_.Push(msg); }
private:
// received variable from RPC, operators fetch variable from this queue.
SimpleBlockQueue<MessageWithName> var_recv_queue_;
framework::Scope *scope_;
// condition of the sub program
std::mutex mutex_;
bool done_;
std::condition_variable condition_;
};
// RPCClient is a class to send tensors to pserver sub-network
// using different hashing methods.
class RPCClient {
public:
RPCClient(std::shared_ptr<Channel> channel)
: stub_(SendRecvService::NewStub(channel)) {}
bool SendVariable(const framework::Scope &scope, const std::string &inname);
bool GetVariable(const framework::Scope &scope, const std::string &outname);
void Wait();
private:
std::unique_ptr<SendRecvService::Stub> stub_;
};
inline void SerializeToMessage(const std::string &name,
const framework::Variable *var,
const platform::DeviceContext &ctx,
VariableMessage *msg) {
void SerializeToMessage(const std::string& name, const framework::Variable* var,
const platform::DeviceContext& ctx,
sendrecv::VariableMessage* msg) {
msg->set_varname(name);
std::ostringstream oss;
switch (framework::ToVarType(var->Type())) {
......@@ -114,10 +42,9 @@ inline void SerializeToMessage(const std::string &name,
msg->set_serialized(oss.str());
}
inline void DeserializeFromMessage(const VariableMessage &msg,
const platform::DeviceContext &ctx,
framework::Variable *var) {
using namespace paddle::framework::proto;
void DeserializeFromMessage(const sendrecv::VariableMessage& msg,
const platform::DeviceContext& ctx,
framework::Variable* var) {
std::istringstream iss(msg.serialized());
switch (msg.type()) {
case sendrecv::VarType::LOD_TENSOR:
......
......@@ -12,56 +12,31 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "send_recv_impl.h"
#pragma once
#include <iostream>
#include <string>
#include <vector>
#include "paddle/framework/data_type.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/scope.h"
#include "paddle/framework/selected_rows.h"
#include "paddle/framework/var_type.h"
#include "paddle/operators/detail/send_recv.grpc.pb.h"
#include "paddle/operators/detail/send_recv.pb.h"
namespace paddle {
namespace operators {
namespace detail {
bool RPCClient::SendVariable(const framework::Scope& scope,
const std::string& inname) {
ClientContext context;
VariableMessage msg;
VoidMessage out_msg;
// FIXME(typhoonzero): pass device context to here.
auto ctx = platform::CPUDeviceContext();
auto* var = scope.FindVar(inname);
PADDLE_ENFORCE(var);
SerializeToMessage(inname, var, ctx, &msg);
Status status = stub_->SendVariable(&context, msg, &out_msg);
if (!status.ok()) {
LOG(ERROR) << "gRPC error: " << status.error_message();
return false;
}
return true;
}
bool RPCClient::GetVariable(const framework::Scope& scope,
const std::string& outname) {
ClientContext context;
VariableMessage call_msg, ret_msg;
call_msg.set_varname(outname);
auto ctx = platform::CPUDeviceContext();
Status status = stub_->GetVariable(&context, call_msg, &ret_msg);
auto* outvar = scope.FindVar(outname);
if (!status.ok()) {
LOG(ERROR) << "gRPC error: " << status.error_message();
return false;
}
std::istringstream iss(ret_msg.serialized());
DeserializeFromMessage(ret_msg, ctx, outvar);
return true;
}
void RPCClient::Wait() {
ClientContext context;
VoidMessage call_msg, ret_msg;
stub_->Wait(&context, call_msg, &ret_msg);
}
void SerializeToMessage(const std::string& name, const framework::Variable* var,
const platform::DeviceContext& ctx,
sendrecv::VariableMessage* msg);
void DeserializeFromMessage(const sendrecv::VariableMessage& msg,
const platform::DeviceContext& ctx,
framework::Variable* var);
} // namespace detail
} // namespace operators
} // namespace paddle
......@@ -21,7 +21,7 @@ class ElementwiseAddOpMaker : public ElementwiseOpMaker {
public:
ElementwiseAddOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: ElementwiseOpMaker(proto, op_checker) {
SetComment("Add", "$Out = X + Y$");
SetComment("Add", "Out = X + Y");
AddComment(comment_);
}
};
......
......@@ -21,7 +21,7 @@ class ElementwiseDivOpMaker : public ElementwiseOpMaker {
public:
ElementwiseDivOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: ElementwiseOpMaker(proto, op_checker) {
SetComment("Div", "$Out = X / Y$");
SetComment("Div", "Out = X / Y");
AddComment(comment_);
}
};
......
......@@ -22,7 +22,7 @@ class ElementwiseMulOpMaker : public ElementwiseOpMaker {
public:
ElementwiseMulOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: ElementwiseOpMaker(proto, op_checker) {
SetComment("Mul", "$Out = X \\odot\\ Y$");
SetComment("Mul", "Out = X \\odot\\ Y");
AddComment(comment_);
}
};
......
......@@ -58,7 +58,8 @@ Limited Elementwise {name} Operator.
The equation is:
{equation}
.. math::
{equation}
X is a tensor of any dimension and the dimensions of tensor Y must be smaller than
or equal to the dimensions of X.
......@@ -71,15 +72,16 @@ For case 2:
Y will be broadcasted to match the shape of X and axis should be
the starting dimension index for broadcasting Y onto X.
example:
For example
.. code-block:: python
shape(X) = (2, 3, 4, 5), shape(Y) = (,)
shape(X) = (2, 3, 4, 5), shape(Y) = (5,)
shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5)
shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
Both the input X and Y can carry the LoD (Level of Details) information,
or not. But the output only shares the LoD information with input X.
Either of the inputs X and Y or none can carry the LoD (Level of Details) information. However, the output only shares the LoD information with input X.
)DOC";
AddComment(comment_);
......
......@@ -21,7 +21,7 @@ class ElementwiseSubOpMaker : public ElementwiseOpMaker {
public:
ElementwiseSubOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: ElementwiseOpMaker(proto, op_checker) {
SetComment("Sub", "$Out = X - Y$");
SetComment("Sub", "Out = X - Y");
AddComment(comment_);
}
};
......
......@@ -24,7 +24,8 @@ limitations under the License. */
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/op_registry.h"
#include "paddle/framework/proto_desc.h"
#include "paddle/operators/detail/send_recv_impl.h"
#include "paddle/operators/detail/grpc_server.h"
#include "paddle/operators/detail/sendrecvop_utils.h"
#include "paddle/operators/detail/simple_block_queue.h"
#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
......@@ -32,6 +33,11 @@ limitations under the License. */
namespace paddle {
namespace operators {
void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service) {
service->RunSyncUpdate();
VLOG(4) << "RunServer thread end";
}
static void CreateTensorFromMessageType(framework::Variable *var,
sendrecv::VarType var_type) {
if (var_type == sendrecv::VarType::LOD_TENSOR) {
......@@ -46,18 +52,6 @@ static void CreateTensorFromMessageType(framework::Variable *var,
}
}
void RunServer(Server **rpc_server,
std::shared_ptr<detail::SendRecvServerImpl> service,
const std::string &server_address) {
ServerBuilder builder;
builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
builder.RegisterService(service.get());
std::unique_ptr<Server> server(builder.BuildAndStart());
*rpc_server = server.get();
LOG(INFO) << "Server listening on " << server_address;
server->Wait();
}
class RecvOp : public framework::OperatorBase {
public:
RecvOp(const std::string &type, const framework::VariableNameMap &inputs,
......@@ -65,10 +59,9 @@ class RecvOp : public framework::OperatorBase {
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {
if (!rpc_service_) {
rpc_service_.reset(new detail::SendRecvServerImpl());
std::string endpoint = Attr<std::string>("endpoint");
server_thread_.reset(
new std::thread(RunServer, &rpc_server_, rpc_service_, endpoint));
rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
server_thread_.reset(new std::thread(RunServer, rpc_service_));
}
}
......@@ -76,7 +69,7 @@ class RecvOp : public framework::OperatorBase {
detail::MessageWithName term_msg;
term_msg.first = LISTEN_TERMINATE_MESSAGE;
rpc_service_->Push(term_msg);
rpc_server_->Shutdown();
rpc_service_->ShutDown();
server_thread_->join();
}
......@@ -99,10 +92,12 @@ class RecvOp : public framework::OperatorBase {
auto grad_list = Attr<std::vector<std::string>>("GradList");
auto trainer_count = Attr<int>("Trainers");
size_t param_count = param_list.size();
rpc_service_->Reset();
// TODO(typhoonzero): change this to a while_op for every cluster-batch.
bool exit_flag = false;
while (!exit_flag) {
// TODO(gognwb): simply this loop.
// Get from multiple trainers, we don't care about order in which
// the gradient arrives, just add suffix 0~n then average the gradient.
for (size_t i = 0; i < param_count * trainer_count; ++i) {
......@@ -110,6 +105,7 @@ class RecvOp : public framework::OperatorBase {
const detail::MessageWithName &v = rpc_service_->Get();
auto grad_var_name = v.first;
if (grad_var_name == LISTEN_TERMINATE_MESSAGE) {
VLOG(4) << "received LISTEN_TERMINATE_MESSAGE and RunOp.Run() exit";
exit_flag = true;
break;
}
......@@ -118,10 +114,12 @@ class RecvOp : public framework::OperatorBase {
if (it != grad_list.end()) {
param_var_name = param_list[it - grad_list.begin()];
} else {
LOG(ERROR) << "grad have no paired param found!";
LOG(ERROR) << "grad have no paired param found!\"" << grad_var_name
<< "\"";
}
VLOG(3) << "recved grad: " << grad_var_name
<< " updating param: " << param_var_name;
auto *merged_grad = recv_scope.FindVar(grad_var_name);
if (merged_grad == nullptr) {
auto *ptr = recv_scope.Var(grad_var_name);
......@@ -141,9 +139,11 @@ class RecvOp : public framework::OperatorBase {
auto &dev_ctx = *pool.Get(dev_place);
detail::DeserializeFromMessage(v.second, dev_ctx, var);
}
if (exit_flag) {
break;
}
rpc_service_->Reset();
std::string program_str = Attr<std::string>("OptimizeProgram");
......@@ -158,17 +158,14 @@ class RecvOp : public framework::OperatorBase {
} catch (std::exception &e) {
LOG(ERROR) << "run sub program error " << e.what();
}
rpc_service_->Done();
grads_counter_.clear();
} // while(true)
}
protected:
// grpc server instance to track status and gracefully shutdown.
// borrow an pointer from server thread.
Server *rpc_server_{nullptr};
// grpc send/recv service implement to register.
std::shared_ptr<detail::SendRecvServerImpl> rpc_service_;
std::shared_ptr<detail::AsyncGRPCServer> rpc_service_;
std::shared_ptr<std::thread> server_thread_;
mutable std::unordered_map<std::string, int> grads_counter_;
};
......
......@@ -19,59 +19,45 @@ limitations under the License. */
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/op_registry.h"
#include "paddle/operators/detail/send_recv_impl.h"
#include "paddle/operators/detail/simple_block_queue.h"
#include <future>
#include "paddle/operators/detail/grpc_client.h"
namespace paddle {
namespace operators {
// TODO(typhoonzero): this is a simple implementation which only send
// one tensor
class SendOp : public framework::OperatorBase {
public:
SendOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {
// init client when the operator is created at runtime.
std::vector<std::string> endpoints =
Attr<std::vector<std::string>>("endpoints");
for (auto ep : endpoints) {
client_map_[ep].reset(new detail::RPCClient(
grpc::CreateChannel(ep, grpc::InsecureChannelCredentials())));
}
}
SendOp(const std::string& type, const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope,
const platform::Place &dev_place) const override {
void Run(const framework::Scope& scope,
const platform::Place& dev_place) const override {
auto ins = Inputs("X");
auto outs = Outputs("Out");
std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
// TODO(typhoonzero): use async calls to send multiple variable asyncly.
for (size_t i = 0; i < ins.size(); ++i) {
bool ret = client_map_[epmap[i]]->SendVariable(scope, ins[i]);
if (!ret) {
LOG(ERROR) << "send variable error: " << ins[i];
}
}
// TODO(typhoonzero): support async optimization
client_map_[epmap[0]]->Wait();
for (size_t i = 0; i < outs.size(); ++i) {
bool ret = client_map_[epmap[i]]->GetVariable(scope, outs[i]);
if (!ret) {
LOG(ERROR) << "GetVariable error: " << outs[i];
// FIXME(gongwb): DeviceContext?
auto ctx = platform::CPUDeviceContext();
for (size_t i = 0; i < ins.size(); i++) {
client_.AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
}
for (size_t i = 0; i < outs.size(); i++) {
client_.AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
}
client_.wait();
}
protected:
mutable std::unordered_map<std::string, std::shared_ptr<detail::RPCClient>>
client_map_;
private:
mutable detail::RPCClient client_;
};
class SendOpMaker : public framework::OpProtoAndCheckerMaker {
public:
SendOpMaker(OpProto *proto, OpAttrChecker *op_checker)
SendOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "(Tensor) Input tensor to be send").AsDuplicable();
AddOutput("Out", "(Tensor) Output tensor to get from server")
......
......@@ -140,7 +140,7 @@ void StartServerNet(bool is_sparse) {
TEST(SendRecvOp, CPUDense) {
std::thread server_thread(StartServerNet, false);
sleep(3); // wait server to start
sleep(10); // wait server to start
// local net
f::Scope scope;
p::CPUPlace place;
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/sequence_erase_op.h"
namespace paddle {
namespace operators {
class SequenceEraseOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of SequenceEraseOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of SequenceEraseOp should not be null.");
auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE(x_dims.size() == 2 && x_dims[1] == 1,
"Input(X) of SequenceEraseOp should be a 2-D LoDTensor "
"with the 2nd dimension equal to 1.");
ctx->SetOutputDim("Out", x_dims);
}
};
class SequenceEraseOpMaker : public framework::OpProtoAndCheckerMaker {
public:
SequenceEraseOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X",
"(2-D LoDTensor with the 2nd dim. equal to 1) "
"Input LoDTensor of SequenceEraseOp.");
AddOutput("Out",
"(2-D LoDTensor with the 2nd dim. equal to 1) "
"Output LoDTensor of SequenceEraseOp.");
AddAttr<std::vector<int>>("tokens",
"(vector<int>) Tokens need to be erased from "
"input sequences.");
AddComment(R"DOC(
Sequence Erase Operator.
Sequence erase operator erases tokens specified by Attr(tokens) from the input
sequences Input(X), and outputs the remaining data and modifies the LoD
information at the same time. For example, given a 2-D LoDTensor
X = [[2, 2, 6, 1, 3, 9, 6, 1, 0, 1]]^T
with lod = [[0, 3, 6, 10]], there are three sequences in the input:
X1 = [[2, 2, 6]]^T, X2 = [[1, 3, 9]]^T and X3 = [[6, 1, 0, 1]]^T.
If the tokens to be erased are Attr(tokens) = [2, 3, 5], after the erasing
operation, the three sequences become
X1' = [[6]]^T, X2' = [[1, 9]]^T and X3' = [[6, 1, 0, 1]]^T.
Hence the LoDTensor Output(Out) should be
Out = [[6, 1, 9, 6, 1, 0, 1]]^T,
with lod = [[0, 1, 3, 7]].
An example usage for this operator is to remove the special tokens when
computing the edit distance between two strings, such as blank, start token,
and end token.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(sequence_erase, ops::SequenceEraseOp,
ops::SequenceEraseOpMaker);
REGISTER_OP_CPU_KERNEL(
sequence_erase,
ops::SequenceEraseKernel<paddle::platform::CPUDeviceContext, int32_t>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include "paddle/operators/sequence_erase_op.h"
#include "paddle/platform/cuda_helper.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
using LoDTensor = framework::LoDTensor;
template <typename T>
__global__ void LabelErasedIdx(const T* in_dat, const int in_len,
const T* tokens, const int tokens_len,
int* num_erased) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < in_len) {
int erased = 0;
for (int i = 0; i < tokens_len; ++i) {
if (in_dat[index] == tokens[i]) {
erased = 1;
}
}
num_erased[index + 1] = erased;
if (index == 0) {
num_erased[0] = 0;
}
}
}
template <typename T>
__global__ void GetOutLod(const T* num_erased, const int* in_lod,
const int lod_len, int* out_lod0) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < lod_len) {
out_lod0[index] = in_lod[index] - num_erased[in_lod[index]];
}
}
template <typename T>
__global__ void SetOutput(const T* in_dat, const int in_len,
const int* num_erased, T* out_dat) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < in_len) {
if (in_dat[index] != in_dat[index + 1]) {
out_dat[index - num_erased[index]] = in_dat[index];
}
}
}
template <typename T>
class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<LoDTensor>("X");
auto* out = ctx.Output<LoDTensor>("Out");
auto lod = in->lod();
PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
PADDLE_ENFORCE_EQ(lod[0].back(), (size_t)in->numel(),
"The actual size mismatches with the LoD information.");
auto tokens = ctx.Attr<std::vector<T>>("tokens");
auto tokens_len = tokens.size();
auto in_len = in->numel();
auto in_dat = in->data<T>();
auto lod0 = lod[0];
thrust::host_vector<T> host_tokens(tokens_len);
for (size_t i = 0; i < tokens.size(); ++i) {
host_tokens[i] = tokens[i];
}
thrust::device_vector<T> dev_tokens = host_tokens;
thrust::device_vector<int> num_erased(in_len + 1);
T* dev_tokens_ptr = thrust::raw_pointer_cast(dev_tokens.data());
int* num_erased_ptr = thrust::raw_pointer_cast(num_erased.data());
auto stream = ctx.cuda_device_context().stream();
LabelErasedIdx<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
in_dat, in_len, dev_tokens_ptr, tokens_len, num_erased_ptr);
thrust::inclusive_scan(num_erased.begin() + 1, num_erased.end(),
num_erased.begin() + 1);
// Calc LoD
auto lod_len = lod0.size();
thrust::host_vector<int> host_lod(lod_len);
for (size_t i = 0; i < lod_len; ++i) {
host_lod[i] = lod0[i];
}
thrust::device_vector<int> dev_in_lod = host_lod;
thrust::device_vector<int> dev_out_lod(lod_len);
int* dev_in_lod_ptr = thrust::raw_pointer_cast(dev_in_lod.data());
int* dev_out_lod_ptr = thrust::raw_pointer_cast(dev_out_lod.data());
GetOutLod<<<(lod_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
num_erased_ptr, dev_in_lod_ptr, lod_len, dev_out_lod_ptr);
thrust::host_vector<int> host_out_lod = dev_out_lod;
std::vector<int> out_lod0(lod_len, 0);
for (size_t i = 0; i < lod_len; i++) {
out_lod0[i] = host_out_lod[i];
}
framework::LoD out_lod;
out_lod.push_back(out_lod0);
out->set_lod(out_lod);
// Set output
out->Resize({out_lod0.back(), 1});
auto out_dat = out->mutable_data<T>(ctx.GetPlace());
SetOutput<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_dat, in_len,
num_erased_ptr, out_dat);
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(sequence_erase,
paddle::operators::SequenceEraseOpCUDAKernel<int32_t>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class SequenceEraseKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::LoDTensor>("X");
auto* out = ctx.Output<framework::LoDTensor>("Out");
auto lod = in->lod();
PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
PADDLE_ENFORCE_EQ(lod[0].back(), (size_t)in->numel(),
"The actual size mismatches with the LoD information.");
auto tokens = ctx.Attr<std::vector<int>>("tokens");
auto in_len = in->numel();
auto in_dat = in->data<T>();
auto lod0 = lod[0];
std::vector<size_t> num_erased(in_len + 1, 0);
std::vector<size_t> out_lod0(1, 0);
for (size_t i = 0; i < lod0.size() - 1; ++i) {
size_t num_out = 0;
for (auto j = lod0[i] + 1; j <= lod0[i + 1]; ++j) {
num_erased[j] = num_erased[j - 1];
if (std::find(tokens.begin(), tokens.end(), in_dat[j - 1]) !=
tokens.end()) {
num_erased[j] += 1;
} else {
num_out += 1;
}
}
out_lod0.push_back(out_lod0.back() + num_out);
}
auto out_len = in_len - num_erased[in_len];
out->Resize({static_cast<int64_t>(out_len), 1});
auto out_dat = out->mutable_data<T>(ctx.GetPlace());
for (int64_t i = 0; i < in_len; ++i) {
if (num_erased[i] == num_erased[i + 1]) {
out_dat[i - num_erased[i]] = in_dat[i];
}
}
framework::LoD out_lod;
out_lod.push_back(out_lod0);
out->set_lod(out_lod);
}
};
} // namespace operators
} // namespace paddle
......@@ -58,7 +58,7 @@ def __bootstrap__():
read_env_flags = ['use_pinned_memory', 'check_nan_inf']
if core.is_compile_gpu():
read_env_flags.append('fraction_of_gpu_memory_to_use')
read_env_flags += ['fraction_of_gpu_memory_to_use', 'op_sync']
core.init_gflags([sys.argv[0]] +
["--tryfromenv=" + ",".join(read_env_flags)])
core.init_glog(sys.argv[0])
......
......@@ -236,6 +236,9 @@ class Variable(object):
__repr__ = __str__
def set_desc(self, input):
self.desc = input
@property
def persistable(self):
return self.desc.persistable()
......
......@@ -14,7 +14,7 @@ __all__ = [
'chunk_eval', 'sequence_conv', 'conv2d', 'sequence_pool', 'pool2d',
'batch_norm', 'beam_search_decode', 'conv2d_transpose', 'sequence_expand',
'lstm_unit', 'reduce_sum', 'reduce_mean', 'reduce_max', 'reduce_min',
'sequence_first_step', 'sequence_last_step'
'sequence_first_step', 'sequence_last_step', 'dropout'
]
......@@ -386,6 +386,21 @@ def cos_sim(X, Y, **kwargs):
return out
def dropout(x, dropout_prob, is_test=False, seed=0, **kwargs):
helper = LayerHelper('dropout', **kwargs)
out = helper.create_tmp_variable(dtype=x.dtype)
mask = helper.create_tmp_variable(dtype=x.dtype, stop_gradient=True)
helper.append_op(
type='dropout',
inputs={'X': [x]},
outputs={'Out': [out],
'Mask': [mask]},
attrs={'dropout_prob': dropout_prob,
'is_test': is_test,
'seed': seed})
return out
def cross_entropy(input, label, **kwargs):
"""
**Cross Entropy Layer**
......@@ -968,7 +983,7 @@ def batch_norm(input,
default_initializer=Constant(1.0))
bias = helper.create_parameter(
attr=helper.param_attr, shape=param_shape, dtype=dtype, is_bias=True)
attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
mean = helper.create_global_variable(
dtype=input.dtype,
......
from ..registry import register_layer
__activations__ = [
'abs',
'ceil',
'exp',
'floor',
'log',
'relu',
'round',
'sigmoid',
'sqrt',
'square',
'tanh',
'abs', 'tanh', 'sigmoid', 'relu', 'sqrt', 'ceil', 'floor', 'log', 'round'
]
__all__ = [
'mean',
'mul',
'dropout',
'reshape',
'scale',
'transpose',
......
......@@ -3,6 +3,17 @@ import framework
from framework import Program, default_main_program, Parameter, Variable
import backward
from backward import _rename_arg_
from . import core
dtype_to_size = {
core.DataType.FP16: 2,
core.DataType.FP32: 4,
core.DataType.FP64: 8,
core.DataType.INT16: 2,
core.DataType.INT32: 4,
core.DataType.INT64: 8,
core.DataType.BOOL: 1
}
class ControlFlowGraph(object):
......@@ -28,18 +39,33 @@ class ControlFlowGraph(object):
block_size = program_desc.num_blocks()
# TODO(qijun) handle Program with if/while operators
self.global_block = program_desc.block(0)
self.op_size = self.global_block.op_size()
self.global_block_desc = program_desc.block(0)
self.op_size = self.global_block_desc.op_size()
op_node_connections = [(i, i + 1) for i in range(self.op_size - 1)]
self._add_connections(op_node_connections)
self.ops = [self.global_block.op(i) for i in range(self.op_size)]
self.ops = [self.global_block_desc.op(i) for i in range(self.op_size)]
for i in range(self.op_size):
self._uses[i].update(self.ops[i].input_arg_names())
self._defs[i].update(self.ops[i].output_arg_names())
def _update_graph(self, old_name, new_name, begin_idx=0):
for i in range(begin_idx, self.op_size):
if old_name in self._uses[i]:
self._uses[i].remove(old_name)
self._uses[i].add(new_name)
if old_name in self._defs[i]:
self._defs[i].remove(old_name)
self._defs[i].add(new_name)
if old_name in self._live_in[i]:
self._live_in[i].remove(old_name)
self._live_out[i].add(new_name)
if old_name in self._live_out[i]:
self._live_out[i].remove(old_name)
self._live_out[i].add(new_name)
def _reach_fixed_point(self, live_in, live_out):
if len(live_in) != len(self._live_in):
return False
......@@ -79,30 +105,45 @@ class ControlFlowGraph(object):
self.pool = []
for i in range(self.op_size):
if self.pool:
out_pair = [(x, self.global_block.var(str(x)).shape())
out_pair = [(x, self.global_block_desc.var(str(x)).shape())
for x in self._defs[i]]
for x, x_shape in out_pair:
if not self.global_block_desc.var(str(x)).persistable():
for index, cache_pair in enumerate(self.pool):
cache_var = cache_pair[0]
cache_shape = cache_pair[1]
if x_shape == cache_shape:
x_dtype = self.global_block_desc.var(str(
x)).dtype()
cache_dtype = self.global_block_desc.var(
str(cache_var)).dtype()
# TODO(qijun): actually, we should compare dtype_to_size[x_dtype]
# and dtype_to_size[cache_dtype]
if x_dtype == cache_dtype:
print(
"Hit Cache !!!! cache pool index is %d, var name is %s, cached var name is %s, var shape is %s "
% (index, x, cache_var, str(cache_shape)))
%
(index, x, cache_var, str(cache_shape)))
self.pool.pop(index)
_rename_arg_(self.ops, x, cache_var, begin_idx=i)
self._dataflow_analyze()
_rename_arg_(
self.ops, x, cache_var, begin_idx=i)
self._program.current_block().var(str(
x)).desc = self.global_block_desc.var(
str(cache_var))
self._update_graph(
x, cache_var, begin_idx=i)
break
in_diff, out_diff = self._get_diff(self._live_in[i],
self._live_out[i])
can_optimize = filter(
lambda x: not self.global_block.var(str(x)).persistable(),
lambda x: not self.global_block_desc.var(str(x)).persistable(),
in_diff)
if can_optimize:
for var_name in can_optimize:
self.pool.append((
var_name, self.global_block.var(str(var_name)).shape()))
self.pool.append(
(var_name,
self.global_block_desc.var(str(var_name)).shape()))
def get_program(self):
return self._program
......
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import os
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(x=cost)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
BATCH_SIZE = 20
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.uci_housing.train(), buf_size=500),
batch_size=BATCH_SIZE)
place = fluid.CPUPlace()
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe = fluid.Executor(place)
t = fluid.DistributeTranspiler()
# all parameter server endpoints list for spliting parameters
pserver_endpoints = os.getenv("PSERVERS")
# server endpoint for current node
current_endpoint = os.getenv("SERVER_ENDPOINT")
# run as trainer or parameter server
training_role = os.getenv("TRAINING_ROLE",
"TRAINER") # get the training role: trainer/pserver
t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
if training_role == "PSERVER":
if not current_endpoint:
print("need env SERVER_ENDPOINT")
exit(1)
pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops)
exe.run(fluid.default_startup_program())
exe.run(pserver_prog)
else:
trainer_prog = t.get_trainer_program()
exe.run(fluid.default_startup_program())
PASS_NUM = 100
for pass_id in range(PASS_NUM):
fluid.io.save_persistables(exe, "./fit_a_line.model/")
fluid.io.load_persistables(exe, "./fit_a_line.model/")
for data in train_reader():
avg_loss_value, = exe.run(trainer_prog,
feed=feeder.feed(data),
fetch_list=[avg_cost])
if avg_loss_value[0] < 10.0:
exit(0)
exit(1)
import math
import numpy as np
import paddle.v2 as paddle
import paddle.v2.dataset.conll05 as conll05
import paddle.v2.fluid as fluid
import time
import os
word_dict, verb_dict, label_dict = conll05.get_dict()
word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
pred_len = len(verb_dict)
mark_dict_len = 2
word_dim = 32
mark_dim = 5
hidden_dim = 512
depth = 8
mix_hidden_lr = 1e-3
IS_SPARSE = True
PASS_NUM = 10
BATCH_SIZE = 20
embedding_name = 'emb'
def load_parameter(file_name, h, w):
with open(file_name, 'rb') as f:
f.read(16) # skip header.
return np.fromfile(f, dtype=np.float32).reshape(h, w)
def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
**ignored):
# 8 features
predicate_embedding = fluid.layers.embedding(
input=predicate,
size=[pred_len, word_dim],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr='vemb')
mark_embedding = fluid.layers.embedding(
input=mark,
size=[mark_dict_len, mark_dim],
dtype='float32',
is_sparse=IS_SPARSE)
word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
emb_layers = [
fluid.layers.embedding(
size=[word_dict_len, word_dim],
input=x,
param_attr=fluid.ParamAttr(
name=embedding_name, trainable=False)) for x in word_input
]
emb_layers.append(predicate_embedding)
emb_layers.append(mark_embedding)
hidden_0_layers = [
fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers
]
hidden_0 = fluid.layers.sums(input=hidden_0_layers)
lstm_0 = fluid.layers.dynamic_lstm(
input=hidden_0,
size=hidden_dim,
candidate_activation='relu',
gate_activation='sigmoid',
cell_activation='sigmoid')
# stack L-LSTM and R-LSTM with direct edges
input_tmp = [hidden_0, lstm_0]
for i in range(1, depth):
mix_hidden = fluid.layers.sums(input=[
fluid.layers.fc(input=input_tmp[0], size=hidden_dim),
fluid.layers.fc(input=input_tmp[1], size=hidden_dim)
])
lstm = fluid.layers.dynamic_lstm(
input=mix_hidden,
size=hidden_dim,
candidate_activation='relu',
gate_activation='sigmoid',
cell_activation='sigmoid',
is_reverse=((i % 2) == 1))
input_tmp = [mix_hidden, lstm]
feature_out = fluid.layers.sums(input=[
fluid.layers.fc(input=input_tmp[0], size=label_dict_len),
fluid.layers.fc(input=input_tmp[1], size=label_dict_len)
])
return feature_out
def to_lodtensor(data, place):
seq_lens = [len(seq) for seq in data]
cur_len = 0
lod = [cur_len]
for l in seq_lens:
cur_len += l
lod.append(cur_len)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
res = fluid.LoDTensor()
res.set(flattened_data, place)
res.set_lod([lod])
return res
def main():
# define network topology
word = fluid.layers.data(
name='word_data', shape=[1], dtype='int64', lod_level=1)
predicate = fluid.layers.data(
name='verb_data', shape=[1], dtype='int64', lod_level=1)
ctx_n2 = fluid.layers.data(
name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
ctx_n1 = fluid.layers.data(
name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
ctx_0 = fluid.layers.data(
name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
ctx_p1 = fluid.layers.data(
name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
ctx_p2 = fluid.layers.data(
name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
mark = fluid.layers.data(
name='mark_data', shape=[1], dtype='int64', lod_level=1)
feature_out = db_lstm(**locals())
target = fluid.layers.data(
name='target', shape=[1], dtype='int64', lod_level=1)
crf_cost = fluid.layers.linear_chain_crf(
input=feature_out,
label=target,
param_attr=fluid.ParamAttr(
name='crfw', learning_rate=mix_hidden_lr))
avg_cost = fluid.layers.mean(x=crf_cost)
# TODO(qiao)
# check other optimizers and check why out will be NAN
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001)
optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
# TODO(qiao)
# add dependency track and move this config before optimizer
crf_decode = fluid.layers.crf_decoding(
input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))
chunk_evaluator = fluid.evaluator.ChunkEvaluator(
input=crf_decode,
label=target,
chunk_scheme="IOB",
num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0)))
train_data = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.conll05.test(), buf_size=8192),
batch_size=BATCH_SIZE)
place = fluid.CPUPlace()
feeder = fluid.DataFeeder(
feed_list=[
word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target
],
place=place)
exe = fluid.Executor(place)
t = fluid.DistributeTranspiler()
pserver_endpoints = os.getenv("PSERVERS")
# server endpoint for current node
current_endpoint = os.getenv("SERVER_ENDPOINT")
# run as trainer or parameter server
training_role = os.getenv(
"TRAINING_ROLE", "TRAINER") # get the training role: trainer/pserver
t.transpile(
optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
if training_role == "PSERVER":
if not current_endpoint:
print("need env SERVER_ENDPOINT")
exit(1)
pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops)
exe.run(fluid.default_startup_program())
exe.run(pserver_prog)
elif training_role == "TRAINER":
trainer_prog = t.get_trainer_program()
start_time = time.time()
batch_id = 0
exe.run(fluid.default_startup_program())
embedding_param = fluid.global_scope().find_var(
embedding_name).get_tensor()
embedding_param.set(
load_parameter(conll05.get_embedding(), word_dict_len, word_dim),
place)
for pass_id in xrange(PASS_NUM):
chunk_evaluator.reset(exe)
for data in train_data():
cost, precision, recall, f1_score = exe.run(
trainer_prog,
feed=feeder.feed(data),
fetch_list=[avg_cost] + chunk_evaluator.metrics)
pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(
exe)
if batch_id % 10 == 0:
print("avg_cost:" + str(cost) + " precision:" + str(
precision) + " recall:" + str(recall) + " f1_score:" +
str(f1_score) + " pass_precision:" + str(
pass_precision) + " pass_recall:" + str(
pass_recall) + " pass_f1_score:" + str(
pass_f1_score))
if batch_id != 0:
print("second per batch: " + str((time.time(
) - start_time) / batch_id))
batch_id = batch_id + 1
if __name__ == '__main__':
main()
from __future__ import print_function
import os
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
hid_dim=32):
emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
conv_3 = fluid.nets.sequence_conv_pool(
input=emb,
num_filters=hid_dim,
filter_size=3,
act="tanh",
pool_type="sqrt")
conv_4 = fluid.nets.sequence_conv_pool(
input=emb,
num_filters=hid_dim,
filter_size=4,
act="tanh",
pool_type="sqrt")
prediction = fluid.layers.fc(input=[conv_3, conv_4],
size=class_dim,
act="softmax")
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(x=cost)
adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
optimize_ops, params_grads = adam_optimizer.minimize(avg_cost)
accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
return avg_cost, accuracy, accuracy.metrics[0], optimize_ops, params_grads
def to_lodtensor(data, place):
seq_lens = [len(seq) for seq in data]
cur_len = 0
lod = [cur_len]
for l in seq_lens:
cur_len += l
lod.append(cur_len)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
res = fluid.LoDTensor()
res.set(flattened_data, place)
res.set_lod([lod])
return res
def main():
BATCH_SIZE = 100
PASS_NUM = 5
word_dict = paddle.dataset.imdb.word_dict()
dict_dim = len(word_dict)
class_dim = 2
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
cost, accuracy, acc_out, optimize_ops, params_grads = convolution_net(
data, label, input_dim=dict_dim, class_dim=class_dim)
train_data = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.imdb.train(word_dict), buf_size=1000),
batch_size=BATCH_SIZE)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
t = fluid.DistributeTranspiler()
# all parameter server endpoints list for spliting parameters
pserver_endpoints = os.getenv("PSERVERS")
# server endpoint for current node
current_endpoint = os.getenv("SERVER_ENDPOINT")
# run as trainer or parameter server
training_role = os.getenv(
"TRAINING_ROLE", "TRAINER") # get the training role: trainer/pserver
t.transpile(
optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
exe.run(fluid.default_startup_program())
if training_role == "PSERVER":
if not current_endpoint:
print("need env SERVER_ENDPOINT")
exit(1)
pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops)
exe.run(pserver_prog)
elif training_role == "TRAINER":
trainer_prog = t.get_trainer_program()
feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
for pass_id in xrange(PASS_NUM):
accuracy.reset(exe)
for data in train_data():
cost_val, acc_val = exe.run(trainer_prog,
feed=feeder.feed(data),
fetch_list=[cost, acc_out])
pass_acc = accuracy.eval(exe)
print("cost=" + str(cost_val) + " acc=" + str(acc_val) +
" pass_acc=" + str(pass_acc))
if cost_val < 1.0 and pass_acc > 0.8:
exit(0)
else:
print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
if __name__ == '__main__':
main()
import unittest
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid as fluid
from paddle.v2.fluid.framework import Program
from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.backward import append_backward
import numpy as np
import paddle.v2.fluid.core as core
import numpy
class BaseParallelForTest(unittest.TestCase):
def run_test(self, callback, feed, fetch):
"""
Run the unittest for parallel.for
Args:
callback(callable): A callable function returns a generator. There
are two yields in the generator function. The first yield
returns the data layers, and the second yield returns the loss.
The modified data variables will be sent back during the first
yield.
feed(dict): The executor feeding dictionary.
fetch(list|basestr): The fetch name lists.
Returns:
None
Raises:
AssertionError when the computation of cpu, parallel.for in cpu,
gpu, parallel.for in gpu are different.
"""
cpu = fluid.CPUPlace()
result_cpu = self._run_test_impl_(
callback=callback,
feed=feed,
fetch=fetch,
place=cpu,
use_parallel=False)
result_cpu_parallel = self._run_test_impl_(
callback=callback,
feed=feed,
fetch=fetch,
place=cpu,
use_parallel=True)
if fluid.core.is_compile_gpu():
gpu = fluid.CUDAPlace(0)
result_gpu = self._run_test_impl_(
callback=callback,
feed=feed,
fetch=fetch,
place=gpu,
use_parallel=False)
result_gpu_parallel = self._run_test_impl_(
callback=callback,
feed=feed,
fetch=fetch,
place=gpu,
use_parallel=True)
self._assert_same_(fetch, result_cpu, result_cpu_parallel,
result_gpu, result_gpu_parallel)
else:
self._assert_same_(fetch, result_cpu, result_cpu_parallel)
class ParallelOpTest(unittest.TestCase):
def setUp(self):
x = layers.data(
shape=[-1, 30, 40],
dtype='float32',
name='x',
append_batch_size=False,
stop_gradient=False)
def _run_test_impl_(self, callback, feed, fetch, place, use_parallel=False):
"""
Run a single test, returns the fetch values
Args:
place(Place): the computation place.
use_parallel(bool): Whether use parallel.for or not.
places = layers.get_places(device_count=4)
pd = layers.ParallelDo(places=places)
Returns:
Fetched numpy arrays.
"""
if isinstance(fetch, basestring):
fetch = [fetch]
main = fluid.Program()
startup = fluid.Program()
# Fix seed
main.random_seed = 10
startup.random_seed = 10
with fluid.program_guard(main, startup):
generator = callback()
# Automatically insert parallel do if use_parallel = True
if use_parallel:
places = fluid.layers.get_places()
pd = fluid.layers.ParallelDo(places)
data = next(generator)
if isinstance(data, fluid.Variable):
data = [data]
with pd.do():
data = pd.read_input(x)
hidden = layers.fc(input=data, size=7)
pd.write_output(hidden)
data = pd()
loss = layers.mean(x=data)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(loss)
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
exe.run(fluid.default_main_program(),
feed={
x.name: np.random.uniform(0.1, 0.6,
(20, 30, 40)).astype("float32")
})
ins = map(pd.read_input, data)
if len(ins) == 1:
ins = ins[0]
loss = generator.send(ins) # patch input
pd.write_output(loss)
loss = pd()
else:
data = next(generator)
loss = generator.send(data)
self.assertIsNotNone(loss)
avg_loss = fluid.layers.mean(x=loss)
fluid.backward.append_backward(loss=avg_loss)
def test_forward(self):
pass
exe = fluid.Executor(place)
exe.run(startup)
return exe.run(main, feed=feed, fetch_list=fetch)
def _assert_same_(self, fetch, *args):
"""
Assert the return values of `run_test` are same.
Args:
fetch: Fetch list. Used for print error message
*args: The fetch result lists of each situations.
Returns:
None
Raises:
AssertionError
"""
def _impl_(a, b, fetch_id, item_id):
item_str = ['CPU', 'ParallelCPU', 'GPU', 'ParallelGPU']
flag = numpy.allclose(a, b, rtol=0.1)
self.assertTrue(flag, "The {0} are different in {1}".format(
fetch[fetch_id], item_str[item_id]))
for i, items in enumerate(zip(*args)):
self.assertGreater(len(items), 0)
for j in range(1, len(items)):
_impl_(items[0], items[j], fetch_id=i, item_id=j)
class ParallelOpTest(BaseParallelForTest):
def test_simple_fc(self):
def __network__():
x = fluid.layers.data(shape=[784], dtype='float32', name='img')
# FIXME: This is a bug of parallel.do
x.stop_gradient = False
x = yield x
hidden = fluid.layers.fc(input=x, size=200, param_attr='fc1.w')
loss = fluid.layers.mean(x=hidden)
yield loss
self.run_test(
callback=__network__,
feed={
'img':
numpy.random.random(size=(128 * 3, 784)).astype('float32')
},
fetch='fc1.w@GRAD')
if __name__ == '__main__':
......
import unittest
import numpy as np
from op_test import OpTest
def sequence_erase(in_seq, lod0, tokens):
new_lod0 = [0]
out_seq = []
for i in range(0, len(lod0) - 1):
num_out = 0
for dat in in_seq[lod0[i]:lod0[i + 1]]:
if dat not in tokens:
out_seq.append(dat)
num_out += 1
new_lod0.append(new_lod0[-1] + num_out)
return np.array(out_seq).astype("int32"), new_lod0
class TestSequenceEraseOp(OpTest):
def setUp(self):
self.op_type = "sequence_erase"
in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
lod = [[0, 9, 13, 24, 30]]
tokens = [2, 3, 5]
out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens)
self.attrs = {'tokens': tokens}
self.inputs = {'X': (in_seq, lod)}
self.outputs = {'Out': (out_seq, [new_lod0])}
def test_check_output(self):
self.check_output()
if __name__ == '__main__':
unittest.main()
FROM ubuntu:16.04
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
ARG UBUNTU_MIRROR
RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
# ENV variables
ARG ANDROID_ABI
ARG ANDROID_API
ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"}
ENV ANDROID_API=${ANDROID_API:-21}
ENV HOME=/root \
ANDROID_NDK_HOME=/opt/android-ndk-linux \
ANDROID_TOOLCHAINS_DIR=/opt/toolchains
RUN apt-get update && \
apt-get install -y \
git python-dev python-pip python-numpy \
wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \
apt-get clean -y
# Install Go and glide
RUN wget -qO- go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
tar -xz -C /usr/local && \
mkdir /root/gopath && \
mkdir /root/gopath/bin && \
mkdir /root/gopath/src
ENV GOROOT=/usr/local/go GOPATH=/root/gopath
# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
# git credential to skip password typing
RUN git config --global credential.helper store
# Fix locales to en_US.UTF-8
RUN localedef -i en_US -f UTF-8 en_US.UTF-8
RUN pip install --upgrade pip && \
pip install -U 'protobuf==3.1.0' && \
pip install -U wheel sphinx && \
pip install pre-commit
# Android NDK
RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \
mkdir -p /opt/android-ndk-tmp && \
cd /opt/android-ndk-tmp && \
wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \
unzip -q android-ndk-r14b-linux-x86_64.zip && \
mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
rm -rf /opt/android-ndk-tmp
CMD ["bash", "/paddle/paddle/scripts/docker/build_android.sh"]
# NOTE The manylinux1 policy mandates CentOS-5. We replace it with CentOS-6 in
# order to satisfy the build of capnproto library (a nupic.core dependency),
# which requires some headers and symbols not present on CentOS-5 (e.g.,
# signalfd.h, pipe2, O_NONBLOCK, SOCK_NONBLOCK, etc.). See
# https://github.com/sandstorm-io/capnproto/issues/350.
FROM nvidia/cuda:<baseimg>
MAINTAINER Numenta, based on the ManyLinux project
ENV LC_ALL en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8
ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH
ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib:${LD_LIBRARY_PATH}
ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
COPY build_scripts /build_scripts
RUN bash build_scripts/build.sh && rm -r build_scripts
ENV SSL_CERT_FILE=/opt/_internal/certs.pem
# for paddle
RUN wget --no-check-certificate -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
tar -xz -C /usr/local && \
mkdir /root/gopath && \
mkdir /root/gopath/bin && \
mkdir /root/gopath/src
ENV GOROOT=/usr/local/go GOPATH=/root/gopath
ENV PATH=${GOROOT}/bin:${GOPATH}/bin:${PATH}
# protobuf 3.1.0
RUN cd /opt && wget -q --no-check-certificate https://github.com/google/protobuf/releases/download/v3.1.0/protobuf-cpp-3.1.0.tar.gz && \
tar xzf protobuf-cpp-3.1.0.tar.gz && \
cd protobuf-3.1.0 && ./configure && make -j4 && make install && cd .. && rm -f protobuf-cpp-3.1.0.tar.gz
RUN yum install -y sqlite-devel zlib-devel openssl-devel boost boost-devel pcre-devel vim tk-devel tkinter libtool
RUN wget -O /root/requirements.txt https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt
RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install -r /root/requirements.txt && \
LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install -r /root/requirements.txt && \
go get github.com/Masterminds/glide && \
rm -rf /root/requirements.txt
RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python
RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \
cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz
RUN mkdir -p /src && cd /src && git clone https://github.com/NVIDIA/nccl.git nccl && cd nccl &&\
make -j `nproc` install <NCCL_MAKE_OPTS> && cd .. && rm -rf nccl
# buildtools
We release PaddlePaddle and PaddlePaddle Fluid as shared libraries,
which, we hope could be released as wheel packages on PyPI, so we need
to make sure that the build follows the
[manulinux1](https://www.python.org/dev/peps/pep-0513/) standard.
The manylinux standard suggests building Python modules on an old
system, because that a module would anyway depend on some shared
libraries, and Linux's shared library standard states that those built
with newer version compilers cannot work with those with older
versions. The suggested building environment is as old as CentOS 5.
However, PaddlePaddle relies on CUDA, and the earlies version of
[CentOS works with CUDA is 6](https://hub.docker.com/r/nvidia/cuda/).
So, here we provide a Docker image basing on CentOS 6 and CUDA for
building PaddlePaddle and making the release supports "as-manylinux as
possible." or "sufficiently many Linux" according to [this
discussion](https://mail.python.org/pipermail/wheel-builders/2016-July/000175.html).
The build output of our Docker image includes multiple wheel files --
some contain the CPU-only binary, some others support CUDA; some are
compatible with the cp27m Python ABI, some others with cp27.
To build these wheels, please run the following commands:
```bash
git clone https://github.com/paddlepaddle/paddle
cd paddle/tools/manylinux1
REPO=[yourrepo] ./build_all.sh
```
#!/bin/bash
set -xe
REPO="${REPO:-typhoon1986}"
# NOTE: version matches are determined!
sed 's/<baseimg>/7.5-cudnn5-devel-centos6/g' Dockerfile.x64 | \
sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52"/g'> Dockerfile.tmp
docker build -t ${REPO}/paddle_manylinux_devel:cuda7.5_cudnn5 -f Dockerfile.tmp .
docker push ${REPO}/paddle_manylinux_devel:cuda7.5_cudnn5
sed 's/<baseimg>/8.0-cudnn5-devel-centos6/g' Dockerfile.x64 | \
sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62"/g'> Dockerfile.tmp
docker build -t ${REPO}/paddle_manylinux_devel:cuda8.0_cudnn5 -f Dockerfile.tmp .
docker push ${REPO}/paddle_manylinux_devel:cuda8.0_cudnn5
sed 's/<baseimg>/8.0-cudnn7-devel-centos6/g' Dockerfile.x64 | \
sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62"/g'> Dockerfile.tmp
docker build -t ${REPO}/paddle_manylinux_devel:cuda8.0_cudnn7 -f Dockerfile.tmp .
docker push ${REPO}/paddle_manylinux_devel:cuda8.0_cudnn7
sed 's/<baseimg>/9.0-cudnn7-devel-centos6/g' Dockerfile.x64 | \
sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62 -gencode=arch=compute_70,code=sm_70"/g'> Dockerfile.tmp
docker build -t ${REPO}/paddle_manylinux_devel:cuda9.0_cudnn7 -f Dockerfile.tmp .
docker push ${REPO}/paddle_manylinux_devel:cuda9.0_cudnn7
#!/bin/bash
# Top-level build script called from Dockerfile
# Stop at any error, show all commands
set -ex
# Python versions to be installed in /opt/$VERSION_NO
# NOTE Only need python 2.7.11 for nupic.core/nupic.bindings at this time, so
# remove others to expedite build and reduce docker image size. The original
# manylinux docker image project builds many python versions.
# NOTE We added back 3.5.1, since auditwheel requires python 3.3+
CPYTHON_VERSIONS="2.7.11 3.5.1"
# openssl version to build, with expected sha256 hash of .tar.gz
# archive
OPENSSL_ROOT=openssl-1.0.2l
OPENSSL_HASH=ce07195b659e75f4e1db43552860070061f156a98bb37b672b101ba6e3ddf30c
EPEL_RPM_HASH=e5ed9ecf22d0c4279e92075a64c757ad2b38049bcf5c16c4f2b75d5f6860dc0d
DEVTOOLS_HASH=a8ebeb4bed624700f727179e6ef771dafe47651131a00a78b342251415646acc
PATCHELF_HASH=d9afdff4baeacfbc64861454f368b7f2c15c44d245293f7587bbf726bfe722fb
CURL_ROOT=curl-7.49.1
CURL_HASH=eb63cec4bef692eab9db459033f409533e6d10e20942f4b060b32819e81885f1
AUTOCONF_ROOT=autoconf-2.69
AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969
# Dependencies for compiling Python that we want to remove from
# the final image after compiling Python
PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel"
# Libraries that are allowed as part of the manylinux1 profile
MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel mesa-libGL-devel libICE-devel libSM-devel ncurses-devel"
# Get build utilities
MY_DIR=$(dirname "${BASH_SOURCE[0]}")
source $MY_DIR/build_utils.sh
# EPEL support
yum -y install wget curl
curl -sLO https://dl.fedoraproject.org/pub/epel/6/x86_64/epel-release-6-8.noarch.rpm
check_sha256sum epel-release-6-8.noarch.rpm $EPEL_RPM_HASH
# Dev toolset (for LLVM and other projects requiring C++11 support)
curl -sLO http://people.centos.org/tru/devtools-2/devtools-2.repo
check_sha256sum devtools-2.repo $DEVTOOLS_HASH
mv devtools-2.repo /etc/yum.repos.d/devtools-2.repo
rpm -Uvh --replacepkgs epel-release-6*.rpm
rm -f epel-release-6*.rpm
# Development tools and libraries
yum -y install bzip2 make git patch unzip bison yasm diffutils \
automake which file \
kernel-devel-`uname -r` \
devtoolset-2-binutils devtoolset-2-gcc \
devtoolset-2-gcc-c++ devtoolset-2-gcc-gfortran \
${PYTHON_COMPILE_DEPS}
# Install more recent version of cmake
# curl -O https://cmake.org/files/v3.8/cmake-3.8.1-Linux-x86_64.sh
# /bin/sh cmake-3.8.1-Linux-x86_64.sh --prefix=/usr/local --skip-license
# rm cmake-3.8.1-Linux-x86_64.sh
wget -q https://cmake.org/files/v3.5/cmake-3.5.2.tar.gz && tar xzf cmake-3.5.2.tar.gz && \
cd cmake-3.5.2 && ./bootstrap && \
make -j4 && make install && cd .. && rm cmake-3.5.2.tar.gz
# Install newest autoconf
build_autoconf $AUTOCONF_ROOT $AUTOCONF_HASH
autoconf --version
# Compile the latest Python releases.
# (In order to have a proper SSL module, Python is compiled
# against a recent openssl [see env vars above], which is linked
# statically. We delete openssl afterwards.)
build_openssl $OPENSSL_ROOT $OPENSSL_HASH
mkdir -p /opt/python
build_cpythons $CPYTHON_VERSIONS
PY35_BIN=/opt/python/cp35-cp35m/bin
# NOTE Since our custom manylinux image builds pythons with shared
# libpython, we need to add libpython's dir to LD_LIBRARY_PATH before running
# python.
ORIGINAL_LD_LIBRARY_PATH="${LD_LIBRARY_PATH}"
LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib"
# Our openssl doesn't know how to find the system CA trust store
# (https://github.com/pypa/manylinux/issues/53)
# And it's not clear how up-to-date that is anyway
# So let's just use the same one pip and everyone uses
LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install certifi
ln -s $($PY35_BIN/python -c 'import certifi; print(certifi.where())') \
/opt/_internal/certs.pem
# If you modify this line you also have to modify the versions in the
# Dockerfiles:
export SSL_CERT_FILE=/opt/_internal/certs.pem
# Install newest curl
build_curl $CURL_ROOT $CURL_HASH
rm -rf /usr/local/include/curl /usr/local/lib/libcurl* /usr/local/lib/pkgconfig/libcurl.pc
hash -r
curl --version
curl-config --features
# Now we can delete our built SSL
rm -rf /usr/local/ssl
# Install patchelf (latest with unreleased bug fixes)
curl -sLO https://nipy.bic.berkeley.edu/manylinux/patchelf-0.9njs2.tar.gz
check_sha256sum patchelf-0.9njs2.tar.gz $PATCHELF_HASH
tar -xzf patchelf-0.9njs2.tar.gz
(cd patchelf-0.9njs2 && ./configure && make && make install)
rm -rf patchelf-0.9njs2.tar.gz patchelf-0.9njs2
# Install latest pypi release of auditwheel
LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install auditwheel
ln -s $PY35_BIN/auditwheel /usr/local/bin/auditwheel
# Clean up development headers and other unnecessary stuff for
# final image
yum -y erase wireless-tools gtk2 libX11 hicolor-icon-theme \
avahi freetype bitstream-vera-fonts \
${PYTHON_COMPILE_DEPS} > /dev/null 2>&1
yum -y install ${MANYLINUX1_DEPS}
yum -y clean all > /dev/null 2>&1
yum list installed
# we don't need libpython*.a, and they're many megabytes
find /opt/_internal -name '*.a' -print0 | xargs -0 rm -f
# Strip what we can -- and ignore errors, because this just attempts to strip
# *everything*, including non-ELF files:
find /opt/_internal -type f -print0 \
| xargs -0 -n1 strip --strip-unneeded 2>/dev/null || true
# We do not need the Python test suites, or indeed the precompiled .pyc and
# .pyo files. Partially cribbed from:
# https://github.com/docker-library/python/blob/master/3.4/slim/Dockerfile
find /opt/_internal \
\( -type d -a -name test -o -name tests \) \
-o \( -type f -a -name '*.pyc' -o -name '*.pyo' \) \
-print0 | xargs -0 rm -f
for PYTHON in /opt/python/*/bin/python; do
# Add matching directory of libpython shared library to library lookup path
LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib"
# Smoke test to make sure that our Pythons work, and do indeed detect as
# being manylinux compatible:
LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib" $PYTHON $MY_DIR/manylinux1-check.py
# Make sure that SSL cert checking works
LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib" $PYTHON $MY_DIR/ssl-check.py
done
# Restore LD_LIBRARY_PATH
LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}"
#!/bin/bash
# Helper utilities for build
PYTHON_DOWNLOAD_URL=https://www.python.org/ftp/python
# XXX: the official https server at www.openssl.org cannot be reached
# with the old versions of openssl and curl in Centos 5.11 hence the fallback
# to the ftp mirror:
# OPENSSL_DOWNLOAD_URL=ftp://ftp.openssl.org/source
OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source
# Ditto the curl sources
CURL_DOWNLOAD_URL=http://curl.askapache.com/download
GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py
AUTOCONF_DOWNLOAD_URL=http://ftp.gnu.org/gnu/autoconf
function check_var {
if [ -z "$1" ]; then
echo "required variable not defined"
exit 1
fi
}
function lex_pyver {
# Echoes Python version string padded with zeros
# Thus:
# 3.2.1 -> 003002001
# 3 -> 003000000
echo $1 | awk -F "." '{printf "%03d%03d%03d", $1, $2, $3}'
}
function do_cpython_build {
local py_ver=$1
check_var $py_ver
local ucs_setting=$2
check_var $ucs_setting
tar -xzf Python-$py_ver.tgz
pushd Python-$py_ver
if [ "$ucs_setting" = "none" ]; then
unicode_flags=""
dir_suffix=""
else
local unicode_flags="--enable-unicode=$ucs_setting"
local dir_suffix="-$ucs_setting"
fi
local prefix="/opt/_internal/cpython-${py_ver}${dir_suffix}"
mkdir -p ${prefix}/lib
# -Wformat added for https://bugs.python.org/issue17547 on Python 2.6
# NOTE --enable-shared for generating libpython shared library needed for
# linking of some of the nupic.core test executables.
CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null
make -j2 > /dev/null
make install > /dev/null
popd
echo "ZZZ looking for libpython"
find / -name 'libpython*.so*'
rm -rf Python-$py_ver
# Some python's install as bin/python3. Make them available as
# bin/python.
if [ -e ${prefix}/bin/python3 ]; then
ln -s python3 ${prefix}/bin/python
fi
# NOTE Make libpython shared library visible to python calls below
LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python get-pip.py
LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel
local abi_tag=$(LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python ${MY_DIR}/python-tag-abi-tag.py)
ln -s ${prefix} /opt/python/${abi_tag}
}
function build_cpython {
local py_ver=$1
check_var $py_ver
check_var $PYTHON_DOWNLOAD_URL
wget -q $PYTHON_DOWNLOAD_URL/$py_ver/Python-$py_ver.tgz
if [ $(lex_pyver $py_ver) -lt $(lex_pyver 3.3) ]; then
# NOTE We only need wide unicode for nupic.bindings wheel
do_cpython_build $py_ver ucs2
do_cpython_build $py_ver ucs4
else
do_cpython_build $py_ver none
fi
rm -f Python-$py_ver.tgz
}
function build_cpythons {
check_var $GET_PIP_URL
curl -sLO $GET_PIP_URL
for py_ver in $@; do
build_cpython $py_ver
done
rm get-pip.py
}
function do_openssl_build {
./config no-ssl2 no-shared -fPIC --prefix=/usr/local/ssl > /dev/null
make > /dev/null
make install > /dev/null
}
function check_sha256sum {
local fname=$1
check_var ${fname}
local sha256=$2
check_var ${sha256}
echo "${sha256} ${fname}" > ${fname}.sha256
sha256sum -c ${fname}.sha256
rm ${fname}.sha256
}
function build_openssl {
local openssl_fname=$1
check_var ${openssl_fname}
local openssl_sha256=$2
check_var ${openssl_sha256}
check_var ${OPENSSL_DOWNLOAD_URL}
curl -sLO ${OPENSSL_DOWNLOAD_URL}/${openssl_fname}.tar.gz
check_sha256sum ${openssl_fname}.tar.gz ${openssl_sha256}
tar -xzf ${openssl_fname}.tar.gz
(cd ${openssl_fname} && do_openssl_build)
rm -rf ${openssl_fname} ${openssl_fname}.tar.gz
}
function do_curl_build {
LIBS=-ldl ./configure --with-ssl --disable-shared > /dev/null
make > /dev/null
make install > /dev/null
}
function build_curl {
local curl_fname=$1
check_var ${curl_fname}
local curl_sha256=$2
check_var ${curl_sha256}
check_var ${CURL_DOWNLOAD_URL}
curl -sLO ${CURL_DOWNLOAD_URL}/${curl_fname}.tar.bz2
check_sha256sum ${curl_fname}.tar.bz2 ${curl_sha256}
tar -jxf ${curl_fname}.tar.bz2
(cd ${curl_fname} && do_curl_build)
rm -rf ${curl_fname} ${curl_fname}.tar.bz2
}
function do_standard_install {
./configure > /dev/null
make > /dev/null
make install > /dev/null
}
function build_autoconf {
local autoconf_fname=$1
check_var ${autoconf_fname}
local autoconf_sha256=$2
check_var ${autoconf_sha256}
check_var ${AUTOCONF_DOWNLOAD_URL}
curl -sLO ${AUTOCONF_DOWNLOAD_URL}/${autoconf_fname}.tar.gz
check_sha256sum ${autoconf_fname}.tar.gz ${autoconf_sha256}
tar -zxf ${autoconf_fname}.tar.gz
(cd ${autoconf_fname} && do_standard_install)
rm -rf ${autoconf_fname} ${autoconf_fname}.tar.gz
}
# Logic copied from PEP 513
def is_manylinux1_compatible():
# Only Linux, and only x86-64 / i686
from distutils.util import get_platform
if get_platform() not in ["linux-x86_64", "linux-i686"]:
return False
# Check for presence of _manylinux module
try:
import _manylinux
return bool(_manylinux.manylinux1_compatible)
except (ImportError, AttributeError):
# Fall through to heuristic check below
pass
# Check glibc version. CentOS 5 uses glibc 2.5.
return have_compatible_glibc(2, 5)
def have_compatible_glibc(major, minimum_minor):
import ctypes
process_namespace = ctypes.CDLL(None)
try:
gnu_get_libc_version = process_namespace.gnu_get_libc_version
except AttributeError:
# Symbol doesn't exist -> therefore, we are not linked to
# glibc.
return False
# Call gnu_get_libc_version, which returns a string like "2.5".
gnu_get_libc_version.restype = ctypes.c_char_p
version_str = gnu_get_libc_version()
# py2 / py3 compatibility:
if not isinstance(version_str, str):
version_str = version_str.decode("ascii")
# Parse string and check against requested version.
version = [int(piece) for piece in version_str.split(".")]
assert len(version) == 2
if major != version[0]:
return False
if minimum_minor > version[1]:
return False
return True
import sys
if is_manylinux1_compatible():
print("%s is manylinux1 compatible" % (sys.executable, ))
sys.exit(0)
else:
print("%s is NOT manylinux1 compatible" % (sys.executable, ))
sys.exit(1)
# Utility script to print the python tag + the abi tag for a Python
# See PEP 425 for exactly what these are, but an example would be:
# cp27-cp27mu
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
print("{0}{1}-{2}".format(get_abbr_impl(), get_impl_ver(), get_abi_tag()))
# cf. https://github.com/pypa/manylinux/issues/53
GOOD_SSL = "https://google.com"
BAD_SSL = "https://self-signed.badssl.com"
import sys
print("Testing SSL certificate checking for Python:", sys.version)
if (sys.version_info[:2] < (2, 7) or sys.version_info[:2] < (3, 4)):
print("This version never checks SSL certs; skipping tests")
sys.exit(0)
if sys.version_info[0] >= 3:
from urllib.request import urlopen
EXC = OSError
else:
from urllib import urlopen
EXC = IOError
print("Connecting to %s should work" % (GOOD_SSL, ))
urlopen(GOOD_SSL)
print("...it did, yay.")
print("Connecting to %s should fail" % (BAD_SSL, ))
try:
urlopen(BAD_SSL)
# If we get here then we failed:
print("...it DIDN'T!!!!!11!!1one!")
sys.exit(1)
except EXC:
print("...it did, yay.")
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册