“068bfbb817611c856acd8c535de2b33a6126786c”上不存在“paddle/legacy/gserver/dataproviders/PyDataProvider2.cpp”
提交 a37f6ad3 编写于 作者: F fengjiayi

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into dev_elementwise_max_min

......@@ -364,6 +364,12 @@ split
.. autofunction:: paddle.v2.fluid.layers.split
:noindex:
matmul
------
.. autofunction:: paddle.v2.fluid.layers.matmul
:noindex:
logsigmoid
----------
.. autofunction:: paddle.v2.fluid.layers.logsigmoid
......
......@@ -25,3 +25,9 @@ glu
.. autofunction:: paddle.v2.fluid.nets.glu
:noindex:
dot_product_attention
---------------------
.. autofunction:: paddle.v2.fluid.nets.dot_product_attention
:noindex:
......@@ -19,7 +19,7 @@
### 基本使用概念
- 在PaddlePaddle内部,神经网络中一个计算层的输入/输出被组织为一个 `Argument` 结构体,如果神经网络有多个输入或者多个输入,每一个输入/输入都会对应有自己的`Argument`
- 在PaddlePaddle内部,神经网络中一个计算层的输入/输出被组织为一个 `Argument` 结构体,如果神经网络有多个输入或者多个输出,每一个输入/输出都会对应有自己的`Argument`
- `Argument` 并不真正“存储”数据,而是将输入/输出信息有机地组织在一起。
-`Argument`内部由`IVector`(对应着上文提到的一维整型数组)和`Matrix`(对应着上文提到的二维浮点型矩阵)来实际存储数据;由 `Sequence Start Positions` (下文详细解释) 来描述输入/输出的序列信息。
......
......@@ -135,6 +135,65 @@ bool operator==(const LoD &a, const LoD &b) {
return true;
}
bool CheckLoD(const LoD &in, int tensor_height) {
if (in.empty()) return true;
for (const auto &level : in) {
// check: there should be more than 2 offsets existing in each level.
if (level.size() < 2) return false;
// check: the first offset(the begin offset) of each level should be 0.
if (level.front() != 0) return false;
// check: all the offsets in a level should be ascending(no same items
// allows).
if (!std::is_sorted(level.begin(), level.begin(), [](size_t a, size_t b) {
if (a < b) return true;
return false;
})) {
LOG(INFO) << "ascending error";
return false;
}
}
// check: the lowest level's last offset should equals `tensor_height` if
// tensor_height>0.
if (tensor_height > 0 && (size_t)tensor_height != in.back().back())
return false;
// check: the higher level's last offset should equals the lower level's
// size-1.
// NOTE LoD store the levels from top to bottom, so the higher level goes
// first.
for (size_t level = 0; level < in.size() - 1; level++) {
if (in[level].back() != in[level + 1].size() - 1) return false;
}
return true;
}
bool CheckAbsLoD(const LoD &in, int tensor_height) {
if (in.empty()) return true;
for (const auto &level : in) {
// check: all the offsets in a level should be ascending(no same items
// allows).
if (!std::is_sorted(level.begin(), level.begin(), [](size_t a, size_t b) {
if (a < b) return true;
return false;
})) {
return false;
}
// check: there should be more than 2 offsets existing in each level.
if (level.size() < 2) return false;
// check: the first offset of each level should be 0, and the last should be
// the same(the height of underlying tensor).
if (level.front() != 0) return false;
if (tensor_height < 0) {
tensor_height = level.back();
} else if ((size_t)tensor_height != level.back()) {
return false;
}
}
return true;
}
using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>;
LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx,
size_t end_idx, size_t start_level) {
......
......@@ -71,6 +71,38 @@ LoD ToAbsOffset(const LoD& in);
bool operator==(const LoD& a, const LoD& b);
/*
* Check whether this lod's format is valid.
*
* ATTENTION:
* - Empty lod is treated as valid.
*
* It will check two things:
*
* 1. all the offsets in a level should be ascending(no same items allows).
* 2. there should be more than 2 offsets existing in each level.
* 3. the higher level's last offset should equals the lower level's size-1.
* 4. the first offset(the begin offset) of each level should be 0.
* 5. the lowest level's last offset should equals `tensor_height` if
* tensor_height>0.
*/
bool CheckLoD(const LoD& in, int tensor_height = -1);
/*
* Check whether this absolute lod's format is valid.
*
* ATTENTION:
* - Empty lod is treated as valid.
*
* It will check two things:
* 1. all the offsets in a level should be ascending(no same items allows)
* 2. there should be more than 2 offsets existing in each level.
* 3. the first offset of each level should be 0, and the last should be the
* same(the height of underlying tensor) or `tensor_height` if
* tensor_height>0.
*/
bool CheckAbsLoD(const LoD& in, int tensor_height = -1);
/*
* LoDTensor (Level of details Tensor)
* see https://en.wikipedia.org/wiki/Level_of_details for reference.
......
......@@ -37,36 +37,6 @@ namespace framework {
const int kLodTensorSize = 20 * 128;
class LoDTensorTester : public ::testing::Test {
public:
virtual void SetUp() override {
// tensor's batch_size: 30
// 3 levels
// 0 10 20
// 0 5 10 15 20
// 0 2 5 7 10 12 15 20
LoD lod;
lod.push_back(std::vector<size_t>{0, 2, 3});
lod.push_back(std::vector<size_t>{0, 2, 5, 8});
lod.push_back(std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20});
ASSERT_EQ(lod.size(), 3UL);
lod_tensor_.Resize({20 /*batch size*/, 128 /*dim*/});
// malloc memory
float* dst_ptr = lod_tensor_.mutable_data<float>(place);
for (int i = 0; i < kLodTensorSize; ++i) {
dst_ptr[i] = i;
}
lod_tensor_.set_lod(lod);
}
protected:
platform::CPUPlace place;
LoDTensor lod_tensor_;
};
TEST(LodExpand, test) {
LoD lod{{0, 2}};
LoDTensor tensor;
......@@ -144,5 +114,53 @@ TEST(LoD, ToAbsOffset) {
EXPECT_EQ(abs_lod, expected);
}
TEST(LoD, CheckLoD) {
LoD relative_lod;
relative_lod.push_back(std::vector<size_t>({0, 2}));
relative_lod.push_back(std::vector<size_t>({0, 1, 3}));
relative_lod.push_back(std::vector<size_t>({0, 2, 4, 5}));
// check compatible
ASSERT_TRUE(CheckLoD(relative_lod));
relative_lod[1].back()++;
ASSERT_FALSE(CheckLoD(relative_lod));
relative_lod[1].back()--; // recover it
// check empty
LoD empty_lod;
ASSERT_TRUE(CheckLoD(empty_lod));
// check less than 2 offsets in a level
LoD some_lod0;
some_lod0.push_back(std::vector<size_t>({0}));
ASSERT_FALSE(CheckLoD(some_lod0));
// check with underlying tensor storage.
ASSERT_TRUE(CheckLoD(relative_lod, 5));
ASSERT_FALSE(CheckLoD(relative_lod, 9));
}
TEST(LoD, CheckAbsLoD) {
LoD relative_lod;
relative_lod.push_back(std::vector<size_t>({0, 2}));
relative_lod.push_back(std::vector<size_t>({0, 1, 3}));
relative_lod.push_back(std::vector<size_t>({0, 2, 4, 5}));
auto abs_lod = ToAbsOffset(relative_lod);
ASSERT_TRUE(CheckAbsLoD(abs_lod));
// check less than 2 offsets in a level.
// check the last item should be compatible with tensor height.
abs_lod.back().back()++;
ASSERT_FALSE(CheckAbsLoD(abs_lod));
abs_lod.back().back()--; // restore
// check less than 2 offsets in a lod.
LoD abs_lod0;
abs_lod0.push_back(std::vector<size_t>({0}));
ASSERT_FALSE(CheckAbsLoD(abs_lod0));
}
} // namespace framework
} // namespace paddle
......@@ -177,16 +177,16 @@ class OpKernelRegistrar : public Registrar {
/**
* Macro to register OperatorKernel.
*/
#define REGISTER_OP_KERNEL(op_type, DEVICE_TYPE, place_class, ...) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
__reg_op_kernel_##op_type##_##DEVICE_TYPE##__, \
"REGISTER_OP_KERNEL must be called in global namespace"); \
static ::paddle::framework::OpKernelRegistrar<place_class, __VA_ARGS__> \
__op_kernel_registrar_##op_type##_##DEVICE_TYPE##__(#op_type, \
#DEVICE_TYPE); \
int TouchOpKernelRegistrar_##op_type##_##DEVICE_TYPE() { \
__op_kernel_registrar_##op_type##_##DEVICE_TYPE##__.Touch(); \
return 0; \
#define REGISTER_OP_KERNEL(op_type, LIBRARY_TYPE, place_class, ...) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
__reg_op_kernel_##op_type##_##LIBRARY_TYPE##__, \
"REGISTER_OP_KERNEL must be called in global namespace"); \
static ::paddle::framework::OpKernelRegistrar<place_class, __VA_ARGS__> \
__op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__(#op_type, \
#LIBRARY_TYPE); \
int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE() { \
__op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__.Touch(); \
return 0; \
}
#define REGISTER_OP_CUDA_KERNEL(op_type, ...) \
......@@ -208,14 +208,14 @@ class OpKernelRegistrar : public Registrar {
static int use_op_itself_##op_type##_ __attribute__((unused)) = \
TouchOpRegistrar_##op_type()
#define USE_OP_DEVICE_KERNEL(op_type, DEVICE_TYPE) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
__use_op_kernel_##op_type##_##DEVICE_TYPE##__, \
"USE_OP_DEVICE_KERNEL must be in global namespace"); \
extern int TouchOpKernelRegistrar_##op_type##_##DEVICE_TYPE(); \
static int use_op_kernel_##op_type##_##DEVICE_TYPE##_ \
__attribute__((unused)) = \
TouchOpKernelRegistrar_##op_type##_##DEVICE_TYPE()
#define USE_OP_DEVICE_KERNEL(op_type, LIBRARY_TYPE) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
__use_op_kernel_##op_type##_##LIBRARY_TYPE##__, \
"USE_OP_DEVICE_KERNEL must be in global namespace"); \
extern int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE(); \
static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_ \
__attribute__((unused)) = \
TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE()
// TODO(fengjiayi): The following macros
// seems ugly, do we have better method?
......
......@@ -43,7 +43,7 @@ void MKLDNNConcatLayer::reshape(
channels_[0] = ic;
oc = ic;
for (size_t i = 1; i < inputLayers_.size(); i++) {
int batchsize, height, witdh;
int batchsize = 0, height = 0, witdh = 0;
reshapeInput(batchsize, height, witdh, i);
CHECK_EQ(bs, batchsize);
CHECK_EQ(ih, height);
......@@ -84,6 +84,7 @@ void MKLDNNConcatLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
bool has8c = false, has16c = false, hasnc = false;
for (size_t i = 0; i < inputs.size(); i++) {
resetInValue(inputs[i], nullptr, i, channels_[i]);
inputs[i]->downSpatial();
CHECK(inputs[i]);
auto dm = inputs[i]->getDims();
// inputs format can be different, but ndims must equal
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
......@@ -11,20 +11,6 @@
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
#edit-mode: -*- python -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
......@@ -11,20 +11,6 @@
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
#edit-mode: -*- python -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import numpy
import struct
import traceback
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from paddle.trainer.PyDataProvider2 import *
# Note that each config should has an independent provider
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import os
import sys
......
# edit-mode: -*- python -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from paddle.trainer_config_helpers import *
######################## data source ################################
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
......@@ -11,20 +11,6 @@
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
#!/usr/bin/env python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
#!/usr/bin/env python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
######################## data source ################################
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
......@@ -11,20 +11,6 @@
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
# edit-mode: -*- python -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
......@@ -11,20 +11,6 @@
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
# edit-mode: -*- python -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
......@@ -11,20 +11,6 @@
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
#edit-mode: -*- python -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import random
from paddle.trainer.PyDataProvider2 import *
......
......@@ -50,6 +50,7 @@ __all__ = [
'sequence_last_step',
'dropout',
'split',
'matmul',
]
......@@ -1597,3 +1598,73 @@ def split(input, num_or_sections, dim=-1):
'axis': dim
})
return outs
def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
"""
Applies matrix multipication to two tensors. Currently only rank 1 to rank
3 input tensors are supported.
The actual behavior depends on the shapes of :math:`x`, :math:`y` and the
flag values of :attr:`transpose_x`, :attr:`transpose_y`. Specifically:
- If a transpose flag is specified, the last two dimensions of the tensor
are transposed. If the tensor is rank-1 of shape :math:`[D]`, then for
:math:`x` it is treated as :math:`[1, D]` in nontransposed form and as
:math:`[D, 1]` in transposed form, whereas for :math:`y` it is the
opposite: It is treated as :math:`[D, 1]` in nontransposed form and as
:math:`[1, D]` in transposed form.
- After transpose, the two tensors are 2-D or 3-D and matrix multipication
performs in the following way.
- If both are 2-D, they are multiplied like conventional matrices.
- If either is 3-D, it is treated as a stack of matrices residing in the
last two dimensions and a batched matrix multiply supporting broadcast
applies on the two tensors.
Also note that if the raw tensor :math:`x` or :math:`y` is rank-1 and
nontransposed, the prepended or appended dimension :math:`1` will be
removed after matrix multipication.
Args:
x (Variable): The input variable which is a Tensor or LoDTensor.
y (Variable): The input variable which is a Tensor or LoDTensor.
transpose_x (bool): Whether to transpose :math:`x` before multiplication.
transpose_y (bool): Whether to transpose :math:`y` before multiplication.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: The product Tensor variable.
Examples:
.. code-block:: python
# Examples to clarify shapes of the inputs and output
# x: [B, M, K], y: [B, K, N]
fluid.layers.matmul(x, y) # out: [B, M, N]
# x: [B, M, K], y: [K, N]
fluid.layers.matmul(x, y) # out: [B, M, N]
# x: [B, M, K], y: [K]
fluid.layers.matmul(x, y) # out: [B, M]
# x: [M, K], y: [K, N]
fluid.layers.matmul(x, y) # out: [M, N]
# x: [K], y: [K]
fluid.layers.matmul(x, y) # out: [1]
# x: [M], y: [N]
fluid.layers.matmul(x, y, True, True) # out: [M, N]
"""
helper = LayerHelper('matmul', **locals())
assert max(
len(x.shape), len(y.shape)
) <= 3, 'Currently only rank 1 to rank 3 input tensors are supported.'
out = helper.create_tmp_variable(dtype=helper.input_dtype())
helper.append_op(
type='matmul',
inputs={'X': x,
'Y': y},
outputs={'Out': out},
attrs={'transpose_X': transpose_x,
'transpose_Y': transpose_y})
return out
......@@ -17,6 +17,7 @@ __all__ = [
"simple_img_conv_pool",
"sequence_conv_pool",
"glu",
"dot_product_attention",
]
......@@ -150,3 +151,55 @@ def glu(input, dim=-1):
act_b = layers.sigmoid(x=b)
out = layers.elementwise_mul(x=a, y=act_b)
return out
def dot_product_attention(querys, keys, values):
"""
The dot-product attention.
Attention mechanism can be seen as mapping a query and a set of key-value
pairs to an output. The output is computed as a weighted sum of the values,
where the weight assigned to each value is computed by a compatibility
function (dot-product here) of the query with the corresponding key.
The dot-product attention can be implemented through (batch) matrix
multipication as follows:
.. math::
Attention(Q, K, V)= softmax(QK^\mathrm{T})V
Refer to `Attention Is All You Need
<https://arxiv.org/pdf/1706.03762.pdf>`_.
Note that batch data containing sequences with different lengths is not
supported by this because of the (batch) matrix multipication.
Args:
query (Variable): The input variable which is a Tensor or LoDTensor.
key (Variable): The input variable which is a Tensor or LoDTensor.
value (Variable): The input variable which is a Tensor or LoDTensor.
Returns:
tuple: The Tensor variables representing the output and attention scores.
Examples:
.. code-block:: python
# Suppose q, k, v are tensor variables with the following shape:
# q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10]
out, attn_scores = fluid.nets.dot_product_attention(q, k, v)
out.shape # [3, 5, 10]
attn_scores.shape # [3, 5, 6]
"""
assert keys.shape[-2] == values.shape[
-2], 'The shapes of keys and values mismatch.'
assert querys.shape[-1] == keys.shape[
-1], 'The shapes of querys and keys mismatch.'
product = layers.matmul(x=querys, y=keys, transpose_y=True)
attn_scores = layers.reshape(
x=layers.reshape(
x=product, shape=[-1, product.shape[-1]], act='softmax'),
shape=product.shape)
out = layers.matmul(attn_scores, values)
return out, attn_scores
......@@ -96,18 +96,18 @@ class Generator(object):
self.outputs = {'Out': Out}
def test_check_output(self):
self.check_output(atol=1e-2)
self.check_output(atol=1e-3)
def test_check_grad_normal(self):
self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-3)
def test_check_grad_ignore_x(self):
self.check_grad(
['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
['Y'], 'Out', max_relative_error=1e-3, no_grad_set=set("X"))
def test_check_grad_ignore_y(self):
self.check_grad(
['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
['X'], 'Out', max_relative_error=1e-3, no_grad_set=set('Y'))
# Generate test cases for all possibilities
......
The examples in v1_api_demo are using v1_api currently, and will be upgraded to v2_api later.
Thus, v1_api_demo is a temporary directory. We decide not to maintain it and will delete it in future.
Please go to [PaddlePaddle/book](https://github.com/PaddlePaddle/book) and
[PaddlePaddle/models](https://github.com/PaddlePaddle/models) to learn PaddlePaddle.
output/
uniform_params/
cifar_params/
mnist_params/
*.png
.pydevproject
.project
*.log
*.pyc
data/mnist_data/
data/cifar-10-batches-py/
# Generative Adversarial Networks (GAN)
This demo implements GAN training described in the original GAN paper (https://arxiv.org/abs/1406.2661) and DCGAN (https://arxiv.org/abs/1511.06434).
The general training procedures are implemented in gan_trainer.py. The neural network configurations are specified in gan_conf.py (for synthetic data) and gan_conf_image.py (for image data).
In order to run the model, first download the corresponding data by running the shell script in ./data.
Then you can run the command below. The flag -d specifies the training data (cifar, mnist or uniform) and flag --useGpu specifies whether to use gpu for training (0 is cpu, 1 is gpu).
$python gan_trainer.py -d cifar --use_gpu 1
The generated images will be stored in ./cifar_samples/
The corresponding models will be stored in ./cifar_params/
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
tar zxf cifar-10-python.tar.gz
rm cifar-10-python.tar.gz
#!/usr/bin/env sh
# This script downloads the mnist data and unzips it.
set -e
DIR="$( cd "$(dirname "$0")" ; pwd -P )"
rm -rf "$DIR/mnist_data"
mkdir "$DIR/mnist_data"
cd "$DIR/mnist_data"
echo "Downloading..."
for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte
do
if [ ! -e $fname ]; then
wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz
gunzip ${fname}.gz
fi
done
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
mode = get_config_arg("mode", str, "generator")
assert mode in set([
"generator", "discriminator", "generator_training", "discriminator_training"
])
is_generator_training = mode == "generator_training"
is_discriminator_training = mode == "discriminator_training"
is_generator = mode == "generator"
is_discriminator = mode == "discriminator"
# The network structure below follows the ref https://arxiv.org/abs/1406.2661
# Here we used two hidden layers and batch_norm
print('mode=%s' % mode)
# the dim of the noise (z) as the input of the generator network
noise_dim = 10
# the dim of the hidden layer
hidden_dim = 10
# the dim of the generated sample
sample_dim = 2
settings(
batch_size=128,
learning_rate=1e-4,
learning_method=AdamOptimizer(beta1=0.5))
def discriminator(sample):
"""
discriminator ouputs the probablity of a sample is from generator
or real data.
The output has two dimenstional: dimension 0 is the probablity
of the sample is from generator and dimension 1 is the probabblity
of the sample is from real data.
"""
param_attr = ParamAttr(is_static=is_generator_training)
bias_attr = ParamAttr(
is_static=is_generator_training, initial_mean=1.0, initial_std=0)
hidden = fc_layer(
input=sample,
name="dis_hidden",
size=hidden_dim,
bias_attr=bias_attr,
param_attr=param_attr,
act=ReluActivation())
hidden2 = fc_layer(
input=hidden,
name="dis_hidden2",
size=hidden_dim,
bias_attr=bias_attr,
param_attr=param_attr,
act=LinearActivation())
hidden_bn = batch_norm_layer(
hidden2,
act=ReluActivation(),
name="dis_hidden_bn",
bias_attr=bias_attr,
param_attr=ParamAttr(
is_static=is_generator_training, initial_mean=1.0,
initial_std=0.02),
use_global_stats=False)
return fc_layer(
input=hidden_bn,
name="dis_prob",
size=2,
bias_attr=bias_attr,
param_attr=param_attr,
act=SoftmaxActivation())
def generator(noise):
"""
generator generates a sample given noise
"""
param_attr = ParamAttr(is_static=is_discriminator_training)
bias_attr = ParamAttr(
is_static=is_discriminator_training, initial_mean=1.0, initial_std=0)
hidden = fc_layer(
input=noise,
name="gen_layer_hidden",
size=hidden_dim,
bias_attr=bias_attr,
param_attr=param_attr,
act=ReluActivation())
hidden2 = fc_layer(
input=hidden,
name="gen_hidden2",
size=hidden_dim,
bias_attr=bias_attr,
param_attr=param_attr,
act=LinearActivation())
hidden_bn = batch_norm_layer(
hidden2,
act=ReluActivation(),
name="gen_layer_hidden_bn",
bias_attr=bias_attr,
param_attr=ParamAttr(
is_static=is_discriminator_training,
initial_mean=1.0,
initial_std=0.02),
use_global_stats=False)
return fc_layer(
input=hidden_bn,
name="gen_layer1",
size=sample_dim,
bias_attr=bias_attr,
param_attr=param_attr,
act=LinearActivation())
if is_generator_training:
noise = data_layer(name="noise", size=noise_dim)
sample = generator(noise)
if is_discriminator_training:
sample = data_layer(name="sample", size=sample_dim)
if is_generator_training or is_discriminator_training:
label = data_layer(name="label", size=1)
prob = discriminator(sample)
cost = cross_entropy(input=prob, label=label)
classification_error_evaluator(
input=prob, label=label, name=mode + '_error')
outputs(cost)
if is_generator:
noise = data_layer(name="noise", size=noise_dim)
outputs(generator(noise))
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
mode = get_config_arg("mode", str, "generator")
dataSource = get_config_arg("data", str, "mnist")
assert mode in set([
"generator", "discriminator", "generator_training", "discriminator_training"
])
is_generator_training = mode == "generator_training"
is_discriminator_training = mode == "discriminator_training"
is_generator = mode == "generator"
is_discriminator = mode == "discriminator"
# The network structure below follows the dcgan paper
# (https://arxiv.org/abs/1511.06434)
print('mode=%s' % mode)
# the dim of the noise (z) as the input of the generator network
noise_dim = 100
# the number of filters in the layer in generator/discriminator that is
# closet to the image
gf_dim = 64
df_dim = 64
if dataSource == "mnist":
sample_dim = 28 # image dim
c_dim = 1 # image color
else:
sample_dim = 32
c_dim = 3
s2, s4 = int(sample_dim / 2), int(sample_dim / 4),
s8, s16 = int(sample_dim / 8), int(sample_dim / 16)
settings(
batch_size=128,
learning_rate=2e-4,
learning_method=AdamOptimizer(beta1=0.5))
def conv_bn(input,
channels,
imgSize,
num_filters,
output_x,
stride,
name,
param_attr,
bias_attr,
param_attr_bn,
bn,
trans=False,
act=ReluActivation()):
"""
conv_bn is a utility function that constructs a convolution/deconv layer
with an optional batch_norm layer
:param bn: whether to use batch_norm_layer
:type bn: bool
:param trans: whether to use conv (False) or deconv (True)
:type trans: bool
"""
# calculate the filter_size and padding size based on the given
# imgSize and ouput size
tmp = imgSize - (output_x - 1) * stride
if tmp <= 1 or tmp > 5:
raise ValueError("conv input-output dimension does not fit")
elif tmp <= 3:
filter_size = tmp + 2
padding = 1
else:
filter_size = tmp
padding = 0
print(imgSize, output_x, stride, filter_size, padding)
if trans:
nameApx = "_convt"
else:
nameApx = "_conv"
if bn:
conv = img_conv_layer(
input,
filter_size=filter_size,
num_filters=num_filters,
name=name + nameApx,
num_channels=channels,
act=LinearActivation(),
groups=1,
stride=stride,
padding=padding,
bias_attr=bias_attr,
param_attr=param_attr,
shared_biases=True,
layer_attr=None,
filter_size_y=None,
stride_y=None,
padding_y=None,
trans=trans)
conv_bn = batch_norm_layer(
conv,
act=act,
name=name + nameApx + "_bn",
bias_attr=bias_attr,
param_attr=param_attr_bn,
use_global_stats=False)
return conv_bn
else:
conv = img_conv_layer(
input,
filter_size=filter_size,
num_filters=num_filters,
name=name + nameApx,
num_channels=channels,
act=act,
groups=1,
stride=stride,
padding=padding,
bias_attr=bias_attr,
param_attr=param_attr,
shared_biases=True,
layer_attr=None,
filter_size_y=None,
stride_y=None,
padding_y=None,
trans=trans)
return conv
def generator(noise):
"""
generator generates a sample given noise
"""
param_attr = ParamAttr(
is_static=is_discriminator_training, initial_mean=0.0, initial_std=0.02)
bias_attr = ParamAttr(
is_static=is_discriminator_training, initial_mean=0.0, initial_std=0.0)
param_attr_bn = ParamAttr(
is_static=is_discriminator_training, initial_mean=1.0, initial_std=0.02)
h1 = fc_layer(
input=noise,
name="gen_layer_h1",
size=s8 * s8 * gf_dim * 4,
bias_attr=bias_attr,
param_attr=param_attr,
act=LinearActivation())
h1_bn = batch_norm_layer(
h1,
act=ReluActivation(),
name="gen_layer_h1_bn",
bias_attr=bias_attr,
param_attr=param_attr_bn,
use_global_stats=False)
h2_bn = conv_bn(
h1_bn,
channels=gf_dim * 4,
output_x=s8,
num_filters=gf_dim * 2,
imgSize=s4,
stride=2,
name="gen_layer_h2",
param_attr=param_attr,
bias_attr=bias_attr,
param_attr_bn=param_attr_bn,
bn=True,
trans=True)
h3_bn = conv_bn(
h2_bn,
channels=gf_dim * 2,
output_x=s4,
num_filters=gf_dim,
imgSize=s2,
stride=2,
name="gen_layer_h3",
param_attr=param_attr,
bias_attr=bias_attr,
param_attr_bn=param_attr_bn,
bn=True,
trans=True)
return conv_bn(
h3_bn,
channels=gf_dim,
output_x=s2,
num_filters=c_dim,
imgSize=sample_dim,
stride=2,
name="gen_layer_h4",
param_attr=param_attr,
bias_attr=bias_attr,
param_attr_bn=param_attr_bn,
bn=False,
trans=True,
act=TanhActivation())
def discriminator(sample):
"""
discriminator ouputs the probablity of a sample is from generator
or real data.
The output has two dimenstional: dimension 0 is the probablity
of the sample is from generator and dimension 1 is the probabblity
of the sample is from real data.
"""
param_attr = ParamAttr(
is_static=is_generator_training, initial_mean=0.0, initial_std=0.02)
bias_attr = ParamAttr(
is_static=is_generator_training, initial_mean=0.0, initial_std=0.0)
param_attr_bn = ParamAttr(
is_static=is_generator_training, initial_mean=1.0, initial_std=0.02)
h0 = conv_bn(
sample,
channels=c_dim,
imgSize=sample_dim,
num_filters=df_dim,
output_x=s2,
stride=2,
name="dis_h0",
param_attr=param_attr,
bias_attr=bias_attr,
param_attr_bn=param_attr_bn,
bn=False)
h1_bn = conv_bn(
h0,
channels=df_dim,
imgSize=s2,
num_filters=df_dim * 2,
output_x=s4,
stride=2,
name="dis_h1",
param_attr=param_attr,
bias_attr=bias_attr,
param_attr_bn=param_attr_bn,
bn=True)
h2_bn = conv_bn(
h1_bn,
channels=df_dim * 2,
imgSize=s4,
num_filters=df_dim * 4,
output_x=s8,
stride=2,
name="dis_h2",
param_attr=param_attr,
bias_attr=bias_attr,
param_attr_bn=param_attr_bn,
bn=True)
return fc_layer(
input=h2_bn,
name="dis_prob",
size=2,
bias_attr=bias_attr,
param_attr=param_attr,
act=SoftmaxActivation())
if is_generator_training:
noise = data_layer(name="noise", size=noise_dim)
sample = generator(noise)
if is_discriminator_training:
sample = data_layer(name="sample", size=sample_dim * sample_dim * c_dim)
if is_generator_training or is_discriminator_training:
label = data_layer(name="label", size=1)
prob = discriminator(sample)
cost = cross_entropy(input=prob, label=label)
classification_error_evaluator(
input=prob, label=label, name=mode + '_error')
outputs(cost)
if is_generator:
noise = data_layer(name="noise", size=noise_dim)
outputs(generator(noise))
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import random
import numpy
import cPickle
import sys, os
from PIL import Image
from paddle.trainer.config_parser import parse_config
from paddle.trainer.config_parser import logger
import py_paddle.swig_paddle as api
import matplotlib.pyplot as plt
def plot2DScatter(data, outputfile):
'''
Plot the data as a 2D scatter plot and save to outputfile
data needs to be two dimensinoal
'''
x = data[:, 0]
y = data[:, 1]
logger.info("The mean vector is %s" % numpy.mean(data, 0))
logger.info("The std vector is %s" % numpy.std(data, 0))
heatmap, xedges, yedges = numpy.histogram2d(x, y, bins=50)
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
plt.clf()
plt.scatter(x, y)
plt.savefig(outputfile, bbox_inches='tight')
def CHECK_EQ(a, b):
assert a == b, "a=%s, b=%s" % (a, b)
def copy_shared_parameters(src, dst):
'''
copy the parameters from src to dst
:param src: the source of the parameters
:type src: GradientMachine
:param dst: the destination of the parameters
:type dst: GradientMachine
'''
src_params = [src.getParameter(i) for i in xrange(src.getParameterSize())]
src_params = dict([(p.getName(), p) for p in src_params])
for i in xrange(dst.getParameterSize()):
dst_param = dst.getParameter(i)
src_param = src_params.get(dst_param.getName(), None)
if src_param is None:
continue
src_value = src_param.getBuf(api.PARAMETER_VALUE)
dst_value = dst_param.getBuf(api.PARAMETER_VALUE)
CHECK_EQ(len(src_value), len(dst_value))
dst_value.copyFrom(src_value)
dst_param.setValueUpdated()
def print_parameters(src):
src_params = [src.getParameter(i) for i in xrange(src.getParameterSize())]
print "***************"
for p in src_params:
print "Name is %s" % p.getName()
print "value is %s \n" % p.getBuf(api.PARAMETER_VALUE).copyToNumpyArray(
)
def load_mnist_data(imageFile):
f = open(imageFile, "rb")
f.read(16)
# Define number of samples for train/test
if "train" in imageFile:
n = 60000
else:
n = 10000
data = numpy.fromfile(f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28))
data = data / 255.0 * 2.0 - 1.0
f.close()
return data.astype('float32')
def load_cifar_data(cifar_path):
batch_size = 10000
data = numpy.zeros((5 * batch_size, 32 * 32 * 3), dtype="float32")
for i in range(1, 6):
file = cifar_path + "/data_batch_" + str(i)
fo = open(file, 'rb')
dict = cPickle.load(fo)
fo.close()
data[(i - 1) * batch_size:(i * batch_size), :] = dict["data"]
data = data / 255.0 * 2.0 - 1.0
return data
# synthesize 2-D uniform data
def load_uniform_data():
data = numpy.random.rand(1000000, 2).astype('float32')
return data
def merge(images, size):
if images.shape[1] == 28 * 28:
h, w, c = 28, 28, 1
else:
h, w, c = 32, 32, 3
img = numpy.zeros((h * size[0], w * size[1], c))
for idx in xrange(size[0] * size[1]):
i = idx % size[1]
j = idx // size[1]
img[j*h:j*h+h, i*w:i*w+w, :] = \
((images[idx, :].reshape((h, w, c), order="F").transpose(1, 0, 2) + 1.0) / 2.0 * 255.0)
return img.astype('uint8')
def save_images(images, path):
merged_img = merge(images, [8, 8])
if merged_img.shape[2] == 1:
im = Image.fromarray(numpy.squeeze(merged_img)).convert('RGB')
else:
im = Image.fromarray(merged_img, mode="RGB")
im.save(path)
def get_real_samples(batch_size, data_np):
return data_np[numpy.random.choice(
data_np.shape[0], batch_size, replace=False), :]
def get_noise(batch_size, noise_dim):
return numpy.random.normal(size=(batch_size, noise_dim)).astype('float32')
def get_fake_samples(generator_machine, batch_size, noise):
gen_inputs = api.Arguments.createArguments(1)
gen_inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(noise))
gen_outputs = api.Arguments.createArguments(0)
generator_machine.forward(gen_inputs, gen_outputs, api.PASS_TEST)
fake_samples = gen_outputs.getSlotValue(0).copyToNumpyMat()
return fake_samples
def get_training_loss(training_machine, inputs):
outputs = api.Arguments.createArguments(0)
training_machine.forward(inputs, outputs, api.PASS_TEST)
loss = outputs.getSlotValue(0).copyToNumpyMat()
return numpy.mean(loss)
def prepare_discriminator_data_batch_pos(batch_size, data_np):
real_samples = get_real_samples(batch_size, data_np)
labels = numpy.ones(batch_size, dtype='int32')
inputs = api.Arguments.createArguments(2)
inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(real_samples))
inputs.setSlotIds(1, api.IVector.createVectorFromNumpy(labels))
return inputs
def prepare_discriminator_data_batch_neg(generator_machine, batch_size, noise):
fake_samples = get_fake_samples(generator_machine, batch_size, noise)
labels = numpy.zeros(batch_size, dtype='int32')
inputs = api.Arguments.createArguments(2)
inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(fake_samples))
inputs.setSlotIds(1, api.IVector.createVectorFromNumpy(labels))
return inputs
def prepare_generator_data_batch(batch_size, noise):
label = numpy.ones(batch_size, dtype='int32')
inputs = api.Arguments.createArguments(2)
inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(noise))
inputs.setSlotIds(1, api.IVector.createVectorFromNumpy(label))
return inputs
def find(iterable, cond):
for item in iterable:
if cond(item):
return item
return None
def get_layer_size(model_conf, layer_name):
layer_conf = find(model_conf.layers, lambda x: x.name == layer_name)
assert layer_conf is not None, "Cannot find '%s' layer" % layer_name
return layer_conf.size
def main():
parser = argparse.ArgumentParser()
parser.add_argument("-d", "--data_source", help="mnist or cifar or uniform")
parser.add_argument(
"--use_gpu", default="1", help="1 means use gpu for training")
parser.add_argument("--gpu_id", default="0", help="the gpu_id parameter")
args = parser.parse_args()
data_source = args.data_source
use_gpu = args.use_gpu
assert data_source in ["mnist", "cifar", "uniform"]
assert use_gpu in ["0", "1"]
if not os.path.exists("./%s_samples/" % data_source):
os.makedirs("./%s_samples/" % data_source)
if not os.path.exists("./%s_params/" % data_source):
os.makedirs("./%s_params/" % data_source)
api.initPaddle('--use_gpu=' + use_gpu, '--dot_period=10',
'--log_period=100', '--gpu_id=' + args.gpu_id,
'--save_dir=' + "./%s_params/" % data_source)
if data_source == "uniform":
conf = "gan_conf.py"
num_iter = 10000
else:
conf = "gan_conf_image.py"
num_iter = 1000
gen_conf = parse_config(conf, "mode=generator_training,data=" + data_source)
dis_conf = parse_config(conf,
"mode=discriminator_training,data=" + data_source)
generator_conf = parse_config(conf, "mode=generator,data=" + data_source)
batch_size = dis_conf.opt_config.batch_size
noise_dim = get_layer_size(gen_conf.model_config, "noise")
if data_source == "mnist":
data_np = load_mnist_data("./data/mnist_data/train-images-idx3-ubyte")
elif data_source == "cifar":
data_np = load_cifar_data("./data/cifar-10-batches-py/")
else:
data_np = load_uniform_data()
# this creates a gradient machine for discriminator
dis_training_machine = api.GradientMachine.createFromConfigProto(
dis_conf.model_config)
# this create a gradient machine for generator
gen_training_machine = api.GradientMachine.createFromConfigProto(
gen_conf.model_config)
# generator_machine is used to generate data only, which is used for
# training discriminator
logger.info(str(generator_conf.model_config))
generator_machine = api.GradientMachine.createFromConfigProto(
generator_conf.model_config)
dis_trainer = api.Trainer.create(dis_conf, dis_training_machine)
gen_trainer = api.Trainer.create(gen_conf, gen_training_machine)
dis_trainer.startTrain()
gen_trainer.startTrain()
# Sync parameters between networks (GradientMachine) at the beginning
copy_shared_parameters(gen_training_machine, dis_training_machine)
copy_shared_parameters(gen_training_machine, generator_machine)
# constrain that either discriminator or generator can not be trained
# consecutively more than MAX_strike times
curr_train = "dis"
curr_strike = 0
MAX_strike = 5
for train_pass in xrange(100):
dis_trainer.startTrainPass()
gen_trainer.startTrainPass()
for i in xrange(num_iter):
# Do forward pass in discriminator to get the dis_loss
noise = get_noise(batch_size, noise_dim)
data_batch_dis_pos = prepare_discriminator_data_batch_pos(
batch_size, data_np)
dis_loss_pos = get_training_loss(dis_training_machine,
data_batch_dis_pos)
data_batch_dis_neg = prepare_discriminator_data_batch_neg(
generator_machine, batch_size, noise)
dis_loss_neg = get_training_loss(dis_training_machine,
data_batch_dis_neg)
dis_loss = (dis_loss_pos + dis_loss_neg) / 2.0
# Do forward pass in generator to get the gen_loss
data_batch_gen = prepare_generator_data_batch(batch_size, noise)
gen_loss = get_training_loss(gen_training_machine, data_batch_gen)
if i % 100 == 0:
print "d_pos_loss is %s d_neg_loss is %s" % (dis_loss_pos,
dis_loss_neg)
print "d_loss is %s g_loss is %s" % (dis_loss, gen_loss)
# Decide which network to train based on the training history
# And the relative size of the loss
if (not (curr_train == "dis" and curr_strike == MAX_strike)) and \
((curr_train == "gen" and curr_strike == MAX_strike) or dis_loss > gen_loss):
if curr_train == "dis":
curr_strike += 1
else:
curr_train = "dis"
curr_strike = 1
dis_trainer.trainOneDataBatch(batch_size, data_batch_dis_neg)
dis_trainer.trainOneDataBatch(batch_size, data_batch_dis_pos)
copy_shared_parameters(dis_training_machine,
gen_training_machine)
else:
if curr_train == "gen":
curr_strike += 1
else:
curr_train = "gen"
curr_strike = 1
gen_trainer.trainOneDataBatch(batch_size, data_batch_gen)
# TODO: add API for paddle to allow true parameter sharing between different GradientMachines
# so that we do not need to copy shared parameters.
copy_shared_parameters(gen_training_machine,
dis_training_machine)
copy_shared_parameters(gen_training_machine, generator_machine)
dis_trainer.finishTrainPass()
gen_trainer.finishTrainPass()
# At the end of each pass, save the generated samples/images
fake_samples = get_fake_samples(generator_machine, batch_size, noise)
if data_source == "uniform":
plot2DScatter(fake_samples, "./%s_samples/train_pass%s.png" %
(data_source, train_pass))
else:
save_images(fake_samples, "./%s_samples/train_pass%s.png" %
(data_source, train_pass))
dis_trainer.finishTrain()
gen_trainer.finishTrain()
if __name__ == '__main__':
main()
data/raw_data
data/*.list
mnist_vgg_model
plot.png
train.log
*pyc
.ipynb_checkpoints
params.pkl
params.tar
params.tar.gz
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
"""
A very basic example for how to use current Raw SWIG API to train mnist network.
Current implementation uses Raw SWIG, which means the API call is directly \
passed to C++ side of Paddle.
The user api could be simpler and carefully designed.
"""
import random
import numpy as np
import paddle.v2 as paddle_v2
import py_paddle.swig_paddle as api
from paddle.trainer_config_helpers import *
from py_paddle import DataProviderConverter
from mnist_util import read_from_mnist
def init_parameter(network):
assert isinstance(network, api.GradientMachine)
for each_param in network.getParameters():
assert isinstance(each_param, api.Parameter)
array_size = len(each_param)
array = np.random.uniform(-1.0, 1.0, array_size).astype('float32')
each_param.getBuf(api.PARAMETER_VALUE).copyFromNumpyArray(array)
def generator_to_batch(generator, batch_size):
ret_val = list()
for each_item in generator:
ret_val.append(each_item)
if len(ret_val) == batch_size:
yield ret_val
ret_val = list()
if len(ret_val) != 0:
yield ret_val
class BatchPool(object):
def __init__(self, generator, batch_size):
self.data = list(generator)
self.batch_size = batch_size
def __call__(self):
random.shuffle(self.data)
for offset in xrange(0, len(self.data), self.batch_size):
limit = min(offset + self.batch_size, len(self.data))
yield self.data[offset:limit]
def input_order_converter(generator):
for each_item in generator:
yield each_item['pixel'], each_item['label']
def main():
api.initPaddle("-use_gpu=false", "-trainer_count=4") # use 4 cpu cores
optimizer = paddle_v2.optimizer.Adam(
learning_rate=1e-4,
batch_size=1000,
model_average=ModelAverage(average_window=0.5),
regularization=L2Regularization(rate=0.5))
# Create Local Updater. Local means not run in cluster.
# For a cluster training, here we can change to createRemoteUpdater
# in future.
updater = optimizer.create_local_updater()
assert isinstance(updater, api.ParameterUpdater)
# define network
images = paddle_v2.layer.data(
name='pixel', type=paddle_v2.data_type.dense_vector(784))
label = paddle_v2.layer.data(
name='label', type=paddle_v2.data_type.integer_value(10))
hidden1 = paddle_v2.layer.fc(input=images, size=200)
hidden2 = paddle_v2.layer.fc(input=hidden1, size=200)
inference = paddle_v2.layer.fc(input=hidden2,
size=10,
act=paddle_v2.activation.Softmax())
cost = paddle_v2.layer.classification_cost(input=inference, label=label)
# Create Simple Gradient Machine.
model_config = paddle_v2.layer.parse_network(cost)
m = api.GradientMachine.createFromConfigProto(model_config,
api.CREATE_MODE_NORMAL,
optimizer.enable_types())
# This type check is not useful. Only enable type hint in IDE.
# Such as PyCharm
assert isinstance(m, api.GradientMachine)
# Initialize Parameter by numpy.
init_parameter(network=m)
# Initialize ParameterUpdater.
updater.init(m)
# DataProvider Converter is a utility convert Python Object to Paddle C++
# Input. The input format is as same as Paddle's DataProvider.
converter = DataProviderConverter(input_types=[images.type, label.type])
train_file = './data/raw_data/train'
test_file = './data/raw_data/t10k'
# start gradient machine.
# the gradient machine must be started before invoke forward/backward.
# not just for training, but also for inference.
m.start()
# evaluator can print error rate, etc. It is a C++ class.
batch_evaluator = m.makeEvaluator()
test_evaluator = m.makeEvaluator()
# Get Train Data.
# TrainData will stored in a data pool. Currently implementation is not care
# about memory, speed. Just a very naive implementation.
train_data_generator = input_order_converter(read_from_mnist(train_file))
train_data = BatchPool(train_data_generator, 512)
# outArgs is Neural Network forward result. Here is not useful, just passed
# to gradient_machine.forward
outArgs = api.Arguments.createArguments(0)
for pass_id in xrange(2): # we train 2 passes.
updater.startPass()
for batch_id, data_batch in enumerate(train_data()):
# data_batch is input images.
# here, for online learning, we could get data_batch from network.
# Start update one batch.
pass_type = updater.startBatch(len(data_batch))
# Start BatchEvaluator.
# batch_evaluator can be used between start/finish.
batch_evaluator.start()
# forwardBackward is a shortcut for forward and backward.
# It is sometimes faster than invoke forward/backward separately,
# because in GradientMachine, it may be async.
m.forwardBackward(converter(data_batch), outArgs, pass_type)
for each_param in m.getParameters():
updater.update(each_param)
# Get cost. We use numpy to calculate total cost for this batch.
cost_vec = outArgs.getSlotValue(0)
cost_vec = cost_vec.copyToNumpyMat()
cost = cost_vec.sum() / len(data_batch)
# Make evaluator works.
m.eval(batch_evaluator)
# Print logs.
print 'Pass id', pass_id, 'Batch id', batch_id, 'with cost=', \
cost, batch_evaluator
batch_evaluator.finish()
# Finish batch.
# * will clear gradient.
# * ensure all values should be updated.
updater.finishBatch(cost)
# testing stage. use test data set to test current network.
updater.apply()
test_evaluator.start()
test_data_generator = input_order_converter(read_from_mnist(test_file))
for data_batch in generator_to_batch(test_data_generator, 512):
# in testing stage, only forward is needed.
m.forward(converter(data_batch), outArgs, api.PASS_TEST)
m.eval(test_evaluator)
# print error rate for test data set
print 'Pass', pass_id, ' test evaluator: ', test_evaluator
test_evaluator.finish()
updater.restore()
updater.catchUpWith()
params = m.getParameters()
for each_param in params:
assert isinstance(each_param, api.Parameter)
value = each_param.getBuf(api.PARAMETER_VALUE)
value = value.copyToNumpyArray()
# Here, we could save parameter to every where you want
print each_param.getName(), value
updater.finishPass()
m.finish()
if __name__ == '__main__':
main()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
o = open("./" + "train.list", "w")
o.write("./data/raw_data/train" + "\n")
o.close()
o = open("./" + "test.list", "w")
o.write("./data/raw_data/t10k" + "\n")
o.close()
#!/usr/bin/env sh
# This scripts downloads the mnist data and unzips it.
set -e
DIR="$( cd "$(dirname "$0")" ; pwd -P )"
rm -rf "$DIR/raw_data"
mkdir "$DIR/raw_data"
cd "$DIR/raw_data"
echo "Downloading..."
for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte
do
if [ ! -e $fname ]; then
wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz
gunzip ${fname}.gz
fi
done
cd $DIR
rm -f *.list
python generate_list.py
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
is_predict = get_config_arg("is_predict", bool, False)
####################Data Configuration ##################
if not is_predict:
data_dir = './data/'
define_py_data_sources2(
train_list=data_dir + 'train.list',
test_list=data_dir + 'test.list',
module='mnist_provider',
obj='process')
######################Algorithm Configuration #############
settings(batch_size=50, learning_rate=0.001, learning_method=AdamOptimizer())
#######################Network Configuration #############
data_size = 1 * 28 * 28
label_size = 10
img = data_layer(name='pixel', size=data_size)
# light cnn
# A shallower cnn model: [CNN, BN, ReLU, Max-Pooling] x4 + FC x1
# Easier to train for mnist dataset and quite efficient
# Final performance is close to deeper ones on tasks such as digital and character classification
def light_cnn(input_image, num_channels, num_classes):
def __light__(ipt,
num_filter=128,
times=1,
conv_filter_size=3,
dropouts=0,
num_channels_=None):
return img_conv_group(
input=ipt,
num_channels=num_channels_,
pool_size=2,
pool_stride=2,
conv_padding=0,
conv_num_filter=[num_filter] * times,
conv_filter_size=conv_filter_size,
conv_act=ReluActivation(),
conv_with_batchnorm=True,
conv_batchnorm_drop_rate=dropouts,
pool_type=MaxPooling())
tmp = __light__(input_image, num_filter=128, num_channels_=num_channels)
tmp = __light__(tmp, num_filter=128)
tmp = __light__(tmp, num_filter=128)
tmp = __light__(tmp, num_filter=128, conv_filter_size=1)
tmp = fc_layer(input=tmp, size=num_classes, act=SoftmaxActivation())
return tmp
predict = light_cnn(input_image=img, num_channels=1, num_classes=label_size)
if not is_predict:
lbl = data_layer(name="label", size=label_size)
inputs(img, lbl)
outputs(classification_cost(input=predict, label=lbl))
else:
outputs(predict)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from paddle.trainer.PyDataProvider2 import *
from mnist_util import read_from_mnist
# Define a py data provider
@provider(
input_types={'pixel': dense_vector(28 * 28),
'label': integer_value(10)},
cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, filename): # settings is not used currently.
for each in read_from_mnist(filename):
yield each
import numpy
__all__ = ['read_from_mnist']
def read_from_mnist(filename):
imgf = filename + "-images-idx3-ubyte"
labelf = filename + "-labels-idx1-ubyte"
f = open(imgf, "rb")
l = open(labelf, "rb")
f.read(16)
l.read(8)
# Define number of samples for train/test
if "train" in filename:
n = 60000
else:
n = 10000
images = numpy.fromfile(
f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32')
images = images / 255.0 * 2.0 - 1.0
labels = numpy.fromfile(l, 'ubyte', count=n).astype("int")
for i in xrange(n):
yield {"pixel": images[i, :], 'label': labels[i]}
f.close()
l.close()
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
config=vgg_16_mnist.py
output=./mnist_vgg_model
log=train.log
paddle train \
--config=$config \
--dot_period=10 \
--log_period=100 \
--test_all_data_in_one_period=1 \
--use_gpu=0 \
--trainer_count=1 \
--num_passes=100 \
--save_dir=$output \
2>&1 | tee $log
paddle usage -l $log -e $? -n "mnist_train" >/dev/null 2>&1
python -m paddle.utils.plotcurve -i $log > plot.png
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
is_predict = get_config_arg("is_predict", bool, False)
####################Data Configuration ##################
if not is_predict:
data_dir = './data/'
define_py_data_sources2(
train_list=data_dir + 'train.list',
test_list=data_dir + 'test.list',
module='mnist_provider',
obj='process')
######################Algorithm Configuration #############
settings(
batch_size=128,
learning_rate=0.1 / 128.0,
learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * 128))
#######################Network Configuration #############
data_size = 1 * 28 * 28
label_size = 10
img = data_layer(name='pixel', size=data_size)
# small_vgg is predined in trainer_config_helpers.network
predict = small_vgg(input_image=img, num_channels=1, num_classes=label_size)
if not is_predict:
lbl = data_layer(name="label", size=label_size)
inputs(img, lbl)
outputs(classification_cost(input=predict, label=lbl))
else:
outputs(predict)
#!/bin/env python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Example:
python extract_para.py --preModel PREMODEL --preDict PREDICT \
--usrModel USRMODEL --usrDict USRDICT -d DIM
Options:
-h, --help show this help message and exit
--preModel PREMODEL the name of pretrained embedding model
--preDict PREDICT the name of pretrained dictionary
--usrModel usrModel the name of output usr embedding model
--usrDict usrDict the name of user specified dictionary
-d DIM dimension of parameter
"""
from optparse import OptionParser
import struct
def get_row_index(preDict, usrDict):
"""
Get the row positions for all words in user dictionary from pre-trained dictionary.
return: a list of row positions
Example: preDict='a\nb\nc\n', usrDict='a\nc\n', then return [0,2]
"""
pos = []
index = dict()
with open(preDict, "r") as f:
for line_index, line in enumerate(f):
word = line.strip().split()[0]
index[word] = line_index
with open(usrDict, "r") as f:
for line in f:
word = line.strip().split()[0]
pos.append(index[word])
return pos
def extract_parameters_by_usrDict(preModel, preDict, usrModel, usrDict,
paraDim):
"""
Extract desired parameters from a pretrained embedding model based on user dictionary
"""
if paraDim not in [32, 64, 128, 256]:
raise RuntimeError("We only support 32, 64, 128, 256 dimensions now")
fi = open(preModel, "rb")
fo = open(usrModel, "wb")
# write filehead
rowIndex = get_row_index(preDict, usrDict)
newHead = struct.pack("iil", 0, 4, len(rowIndex) * paraDim)
fo.write(newHead)
bytes = 4 * paraDim
for i in range(0, len(rowIndex)):
# find the absolute position of input file
fi.seek(rowIndex[i] * bytes + 16, 0)
fo.write(fi.read(bytes))
print "extract parameters finish, total", len(rowIndex), "lines"
fi.close()
def main():
"""
Main entry for running paraconvert.py
"""
usage = "usage: \n" \
"python %prog --preModel PREMODEL --preDict PREDICT" \
" --usrModel USRMODEL --usrDict USRDICT -d DIM"
parser = OptionParser(usage)
parser.add_option(
"--preModel",
action="store",
dest="preModel",
help="the name of pretrained embedding model")
parser.add_option(
"--preDict",
action="store",
dest="preDict",
help="the name of pretrained dictionary")
parser.add_option(
"--usrModel",
action="store",
dest="usrModel",
help="the name of output usr embedding model")
parser.add_option(
"--usrDict",
action="store",
dest="usrDict",
help="the name of user specified dictionary")
parser.add_option(
"-d", action="store", dest="dim", help="dimension of parameter")
(options, args) = parser.parse_args()
extract_parameters_by_usrDict(options.preModel, options.preDict,
options.usrModel, options.usrDict,
int(options.dim))
if __name__ == '__main__':
main()
#!/bin/env python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Example:
python paraconvert.py --b2t -i INPUT -o OUTPUT -d DIM
python paraconvert.py --t2b -i INPUT -o OUTPUT
Options:
-h, --help show this help message and exit
--b2t convert parameter file of embedding model from binary to text
--t2b convert parameter file of embedding model from text to binary
-i INPUT input parameter file name
-o OUTPUT output parameter file name
-d DIM dimension of parameter
"""
from optparse import OptionParser
import struct
def binary2text(input, output, paraDim):
"""
Convert a binary parameter file of embedding model to be a text file.
input: the name of input binary parameter file, the format is:
1) the first 16 bytes is filehead:
version(4 bytes): version of paddle, default = 0
floatSize(4 bytes): sizeof(float) = 4
paraCount(8 bytes): total number of parameter
2) the next (paraCount * 4) bytes is parameters, each has 4 bytes
output: the name of output text parameter file, for example:
0,4,32156096
-0.7845433,1.1937413,-0.1704215,...
0.0000909,0.0009465,-0.0008813,...
...
the format is:
1) the first line is filehead:
version=0, floatSize=4, paraCount=32156096
2) other lines print the paramters
a) each line prints paraDim paramters splitted by ','
b) there is paraCount/paraDim lines (embedding words)
paraDim: dimension of parameters
"""
fi = open(input, "rb")
fo = open(output, "w")
"""
"""
version, floatSize, paraCount = struct.unpack("iil", fi.read(16))
newHead = ','.join([str(version), str(floatSize), str(paraCount)])
print >> fo, newHead
bytes = 4 * int(paraDim)
format = "%df" % int(paraDim)
context = fi.read(bytes)
line = 0
while context:
numbers = struct.unpack(format, context)
lst = []
for i in numbers:
lst.append('%8.7f' % i)
print >> fo, ','.join(lst)
context = fi.read(bytes)
line += 1
fi.close()
fo.close()
print "binary2text finish, total", line, "lines"
def get_para_count(input):
"""
Compute the total number of embedding parameters in input text file.
input: the name of input text file
"""
numRows = 1
paraDim = 0
with open(input) as f:
line = f.readline()
paraDim = len(line.split(","))
for line in f:
numRows += 1
return numRows * paraDim
def text2binary(input, output, paddle_head=True):
"""
Convert a text parameter file of embedding model to be a binary file.
input: the name of input text parameter file, for example:
-0.7845433,1.1937413,-0.1704215,...
0.0000909,0.0009465,-0.0008813,...
...
the format is:
1) it doesn't have filehead
2) each line stores the same dimension of parameters,
the separator is commas ','
output: the name of output binary parameter file, the format is:
1) the first 16 bytes is filehead:
version(4 bytes), floatSize(4 bytes), paraCount(8 bytes)
2) the next (paraCount * 4) bytes is parameters, each has 4 bytes
"""
fi = open(input, "r")
fo = open(output, "wb")
newHead = struct.pack("iil", 0, 4, get_para_count(input))
fo.write(newHead)
count = 0
for line in fi:
line = line.strip().split(",")
for i in range(0, len(line)):
binary_data = struct.pack("f", float(line[i]))
fo.write(binary_data)
count += 1
fi.close()
fo.close()
print "text2binary finish, total", count, "lines"
def main():
"""
Main entry for running paraconvert.py
"""
usage = "usage: \n" \
"python %prog --b2t -i INPUT -o OUTPUT -d DIM \n" \
"python %prog --t2b -i INPUT -o OUTPUT"
parser = OptionParser(usage)
parser.add_option(
"--b2t",
action="store_true",
help="convert parameter file of embedding model from binary to text")
parser.add_option(
"--t2b",
action="store_true",
help="convert parameter file of embedding model from text to binary")
parser.add_option(
"-i", action="store", dest="input", help="input parameter file name")
parser.add_option(
"-o", action="store", dest="output", help="output parameter file name")
parser.add_option(
"-d", action="store", dest="dim", help="dimension of parameter")
(options, args) = parser.parse_args()
if options.b2t:
binary2text(options.input, options.output, options.dim)
if options.t2b:
text2binary(options.input, options.output)
if __name__ == '__main__':
main()
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
set -x
BASE_URL='http://paddlepaddle.cdn.bcebos.com/model_zoo/embedding'
DOWNLOAD_ITEMS=(baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb)
ITEM_MD5=(fa03a12321eaab6c30a8fcc9442eaea3
f88c8325ee6da6187f1080e8fe66c1cd
927cf70f27f860aff1a5703ebf7f1584
a52e43655cd25d279777ed509a1ae27b
b92c67fe9ff70fea53596080e351ac80)
for ((i=0; i<${#ITEM_MD5[@]}; i++))
do
FILENAME=${DOWNLOAD_ITEMS[${i}]}
REAL_MD5=`wget ${BASE_URL}/${FILENAME} -O - | tee ${FILENAME} | md5sum | cut -d ' ' -f 1`
EXPECTED_MD5=${ITEM_MD5[${i}]}
[ "${EXPECTED_MD5}" = "${REAL_MD5}" ]
done
fea_output/
features/
model.list
ResNet_50.dot
ResNet_50.png
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import cPickle
import logging
from PIL import Image
import numpy as np
from optparse import OptionParser
import paddle.utils.image_util as image_util
from py_paddle import swig_paddle, DataProviderConverter
from paddle.trainer.PyDataProvider2 import dense_vector
from paddle.trainer.config_parser import parse_config
logging.basicConfig(
format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
logging.getLogger().setLevel(logging.INFO)
class ImageClassifier():
def __init__(self,
train_conf,
model_dir=None,
resize_dim=256,
crop_dim=224,
use_gpu=True,
mean_file=None,
output_layer=None,
oversample=False,
is_color=True):
"""
train_conf: network configure.
model_dir: string, directory of model.
resize_dim: int, resized image size.
crop_dim: int, crop size.
mean_file: string, image mean file.
oversample: bool, oversample means multiple crops, namely five
patches (the four corner patches and the center
patch) as well as their horizontal reflections,
ten crops in all.
"""
self.train_conf = train_conf
self.model_dir = model_dir
if model_dir is None:
self.model_dir = os.path.dirname(train_conf)
self.resize_dim = resize_dim
self.crop_dims = [crop_dim, crop_dim]
self.oversample = oversample
self.is_color = is_color
self.output_layer = output_layer
if self.output_layer:
assert isinstance(self.output_layer, basestring)
self.output_layer = self.output_layer.split(",")
self.transformer = image_util.ImageTransformer(is_color=is_color)
self.transformer.set_transpose((2, 0, 1))
self.transformer.set_channel_swap((2, 1, 0))
self.mean_file = mean_file
if self.mean_file is not None:
mean = np.load(self.mean_file)['data_mean']
mean = mean.reshape(3, self.crop_dims[0], self.crop_dims[1])
self.transformer.set_mean(mean) # mean pixel
else:
# if you use three mean value, set like:
# this three mean value is calculated from ImageNet.
self.transformer.set_mean(np.array([103.939, 116.779, 123.68]))
conf_args = "is_test=1,use_gpu=%d,is_predict=1" % (int(use_gpu))
conf = parse_config(train_conf, conf_args)
swig_paddle.initPaddle("--use_gpu=%d" % (int(use_gpu)))
self.network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config)
assert isinstance(self.network, swig_paddle.GradientMachine)
self.network.loadParameters(self.model_dir)
data_size = 3 * self.crop_dims[0] * self.crop_dims[1]
slots = [dense_vector(data_size)]
self.converter = DataProviderConverter(slots)
def get_data(self, img_path):
"""
1. load image from img_path.
2. resize or oversampling.
3. transformer data: transpose, channel swap, sub mean.
return K x H x W ndarray.
img_path: image path.
"""
image = image_util.load_image(img_path, self.is_color)
# Another way to extract oversampled features is that
# cropping and averaging from large feature map which is
# calculated by large size of image.
# This way reduces the computation.
if self.oversample:
# image_util.resize_image: short side is self.resize_dim
image = image_util.resize_image(image, self.resize_dim)
image = np.array(image)
input = np.zeros(
(1, image.shape[0], image.shape[1], 3), dtype=np.float32)
input[0] = image.astype(np.float32)
input = image_util.oversample(input, self.crop_dims)
else:
image = image.resize(self.crop_dims, Image.ANTIALIAS)
input = np.zeros(
(1, self.crop_dims[0], self.crop_dims[1], 3), dtype=np.float32)
input[0] = np.array(image).astype(np.float32)
data_in = []
for img in input:
img = self.transformer.transformer(img).flatten()
data_in.append([img.tolist()])
# paddle input: [[[]],[[]],...], [[]] is one sample.
return data_in
def forward(self, input_data):
"""
return output arguments which are the Outputs() in network configure.
input_data: py_paddle input data.
call forward.
"""
in_arg = self.converter(input_data)
return self.network.forwardTest(in_arg)
def forward(self, data, output_layer):
"""
return output arguments which are the Outputs() in network configure.
input_data: py_paddle input data.
call forward.
"""
input = self.converter(data)
self.network.forwardTest(input)
output = self.network.getLayerOutputs(output_layer)
res = {}
if isinstance(output_layer, basestring):
output_layer = [output_layer]
for name in output_layer:
# For oversampling, average predictions across crops.
# If not, the shape of output[name]: (1, class_number),
# the mean is also applicable.
res[name] = output[name]['value'].mean(0)
return res
def predict(self, data_file):
"""
call forward and predicting.
data_file: input image list.
"""
image_files = open(data_file, 'rb').readlines()
results = {}
if self.output_layer is None:
self.output_layer = ["output"]
for line in image_files:
image = line.split()[0]
data = self.get_data(image)
prob = self.forward(data, self.output_layer)
lab = np.argsort(-prob[self.output_layer[0]])
results[image] = lab[0]
logging.info("Label of %s is: %d", image, lab[0])
return results
def extract(self, data_file, output_dir, batch_size=10000):
"""
extract and save features of output layers, which are
specify in Outputs() in network configure.
data_file: file name of input data.
output_dir: saved directory of extracted features.
batch_size: sample number of one batch file.
"""
if not os.path.exists(output_dir):
os.mkdir(output_dir)
sample_num = 0
batch_num = 0
image_feature = {}
image_files = open(data_file, 'rb').readlines()
for idx, line in enumerate(image_files):
image = line.split()[0]
data = self.get_data(image)
feature = self.forward(data, self.output_layer)
# save extracted features
file_name = image.split("/")[-1]
image_feature[file_name] = feature
sample_num += 1
if sample_num == batch_size:
batch_name = os.path.join(output_dir, 'batch_%d' % (batch_num))
self.save_file(image_feature, batch_name)
logging.info('Finish batch %d', batch_num)
batch_num += 1
sample_num = 0
image_feature = {}
if idx % 1000 == 0:
logging.info('%d/%d, %s', idx, len(image_files), file_name)
if sample_num > 0:
batch_name = os.path.join(output_dir, 'batch_%d' % (batch_num))
self.save_file(image_feature, batch_name)
logging.info('Finish batch %d', batch_num)
logging.info('Done: make image feature batch')
def save_file(self, data, file):
of = open(file, 'wb')
cPickle.dump(data, of, protocol=cPickle.HIGHEST_PROTOCOL)
def option_parser():
"""
Main entry for predciting
"""
usage = "%prog -c config -i data_list -w model_dir [options]"
parser = OptionParser(usage="usage: %s" % usage)
parser.add_option(
"-j",
"--job",
action="store",
dest="job_type",
help="job type: predict, extract\
predict: predicting,\
extract: extract features")
parser.add_option(
"-c",
"--conf",
action="store",
dest="train_conf",
help="network config")
parser.add_option(
"-i", "--data", action="store", dest="data_file", help="image list")
parser.add_option(
"-w",
"--model",
action="store",
dest="model_path",
default=None,
help="model path")
parser.add_option(
"-g",
"--use_gpu",
action="store",
dest="use_gpu",
default=True,
help="Whether to use gpu mode.")
parser.add_option(
"-o",
"--output_dir",
action="store",
dest="output_dir",
default="output",
help="output path")
parser.add_option(
"-m",
"--mean",
action="store",
dest="mean",
default=None,
help="mean file.")
parser.add_option(
"-p",
"--multi_crop",
action="store_true",
dest="multi_crop",
default=False,
help="Wether to use multiple crops on image.")
parser.add_option("-l", "--output_layer", action="store",
dest="output_layer", default=None,
help="--job=extract, specify layers to extract "\
"features, --job=predict, specify layer of "
"classification probability, output in resnet.py.")
return parser.parse_args()
def main():
"""
1. parse input arguments.
2. predicting or extract features according job type.
"""
options, args = option_parser()
obj = ImageClassifier(
options.train_conf,
options.model_path,
use_gpu=options.use_gpu,
mean_file=options.mean,
output_layer=options.output_layer,
oversample=options.multi_crop)
if options.job_type == "predict":
obj.predict(options.data_file)
elif options.job_type == "extract":
obj.extract(options.data_file, options.output_dir)
if __name__ == '__main__':
main()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.utils.image_util import *
from paddle.trainer.PyDataProvider2 import *
def hook(settings, image_size, crop_size, color, file_list, is_train, **kwargs):
"""
Description: Init with a list of data file
file_list is the name list of input files.
kwargs["load_data_args"] is the value of 'load_data_args'
which can be set in config.
Each args is separated by a column.
image_size: the crop image size.
mean_meta: the path of the meta file to store the mean image.
mean_value: can be mean value, not a file.
can not set mean_meta and mean_value at the same time.
color: 'color' means a color image. Otherwise, it means a gray image.
is_train: whether the data provider is used for training.
Data argumentation might be different for training and testing.
"""
settings.img_size = image_size
settings.crop_size = crop_size
settings.mean_img_size = settings.crop_size
settings.color = color # default is color
settings.is_train = is_train
settings.is_swap_channel = kwargs.get('swap_channel', None)
if settings.is_swap_channel is not None:
settings.swap_channel = settings.is_swap_channel
settings.is_swap_channel = True
if settings.color:
settings.img_input_size = settings.crop_size * settings.crop_size * 3
else:
settings.img_input_size = settings.crop_size * settings.crop_size
settings.file_list = file_list
settings.mean_meta = kwargs.get('mean_meta', None)
settings.mean_value = kwargs.get('mean_value', None)
# can not specify both mean_meta and mean_value.
assert not (settings.mean_meta and settings.mean_value)
if not settings.mean_meta:
settings.mean_value = kwargs.get('mean_value')
sz = settings.crop_size * settings.crop_size
settings.img_mean = np.zeros(sz * 3, dtype=np.single)
for idx, value in enumerate(settings.mean_value):
settings.img_mean[idx * sz:(idx + 1) * sz] = value
settings.img_mean = settings.img_mean.reshape(3, settings.crop_size,
settings.crop_size)
else:
settings.img_mean = load_meta(settings.mean_meta,
settings.mean_img_size,
settings.crop_size, settings.color)
settings.input_types = [
dense_vector(settings.img_input_size), # image feature
integer_value(1)
] # labels
settings.logger.info('Image short side: %s', settings.img_size)
settings.logger.info('Crop size: %s', settings.crop_size)
settings.logger.info('Meta path: %s', settings.mean_meta)
if settings.is_swap_channel:
settings.logger.info('swap channel: %s', settings.swap_channel)
settings.logger.info('DataProvider Initialization finished')
@provider(init_hook=hook, should_shuffle=False)
def processData(settings, file_list):
"""
The main function for loading data.
Load the batch, iterate all the images and labels in this batch.
file_name: the batch file name.
"""
img_path, lab = file_list.strip().split(' ')
img = Image.open(img_path)
img.load()
img = img.resize((settings.img_size, settings.img_size), Image.ANTIALIAS)
img = np.array(img).astype(np.float32)
if len(img.shape) == 3:
img = np.swapaxes(img, 1, 2)
img = np.swapaxes(img, 1, 0)
# swap channel
if settings.is_swap_channel:
img = img[settings.swap_channel, :, :]
img_feat = preprocess_img(img, settings.img_mean, settings.crop_size,
settings.is_train, settings.color)
yield img_feat.tolist(), int(lab.strip())
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
#set names of layer which you want to extract feature
#in Outputs() of resnet.py
#like: Outputs("res5_3_branch2c_conv", "res5_3_branch2c_bn")
layer_num=50
configure=./resnet.py
model_path=./model/resnet_$layer_num
fea_dir=fea_output
#Output is text file.
#Each line is one sample's features.
#If you set N layer names in Outputs()
#each line contains N features sperated by ";".
# create model list file.
model_list=./model.list
touch $model_list | echo $model_path > $model_list
paddle train \
--local=true \
--job=test \
--config=$configure \
--model_list=$model_list \
--use_gpu=1 \
--predict_output_dir=$fea_dir \
--config_args=is_test=1,layer_num=$layer_num
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
#Note if you use CPU mode, you need to set use_gpu=0 in classify.py. like this:
#conf_args = "is_test=0,use_gpu=1,is_predict=1"
#conf = parse_config(train_conf, conf_args)
#swig_paddle.initPaddle("--use_gpu=0")
python classify.py \
--job=extract \
--conf=resnet.py \
--use_gpu=1 \
--mean=model/mean_meta_224/mean.meta \
--model=model/resnet_50 \
--data=./example/test.list \
--output_layer="res5_3_branch2c_conv,res5_3_branch2c_bn" \
--output_dir=features
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
DIR="$( cd "$(dirname "$0")" ; pwd -P )"
cd $DIR
mkdir model
cd model
echo "Downloading ResNet models..."
for file in resnet_50.tar.gz resnet_101.tar.gz resnet_152.tar.gz mean_meta_224.tar.gz
do
wget http://paddlepaddle.bj.bcebos.com/model_zoo/imagenet/$file
tar -xvf $file
rm $file
done
echo "Done."
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import cPickle
import logging
logging.basicConfig(
format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
logging.getLogger().setLevel(logging.INFO)
def load_feature_c(file):
"""
Load feature extracted by C++ interface.
Return a list.
file: feature file.
"""
features = []
f = open(file, 'r')
for line in f:
sample = []
for slot in line.strip().split(";"):
fea = [float(val) for val in slot.strip().split()]
if fea:
sample.append(fea)
features.append(sample)
f.close()
return features
def load_feature_py(feature_dir):
"""
Load feature extracted by python interface.
Return a dictionary.
feature_dir: directory of feature file.
"""
file_list = os.listdir(feature_dir)
file_list = [os.path.join(feature_dir, f) for f in file_list]
features = {}
for file_name in file_list:
with open(file_name, 'rb') as f:
feature = cPickle.load(f)
features.update(feature)
logging.info('Load feature file %s', file_name)
return features
if __name__ == '__main__':
print load_feature_py(sys.argv[1])
#print load_feature_c(sys.argv[1])
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
:'
Visual deep residual network
1. Using make_model_diagram.py to generate dot file.
2. Using graphviz to convert dot file.
Usage:
./net_diagram.sh
'
set -e
DIR="$( cd "$(dirname "$0")" ; pwd -P )"
cd $DIR
img_type=png
img_fileprefix=ResNet_50
conf_filename=resnet.py
dot_filename=ResNet_50.dot
config_str="layer_num=50,data_provider=0"
python -m paddle.utils.make_model_diagram $conf_filename $dot_filename $config_str
# If you have installed graphviz, running like this:
# dot -Tpng -o ResNet.png ResNet.dot
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
python classify.py \
--job=predict \
--conf=resnet.py\
--model=model/resnet_50 \
--multi_crop \
--use_gpu=1 \
--data=./example/test.list
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
"""
paper: https://arxiv.org/abs/1512.03385
"""
is_test = get_config_arg("is_test", bool, False)
is_predict = get_config_arg("is_predict", bool, False)
data_provider = get_config_arg("data_provider", bool, True)
layer_num = get_config_arg("layer_num", int, 50)
if not is_predict and data_provider:
train_list = 'train.list' if not is_test else None
# mean.meta is mean file of ImageNet dataset.
# mean.meta size : 3 x 224 x 224.
# If you use three mean value, set like:
# "mean_value:103.939,116.779,123.68;"
args = {
'mean_meta': "model/mean_meta_224/mean.meta",
'image_size': 224,
'crop_size': 224,
'color': True,
'swap_channel:': [2, 1, 0]
}
define_py_data_sources2(
train_list,
'example/test.list',
module="example.image_list_provider",
obj="processData",
args=args)
batch_size = 1
learning_rate = 0.1 / batch_size
momentum = 0.9
weight_decay = 0.0001 * batch_size
default_momentum(momentum)
default_decay_rate(weight_decay)
Settings(
algorithm='sgd',
batch_size=batch_size,
learning_rate=learning_rate,
# set the appropriate parameters according your schedule
learning_method='momentum',
learning_rate_decay_a=0.5,
learning_rate_decay_b=1200000 * 10,
learning_rate_schedule="discexp", )
def conv_bn_layer(name,
input,
filter_size,
num_filters,
stride,
padding,
channels=None,
active_type=ReluActivation()):
"""
A wrapper for conv layer with batch normalization layers.
Note:
conv layer has no activation.
"""
tmp = img_conv_layer(
name=name + "_conv",
input=input,
filter_size=filter_size,
num_channels=channels,
num_filters=num_filters,
stride=stride,
padding=padding,
act=LinearActivation(),
bias_attr=False)
return batch_norm_layer(
name=name + "_bn", input=tmp, act=active_type, use_global_stats=is_test)
def bottleneck_block(name, input, num_filters1, num_filters2):
"""
A wrapper for bottlenect building block in ResNet.
Last conv_bn_layer has no activation.
Addto layer has activation of relu.
"""
last_name = conv_bn_layer(
name=name + '_branch2a',
input=input,
filter_size=1,
num_filters=num_filters1,
stride=1,
padding=0)
last_name = conv_bn_layer(
name=name + '_branch2b',
input=last_name,
filter_size=3,
num_filters=num_filters1,
stride=1,
padding=1)
last_name = conv_bn_layer(
name=name + '_branch2c',
input=last_name,
filter_size=1,
num_filters=num_filters2,
stride=1,
padding=0,
active_type=LinearActivation())
return addto_layer(
name=name + "_addto", input=[input, last_name], act=ReluActivation())
def mid_projection(name, input, num_filters1, num_filters2, stride=2):
"""
A wrapper for middile projection in ResNet.
projection shortcuts are used for increasing dimensions,
and other shortcuts are identity
branch1: projection shortcuts are used for increasing
dimensions, has no activation.
branch2x: bottleneck building block, shortcuts are identity.
"""
# stride = 2
branch1 = conv_bn_layer(
name=name + '_branch1',
input=input,
filter_size=1,
num_filters=num_filters2,
stride=stride,
padding=0,
active_type=LinearActivation())
last_name = conv_bn_layer(
name=name + '_branch2a',
input=input,
filter_size=1,
num_filters=num_filters1,
stride=stride,
padding=0)
last_name = conv_bn_layer(
name=name + '_branch2b',
input=last_name,
filter_size=3,
num_filters=num_filters1,
stride=1,
padding=1)
last_name = conv_bn_layer(
name=name + '_branch2c',
input=last_name,
filter_size=1,
num_filters=num_filters2,
stride=1,
padding=0,
active_type=LinearActivation())
return addto_layer(
name=name + "_addto", input=[branch1, last_name], act=ReluActivation())
def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3):
"""
A wrapper for 50,101,152 layers of ResNet.
res2_num: number of blocks stacked in conv2_x
res3_num: number of blocks stacked in conv3_x
res4_num: number of blocks stacked in conv4_x
res5_num: number of blocks stacked in conv5_x
"""
# For ImageNet
# conv1: 112x112
img = data_layer(name='input', size=224 * 224 * 3)
tmp = conv_bn_layer(
"conv1",
img,
filter_size=7,
channels=3,
num_filters=64,
stride=2,
padding=3)
tmp = img_pool_layer(name="pool1", input=tmp, pool_size=3, stride=2)
# conv2_x: 56x56
tmp = mid_projection(
name="res2_1", input=tmp, num_filters1=64, num_filters2=256, stride=1)
for i in xrange(2, res2_num + 1, 1):
tmp = bottleneck_block(
name="res2_" + str(i), input=tmp, num_filters1=64, num_filters2=256)
# conv3_x: 28x28
tmp = mid_projection(
name="res3_1", input=tmp, num_filters1=128, num_filters2=512)
for i in xrange(2, res3_num + 1, 1):
tmp = bottleneck_block(
name="res3_" + str(i),
input=tmp,
num_filters1=128,
num_filters2=512)
# conv4_x: 14x14
tmp = mid_projection(
name="res4_1", input=tmp, num_filters1=256, num_filters2=1024)
for i in xrange(2, res4_num + 1, 1):
tmp = bottleneck_block(
name="res4_" + str(i),
input=tmp,
num_filters1=256,
num_filters2=1024)
# conv5_x: 7x7
tmp = mid_projection(
name="res5_1", input=tmp, num_filters1=512, num_filters2=2048)
for i in xrange(2, res5_num + 1, 1):
tmp = bottleneck_block(
name="res5_" + str(i),
input=tmp,
num_filters1=512,
num_filters2=2048)
tmp = img_pool_layer(
name='avgpool',
input=tmp,
pool_size=7,
stride=1,
pool_type=AvgPooling())
output = fc_layer(
name='output', input=tmp, size=1000, act=SoftmaxActivation())
if not is_predict:
classification_cost(
input=output, label=data_layer(
name='label', size=1))
def res_net_50():
deep_res_net(3, 4, 6, 3)
def res_net_101():
deep_res_net(3, 4, 23, 3)
def res_net_152():
deep_res_net(3, 8, 36, 3)
if not is_predict:
Inputs("input", "label")
else:
Inputs("input")
# Outputs("cost-softmax" if not is_predict else "output")
Outputs("res5_3_branch2c_conv", "res5_3_branch2c_bn")
if layer_num == 50:
res_net_50()
elif layer_num == 101:
res_net_101()
elif layer_num == 152:
res_net_152()
else:
print("Wrong layer number.")
*.pyc
data/dict.txt
data/dict_all.txt
data/labels.list
data/mosesdecoder-master/
data/reviews_Electronics_5.json.gz
data/test.list
data/test.txt
data/train.list
data/train.txt
data/pred.list
data/pred.txt
dataprovider_copy_1.py
train.log
output
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os, sys
import numpy as np
from optparse import OptionParser
from py_paddle import swig_paddle, DataProviderConverter
from paddle.trainer.PyDataProvider2 import sparse_binary_vector
from paddle.trainer.config_parser import parse_config
"""
Usage: run following command to show help message.
python api_predict.py -h
"""
class QuickStartPrediction():
def __init__(self, train_conf, dict_file, model_dir=None, label_file=None):
"""
train_conf: trainer configure.
dict_file: word dictionary file name.
model_dir: directory of model.
"""
self.train_conf = train_conf
self.dict_file = dict_file
self.word_dict = {}
self.dict_dim = self.load_dict()
self.model_dir = model_dir
if model_dir is None:
self.model_dir = os.path.dirname(train_conf)
self.label = None
if label_file is not None:
self.load_label(label_file)
conf = parse_config(train_conf, "is_predict=1")
self.network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config)
self.network.loadParameters(self.model_dir)
input_types = [sparse_binary_vector(self.dict_dim)]
self.converter = DataProviderConverter(input_types)
def load_dict(self):
"""
Load dictionary from self.dict_file.
"""
for line_count, line in enumerate(open(self.dict_file, 'r')):
self.word_dict[line.strip().split('\t')[0]] = line_count
return len(self.word_dict)
def load_label(self, label_file):
"""
Load label.
"""
self.label = {}
for v in open(label_file, 'r'):
self.label[int(v.split('\t')[1])] = v.split('\t')[0]
def get_index(self, data):
"""
transform word into integer index according to the dictionary.
"""
words = data.strip().split()
word_slot = [self.word_dict[w] for w in words if w in self.word_dict]
return word_slot
def batch_predict(self, data_batch):
input = self.converter(data_batch)
output = self.network.forwardTest(input)
prob = output[0]["id"].tolist()
print("predicting labels is:")
print prob
def option_parser():
usage = "python predict.py -n config -w model_dir -d dictionary -i input_file "
parser = OptionParser(usage="usage: %s [options]" % usage)
parser.add_option(
"-n",
"--tconf",
action="store",
dest="train_conf",
help="network config")
parser.add_option(
"-d",
"--dict",
action="store",
dest="dict_file",
help="dictionary file")
parser.add_option(
"-b",
"--label",
action="store",
dest="label",
default=None,
help="dictionary file")
parser.add_option(
"-c",
"--batch_size",
type="int",
action="store",
dest="batch_size",
default=1,
help="the batch size for prediction")
parser.add_option(
"-w",
"--model",
action="store",
dest="model_path",
default=None,
help="model path")
return parser.parse_args()
def main():
options, args = option_parser()
train_conf = options.train_conf
batch_size = options.batch_size
dict_file = options.dict_file
model_path = options.model_path
label = options.label
swig_paddle.initPaddle("--use_gpu=0")
predict = QuickStartPrediction(train_conf, dict_file, model_path, label)
batch = []
labels = []
for line in sys.stdin:
[label, text] = line.split("\t")
labels.append(int(label))
batch.append([predict.get_index(text)])
print("labels is:")
print labels
predict.batch_predict(batch)
if __name__ == '__main__':
main()
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
#Note the default model is pass-00002, you shold make sure the model path
#exists or change the mode path.
#only test on trainer_config.lr.py
model=output/model/pass-00001/
config=trainer_config.lr.py
label=data/labels.list
dict=data/dict.txt
batch_size=20
head -n$batch_size data/test.txt | python api_predict.py \
--tconf=$config\
--model=$model \
--label=$label \
--dict=$dict \
--batch_size=$batch_size
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import itertools
import random
from paddle.trainer.config_parser import parse_config
from py_paddle import swig_paddle as api
from py_paddle import DataProviderConverter
from paddle.trainer.PyDataProvider2 \
import integer_value, integer_value_sequence, sparse_binary_vector
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument(
"--train_data", type=str, required=False, help="train data file")
parser.add_argument("--test_data", type=str, help="test data file")
parser.add_argument(
"--config", type=str, required=True, help="config file name")
parser.add_argument("--dict_file", required=True, help="dictionary file")
parser.add_argument(
"--seq", default=1, type=int, help="whether use sequence training")
parser.add_argument(
"--use_gpu", default=0, type=int, help="whether use GPU for training")
parser.add_argument(
"--trainer_count",
default=1,
type=int,
help="Number of threads for training")
parser.add_argument(
"--num_passes", default=5, type=int, help="Number of training passes")
return parser.parse_args()
UNK_IDX = 0
def load_data(file_name, word_dict):
with open(file_name, 'r') as f:
for line in f:
label, comment = line.strip().split('\t')
words = comment.split()
word_slot = [word_dict.get(w, UNK_IDX) for w in words]
yield word_slot, int(label)
def load_dict(dict_file):
word_dict = dict()
with open(dict_file, 'r') as f:
for i, line in enumerate(f):
w = line.strip().split()[0]
word_dict[w] = i
return word_dict
def main():
options = parse_arguments()
api.initPaddle("--use_gpu=%s" % options.use_gpu,
"--trainer_count=%s" % options.trainer_count)
word_dict = load_dict(options.dict_file)
train_dataset = list(load_data(options.train_data, word_dict))
if options.test_data:
test_dataset = list(load_data(options.test_data, word_dict))
else:
test_dataset = None
trainer_config = parse_config(options.config,
"dict_file=%s" % options.dict_file)
# No need to have data provider for trainer
trainer_config.ClearField('data_config')
trainer_config.ClearField('test_data_config')
# create a GradientMachine from the model configuratin
model = api.GradientMachine.createFromConfigProto(
trainer_config.model_config)
# create a trainer for the gradient machine
trainer = api.Trainer.create(trainer_config, model)
# create a data converter which converts data to PaddlePaddle
# internal format
input_types = [
integer_value_sequence(len(word_dict)) if options.seq else
sparse_binary_vector(len(word_dict)), integer_value(2)
]
converter = DataProviderConverter(input_types)
batch_size = trainer_config.opt_config.batch_size
trainer.startTrain()
for train_pass in xrange(options.num_passes):
trainer.startTrainPass()
random.shuffle(train_dataset)
for pos in xrange(0, len(train_dataset), batch_size):
batch = itertools.islice(train_dataset, pos, pos + batch_size)
size = min(batch_size, len(train_dataset) - pos)
trainer.trainOneDataBatch(size, converter(batch))
trainer.finishTrainPass()
if test_dataset:
trainer.startTestPeriod()
for pos in xrange(0, len(test_dataset), batch_size):
batch = itertools.islice(test_dataset, pos, pos + batch_size)
size = min(batch_size, len(test_dataset) - pos)
trainer.testOneDataBatch(size, converter(batch))
trainer.finishTestPeriod()
trainer.finishTrain()
if __name__ == '__main__':
main()
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
# Note: if using trainer_config.emb.py, trainer_config.cnn.py
# or trainer_config.lstm.py, you need to change --seq to --seq=1
# because they are sequence models.
python api_train.py \
--config=trainer_config.lr.py \
--trainer_count=2 \
--num_passes=15 \
--use_gpu=0 \
--seq=0 \
--train_data=data/train.txt \
--test_data=data/test.txt \
--dict_file=data/dict.txt \
2>&1 | tee 'train.log'
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
# Should run pserver.sh before run this script.
bin_dir=$(cd `dirname $0`; pwd)
home_dir=$(cd "${bin_dir}/.."; pwd)
source "$bin_dir/env.sh"
model_dir="$bin_dir/output"
log_file="$bin_dir/train.log"
pushd "$home_dir"
cfg=trainer_config.lr.py
paddle train \
--start_pserver=false \
--config=$cfg \
--save_dir=${model_dir} \
--trainer_count=4 \
--local=0 \
--log_period=100 \
--num_passes=15 \
--use_gpu=false \
--show_parameter_stats_period=100 \
--test_all_data_in_one_period=1 \
--num_gradient_servers=1 \
--nics=`get_nics` \
--port=7164 \
--ports_num=1 \
--pservers="127.0.0.1" \
--comment="paddle_trainer" \
2>&1 | tee "$log_file"
popd
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
function get_nics() {
machine=`uname -s`
local nics=""
if [ "$machine" == "Linux" ]; then
nics="lo"
elif [ "$machine" == "Darwin" ]; then
nics="lo0"
else
nics="unsupport"
fi
echo $nics
}
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
bin_dir=$(cd `dirname $0`; pwd)
source "$bin_dir/env.sh"
paddle pserver \
--nics=`get_nics` \
--port=7164 \
--ports_num=1 \
--ports_num_for_sparse=1 \
--num_gradient_servers=1 \
--comment="paddle_pserver" \
2>&1 | tee 'pserver.log'
This dataset consists of electronics product reviews associated with
binary labels (positive/negative) for sentiment classification.
The preprocessed data can be downloaded by script `get_data.sh`.
The data was derived from reviews_Electronics_5.json.gz at
http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
If you want to process the raw data, you can use the script `proc_from_raw_data/get_data.sh`.
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
DIR="$( cd "$(dirname "$0")" ; pwd -P )"
cd $DIR
# Download the preprocessed data
wget http://paddlepaddle.bj.bcebos.com/demo/quick_start_preprocessed_data/preprocessed_data.tar.gz
# Extract package
tar zxvf preprocessed_data.tar.gz
# Remove compressed package
rm preprocessed_data.tar.gz
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# 1. size of pos : neg = 1:1.
# 2. size of testing set = min(25k, len(all_data) * 0.1), others is traning set.
# 3. distinct train set and test set.
set -e
DIR="$( cd "$(dirname "$0")" ; pwd -P )"
cd $DIR
# Download data
echo "Downloading Amazon Electronics reviews data..."
# http://jmcauley.ucsd.edu/data/amazon/
wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
echo "Downloading mosesdecoder..."
# https://github.com/moses-smt/mosesdecoder
wget https://github.com/moses-smt/mosesdecoder/archive/master.zip
unzip master.zip
rm master.zip
##################
# Preprocess data
echo "Preprocess data..."
export LC_ALL=C
UNAME_STR=`uname`
if [ ${UNAME_STR} == 'Linux' ]; then
SHUF_PROG='shuf'
else
SHUF_PROG='gshuf'
fi
mkdir -p tmp
python preprocess.py -i reviews_Electronics_5.json.gz
# uniq and shuffle
cd tmp
echo 'Uniq and shuffle...'
cat pos_*|sort|uniq|${SHUF_PROG}> pos.shuffed
cat neg_*|sort|uniq|${SHUF_PROG}> neg.shuffed
min_len=`sed -n '$=' neg.shuffed`
test_num=$((min_len/10))
if [ $test_num -gt 12500 ];then
test_num=12500
fi
train_num=$((min_len-test_num))
head -n$train_num pos.shuffed >train.pos
head -n$train_num neg.shuffed >train.neg
tail -n$test_num pos.shuffed >test.pos
tail -n$test_num neg.shuffed >test.neg
cat train.pos train.neg | ${SHUF_PROG} >../train.txt
cat test.pos test.neg | ${SHUF_PROG} >../test.txt
cd -
echo 'train.txt' > train.list
echo 'test.txt' > test.list
# use 30k dict
rm -rf tmp
mv dict.txt dict_all.txt
cat dict_all.txt | head -n 30001 > dict.txt
echo 'Done.'
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
# -*- coding: UTF-8 -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
1. Tokenize the words and punctuation
2. pos sample : rating score 5; neg sample: rating score 1-2.
Usage:
python preprocess.py -i data_file [random seed]
"""
import sys
import os
import operator
import gzip
from subprocess import Popen, PIPE
from optparse import OptionParser
import json
from multiprocessing import Queue
from multiprocessing import Pool
import multiprocessing
batch_size = 5000
word_count = {}
num_tokenize = max(1,
multiprocessing.cpu_count() - 2) # parse + tokenize + save
max_queue_size = 8
parse_queue = Queue(maxsize=max_queue_size + num_tokenize)
tokenize_queue = Queue(maxsize=max_queue_size + num_tokenize)
def create_dict(data):
"""
Create dictionary based on data, and saved in data_dir/dict.txt.
The first line is unk \t -1.
data: list, input data by batch.
"""
for seq in data:
try:
for w in seq.lower().split():
if w not in word_count:
word_count[w] = 1
else:
word_count[w] += 1
except:
sys.stderr.write(seq + "\tERROR\n")
def parse(path):
"""
Open .gz file.
"""
sys.stderr.write(path)
g = gzip.open(path, 'r')
for l in g:
yield json.loads(l)
g.close()
def tokenize(sentences):
"""
Use tokenizer.perl to tokenize input sentences.
tokenizer.perl is tool of Moses.
sentences : a list of input sentences.
return: a list of processed text.
"""
dir = './mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
if not os.path.exists(dir):
sys.exit(
"The ./mosesdecoder-master/scripts/tokenizer/tokenizer.perl does not exists."
)
tokenizer_cmd = [dir, '-l', 'en', '-q', '-']
assert isinstance(sentences, list)
text = "\n".join(sentences)
tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
tok_text, _ = tokenizer.communicate(text)
toks = tok_text.split('\n')[:-1]
return toks
def save_data(instance, data_dir, pre_fix, batch_num):
"""
save data by batch
"""
label = ['1' if pre_fix == 'pos' else '0' for i in range(len(instance))]
lines = ['%s\t%s' % (label[i], instance[i]) for i in range(len(label))]
file_name = os.path.join(data_dir, "%s_%s.txt" % (pre_fix, batch_num))
file(file_name, 'w').write('\n'.join(lines) + '\n')
def tokenize_batch(id):
"""
tokenize data by batch
"""
while True:
num_batch, instance, pre_fix = parse_queue.get()
if num_batch == -1: ### parse_queue finished
tokenize_queue.put((-1, None, None))
sys.stderr.write("Thread %s finish\n" % (id))
break
tokenize_instance = tokenize(instance)
tokenize_queue.put((num_batch, tokenize_instance, pre_fix))
sys.stderr.write('.')
def save_batch(data_dir, num_tokenize, data_dir_dict):
"""
save data by batch
build dict.txt
"""
token_count = 0
while True:
num_batch, instance, pre_fix = tokenize_queue.get()
if num_batch == -1:
token_count += 1
if token_count == num_tokenize: #### tokenize finished.
break
else:
continue
save_data(instance, data_dir, pre_fix, num_batch)
create_dict(instance) ## update dict
sys.stderr.write("save file finish\n")
f = open(data_dir_dict, 'w')
f.write('%s\t%s\n' % ('unk', '-1'))
for k, v in sorted(word_count.items(), key=operator.itemgetter(1), \
reverse=True):
f.write('%s\t%s\n' % (k, v))
f.close()
sys.stderr.write("build dict finish\n")
def parse_batch(data, num_tokenize):
"""
parse data by batch
parse -> tokenize -> save
"""
raw_txt = parse(data)
neg, pos = [], []
count = 0
sys.stderr.write("extract raw data\n")
for l in raw_txt:
rating = l["overall"]
text = l["reviewText"].lower() # # convert words to lower case
if rating == 5.0 and text:
pos.append(text)
if rating < 3.0 and text:
neg.append(text)
if len(pos) == batch_size or len(neg) == batch_size:
if len(pos) == batch_size:
batch = pos
pre_fix = 'pos'
else:
batch = neg
pre_fix = 'neg'
parse_queue.put((count, batch, pre_fix))
count += 1
if pre_fix == 'pos':
pos = []
else:
neg = []
if len(pos) > 0:
parse_queue.put((count, pos, 'pos'))
count += 1
if len(neg) > 0:
parse_queue.put((count, neg, 'neg'))
count += 1
for i in range(num_tokenize):
parse_queue.put((-1, None, None)) #### for tokenize's input finished
sys.stderr.write("parsing finish\n")
def option_parser():
parser = OptionParser(usage="usage: python preprcoess.py "\
"-i data_path [options]")
parser.add_option(
"-i", "--data", action="store", dest="input", help="Input data path.")
parser.add_option(
"-s",
"--seed",
action="store",
dest="seed",
default=1024,
help="Set random seed.")
return parser.parse_args()
def main():
reload(sys)
sys.setdefaultencoding('utf-8')
options, args = option_parser()
data = options.input
seed = options.seed
data_dir_dict = os.path.join(os.path.dirname(data), 'dict.txt')
data_dir = os.path.join(os.path.dirname(data), 'tmp')
pool = Pool(processes=num_tokenize + 2)
pool.apply_async(parse_batch, args=(data, num_tokenize))
for i in range(num_tokenize):
pool.apply_async(tokenize_batch, args=(str(i), ))
pool.apply_async(save_batch, args=(data_dir, num_tokenize, data_dir_dict))
pool.close()
pool.join()
file(os.path.join(os.path.dirname(data), 'labels.list'),
'w').write('neg\t0\npos\t1\n')
if __name__ == '__main__':
main()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer.PyDataProvider2 import *
# id of the word not in dictionary
UNK_IDX = 0
# initializer is called by the framework during initialization.
# It allows the user to describe the data types and setup the
# necessary data structure for later use.
# `settings` is an object. initializer need to properly fill settings.input_types.
# initializer can also store other data structures needed to be used at process().
# In this example, dictionary is stored in settings.
# `dictionay` and `kwargs` are arguments passed from trainer_config.lr.py
def initializer(settings, dictionary, **kwargs):
# Put the word dictionary into settings
settings.word_dict = dictionary
# setting.input_types specifies what the data types the data provider
# generates.
settings.input_types = {
# The first input is a sparse_binary_vector,
# which means each dimension of the vector is either 0 or 1. It is the
# bag-of-words (BOW) representation of the texts.
'word': sparse_binary_vector(len(dictionary)),
# The second input is an integer. It represents the category id of the
# sample. 2 means there are two labels in the dataset.
# (1 for positive and 0 for negative)
'label': integer_value(2)
}
# Delaring a data provider. It has an initializer 'data_initialzer'.
# It will cache the generated data of the first pass in memory, so that
# during later pass, no on-the-fly data generation will be needed.
# `setting` is the same object used by initializer()
# `file_name` is the name of a file listed train_list or test_list file given
# to define_py_data_sources2(). See trainer_config.lr.py.
@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, file_name):
# Open the input data file.
with open(file_name, 'r') as f:
# Read each line.
for line in f:
# Each line contains the label and text of the comment, separated by \t.
label, comment = line.strip().split('\t')
# Split the words into a list.
words = comment.split()
# convert the words into a list of ids by looking them up in word_dict.
word_vector = [settings.word_dict.get(w, UNK_IDX) for w in words]
# Return the features for the current comment. The first is a list
# of ids representing a 0-1 binary sparse vector of the text,
# the second is the integer id of the label.
yield {'word': word_vector, 'label': int(label)}
def predict_initializer(settings, dictionary, **kwargs):
settings.word_dict = dictionary
settings.input_types = {'word': sparse_binary_vector(len(dictionary))}
# Declaring a data provider for prediction. The difference with process
# is that label is not generated.
@provider(init_hook=predict_initializer, should_shuffle=False)
def process_predict(settings, file_name):
with open(file_name, 'r') as f:
for line in f:
comment = line.strip().split()
word_vector = [settings.word_dict.get(w, UNK_IDX) for w in comment]
yield {'word': word_vector}
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer.PyDataProvider2 import *
UNK_IDX = 0
def initializer(settings, dictionary, **kwargs):
settings.word_dict = dictionary
settings.input_types = {
# Define the type of the first input as sequence of integer.
# The value of the integers range from 0 to len(dictrionary)-1
'word': integer_value_sequence(len(dictionary)),
# Define the second input for label id
'label': integer_value(2)
}
@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, file_name):
with open(file_name, 'r') as f:
for line in f:
label, comment = line.strip().split('\t')
words = comment.split()
word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
yield {'word': word_slot, 'label': int(label)}
def predict_initializer(settings, dictionary, **kwargs):
settings.word_dict = dictionary
settings.input_types = {'word': integer_value_sequence(len(dictionary))}
@provider(init_hook=predict_initializer, should_shuffle=False)
def process_predict(settings, file_name):
with open(file_name, 'r') as f:
for line in f:
comment = line.strip().split()
word_slot = [settings.word_dict.get(w, UNK_IDX) for w in comment]
yield {'word': word_slot}
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
cfg=trainer_config.lr.py
#cfg=trainer_config.emb.py
#cfg=trainer_config.cnn.py
#cfg=trainer_config.lstm.py
model="output/pass-00003"
paddle train \
--config=$cfg \
--use_gpu=false \
--job=test \
--init_model_path=$model \
--config_args=is_predict=1 \
--predict_output_dir=. \
2>&1 | tee 'predict.log'
paddle usage -l 'predict.log' -e $? -n "quick_start_predict_${cfg}" >/dev/null 2>&1
mv rank-00000 result.txt
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
cfg=trainer_config.lr.py
#cfg=trainer_config.emb.py
#cfg=trainer_config.cnn.py
#cfg=trainer_config.lstm.py
#cfg=trainer_config.bidi-lstm.py
#cfg=trainer_config.db-lstm.py
#cfg=trainer_config.resnet-lstm.py
paddle train \
--config=$cfg \
--save_dir=./output \
--trainer_count=4 \
--log_period=100 \
--num_passes=15 \
--use_gpu=false \
--show_parameter_stats_period=100 \
--test_all_data_in_one_period=1 \
2>&1 | tee 'train.log'
paddle usage -l "train.log" -e $? -n "quick_start_${cfg}" >/dev/null 2>&1
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
# edit-mode: -*- python -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
dict_file = "./data/dict.txt"
word_dict = dict()
with open(dict_file, 'r') as f:
for i, line in enumerate(f):
w = line.strip().split()[0]
word_dict[w] = i
is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(
train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1
settings(
batch_size=batch_size,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25)
bias_attr = ParamAttr(initial_std=0., l2_rate=0.)
data = data_layer(name="word", size=len(word_dict))
emb = embedding_layer(input=data, size=128)
bi_lstm = bidirectional_lstm(input=emb, size=128)
dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5)
output = fc_layer(
input=dropout, size=2, bias_attr=bias_attr, act=SoftmaxActivation())
if is_predict:
maxid = maxid_layer(output)
outputs([maxid, output])
else:
label = data_layer(name="label", size=2)
cls = classification_cost(input=output, label=label)
outputs(cls)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
# edit-mode: -*- python -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
dict_file = "./data/dict.txt"
word_dict = dict()
with open(dict_file, 'r') as f:
for i, line in enumerate(f):
w = line.strip().split()[0]
word_dict[w] = i
is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(
train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1
settings(
batch_size=batch_size,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25)
data = data_layer(name="word", size=len(word_dict))
embedding = embedding_layer(input=data, size=128)
conv = sequence_conv_pool(input=embedding, context_len=3, hidden_size=512)
output = fc_layer(input=conv, size=2, act=SoftmaxActivation())
if is_predict:
maxid = maxid_layer(output)
outputs([maxid, output])
else:
label = data_layer(name="label", size=2)
cls = classification_cost(input=output, label=label)
outputs(cls)
# edit-mode: -*- python -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
dict_file = "./data/dict.txt"
word_dict = dict()
with open(dict_file, 'r') as f:
for i, line in enumerate(f):
w = line.strip().split()[0]
word_dict[w] = i
is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(
train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1
settings(
batch_size=batch_size,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25)
bias_attr = ParamAttr(initial_std=0., l2_rate=0.)
data = data_layer(name="word", size=len(word_dict))
emb = embedding_layer(input=data, size=128)
hidden_0 = mixed_layer(size=128, input=[full_matrix_projection(input=emb)])
lstm_0 = lstmemory(input=hidden_0, layer_attr=ExtraAttr(drop_rate=0.1))
input_layers = [hidden_0, lstm_0]
for i in range(1, 8):
fc = fc_layer(input=input_layers, size=128)
lstm = lstmemory(
input=fc,
layer_attr=ExtraAttr(drop_rate=0.1),
reverse=(i % 2) == 1, )
input_layers = [fc, lstm]
lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling())
output = fc_layer(
input=lstm_last, size=2, bias_attr=bias_attr, act=SoftmaxActivation())
if is_predict:
maxid = maxid_layer(output)
outputs([maxid, output])
else:
label = data_layer(name="label", size=2)
cls = classification_cost(input=output, label=label)
outputs(cls)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
# edit-mode: -*- python -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
dict_file = "./data/dict.txt"
word_dict = dict()
with open(dict_file, 'r') as f:
for i, line in enumerate(f):
w = line.strip().split()[0]
word_dict[w] = i
is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(
train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1
settings(
batch_size=batch_size, learning_rate=2e-3, learning_method=AdamOptimizer())
data = data_layer(name="word", size=len(word_dict))
embedding = embedding_layer(input=data, size=128)
avg = pooling_layer(input=embedding, pooling_type=AvgPooling())
output = fc_layer(input=avg, size=2, act=SoftmaxActivation())
if is_predict:
maxid = maxid_layer(output)
outputs([maxid, output])
else:
label = data_layer(name="label", size=2)
cls = classification_cost(input=output, label=label)
outputs(cls)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
# edit-mode: -*- python -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
dict_file = get_config_arg('dict_file', str, "./data/dict.txt")
word_dict = dict()
with open(dict_file, 'r') as f:
for i, line in enumerate(f):
w = line.strip().split()[0]
word_dict[w] = i
is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict'
# define the data sources for the model.
# We need to use different process for training and prediction.
# For training, the input data includes both word IDs and labels.
# For prediction, the input data only includs word Ids.
define_py_data_sources2(
train_list=trn,
test_list=tst,
module="dataprovider_bow",
obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1
settings(
batch_size=batch_size,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25)
# Define the data for text features. The size of the data layer is the number
# of words in the dictionary.
data = data_layer(name="word", size=len(word_dict))
# Define a fully connected layer with logistic activation.
# (also called softmax activation).
output = fc_layer(input=data, size=2, act=SoftmaxActivation())
if not is_predict:
# For training, we need label and cost
# define the category id for each example.
# The size of the data layer is the number of labels.
label = data_layer(name="label", size=2)
# Define cross-entropy classification loss and error.
cls = classification_cost(input=output, label=label)
outputs(cls)
else:
# For prediction, no label is needed. We need to output
# We need to output classification result, and class probabilities.
maxid = maxid_layer(output)
outputs([maxid, output])
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
# edit-mode: -*- python -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
dict_file = "./data/dict.txt"
word_dict = dict()
with open(dict_file, 'r') as f:
for i, line in enumerate(f):
w = line.strip().split()[0]
word_dict[w] = i
is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(
train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1
settings(
batch_size=batch_size,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25)
data = data_layer(name="word", size=len(word_dict))
emb = embedding_layer(input=data, size=128)
lstm = simple_lstm(
input=emb, size=128, lstm_cell_attr=ExtraAttr(drop_rate=0.25))
lstm_max = pooling_layer(input=lstm, pooling_type=MaxPooling())
output = fc_layer(input=lstm_max, size=2, act=SoftmaxActivation())
if is_predict:
maxid = maxid_layer(output)
outputs([maxid, output])
else:
label = data_layer(name="label", size=2)
cls = classification_cost(input=output, label=label)
outputs(cls)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
# edit-mode: -*- python -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This configuration is a demonstration of how to implement the stacked LSTM
with residual connections, i.e. an LSTM layer takes the sum of the hidden states
and inputs of the previous LSTM layer instead of only the hidden states.
This architecture is from:
Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V. Le, Mohammad Norouzi,
Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey,
Jeff Klingner, Apurva Shah, Melvin Johnson, Xiaobing Liu, Lukasz Kaiser,
Stephan Gouws, Yoshikiyo Kato, Taku Kudo, Hideto Kazawa, Keith Stevens,
George Kurian, Nishant Patil, Wei Wang, Cliff Young, Jason Smith, Jason Riesa,
Alex Rudnick, Oriol Vinyals, Greg Corrado, Macduff Hughes, Jeffrey Dean. 2016.
Google's Neural Machine Translation System: Bridging the Gap between Human and
Machine Translation. In arXiv https://arxiv.org/pdf/1609.08144v2.pdf
Different from the architecture described in the paper, we use a stack single
direction LSTM layers as the first layer instead of bi-directional LSTM. Also,
since this is a demo code, to reduce computation time, we stacked 4 layers
instead of 8 layers.
"""
from paddle.trainer_config_helpers import *
dict_file = "./data/dict.txt"
word_dict = dict()
with open(dict_file, 'r') as f:
for i, line in enumerate(f):
w = line.strip().split()[0]
word_dict[w] = i
is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(
train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1
settings(
batch_size=batch_size,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25)
bias_attr = ParamAttr(initial_std=0., l2_rate=0.)
data = data_layer(name="word", size=len(word_dict))
emb = embedding_layer(input=data, size=128)
lstm = simple_lstm(input=emb, size=128, lstm_cell_attr=ExtraAttr(drop_rate=0.1))
previous_input, previous_hidden_state = emb, lstm
for i in range(3):
# The input to the current layer is the sum of the hidden state
# and input of the previous layer.
current_input = addto_layer(input=[previous_input, previous_hidden_state])
hidden_state = simple_lstm(
input=current_input, size=128, lstm_cell_attr=ExtraAttr(drop_rate=0.1))
previous_input, previous_hidden_state = current_input, hidden_state
lstm = previous_hidden_state
lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling())
output = fc_layer(
input=lstm_last, size=2, bias_attr=bias_attr, act=SoftmaxActivation())
if is_predict:
maxid = maxid_layer(output)
outputs([maxid, output])
else:
label = data_layer(name="label", size=2)
cls = classification_cost(input=output, label=label)
outputs(cls)
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
DIR="$( cd "$(dirname "$0")" ; pwd -P )"
cd $DIR
wget http://www.cnts.ua.ac.be/conll2000/chunking/train.txt.gz
wget http://www.cnts.ua.ac.be/conll2000/chunking/test.txt.gz
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer.PyDataProvider2 import *
import gzip
import logging
logging.basicConfig(
format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s', )
logger = logging.getLogger('paddle')
logger.setLevel(logging.INFO)
OOV_POLICY_IGNORE = 0
OOV_POLICY_USE = 1
OOV_POLICY_ERROR = 2
num_original_columns = 3
# Feature combination patterns.
# [[-1,0], [0,0]] means previous token at column 0 and current token at
# column 0 are combined as one feature.
patterns = [
[[-2, 0]],
[[-1, 0]],
[[0, 0]],
[[1, 0]],
[[2, 0]],
[[-1, 0], [0, 0]],
[[0, 0], [1, 0]],
[[-2, 1]],
[[-1, 1]],
[[0, 1]],
[[1, 1]],
[[2, 1]],
[[-2, 1], [-1, 1]],
[[-1, 1], [0, 1]],
[[0, 1], [1, 1]],
[[1, 1], [2, 1]],
[[-2, 1], [-1, 1], [0, 1]],
[[-1, 1], [0, 1], [1, 1]],
[[0, 1], [1, 1], [2, 1]],
]
dict_label = {
'B-ADJP': 0,
'I-ADJP': 1,
'B-ADVP': 2,
'I-ADVP': 3,
'B-CONJP': 4,
'I-CONJP': 5,
'B-INTJ': 6,
'I-INTJ': 7,
'B-LST': 8,
'I-LST': 9,
'B-NP': 10,
'I-NP': 11,
'B-PP': 12,
'I-PP': 13,
'B-PRT': 14,
'I-PRT': 15,
'B-SBAR': 16,
'I-SBAR': 17,
'B-UCP': 18,
'I-UCP': 19,
'B-VP': 20,
'I-VP': 21,
'O': 22
}
def make_features(sequence):
length = len(sequence)
num_features = len(sequence[0])
def get_features(pos):
if pos < 0:
return ['#B%s' % -pos] * num_features
if pos >= length:
return ['#E%s' % (pos - length + 1)] * num_features
return sequence[pos]
for i in xrange(length):
for pattern in patterns:
fname = '/'.join([get_features(i + pos)[f] for pos, f in pattern])
sequence[i].append(fname)
'''
Source file format:
Each line is for one timestep. The features are separated by space.
An empty line indicates end of a sequence.
cutoff: a list of numbers. If count of a feature is smaller than this,
it will be ignored.
if oov_policy[i] is OOV_POLICY_USE, id 0 is reserved for OOV features of
i-th column.
return a list of dict for each column
'''
def create_dictionaries(filename, cutoff, oov_policy):
def add_to_dict(sequence, dicts):
num_features = len(dicts)
for features in sequence:
l = len(features)
assert l == num_features, "Wrong number of features " + line
for i in xrange(l):
if features[i] in dicts[i]:
dicts[i][features[i]] += 1
else:
dicts[i][features[i]] = 1
num_features = len(cutoff)
dicts = []
for i in xrange(num_features):
dicts.append(dict())
f = gzip.open(filename, 'rb')
sequence = []
for line in f:
line = line.strip()
if not line:
make_features(sequence)
add_to_dict(sequence, dicts)
sequence = []
continue
features = line.split(' ')
sequence.append(features)
for i in xrange(num_features):
dct = dicts[i]
n = 1 if oov_policy[i] == OOV_POLICY_USE else 0
todo = []
for k, v in dct.iteritems():
if v < cutoff[i]:
todo.append(k)
else:
dct[k] = n
n += 1
if oov_policy[i] == OOV_POLICY_USE:
# placeholder so that len(dct) will be the number of features
# including OOV
dct['#OOV#'] = 0
logger.info('column %d dict size=%d, ignored %d' % (i, n, len(todo)))
for k in todo:
del dct[k]
f.close()
return dicts
def initializer(settings, **xargs):
cutoff = [3, 1, 0]
cutoff += [3] * len(patterns)
oov_policy = [OOV_POLICY_IGNORE, OOV_POLICY_ERROR, OOV_POLICY_ERROR]
oov_policy += [OOV_POLICY_IGNORE] * len(patterns)
dicts = create_dictionaries('data/train.txt.gz', cutoff, oov_policy)
dicts[2] = dict_label
settings.dicts = dicts
settings.oov_policy = oov_policy
input_types = []
num_features = len(dicts)
for i in xrange(num_original_columns):
input_types.append(integer_sequence(len(dicts[i])))
logger.info("slot %s size=%s" % (i, len(dicts[i])))
if patterns:
dim = 0
for i in xrange(num_original_columns, num_features):
dim += len(dicts[i])
input_types.append(sparse_binary_vector_sequence(dim))
logger.info("feature size=%s" % dim)
settings.input_types = input_types
'''
if oov_policy[i] == OOV_POLICY_USE, features in i-th column which are not
existed in dicts[i] will be assigned to id 0.
if oov_policy[i] == OOV_POLICY_ERROR, all features in i-th column MUST exist
in dicts[i].
'''
@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, filename):
input_file = filename
dicts = settings.dicts
oov_policy = settings.oov_policy
def gen_sample(sequence):
num_features = len(dicts)
sample = [list() for i in xrange(num_original_columns)]
if patterns:
sample.append([])
for features in sequence:
assert len(features) == num_features, \
"Wrong number of features: " + line
for i in xrange(num_original_columns):
id = dicts[i].get(features[i], -1)
if id != -1:
sample[i].append(id)
elif oov_policy[i] == OOV_POLICY_IGNORE:
sample[i].append(0xffffffff)
elif oov_policy[i] == OOV_POLICY_ERROR:
logger.fatal("Unknown token: %s" % features[i])
else:
sample[i].append(0)
if patterns:
dim = 0
vec = []
for i in xrange(num_original_columns, num_features):
id = dicts[i].get(features[i], -1)
if id != -1:
vec.append(dim + id)
elif oov_policy[i] == OOV_POLICY_IGNORE:
pass
elif oov_policy[i] == OOV_POLICY_ERROR:
logger.fatal("Unknown token: %s" % features[i])
else:
vec.ids.append(dim + 0)
dim += len(dicts[i])
sample[-1].append(vec)
return sample
num_features = len(dicts)
f = gzip.open(input_file, 'rb')
num_sequences = 0
sequence = []
for line in f:
line = line.strip()
if not line:
make_features(sequence)
yield gen_sample(sequence)
sequence = []
num_sequences += 1
continue
features = line.split(' ')
sequence.append(features)
f.close()
logger.info("num_sequences=%s" % num_sequences)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
import math
define_py_data_sources2(
train_list="data/train.list",
test_list="data/test.list",
module="dataprovider",
obj="process")
batch_size = 1
settings(
learning_method=MomentumOptimizer(),
batch_size=batch_size,
regularization=L2Regularization(batch_size * 1e-4),
model_average=ModelAverage(0.5),
learning_rate=1e-1,
learning_rate_decay_a=1e-5,
learning_rate_decay_b=0.25, )
num_label_types = 23
def get_simd_size(size):
return int(math.ceil(float(size) / 8)) * 8
# Currently, in order to use sparse_update=True,
# the size has to be aligned.
num_label_types = get_simd_size(num_label_types)
features = data_layer(name="features", size=76328)
word = data_layer(name="word", size=6778)
pos = data_layer(name="pos", size=44)
chunk = data_layer(name="chunk", size=num_label_types)
crf_input = fc_layer(
input=features,
size=num_label_types,
act=LinearActivation(),
bias_attr=False,
param_attr=ParamAttr(
initial_std=0, sparse_update=True))
crf = crf_layer(
input=crf_input,
label=chunk,
param_attr=ParamAttr(
name="crfw", initial_std=0), )
crf_decoding = crf_decoding_layer(
size=num_label_types,
input=crf_input,
label=chunk,
param_attr=ParamAttr(name="crfw"), )
sum_evaluator(
name="error",
input=crf_decoding, )
chunk_evaluator(
name="chunk_f1",
input=crf_decoding,
label=chunk,
chunk_scheme="IOB",
num_chunk_types=11, )
inputs(word, pos, chunk, features)
outputs(crf)
# Sequence Tagging
This demo is a sequence model for assigning tags to each token in a sentence. The task is described at <a href = "http://www.cnts.ua.ac.be/conll2000/chunking">CONLL2000 Text Chunking</a> task.
## Download data
```bash
cd demo/sequence_tagging
./data/get_data.sh
```
## Train model
```bash
cd demo/sequence_tagging
./train.sh
```
## Model description
We provide two models. One is a linear CRF model (linear_crf.py) with is equivalent to the one at <a href="http://leon.bottou.org/projects/sgd#stochastic_gradient_crfs">leon.bottou.org/projects/sgd</a>. The second one is a stacked bidirectional RNN and CRF model (rnn_crf.py).
<center>
<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
<thead>
<th scope="col" class="left">Model name</th>
<th scope="col" class="left">Number of parameters</th>
<th scope="col" class="left">F1 score</th>
</thead>
<tbody>
<tr>
<td class="left">linear_crf</td>
<td class="left"> 1.8M </td>
<td class="left"> 0.937</td>
</tr>
<tr>
<td class="left">rnn_crf</td>
<td class="left"> 960K </td>
<td class="left">0.941</td>
</tr>
</tbody>
</table>
</center>
<br>
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
import math
define_py_data_sources2(
train_list="data/train.list",
test_list="data/test.list",
module="dataprovider",
obj="process")
batch_size = 16
settings(
learning_method=MomentumOptimizer(),
batch_size=batch_size,
regularization=L2Regularization(batch_size * 1e-5),
model_average=ModelAverage(0.5),
learning_rate=2e-3,
learning_rate_decay_a=5e-7,
learning_rate_decay_b=0.5, )
word_dim = 128
hidden_dim = 128
with_rnn = True
initial_std = 1 / math.sqrt(hidden_dim)
param_attr = ParamAttr(initial_std=initial_std)
cpu_layer_attr = ExtraLayerAttribute(device=-1)
default_device(0)
num_label_types = 23
features = data_layer(name="features", size=76328)
word = data_layer(name="word", size=6778)
pos = data_layer(name="pos", size=44)
chunk = data_layer(
name="chunk", size=num_label_types, layer_attr=cpu_layer_attr)
emb = embedding_layer(
input=word, size=word_dim, param_attr=ParamAttr(initial_std=0))
hidden1 = mixed_layer(
size=hidden_dim,
act=STanhActivation(),
bias_attr=True,
input=[
full_matrix_projection(emb), table_projection(
pos, param_attr=param_attr)
])
if with_rnn:
rnn1 = recurrent_layer(
act=ReluActivation(),
bias_attr=True,
input=hidden1,
param_attr=ParamAttr(initial_std=0), )
hidden2 = mixed_layer(
size=hidden_dim,
act=STanhActivation(),
bias_attr=True,
input=[full_matrix_projection(hidden1)] +
([full_matrix_projection(
rnn1, param_attr=ParamAttr(initial_std=0))] if with_rnn else []), )
if with_rnn:
rnn2 = recurrent_layer(
reverse=True,
act=ReluActivation(),
bias_attr=True,
input=hidden2,
param_attr=ParamAttr(initial_std=0), )
crf_input = mixed_layer(
size=num_label_types,
bias_attr=False,
input=[full_matrix_projection(hidden2), ] +
([full_matrix_projection(
rnn2, param_attr=ParamAttr(initial_std=0))] if with_rnn else []), )
crf = crf_layer(
input=crf_input,
label=chunk,
param_attr=ParamAttr(
name="crfw", initial_std=0),
layer_attr=cpu_layer_attr, )
crf_decoding = crf_decoding_layer(
size=num_label_types,
input=crf_input,
label=chunk,
param_attr=ParamAttr(name="crfw"),
layer_attr=cpu_layer_attr, )
sum_evaluator(
name="error",
input=crf_decoding, )
chunk_evaluator(
name="chunk_f1",
input=crf_decoding,
label=chunk,
chunk_scheme="IOB",
num_chunk_types=11, )
inputs(word, pos, chunk, features)
outputs(crf)
#!/bin/bash
paddle train \
--config rnn_crf.py \
--parallel_nn=1 \
--use_gpu=1 \
--dot_period=10 \
--log_period=1000 \
--test_period=0 \
--num_passes=10 \
2>&1 | tee 'train.log'
paddle usage -l 'train.log' -e $? -n "sequence_tagging_train" >/dev/null 2>&1
#!/bin/bash
paddle train \
--config linear_crf.py \
--use_gpu=0 \
--dot_period=100 \
--log_period=10000 \
--test_period=0 \
--num_passes=10
2>&1 | tee 'train_linear.log'
paddle usage -l 'train_linear.log' -e $? -n "sequence_tagging_train_linear" >/dev/null 2>&1
run by:
cd ./data
sh get_data.sh
cd ..
sh train.sh
sh predict.sh
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
set -x
DIR="$( cd "$(dirname "$0")" ; pwd -P )"
cd $DIR
#download the dataset
echo "Downloading traffic data..."
wget http://paddlepaddle.cdn.bcebos.com/demo/traffic/traffic_data.tar.gz
#extract package
echo "Unzipping..."
tar -zxvf traffic_data.tar.gz
echo "data/speeds.csv" > train.list
echo "data/speeds.csv" > test.list
echo "data/speeds.csv" > pred.list
echo "Done."
# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer.PyDataProvider2 import *
import sys
import numpy as np
TERM_NUM = 24
FORECASTING_NUM = 24
LABEL_VALUE_NUM = 4
def initHook(settings, file_list, **kwargs):
"""
Init hook is invoked before process data. It will set obj.slots and store data meta.
:param settings: global object. It will passed to process routine.
:type obj: object
:param file_list: the meta file object, which passed from trainer_config.py,but unused in this function.
:param kwargs: unused other arguments.
"""
del kwargs #unused
settings.pool_size = sys.maxint
#Use a time seires of the past as feature.
#Dense_vector's expression form is [float,float,...,float]
settings.input_types = [dense_vector(TERM_NUM)]
#There are next FORECASTING_NUM fragments you need predict.
#Every predicted condition at time point has four states.
for i in range(FORECASTING_NUM):
settings.input_types.append(integer_value(LABEL_VALUE_NUM))
@provider(
init_hook=initHook, cache=CacheType.CACHE_PASS_IN_MEM, should_shuffle=True)
def process(settings, file_name):
with open(file_name) as f:
#abandon fields name
f.next()
for row_num, line in enumerate(f):
speeds = map(int, line.rstrip('\r\n').split(",")[1:])
# Get the max index.
end_time = len(speeds)
# Scanning and generating samples
for i in range(TERM_NUM, end_time - FORECASTING_NUM):
# For dense slot
pre_spd = map(float, speeds[i - TERM_NUM:i])
# Integer value need predicting, values start from 0, so every one minus 1.
fol_spd = [j - 1 for j in speeds[i:i + FORECASTING_NUM]]
# Predicting label is missing, abandon the sample.
if -1 in fol_spd:
continue
yield [pre_spd] + fol_spd
def predict_initHook(settings, file_list, **kwargs):
settings.pool_size = sys.maxint
settings.input_types = [dense_vector(TERM_NUM)]
@provider(init_hook=predict_initHook, should_shuffle=False)
def process_predict(settings, file_name):
with open(file_name) as f:
#abandon fields name
f.next()
for row_num, line in enumerate(f):
speeds = map(int, line.rstrip('\r\n').split(","))
end_time = len(speeds)
pre_spd = map(float, speeds[end_time - TERM_NUM:end_time])
yield pre_spd
# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
res = []
with open('./rank-00000') as f:
for line in f:
pred = map(int, line.strip('\r\n;').split(";"))
#raw prediction range from 0 to 3
res.append([i + 1 for i in pred])
file_name = open('./data/pred.list').read().strip('\r\n')
FORECASTING_NUM = 24
header = [
'id',
'201604200805',
'201604200810',
'201604200815',
'201604200820',
'201604200825',
'201604200830',
'201604200835',
'201604200840',
'201604200845',
'201604200850',
'201604200855',
'201604200900',
'201604200905',
'201604200910',
'201604200915',
'201604200920',
'201604200925',
'201604200930',
'201604200935',
'201604200940',
'201604200945',
'201604200950',
'201604200955',
'201604201000',
]
###################
## To CSV format ##
###################
with open(file_name) as f:
f.next()
print ','.join(header)
for row_num, line in enumerate(f):
fields = line.rstrip('\r\n').split(',')
linkid = fields[0]
print linkid + ',' + ','.join(map(str, res[row_num]))
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
cfg=trainer_config.py
# pass choice
model="output/pass-00000"
paddle train \
--config=$cfg \
--use_gpu=false \
--job=test \
--init_model_path=$model \
--config_args=is_predict=1 \
--predict_output_dir=.
python gen_result.py > result.csv
rm -rf rank-00000
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
cfg=trainer_config.py
paddle train \
--config=$cfg \
--save_dir=./output \
--trainer_count=4 \
--log_period=1000 \
--dot_period=10 \
--num_passes=10 \
--use_gpu=false \
--show_parameter_stats_period=3000 \
2>&1 | tee 'train.log'
# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
################################### DATA Configuration #############################################
is_predict = get_config_arg('is_predict', bool, False)
trn = './data/train.list' if not is_predict else None
tst = './data/test.list' if not is_predict else './data/pred.list'
process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(
train_list=trn, test_list=tst, module="dataprovider", obj=process)
################################### Parameter Configuaration #######################################
TERM_NUM = 24
FORECASTING_NUM = 24
emb_size = 16
batch_size = 128 if not is_predict else 1
settings(
batch_size=batch_size,
learning_rate=1e-3,
learning_method=RMSPropOptimizer())
################################### Algorithm Configuration ########################################
output_label = []
link_encode = data_layer(name='link_encode', size=TERM_NUM)
for i in xrange(FORECASTING_NUM):
# Each task share same weight.
link_param = ParamAttr(
name='_link_vec.w', initial_max=1.0, initial_min=-1.0)
link_vec = fc_layer(input=link_encode, size=emb_size, param_attr=link_param)
score = fc_layer(input=link_vec, size=4, act=SoftmaxActivation())
if is_predict:
maxid = maxid_layer(score)
output_label.append(maxid)
else:
# Multi-task training.
label = data_layer(name='label_%dmin' % ((i + 1) * 5), size=4)
cls = classification_cost(
input=score, name="cost_%dmin" % ((i + 1) * 5), label=label)
output_label.append(cls)
outputs(output_label)
#Variational Autoencoder (VAE)
This demo implements VAE training described in the original paper (https://arxiv.org/abs/1312.6114).
In order to run the model, first download the MNIST dataset by running the shell script in ./data.
Then you can run the command below. The flag --useGpu specifies whether to use gpu for training (0 is cpu, 1 is gpu).
$python vae_train.py [--use_gpu 1]
The generated images will be stored in ./samples/
The corresponding models will be stored in ./params/
#!/usr/bin/env sh
# This script downloads the mnist data and unzips it.
set -e
DIR="$( cd "$(dirname "$0")" ; pwd -P )"
rm -rf "$DIR/mnist_data"
mkdir "$DIR/mnist_data"
cd "$DIR/mnist_data"
echo "Downloading..."
for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte
do
if [ ! -e $fname ]; then
wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz
gunzip ${fname}.gz
fi
done
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
class MNISTloader():
def __init__(self,
data_path="./data/mnist_data/",
batch_size=60,
process='train'):
self.batch_size = batch_size
self.data_path = data_path
self._pointer = 0
self.image_batches = np.array([])
self.process = process
def _extract_images(self, filename, n):
f = open(filename, 'rb')
f.read(16)
data = np.fromfile(f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28))
#Mapping data into [-1, 1]
data = data / 255. * 2. - 1
data_batches = np.split(data, 60000 / self.batch_size, 0)
f.close()
return data_batches
@property
def pointer(self):
return self._pointer
def load_data(self):
TRAIN_IMAGES = '%s/train-images-idx3-ubyte' % self.data_path
TEST_IMAGES = '%s/t10k-images-idx3-ubyte' % self.data_path
if self.process == 'train':
self.image_batches = self._extract_images(TRAIN_IMAGES, 60000)
else:
self.image_batches = self._extract_images(TEST_IMAGES, 10000)
def next_batch(self):
batch = self.image_batches[self._pointer]
self._pointer = (self._pointer + 1) % (60000 / self.batch_size)
return np.array(batch)
def reset_pointer(self):
self._pointer = 0
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
import numpy as np
is_generating = get_config_arg("is_generating", bool, False)
settings(batch_size=32, learning_rate=1e-3, learning_method=AdamOptimizer())
X_dim = 28 * 28
h_dim = 128
z_dim = 100
def reparameterization(mu, logvar):
eps = ParamAttr(initial_mean=0., initial_std=1)
with mixed_layer() as sigma:
sigma += dotmul_projection(layer_math.exp(logvar) * 0.5, param_attr=eps)
return mu + sigma
def q_func(X):
"""
xavier initialization
"""
param_attr = ParamAttr(
name='share.w', initial_mean=0., initial_std=1. / np.sqrt(X_dim / 2.))
mu_param = ParamAttr(
name='mu.w', initial_mean=0., initial_std=1. / np.sqrt(h_dim / 2.))
logvar_param = ParamAttr(
name='logvar.w', initial_mean=0., initial_std=1. / np.sqrt(h_dim / 2.))
bias_attr = ParamAttr(name='share.bias', initial_mean=0., initial_std=0.)
mu_bias = ParamAttr(name='mu.bias', initial_mean=0., initial_std=0.)
logvar_bias = ParamAttr(name='logvar.bias', initial_mean=0., initial_std=0.)
share_layer = fc_layer(
X,
size=h_dim,
param_attr=param_attr,
bias_attr=bias_attr,
act=ReluActivation())
return (fc_layer(
share_layer,
size=z_dim,
param_attr=mu_param,
bias_attr=mu_bias,
act=LinearActivation()), fc_layer(
share_layer,
size=z_dim,
param_attr=logvar_param,
bias_attr=logvar_bias,
act=LinearActivation()))
def generator(z):
hidden_param = ParamAttr(
name='hidden.w', initial_mean=0., initial_std=1. / np.sqrt(z_dim / 2.))
hidden_bias = ParamAttr(name='hidden.bias', initial_mean=0., initial_std=0.)
prob_param = ParamAttr(
name='prob.w', initial_mean=0., initial_std=1. / np.sqrt(h_dim / 2.))
prob_bias = ParamAttr(name='prob.bias', initial_mean=0., initial_std=0.)
hidden_layer = fc_layer(
z,
size=h_dim,
act=ReluActivation(),
param_attr=hidden_param,
bias_attr=hidden_bias)
prob = fc_layer(
hidden_layer,
size=X_dim,
act=SigmoidActivation(),
param_attr=prob_param,
bias_attr=prob_bias)
return prob
def reconstruct_error(prob, X):
cost = multi_binary_label_cross_entropy(input=prob, label=X)
return cost
def KL_loss(mu, logvar):
with mixed_layer() as mu_square:
mu_square += dotmul_operator(mu, mu, scale=1.)
cost = 0.5 * sum_cost(layer_math.exp(logvar) + mu_square - 1. - logvar)
return cost
if not is_generating:
x_batch = data_layer(name='x_batch', size=X_dim)
mu, logvar = q_func(x_batch)
z_samples = reparameterization(mu, logvar)
prob = generator(z_samples)
outputs(reconstruct_error(prob, x_batch) + KL_loss(mu, logvar))
else:
z_samples = data_layer(name='noise', size=z_dim)
outputs(generator(z_samples))
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import random
import numpy as np
import cPickle
import sys, os
from PIL import Image
from paddle.trainer.config_parser import parse_config
from paddle.trainer.config_parser import logger
import py_paddle.swig_paddle as api
import dataloader
import matplotlib.pyplot as plt
def plot_samples(samples):
fig = plt.figure(figsize=(4, 4))
gs = gridspec.GridSpec(4, 4)
gs.update(wspace=0.05, hspace=0.05)
for i, sample in enumerate(samples):
plt.subplot(gs[i])
plt.axis('off')
plt.imshow(sample.reshape(28, 28), cmap='Greys_r')
return fig
def CHECK_EQ(a, b):
assert a == b, "a=%s, b=%s" % (a, b)
def get_fake_samples(generator_machine, batch_size, noise):
gen_inputs = api.Arguments.createArguments(1)
gen_inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(noise))
gen_outputs = api.Arguments.createArguments(0)
generator_machine.forward(gen_inputs, gen_outputs, api.PASS_TEST)
fake_samples = gen_outputs.getSlotValue(0).copyToNumpyMat()
return fake_samples
def copy_shared_parameters(src, dst):
'''
copy the parameters from src to dst
:param src: the source of the parameters
:type src: GradientMachine
:param dst: the destination of the parameters
:type dst: GradientMachine
'''
src_params = [src.getParameter(i) for i in xrange(src.getParameterSize())]
src_params = dict([(p.getName(), p) for p in src_params])
for i in xrange(dst.getParameterSize()):
dst_param = dst.getParameter(i)
src_param = src_params.get(dst_param.getName(), None)
if src_param is None:
continue
src_value = src_param.getBuf(api.PARAMETER_VALUE)
dst_value = dst_param.getBuf(api.PARAMETER_VALUE)
CHECK_EQ(len(src_value), len(dst_value))
dst_value.copyFrom(src_value)
dst_param.setValueUpdated()
def find(iterable, cond):
for item in iterable:
if cond(item):
return item
return None
def get_layer_size(model_conf, layer_name):
layer_conf = find(model_conf.layers, lambda x: x.name == layer_name)
assert layer_conf is not None, "Cannot find '%s' layer" % layer_name
return layer_conf.size
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--use_gpu", default="1", help="1 means use gpu for training")
parser.add_argument("--gpu_id", default="0", help="the gpu_id parameter")
args = parser.parse_args()
use_gpu = args.use_gpu
assert use_gpu in ["0", "1"]
if not os.path.exists("./samples/"):
os.makedirs("./samples/")
if not os.path.exists("./params/"):
os.makedirs("./params/")
api.initPaddle('--use_gpu=' + use_gpu, '--dot_period=10',
'--log_period=1000', '--gpu_id=' + args.gpu_id,
'--save_dir=' + "./params/")
conf = "vae_conf.py"
trainer_conf = parse_config(conf, "is_generating=False")
gener_conf = parse_config(conf, "is_generating=True")
batch_size = trainer_conf.opt_config.batch_size
noise_dim = get_layer_size(gener_conf.model_config, "noise")
mnist = dataloader.MNISTloader(batch_size=batch_size)
mnist.load_data()
training_machine = api.GradientMachine.createFromConfigProto(
trainer_conf.model_config)
generator_machine = api.GradientMachine.createFromConfigProto(
gener_conf.model_config)
trainer = api.Trainer.create(trainer_conf, training_machine)
trainer.startTrain()
for train_pass in xrange(100):
trainer.startTrainPass()
mnist.reset_pointer()
i = 0
it = 0
while mnist.pointer != 0 or i == 0:
X = mnist.next_batch().astype('float32')
inputs = api.Arguments.createArguments(1)
inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(X))
trainer.trainOneDataBatch(batch_size, inputs)
if it % 1000 == 0:
outputs = api.Arguments.createArguments(0)
training_machine.forward(inputs, outputs, api.PASS_TEST)
loss = np.mean(outputs.getSlotValue(0).copyToNumpyMat())
print "\niter: {}".format(str(it).zfill(3))
print "VAE loss: {}".format(str(loss).zfill(3))
#Sync parameters between networks (GradientMachine) at the beginning
copy_shared_parameters(training_machine, generator_machine)
z_samples = np.random.randn(batch_size,
noise_dim).astype('float32')
samples = get_fake_samples(generator_machine, batch_size,
z_samples)
#Generating the first 16 images for a picture.
figure = plot_samples(samples[:16])
plt.savefig(
"./samples/{}_{}.png".format(
str(train_pass).zfill(3), str(i).zfill(3)),
bbox_inches='tight')
plt.close(figure)
i += 1
it += 1
trainer.finishTrainPass()
trainer.finishTrain()
if __name__ == '__main__':
main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册