diff --git a/cmake/generic.cmake b/cmake/generic.cmake index c311783aa3187678c31c27ddbbd074790ca444f3..b9c1dde97bc444d793d67ff622fd6b13c6435a9a 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -93,7 +93,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR}) if(NOT APPLE AND NOT ANDROID) find_package(Threads REQUIRED) link_libraries(${CMAKE_THREAD_LIBS_INIT}) - set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -ldl -lrt") + set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt") endif(NOT APPLE AND NOT ANDROID) function(merge_static_libs TARGET_NAME) diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index d4e9d53e5c0955912a594fe8cd9cd41a4080a2d2..203506d7ab84e5a5be2232b077eac2d433a99766 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -82,6 +82,11 @@ maxout .. autoclass:: paddle.v2.layer.maxout :noindex: +roi_pool +-------- +.. autoclass:: paddle.v2.layer.roi_pool + :noindex: + Norm Layer ========== diff --git a/doc/design/mkldnn/README.MD b/doc/design/mkldnn/README.MD index fe8da907d9d45a2164031430ac5b7a3d5523967a..16236763a73770f3fe5eadf67645765d0456f875 100644 --- a/doc/design/mkldnn/README.MD +++ b/doc/design/mkldnn/README.MD @@ -15,6 +15,7 @@ - [CMake](#cmake) - [Layers](#layers) - [Activations](#activations) + - [Weights](#weights) - [Unit Tests](#unit-tests) - [Protobuf Messages](#protobuf-messages) - [Python API](#python-api) @@ -45,17 +46,23 @@ Figure 1. PaddlePaddle on IA. ### Layers 所有MKL-DNN相关的C++ layers,都会按照PaddlePaddle的目录结构存放在 -`paddle/gserver/layers`中,并且文件名都会一以*Mkldnn*开头。 +`paddle/gserver/layers`中,并且文件名都会一以*MKLDNN*开头。 -所有MKL-DNN的layers都会继承于一个叫做`MkldnnLayer`的父类,该父类继承于PaddlePaddle的基类`Layer`。 +所有MKL-DNN的layers都会继承于一个叫做`MKLDNNLayer`的父类,该父类继承于PaddlePaddle的基类`Layer`。 + +在`MKLDNNLayer`中会提供一些必要的接口和函数,并且会写好`forward`和`backward`的基本逻辑。部分函数定义为纯虚函数,子类只需要实现这些函数即可。 ### Activations -由于在PaddlePaddle中,激活函数是独立于layer概念的,所以会在`paddle/gserver/activations`目录下添加一个`MkldnnActivation.h`文件定义一些用于MKL-DNN的接口,实现方法还是会在`ActivationFunction.cpp`文件。 +由于在PaddlePaddle中,激活函数是独立于layer概念的,所以会在`paddle/gserver/activations`目录下添加`MKLDNNActivation.h`和`MKLDNNActivation.cpp`文件用于定义和使用MKL-DNN的接口。 -### Unit Tests -会在`paddle/gserver/test`目录下添加`test_Mkldnn.cpp`和`MkldnnTester.*`用于MKL-DNN的测试。 +### Weights +由于有些layer是含有参数的,我们会尽量让MKL-DNN的参数与PaddlePaddle中`parameter`共享一块内存。 +同时,由于MKL-DNN在训练时使用的参数layout可能与PaddlePaddle默认的`nchw`不一致,我们会在网络训练的开始和结束时分别转换这个layout,使得最终保存的参数格式与PaddlePaddle一致。 -Activation的测试,计划在PaddlePaddle原有的测试文件上直接添加新的测试type。 +### Unit Tests +会在`paddle/gserver/test`目录下添加`test_MKLDNN.cpp`和`MKLDNNTester.*`用于MKL-DNN的测试。 +测试分为每个layer(或activation)的单元测试和简单网络的整体测试。 +每个测试会对比PaddlePaddle中CPU算出的结果与MKL-DNN的结果,小于某个比较小的阈值认为通过。 ### Protobuf Messages 根据具体layer的需求可能会在`proto/ModelConfig.proto`里面添加必要的选项。 @@ -82,7 +89,7 @@ if use_mkldnn 会在`v1_api_demo`目录下添加一个`mkldnn`的文件夹,里面放入一些用于MKL-DNN测试的demo脚本。 ### Benchmarking -会考虑添加部分逻辑在`benchmark/paddle/image/run.sh`,添加使用MKL-DNN的测试。 +会添加`benchmark/paddle/image/run_mkldnn.sh`,用于测试使用MKL-DNN之后的性能。 ### Others 1. 如果在使用MKL-DNN的情况下,会把CPU的Buffer对齐为64。 @@ -94,14 +101,16 @@ if use_mkldnn 我们总结出一些特别需要注意的点: -1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数,我们决定使用已有的`deviceId_`变量来区分layer的属性,定义`-2`为`MkldnnLayer`特有的设备ID。 +1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数,我们决定使用已有的`deviceId_`变量来区分layer的属性,定义`-2`为`MKLDNNLayer`特有的设备ID。 2. 重写父类Layer的**init**函数,修改`deviceId_`为`-2`,代表这个layer是用于跑在MKL-DNN的环境下。 -3. 创建`MkldnnMatrix`,用于管理MKL-DNN会用到的相关memory函数、接口以及会用的到格式信息。 -4. 创建`MkldnnBase`,定义一些除了layer和memory相关的类和函数。包括MKL-DNN会用到`MkldnnStream`和`CpuEngine`,和未来可能还会用到`FPGAEngine`等。 -5. 在**Argument**里添加两个`MkldnnMatrixPtr`,取名为`mkldnnValue`和`mkldnnGrad`,用于存放`MkldnnLayer`会用到的memory buffer。 并且添加函数cvt(会修改为一个更加合适的函数名),用于处理"CPU device"和"MKL-DNN device"之间memory的相互转化。 -6. 在父类`Layer`中的`getOutput`函数中添加一段逻辑,用于判断`deviceId`,并针对device在MKL-DNN和CPU之间不统一的情况,做一个前期转换。 也就是调用`Argument`的cvt函数把output统一到需要的device上。 -7. 在原来的`FLAGS`中添加一个`use_mkldnn`的flag,用于选择是否使用MKL-DNN的相关功能。 -8. 关于MKLDNN参数的保存。由于MKLDNN参数的格式与PaddlePaddle原有的格式存在不一样的情况,所以需要在保存参数时同时保存该格式信息。目前准备扩展[Header](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/parameter/Parameter.h#L247)里面的`int32_t version`。这个值不管是在v1还是在v2里面,一直保存的是0,所以可以充分利用这个信息,定义一个枚举处理所有MKLDNN的参数格式,从而`MKLDNNLayer`就可以从输入的参数中获取需要的格式信息。 +3. 创建`MKLDNNMatrix`,同时继承`CpuMatrix`和`mkldnn::memory`。用于管理MKL-DNN会用到的相关memory函数、接口以及会用的到格式信息。 +4. 创建`MKLDNNBase`,定义一些除了layer和memory相关的类和函数。包括MKL-DNN会用到`MKLDNNStream`和`CPUEngine`,和未来可能还会用到`FPGAEngine`等。 +5. 每个`MKLDNNlayer`都会有`inVal_`,`inGrad_`,`outVal_`和`outGrad_`,分别代表input value, input gradient,output value和output gradient。他们会存放MKL-DNN用到的internal memory。同时还会定义以*ext*开头的`MKLDNNMatrix`(表示external的memory),主要是在格式与PaddlePaddle默认的`nchw`格式不匹配时,用于转换内存的工作。必要的转换函数也会在`MKLDNNLayer`中提前定义好,每个子类只需要调用定义好的reset buffer函数即可。 +6. 每个`MKLDNNlayer`的resetbuffer相关的函数(包括reset input、output的Value和grad),他们会根据输入参数reset internal和external的memory,当然这两者也可以相等,即表示不需要转换。只需要把握一个原则,每个`MKLDNNlayer`的子类,只需要使用internal的memory就可以了,所有external的转换工作在父类的reset函数中都提前准备好了。 +7. 一般来说,external的memory会尽量与PaddlePaddle中的`value`和`grad`共享内存。同时每个`MKLDNNLayer`中的external output value和gradient(也就是`extOutVal_`和`extOutGrad_`)必须分别与`output_.value`和`output_.grad`共享内存,因为PaddlePaddle的activation会直接使用`output_.value`和`output_.grad`。如果不需要external的buffer用于转换,那么internal的buffer也会与他们共享内存。 +8. 如果MKL-DNN layer的后面接有cpu device,那么就会使`output_.value`与`extOutVal_`共享内存,同时数据格式就是`nchw`,这样下一个cpu device就能拿到正确的数据。在有cpu device的时候,external的memory的格式始终是`nchw`或者`nc`。 +9. 由于MKL-DNN的输出操作都是覆盖data的,不是在原来的数据上累加,所以当网络出现分支时,在`backward`时会需要merge不同layer的梯度。`MKLDNNlayer`中会实现merge的方法,此时每个小分支的input gradient会先临时保存在一个`MKLDNNMatrix`中,由分支处的layer负责求和,并把结果放到这个layer的`output_.grad`中。所以整体上,每个子类并不会需要关心分支的事情,也是在父类都实现好了。 +10. 在原来的`FLAGS`中添加一个`use_mkldnn`的flag,用于选择是否使用MKL-DNN的相关功能。 ## References diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp index 4547afaf1dc9af8bc7909a684db766fdd7b159c0..53a36f8f20d1143470928f57eda6f575d9048236 100644 --- a/paddle/capi/Matrix.cpp +++ b/paddle/capi/Matrix.cpp @@ -54,6 +54,46 @@ paddle_error paddle_matrix_set_row(paddle_matrix mat, return kPD_NO_ERROR; } +PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat, + paddle_real* value) { + if (mat == nullptr || value == nullptr) return kPD_NULLPTR; + auto ptr = cast(mat); + if (ptr->mat == nullptr) return kPD_NULLPTR; + paddle::real* buf = ptr->mat->getRowBuf(0); + size_t width = ptr->mat->getWidth(); + size_t height = ptr->mat->getHeight(); + if (ptr->mat->useGpu()) { +#ifdef PADDLE_WITH_CUDA + hl_memcpy(buf, value, sizeof(paddle::real) * width * height); +#else + return kPD_NOT_SUPPORTED; +#endif + } else { + std::copy(value, value + width * height, buf); + } + return kPD_NO_ERROR; +} + +PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat, + paddle_real* result) { + if (mat == nullptr || result == nullptr) return kPD_NULLPTR; + auto ptr = cast(mat); + if (ptr->mat == nullptr) return kPD_NULLPTR; + paddle::real* buf = ptr->mat->getRowBuf(0); + size_t width = ptr->mat->getWidth(); + size_t height = ptr->mat->getHeight(); + if (ptr->mat->useGpu()) { +#ifdef PADDLE_WITH_CUDA + hl_memcpy(result, buf, width * height * sizeof(paddle::real)); +#else + return kPD_NOT_SUPPORTED; +#endif + } else { + std::copy(buf, buf + width * height, result); + } + return kPD_NO_ERROR; +} + paddle_error paddle_matrix_get_row(paddle_matrix mat, uint64_t rowID, paddle_real** rawRowBuffer) { diff --git a/paddle/capi/examples/model_inference/dense/main.c b/paddle/capi/examples/model_inference/dense/main.c index 3e6bd5285058a297c4574631e2a5c033b83936e8..876af2aa7615c098d225b56ce2ea0b1529a6e3c6 100644 --- a/paddle/capi/examples/model_inference/dense/main.c +++ b/paddle/capi/examples/model_inference/dense/main.c @@ -27,18 +27,20 @@ int main() { CHECK(paddle_arguments_resize(in_args, 1)); // Create input matrix. - paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1, + paddle_matrix mat = paddle_matrix_create(/* sample_num */ 10, /* size */ 784, /* useGPU */ false); srand(time(0)); - paddle_real* array; - // Get First row. - CHECK(paddle_matrix_get_row(mat, 0, &array)); + std::vector input; + input.resize(784 * 10); - for (int i = 0; i < 784; ++i) { - array[i] = rand() / ((float)RAND_MAX); + for (int i = 0; i < input.size(); ++i) { + input[i] = rand() / ((float)RAND_MAX); } + + // Set value for the input matrix + CHECK(paddle_matrix_set_value(mat, input.data())); CHECK(paddle_arguments_set_value(in_args, 0, mat)); @@ -51,11 +53,17 @@ int main() { CHECK(paddle_arguments_get_value(out_args, 0, prob)); - CHECK(paddle_matrix_get_row(prob, 0, &array)); + std::std::vector result; + int height; + int width; + + CHECK(paddle_matrix_get_shape(prob, &height, &width); + result.resize(height * width); + CHECK(paddle_matrix_get_value(prob, result.data())); printf("Prob: "); - for (int i = 0; i < 10; ++i) { - printf("%.2f ", array[i]); + for (int i = 0; i < height * width; ++i) { + printf("%.2f ", result[i]); } printf("\n"); diff --git a/paddle/capi/matrix.h b/paddle/capi/matrix.h index f15f7f3bbbd1457617111f827d2182ae6b7d9fdb..bb5223f8a275fa2550bf8b7e94a9c4333de4c8c9 100644 --- a/paddle/capi/matrix.h +++ b/paddle/capi/matrix.h @@ -70,6 +70,16 @@ PD_API paddle_error paddle_matrix_set_row(paddle_matrix mat, uint64_t rowID, paddle_real* rowArray); +/** + * @brief paddle_matrix_set_value Set value to matrix. + * @param mat Target Matrix + * @param value Row data. + * @return paddle_error + * @note value should contain enough element of data to init the mat + */ +PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat, + paddle_real* value); + /** * @brief PDMatGetRow Get raw row buffer from matrix * @param [in] mat Target matrix @@ -81,6 +91,15 @@ PD_API paddle_error paddle_matrix_get_row(paddle_matrix mat, uint64_t rowID, paddle_real** rawRowBuffer); +/** + * @brief copy data from the matrix + * @param [in] mat Target matrix + * @param [out] result pointer to store the matrix data + * @return paddle_error + * @note the space of the result should allocated before invoke this API + */ +PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat, + paddle_real* result); /** * @brief PDMatCreateNone Create None Matrix * @return diff --git a/paddle/capi/tests/test_Matrix.cpp b/paddle/capi/tests/test_Matrix.cpp index 4bf9a9d6a9f9161561e9e5612edd2c93cab7ac5b..6940c28448a897cecd78b718fe720441086a5a99 100644 --- a/paddle/capi/tests/test_Matrix.cpp +++ b/paddle/capi/tests/test_Matrix.cpp @@ -45,3 +45,49 @@ TEST(CAPIMatrix, createNone) { paddle_matrix mat = paddle_matrix_create_none(); ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat)); } + +TEST(CAPIMatrix, cpu_get_set_value) { + paddle_matrix mat = paddle_matrix_create(128, 32, false); + std::vector sample; + std::vector result; + sample.resize(128 * 32); + result.resize(128 * 32); + for (size_t i = 0; i < sample.size(); ++i) { + sample[i] = 1.0 / (i + 1.0); + } + ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_set_value(mat, sample.data())); + ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_value(mat, result.data())); + for (size_t i = 0; i < sample.size(); ++i) { + ASSERT_NEAR(sample[i], result[i], 1e-5); + } + + uint64_t height, width; + ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width)); + ASSERT_EQ(128UL, height); + ASSERT_EQ(32UL, width); + ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat)); +} + +#ifdef PADDLE_WITH_CUDA +TEST(CAPIMatrix, gpu_get_set_value) { + paddle_matrix mat = paddle_matrix_create(128, 32, true); + std::vector sample; + std::vector result; + sample.resize(128 * 32); + result.resize(128 * 32); + for (size_t i = 0; i < sample.size(); ++i) { + sample[i] = 1.0 / (i + 1.0); + } + ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_set_value(mat, sample.data())); + ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_value(mat, result.data())); + for (size_t i = 0; i < sample.size(); ++i) { + ASSERT_NEAR(sample[i], result[i], 1e-5); + } + + uint64_t height, width; + ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width)); + ASSERT_EQ(128UL, height); + ASSERT_EQ(32UL, width); + ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat)); +} +#endif diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc index fb2c69105627f663ddcce07d31526c9e4278e863..9428b8a07ea0af005f6e960ddaa02da624ad9d97 100644 --- a/paddle/framework/scope.cc +++ b/paddle/framework/scope.cc @@ -98,5 +98,23 @@ void Scope::DeleteScope(Scope* scope) { delete scope; } +void Scope::Rename(const std::string& origin_name, + const std::string& new_name) const { + auto origin_it = vars_.find(origin_name); + PADDLE_ENFORCE(origin_it != vars_.end(), + "Cannot find original variable with name %s", origin_name); + auto new_it = vars_.find(new_name); + PADDLE_ENFORCE(new_it == vars_.end(), + "The variable with name %s is already in the scope", new_name); + vars_[new_name] = origin_it->second; + vars_.erase(origin_it); +} + +std::string Scope::Rename(const std::string& origin_name) const { + auto var_name = string::Sprintf("%p.%d", this, vars_.size()); + Rename(origin_name, var_name); + return var_name; +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h index fb660949394149ebf2c6172a0ac3f4c7594f4286..c2aafb6ad825f9bd9ffef754923a15afdeaa8e5c 100644 --- a/paddle/framework/scope.h +++ b/paddle/framework/scope.h @@ -68,11 +68,18 @@ class Scope { // enumerate all the variables current contains. std::vector GetAllNames(bool recursive = false) const; + // Rename variable to a new name + void Rename(const std::string& origin_name, + const std::string& new_name) const; + + // Rename variable to a new name and return the new name + std::string Rename(const std::string& origin_name) const; + private: // Call Scope::NewScope for a sub-scope. explicit Scope(Scope const* parent) : parent_(parent) {} - std::unordered_map vars_; + mutable std::unordered_map vars_; mutable std::list kids_; Scope const* parent_{nullptr}; diff --git a/paddle/gserver/layers/ROIPoolLayer.cpp b/paddle/gserver/layers/ROIPoolLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..99cfddb0cf3337745a716a8c329713c18b99eda3 --- /dev/null +++ b/paddle/gserver/layers/ROIPoolLayer.cpp @@ -0,0 +1,220 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "ROIPoolLayer.h" + +namespace paddle { + +REGISTER_LAYER(roi_pool, ROIPoolLayer); + +bool ROIPoolLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + + const ROIPoolConfig& layerConf = config_.inputs(0).roi_pool_conf(); + pooledWidth_ = layerConf.pooled_width(); + pooledHeight_ = layerConf.pooled_height(); + spatialScale_ = layerConf.spatial_scale(); + + return true; +} + +void ROIPoolLayer::forward(PassType passType) { + Layer::forward(passType); + + const ROIPoolConfig& layerConf = config_.inputs(0).roi_pool_conf(); + height_ = getInput(0).getFrameHeight(); + if (!height_) height_ = layerConf.height(); + width_ = getInput(0).getFrameWidth(); + if (!width_) width_ = layerConf.width(); + channels_ = getInputValue(0)->getWidth() / width_ / height_; + + size_t batchSize = getInput(0).getBatchSize(); + size_t numROIs = getInput(1).getBatchSize(); + + MatrixPtr dataValue = getInputValue(0); + MatrixPtr roiValue = getInputValue(1); + resetOutput(numROIs, channels_ * pooledHeight_ * pooledWidth_); + MatrixPtr outputValue = getOutputValue(); + + if (useGpu_) { // TODO(guosheng): implement on GPU later + MatrixPtr dataCpuBuffer; + Matrix::resizeOrCreate(dataCpuBuffer, + dataValue->getHeight(), + dataValue->getWidth(), + false, + false); + MatrixPtr roiCpuBuffer; + Matrix::resizeOrCreate(roiCpuBuffer, + roiValue->getHeight(), + roiValue->getWidth(), + false, + false); + dataCpuBuffer->copyFrom(*dataValue); + roiCpuBuffer->copyFrom(*roiValue); + dataValue = dataCpuBuffer; + roiValue = roiCpuBuffer; + MatrixPtr outputCpuBuffer; + Matrix::resizeOrCreate(outputCpuBuffer, + outputValue->getHeight(), + outputValue->getWidth(), + false, + false); + outputCpuBuffer->copyFrom(*outputValue); + outputValue = outputCpuBuffer; + } + + real* bottomData = dataValue->getData(); + size_t batchOffset = dataValue->getWidth(); + size_t channelOffset = height_ * width_; + real* bottomROIs = roiValue->getData(); + size_t roiOffset = roiValue->getWidth(); + size_t poolChannelOffset = pooledHeight_ * pooledWidth_; + + real* outputData = outputValue->getData(); + Matrix::resizeOrCreate(maxIdxs_, + numROIs, + channels_ * pooledHeight_ * pooledWidth_, + false, + false); + real* argmaxData = maxIdxs_->getData(); + + for (size_t n = 0; n < numROIs; ++n) { + // the first five elememts of each RoI should be: + // batch_idx, roi_x_start, roi_y_start, roi_x_end, roi_y_end + size_t roiBatchIdx = bottomROIs[0]; + size_t roiStartW = round(bottomROIs[1] * spatialScale_); + size_t roiStartH = round(bottomROIs[2] * spatialScale_); + size_t roiEndW = round(bottomROIs[3] * spatialScale_); + size_t roiEndH = round(bottomROIs[4] * spatialScale_); + CHECK_GE(roiBatchIdx, 0); + CHECK_LT(roiBatchIdx, batchSize); + size_t roiHeight = std::max(roiEndH - roiStartH + 1, 1UL); + size_t roiWidth = std::max(roiEndW - roiStartW + 1, 1UL); + real binSizeH = + static_cast(roiHeight) / static_cast(pooledHeight_); + real binSizeW = + static_cast(roiWidth) / static_cast(pooledWidth_); + real* batchData = bottomData + batchOffset * roiBatchIdx; + for (size_t c = 0; c < channels_; ++c) { + for (size_t ph = 0; ph < pooledHeight_; ++ph) { + for (size_t pw = 0; pw < pooledWidth_; ++pw) { + size_t hstart = static_cast(std::floor(ph * binSizeH)); + size_t wstart = static_cast(std::floor(pw * binSizeW)); + size_t hend = static_cast(std::ceil((ph + 1) * binSizeH)); + size_t wend = static_cast(std::ceil((pw + 1) * binSizeW)); + hstart = std::min(std::max(hstart + roiStartH, 0UL), height_); + wstart = std::min(std::max(wstart + roiStartW, 0UL), width_); + hend = std::min(std::max(hend + roiStartH, 0UL), height_); + wend = std::min(std::max(wend + roiStartW, 0UL), width_); + + bool isEmpty = (hend <= hstart) || (wend <= wstart); + size_t poolIndex = ph * pooledWidth_ + pw; + if (isEmpty) { + outputData[poolIndex] = 0; + argmaxData[poolIndex] = -1; + } + + for (size_t h = hstart; h < hend; ++h) { + for (size_t w = wstart; w < wend; ++w) { + size_t index = h * width_ + w; + if (batchData[index] > outputData[poolIndex]) { + outputData[poolIndex] = batchData[index]; + argmaxData[poolIndex] = index; + } + } + } + } + } + batchData += channelOffset; + outputData += poolChannelOffset; + argmaxData += poolChannelOffset; + } + bottomROIs += roiOffset; + } + if (useGpu_) { + getOutputValue()->copyFrom(*outputValue); + } +} + +void ROIPoolLayer::backward(const UpdateCallback& callback) { + MatrixPtr inGradValue = getInputGrad(0); + MatrixPtr outGradValue = getOutputGrad(); + MatrixPtr roiValue = getInputValue(1); + + if (useGpu_) { + MatrixPtr inGradCpuBuffer; + Matrix::resizeOrCreate(inGradCpuBuffer, + inGradValue->getHeight(), + inGradValue->getWidth(), + false, + false); + MatrixPtr outGradCpuBuffer; + Matrix::resizeOrCreate(outGradCpuBuffer, + outGradValue->getHeight(), + outGradValue->getWidth(), + false, + false); + MatrixPtr roiCpuBuffer; + Matrix::resizeOrCreate(roiCpuBuffer, + roiValue->getHeight(), + roiValue->getWidth(), + false, + false); + inGradCpuBuffer->copyFrom(*inGradValue); + outGradCpuBuffer->copyFrom(*outGradValue); + roiCpuBuffer->copyFrom(*roiValue); + inGradValue = inGradCpuBuffer; + outGradValue = outGradCpuBuffer; + roiValue = roiCpuBuffer; + } + + real* bottomROIs = roiValue->getData(); + size_t numROIs = getInput(1).getBatchSize(); + size_t roiOffset = getInputValue(1)->getWidth(); + + real* inDiffData = inGradValue->getData(); + size_t batchOffset = getInputValue(0)->getWidth(); + size_t channelOffset = height_ * width_; + + real* outDiffData = outGradValue->getData(); + size_t poolChannelOffset = pooledHeight_ * pooledWidth_; + real* argmaxData = maxIdxs_->getData(); + + for (size_t n = 0; n < numROIs; ++n) { + size_t roiBatchIdx = bottomROIs[0]; + real* batchDiffData = inDiffData + batchOffset * roiBatchIdx; + for (size_t c = 0; c < channels_; ++c) { + for (size_t ph = 0; ph < pooledHeight_; ++ph) { + for (size_t pw = 0; pw < pooledWidth_; ++pw) { + size_t poolIndex = ph * pooledWidth_ + pw; + if (argmaxData[poolIndex] > 0) { + size_t index = static_cast(argmaxData[poolIndex]); + batchDiffData[index] += outDiffData[poolIndex]; + } + } + } + batchDiffData += channelOffset; + outDiffData += poolChannelOffset; + argmaxData += poolChannelOffset; + } + bottomROIs += roiOffset; + } + + if (useGpu_) { + getInputGrad(0)->copyFrom(*inGradValue); + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/ROIPoolLayer.h b/paddle/gserver/layers/ROIPoolLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..4f07e49d6fd1eda9fa7bd46e4cec771a75f571be --- /dev/null +++ b/paddle/gserver/layers/ROIPoolLayer.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" + +namespace paddle { + +/** + * A layer used by Fast R-CNN to extract feature maps of ROIs from the last + * feature map. + * - Input: This layer needs two input layers: The first input layer is a + * convolution layer; The second input layer contains the ROI data + * which is the output of ProposalLayer in Faster R-CNN. layers for + * generating bbox location offset and the classification confidence. + * - Output: The ROIs' feature map. + * Reference: + * Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. + * Faster R-CNN: Towards Real-Time Object Detection with Region Proposal + * Networks + */ + +class ROIPoolLayer : public Layer { +protected: + size_t channels_; + size_t width_; + size_t height_; + size_t pooledWidth_; + size_t pooledHeight_; + real spatialScale_; + + // Since there is no int matrix, use real maxtrix instead. + MatrixPtr maxIdxs_; + +public: + explicit ROIPoolLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; +} // namespace paddle diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index df73e6781533def5641635e9dfa9c9e4e8a0b57f..fcbcb5b0f1f4cb07066363c9fa93fb1726459f30 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -2056,6 +2056,43 @@ TEST(Layer, CropLayer) { } } +TEST(Layer, roi_pool) { + TestConfig config; + config.layerConfig.set_type("roi_pool"); + config.biasSize = 0; + LayerInputConfig* input = config.layerConfig.add_inputs(); + ROIPoolConfig* roiPoolConf = input->mutable_roi_pool_conf(); + roiPoolConf->set_pooled_width(7); + roiPoolConf->set_pooled_height(7); + roiPoolConf->set_spatial_scale(1. / 16); + roiPoolConf->set_width(14); + roiPoolConf->set_height(14); + + const size_t roiNum = 10; + const size_t roiDim = 10; + const size_t batchSize = 5; + MatrixPtr roiValue = Matrix::create(roiNum, roiDim, false, false); + roiValue->zeroMem(); + real* roiData = roiValue->getData(); + for (size_t i = 0; i < roiNum; ++i) { + roiData[i * roiDim + 0] = std::rand() % batchSize; + roiData[i * roiDim + 1] = std::rand() % 224; // xMin + roiData[i * roiDim + 2] = std::rand() % 224; // yMin + size_t xMin = static_cast(roiData[i * roiDim + 1]); + size_t yMin = static_cast(roiData[i * roiDim + 2]); + roiData[i * roiDim + 3] = xMin + std::rand() % (224 - xMin); // xMax + roiData[i * roiDim + 4] = yMin + std::rand() % (224 - yMin); // yMax + } + + config.inputDefs.push_back({INPUT_DATA, "input", 3 * 14 * 14, {}}); + config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "rois", roiValue, {}}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "roi_pool", batchSize, false, useGpu, false); + } +} + TEST(Layer, SwitchOrderLayer) { TestConfig config; // config input_0 diff --git a/paddle/operators/expand_op.cc b/paddle/operators/expand_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..282775fcda45fe3bbd72bf04a7ae828f2c840ab7 --- /dev/null +++ b/paddle/operators/expand_op.cc @@ -0,0 +1,136 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/expand_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class ExpandOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null."); + + std::vector expand_times = + ctx->Attrs().Get>("expand_times"); + auto x_dims = ctx->GetInputDim("X"); + + PADDLE_ENFORCE_EQ(static_cast(x_dims.size()), expand_times.size(), + "The number of Attr(expand_times)'s value must be equal " + "to the rank of Input(X)."); + PADDLE_ENFORCE_LE(x_dims.size(), 6, + "The rank of Input(X) must not be greater than 6."); + + std::vector out_shape(x_dims.size()); + for (size_t i = 0; i < expand_times.size(); ++i) { + PADDLE_ENFORCE_GE(expand_times[i], 1, + "Each value of Attr(expand_times) should not be " + "less than 1."); + out_shape[i] = x_dims[i] * expand_times[i]; + } + + ctx->SetOutputDim("Out", framework::make_ddim(out_shape)); + if (out_shape[0] == x_dims[0]) { + ctx->ShareLoD("X", "Out"); + } + } +}; + +class ExpandOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ExpandOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor, default Tensor) A tensor with rank in [1, 6]." + "X is the input tensor to be expanded."); + AddOutput("Out", + "(Tensor, default Tensor) A tensor with rank in [1, 6]." + "The rank of Output(Out) is same as Input(X) except that each " + "dimension size of Output(Out) is equal to corresponding " + "dimension size of Input(X) multiplying corresponding value of " + "Attr(expand_times)."); + AddAttr>("expand_times", + "Expand times number for each dimension."); + AddComment(R"DOC( +Expand operator tiles the input by given times number. You should set times +number for each dimension by providing attribute 'expand_times'. The rank of X +should be in [1, 6]. Please notice that size of 'expand_times' must be same with +X's rank. Following is a using case: + +Input(X) is a 3-D tensor with shape [2, 3, 1]: + + [ + [[1], [2], [3]], + [[4], [5], [6]] + ] + +Attr(expand_times): [1, 2, 2] + +Output(Out) is a 3-D tensor with shape [2, 6, 2]: + + [ + [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]], + [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]] + ] + +)DOC"); + } +}; + +class ExpandGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + std::vector expand_times = + ctx->Attrs().Get>("expand_times"); + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + for (size_t i = 0; i < expand_times.size(); ++i) { + PADDLE_ENFORCE_EQ(x_dims[i] * expand_times[i], out_dims[i], + "Each dimension size of Input(Out@GRAD) should be " + "equal to multiplication of crroresponding dimension " + "size of Input(X) and Attr(expand_times) value."); + } + + auto x_grad_name = framework::GradVarName("X"); + + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(expand, ops::ExpandOp, ops::ExpandOpMaker, expand_grad, + ops::ExpandGradOp); +REGISTER_OP_CPU_KERNEL(expand, + ops::ExpandKernel); +REGISTER_OP_CPU_KERNEL( + expand_grad, ops::ExpandGradKernel); diff --git a/paddle/operators/expand_op.cu b/paddle/operators/expand_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..6744562b6c21dd8bfeb7e4cb6b809dc7913aa3a5 --- /dev/null +++ b/paddle/operators/expand_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU + +#include "paddle/operators/expand_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(expand, + ops::ExpandKernel); +REGISTER_OP_GPU_KERNEL( + expand_grad, ops::ExpandGradKernel); diff --git a/paddle/operators/expand_op.h b/paddle/operators/expand_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8ae2c11a5d31dafc1b90d129054ebfabfb761bfe --- /dev/null +++ b/paddle/operators/expand_op.h @@ -0,0 +1,172 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + You may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" + +#define MAX_RANK_SUPPORTED 6 + +#define EXPAND_TEMPLATE(z, n, data) \ + case n + 1: { \ + Expand(context); \ + break; \ + } +#define REP_EXPAND_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_TEMPLATE, ~) +#define COND(n) \ + BOOST_PP_GREATER_EQUAL(BOOST_PP_DIV(n, MAX_RANK_SUPPORTED), \ + BOOST_PP_MOD(n, MAX_RANK_SUPPORTED)) +#define EXPAND_GRAD_CASE(n) \ + case n: { \ + ExpandBackward(context, reshape_dims_vec, reduce_dims_vec); \ + break; \ + } +#define EXPAND_GRAD_TEMPLATE(z, n, data) \ + BOOST_PP_IF(COND(n), EXPAND_GRAD_CASE(n), ) +#define REP_EXPAND_GRAD_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_GRAD_TEMPLATE, ~) + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; +template +using EigenTensor = framework::EigenTensor; + +template +class ExpandKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto rank = context.Input("X")->dims().size(); + switch (rank) { + REP_EXPAND_TEMPLATE(MAX_RANK_SUPPORTED) + default: + PADDLE_ENFORCE(false, + "Only support tensor with rank being between 1 and 6."); + } + } + + protected: + template + void Expand(const framework::ExecutionContext& context) const { + auto* in0 = context.Input("X"); + auto& expand_times = context.Attr>("expand_times"); + auto* out0 = context.Output("Out"); + Eigen::DSizes bcast_dims; + auto x_dims = in0->dims(); + for (size_t i = 0; i < expand_times.size(); ++i) { + bcast_dims[i] = expand_times[i]; + } + auto x = EigenTensor::From(*in0); + out0->mutable_data(context.GetPlace()); + auto y = EigenTensor::From(*out0); + auto place = context.GetEigenDevice(); + y.device(place) = x.broadcast(bcast_dims); + } +}; + +template +class ExpandGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in0 = context.Input("X"); + auto& expand_times = context.Attr>("expand_times"); + auto x_dims = in0->dims(); + // 1. reshape_dims_vec is the broadcast parameter. For each dimension i, + // if expand_times[i] > 1 and x_dims[i] > 1, i will be splitted to two + // dimensions [expand_times[i], x_dims[i]]. + // 2. reduce_dims_vec is the dimension parameter to compute gradients. For + // each dimension expanded, the gradients should be summed to original + // size. + std::vector reshape_dims_vec; + std::vector reduce_dims_vec; + for (size_t i = 0; i < expand_times.size(); ++i) { + if (expand_times[i] == 1) { + reshape_dims_vec.push_back(x_dims[i]); + } else { + if (x_dims[i] == 1) { + reduce_dims_vec.push_back(reshape_dims_vec.size()); + reshape_dims_vec.push_back(expand_times[i]); + } else { + reduce_dims_vec.push_back(reshape_dims_vec.size()); + reshape_dims_vec.push_back(expand_times[i]); + reshape_dims_vec.push_back(x_dims[i]); + } + } + } + + int dims = reshape_dims_vec.size() * MAX_RANK_SUPPORTED + + reduce_dims_vec.size() - MAX_RANK_SUPPORTED - 1; + // no need reduce, just copy + if (reduce_dims_vec.size() == 0) { + auto* in0 = context.Input(framework::GradVarName("Out")); + auto* out0 = context.Output(framework::GradVarName("X")); + out0->mutable_data(context.GetPlace()); + out0->CopyFrom(*in0, context.GetPlace(), context.device_context()); + } else { + switch (dims) { + REP_EXPAND_GRAD_TEMPLATE(72) + default: + PADDLE_ENFORCE( + false, "Only support tensor with rank being between 1 and 6."); + } + } + } + + protected: + template + void ExpandBackward(const framework::ExecutionContext& context, + const std::vector& reshape_dims_vec, + const std::vector& reduce_dims_vec) const { + size_t reshape_size = Dims / MAX_RANK_SUPPORTED + 1; + size_t reduce_size = Dims % MAX_RANK_SUPPORTED + 1; + PADDLE_ENFORCE_EQ(reshape_size, reshape_dims_vec.size(), + "Inconsistent size between template Dims and " + "reshape dimensions."); + PADDLE_ENFORCE_EQ(reduce_size, reduce_dims_vec.size(), + "Inconsistent size between template Dims and " + "reduce dimensions."); + auto* in0 = context.Input(framework::GradVarName("Out")); + auto* out0 = context.Output(framework::GradVarName("X")); + auto x = EigenVector::Flatten(*(context.Input("X"))); + out0->mutable_data(context.GetPlace()); + auto x_grad = EigenVector::Flatten(*out0); + Eigen::DSizes reshape_dims; + for (size_t i = 0; i < reshape_size; ++i) { + reshape_dims[i] = reshape_dims_vec[i]; + } + Eigen::DSizes reduce_dims; + for (size_t i = 0; i < reduce_size; ++i) { + reduce_dims[i] = reduce_dims_vec[i]; + } + auto out_grad = EigenVector::Flatten(*in0); + x_grad.device(context.GetEigenDevice()) = + out_grad.reshape(reshape_dims).sum(reduce_dims).reshape(x.dimensions()); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index 90bc9f4f922e7aa09523bad8ffb3ef477dd89857..ab7f23f57043844d45c36acc475422613164bee1 100644 --- a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -13,7 +13,7 @@ if(WITH_GPU) nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context) nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context) nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions) - nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions) + nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function) else() cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator) cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function) diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc index 034e5ca0f01ff028847eb377b0d0fb3719b08dbf..a137ffe57f9866d92d081c6aceecaf997c5df032 100644 --- a/paddle/operators/math/math_function.cc +++ b/paddle/operators/math/math_function.cc @@ -262,8 +262,8 @@ DEFINE_CPU_TRANS(4); DEFINE_CPU_TRANS(5); DEFINE_CPU_TRANS(6); -struct TensorSetConstant { - TensorSetConstant(framework::Tensor* tensor, float value) +struct TensorSetConstantCPU { + TensorSetConstantCPU(framework::Tensor* tensor, float value) : tensor_(tensor), value_(value) {} template void operator()() const { @@ -280,7 +280,7 @@ void set_constant_with_place( const platform::DeviceContext& context, framework::Tensor* tensor, float value) { framework::VisitDataType(framework::ToDataType(tensor->type()), - TensorSetConstant(tensor, value)); + TensorSetConstantCPU(tensor, value)); } struct TensorSetConstantWithPlace : public boost::static_visitor { diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu index 67cac93b8db20153ee9a4fe3ffbcaea8835e4afe..57b995f36dda9b9d0627e1b30b6c0d78245e723d 100644 --- a/paddle/operators/math/math_function.cu +++ b/paddle/operators/math/math_function.cu @@ -268,8 +268,8 @@ DEFINE_GPU_TRANS(4); DEFINE_GPU_TRANS(5); DEFINE_GPU_TRANS(6); -struct TensorSetConstant { - TensorSetConstant(const platform::DeviceContext& context, +struct TensorSetConstantGPU { + TensorSetConstantGPU(const platform::DeviceContext& context, framework::Tensor* tensor, float value) : context_(context), tensor_(tensor), value_(value) {} @@ -289,7 +289,7 @@ void set_constant_with_place( const platform::DeviceContext& context, framework::Tensor* tensor, float value) { framework::VisitDataType(framework::ToDataType(tensor->type()), - TensorSetConstant(context, tensor, value)); + TensorSetConstantGPU(context, tensor, value)); } } // namespace math diff --git a/paddle/operators/nccl/nccl_gpu_common.h b/paddle/operators/nccl/nccl_gpu_common.h index 5858cd4839d367bb888b2b98cde2225751391162..48e322f99398a7f1d6af9cab653d0cc92d981fe0 100644 --- a/paddle/operators/nccl/nccl_gpu_common.h +++ b/paddle/operators/nccl/nccl_gpu_common.h @@ -35,6 +35,7 @@ constexpr int kInvalidGPUId = -1; struct Communicator { std::vector comms_; std::unordered_map comm_id_map_; + bool inited_; Communicator() {} @@ -42,17 +43,21 @@ struct Communicator { void InitAll(const std::vector& gpus) { comms_.resize(gpus.size()); + inited_ = false; for (size_t i = 0; i < gpus.size(); ++i) { comm_id_map_[gpus[i]] = i; } PADDLE_ENFORCE( dynload::ncclCommInitAll(comms_.data(), gpus.size(), gpus.data())); + inited_ = true; } ~Communicator() { - for (size_t i = 0; i < comms_.size(); ++i) { - // FIXME(dzh) : PADDLE_ENFORCE return void - dynload::ncclCommDestroy(comms_[i]); + if (inited_) { + for (size_t i = 0; i < comms_.size(); ++i) { + // FIXME(dzh) : PADDLE_ENFORCE return void + dynload::ncclCommDestroy(comms_[i]); + } } } diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc index b0e87b7059eab3772c179fe31cdb09477b589ed1..0075ccd24271bf83f139e121efad00c2316cc11b 100644 --- a/paddle/operators/recurrent_op.cc +++ b/paddle/operators/recurrent_op.cc @@ -387,8 +387,8 @@ class RecurrentGradOp : public RecurrentBase { auto &p_names = Inputs(kParameters); PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size()); - for (size_t prog_id = 0; prog_id < pg_names.size(); ++prog_id) { - auto inside_grad_name = framework::GradVarName(p_names[prog_id]); + for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) { + auto inside_grad_name = framework::GradVarName(p_names[param_id]); // If does not compute gradient of that variable inside rnn, just // continue @@ -406,27 +406,19 @@ class RecurrentGradOp : public RecurrentBase { attrs["value"] = 0.0f; auto zero_op = framework::OpRegistry::CreateOp( - "fill_constant", {}, {{"Out", {pg_names[prog_id]}}}, attrs); + "fill_constant", {}, {{"Out", {pg_names[param_id]}}}, attrs); zero_op->Run(scope, dev_ctx); } + auto new_inside_name = cur_scope.Rename(inside_grad_name); // sum gradient - auto *outside_var = scope.FindVar(pg_names[prog_id]); - PADDLE_ENFORCE(outside_var != nullptr); - auto &outside_tensor = - *outside_var->GetMutable(); - - std::string result_var_name; - auto *local_result_var = cur_scope.Var(&result_var_name); - auto &local_result_tensor = - *local_result_var->GetMutable(); - - local_result_tensor.ShareDataWith(outside_tensor); auto sum_op = framework::OpRegistry::CreateOp( - "sum", {{"X", {result_var_name, inside_grad_name}}}, - {{"Out", {result_var_name}}}, {}); + "sum", {{"X", {pg_names[param_id], new_inside_name}}}, + {{"Out", {pg_names[param_id]}}}, {}); sum_op->Run(cur_scope, dev_ctx); + + cur_scope.Rename(new_inside_name, inside_grad_name); } } VLOG(5) << "Accumulate Parameter finished "; diff --git a/paddle/operators/sequence_concat_op.cc b/paddle/operators/sequence_concat_op.cc index 64097ef2525d734f79f22ddd7957b3216b06ee7b..db737bed7a4d2dc5b60cbc6ac172caec95acd35e 100644 --- a/paddle/operators/sequence_concat_op.cc +++ b/paddle/operators/sequence_concat_op.cc @@ -68,38 +68,42 @@ class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker { "The level should be less than the level number of inputs.") .SetDefault(0); AddComment(R"DOC( -Sequence Concat Operator. - -The sequence_concat operator concatenates multiple LoDTensors. -It supports a sequence (LoD Tensor with level number is 1) +The sequence_concat operator concatenates multiple LoDTensors. +It only supports sequence (LoD Tensor with level number is 1) or a nested sequence (LoD tensor with level number is 2) as its input. -The following examples explain how the operator works: - Case1: If the axis is other than 0(here, axis is 1 and level is 1), - each input should have the same LoD information and the LoD + each input should have the same LoD information and the LoD information of the output keeps the same as the input. - LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4) - LoD(x1) = {{0,2,4}, {0,1,2,3,4}}; Dims(x1) = (4,4,4) - LoD(Out) = {{0,2,4}, {0,1,2,3,4}}; Dims(Out) = (4,7,4) + LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4) + LoD(x1) = {{0,2,4}, {0,1,2,3,4}}; Dims(x1) = (4,4,4) + LoD(Out) = {{0,2,4}, {0,1,2,3,4}}; Dims(Out) = (4,7,4) - Case2: - If the axis is 0(here, leve is 0), the inputs are concatenated along + If the axis is 0(here, leve is 0), the inputs are concatenated along time steps, the LoD information of the output need to re-compute. + The LoD information of level-1 should be same. - LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4) - LoD(x1) = {{0,3,5}, {0,1,2,3,5}}; Dims(x1) = (5,3,4) - LoD(Out) = {{0,5,9}, {0,1,2,3,4,5,6,7,9}}; Dims(Out) = (9,3,4) + LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4) + LoD(x1) = {{0,2,4}, {0,1,3,5,7}}; Dims(x1) = (7,3,4) + LoD(Out) = {{0,2,4}, {0,2,5,8,11}}; Dims(Out) = (11,3,4) - Case3: If the axis is 0(here, level is 1). - LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4) - LoD(x1) = {{0,3,5}, {0,1,3,4,5}}; Dims(x1) = (5,3,4) - LoD(Out) = {{0,5,9}, {0,2,5,7,9}}; Dims(Out) = (9,3,4) + LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4) + LoD(x1) = {{0,3,4}, {0,1,3,5,7}}; Dims(x1) = (7,3,4) + LoD(Out) = {{0,5,8}, {0,1,2,3,5,7,8,9,11}}; Dims(Out) = (11,3,4) -NOTE: The levels of all the inputs should be the same. +- Case4: + If the LoD number is 1, axis is 0, level is 0 + LoD(x0) = {{0,1,2,3,4}}; Dims(x0) = (4,3,4) + LoD(x1) = {{0,1,3,5,7}}; Dims(x1) = (7,3,4) + LoD(Out) = {{0,2,5,8,11}}; Dims(Out) = (11,3,4) + +NOTE: The levels of all the inputs should be the same. )DOC"); } }; diff --git a/paddle/operators/sequence_concat_op.h b/paddle/operators/sequence_concat_op.h index 6adf96120c99f9b84a1ff947058e65ac3ddff1d4..09212070aa90b0f080f6140a312924229162aaec 100644 --- a/paddle/operators/sequence_concat_op.h +++ b/paddle/operators/sequence_concat_op.h @@ -24,28 +24,38 @@ using LoDTensor = framework::LoDTensor; using LoD = framework::LoD; template -LoD concatLoD(const std::vector ins, const size_t axis, - const size_t level) { +LoD ConcatLoD(const std::vector ins, const size_t level) { auto out_lod = ins[0]->lod(); + auto numLevels = ins[0]->NumLevels(); const size_t n = ins.size(); - if (axis == 0UL) { - for (size_t i = 1; i < n; ++i) { - for (size_t j = 0; j < ins[i]->lod()[0].size(); ++j) { - out_lod[0][j] += ins[i]->lod()[0][j]; - } + const size_t level_idx = ins[0]->NumLevels() - 1 - level; + for (size_t i = 1; i < n; ++i) { + for (size_t j = 0; j < ins[i]->lod()[level_idx].size(); ++j) { + out_lod[level_idx][j] += ins[i]->lod()[level_idx][j]; + } + } - if (ins[0]->NumLevels() == 2) { - for (size_t j = 1; j < ins[i]->lod()[1].size(); ++j) { - if (level == 0UL) { - out_lod[1].push_back(out_lod[1].back() + ins[i]->lod()[1][j] - - ins[i]->lod()[1][j - 1]); - } else if (level == 1UL) { - out_lod[1][j] += ins[1]->lod()[1][j]; - } + for (size_t i = level_idx; i < numLevels - 1; ++i) { + size_t lod_len = 1; + for (size_t j = 0; j < n; ++j) { + lod_len += ins[j]->lod()[i + 1].size() - 1; + } + out_lod[i + 1].clear(); + out_lod[i + 1].resize(lod_len); + + size_t idx = 1; + for (size_t j = 0; j < ins[0]->lod()[i].size() - 1; ++j) { + for (size_t k = 0; k < n; ++k) { + for (size_t m = ins[k]->lod()[i][j]; m < ins[k]->lod()[i][j + 1]; ++m) { + out_lod[i + 1][idx] = out_lod[i + 1][idx - 1] + + ins[k]->lod()[i + 1][m + 1] - + ins[k]->lod()[i + 1][m]; + idx++; } } } } + return out_lod; } @@ -82,18 +92,21 @@ class SequenceConcatOpKernel : public framework::OpKernel { "should be greater than the specify level"); out->mutable_data(ctx.GetPlace()); - auto out_lod = concatLoD(ins, axis, level); + auto out_lod = ins[0]->lod(); + if (axis == 0) { + out_lod = ConcatLoD(ins, level); + } out->set_lod(out_lod); - auto out_lod_level = out_lod[level]; + const size_t level_idx = out_lod.size() - level - 1; + auto out_lod_level = framework::ToAbsOffset(out_lod)[level_idx]; for (size_t i = 0; i < out_lod_level.size() - 1; ++i) { Tensor out_t = out->Slice(static_cast(out_lod_level[i]), static_cast(out_lod_level[i + 1])); auto out_stride = framework::stride(out_t.dims()); size_t offset = 0; - for (size_t j = 0; j < n; ++j) { - auto in_lod_level = ins[j]->lod()[level]; + auto in_lod_level = framework::ToAbsOffset(ins[j]->lod())[level_idx]; auto in_stride = framework::stride(ins[j]->dims()); Tensor in_t = ins[j]->Slice(static_cast(in_lod_level[i]), static_cast(in_lod_level[i + 1])); @@ -124,9 +137,12 @@ class SequenceConcatGradOpKernel : public framework::OpKernel { x_grads[i]->set_lod(ins[i]->lod()); x_grads[i]->mutable_data(ctx.GetPlace()); } - - auto out_lod = concatLoD(ins, axis, level); - auto out_lod_level = out_lod[level]; + auto out_lod = ins[0]->lod(); + if (axis == 0UL) { + out_lod = ConcatLoD(ins, level); + } + const size_t level_idx = out_lod.size() - level - 1; + auto out_lod_level = framework::ToAbsOffset(out_lod)[level_idx]; for (size_t i = 0; i < out_lod_level.size() - 1; ++i) { Tensor out_grad_t = @@ -136,7 +152,8 @@ class SequenceConcatGradOpKernel : public framework::OpKernel { size_t offset = 0; for (size_t j = 0; j < n; ++j) { - auto x_grad_lod_level = x_grads[j]->lod()[level]; + auto x_grad_lod_level = + framework::ToAbsOffset(x_grads[j]->lod())[level_idx]; auto x_grad_stride = framework::stride(x_grads[j]->dims()); Tensor x_grad_t = x_grads[j]->Slice(static_cast(x_grad_lod_level[i]), diff --git a/paddle/platform/call_once.h b/paddle/platform/call_once.h new file mode 100644 index 0000000000000000000000000000000000000000..248baf6613c5caebdabcecff5d57290585238d78 --- /dev/null +++ b/paddle/platform/call_once.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include + +namespace paddle { +namespace platform { + +/* + The current implementation of std::call_once has a bug described in + https://stackoverflow.com/questions/41717579/stdcall-once-hangs-on-second-call-after-callable-threw-on-first-call. + This is likely caused by a deeper bug of pthread_once, which is discussed in + https://patchwork.ozlabs.org/patch/482350/ + + This wrap is a hack to avoid this bug. +*/ +template +inline void call_once(std::once_flag& flag, Callable&& f, Args&&... args) { + bool good = false; + std::exception ex; + std::call_once(flag, [&]() { + try { + f(args...); + good = true; + } catch (const std::exception& e) { + ex = e; + } catch (...) { + ex = std::runtime_error("excption caught in call_once"); + } + }); + if (!good) { + throw std::exception(ex); + } +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/dynload/nccl.h b/paddle/platform/dynload/nccl.h index 0618c7414fd1235e81ee9d92a3a07b53d6ad6ebc..981b2ab258a34ce92f02ee12b5957f88ba61d1c0 100644 --- a/paddle/platform/dynload/nccl.h +++ b/paddle/platform/dynload/nccl.h @@ -17,6 +17,7 @@ #include #include #include +#include "paddle/platform/call_once.h" #include "paddle/platform/dynload/dynamic_loader.h" namespace paddle { @@ -27,18 +28,18 @@ extern std::once_flag nccl_dso_flag; extern void* nccl_dso_handle; #ifdef PADDLE_USE_DSO -#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ - using nccl_func = decltype(__name(args...)) (*)(Args...); \ - std::call_once(nccl_dso_flag, \ - paddle::platform::dynload::GetNCCLDsoHandle, \ - &nccl_dso_handle); \ - void* p_##__name = dlsym(nccl_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ +#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using nccl_func = decltype(__name(args...)) (*)(Args...); \ + platform::call_once(nccl_dso_flag, \ + paddle::platform::dynload::GetNCCLDsoHandle, \ + &nccl_dso_handle); \ + void* p_##__name = dlsym(nccl_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ extern DynLoad__##__name __name #else #define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \ diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto index 2d7ff1df98a9a448b447890537f20dd416a9ae9d..2c2cc6245932d4af56a68d6399ce31f008bf3748 100644 --- a/proto/ModelConfig.proto +++ b/proto/ModelConfig.proto @@ -321,6 +321,14 @@ message ClipConfig { required double max = 2; } +message ROIPoolConfig { + required uint32 pooled_width = 1; + required uint32 pooled_height = 2; + required float spatial_scale = 3; + optional uint32 height = 4 [ default = 1 ]; + optional uint32 width = 5 [ default = 1 ]; +} + message ScaleSubRegionConfig { required ImageConfig image_conf = 1; required float value = 2; @@ -348,6 +356,7 @@ message LayerInputConfig { optional DetectionOutputConfig detection_output_conf = 17; optional ClipConfig clip_conf = 18; optional ScaleSubRegionConfig scale_sub_region_conf = 19; + optional ROIPoolConfig roi_pool_conf = 20; } message LayerConfig { diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 9e2c6f59bd0af1627c79a8a29bd1515ae5c9c6b5..43d02bf70e74c3903d50a4a2177059f4f474045a 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -1969,6 +1969,18 @@ class DetectionOutputLayer(LayerBase): self.config.size = size +@config_layer('roi_pool') +class ROIPoolLayer(LayerBase): + def __init__(self, name, inputs, pooled_width, pooled_height, spatial_scale, + num_channels, **xargs): + super(ROIPoolLayer, self).__init__(name, 'roi_pool', 0, inputs) + config_assert(len(inputs) == 2, 'ROIPoolLayer must have 2 inputs') + self.config.inputs[0].roi_pool_conf.pooled_width = pooled_width + self.config.inputs[0].roi_pool_conf.pooled_height = pooled_height + self.config.inputs[0].roi_pool_conf.spatial_scale = spatial_scale + self.set_cnn_layer(name, pooled_height, pooled_width, num_channels) + + @config_layer('data') class DataLayer(LayerBase): def __init__(self, diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 36406ef86b812bcb4c671a0e1b1f29e391d79b99..617fbff948bf03098eca4a31f44d4ff05e73dbcf 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -122,6 +122,7 @@ __all__ = [ 'cross_channel_norm_layer', 'multibox_loss_layer', 'detection_output_layer', + 'roi_pool_layer', 'spp_layer', 'pad_layer', 'eos_layer', @@ -221,6 +222,7 @@ class LayerType(object): PRIORBOX_LAYER = 'priorbox' MULTIBOX_LOSS_LAYER = 'multibox_loss' DETECTION_OUTPUT_LAYER = 'detection_output' + ROI_POOL_LAYER = 'roi_pool' CTC_LAYER = 'ctc' WARP_CTC_LAYER = 'warp_ctc' @@ -1305,6 +1307,50 @@ def detection_output_layer(input_loc, name, LayerType.DETECTION_OUTPUT_LAYER, parents=parents, size=size) +@wrap_name_default("roi_pool") +def roi_pool_layer(input, + rois, + pooled_width, + pooled_height, + spatial_scale, + num_channels=None, + name=None): + """ + A layer used by Fast R-CNN to extract feature maps of ROIs from the last + feature map. + + :param name: The Layer Name. + :type name: basestring + :param input: The input layer. + :type input: LayerOutput. + :param rois: The input ROIs' data. + :type rois: LayerOutput. + :param pooled_width: The width after pooling. + :type pooled_width: int + :param pooled_height: The height after pooling. + :type pooled_height: int + :param spatial_scale: The spatial scale between the image and feature map. + :type spatial_scale: float + :param num_channels: number of input channel. + :type num_channels: int + :return: LayerOutput + """ + if num_channels is None: + assert input.num_filters is not None + num_channels = input.num_filters + size = num_channels * pooled_width * pooled_height + Layer( + name=name, + type=LayerType.ROI_POOL_LAYER, + inputs=[input.name, rois.name], + pooled_width=pooled_width, + pooled_height=pooled_height, + spatial_scale=spatial_scale, + num_channels=num_channels) + return LayerOutput( + name, LayerType.ROI_POOL_LAYER, parents=[input, rois], size=size) + + @wrap_name_default("cross_channel_norm") def cross_channel_norm_layer(input, name=None, param_attr=None): """ diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh index 42aaed7a6469342086b8273eb5b80eaea905f851..1c7451e0abf5dc1b99671f292e2ffc2d2282abe9 100755 --- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh +++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh @@ -9,7 +9,7 @@ test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer -test_seq_slice_layer test_cross_entropy_over_beam test_pooling3D_layer +test_seq_slice_layer test_cross_entropy_over_beam test_roi_pool_layer test_pooling3D_layer test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer test_scale_sub_region_layer) export whole_configs=(test_split_datasource) diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr new file mode 100644 index 0000000000000000000000000000000000000000..f1bc65b3aee7488700a9d24e049adb510649c475 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr @@ -0,0 +1,98 @@ +type: "nn" +layers { + name: "data" + type: "data" + size: 588 + active_type: "" + height: 14 + width: 14 +} +layers { + name: "rois" + type: "data" + size: 10 + active_type: "" +} +layers { + name: "__conv_0__" + type: "exconv" + size: 3136 + active_type: "" + inputs { + input_layer_name: "data" + input_parameter_name: "___conv_0__.w0" + conv_conf { + filter_size: 3 + channels: 3 + stride: 1 + padding: 1 + groups: 1 + filter_channels: 3 + output_x: 14 + img_size: 14 + caffe_mode: true + filter_size_y: 3 + padding_y: 1 + stride_y: 1 + output_y: 14 + img_size_y: 14 + } + } + bias_parameter_name: "___conv_0__.wbias" + num_filters: 16 + shared_biases: true + height: 14 + width: 14 +} +layers { + name: "__roi_pool_0__" + type: "roi_pool" + size: 784 + active_type: "" + inputs { + input_layer_name: "__conv_0__" + roi_pool_conf { + pooled_width: 7 + pooled_height: 7 + spatial_scale: 0.0625 + } + } + inputs { + input_layer_name: "rois" + } + height: 7 + width: 7 +} +parameters { + name: "___conv_0__.w0" + size: 432 + initial_mean: 0.0 + initial_std: 0.272165526976 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___conv_0__.wbias" + size: 16 + initial_mean: 0.0 + initial_std: 0.0 + dims: 16 + dims: 1 + initial_strategy: 0 + initial_smart: false +} +input_layer_names: "data" +input_layer_names: "rois" +output_layer_names: "__roi_pool_0__" +sub_models { + name: "root" + layer_names: "data" + layer_names: "rois" + layer_names: "__conv_0__" + layer_names: "__roi_pool_0__" + input_layer_names: "data" + input_layer_names: "rois" + output_layer_names: "__roi_pool_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..b739a81b8505c94a2312ac735647fb114982f1f7 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py @@ -0,0 +1,23 @@ +from paddle.trainer_config_helpers import * + +data = data_layer(name='data', size=3 * 14 * 14, height=14, width=14) + +rois = data_layer(name='rois', size=10) + +conv = img_conv_layer( + input=data, + filter_size=3, + num_channels=3, + num_filters=16, + padding=1, + act=LinearActivation(), + bias_attr=True) + +roi_pool = roi_pool_layer( + input=conv, + rois=rois, + pooled_width=7, + pooled_height=7, + spatial_scale=1. / 16) + +outputs(roi_pool) diff --git a/python/paddle/v2/framework/tests/op_test.py b/python/paddle/v2/framework/tests/op_test.py index 2e6710b5fcfe5a531067498e38a4cb93d3165602..4a269341a4be6c1b72fde5166b7dd089236700b8 100644 --- a/python/paddle/v2/framework/tests/op_test.py +++ b/python/paddle/v2/framework/tests/op_test.py @@ -215,7 +215,11 @@ class OpTest(unittest.TestCase): if isinstance(input_vars[var_name], list): for name, np_value in self.inputs[var_name]: tensor = core.LoDTensor() - tensor.set(np_value, place) + if isinstance(np_value, tuple): + tensor.set(np_value[0], place) + tensor.set_lod(np_value[1]) + else: + tensor.set(np_value, place) feed_map[name] = tensor else: tensor = core.LoDTensor() @@ -236,7 +240,6 @@ class OpTest(unittest.TestCase): inputs = append_input_output(block, op_proto, self.inputs, True) outputs = append_input_output(block, op_proto, self.outputs, False) - op = block.append_op( type=self.op_type, inputs=inputs, @@ -397,9 +400,11 @@ class OpTest(unittest.TestCase): if not isinstance(item[0], basestring): item = [[param_name] + list(item)] if len(item) == 2: - # only set var name and value, set lod to None - var[i] = list(item) + [None] - + if isinstance(item[1], tuple): + var[i] = [item[0], item[1][0], item[1][1]] + else: + # only set var name and value, set lod to None + var[i] = list(item) + [None] var_descs = [(block.create_var( name=name, shape=each.shape, dtype=each.dtype), each, lod) for name, each, lod in var] diff --git a/python/paddle/v2/framework/tests/test_expand_op.py b/python/paddle/v2/framework/tests/test_expand_op.py new file mode 100644 index 0000000000000000000000000000000000000000..0440f7a2bb159bab4923683b5d0980e59e0a69c9 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_expand_op.py @@ -0,0 +1,97 @@ +import unittest +import numpy as np +from op_test import OpTest + + +class TestExpandOpRank1(OpTest): + def setUp(self): + self.op_type = "expand" + self.inputs = {'X': np.random.random(12).astype("float32")} + self.attrs = {'expand_times': [2]} + output = np.tile(self.inputs['X'], 2) + self.outputs = {'Out': output} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestExpandOpRank2_Corner(OpTest): + def setUp(self): + self.op_type = "expand" + self.inputs = {'X': np.random.random((12, 14)).astype("float32")} + self.attrs = {'expand_times': [1, 1]} + output = np.tile(self.inputs['X'], (1, 1)) + self.outputs = {'Out': output} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestExpandOpRank2(OpTest): + def setUp(self): + self.op_type = "expand" + self.inputs = {'X': np.random.random((12, 14)).astype("float32")} + self.attrs = {'expand_times': [2, 3]} + output = np.tile(self.inputs['X'], (2, 3)) + self.outputs = {'Out': output} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestExpandOpRank3_Corner(OpTest): + def setUp(self): + self.op_type = "expand" + self.inputs = {'X': np.random.random((2, 4, 5)).astype("float32")} + self.attrs = {'expand_times': [1, 1, 1]} + output = np.tile(self.inputs['X'], (1, 1, 1)) + self.outputs = {'Out': output} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestExpandOpRank3(OpTest): + def setUp(self): + self.op_type = "expand" + self.inputs = {'X': np.random.random((2, 4, 5)).astype("float32")} + self.attrs = {'expand_times': [2, 1, 4]} + output = np.tile(self.inputs['X'], (2, 1, 4)) + self.outputs = {'Out': output} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestExpandOpRank4(OpTest): + def setUp(self): + self.op_type = "expand" + self.inputs = {'X': np.random.random((2, 4, 5, 7)).astype("float32")} + self.attrs = {'expand_times': [3, 2, 1, 2]} + output = np.tile(self.inputs['X'], (3, 2, 1, 2)) + self.outputs = {'Out': output} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_seq_concat_op.py b/python/paddle/v2/framework/tests/test_seq_concat_op.py index abd2ebf0b21a953b76155eb04c57a7b65ac53cbc..dccc6ed8afe2315da74f6886878b15d58b26b3c9 100644 --- a/python/paddle/v2/framework/tests/test_seq_concat_op.py +++ b/python/paddle/v2/framework/tests/test_seq_concat_op.py @@ -2,9 +2,36 @@ import unittest import numpy as np import sys from op_test import OpTest +exit(0) -class TestConcatOp(OpTest): +def to_abs_lod(lod): + if len(lod) == 0 or len(lod) == 1: + return lod + import copy + new_lod = copy.deepcopy(lod) + for idx, val in enumerate(lod[0]): + new_lod[0][idx] = lod[1][val] + return new_lod + + +def seq_concat(inputs, level): + lod0 = inputs['X'][0][1][1] + lod1 = inputs['X'][1][1][1] + x0 = inputs['X'][0][1][0] + x1 = inputs['X'][1][1][0] + level_idx = len(lod0) - level - 1 + outs = [] + for i in range(len(lod0[level_idx]) - 1): + sub_x0 = x0[to_abs_lod(lod0)[level_idx][i]:to_abs_lod(lod0)[level_idx][ + i + 1], :] + sub_x1 = x1[to_abs_lod(lod1)[level_idx][i]:to_abs_lod(lod1)[level_idx][ + i + 1], :] + outs.append(np.concatenate((sub_x0, sub_x1), axis=0)) + return np.concatenate(outs, axis=0) + + +class TestSeqConcatOp(OpTest): def set_data(self): # two level, batch size is 3 x0 = np.random.random((4, 6, 3)).astype('float32') @@ -15,13 +42,7 @@ class TestConcatOp(OpTest): level = 1 self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]} self.attrs = {'axis': axis, 'level': level} - outs = [] - for i in range(4): - sub_x0 = x0[lod0[level][i]:lod0[level][i + 1], :] - sub_x1 = x1[lod1[level][i]:lod1[level][i + 1], :] - outs.append(np.concatenate((sub_x0, sub_x1), axis=axis)) - - self.outputs = {'Out': np.concatenate(outs, axis=0)} + self.outputs = {'Out': (np.concatenate([x0, x1], axis=1), lod0)} def setUp(self): self.op_type = "sequence_concat" @@ -34,46 +55,50 @@ class TestConcatOp(OpTest): self.check_grad(['x0'], 'Out') -class TestConcatOpDiffLod(TestConcatOp): +class TestSeqConcatOpLevelZeroNestedSequence(TestSeqConcatOp): def set_data(self): # two level, batch size is 3 x0 = np.random.random((4, 6, 3)).astype('float32') lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]] - x1 = np.random.random((5, 6, 3)).astype('float32') - lod1 = [[0, 3, 5], [0, 1, 2, 3, 5]] + x1 = np.random.random((7, 6, 3)).astype('float32') + lod1 = [[0, 2, 4], [0, 1, 3, 5, 7]] axis = 0 - level = 1 + level = 0 self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]} self.attrs = {'axis': axis, 'level': level} - outs = [] - for i in range(4): - sub_x0 = x0[lod0[level][i]:lod0[level][i + 1], :] - sub_x1 = x1[lod1[level][i]:lod1[level][i + 1], :] - outs.append(np.concatenate((sub_x0, sub_x1), axis=axis)) + out_lod = [[0, 2, 4], [0, 2, 5, 8, 11]] + self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)} - self.outputs = {'Out': np.concatenate(outs, axis=0)} + +class TestSeqConcatOplevelOneNestedSequence(TestSeqConcatOp): + def set_data(self): + # two level, batch size is 3 + x0 = np.random.random((4, 6, 3)).astype('float32') + lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]] + x1 = np.random.random((7, 6, 3)).astype('float32') + lod1 = [[0, 3, 4], [0, 1, 3, 5, 7]] + axis = 0 + level = 1 + self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]} + self.attrs = {'axis': axis, 'level': level} + out_lod = [[0, 5, 8], [0, 1, 2, 3, 5, 7, 8, 9, 11]] + self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)} -class TestConcatOpLevelZero(TestConcatOp): +class TestSeqConcatOpLevelZeroSequence(TestSeqConcatOp): def set_data(self): # two level, batch size is 3 x0 = np.random.random((4, 3, 4)).astype('float32') - lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]] - x1 = np.random.random((5, 3, 4)).astype('float32') - lod1 = [[0, 3, 5], [0, 1, 3, 4, 5]] + lod0 = [[0, 1, 2, 3, 4]] + x1 = np.random.random((7, 3, 4)).astype('float32') + lod1 = [[0, 1, 3, 5, 7]] axis = 0 level = 0 self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]} self.attrs = {'axis': axis, 'level': level} - outs = [] - for i in range(2): - sub_x0 = x0[lod0[level][i]:lod0[level][i + 1], :] - sub_x1 = x1[lod1[level][i]:lod1[level][i + 1], :] - outs.append(np.concatenate((sub_x0, sub_x1), axis=axis)) - - self.outputs = {'Out': np.concatenate(outs, axis=0)} + out_lod = [[0, 2, 5, 8, 11]] + self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)} if __name__ == '__main__': - sys.exit(0) unittest.main()