diff --git a/CMakeLists.txt b/CMakeLists.txt index c86889c05c8cf0d521dce9adbf3e918ba91729a1..0ec65bac84b0b0d89123473a8941f80c90f1b339 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,7 +53,7 @@ option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF) option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF) # TODO: Only compile PaddlePaddle fluid version by WITH_FLUID option. -option(WITH_FLUID "Compile PaddlePaddle fluid only(TODO)" ON) +option(WITH_FLUID "Compile PaddlePaddle fluid only(TODO)" OFF) option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) option(GLIDE_INSTALL "Download and install go dependencies " ON) option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) diff --git a/Dockerfile b/Dockerfile index 60e76c7f2ede6beaca11659020d5991a75d5b741..fbec88c7966d6ea93495519843d6cda63f622661 100644 --- a/Dockerfile +++ b/Dockerfile @@ -53,10 +53,14 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 # FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter # version util jupyter fixes this issue. + +# specify sphinx version as 1.5.6 and remove -U option for [pip install -U +# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest +# version(1.7.1 for now), which causes building documentation failed. RUN pip install --upgrade pip && \ pip install -U wheel && \ - pip install -U docopt PyYAML sphinx && \ - pip install -U sphinx-rtd-theme==0.1.9 recommonmark + pip install -U docopt PyYAML sphinx==1.5.6 && \ + pip install sphinx-rtd-theme==0.1.9 recommonmark RUN pip install pre-commit 'ipython==5.3.0' && \ pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ diff --git a/cmake/generic.cmake b/cmake/generic.cmake index d0b5eaec2e2a50acf17e5dd1d1aeb0ec3e614fbf..471e3929069d0d28105404b4f0f6baa303faf0e0 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -244,11 +244,11 @@ function(cc_test TARGET_NAME) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${cc_test_SRCS}) # Support linking flags: --whole-archive (Linux) / -force_load (MacOS) - target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog) if("${cc_test_DEPS}" MATCHES "ARCHIVE_START") list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END) endif() - add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog) add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} ${cc_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) @@ -311,8 +311,8 @@ function(nv_test TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) - target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) - add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog) + add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog) add_test(${TARGET_NAME} ${TARGET_NAME}) endif() endfunction(nv_test) diff --git a/doc/design/cpp_data_feeding.md b/doc/design/cpp_data_feeding.md deleted file mode 100644 index 22c2a925eb8c5e1dd8451e1d3cba261ce471ec51..0000000000000000000000000000000000000000 --- a/doc/design/cpp_data_feeding.md +++ /dev/null @@ -1,78 +0,0 @@ -# C++ Data Feeding - -In training with Paddle V2 API, data feeding wholly dependents on Python code. To get rid of the Python environment and achieve the goal of "wrapping the whole training by a while loop op" in Paddle Fluid, a C++ data feeding mechanism is required. - -In this document we show the fundamental design of C++ data feeding process, which includes the data reading, shuffling and batching. - -## Reader - -A new concept named 'Reader' is introduced. `Reader` is a series of inherited classes which can be hold by our `Variable` and they are used to read or process file data. - - -### `ReaderBase` - -`ReaderBase` is the abstract base class of all readers. It defines the all readers' interfaces. - -```cpp -class ReaderBase { - public: - explicit ReaderBase(const std::vector& shapes) : shapes_(shapes) { - PADDLE_ENFORCE(!shapes_.empty()); - } - // Read the next batch of data. (A 'batch' can be only one instance) - // If the next batch doesn't exist, the '*out' will be an empty std::vector. - virtual void ReadNext(std::vector* out) = 0; - - // Reinitialize the reader and read the file from the begin. - virtual void ReInit() = 0; - - // Get a certain read in data's shape. - DDim shape(size_t idx) const; - // Get shapes of all read in data. - std::vector shapes() const { return shapes_; } - // Set shapes of read in data. - void set_shapes(const std::vector& shapes) { shapes_ = shapes; } - - virtual ~ReaderBase() {} - - protected: - std::vector shapes_; -}; -``` - -### `FileReader` and `DecoratedReader` - -These two classes are derived from the `ReaderBase` and will further be derived by respective specific readers. That is to say, in our design, there are two kinds of readers: file readers and decorated readers. A file reader reads from a file of some specific format, and yield only one instance of data at a time. e.g. RecordIO reader, jpg reader, .... A decorated reader takes another reader(both file reader and decorated reader are OK) as its 'underlying reader'. It gets data from its underlying reader, does some process on them(shuffling, or batching), then yields processed data. The output data of a decorated reader can be a single instance or a batch. `ShuffleReader` and `BatchReader` are both decorated readers. - -All the readers share exactly the same interfaces defined in `ReaderBase`. So they can be decorated for more than one time: We can **shuffle** a reader's outputs and then **batch** the shuffle outputs. The interface consistency also allows related ops use readers without knowing what they are exactly. - - -### `ReaderHolder` - -Different readers belong to different class types. It leads to a problem: How can we drop them into `Variable`s and fetch them out by a unified method? For example, if a Variable holds a `BatchReader`, we can not get it by the following code: - -```cpp -var->Get("batch_reader"); -``` - -we have to write: - -```cpp -var->Get("batch_reader"); -``` - -This requires each time getting a reader from a variable we must know the reader's type exactly. It is nearly impossible. - -To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an empty decorator of `ReaderBase`, which erases reader's type. With `ReaderHolder` we are able to fetch all types of readers by `var->Get("...")` and regard the obtained object as a reader. - -## Related Operators - -To create and invoke readers, some now ops are introduced: - -### `CreateReaderOp` - -Each reader has its creating op. File readers' creating ops have no input and yield the created file reader as its output. Decorated readers' creating ops take the underlying readers as inputs and then yield new decorated readers. - -### `ReadOp` - -A reader is only a Variable. It cannot trigger the reading process by itself. So we add the `ReadOp` to execute it. A `ReadOp` takes a reader Variable as its input. Each time it runs, it invokes the reader‘s `ReadNext()` function and gets a new batch of data(or only one instance of data, if we use file reader directly). The output data of a reader are in the form of `std::vector`, so the `ReadOp` also needs to split the vector and move LoDTensors to their respective output Variables. diff --git a/doc/design/images/duplicate_op.graffle b/doc/design/images/duplicate_op.graffle deleted file mode 100644 index 5979f792e252f028a615729215529c2be42d9165..0000000000000000000000000000000000000000 Binary files a/doc/design/images/duplicate_op.graffle and /dev/null differ diff --git a/doc/design/images/duplicate_op.png b/doc/design/images/duplicate_op.png deleted file mode 100644 index f299c5d37f260a1bb0daec886f0a4ee1c1f31c92..0000000000000000000000000000000000000000 Binary files a/doc/design/images/duplicate_op.png and /dev/null differ diff --git a/doc/design/images/duplicate_op2.graffle b/doc/design/images/duplicate_op2.graffle deleted file mode 100644 index 5cec3bc64dbd44dc99e348485969f29bd128ceb1..0000000000000000000000000000000000000000 Binary files a/doc/design/images/duplicate_op2.graffle and /dev/null differ diff --git a/doc/design/images/duplicate_op2.png b/doc/design/images/duplicate_op2.png deleted file mode 100644 index 21cdd5cabf1b5203e1435a75b57770d2f702fa92..0000000000000000000000000000000000000000 Binary files a/doc/design/images/duplicate_op2.png and /dev/null differ diff --git a/doc/design/images/multiple_reader.png b/doc/design/images/multiple_reader.png new file mode 100644 index 0000000000000000000000000000000000000000..b22126b31db4982c13fc3a0827805e6aaf955046 Binary files /dev/null and b/doc/design/images/multiple_reader.png differ diff --git a/doc/design/images/readers.png b/doc/design/images/readers.png new file mode 100644 index 0000000000000000000000000000000000000000..fd59168ce16c9e2a0ef45303c28c997cfd7740be Binary files /dev/null and b/doc/design/images/readers.png differ diff --git a/doc/design/images/replica.png b/doc/design/images/replica.png deleted file mode 100644 index ef59e56b01d792a059279e6bb9a29f3db6a59a41..0000000000000000000000000000000000000000 Binary files a/doc/design/images/replica.png and /dev/null differ diff --git a/doc/design/images/two_phase_commit.png b/doc/design/images/two_phase_commit.png deleted file mode 100644 index ef6f7317bd440cc7d9fe08fcbbf2b7a542f99049..0000000000000000000000000000000000000000 Binary files a/doc/design/images/two_phase_commit.png and /dev/null differ diff --git a/doc/design/images/asgd.gif b/doc/fluid/design/algorithm/images/asgd.gif similarity index 100% rename from doc/design/images/asgd.gif rename to doc/fluid/design/algorithm/images/asgd.gif diff --git a/doc/design/images/theta_star.gif b/doc/fluid/design/algorithm/images/theta_star.gif similarity index 100% rename from doc/design/images/theta_star.gif rename to doc/fluid/design/algorithm/images/theta_star.gif diff --git a/doc/design/parameter_average.md b/doc/fluid/design/algorithm/parameter_average.md similarity index 100% rename from doc/design/parameter_average.md rename to doc/fluid/design/algorithm/parameter_average.md diff --git a/doc/design/build_system/README.md b/doc/fluid/design/concepts/README.md similarity index 100% rename from doc/design/build_system/README.md rename to doc/fluid/design/concepts/README.md diff --git a/doc/design/block.md b/doc/fluid/design/concepts/block.md similarity index 100% rename from doc/design/block.md rename to doc/fluid/design/concepts/block.md diff --git a/doc/fluid/design/concepts/cpp_data_feeding.md b/doc/fluid/design/concepts/cpp_data_feeding.md new file mode 100644 index 0000000000000000000000000000000000000000..8607b40ccbbe01db77afed72c1efa780b520744c --- /dev/null +++ b/doc/fluid/design/concepts/cpp_data_feeding.md @@ -0,0 +1,171 @@ +# C++ Data Feeding + +While using Paddle V2 API for training, data feeding completely depends on the Python code. To get rid of the Python environment and achieve the goal of "wrapping the whole training by a while loop op" in Paddle Fluid, a C++ data feeding mechanism is required. + +In this document, we show the fundamental design of a C++ data feeding process, which includes data reading, shuffling and batching. + +## Overview + +![](images/readers.png) + +## Reader + +In order to handle the above-mentioned problem, a new concept called 'Reader' is introduced. `Reader` is a series of inherited classes which can be held by our `Variable` and they are used to read or process file data. + + +### ReaderBase + +`ReaderBase` is the abstract base class for all readers. It defines the interface for all readers. + +```cpp +class ReaderBase { + public: + // Reads the next batch of data. (A 'batch' can be only one instance) + // If the next batch doesn't exist, it throws an exception + virtual void ReadNext(std::vector* out) = 0; + + // Checks whether the next instance exists. + virtual bool HasNext() = 0; + + // Reinitializes the reader and read the file from the beginning. + virtual void ReInit() = 0; + + virtual ~ReaderBase(); +}; +``` + +### FileReader + +`FileReader` is derived from the `ReaderBase`. It is still an abstract class and will further be derived by Readers of respective specific format. + +```cpp +class FileReader : public ReaderBase { + public: + explicit FileReader(const std::vector& dims); + + void ReadNext(std::vector* out) override; + + protected: + virtual void ReadNextImpl(std::vector* out) = 0; + + private: + std::vector dims_; +}; +``` + +A file reader binds with a single file and reads one data instance at a time. Each type of file reader shall implement its own `ReadNextImpl()`, `HasNext()` and `ReInit()`. + +The `ReadNextImpl()` is invoked by `ReadNext()`. Besides invoking `ReadNextImpl()`, `ReadNext()` is also responsible for checking the output, making sure that each shape of `LoDTensor` in `*out` is consistent with the one in `dims_`. + +### DecoratedReader + +A decorated reader takes another reader(both file reader and decorated reader are OK) as its 'underlying reader'. It gets data from its underlying reader, does some processing on them(shuffling, batching or something else), then yields processed data. The output data of a decorated reader can be a single instance or a batch. `ShuffleReader` and `BatchReader` are both decorated readers. + +```cpp +class DecoratedReader : public ReaderBase { + public: + explicit DecoratedReader(ReaderBase* reader) : ReaderBase(), reader_(reader) { + PADDLE_ENFORCE_NOT_NULL(reader_); + } + + void ReInit() override { reader_->ReInit(); } + + bool HasNext() const override { return reader_->HasNext(); } + + protected: + ReaderBase* reader_; +}; +``` + +Both the `FileReader` and `DecoratedReader` share exactly the same interface as defined in `ReaderBase`. So they can be decorated for multiple times: We can **shuffle** a reader's outputs and then **batch** the shuffled outputs. The interface consistency also allows related ops use readers without knowing their underlying type. + +### MultipleReader + +All `FileReader` binds with a single file and are single-threaded. However, sometimes we need to read data from more than one file. In this case, it's not enough to only have `FileReader` and `DecoratedReader`. + +So `MultipleReader` is introduced. It is also derived from `ReaderBase`. A `MultipleReader` holds several prefetching `FileReaders` and these readers run concurrently. Another pivotal part of a `MultipleReader` is a buffer channel. The channel collects data yield by all prefetching readers and makes subsequent OPs or decorated readers be able to fetch data without concerning about multiple readers scheduling. + +![](images/multiple_reader.png) + +This graph shows how a `MultipleReader` works with three prefetching file readers and two GPUs. There is a queue of files which are going to be read. Each time when a prefetching file reader is free(complete reading from one file), it fetches a new file from the queue. Each prefetching file reader runs in a separated prefetch thread and dumps their outputs to the same channel. + +To the subsequent two decorated readers, the `MultipleReader` is **a single reader**. They don't need to concern about how prefetch readers are scheduled. They only need to invoke `MultipleReader::ReadNext()` to get the next data from the buffer channel. + +### ReaderHolder + +Different readers belong to different class types. This leads to a problem: How can we drop them into `Variable`s and fetch them out by a unified method? For example, if a Variable holds a `BatchReader`, we can not get it by the following code: + +```cpp +var->Get("batch_reader"); +``` + +We would have to write: + +```cpp +var->Get("batch_reader"); +``` + +This requires that in order to get a reader from a variable, every time, we must know the reader's type exactly. This is nearly impossible. + +To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an empty decorator of `ReaderBase`, which hides reader's type. With `ReaderHolder` we are able to fetch all types of readers by `var->Get("...")` and regard the obtained object as a reader. + +## Related Operators + +To create and invoke readers, some new ops are introduced: + +### CreateReaderOp + +Each reader has its creation op. File readers' creation ops have no input and yield the created file reader as its output. Decorated readers' creation ops take the underlying readers as inputs and then yield new decorated readers. + +However, direct usage of file readers' creation ops is not recommended because a file reader can only read one file via a single thread. Using `OpenFilesOp` is a better choice. + +### OpenFilesOp + +The `OpenFilesOp` is the creation op of `MultipleReader`. It takes no input but requires a list of file names as one of its attributes. The newly created `MultipleReader` then creates its own prefetching readers according to given file names. + +To make sure that created prefetching readers match file formats, we need a name prefix rule to append file format tags to file names, as well as a file reader registry mechanism to map file format tags to their corresponding file readers' constructors. + +### HasNextOp + +`HasNextOp` is used to check whether the next data batch exists via the reader's `HasNext()` interface. + +### ResetOp + +`ResetOp` is used to reset a reader via its `ReInit()` interface. + +### ReadOp + +A reader is only a Variable. It cannot trigger the reading process by itself. So we add the `ReadOp` to execute it. A `ReadOp` takes a reader Variable as its input. Each time it runs, it invokes the reader‘s `ReadNext()` function and gets a new batch of data(or only one instance of data, if we use file reader directly). The output data of a reader are in the form of `std::vector`, so the `ReadOp` also needs to split the vector and move LoDTensors to their respective output Variables. + +## Program with Readers + +A `Program` holds readers as its persistable variables. These variables are created by `CreateReaderOp` or `OpenFilesOp`. These ops shall run only once. So they shall be settled in the `startup_program`. `HasNextOp`, `ResetOp` and `ReadOp` are required by training loop, so they shall be in the `main_program`. + +The ops of a `startup_program` with readers would be like this: + +``` +multiple_reader = open_files_op(...) +batch_reader = create_batch_reader_op(multiple_reader) +double_buffer_reader = create_double_buffer_op(batch_reader) +... (other initializers) +``` + +The forwarding ops of the corresponding `main_program` would be like this: + +``` +while_op { + has_next = has_next_op(double_buffer_reader) + if_else_op(has_next) { + batch_data = read_op(double_buffer_reader) + ... (subsequent training ops) + } else { + reset_op(double_buffer_reader) + } +} +``` + +Two important considerations for these programs are as follows: + +1. The multiple\_reader is the batch\_reader's underlying reader, and the batch\_reader is the double\_buffer\_reader's underlying reader. `read_op`, `has_next_op` and other reader related ops will only invoke the top-most reader. In this case, it's the double\_buffer\_reader. + +2. All readers exist in both `startup_program` and `main_program`. And they are persistable. diff --git a/doc/design/executor.md b/doc/fluid/design/concepts/executor.md similarity index 100% rename from doc/design/executor.md rename to doc/fluid/design/concepts/executor.md diff --git a/doc/design/functions_operators_layers.md b/doc/fluid/design/concepts/functions_operators_layers.md similarity index 100% rename from doc/design/functions_operators_layers.md rename to doc/fluid/design/concepts/functions_operators_layers.md diff --git a/paddle/fluid/framework/lod_tensor.md b/doc/fluid/design/concepts/lod_tensor.md similarity index 100% rename from paddle/fluid/framework/lod_tensor.md rename to doc/fluid/design/concepts/lod_tensor.md diff --git a/doc/design/program.md b/doc/fluid/design/concepts/program.md similarity index 100% rename from doc/design/program.md rename to doc/fluid/design/concepts/program.md diff --git a/doc/design/scope.md b/doc/fluid/design/concepts/scope.md similarity index 100% rename from doc/design/scope.md rename to doc/fluid/design/concepts/scope.md diff --git a/paddle/fluid/framework/tensor.md b/doc/fluid/design/concepts/tensor.md similarity index 100% rename from paddle/fluid/framework/tensor.md rename to doc/fluid/design/concepts/tensor.md diff --git a/doc/design/tensor_array.md b/doc/fluid/design/concepts/tensor_array.md similarity index 100% rename from doc/design/tensor_array.md rename to doc/fluid/design/concepts/tensor_array.md diff --git a/doc/design/var_desc.md b/doc/fluid/design/concepts/var_desc.md similarity index 100% rename from doc/design/var_desc.md rename to doc/fluid/design/concepts/var_desc.md diff --git a/paddle/fluid/framework/variable.md b/doc/fluid/design/concepts/variable.md similarity index 100% rename from paddle/fluid/framework/variable.md rename to doc/fluid/design/concepts/variable.md diff --git a/doc/design/concurrent_programming.md b/doc/fluid/design/concurrent/concurrent_programming.md similarity index 100% rename from doc/design/concurrent_programming.md rename to doc/fluid/design/concurrent/concurrent_programming.md diff --git a/doc/design/csp.md b/doc/fluid/design/concurrent/csp.md similarity index 100% rename from doc/design/csp.md rename to doc/fluid/design/concurrent/csp.md diff --git a/doc/design/parallel_do.md b/doc/fluid/design/concurrent/parallel_do.md similarity index 100% rename from doc/design/parallel_do.md rename to doc/fluid/design/concurrent/parallel_do.md diff --git a/doc/design/float16.md b/doc/fluid/design/data_type/float16.md similarity index 100% rename from doc/design/float16.md rename to doc/fluid/design/data_type/float16.md diff --git a/doc/design/dist_refactor/distributed_architecture.md b/doc/fluid/design/dist_train/distributed_architecture.md similarity index 98% rename from doc/design/dist_refactor/distributed_architecture.md rename to doc/fluid/design/dist_train/distributed_architecture.md index 9368c5780dc922953f38bf0f86d9f797a4a8a6fe..a405cb6aaf80b9d2e8a1a9c774ca85cc7e62bbab 100644 --- a/doc/design/dist_refactor/distributed_architecture.md +++ b/doc/fluid/design/dist_train/distributed_architecture.md @@ -1,4 +1,4 @@ -# Design Doc: Distributed Training Architecture +# Design Doc: Fluid Distributed Training Architecture ## Abstract @@ -155,7 +155,7 @@ Cluster environment. `RemoteExecutor.run` sends the `ProgramDesc` and -[TrainingJob](https://github.com/PaddlePaddle/cloud/blob/develop/doc/autoscale/README.md#training-job-resource) +[TrainingJob](https://github.com/PaddlePaddle/cloud/blob/unreleased-tpr/doc/autoscale/README.md#training-job-resource) to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible to start the final Kubernetes Jobs to run the different role of `ProgramDesc` from `ConfigMap`. diff --git a/doc/design/dist_refactor/multi_cpu.md b/doc/fluid/design/dist_train/multi_cpu.md similarity index 100% rename from doc/design/dist_refactor/multi_cpu.md rename to doc/fluid/design/dist_train/multi_cpu.md diff --git a/doc/design/dist_refactor/parameter_server.md b/doc/fluid/design/dist_train/parameter_server.md similarity index 86% rename from doc/design/dist_refactor/parameter_server.md rename to doc/fluid/design/dist_train/parameter_server.md index 805dd13048d41b995d2a01cda52b2ea33e4bbe1d..6ce48dfbfce8b094684b412ebfda7e505ddc30ae 100644 --- a/doc/design/dist_refactor/parameter_server.md +++ b/doc/fluid/design/dist_train/parameter_server.md @@ -59,6 +59,17 @@ After converting: queue. It will block until the queue has the required number of tensors. +### Sparse Update + +For embedding layers, the gradient may have many rows containing only 0 when training, +if the gradient uses a dense tensor to do parameter optimization, +it could spend unnecessary memory, slow down the calculations and waste +the bandwidth while doing distributed training. +In Fluid, we introduce [SelectedRows](../selected_rows.md) to represent a list of rows containing +non-zero gradient data. So when we do parameter optimization both locally and remotely, +we only need to send those non-zero rows to the optimizer operators: + + ### Benefits @@ -91,6 +102,6 @@ After converting: `min_count` attribute), does our current design support it? (similar question for the *Add* OP) +### References -### References: [1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf) diff --git a/doc/design/dist_refactor/src/compiler.graffle b/doc/fluid/design/dist_train/src/compiler.graffle similarity index 100% rename from doc/design/dist_refactor/src/compiler.graffle rename to doc/fluid/design/dist_train/src/compiler.graffle diff --git a/doc/design/dist_refactor/src/compiler.png b/doc/fluid/design/dist_train/src/compiler.png similarity index 100% rename from doc/design/dist_refactor/src/compiler.png rename to doc/fluid/design/dist_train/src/compiler.png diff --git a/doc/design/dist_refactor/src/dist-graph.graffle b/doc/fluid/design/dist_train/src/dist-graph.graffle similarity index 100% rename from doc/design/dist_refactor/src/dist-graph.graffle rename to doc/fluid/design/dist_train/src/dist-graph.graffle diff --git a/doc/design/dist_refactor/src/dist-graph.png b/doc/fluid/design/dist_train/src/dist-graph.png similarity index 100% rename from doc/design/dist_refactor/src/dist-graph.png rename to doc/fluid/design/dist_train/src/dist-graph.png diff --git a/doc/design/dist_refactor/src/distributed_architecture.graffle b/doc/fluid/design/dist_train/src/distributed_architecture.graffle similarity index 100% rename from doc/design/dist_refactor/src/distributed_architecture.graffle rename to doc/fluid/design/dist_train/src/distributed_architecture.graffle diff --git a/doc/design/dist_refactor/src/distributed_architecture.png b/doc/fluid/design/dist_train/src/distributed_architecture.png similarity index 100% rename from doc/design/dist_refactor/src/distributed_architecture.png rename to doc/fluid/design/dist_train/src/distributed_architecture.png diff --git a/doc/design/dist_refactor/src/local-graph.graffle b/doc/fluid/design/dist_train/src/local-graph.graffle similarity index 100% rename from doc/design/dist_refactor/src/local-graph.graffle rename to doc/fluid/design/dist_train/src/local-graph.graffle diff --git a/doc/design/dist_refactor/src/local-graph.png b/doc/fluid/design/dist_train/src/local-graph.png similarity index 100% rename from doc/design/dist_refactor/src/local-graph.png rename to doc/fluid/design/dist_train/src/local-graph.png diff --git a/doc/design/dist_refactor/src/local_architecture.graffle b/doc/fluid/design/dist_train/src/local_architecture.graffle similarity index 100% rename from doc/design/dist_refactor/src/local_architecture.graffle rename to doc/fluid/design/dist_train/src/local_architecture.graffle diff --git a/doc/design/dist_refactor/src/local_architecture.png b/doc/fluid/design/dist_train/src/local_architecture.png similarity index 100% rename from doc/design/dist_refactor/src/local_architecture.png rename to doc/fluid/design/dist_train/src/local_architecture.png diff --git a/doc/design/dist_refactor/src/multi-threads.graffle b/doc/fluid/design/dist_train/src/multi-threads.graffle similarity index 100% rename from doc/design/dist_refactor/src/multi-threads.graffle rename to doc/fluid/design/dist_train/src/multi-threads.graffle diff --git a/doc/design/dist_refactor/src/multi-threads/multi-threads@3x.png b/doc/fluid/design/dist_train/src/multi-threads/multi-threads@3x.png similarity index 100% rename from doc/design/dist_refactor/src/multi-threads/multi-threads@3x.png rename to doc/fluid/design/dist_train/src/multi-threads/multi-threads@3x.png diff --git a/doc/design/dist_refactor/src/multi-threads/single-thread@3x.png b/doc/fluid/design/dist_train/src/multi-threads/single-thread@3x.png similarity index 100% rename from doc/design/dist_refactor/src/multi-threads/single-thread@3x.png rename to doc/fluid/design/dist_train/src/multi-threads/single-thread@3x.png diff --git a/doc/design/dist_refactor/src/paddle-compile.graffle b/doc/fluid/design/dist_train/src/paddle-compile.graffle similarity index 100% rename from doc/design/dist_refactor/src/paddle-compile.graffle rename to doc/fluid/design/dist_train/src/paddle-compile.graffle diff --git a/doc/design/dist_refactor/src/paddle-compile.png b/doc/fluid/design/dist_train/src/paddle-compile.png similarity index 100% rename from doc/design/dist_refactor/src/paddle-compile.png rename to doc/fluid/design/dist_train/src/paddle-compile.png diff --git a/doc/design/dist_refactor/src/remote_executor.graffle b/doc/fluid/design/dist_train/src/remote_executor.graffle similarity index 100% rename from doc/design/dist_refactor/src/remote_executor.graffle rename to doc/fluid/design/dist_train/src/remote_executor.graffle diff --git a/doc/design/dist_refactor/src/remote_executor.png b/doc/fluid/design/dist_train/src/remote_executor.png similarity index 100% rename from doc/design/dist_refactor/src/remote_executor.png rename to doc/fluid/design/dist_train/src/remote_executor.png diff --git a/doc/fluid/design/dist_train/src/sparse_update.graffle b/doc/fluid/design/dist_train/src/sparse_update.graffle new file mode 100644 index 0000000000000000000000000000000000000000..08d689a58f83698d8c1158ee3990ed8abf3a7a9a Binary files /dev/null and b/doc/fluid/design/dist_train/src/sparse_update.graffle differ diff --git a/doc/fluid/design/dist_train/src/sparse_update.png b/doc/fluid/design/dist_train/src/sparse_update.png new file mode 100644 index 0000000000000000000000000000000000000000..8c872e6ac479f7d1b818a4a207956c43155d0ad7 Binary files /dev/null and b/doc/fluid/design/dist_train/src/sparse_update.png differ diff --git a/doc/design/ops/images/2_level_rnn.dot b/doc/fluid/design/dynamic_rnn/2_level_rnn.dot similarity index 100% rename from doc/design/ops/images/2_level_rnn.dot rename to doc/fluid/design/dynamic_rnn/2_level_rnn.dot diff --git a/doc/design/ops/images/2_level_rnn.png b/doc/fluid/design/dynamic_rnn/2_level_rnn.png similarity index 100% rename from doc/design/ops/images/2_level_rnn.png rename to doc/fluid/design/dynamic_rnn/2_level_rnn.png diff --git a/doc/design/ops/images/rnn.dot b/doc/fluid/design/dynamic_rnn/rnn.dot similarity index 100% rename from doc/design/ops/images/rnn.dot rename to doc/fluid/design/dynamic_rnn/rnn.dot diff --git a/doc/design/ops/images/rnn.jpg b/doc/fluid/design/dynamic_rnn/rnn.jpg similarity index 100% rename from doc/design/ops/images/rnn.jpg rename to doc/fluid/design/dynamic_rnn/rnn.jpg diff --git a/doc/design/ops/rnn.md b/doc/fluid/design/dynamic_rnn/rnn.md similarity index 100% rename from doc/design/ops/rnn.md rename to doc/fluid/design/dynamic_rnn/rnn.md diff --git a/doc/design/ops/images/rnn.png b/doc/fluid/design/dynamic_rnn/rnn.png similarity index 100% rename from doc/design/ops/images/rnn.png rename to doc/fluid/design/dynamic_rnn/rnn.png diff --git a/doc/design/ops/images/rnn_2level_data.dot b/doc/fluid/design/dynamic_rnn/rnn_2level_data.dot similarity index 100% rename from doc/design/ops/images/rnn_2level_data.dot rename to doc/fluid/design/dynamic_rnn/rnn_2level_data.dot diff --git a/doc/design/ops/images/rnn_2level_data.png b/doc/fluid/design/dynamic_rnn/rnn_2level_data.png similarity index 100% rename from doc/design/ops/images/rnn_2level_data.png rename to doc/fluid/design/dynamic_rnn/rnn_2level_data.png diff --git a/paddle/fluid/operators/op_documentation/rnn_design.md b/doc/fluid/design/dynamic_rnn/rnn_design.md similarity index 100% rename from paddle/fluid/operators/op_documentation/rnn_design.md rename to doc/fluid/design/dynamic_rnn/rnn_design.md diff --git a/doc/design/if_else_op.md b/doc/fluid/design/execution/if_else_op.md similarity index 100% rename from doc/design/if_else_op.md rename to doc/fluid/design/execution/if_else_op.md diff --git a/doc/design/switch.md b/doc/fluid/design/execution/switch.md similarity index 100% rename from doc/design/switch.md rename to doc/fluid/design/execution/switch.md diff --git a/doc/design/multi_language_interface/00.why_plain_c.md b/doc/fluid/design/interface/00.why_plain_c.md similarity index 100% rename from doc/design/multi_language_interface/00.why_plain_c.md rename to doc/fluid/design/interface/00.why_plain_c.md diff --git a/doc/design/multi_language_interface/01.inference_implementation.md b/doc/fluid/design/interface/01.inference_implementation.md similarity index 100% rename from doc/design/multi_language_interface/01.inference_implementation.md rename to doc/fluid/design/interface/01.inference_implementation.md diff --git a/paddle/fluid/memory/README.md b/doc/fluid/design/memory/README.md similarity index 100% rename from paddle/fluid/memory/README.md rename to doc/fluid/design/memory/README.md diff --git a/doc/design/images/control_flow_graph.png b/doc/fluid/design/memory/images/control_flow_graph.png similarity index 100% rename from doc/design/images/control_flow_graph.png rename to doc/fluid/design/memory/images/control_flow_graph.png diff --git a/doc/design/images/dataflow_equations.png b/doc/fluid/design/memory/images/dataflow_equations.png similarity index 100% rename from doc/design/images/dataflow_equations.png rename to doc/fluid/design/memory/images/dataflow_equations.png diff --git a/doc/design/images/deep_learning.png b/doc/fluid/design/memory/images/deep_learning.png similarity index 100% rename from doc/design/images/deep_learning.png rename to doc/fluid/design/memory/images/deep_learning.png diff --git a/doc/design/memory_optimization.md b/doc/fluid/design/memory/memory_optimization.md similarity index 100% rename from doc/design/memory_optimization.md rename to doc/fluid/design/memory/memory_optimization.md diff --git a/doc/design/backward.md b/doc/fluid/design/modules/backward.md similarity index 100% rename from doc/design/backward.md rename to doc/fluid/design/modules/backward.md diff --git a/paddle/fluid/operators/op_documentation/batch_norm_op.md b/doc/fluid/design/modules/batch_norm_op.md similarity index 100% rename from paddle/fluid/operators/op_documentation/batch_norm_op.md rename to doc/fluid/design/modules/batch_norm_op.md diff --git a/doc/design/evaluator.md b/doc/fluid/design/modules/evaluator.md similarity index 100% rename from doc/design/evaluator.md rename to doc/fluid/design/modules/evaluator.md diff --git a/paddle/fluid/operators/images/batch_norm_fork.dot b/doc/fluid/design/modules/images/batch_norm_fork.dot similarity index 100% rename from paddle/fluid/operators/images/batch_norm_fork.dot rename to doc/fluid/design/modules/images/batch_norm_fork.dot diff --git a/paddle/fluid/operators/images/batch_norm_fork.png b/doc/fluid/design/modules/images/batch_norm_fork.png similarity index 100% rename from paddle/fluid/operators/images/batch_norm_fork.png rename to doc/fluid/design/modules/images/batch_norm_fork.png diff --git a/paddle/fluid/operators/images/batch_norm_op_kernel.png b/doc/fluid/design/modules/images/batch_norm_op_kernel.png similarity index 100% rename from paddle/fluid/operators/images/batch_norm_op_kernel.png rename to doc/fluid/design/modules/images/batch_norm_op_kernel.png diff --git a/doc/design/images/feed_forward.png b/doc/fluid/design/modules/images/feed_forward.png similarity index 100% rename from doc/design/images/feed_forward.png rename to doc/fluid/design/modules/images/feed_forward.png diff --git a/doc/design/images/feed_forward_regularized.png b/doc/fluid/design/modules/images/feed_forward_regularized.png similarity index 100% rename from doc/design/images/feed_forward_regularized.png rename to doc/fluid/design/modules/images/feed_forward_regularized.png diff --git a/doc/design/images/l1_regularization.png b/doc/fluid/design/modules/images/l1_regularization.png similarity index 100% rename from doc/design/images/l1_regularization.png rename to doc/fluid/design/modules/images/l1_regularization.png diff --git a/doc/design/images/l2_regularization.png b/doc/fluid/design/modules/images/l2_regularization.png similarity index 100% rename from doc/design/images/l2_regularization.png rename to doc/fluid/design/modules/images/l2_regularization.png diff --git a/doc/design/images/loss_equation.png b/doc/fluid/design/modules/images/loss_equation.png similarity index 100% rename from doc/design/images/loss_equation.png rename to doc/fluid/design/modules/images/loss_equation.png diff --git a/doc/design/infer_var_type.md b/doc/fluid/design/modules/infer_var_type.md similarity index 100% rename from doc/design/infer_var_type.md rename to doc/fluid/design/modules/infer_var_type.md diff --git a/paddle/fluid/operators/op_documentation/net_op_design.md b/doc/fluid/design/modules/net_op_design.md similarity index 100% rename from paddle/fluid/operators/op_documentation/net_op_design.md rename to doc/fluid/design/modules/net_op_design.md diff --git a/doc/design/optimizer.md b/doc/fluid/design/modules/optimizer.md similarity index 100% rename from doc/design/optimizer.md rename to doc/fluid/design/modules/optimizer.md diff --git a/doc/design/prune.md b/doc/fluid/design/modules/prune.md similarity index 100% rename from doc/design/prune.md rename to doc/fluid/design/modules/prune.md diff --git a/doc/design/python_api.md b/doc/fluid/design/modules/python_api.md similarity index 100% rename from doc/design/python_api.md rename to doc/fluid/design/modules/python_api.md diff --git a/doc/design/register_grad_op.md b/doc/fluid/design/modules/register_grad_op.md similarity index 100% rename from doc/design/register_grad_op.md rename to doc/fluid/design/modules/register_grad_op.md diff --git a/doc/design/regularization.md b/doc/fluid/design/modules/regularization.md similarity index 100% rename from doc/design/regularization.md rename to doc/fluid/design/modules/regularization.md diff --git a/doc/design/selected_rows.md b/doc/fluid/design/modules/selected_rows.md similarity index 100% rename from doc/design/selected_rows.md rename to doc/fluid/design/modules/selected_rows.md diff --git a/doc/design/api.md b/doc/fluid/design/motivation/api.md similarity index 100% rename from doc/design/api.md rename to doc/fluid/design/motivation/api.md diff --git a/doc/design/fluid-compiler.graffle b/doc/fluid/design/motivation/fluid-compiler.graffle similarity index 100% rename from doc/design/fluid-compiler.graffle rename to doc/fluid/design/motivation/fluid-compiler.graffle diff --git a/doc/design/fluid-compiler.png b/doc/fluid/design/motivation/fluid-compiler.png similarity index 100% rename from doc/design/fluid-compiler.png rename to doc/fluid/design/motivation/fluid-compiler.png diff --git a/doc/design/fluid.md b/doc/fluid/design/motivation/fluid.md similarity index 100% rename from doc/design/fluid.md rename to doc/fluid/design/motivation/fluid.md diff --git a/doc/design/fluid_compiler.md b/doc/fluid/design/motivation/fluid_compiler.md similarity index 100% rename from doc/design/fluid_compiler.md rename to doc/fluid/design/motivation/fluid_compiler.md diff --git a/doc/design/refactorization.md b/doc/fluid/design/motivation/refactorization.md similarity index 100% rename from doc/design/refactorization.md rename to doc/fluid/design/motivation/refactorization.md diff --git a/doc/design/kernel_hint_design.md b/doc/fluid/design/muti_devices/kernel_hint_design.md similarity index 100% rename from doc/design/kernel_hint_design.md rename to doc/fluid/design/muti_devices/kernel_hint_design.md diff --git a/doc/design/kernel_selection.md b/doc/fluid/design/muti_devices/kernel_selection.md similarity index 100% rename from doc/design/kernel_selection.md rename to doc/fluid/design/muti_devices/kernel_selection.md diff --git a/doc/design/operator_kernel_type.md b/doc/fluid/design/muti_devices/operator_kernel_type.md similarity index 100% rename from doc/design/operator_kernel_type.md rename to doc/fluid/design/muti_devices/operator_kernel_type.md diff --git a/doc/design/speech/deep_speech_2.md b/doc/fluid/design/network/deep_speech_2.md similarity index 98% rename from doc/design/speech/deep_speech_2.md rename to doc/fluid/design/network/deep_speech_2.md index cfdc4d6df04344c70d3334626bd38eca997c31ff..af0c6ef36feba9e0239e7a5f81a8dc9108b2471a 100644 --- a/doc/design/speech/deep_speech_2.md +++ b/doc/fluid/design/network/deep_speech_2.md @@ -94,7 +94,7 @@ The classical DS2 network contains 15 layers (from bottom to top): - **One** CTC-loss layer
-
+
Figure 1. Archetecture of Deep Speech 2 Network.
@@ -141,7 +141,7 @@ TODO by Assignees ### Beam Search with CTC and LM
-
+
Figure 2. Algorithm for CTC Beam Search Decoder.
diff --git a/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg b/doc/fluid/design/network/images/LOD-and-shape-changes-during-decoding.jpg similarity index 100% rename from doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg rename to doc/fluid/design/network/images/LOD-and-shape-changes-during-decoding.jpg diff --git a/doc/design/speech/image/beam_search.png b/doc/fluid/design/network/images/beam_search.png similarity index 100% rename from doc/design/speech/image/beam_search.png rename to doc/fluid/design/network/images/beam_search.png diff --git a/doc/design/speech/image/ds2_network.png b/doc/fluid/design/network/images/ds2_network.png similarity index 100% rename from doc/design/speech/image/ds2_network.png rename to doc/fluid/design/network/images/ds2_network.png diff --git a/doc/design/ops/sequence_decoder.md b/doc/fluid/design/network/sequence_decoder.md similarity index 100% rename from doc/design/ops/sequence_decoder.md rename to doc/fluid/design/network/sequence_decoder.md diff --git a/doc/design/auto_gradient_check.md b/doc/fluid/design/others/auto_gradient_check.md similarity index 100% rename from doc/design/auto_gradient_check.md rename to doc/fluid/design/others/auto_gradient_check.md diff --git a/doc/design/dcgan.png b/doc/fluid/design/others/dcgan.png similarity index 100% rename from doc/design/dcgan.png rename to doc/fluid/design/others/dcgan.png diff --git a/doc/design/gan_api.md b/doc/fluid/design/others/gan_api.md similarity index 100% rename from doc/design/gan_api.md rename to doc/fluid/design/others/gan_api.md diff --git a/doc/design/graph.md b/doc/fluid/design/others/graph.md similarity index 100% rename from doc/design/graph.md rename to doc/fluid/design/others/graph.md diff --git a/doc/design/graph_survey.md b/doc/fluid/design/others/graph_survey.md similarity index 100% rename from doc/design/graph_survey.md rename to doc/fluid/design/others/graph_survey.md diff --git a/doc/design/images/graph_construction_example.bash b/doc/fluid/design/others/images/graph_construction_example.bash similarity index 100% rename from doc/design/images/graph_construction_example.bash rename to doc/fluid/design/others/images/graph_construction_example.bash diff --git a/doc/design/images/graph_construction_example.dot b/doc/fluid/design/others/images/graph_construction_example.dot similarity index 100% rename from doc/design/images/graph_construction_example.dot rename to doc/fluid/design/others/images/graph_construction_example.dot diff --git a/doc/design/images/graph_construction_example_all.png b/doc/fluid/design/others/images/graph_construction_example_all.png similarity index 100% rename from doc/design/images/graph_construction_example_all.png rename to doc/fluid/design/others/images/graph_construction_example_all.png diff --git a/doc/design/images/graph_construction_example_forward_backward.png b/doc/fluid/design/others/images/graph_construction_example_forward_backward.png similarity index 100% rename from doc/design/images/graph_construction_example_forward_backward.png rename to doc/fluid/design/others/images/graph_construction_example_forward_backward.png diff --git a/doc/design/images/graph_construction_example_forward_only.png b/doc/fluid/design/others/images/graph_construction_example_forward_only.png similarity index 100% rename from doc/design/images/graph_construction_example_forward_only.png rename to doc/fluid/design/others/images/graph_construction_example_forward_only.png diff --git a/doc/design/parameters_in_cpp.md b/doc/fluid/design/others/parameters_in_cpp.md similarity index 100% rename from doc/design/parameters_in_cpp.md rename to doc/fluid/design/others/parameters_in_cpp.md diff --git a/doc/design/simple_op_design.md b/doc/fluid/design/others/simple_op_design.md similarity index 100% rename from doc/design/simple_op_design.md rename to doc/fluid/design/others/simple_op_design.md diff --git a/doc/design/test.dot b/doc/fluid/design/others/test.dot similarity index 100% rename from doc/design/test.dot rename to doc/fluid/design/others/test.dot diff --git a/doc/design/test.dot.png b/doc/fluid/design/others/test.dot.png similarity index 100% rename from doc/design/test.dot.png rename to doc/fluid/design/others/test.dot.png diff --git a/doc/fluid/dev/api_doc_std_cn.md b/doc/fluid/dev/api_doc_std_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..5596b2653ae6ed9917f77dad08f926bcb1fb3419 --- /dev/null +++ b/doc/fluid/dev/api_doc_std_cn.md @@ -0,0 +1,220 @@ +# API注释撰写标准 + +- [API注释模块](#API注释模块) +- [格式及示例](#格式及示例) +- [完整示例](#完整示例) + + +## API注释模块 + +API文档须包含以下几个模块(排列顺序为文档撰写顺序): + +- Python API Definition + + API的代码定义。 + +- Function Description + + API的功能描述。描述该API的含义、作用或对输入所做的操作,及参考文献和对应链接(如果有),必要时给出公式,并解释公式中关键变量的含义。 + +- Args Description + + API参数介绍。按代码定义中的参数顺序逐个介绍,介绍内容包含数据类型、默认值(如果有)、含义等。 + +- Returns + + API返回值介绍。介绍返回值含义,必要时给出对应的形状。若返回值为包含多个参数的tuple,则按顺序逐个介绍各参数。 + +- Raises(如果有) + + 可能抛出的异常或错误及可能的产生原因,当可能抛出多种异常或错误时应分条列出。 + +- Note(如果有) + + 注意事项。当有多条注意事项时,应分条列出。 + +- Examples + + API的使用示例。 + + +## 格式及示例 + +API文档须使用reStructuredText格式撰写,该格式详情请参考[链接](http://sphinx-doc-zh.readthedocs.io/en/latest/rest.html)。API文档各模块的内容格式及示例如下(以下以fc为例进行说明): + +- Python API Definition + + - 格式: + + [Python API Definition] + + - 示例 + + ``` + fc(input, + size, + num_flatten_dims=1, + param_attr=None, + bias_attr=None, + act=None, + name=None, + main_program=None, + startup_program=None) + ``` + +- Function Description + + - 格式 + + 本模块应包含以下内容(排列顺序为文档撰写顺序): + + [Function Description] + + [Formula] + + [Symbols' Descriptions if necessary] + + [References if necessary] + + - 示例 + + [Function Description] + + ``` + **Fully Connected Layer** + + The fully connected layer can take multiple tensors as its inputs. It + creates a variable called weights for each input tensor, which represents + a fully connected weight matrix from each input unit to each output unit. + The fully connected layer multiplies each input tensor with its coresponding + weight to produce an output Tensor. If multiple input tensors are given, + the results of multiple multiplications will be sumed up. If bias_attr is + not None, a bias variable will be created and added to the output. Finally, + if activation is not None, it will be applied to the output as well. + ``` + + [Formula] + + ``` + This process can be formulated as follows: + + .. math:: + + Out = Act({\sum_{i=0}^{N-1}X_iW_i + b}) + ``` + + [Symbols' Descriptions if necessary] + + ``` + In the above equation: + + * :math:`N`: Number of the input. + * :math:`X_i`: The input tensor. + * :math:`W`: The weights created by this layer. + * :math:`b`: The bias parameter created by this layer (if needed). + * :math:`Act`: The activation function. + * :math:`Out`: The output tensor. + ``` + + [References if necessary] + + 因fc没有必要列出的参考文献,故该内容省略。其他情况下需明确给出对应的参考文献和对应连接,以 layer_norm 为例: + + ``` + Refer to `Layer Normalization `_ for more details. + ``` + + +- Args Description + + - 格式 + + \[Arg's Name\][(Data Type, Default Value)][Description] + + - 示例 + + fc的部分参数注释如下: + + ``` + Args: + input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of + the input tensor(s) is at least 2. + param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable + parameters/weights of this layer. + name (str, default None): The name of this layer. + ``` + +- Returns + + - 格式 + + [Name][Shape] + + - 示例 + + ``` + Returns: + A tensor variable storing the transformation result. + ``` + + 当返回值为包含多个参数的tuple时,应按顺序逐个介绍各参数,以dynamic_lstm为例: + + ``` + Returns: + A tuple containing: + The hidden state of LSTM whose shape is (T X D). + The cell state of LSTM whose shape is (T X D). + ``` + +- Raises + + - 格式 + + [Exception Type][Condition] + + - 示例 + + ``` + Raises: + ValueError: If the rank of the input is less than 2. + ``` + +- Note + + - 格式 + + [Note] + + - 示例 + + fc没有注意事项,故该模块省略不写。如有注意事项应明确给出,当有多条注意事项,须分条列出,以scaled\_dot\_product\_attention为例: + + ``` + Note: + 1. When num_heads > 1, three linear projections are learned respectively + to map input queries, keys and values into queries', keys' and values'. + queries', keys' and values' have the same shapes with queries, keys + and values. + 2. When num_heads == 1, scaled_dot_product_attention has no learnable + parameters. + ``` + +- Examples + + - 格式 + + \[Python Code Snipper] + + - 示例 + + ``` + Examples: + .. code-block:: python + + data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") + fc = fluid.layers.fc(input=data, size=1000, act="tanh") + ``` + +## 完整示例 + +fc 的完整注释见[示例](src/fc.py)。 diff --git a/doc/design/ci_build_whl.png b/doc/fluid/dev/ci_build_whl.png similarity index 100% rename from doc/design/ci_build_whl.png rename to doc/fluid/dev/ci_build_whl.png diff --git a/paddle/fluid/operators/op_documentation/name_convention.md b/doc/fluid/dev/name_convention.md similarity index 100% rename from paddle/fluid/operators/op_documentation/name_convention.md rename to doc/fluid/dev/name_convention.md diff --git a/paddle/fluid/operators/op_documentation/op_markdown_format.md b/doc/fluid/dev/op_markdown_format.md similarity index 100% rename from paddle/fluid/operators/op_documentation/op_markdown_format.md rename to doc/fluid/dev/op_markdown_format.md diff --git a/doc/design/releasing_process.md b/doc/fluid/dev/releasing_process.md similarity index 100% rename from doc/design/releasing_process.md rename to doc/fluid/dev/releasing_process.md diff --git a/doc/fluid/dev/src/fc.py b/doc/fluid/dev/src/fc.py new file mode 100644 index 0000000000000000000000000000000000000000..3b074821cc2276a29b2a8639e82199fcf4d72020 --- /dev/null +++ b/doc/fluid/dev/src/fc.py @@ -0,0 +1,81 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def fc(input, + size, + num_flatten_dims=1, + param_attr=None, + bias_attr=None, + act=None, + name=None): + """ + **Fully Connected Layer** + + The fully connected layer can take multiple tensors as its inputs. It + creates a variable called weights for each input tensor, which represents + a fully connected weight matrix from each input unit to each output unit. + The fully connected layer multiplies each input tensor with its coresponding + weight to produce an output Tensor. If multiple input tensors are given, + the results of multiple multiplications will be sumed up. If bias_attr is + not None, a bias variable will be created and added to the output. Finally, + if activation is not None, it will be applied to the output as well. + + This process can be formulated as follows: + + .. math:: + + Out = Act({\sum_{i=0}^{N-1}X_iW_i + b}) + + In the above equation: + + * :math:`N`: Number of the input. + * :math:`X_i`: The input tensor. + * :math:`W`: The weights created by this layer. + * :math:`b`: The bias parameter created by this layer (if needed). + * :math:`Act`: The activation function. + * :math:`Out`: The output tensor. + + Args: + input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of + the input tensor(s) is at least 2. + size(int): The number of output units in this layer. + num_flatten_dims (int, default 1): The fc layer can accept an input tensor with more than + two dimensions. If this happens, the multidimensional tensor will first be flattened + into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input + tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1) + dimensions will be flatten to form the first dimension of the final matrix (height of + the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to + form the second dimension of the final matrix (width of the matrix). For example, suppose + `X` is a 6-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3. + Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. + param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable + parameters/weights of this layer. + bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias + of this layer. If it is set to None, no bias will be added to the output units. + act (str, default None): Activation to be applied to the output of this layer. + name (str, default None): The name of this layer. + + Returns: + A tensor variable storing the transformation result. + + Raises: + ValueError: If rank of the input tensor is less than 2. + + Examples: + .. code-block:: python + + data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") + fc = fluid.layers.fc(input=data, size=1000, act="tanh") + """ diff --git a/doc/design/support_new_device.md b/doc/fluid/dev/support_new_device.md similarity index 100% rename from doc/design/support_new_device.md rename to doc/fluid/dev/support_new_device.md diff --git a/doc/design/reader/README.md b/doc/fluid/getstarted/concepts/reader/README.md similarity index 100% rename from doc/design/reader/README.md rename to doc/fluid/getstarted/concepts/reader/README.md diff --git a/doc/design/model_format.md b/doc/fluid/getstarted/concepts/save_model/model_format.md similarity index 100% rename from doc/design/model_format.md rename to doc/fluid/getstarted/concepts/save_model/model_format.md diff --git a/doc/design/error_clip.md b/doc/fluid/howto/performance/error_clip.md similarity index 100% rename from doc/design/error_clip.md rename to doc/fluid/howto/performance/error_clip.md diff --git a/doc/design/images/profiler.png b/doc/fluid/howto/performance/images/profiler.png similarity index 100% rename from doc/design/images/profiler.png rename to doc/fluid/howto/performance/images/profiler.png diff --git a/doc/design/profiler.md b/doc/fluid/howto/performance/profiler.md similarity index 100% rename from doc/design/profiler.md rename to doc/fluid/howto/performance/profiler.md diff --git a/doc/design/images/multigpu_allreduce.graffle b/doc/fluid/howto/third_party/images/multigpu_allreduce.graffle similarity index 100% rename from doc/design/images/multigpu_allreduce.graffle rename to doc/fluid/howto/third_party/images/multigpu_allreduce.graffle diff --git a/doc/design/images/multigpu_allreduce.png b/doc/fluid/howto/third_party/images/multigpu_allreduce.png similarity index 100% rename from doc/design/images/multigpu_allreduce.png rename to doc/fluid/howto/third_party/images/multigpu_allreduce.png diff --git a/doc/design/images/multigpu_before_convert.graffle b/doc/fluid/howto/third_party/images/multigpu_before_convert.graffle similarity index 100% rename from doc/design/images/multigpu_before_convert.graffle rename to doc/fluid/howto/third_party/images/multigpu_before_convert.graffle diff --git a/doc/design/images/multigpu_before_convert.png b/doc/fluid/howto/third_party/images/multigpu_before_convert.png similarity index 100% rename from doc/design/images/multigpu_before_convert.png rename to doc/fluid/howto/third_party/images/multigpu_before_convert.png diff --git a/doc/design/mkl/mkldnn_fluid.md b/doc/fluid/howto/third_party/mkldnn_fluid.md similarity index 100% rename from doc/design/mkl/mkldnn_fluid.md rename to doc/fluid/howto/third_party/mkldnn_fluid.md diff --git a/doc/design/paddle_nccl.md b/doc/fluid/howto/third_party/paddle_nccl.md similarity index 100% rename from doc/design/paddle_nccl.md rename to doc/fluid/howto/third_party/paddle_nccl.md diff --git a/doc/design/cluster_train/README.md b/doc/v2/design/cluster_train/README.md similarity index 100% rename from doc/design/cluster_train/README.md rename to doc/v2/design/cluster_train/README.md diff --git a/doc/design/cluster_train/checkpointing.md b/doc/v2/design/cluster_train/checkpointing.md similarity index 100% rename from doc/design/cluster_train/checkpointing.md rename to doc/v2/design/cluster_train/checkpointing.md diff --git a/doc/design/cluster_train/data_dispatch.md b/doc/v2/design/cluster_train/data_dispatch.md similarity index 100% rename from doc/design/cluster_train/data_dispatch.md rename to doc/v2/design/cluster_train/data_dispatch.md diff --git a/doc/design/cluster_train/large_model_dist_train.md b/doc/v2/design/cluster_train/large_model_dist_train.md similarity index 100% rename from doc/design/cluster_train/large_model_dist_train.md rename to doc/v2/design/cluster_train/large_model_dist_train.md diff --git a/doc/design/cluster_train/master_server.md b/doc/v2/design/cluster_train/master_server.md similarity index 100% rename from doc/design/cluster_train/master_server.md rename to doc/v2/design/cluster_train/master_server.md diff --git a/doc/design/cluster_train/pserver_client.md b/doc/v2/design/cluster_train/pserver_client.md similarity index 100% rename from doc/design/cluster_train/pserver_client.md rename to doc/v2/design/cluster_train/pserver_client.md diff --git a/doc/design/cluster_train/remote_parameter_updater.md b/doc/v2/design/cluster_train/remote_parameter_updater.md similarity index 100% rename from doc/design/cluster_train/remote_parameter_updater.md rename to doc/v2/design/cluster_train/remote_parameter_updater.md diff --git a/doc/design/cluster_train/save_model.md b/doc/v2/design/cluster_train/save_model.md similarity index 100% rename from doc/design/cluster_train/save_model.md rename to doc/v2/design/cluster_train/save_model.md diff --git a/doc/design/cluster_train/src/checkpointing.png b/doc/v2/design/cluster_train/src/checkpointing.png similarity index 100% rename from doc/design/cluster_train/src/checkpointing.png rename to doc/v2/design/cluster_train/src/checkpointing.png diff --git a/doc/design/cluster_train/src/data_dispatch.png b/doc/v2/design/cluster_train/src/data_dispatch.png similarity index 100% rename from doc/design/cluster_train/src/data_dispatch.png rename to doc/v2/design/cluster_train/src/data_dispatch.png diff --git a/doc/design/cluster_train/src/dataset.graffle b/doc/v2/design/cluster_train/src/dataset.graffle similarity index 100% rename from doc/design/cluster_train/src/dataset.graffle rename to doc/v2/design/cluster_train/src/dataset.graffle diff --git a/doc/design/cluster_train/src/dataset.png b/doc/v2/design/cluster_train/src/dataset.png similarity index 100% rename from doc/design/cluster_train/src/dataset.png rename to doc/v2/design/cluster_train/src/dataset.png diff --git a/doc/design/cluster_train/src/file_storage.graffle b/doc/v2/design/cluster_train/src/file_storage.graffle similarity index 100% rename from doc/design/cluster_train/src/file_storage.graffle rename to doc/v2/design/cluster_train/src/file_storage.graffle diff --git a/doc/design/cluster_train/src/file_storage.png b/doc/v2/design/cluster_train/src/file_storage.png similarity index 100% rename from doc/design/cluster_train/src/file_storage.png rename to doc/v2/design/cluster_train/src/file_storage.png diff --git a/doc/design/cluster_train/src/init_lock.graffle b/doc/v2/design/cluster_train/src/init_lock.graffle similarity index 100% rename from doc/design/cluster_train/src/init_lock.graffle rename to doc/v2/design/cluster_train/src/init_lock.graffle diff --git a/doc/design/cluster_train/src/init_lock.png b/doc/v2/design/cluster_train/src/init_lock.png similarity index 100% rename from doc/design/cluster_train/src/init_lock.png rename to doc/v2/design/cluster_train/src/init_lock.png diff --git a/doc/design/cluster_train/src/paddle-cloud-in-data-center.png b/doc/v2/design/cluster_train/src/paddle-cloud-in-data-center.png similarity index 100% rename from doc/design/cluster_train/src/paddle-cloud-in-data-center.png rename to doc/v2/design/cluster_train/src/paddle-cloud-in-data-center.png diff --git a/doc/design/cluster_train/src/paddle-etcd.graffle b/doc/v2/design/cluster_train/src/paddle-etcd.graffle similarity index 100% rename from doc/design/cluster_train/src/paddle-etcd.graffle rename to doc/v2/design/cluster_train/src/paddle-etcd.graffle diff --git a/doc/design/cluster_train/src/paddle-etcd.png b/doc/v2/design/cluster_train/src/paddle-etcd.png similarity index 100% rename from doc/design/cluster_train/src/paddle-etcd.png rename to doc/v2/design/cluster_train/src/paddle-etcd.png diff --git a/doc/design/cluster_train/src/paddle-model-sharding.graffle b/doc/v2/design/cluster_train/src/paddle-model-sharding.graffle similarity index 100% rename from doc/design/cluster_train/src/paddle-model-sharding.graffle rename to doc/v2/design/cluster_train/src/paddle-model-sharding.graffle diff --git a/doc/design/cluster_train/src/paddle-model-sharding.png b/doc/v2/design/cluster_train/src/paddle-model-sharding.png similarity index 100% rename from doc/design/cluster_train/src/paddle-model-sharding.png rename to doc/v2/design/cluster_train/src/paddle-model-sharding.png diff --git a/doc/design/cluster_train/src/paddle-ps-0.png b/doc/v2/design/cluster_train/src/paddle-ps-0.png similarity index 100% rename from doc/design/cluster_train/src/paddle-ps-0.png rename to doc/v2/design/cluster_train/src/paddle-ps-0.png diff --git a/doc/design/cluster_train/src/paddle-ps-1.png b/doc/v2/design/cluster_train/src/paddle-ps-1.png similarity index 100% rename from doc/design/cluster_train/src/paddle-ps-1.png rename to doc/v2/design/cluster_train/src/paddle-ps-1.png diff --git a/doc/design/cluster_train/src/paddle-ps.graffle b/doc/v2/design/cluster_train/src/paddle-ps.graffle similarity index 100% rename from doc/design/cluster_train/src/paddle-ps.graffle rename to doc/v2/design/cluster_train/src/paddle-ps.graffle diff --git a/doc/design/cluster_train/src/paddle-task-queues.graffle b/doc/v2/design/cluster_train/src/paddle-task-queues.graffle similarity index 100% rename from doc/design/cluster_train/src/paddle-task-queues.graffle rename to doc/v2/design/cluster_train/src/paddle-task-queues.graffle diff --git a/doc/design/cluster_train/src/paddle-task-queues.png b/doc/v2/design/cluster_train/src/paddle-task-queues.png similarity index 100% rename from doc/design/cluster_train/src/paddle-task-queues.png rename to doc/v2/design/cluster_train/src/paddle-task-queues.png diff --git a/doc/design/cluster_train/src/paddle-task-states.graffle b/doc/v2/design/cluster_train/src/paddle-task-states.graffle similarity index 100% rename from doc/design/cluster_train/src/paddle-task-states.graffle rename to doc/v2/design/cluster_train/src/paddle-task-states.graffle diff --git a/doc/design/cluster_train/src/paddle-task-states.png b/doc/v2/design/cluster_train/src/paddle-task-states.png similarity index 100% rename from doc/design/cluster_train/src/paddle-task-states.png rename to doc/v2/design/cluster_train/src/paddle-task-states.png diff --git a/doc/design/cluster_train/src/pserver_init.graffle b/doc/v2/design/cluster_train/src/pserver_init.graffle similarity index 100% rename from doc/design/cluster_train/src/pserver_init.graffle rename to doc/v2/design/cluster_train/src/pserver_init.graffle diff --git a/doc/design/cluster_train/src/pserver_init.png b/doc/v2/design/cluster_train/src/pserver_init.png similarity index 100% rename from doc/design/cluster_train/src/pserver_init.png rename to doc/v2/design/cluster_train/src/pserver_init.png diff --git a/doc/design/cluster_train/src/submit-job.graffle b/doc/v2/design/cluster_train/src/submit-job.graffle similarity index 100% rename from doc/design/cluster_train/src/submit-job.graffle rename to doc/v2/design/cluster_train/src/submit-job.graffle diff --git a/doc/design/cluster_train/src/submit-job.png b/doc/v2/design/cluster_train/src/submit-job.png similarity index 100% rename from doc/design/cluster_train/src/submit-job.png rename to doc/v2/design/cluster_train/src/submit-job.png diff --git a/doc/design/cluster_train/src/trainer.graffle b/doc/v2/design/cluster_train/src/trainer.graffle similarity index 100% rename from doc/design/cluster_train/src/trainer.graffle rename to doc/v2/design/cluster_train/src/trainer.graffle diff --git a/doc/design/cluster_train/src/trainer.png b/doc/v2/design/cluster_train/src/trainer.png similarity index 100% rename from doc/design/cluster_train/src/trainer.png rename to doc/v2/design/cluster_train/src/trainer.png diff --git a/doc/design/cluster_train/submit-job.md b/doc/v2/design/cluster_train/submit-job.md similarity index 100% rename from doc/design/cluster_train/submit-job.md rename to doc/v2/design/cluster_train/submit-job.md diff --git a/doc/design/mkl/image/engine.png b/doc/v2/design/mkl/image/engine.png similarity index 100% rename from doc/design/mkl/image/engine.png rename to doc/v2/design/mkl/image/engine.png diff --git a/doc/design/mkl/image/gradients.png b/doc/v2/design/mkl/image/gradients.png similarity index 100% rename from doc/design/mkl/image/gradients.png rename to doc/v2/design/mkl/image/gradients.png diff --git a/doc/design/mkl/image/layers.png b/doc/v2/design/mkl/image/layers.png similarity index 100% rename from doc/design/mkl/image/layers.png rename to doc/v2/design/mkl/image/layers.png diff --git a/doc/design/mkl/image/matrix.png b/doc/v2/design/mkl/image/matrix.png similarity index 100% rename from doc/design/mkl/image/matrix.png rename to doc/v2/design/mkl/image/matrix.png diff --git a/doc/design/mkl/image/overview.png b/doc/v2/design/mkl/image/overview.png similarity index 100% rename from doc/design/mkl/image/overview.png rename to doc/v2/design/mkl/image/overview.png diff --git a/doc/design/mkl/mkl_packed.md b/doc/v2/design/mkl/mkl_packed.md similarity index 100% rename from doc/design/mkl/mkl_packed.md rename to doc/v2/design/mkl/mkl_packed.md diff --git a/doc/design/mkl/mkldnn.md b/doc/v2/design/mkl/mkldnn.md similarity index 100% rename from doc/design/mkl/mkldnn.md rename to doc/v2/design/mkl/mkldnn.md diff --git a/doc/v2/dev/index_cn.rst b/doc/v2/dev/index_cn.rst index c488191b8174531905e44cb9443ee539d4cb1ed3..aee3c68de05de26df3cd79170fa7f4ecad4bf386 100644 --- a/doc/v2/dev/index_cn.rst +++ b/doc/v2/dev/index_cn.rst @@ -1,9 +1,24 @@ 开发标准 ======== +PaddlePaddle遵守如下三个部分的代码和文档规范。 + +PaddlePaddle使用git做版本管理,docker作为构建和测试环境。代码中包含了Cuda, C++, Python, Shell等多种编程语言。语言规范遵守Google C++ Style, Pep-8, 代码库中包含自动化检查工具做风格检查。代码注释需要遵守Doxygen规范,不满足风格要求的代码会编译失败。关于如何使用git, 构建测试及代码开发, 我们提供了如下指南。 .. toctree:: :maxdepth: 1 contribute_to_paddle_cn.md + +PaddlePaddle面向国内外用户,包含了中文和英文两部分的文档。设计文档和issue问题描述都推荐使用英文。对于设计文档,重在问题描述,背景阐述,然后才是解决方案。文档由Sphinx生成,因此代码注释也需要符合Sphinx文档标准。推荐本地使用paddlepaddle.org工具编译生成和预览文档,请参阅如下文档。 + +.. toctree:: + :maxdepth: 1 + write_docs_cn.rst + +PaddlePaddle V2 使用新增Layer方式定义新的操作。组合基础API可以实现多种复杂Layer, 满足绝大多数应用。如需要定制Layer,请参阅如下文档,欢迎提交patch。 + +.. toctree:: + :maxdepth: 1 + new_layer_cn.rst diff --git a/doc/v2/dev/new_layer_cn.rst b/doc/v2/dev/new_layer_cn.rst index 0ded1c262adad44f4df000ef2933c7b68050f2fc..3115654b2bd87995fa63bb7828fd1b3039aea8cc 100644 --- a/doc/v2/dev/new_layer_cn.rst +++ b/doc/v2/dev/new_layer_cn.rst @@ -16,7 +16,7 @@ 下图是一个全连接层的示意图。在全连接层中,每个输出节点都连接到所有的输入节点上。 -.. image:: FullyConnected.jpg +.. image:: src/FullyConnected.jpg :align: center :scale: 60 % diff --git a/doc/v2/dev/new_layer_en.rst b/doc/v2/dev/new_layer_en.rst index 110a9fb38f890a766bb4480e91feb22d3b0838a5..b05bb45f11eb253dfb87d6283c29ec6689394d22 100644 --- a/doc/v2/dev/new_layer_en.rst +++ b/doc/v2/dev/new_layer_en.rst @@ -16,7 +16,7 @@ First we need to derive equations of the *forward* and *backward* part of the la The illustration of a fully connected layer is shown in the following figure. In a fully connected layer, all output nodes are connected to all the input nodes. -.. image:: FullyConnected.jpg +.. image:: src/FullyConnected.jpg :align: center :scale: 60 % diff --git a/doc/v2/dev/FullyConnected.jpg b/doc/v2/dev/src/FullyConnected.jpg similarity index 100% rename from doc/v2/dev/FullyConnected.jpg rename to doc/v2/dev/src/FullyConnected.jpg diff --git a/doc/v2/dev/src/doc_en.png b/doc/v2/dev/src/doc_en.png new file mode 100644 index 0000000000000000000000000000000000000000..ed6b9178fba91a3bdf45ae797a9924f84146fbc8 Binary files /dev/null and b/doc/v2/dev/src/doc_en.png differ diff --git a/doc/v2/dev/write_docs_cn.rst b/doc/v2/dev/write_docs_cn.rst index f79769b810b91c6984016d95f40b89186bfb61b0..a055bb04c0c093c9159290067e5ccbd2525cd519 100644 --- a/doc/v2/dev/write_docs_cn.rst +++ b/doc/v2/dev/write_docs_cn.rst @@ -2,20 +2,19 @@ 如何贡献文档 ############# -PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成,生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。 -也可以利用PaddlePaddle 工具来编译文档,这个情况下所有的文件会存在整理过的的文件目录 .ppo_workspace/content 下 +PaddlePaddle的文档包括中英文两个部分。文档都是通过 ``cmake`` 驱动 ``sphinx`` 编译生成,也可以利用paddlepaddle.org工具来编译和预览文档。 如何构建文档 ============ -PaddlePaddle的文档构建有三种方式。 +PaddlePaddle的文档构建有两种方式,分别为使用paddlepaddle.org工具和不使用paddlepaddle.org工具,两种方式都有各自的优点,前者方便预览,后者方便开发者进行调试。这两种方式中又分别有使用docker和不使用docker的两种构建方法。 使用PaddlePaddle.org工具 --------------- -这个是目前推荐的使用方法。除了可以自动编译文档,也可以直接在网页预览文档。 +------------------------ +这个是目前推荐的使用方法。除了可以自动编译文档,还可以直接在网页中预览文档,需要注意的是,采用后续说明的其它方式虽然也可以预览文档,但是文档的样式与官网文档是不一致的,使用PaddlePaddle.org工具进行编译才能产生与官网文档样式一致的预览效果。 -文件工具是使用Docker,需要在系统里先安装好Docker工具包。Docker安装请参考Docker的官网。安装好Docker之后及可用以下命令启动工具 +PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 `_ 。安装好Docker之后即可用以下命令启动工具 .. code-block:: bash @@ -35,7 +34,7 @@ PaddlePaddle的文档构建有三种方式。 之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档 编译后的文件将被存储在工作目录 /.ppo_workspace/content。 -如果不想使用 Docker,你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。 +如果不想使用Docker,你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。 .. code-block:: bash @@ -62,37 +61,46 @@ PaddlePaddle的文档构建有三种方式。 想了解更多PaddlePaddle.org工具的详细信息,可以 `点击这里 `_ 。 -使用Docker构建 --------------- +不使用PaddlePaddle.org工具 +-------------------------- 使用Docker构建PaddlePaddle的文档,需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 `_ 。安装好Docker之后可以使用源码目录下的脚本构建文档,即 -.. code-block:: bash +[TBD] - cd TO_YOUR_PADDLE_CLONE_PATH - cd paddle/scripts/tools/build_docs - sh build_docs.sh +如果不想使用Docker,也可以使用以下命令直接构建PaddlePaddle文档,即 -编译完成之后,会在当前目录生成两个子目录\: doc(英文文档目录)和 doc_cn(中文文档目录)。 -打开浏览器访问对应目录下的index.html即可访问本地文档。 +.. code-block:: bash -直接构建 --------- + mkdir paddle + cd paddle + git clone https://github.com/PaddlePaddle/Paddle.git + mkdir -p build + cd build + cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON -如果提示正确,可以执行以下命令编译生成文档,即 + # 如果只需要构建使用文档,则执行以下命令 + make -j $processors gen_proto_py + make -j $processors paddle_docs paddle_docs_cn -.. code-block:: bash + # 如果只需要构建API,则执行以下命令 + make -j $processors gen_proto_py framework_py_proto + make -j $processors copy_paddle_pybind + make -j $processors paddle_api_docs + +其中$processors代表启动和CPU核一样多的进程来并行编译,可以根据本机的CPU核数设置相应的值。 + +编译完成后,进入 ``doc/v2`` 目录,如果选择构建文档则会在该目录下生成 ``cn/html/`` 、 ``en/html`` 两个子目录,选择构建API则会生成 ``api/en/html`` 目录,分别进入这些目录下,执行以下命令: + +.. code-block:: bash - cd TO_YOUR_PADDLE_CLONE_PATH - mkdir -p build - cd build - cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON - make gen_proto_py - make paddle_docs paddle_docs_cn + python -m SimpleHTTPServer 8088 -编译完成之后,会在当前目录生成两个子目录\: doc(英文文档目录)和 doc_cn(中文文档目录)。 -打开浏览器访问对应目录下的index.html即可访问本地文档。 +在浏览器中输入http://localhost:8088就可以看到编译生成的中/英文的文档页面和英文的API页面,下图为生成的英文文档首页示例。注意,示例中由于使用了sphinx的原始主题,所以页面的风格与官网并不一致,但这并不影响开发者进行调试。 +.. image:: src/doc_en.png + :align: center + :scale: 60 % 如何书写文档 ============ @@ -102,7 +110,7 @@ PaddlePaddle文档使用 `sphinx`_ 自动生成,用户可以参考sphinx教程 如何更新www.paddlepaddle.org ============================ -更新的文档以PR的形式提交到github中,提交方式参见 `贡献文档 `_ 。 +更新的文档以PR的形式提交到github中,提交方式参见 `如何贡献文档 `_ 。 目前PaddlePaddle的develop分支的文档是自动触发更新的,用户可以分别查看最新的 `中文文档 `_ 和 `英文文档 `_ 。 diff --git a/doc/v2/howto/cluster/cmd_argument_cn.md b/doc/v2/howto/cluster/cmd_argument_cn.md index 40e1dde4858b802c2e703bcca4b71730facde5ef..c0ba093cbf2eac5c3b60a0b071b31776a11998f3 100644 --- a/doc/v2/howto/cluster/cmd_argument_cn.md +++ b/doc/v2/howto/cluster/cmd_argument_cn.md @@ -71,6 +71,13 @@ paddle.init( - trainer_id:**必选,默认0**,每个trainer的唯一ID,从0开始的整数 - pservers:**必选,默认127.0.0.1**,当前训练任务启动的pserver的IP列表,多个IP使用“,”隔开 +```python +trainer = paddle.trainer.SGD(..., is_local=False) +``` + +参数说明 + +- is_local: **必选, 默认True**, 是否使用PServer更新参数 ## 准备数据集 diff --git a/doc/v2/howto/cluster/cmd_argument_en.md b/doc/v2/howto/cluster/cmd_argument_en.md index 40179c28f83800c1c74a6045f8fac6841bdafeaa..df1381a00fa0fa129eecffe002164c489a4183aa 100644 --- a/doc/v2/howto/cluster/cmd_argument_en.md +++ b/doc/v2/howto/cluster/cmd_argument_en.md @@ -73,6 +73,14 @@ Parameter Description - trainer_id: **required, default 0**, ID for every trainer, start from 0. - pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",". +```python +trainer = paddle.trainer.SGD(..., is_local=False) +``` + +Parameter Description + +- is_local: **required, default True**, whether update parameters by PServer. + ## Prepare Training Dataset Here's some example code [prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py), it will download public `imikolov` dataset and split it into multiple files according to job parallelism(trainers count). Modify `SPLIT_COUNT` at the begining of `prepare.py` to change the count of output files. diff --git a/doc/v2/howto/cluster/index_en.rst b/doc/v2/howto/cluster/index_en.rst index 2640a09dcc904619bc97c9bd3f3d81a9dc307663..c965d30d54e71339cf10d4b05f25e740c81adbf9 100644 --- a/doc/v2/howto/cluster/index_en.rst +++ b/doc/v2/howto/cluster/index_en.rst @@ -1,8 +1,7 @@ Distributed Training ==================== -In this section, we'll explain how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed trainning job: - +The effectiveness of the deep learning model is often directly related to the scale of the data: it can generally achieve better results after increasing the size of the dataset on the same model. However, it can not fit in one single computer when the amount of data increases to a certain extent. At this point, using multiple computers for distributed training is a natural solution. In distributed training, the training data is divided into multiple copies (sharding), and multiple machines participating in the training read their own data for training and collaboratively update the parameters of the overall model. .. image:: src/ps_en.png :width: 500 @@ -10,13 +9,27 @@ In this section, we'll explain how to run distributed training jobs with PaddleP - Trainer: each trainer reads the data shard, and train the neural network. Then the trainer will upload calculated "gradients" to parameter servers, and wait for parameters to be optimized on the parameter server side. When that finishes, the trainer download optimized parameters and continues its training. - Parameter server: every parameter server stores part of the whole neural network model data. They will do optimization calculations when gradients are uploaded from trainers, and then send updated parameters to trainers. -PaddlePaddle can support both synchronize stochastic gradient descent (SGD) and asynchronous SGD. +The training of synchronous random gradient descent for neural network can be achieved by cooperation of trainers and parameter servers. + +PaddlePaddle supports both synchronize stochastic gradient descent (SGD) and asynchronous SGD. -When training with synchronize SGD, PaddlePaddle uses an internal "synchronize barrier" which makes gradients update and parameter download in strict order. On the other hand, asynchronous SGD won't wait for all trainers to finish upload at a single step, this will increase the parallelism of distributed training: parameter servers do not depend on each other, they'll do parameter optimization concurrently. Parameter servers will not wait for trainers, so trainers will also do their work concurrently. But asynchronous SGD will introduce more randomness and noises in the gradient. +Before starting the cluster training, you need to prepare the cluster configuration, PaddlePaddle installation, and other preparations. To understand how to configure the basic environment for distributed training, check the link below: .. toctree:: :maxdepth: 1 preparations_en.md + +Cluster training has a large number of configurable parameters, such as the number of machines used, communication ports, etc. To learn how to configure the distributed training process by setting startup these parameters, check the link below: + +.. toctree:: + :maxdepth: 1 + cmd_argument_en.md + +PaddlePaddle is compatible with a variety of different clusters. Each cluster has its own advantages, To learn how to run PaddlePaddle in different types of them, check the link below: + +.. toctree:: + :maxdepth: 1 + multi_cluster/index_en.rst diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index a7b249d43bf3ad9924749d5e66618750f19d8bf7..d2a4b1335464f553a361728e64ed5ca177ca53da 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -1,27 +1,29 @@ -add_subdirectory(cuda) -add_subdirectory(function) -add_subdirectory(utils) -add_subdirectory(math) -add_subdirectory(gserver) -add_subdirectory(parameter) -add_subdirectory(testing) - -if(MOBILE_INFERENCE) - add_subdirectory(capi) -else() - add_subdirectory(pserver) - add_subdirectory(trainer) - add_subdirectory(scripts) +if(NOT WITH_FLUID) + add_subdirectory(cuda) + add_subdirectory(function) + add_subdirectory(utils) + add_subdirectory(math) + add_subdirectory(gserver) + add_subdirectory(parameter) - if(WITH_C_API) + if(MOBILE_INFERENCE) add_subdirectory(capi) - endif() + else() + add_subdirectory(pserver) + add_subdirectory(trainer) + add_subdirectory(scripts) - if(NOT ANDROID AND NOT IOS) - add_subdirectory(fluid) - endif() + if(WITH_C_API) + add_subdirectory(capi) + endif() - if(WITH_SWIG_PY) - add_subdirectory(api) + if(WITH_SWIG_PY) + add_subdirectory(api) + endif() endif() endif() + +add_subdirectory(testing) +if(NOT MOBILE_INFERENCE AND NOT ANDROID AND NOT IOS) + add_subdirectory(fluid) +endif() diff --git a/paddle/capi/CMakeLists.txt b/paddle/capi/CMakeLists.txt index ebb083c5a477d5be91ef14be74dd9de349d07931..e06e9a2b363d1ffc6876b98bcb7304b0a54dbcaa 100644 --- a/paddle/capi/CMakeLists.txt +++ b/paddle/capi/CMakeLists.txt @@ -36,7 +36,7 @@ target_include_directories(paddle_capi PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER} ${CAPI_PRIVATE_HEADER}) -add_dependencies(paddle_capi paddle_proto) +add_dependencies(paddle_capi paddle_proto paddle_gserver) # TODO: paddle_capi_whole will be removed. set(PADDLE_CAPI_LAYERS_LIBS diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 48713f2c2ac62a37b7b7a4602f7f6a325aecb0b8..a4ea74a6d2fbc29dc33a6b57ee453f49ed36c7fa 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -21,7 +21,7 @@ endif() cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place paddle_memory device_context init) -cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto) +cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init) @@ -103,4 +103,5 @@ cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) cc_test(channel_test SRCS channel_test.cc) cc_test(tuple_test SRCS tuple_test.cc ) cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op - channel_send_op channel_recv_op sum_op elementwise_add_op executor proto_desc) + channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op + conditional_block_op while_op assign_op print_op executor proto_desc) diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc index d72b64700f7cf680501fd3e355d20e694f1f097d..3693bc25d81a8309df1a6ddf3d9b08d484596ea9 100644 --- a/paddle/fluid/framework/block_desc.cc +++ b/paddle/fluid/framework/block_desc.cc @@ -135,6 +135,14 @@ OpDesc *BlockDesc::PrependOp() { return ops_.front().get(); } +OpDesc *BlockDesc::InsertOp(size_t index) { + need_update_ = true; + auto it = ops_.begin() + index; + std::unique_ptr new_op(new OpDesc(this)); + it = ops_.insert(it, std::move(new_op)); + return (*it).get(); +} + void BlockDesc::RemoveOp(size_t s, size_t e) { if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) { return; diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h index 3bd90f38907c0a45ae0c9bb00706e5c127f08417..185f018ac1b5863e0ee86fdaa17df1ccbc6e030e 100644 --- a/paddle/fluid/framework/block_desc.h +++ b/paddle/fluid/framework/block_desc.h @@ -87,6 +87,8 @@ class BlockDesc { OpDesc *PrependOp(); + OpDesc *InsertOp(size_t index); + void RemoveOp(size_t s, size_t e); std::vector AllOps() const; diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h index 9f8fb12098d622058a86f83c1c42a1feb1cfb2e2..adfaba26ace78f547161ad4029a741f3ca8a6764 100644 --- a/paddle/fluid/framework/channel.h +++ b/paddle/fluid/framework/channel.h @@ -15,23 +15,43 @@ limitations under the License. */ #pragma once #include // for size_t +#include #include #include "paddle/fluid/platform/enforce.h" namespace paddle { namespace framework { +enum class ChannelAction { + SEND = 0, + RECEIVE = 1, + CLOSE = 2, +}; + // Channel is the abstract class of buffered and un-buffered channels. template class Channel { public: + virtual bool CanSend() = 0; + virtual bool CanReceive() = 0; virtual bool Send(T*) = 0; virtual bool Receive(T*) = 0; virtual size_t Cap() = 0; virtual void Lock() = 0; + virtual void Unlock() = 0; + virtual bool IsClosed() = 0; virtual void Close() = 0; virtual ~Channel() {} + + virtual void AddToSendQ(const void* referrer, T* data, + std::shared_ptr cond, + std::function cb) = 0; + virtual void AddToReceiveQ(const void* referrer, T* data, + std::shared_ptr cond, + std::function cb) = 0; + virtual void RemoveFromSendQ(const void* referrer) = 0; + virtual void RemoveFromReceiveQ(const void* referrer) = 0; }; // Forward declaration of channel implementations. @@ -80,6 +100,27 @@ class ChannelHolder { return channel != nullptr ? channel->Receive(data) : false; } + bool IsClosed() { + if (IsInitialized()) { + return holder_->IsClosed(); + } + return false; + } + + bool CanSend() { + if (IsInitialized()) { + return holder_->CanSend(); + } + return false; + } + + bool CanReceive() { + if (IsInitialized()) { + return holder_->CanReceive(); + } + return false; + } + void close() { if (IsInitialized()) holder_->Close(); } @@ -97,6 +138,38 @@ class ChannelHolder { if (IsInitialized()) holder_->Unlock(); } + template + void AddToSendQ(const void* referrer, T* data, + std::shared_ptr cond, + std::function cb) { + if (IsInitialized()) { + Channel* channel = static_cast*>(holder_->Ptr()); + if (channel != nullptr) { + channel->AddToSendQ(referrer, data, cond, cb); + } + } + } + + template + void AddToReceiveQ(const void* referrer, T* data, + std::shared_ptr cond, + std::function cb) { + if (IsInitialized()) { + Channel* channel = static_cast*>(holder_->Ptr()); + if (channel != nullptr) { + channel->AddToReceiveQ(referrer, data, cond, cb); + } + } + } + + void RemoveFromSendQ(const void* referrer) { + if (IsInitialized()) holder_->RemoveFromSendQ(referrer); + } + + void RemoveFromReceiveQ(const void* referrer) { + if (IsInitialized()) holder_->RemoveFromReceiveQ(referrer); + } + inline bool IsInitialized() const { return holder_ != nullptr; } inline const std::type_index Type() { @@ -113,6 +186,11 @@ class ChannelHolder { virtual ~Placeholder() {} virtual const std::type_index Type() const = 0; virtual void* Ptr() const = 0; + virtual bool IsClosed() = 0; + virtual bool CanSend() = 0; + virtual bool CanReceive() = 0; + virtual void RemoveFromSendQ(const void* referrer) = 0; + virtual void RemoveFromReceiveQ(const void* referrer) = 0; virtual void Close() = 0; virtual void Lock() = 0; virtual void Unlock() = 0; @@ -129,6 +207,39 @@ class ChannelHolder { virtual void* Ptr() const { return static_cast(channel_.get()); } + virtual bool IsClosed() { + if (channel_) { + return channel_->IsClosed(); + } + return false; + } + + virtual bool CanSend() { + if (channel_) { + return channel_->CanSend(); + } + return false; + } + + virtual bool CanReceive() { + if (channel_) { + return channel_->CanReceive(); + } + return false; + } + + virtual void RemoveFromSendQ(const void* referrer) { + if (channel_) { + channel_->RemoveFromSendQ(referrer); + } + } + + virtual void RemoveFromReceiveQ(const void* referrer) { + if (channel_) { + channel_->RemoveFromReceiveQ(referrer); + } + } + virtual void Close() { if (channel_) channel_->Close(); } diff --git a/paddle/fluid/framework/channel_impl.h b/paddle/fluid/framework/channel_impl.h index a4561031fd8c49613269e7008ce558f25f9765e4..457abbf373d4549229e8fd8bd6b2087cc6b8f5c8 100644 --- a/paddle/fluid/framework/channel_impl.h +++ b/paddle/fluid/framework/channel_impl.h @@ -29,32 +29,50 @@ class ChannelImpl : public paddle::framework::Channel { friend void paddle::framework::CloseChannel(Channel *); public: + virtual bool CanSend(); + virtual bool CanReceive(); virtual bool Send(T *); virtual bool Receive(T *); virtual size_t Cap() { return cap_; } virtual void Lock(); virtual void Unlock(); + virtual bool IsClosed(); virtual void Close(); - ChannelImpl(size_t); virtual ~ChannelImpl(); + virtual void AddToSendQ(const void *referrer, T *data, + std::shared_ptr cond, + std::function cb); + virtual void AddToReceiveQ(const void *referrer, T *data, + std::shared_ptr cond, + std::function cb); + + virtual void RemoveFromSendQ(const void *referrer); + virtual void RemoveFromReceiveQ(const void *referrer); + private: struct QueueMessage { T *data; - std::condition_variable_any cond; + std::shared_ptr cond; bool chan_closed = false; bool completed = false; + const void *referrer; // TODO(thuan): figure out better way to do this + std::function callback; - QueueMessage(T *item) : data(item) {} + QueueMessage(T *item) + : data(item), cond(std::make_shared()) {} + + QueueMessage(T *item, std::shared_ptr cond) + : data(item), cond(cond) {} void Wait(std::unique_lock &lock) { - cond.wait(lock, [this]() { return completed; }); + cond->wait(lock, [this]() { return completed; }); } void Notify() { completed = true; - cond.notify_all(); + cond->notify_all(); } }; @@ -87,6 +105,18 @@ ChannelImpl::ChannelImpl(size_t capacity) PADDLE_ENFORCE_GE(capacity, 0); } +template +bool ChannelImpl::CanSend() { + std::lock_guard lock{mu_}; + return !closed_ && (!recvq.empty() || buf_.size() < cap_); +} + +template +bool ChannelImpl::CanReceive() { + std::lock_guard lock{mu_}; + return !(closed_ && buf_.empty()) && (!sendq.empty() || buf_.size() > 0); +} + template bool ChannelImpl::Send(T *item) { send_ctr++; @@ -105,7 +135,24 @@ bool ChannelImpl::Send(T *item) { std::shared_ptr m = recvq.front(); recvq.pop_front(); // Do the data transfer - *(m->data) = std::move(*item); + // We will do this data transfer if either of the following + // cases are true + // 1. callback == nullptr // This means it was a regular channel send + // 2. callback returns true + bool do_send = true; + if (m->callback != nullptr) do_send = m->callback(ChannelAction::SEND); + if (do_send) + *(m->data) = std::move(*item); + else + // We cannot do the data transfer because + // this QueueMessage was added by Select + // and some other case was executed. + // So call the Send function again. + // We do not care about notifying other + // because they would have been notified + // by the executed select case. + return send_return(Send(item)); + // Wake up the blocked process and unlock m->Notify(); lock.unlock(); @@ -150,7 +197,25 @@ bool ChannelImpl::Receive(T *item) { std::shared_ptr m = sendq.front(); sendq.pop_front(); // Do the data transfer - *item = std::move(*(m->data)); + // We will do this data transfer if either of the following + // cases are true + // 1. callback == nullptr // This means it was a regular channel send + // 2. callback returns true + bool do_receive = true; + if (m->callback != nullptr) + do_receive = m->callback(ChannelAction::RECEIVE); + if (do_receive) + *item = std::move(*(m->data)); + else + // We cannot do the data transfer because + // this QueueMessage was added by Select + // and some other case was executed. + // So call the Receive function again. + // We do not care about notifying other + // because they would have been notified + // by the executed select case. + return recv_return(Receive(item)); + // Wake up the blocked process and unlock m->Notify(); lock.unlock(); @@ -186,6 +251,12 @@ void ChannelImpl::Unlock() { mu_.unlock(); } +template +bool ChannelImpl::IsClosed() { + std::lock_guard lock{mu_}; + return closed_; +} + template void ChannelImpl::Close() { std::unique_lock lock{mu_}; @@ -203,6 +274,12 @@ void ChannelImpl::Close() { std::shared_ptr m = recvq.front(); recvq.pop_front(); m->chan_closed = true; + + // Execute callback function (if any) + if (m->callback != nullptr) { + m->callback(ChannelAction::CLOSE); + } + m->Notify(); } @@ -211,10 +288,70 @@ void ChannelImpl::Close() { std::shared_ptr m = sendq.front(); sendq.pop_front(); m->chan_closed = true; + + // Execute callback function (if any) + if (m->callback != nullptr) { + m->callback(ChannelAction::CLOSE); + } + m->Notify(); } } +template +void ChannelImpl::AddToSendQ( + const void *referrer, T *data, + std::shared_ptr cond, + std::function cb) { + std::lock_guard lock{mu_}; + auto m = std::make_shared(data, cond); + m->referrer = referrer; + m->callback = cb; + sendq.push_back(m); +} + +template +void ChannelImpl::AddToReceiveQ( + const void *referrer, T *data, + std::shared_ptr cond, + std::function cb) { + std::lock_guard lock{mu_}; + auto m = std::make_shared(data, cond); + m->referrer = referrer; + m->callback = cb; + recvq.push_back(m); +} + +template +void ChannelImpl::RemoveFromSendQ(const void *referrer) { + std::lock_guard lock{mu_}; + + for (auto it = sendq.begin(); it != sendq.end();) { + std::shared_ptr sendMsg = (std::shared_ptr)*it; + + if (sendMsg->referrer == referrer) { + it = sendq.erase(it); + } else { + ++it; + } + } +} + +template +void ChannelImpl::RemoveFromReceiveQ(const void *referrer) { + std::lock_guard lock{mu_}; + + for (auto it = recvq.begin(); it != recvq.end();) { + std::shared_ptr recvMsg = (std::shared_ptr)*it; + + if (recvMsg->referrer == referrer) { + it = recvq.erase(it); + } else { + ++it; + } + } +} + template ChannelImpl::~ChannelImpl() { Close(); diff --git a/paddle/fluid/framework/concurrency_test.cc b/paddle/fluid/framework/concurrency_test.cc index 5770b0a5a18659e615e80a7c48113d8b543b69ec..25152054eb8452a9667bd65b4441665476c1d46d 100644 --- a/paddle/fluid/framework/concurrency_test.cc +++ b/paddle/fluid/framework/concurrency_test.cc @@ -19,7 +19,6 @@ limitations under the License. */ #include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/program_desc.h" USE_NO_KERNEL_OP(go); USE_NO_KERNEL_OP(channel_close); @@ -27,6 +26,12 @@ USE_NO_KERNEL_OP(channel_create); USE_NO_KERNEL_OP(channel_recv); USE_NO_KERNEL_OP(channel_send); USE_NO_KERNEL_OP(elementwise_add); +USE_NO_KERNEL_OP(select); +USE_NO_KERNEL_OP(conditional_block); +USE_NO_KERNEL_OP(equal); +USE_NO_KERNEL_OP(assign); +USE_NO_KERNEL_OP(while); +USE_NO_KERNEL_OP(print); namespace f = paddle::framework; namespace p = paddle::platform; @@ -35,27 +40,15 @@ namespace paddle { namespace framework { template -void CreateIntVariable(Scope &scope, p::CPUPlace &place, std::string name, - T value) { - // Create LoDTensor of dim [1,1] +LoDTensor *CreateVariable(Scope &scope, p::CPUPlace &place, std::string name, + T value) { + // Create LoDTensor of dim [1] auto var = scope.Var(name); auto tensor = var->GetMutable(); - tensor->Resize({1, 1}); + tensor->Resize({1}); T *expect = tensor->mutable_data(place); expect[0] = value; -} - -void InitTensorsInScope(Scope &scope, p::CPUPlace &place) { - p::CPUDeviceContext ctx(place); - - // Create channel variable - scope.Var("Channel"); - - // Create Variables, x0 will be put into channel, - // result will be pulled from channel - CreateIntVariable(scope, place, "Status", false); - CreateIntVariable(scope, place, "x0", 99); - CreateIntVariable(scope, place, "result", 0); + return tensor; } void AddOp(const std::string &type, const VariableNameMap &inputs, @@ -73,12 +66,116 @@ void AddOp(const std::string &type, const VariableNameMap &inputs, op->SetAttrMap(attrs); } +void AddCase(ProgramDesc *program, Scope *scope, p::CPUPlace *place, + BlockDesc *casesBlock, int caseId, int caseType, + std::string caseChannel, std::string caseVarName, + std::function func) { + std::string caseCondName = std::string("caseCond") + std::to_string(caseId); + std::string caseCondXVarName = + std::string("caseCondX") + std::to_string(caseId); + + BlockDesc *caseBlock = program->AppendBlock(*casesBlock); + func(caseBlock, scope); + + CreateVariable(*scope, *place, caseCondName, false); + CreateVariable(*scope, *place, caseCondXVarName, caseId); + CreateVariable(*scope, *place, caseVarName, caseId); + + scope->Var("step_scope"); + + AddOp("equal", {{"X", {caseCondXVarName}}, {"Y", {"caseToExecute"}}}, + {{"Out", {caseCondName}}}, {}, casesBlock); + + AddOp("conditional_block", {{"X", {caseCondName}}, {"Params", {}}}, + {{"Out", {}}, {"Scope", {"step_scope"}}}, + {{"sub_block", caseBlock}, {"is_scalar_condition", true}}, casesBlock); +} + +void AddFibonacciSelect(Scope *scope, p::CPUPlace *place, ProgramDesc *program, + BlockDesc *parentBlock, std::string dataChanName, + std::string quitChanName) { + BlockDesc *whileBlock = program->AppendBlock(*parentBlock); + + CreateVariable(*scope, *place, "whileExitCond", true); + CreateVariable(*scope, *place, "caseToExecute", -1); + CreateVariable(*scope, *place, "case1var", 0); + + CreateVariable(*scope, *place, "xtemp", 0); + + // TODO(thuan): Need to create fibXToSend, since channel send moves the actual + // data, + // which causes the data to be no longer accessible to do the fib calculation + // TODO(abhinav): Change channel send to do a copy instead of a move! + CreateVariable(*scope, *place, "fibXToSend", 0); + + CreateVariable(*scope, *place, "fibX", 0); + CreateVariable(*scope, *place, "fibY", 1); + CreateVariable(*scope, *place, "quitVar", 0); + + BlockDesc *casesBlock = program->AppendBlock(*whileBlock); + std::function f = [](BlockDesc *caseBlock) {}; + + // TODO(thuan): Remove this once we change channel send to do a copy instead + // of move + AddOp("assign", {{"X", {"fibX"}}}, {{"Out", {"fibXToSend"}}}, {}, whileBlock); + + // Case 0: Send to dataChanName + std::function case0Func = [&]( + BlockDesc *caseBlock, Scope *scope) { + AddOp("assign", {{"X", {"fibX"}}}, {{"Out", {"xtemp"}}}, {}, caseBlock); + AddOp("assign", {{"X", {"fibY"}}}, {{"Out", {"fibX"}}}, {}, caseBlock); + AddOp("elementwise_add", {{"X", {"xtemp"}}, {"Y", {"fibY"}}}, + {{"Out", {"fibY"}}}, {}, caseBlock); + }; + AddCase(program, scope, place, casesBlock, 0, 1, dataChanName, "fibXToSend", + case0Func); + std::string case0Config = + std::string("0,1,") + dataChanName + std::string(",fibXToSend"); + + // Case 1: Receive from quitChanName + std::function case2Func = [&]( + BlockDesc *caseBlock, Scope *scope) { + // Exit the while loop after we receive from quit channel. + // We assign a false to "whileExitCond" variable, which will + // break out of while_op loop + CreateVariable(*scope, *place, "whileFalse", false); + AddOp("assign", {{"X", {"whileFalse"}}}, {{"Out", {"whileExitCond"}}}, {}, + caseBlock); + }; + AddCase(program, scope, place, casesBlock, 1, 2, quitChanName, "quitVar", + case2Func); + std::string case1Config = + std::string("1,2,") + quitChanName + std::string(",quitVar"); + + // Select block + AddOp("select", {{"X", {dataChanName, quitChanName}}, + {"case_to_execute", {"caseToExecute"}}}, + {}, {{"sub_block", casesBlock}, + {"cases", std::vector{case0Config, case1Config}}}, + whileBlock); + + scope->Var("stepScopes"); + AddOp("while", + {{"X", {dataChanName, quitChanName}}, {"Condition", {"whileExitCond"}}}, + {{"Out", {}}, {"StepScopes", {"stepScopes"}}}, + {{"sub_block", whileBlock}}, parentBlock); +} + TEST(Concurrency, Go_Op) { Scope scope; p::CPUPlace place; // Initialize scope variables - InitTensorsInScope(scope, place); + p::CPUDeviceContext ctx(place); + + // Create channel variable + scope.Var("Channel"); + + // Create Variables, x0 will be put into channel, + // result will be pulled from channel + CreateVariable(scope, place, "Status", false); + CreateVariable(scope, place, "x0", 99); + CreateVariable(scope, place, "result", 0); framework::Executor executor(place); ProgramDesc program; @@ -118,5 +215,78 @@ TEST(Concurrency, Go_Op) { auto *finalData = tensor.data(); EXPECT_EQ(finalData[0], 99); } + +/** + * This test implements the fibonacci function using go_op and select_op + */ +TEST(Concurrency, Select) { + Scope scope; + p::CPUPlace place; + + // Initialize scope variables + p::CPUDeviceContext ctx(place); + + CreateVariable(scope, place, "Status", false); + CreateVariable(scope, place, "result", 0); + CreateVariable(scope, place, "currentXFib", 0); + + framework::Executor executor(place); + ProgramDesc program; + BlockDesc *block = program.MutableBlock(0); + + // Create channel OP + std::string dataChanName = "Channel"; + scope.Var(dataChanName); + AddOp("channel_create", {}, {{"Out", {dataChanName}}}, + {{"capacity", 0}, {"data_type", f::proto::VarType::LOD_TENSOR}}, block); + + std::string quitChanName = "Quit"; + scope.Var(quitChanName); + AddOp("channel_create", {}, {{"Out", {quitChanName}}}, + {{"capacity", 0}, {"data_type", f::proto::VarType::LOD_TENSOR}}, block); + + // Create Go Op routine, which loops 10 times over fibonacci sequence + CreateVariable(scope, place, "xReceiveVar", 0); + + BlockDesc *goOpBlock = program.AppendBlock(program.Block(0)); + for (int i = 0; i < 10; ++i) { + AddOp("channel_recv", {{"Channel", {dataChanName}}}, + {{"Status", {"Status"}}, {"Out", {"currentXFib"}}}, {}, goOpBlock); + AddOp("print", {{"In", {"currentXFib"}}}, {{"Out", {"currentXFib"}}}, + {{"first_n", 100}, + {"summarize", -1}, + {"print_tensor_name", false}, + {"print_tensor_type", true}, + {"print_tensor_shape", false}, + {"print_tensor_lod", false}, + {"print_phase", std::string("FORWARD")}, + {"message", std::string("X: ")}}, + goOpBlock); + } + + CreateVariable(scope, place, "quitSignal", 0); + AddOp("channel_send", {{"Channel", {quitChanName}}, {"X", {"quitSignal"}}}, + {{"Status", {"Status"}}}, {}, goOpBlock); + + // Create Go Op + AddOp("go", {{"X", {dataChanName, quitChanName}}}, {}, + {{"sub_block", goOpBlock}}, block); + + AddFibonacciSelect(&scope, &place, &program, block, dataChanName, + quitChanName); + + // Create Channel Close Op + AddOp("channel_close", {{"Channel", {dataChanName}}}, {}, {}, block); + AddOp("channel_close", {{"Channel", {quitChanName}}}, {}, {}, block); + + executor.Run(program, &scope, 0, true, true); + + // After we call executor.run, "result" variable should be equal to 34 + // (which is 10 loops through fibonacci sequence) + const LoDTensor &tensor = (scope.FindVar("currentXFib"))->Get(); + auto *finalData = tensor.data(); + EXPECT_EQ(finalData[0], 34); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index f8e7d0d99074936ad81c4ddc52be6907ead4c27d..bcbd717aa47dd98252c5beaaf33d77b466496578 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -25,6 +25,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler.h" DECLARE_bool(benchmark); DEFINE_bool(check_nan_inf, false, @@ -33,6 +34,20 @@ DEFINE_bool(check_nan_inf, false, namespace paddle { namespace framework { +namespace { +// block id starts from 0. This id is used to represent the codeblock +// wrapping the first block 0. +int kProgramId = -1; +} // namespace + +struct ExecutorPrepareContext { + ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id) + : prog_(prog), block_id_(block_id) {} + + const framework::ProgramDesc& prog_; + size_t block_id_; + std::vector> ops_; +}; Executor::Executor(const platform::Place& place) : place_(place) {} @@ -85,73 +100,10 @@ static void CheckTensorNANOrInf(const std::string& name, void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool create_local_scope, bool create_vars) { - // TODO(tonyyang-svail): - // - only runs on the first device (i.e. no interdevice communication) - // - will change to use multiple blocks for RNN op and Cond Op - PADDLE_ENFORCE_LT(static_cast(block_id), pdesc.Size()); - auto& block = pdesc.Block(block_id); - - Scope* local_scope = scope; - if (create_vars) { - if (create_local_scope) { - local_scope = &scope->NewScope(); - for (auto& var : block.AllVars()) { - if (var->Name() == framework::kEmptyVarName) { - continue; - } - - if (var->Persistable()) { - auto* ptr = scope->Var(var->Name()); - CreateTensor(ptr, var->GetType()); - VLOG(3) << "Create Variable " << var->Name() - << " global, which pointer is " << ptr; - } else { - auto* ptr = local_scope->Var(var->Name()); - CreateTensor(ptr, var->GetType()); - VLOG(3) << "Create Variable " << var->Name() - << " locally, which pointer is " << ptr; - } - } - } else { - for (auto& var : block.AllVars()) { - auto* ptr = local_scope->Var(var->Name()); - CreateTensor(ptr, var->GetType()); - VLOG(3) << "Create variable " << var->Name() << ", which pointer is " - << ptr; - } - } // if (create_local_scope) - } // if (create_vars) - - for (auto& op_desc : block.AllOps()) { - auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); - - VLOG(4) << place_ << " " << op->DebugStringEx(local_scope); - op->Run(*local_scope, place_); - VLOG(3) << place_ << " " << op->DebugStringEx(local_scope); - - if (FLAGS_benchmark) { - VLOG(2) << "Memory used after operator " + op->Type() + " running: " - << memory::memory_usage(place_); - } - if (FLAGS_check_nan_inf) { - for (auto& vname : op->OutputVars(true)) { - auto* var = local_scope->FindVar(vname); - if (var == nullptr) continue; - if (var->IsType()) { - CheckTensorNANOrInf(vname, var->Get()); - } - } - } - } - if (create_vars && create_local_scope) { - scope->DeleteScope(local_scope); - } - if (FLAGS_benchmark) { - VLOG(2) << "-------------------------------------------------------"; - VLOG(2) << "Memory used after deleting local scope: " - << memory::memory_usage(place_); - VLOG(2) << "-------------------------------------------------------"; - } + platform::RecordBlock b(block_id); + auto* ctx = Prepare(pdesc, block_id); + RunPreparedContext(ctx, scope, create_local_scope, create_vars); + delete ctx; } // Check whether the block already has feed operators and feed_holder. @@ -239,6 +191,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, std::map& fetch_targets, const std::string& feed_holder_name, const std::string& fetch_holder_name) { + platform::RecordBlock b(kProgramId); auto* copy_program = new ProgramDesc(program); auto* global_block = copy_program->MutableBlock(0); @@ -313,5 +266,81 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, delete copy_program; } +ExecutorPrepareContext* Executor::Prepare(const ProgramDesc& program, + int block_id) { + auto* ctx = new ExecutorPrepareContext(program, block_id); + PADDLE_ENFORCE_LT(static_cast(block_id), program.Size()); + auto& block = program.Block(block_id); + for (auto& op_desc : block.AllOps()) { + ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); + } + return ctx; +} + +void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, + bool create_local_scope, bool create_vars) { + auto& block = ctx->prog_.Block(ctx->block_id_); + + Scope* local_scope = scope; + if (create_vars) { + if (create_local_scope) { + local_scope = &scope->NewScope(); + for (auto& var : block.AllVars()) { + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + if (var->Persistable()) { + auto* ptr = scope->Var(var->Name()); + CreateTensor(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " global, which pointer is " << ptr; + } else { + auto* ptr = local_scope->Var(var->Name()); + CreateTensor(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " locally, which pointer is " << ptr; + } + } + } else { + for (auto& var : block.AllVars()) { + auto* ptr = local_scope->Var(var->Name()); + CreateTensor(ptr, var->GetType()); + VLOG(3) << "Create variable " << var->Name() << ", which pointer is " + << ptr; + } + } // if (create_local_scope) + } // if (create_vars) + + for (auto& op : ctx->ops_) { + VLOG(4) << place_ << " " << op->DebugStringEx(local_scope); + op->Run(*local_scope, place_); + VLOG(3) << place_ << " " << op->DebugStringEx(local_scope); + + if (FLAGS_benchmark) { + VLOG(2) << "Memory used after operator " + op->Type() + " running: " + << memory::memory_usage(place_); + } + if (FLAGS_check_nan_inf) { + for (auto& vname : op->OutputVars(true)) { + auto* var = local_scope->FindVar(vname); + if (var == nullptr) continue; + if (var->IsType()) { + CheckTensorNANOrInf(vname, var->Get()); + } + } + } + } + if (create_vars && create_local_scope) { + scope->DeleteScope(local_scope); + } + if (FLAGS_benchmark) { + VLOG(2) << "-------------------------------------------------------"; + VLOG(2) << "Memory used after deleting local scope: " + << memory::memory_usage(place_); + VLOG(2) << "-------------------------------------------------------"; + } +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index c1f4d4e02a951e8b127b66cae125309e4798cc76..28ce3315154cea45412984df4daf7385ce2cf572 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -22,7 +22,7 @@ limitations under the License. */ namespace paddle { namespace framework { - +struct ExecutorPrepareContext; class Executor { public: // TODO(dzhwinter) : Do not rely on this function, it will be removed @@ -38,8 +38,8 @@ class Executor { * ProgramDesc * Scope */ - void Run(const ProgramDesc&, Scope*, int, bool create_local_scope = true, - bool create_vars = true); + void Run(const ProgramDesc& prog, Scope* scope, int block_id, + bool create_local_scope = true, bool create_vars = true); void Run(const ProgramDesc& program, Scope* scope, std::map& feed_targets, @@ -47,6 +47,13 @@ class Executor { const std::string& feed_holder_name = "feed", const std::string& fetch_holder_name = "fetch"); + static ExecutorPrepareContext* Prepare(const ProgramDesc& program, + int block_id); + + void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, + bool create_local_scope = true, + bool create_vars = true); + private: const platform::Place place_; }; diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index e2f4e9cad1996578b7c51257785e1273d126f80f..8155cb55a468a09320b1196b49fc3e34cea261b1 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -19,6 +19,9 @@ limitations under the License. */ #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/recordio/scanner.h" +#include "paddle/fluid/recordio/writer.h" + #include #include #include @@ -291,6 +294,31 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor, TensorFromStream(is, static_cast(tensor), dev_ctx); } +void WriteToRecordIO(recordio::Writer &writer, + const std::vector &tensor, + const platform::DeviceContext &dev_ctx) { + std::stringstream buffer; + size_t sz = tensor.size(); + buffer.write(reinterpret_cast(&sz), sizeof(uint32_t)); + for (auto &each : tensor) { + SerializeToStream(buffer, each, dev_ctx); + } + writer.Write(buffer.str()); +} + +std::vector ReadFromRecordIO( + recordio::Scanner &scanner, const platform::DeviceContext &dev_ctx) { + std::istringstream sin(scanner.Next()); + uint32_t sz; + sin.read(reinterpret_cast(&sz), sizeof(uint32_t)); + std::vector result; + result.resize(sz); + for (uint32_t i = 0; i < sz; ++i) { + DeserializeFromStream(sin, &result[i], dev_ctx); + } + return result; +} + std::vector LoDTensor::SplitLoDTensor( const std::vector places) const { check_memory_size(); diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index 94d5a6e9fd9b68d3d8230a8c258316efadda5a05..dee505fee0dccd8d60bb290a8bec4df243e504a2 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -29,6 +29,12 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" namespace paddle { + +namespace recordio { +class Writer; +class Scanner; +} + namespace framework { /* @@ -209,5 +215,12 @@ void SerializeToStream(std::ostream& os, const LoDTensor& tensor, void DeserializeFromStream(std::istream& is, LoDTensor* tensor, const platform::DeviceContext& dev_ctx); +extern void WriteToRecordIO(recordio::Writer& writer, + const std::vector& tensor, + const platform::DeviceContext& dev_ctx); + +extern std::vector ReadFromRecordIO( + recordio::Scanner& scanner, const platform::DeviceContext& dev_ctx); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc index 5e135192ce774ab5c351b89164be9d7600ae3640..e691e29383d4842b80769021e0e494967d38e9bb 100644 --- a/paddle/fluid/framework/lod_tensor_test.cc +++ b/paddle/fluid/framework/lod_tensor_test.cc @@ -14,6 +14,9 @@ #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/recordio/scanner.h" +#include "paddle/fluid/recordio/writer.h" + #include #include #include @@ -224,5 +227,43 @@ TEST(LoD, CheckAbsLoD) { abs_lod0.push_back(std::vector({0})); ASSERT_FALSE(CheckAbsLoD(abs_lod0)); } + +TEST(LoDTensor, RecordIO) { + LoDTensor tensor; + int* tmp = tensor.mutable_data(make_ddim({4, 5}), platform::CPUPlace()); + for (int i = 0; i < 20; ++i) { + tmp[i] = i; + } + + std::stringstream* stream = new std::stringstream(); + auto& ctx = + *platform::DeviceContextPool::Instance().Get(platform::CPUPlace()); + { + recordio::Writer writer(stream, recordio::Compressor::kSnappy); + WriteToRecordIO(writer, {tensor, tensor}, ctx); + WriteToRecordIO(writer, {tensor, tensor}, ctx); + writer.Flush(); + } + + auto assert_tensor_ok = [](const LoDTensor& tensor) { + for (int i = 0; i < 20; ++i) { + ASSERT_EQ(tensor.data()[i], i); + } + }; + + { + std::unique_ptr stream_ptr(stream); + recordio::Scanner scanner(std::move(stream_ptr)); + auto tensors = ReadFromRecordIO(scanner, ctx); + ASSERT_EQ(tensors.size(), 2); + assert_tensor_ok(tensors[0]); + assert_tensor_ok(tensors[1]); + tensors = ReadFromRecordIO(scanner, ctx); + ASSERT_EQ(tensors.size(), 2); + assert_tensor_ok(tensors[0]); + assert_tensor_ok(tensors[1]); + } +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index ac6289c5abe8f40ae9ee32aa3d58cdef3ff0e836..b39a1164dbd9877d9f45cc6415d74f930921a42f 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -442,15 +442,7 @@ class RuntimeInferShapeContext : public InferShapeContext { } std::vector GetRepeatedDims(const std::string& name) const override { - Variable* var = scope_.FindVar(name); - if (var->IsType()) { - return var->Get().shapes(); - } else { - PADDLE_THROW( - "Only ReaderHolder support 'GetRepeatedDims', but Variable %s's " - "type_id is %s.", - name, var->Type().name()); - } + PADDLE_THROW("Only compile time support this method"); } void SetDim(const std::string& name, const DDim& dim) override { @@ -467,15 +459,7 @@ class RuntimeInferShapeContext : public InferShapeContext { void SetRepeatedDims(const std::string& name, const std::vector& dims) override { - Variable* var = scope_.FindVar(name); - if (var->IsType()) { - var->GetMutable()->set_shapes(dims); - } else { - PADDLE_THROW( - "Only ReaderHolder support 'SetRepeatedDims', but Variable %s's " - "type_id is %s.", - name, var->Type().name()); - } + PADDLE_THROW("Only compile time support this method"); } proto::VarType::Type GetVarType(const std::string& name) const override { @@ -497,8 +481,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope, RuntimeInferShapeContext infer_shape_ctx(*this, scope); this->InferShape(&infer_shape_ctx); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto dev_ctx = pool.Get(place); - // profile + auto* dev_ctx = pool.Get(place); + + // For profiling, don't move out of this function because that will result + // in the failure of multi-GPU profiling. platform::RecordEvent record_event(Type(), dev_ctx); // check if op[type] has kernel registered. auto& all_op_kernels = AllOpKernels(); diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc index 91879d6d45868bb37ca44baafb8b0e8677cd6d1a..fa00c08e0d5791ee1187aed38b4d140564b7c97d 100644 --- a/paddle/fluid/framework/reader.cc +++ b/paddle/fluid/framework/reader.cc @@ -16,14 +16,22 @@ namespace paddle { namespace framework { +ReaderBase::~ReaderBase() {} -DDim ReaderBase::shape(size_t idx) const { - PADDLE_ENFORCE_LT( - idx, shapes_.size(), - "Cannot get the %d'th shape, 'shapes_' only has %d elements.", idx, - shapes_.size()); - return shapes_[idx]; -} +FileReader::FileReader(const std::vector &dims) : dims_(dims) {} + +void FileReader::ReadNext(std::vector *out) { + ReadNextImpl(out); + PADDLE_ENFORCE_EQ(out->size(), dims_.size()); + for (size_t i = 0; i < dims_.size(); ++i) { + auto &actual = out->at(i).dims(); + auto &expect = dims_[i]; + PADDLE_ENFORCE_EQ(actual.size(), expect.size()); + for (int j = 0; j < actual.size(); ++j) { + PADDLE_ENFORCE(actual[i] == expect[i] || expect[i] == -1); + } + } +} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h index 1be3f4ef1f46bd8d72a285afa69b52d6f519ccf5..3573b99becf6d657c680c5fec0bda4bdde5dd7a2 100644 --- a/paddle/fluid/framework/reader.h +++ b/paddle/fluid/framework/reader.h @@ -16,47 +16,53 @@ #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/platform/place.h" + +#include +#include +#include namespace paddle { namespace framework { class ReaderBase { public: - explicit ReaderBase(const std::vector& shapes) : shapes_(shapes) { - PADDLE_ENFORCE(!shapes_.empty()); - } virtual void ReadNext(std::vector* out) = 0; virtual void ReInit() = 0; - DDim shape(size_t idx) const; - std::vector shapes() const { return shapes_; } - void set_shapes(const std::vector& shapes) { shapes_ = shapes; } - - virtual ~ReaderBase() {} - - protected: - std::vector shapes_; -}; + virtual bool HasNext() const = 0; -class FileReader : public ReaderBase { - public: - explicit FileReader(const std::vector& shapes) : ReaderBase(shapes) {} + virtual ~ReaderBase(); }; class DecoratedReader : public ReaderBase { public: - explicit DecoratedReader(ReaderBase* reader) - : ReaderBase(reader->shapes()), reader_(reader) { + explicit DecoratedReader(ReaderBase* reader) : ReaderBase(), reader_(reader) { PADDLE_ENFORCE_NOT_NULL(reader_); } void ReInit() override { reader_->ReInit(); } + bool HasNext() const override { return reader_->HasNext(); } + protected: ReaderBase* reader_; }; +class FileReader : public ReaderBase { + public: + explicit FileReader(const std::vector& dims); + + void ReadNext(std::vector* out) override; + + protected: + virtual void ReadNextImpl(std::vector* out) = 0; + + private: + std::vector dims_; +}; + // The ReaderHolder is used as reader' unified wrapper, // making it easier to access different type reader in Variables. class ReaderHolder { @@ -65,14 +71,16 @@ class ReaderHolder { ReaderBase* Get() const { return reader_.get(); } - void ReadNext(std::vector* out) { reader_->ReadNext(out); } - void ReInit() { reader_->ReInit(); } - - DDim shape(size_t idx) const { return reader_->shape(idx); } - std::vector shapes() const { return reader_->shapes(); } - void set_shapes(const std::vector& shapes) { - reader_->set_shapes(shapes); + void ReadNext(std::vector* out) { + PADDLE_ENFORCE_NOT_NULL(reader_); + reader_->ReadNext(out); } + void ReInit() { + PADDLE_ENFORCE_NOT_NULL(reader_); + reader_->ReInit(); + } + + bool HasNext() const { return reader_->HasNext(); } private: std::unique_ptr reader_; diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index ea6c8cebd3ff16451c974bf3a0ded9d822a9caf8..17e38b1cf042657834b4d0d1c12cbbb92f19fa45 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include // for unique_ptr #include // for call_once +#include #include "glog/logging.h" #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/string/printf.h" @@ -102,6 +103,18 @@ void Scope::DeleteScope(Scope* scope) { } } +void Scope::EraseVars(std::vector& var_names) { + std::set var_set(var_names.begin(), var_names.end()); + for (auto it = vars_.begin(); it != vars_.end();) { + if (var_set.find(it->first) != var_set.end()) { + delete it->second; + it = vars_.erase(it); + } else { + ++it; + } + } +} + void Scope::Rename(const std::string& origin_name, const std::string& new_name) const { auto origin_it = vars_.find(origin_name); diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index d8fad162e59c8ea6465b2dc78c90709a758ace24..c1e1f49caaa5a60df0e97289aada465b45213971 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -51,6 +51,8 @@ class Scope { /// Create a variable with a scope-unique name. Variable* Var(std::string* name = nullptr); + void EraseVars(std::vector& var_names); + /// Find a variable in the scope or any of its ancestors. Returns /// nullptr if cannot find. Variable* FindVar(const std::string& name) const; diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 0f5fe6d0aa9a5522c67a3c06f8677f1f2f259eb3..dce541c0971a6ff9a3728e915fe8c3d009c23550 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -115,11 +115,11 @@ void TestInference(const std::string& dirname, #endif } - // Enable the profiler - paddle::platform::EnableProfiler(state); - // 2. Initialize the inference_program and load parameters std::unique_ptr inference_program; + + // Enable the profiler + paddle::platform::EnableProfiler(state); { paddle::platform::RecordEvent record_event( "init_program", @@ -143,6 +143,10 @@ void TestInference(const std::string& dirname, inference_program = paddle::inference::Load(executor, *scope, dirname); } } + // Disable the profiler and print the timing information + paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault, + "load_program_profiler.txt"); + paddle::platform::ResetProfiler(); // 3. Get the feed_target_names and fetch_target_names const std::vector& feed_target_names = @@ -165,6 +169,12 @@ void TestInference(const std::string& dirname, // 6. Run the inference program { + // Ignore the profiling results of the first run + executor.Run(*inference_program, scope, feed_targets, fetch_targets); + + // Enable the profiler + paddle::platform::EnableProfiler(state); + // Run repeat times to profile the performance for (int i = 0; i < repeat; ++i) { paddle::platform::RecordEvent record_event( @@ -173,12 +183,13 @@ void TestInference(const std::string& dirname, executor.Run(*inference_program, scope, feed_targets, fetch_targets); } - } - // Disable the profiler and print the timing information - paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault, - "profiler.txt"); - paddle::platform::ResetProfiler(); + // Disable the profiler and print the timing information + paddle::platform::DisableProfiler( + paddle::platform::EventSortingKey::kDefault, + "run_inference_profiler.txt"); + paddle::platform::ResetProfiler(); + } delete scope; } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 5d436a7e0c3752c889c19820507589f34d3bee94..d30124d4a3b89b802a4abaae07a33b76526f163d 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -165,7 +165,6 @@ op_library(cond_op DEPS framework_proto tensor net_op) op_library(cross_entropy_op DEPS cross_entropy) op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) op_library(softmax_op DEPS softmax) -op_library(detection_output_op DEPS softmax) op_library(sequence_softmax_op DEPS softmax) op_library(sum_op DEPS selected_rows_functor) op_library(sgd_op DEPS selected_rows_functor) @@ -203,6 +202,11 @@ op_library(save_combine_op DEPS lod_tensor) op_library(load_combine_op DEPS lod_tensor) op_library(concat_op DEPS concat) +# FIXME(thuan): Move CSP operators to paddle/fluid/framework/operators/concurrency +add_subdirectory(concurrency) +op_library(channel_send_op DEPS concurrency) +op_library(channel_recv_op DEPS concurrency) + list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) foreach(src ${GENERAL_OPS}) op_library(${src}) @@ -222,8 +226,6 @@ cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory) -if(WITH_GPU) - cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) -endif() cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) +nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc index 39ae3c0040d04a6d901f1d6c992d547a6778c28e..d372213e1b6008b0c4227103dd40730f86a84301 100644 --- a/paddle/fluid/operators/assign_op.cc +++ b/paddle/fluid/operators/assign_op.cc @@ -56,6 +56,7 @@ class AssignFunctor { private: void copy_tensor(const framework::LoDTensor &lod_tensor, framework::LoDTensor *out) const { + if (lod_tensor.numel() == 0) return; auto &out_tensor = *out; TensorCopy(lod_tensor, lod_tensor.place(), dev_ctx_, &out_tensor); out_tensor.set_lod(lod_tensor.lod()); diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc index 72f8cb04f2de3af4ee526c3d9b86ff96e34f0b0a..dd0068d571f72c9c22334e523cd091fe4c8da5a6 100644 --- a/paddle/fluid/operators/cast_op.cc +++ b/paddle/fluid/operators/cast_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/cast_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { @@ -88,4 +89,5 @@ REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, - ops::CastOpKernel); + ops::CastOpKernel, + ops::CastOpKernel); diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu index 507e9a531aae70e60bc6748bfab800310d6e0c21..c486c5850e25fcf4370f02cb145c244743a4cc4b 100644 --- a/paddle/fluid/operators/cast_op.cu +++ b/paddle/fluid/operators/cast_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/cast_op.h" +#include "paddle/fluid/platform/float16.h" template using CastOpKernel = @@ -20,4 +21,5 @@ using CastOpKernel = REGISTER_OP_CUDA_KERNEL(cast, CastOpKernel, CastOpKernel, CastOpKernel, CastOpKernel, - CastOpKernel); + CastOpKernel, + CastOpKernel); diff --git a/paddle/fluid/operators/channel_recv_op.cc b/paddle/fluid/operators/channel_recv_op.cc index c12b88e7a91c4ea7044223464a2f902db494d1a8..844b3ae3b7bf87c9b253128165b3c938801d5d60 100644 --- a/paddle/fluid/operators/channel_recv_op.cc +++ b/paddle/fluid/operators/channel_recv_op.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/concurrency/channel_util.h" #include "paddle/fluid/operators/math/math_function.h" static constexpr char Channel[] = "Channel"; @@ -36,25 +37,6 @@ void SetReceiveStatus(const platform::Place &dev_place, status_tensor[0] = status; } -bool ChannelReceive(framework::ChannelHolder *ch, framework::Variable *var) { - // Get type of channel and use that to call mutable data for Variable - auto type = framework::ToVarType(ch->Type()); - if (type == framework::proto::VarType_Type_LOD_TENSOR) - return ch->Receive(var->GetMutable()); - else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) - return ch->Receive(var->GetMutable()); - else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) - return ch->Receive(var->GetMutable()); - else if (type == framework::proto::VarType_Type_SELECTED_ROWS) - return ch->Receive(var->GetMutable()); - else if (type == framework::proto::VarType_Type_READER) - return ch->Receive(var->GetMutable()); - else if (type == framework::proto::VarType_Type_CHANNEL) - return ch->Receive(var->GetMutable()); - else - PADDLE_THROW("ChannelReceive:Unsupported type"); -} - class ChannelRecvOp : public framework::OperatorBase { public: ChannelRecvOp(const std::string &type, @@ -81,7 +63,7 @@ class ChannelRecvOp : public framework::OperatorBase { scope.FindVar(Input(Channel))->GetMutable(); auto output_var = scope.FindVar(Output(Out)); // Receive the data from the channel. - bool ok = ChannelReceive(ch, output_var); + bool ok = concurrency::ChannelReceive(ch, output_var); // Set the status output of the `ChannelReceive` call. SetReceiveStatus(dev_place, *scope.FindVar(Output(Status)), ok); diff --git a/paddle/fluid/operators/channel_send_op.cc b/paddle/fluid/operators/channel_send_op.cc index 6d7715ad229e821f02437246e3326063cb1ee757..47cf7d7efc9996e8a8db11b79c0310f77c2435a4 100644 --- a/paddle/fluid/operators/channel_send_op.cc +++ b/paddle/fluid/operators/channel_send_op.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/concurrency/channel_util.h" #include "paddle/fluid/operators/math/math_function.h" static constexpr char Channel[] = "Channel"; @@ -37,24 +38,6 @@ void SetSendStatus(const platform::Place &dev_place, status_tensor[0] = status; } -bool ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) { - auto type = framework::ToVarType(var->Type()); - if (type == framework::proto::VarType_Type_LOD_TENSOR) - return ch->Send(var->GetMutable()); - else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) - return ch->Send(var->GetMutable()); - else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) - return ch->Send(var->GetMutable()); - else if (type == framework::proto::VarType_Type_SELECTED_ROWS) - return ch->Send(var->GetMutable()); - else if (type == framework::proto::VarType_Type_READER) - return ch->Send(var->GetMutable()); - else if (type == framework::proto::VarType_Type_CHANNEL) - return ch->Send(var->GetMutable()); - else - PADDLE_THROW("ChannelSend:Unsupported type"); -} - class ChannelSendOp : public framework::OperatorBase { public: ChannelSendOp(const std::string &type, @@ -82,7 +65,7 @@ class ChannelSendOp : public framework::OperatorBase { auto input_var = scope.FindVar(Input(X)); // Send the input data through the channel. - bool ok = ChannelSend(ch, input_var); + bool ok = concurrency::ChannelSend(ch, input_var); // Set the status output of the `ChannelSend` call. SetSendStatus(dev_place, *scope.FindVar(Output(Status)), ok); diff --git a/paddle/fluid/operators/concurrency/CMakeLists.txt b/paddle/fluid/operators/concurrency/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..e4617440d152b4c15d09e81cd19c76739b95b979 --- /dev/null +++ b/paddle/fluid/operators/concurrency/CMakeLists.txt @@ -0,0 +1 @@ +cc_library(concurrency SRCS channel_util.cc DEPS device_context framework_proto boost eigen3) diff --git a/paddle/fluid/operators/concurrency/channel_util.cc b/paddle/fluid/operators/concurrency/channel_util.cc new file mode 100644 index 0000000000000000000000000000000000000000..a483af7affd824da7d18676d934dc959167ef71f --- /dev/null +++ b/paddle/fluid/operators/concurrency/channel_util.cc @@ -0,0 +1,111 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "channel_util.h" +#include "paddle/fluid/framework/var_type.h" + +namespace poc = paddle::operators::concurrency; + +bool poc::ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) { + auto type = framework::ToVarType(var->Type()); + if (type == framework::proto::VarType_Type_LOD_TENSOR) + return ch->Send(var->GetMutable()); + else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) + return ch->Send(var->GetMutable()); + else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) + return ch->Send(var->GetMutable()); + else if (type == framework::proto::VarType_Type_SELECTED_ROWS) + return ch->Send(var->GetMutable()); + else if (type == framework::proto::VarType_Type_READER) + return ch->Send(var->GetMutable()); + else if (type == framework::proto::VarType_Type_CHANNEL) + return ch->Send(var->GetMutable()); + else + PADDLE_THROW("ChannelSend:Unsupported type"); +} + +bool poc::ChannelReceive(framework::ChannelHolder *ch, + framework::Variable *var) { + // Get type of channel and use that to call mutable data for Variable + auto type = framework::ToVarType(ch->Type()); + if (type == framework::proto::VarType_Type_LOD_TENSOR) + return ch->Receive(var->GetMutable()); + else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) + return ch->Receive(var->GetMutable()); + else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) + return ch->Receive(var->GetMutable()); + else if (type == framework::proto::VarType_Type_SELECTED_ROWS) + return ch->Receive(var->GetMutable()); + else if (type == framework::proto::VarType_Type_READER) + return ch->Receive(var->GetMutable()); + else if (type == framework::proto::VarType_Type_CHANNEL) + return ch->Receive(var->GetMutable()); + else + PADDLE_THROW("ChannelReceive:Unsupported type"); +} + +void poc::ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer, + framework::Variable *var, + std::shared_ptr cond, + std::function cb) { + auto type = framework::ToVarType(var->Type()); + if (type == framework::proto::VarType_Type_LOD_TENSOR) { + ch->AddToSendQ(referrer, var->GetMutable(), cond, cb); + } else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) { + ch->AddToSendQ(referrer, var->GetMutable(), cond, + cb); + } else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) { + ch->AddToSendQ(referrer, var->GetMutable(), cond, + cb); + } else if (type == framework::proto::VarType_Type_SELECTED_ROWS) { + ch->AddToSendQ(referrer, var->GetMutable(), cond, + cb); + } else if (type == framework::proto::VarType_Type_READER) { + ch->AddToSendQ(referrer, var->GetMutable(), cond, + cb); + } else if (type == framework::proto::VarType_Type_CHANNEL) { + ch->AddToSendQ(referrer, var->GetMutable(), cond, + cb); + } else { + PADDLE_THROW("ChannelAddToSendQ:Unsupported type"); + } +} + +void poc::ChannelAddToReceiveQ( + framework::ChannelHolder *ch, const void *referrer, + framework::Variable *var, std::shared_ptr cond, + std::function cb) { + auto type = framework::ToVarType(var->Type()); + if (type == framework::proto::VarType_Type_LOD_TENSOR) { + ch->AddToReceiveQ(referrer, var->GetMutable(), cond, + cb); + } else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) { + ch->AddToReceiveQ(referrer, var->GetMutable(), + cond, cb); + } else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) { + ch->AddToReceiveQ(referrer, var->GetMutable(), + cond, cb); + } else if (type == framework::proto::VarType_Type_SELECTED_ROWS) { + ch->AddToReceiveQ(referrer, var->GetMutable(), + cond, cb); + } else if (type == framework::proto::VarType_Type_READER) { + ch->AddToReceiveQ(referrer, var->GetMutable(), + cond, cb); + } else if (type == framework::proto::VarType_Type_CHANNEL) { + ch->AddToReceiveQ(referrer, var->GetMutable(), + cond, cb); + } else { + PADDLE_THROW("ChannelAddToReceiveQ:Unsupported type"); + } +} diff --git a/paddle/fluid/operators/concurrency/channel_util.h b/paddle/fluid/operators/concurrency/channel_util.h new file mode 100644 index 0000000000000000000000000000000000000000..c3674bd9815df451751707bfa84d18dbb5fa0f6b --- /dev/null +++ b/paddle/fluid/operators/concurrency/channel_util.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/channel.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace operators { +namespace concurrency { + +bool ChannelSend(framework::ChannelHolder *ch, framework::Variable *var); +bool ChannelReceive(framework::ChannelHolder *ch, framework::Variable *var); + +void ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer, + framework::Variable *var, + std::shared_ptr cond, + std::function cb); +void ChannelAddToReceiveQ(framework::ChannelHolder *ch, const void *referrer, + framework::Variable *var, + std::shared_ptr cond, + std::function cb); + +} // namespace concurrency +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index ff0fbf21f86269885df5491afab7443df813f13f..a32aba4c1ff2f5e775aeb41f25b02322dbc6a64a 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { @@ -27,6 +28,8 @@ using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; using DataLayout = platform::DataLayout; +template +using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = static_cast(1024) * 1024 * 1024; @@ -133,7 +136,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); // ------------------- cudnn conv forward --------------------- - T alpha = 1.0f, beta = 0.0f; + ScalingParamType alpha = 1.0f, beta = 0.0f; for (int i = 0; i < groups; i++) { PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward( handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, @@ -280,7 +283,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); // ------------------- cudnn conv backward data --------------------- - T alpha = 1.0f, beta = 0.0f; + ScalingParamType alpha = 1.0f, beta = 0.0f; if (input_grad) { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset input_grad. @@ -315,16 +318,18 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_KERNEL(conv2d, CUDNN, ::paddle::platform::CUDAPlace, +namespace plat = paddle::platform; +REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv2d_grad, CUDNN, ::paddle::platform::CUDAPlace, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel); +REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvGradOpKernel, paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL(conv3d, CUDNN, ::paddle::platform::CUDAPlace, +REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvOpKernel, paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv3d_grad, CUDNN, ::paddle::platform::CUDAPlace, +REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvGradOpKernel, paddle::operators::CUDNNConvGradOpKernel); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 4b02b80d7772fa15d2333692551da5e59d93765f..e3fc21c90f95469d646139a4454501d1c30bd51c 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -83,12 +83,23 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( } #endif + auto input_data_type = + framework::ToDataType(ctx.Input("Input")->type()); + auto filter_data_type = + framework::ToDataType(ctx.Input("Filter")->type()); + PADDLE_ENFORCE_EQ(input_data_type, filter_data_type, + "input and filter data type should be consistent"); + + if (input_data_type == framework::proto::VarType::FP16) { + PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN, + "float16 can only be used when CUDNN is used"); + } + std::string data_format = ctx.Attr("data_format"); // TODO(pzelazko-intel): enable MKLDNN layout when it's ready framework::DataLayout layout_ = framework::StringToDataLayout(data_format); - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), - layout_, library_); + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_, + library_); } Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker) diff --git a/paddle/fluid/operators/delete_var_op.cc b/paddle/fluid/operators/delete_var_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1fe9404c00335edbe3594486f8c403e69f2ab08f --- /dev/null +++ b/paddle/fluid/operators/delete_var_op.cc @@ -0,0 +1,52 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { +class DeleteVarOp : public framework::OperatorBase { + public: + DeleteVarOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + dev_ctx.Wait(); + + auto delete_var_names = Inputs("X"); + const_cast(scope).EraseVars(delete_var_names); + } +}; + +class DeleteVarOpInfoMaker : public framework::OpProtoAndCheckerMaker { + public: + DeleteVarOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of delete op").AsDuplicable(); + AddComment(R"DOC( +Delete Operator. +It should not be configured by users directly. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(delete_var, paddle::operators::DeleteVarOp, + paddle::framework::EmptyGradOpMaker, + paddle::operators::DeleteVarOpInfoMaker); diff --git a/paddle/fluid/operators/detail/bytebuffer_stream.cc b/paddle/fluid/operators/detail/bytebuffer_stream.cc index a9488156e073e515926240c9bb66d7b6edf8f82e..741dd51de9e75feb608161579e56cb160b058ebb 100644 --- a/paddle/fluid/operators/detail/bytebuffer_stream.cc +++ b/paddle/fluid/operators/detail/bytebuffer_stream.cc @@ -85,4 +85,4 @@ google::protobuf::int64 GrpcByteBufferSource::ByteCount() const { } // namespace detail } // namespace operators -} // namespace paddle \ No newline at end of file +} // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc index 7266f3276477891d3c7b6827316a428ef7a31c6e..ddeeebec58e02f1686fd2e3d3e5ac1a4c4fd3c59 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -97,7 +97,7 @@ bool RPCClient::AsyncGetVariable(const std::string& ep, return true; } -bool RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) { +void RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) { const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); @@ -108,8 +108,18 @@ bool RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) { auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, (void*)s); req_count_++; +} - return true; +void RPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) { + const auto ch = GetChannel(ep); + FetchBarrierProcessor* s = new FetchBarrierProcessor(ch); + s->Prepare(time_out); + + sendrecv::VariableMessage req; + req.set_varname(FETCH_BARRIER_MESSAGE); + auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, (void*)s); + req_count_++; } bool RPCClient::Wait() { @@ -154,7 +164,7 @@ bool RPCClient::Proceed() { PADDLE_ENFORCE(tag); // TODO(gongwb): add more retries. - ClientBase* c = static_cast(tag); + BaseProcessor* c = static_cast(tag); if (!c->status_.ok()) { LOG(ERROR) << "proc param error:" << c->var_h_.String() << " grpc error:" << c->status_.error_message(); @@ -174,6 +184,8 @@ std::shared_ptr RPCClient::GetChannel(const std::string& ep) { } grpc::ChannelArguments args; + args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 5000); + args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE); args.SetMaxSendMessageSize(std::numeric_limits::max()); args.SetMaxReceiveMessageSize(std::numeric_limits::max()); diff --git a/paddle/fluid/operators/detail/grpc_client.h b/paddle/fluid/operators/detail/grpc_client.h index 669838810de240358857300c1014aa622c17f808..f520367dd981288416631fdad15241fb5d811d07 100644 --- a/paddle/fluid/operators/detail/grpc_client.h +++ b/paddle/fluid/operators/detail/grpc_client.h @@ -52,14 +52,14 @@ struct VarHandle { void ProcGetResponse(const VarHandle& var_h, const sendrecv::VariableMessage& msg); -class ClientBase { +class BaseProcessor { public: - explicit ClientBase(std::shared_ptr ch) { + explicit BaseProcessor(std::shared_ptr ch) { stub_ = sendrecv::SendRecvService::NewStub(ch); context_ = NULL; } - virtual ~ClientBase() {} + virtual ~BaseProcessor() {} virtual void Prepare(const VarHandle& var_info, int64_t time_out) { context_.reset(new grpc::ClientContext()); @@ -91,9 +91,10 @@ class ClientBase { typedef std::function RequestSendCallBack; -class SendProcessor : public ClientBase { +class SendProcessor : public BaseProcessor { public: - explicit SendProcessor(std::shared_ptr ch) : ClientBase(ch) {} + explicit SendProcessor(std::shared_ptr ch) + : BaseProcessor(ch) {} virtual ~SendProcessor() {} @@ -110,9 +111,10 @@ class SendProcessor : public ClientBase { typedef std::function RequestGetCallBack; -class GetProcessor : public ClientBase { +class GetProcessor : public BaseProcessor { public: - explicit GetProcessor(std::shared_ptr ch) : ClientBase(ch) {} + explicit GetProcessor(std::shared_ptr ch) + : BaseProcessor(ch) {} virtual ~GetProcessor() {} @@ -126,10 +128,10 @@ class GetProcessor : public ClientBase { RequestGetCallBack response_call_back_ = ProcGetResponse; }; -class BatchBarrierProcessor : public ClientBase { +class BatchBarrierProcessor : public BaseProcessor { public: explicit BatchBarrierProcessor(std::shared_ptr ch) - : ClientBase(ch) {} + : BaseProcessor(ch) {} virtual ~BatchBarrierProcessor() {} @@ -137,6 +139,17 @@ class BatchBarrierProcessor : public ClientBase { sendrecv::VoidMessage reply_; }; +class FetchBarrierProcessor : public BaseProcessor { + public: + explicit FetchBarrierProcessor(std::shared_ptr ch) + : BaseProcessor(ch) {} + + virtual ~FetchBarrierProcessor() {} + + virtual void Process() {} + sendrecv::VariableMessage reply_; +}; + class RPCClient { public: bool AsyncSendVariable(const std::string& ep, @@ -151,7 +164,10 @@ class RPCClient { const std::string& var_name, int64_t time_out = 600 * 1000); - bool AsyncSendBatchBarrier(const std::string& ep, + void AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out = 600 * 1000); + + void AsyncSendFetchBarrier(const std::string& ep, int64_t time_out = 600 * 1000); bool Wait(); diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index 2a567516614aac39da9d069824f598ed26c5fb0c..8fff430cc4890925e4edba2fadb8eb7fc647d181 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -84,7 +84,7 @@ class RequestGet final : public RequestBase { explicit RequestGet(sendrecv::SendRecvService::AsyncService* service, grpc::ServerCompletionQueue* cq, framework::Scope* scope, const platform::DeviceContext* dev_ctx, - SimpleBlockQueue* queue) + SimpleBlockQueue* queue) : RequestBase(service, cq), responder_(&ctx_), scope_(scope), @@ -101,11 +101,16 @@ class RequestGet final : public RequestBase { // proc request. std::string var_name = request_.varname(); auto* var = scope_->FindVar(var_name); - SerializeToMessage(var_name, var, *dev_ctx_, &reply_); + if (var_name != FETCH_BARRIER_MESSAGE) { + SerializeToMessage(var_name, var, *dev_ctx_, &reply_); + } // TODO(gongwb): check var's info. responder_.Finish(reply_, grpc::Status::OK, this); status_ = FINISH; - queue_->Push('c'); + MessageWithName msg_with_name = + // request name reply + std::make_pair(var_name, std::move(reply_)); + queue_->Push(msg_with_name); } protected: @@ -114,12 +119,16 @@ class RequestGet final : public RequestBase { ServerAsyncResponseWriter responder_; framework::Scope* scope_; const platform::DeviceContext* dev_ctx_; - SimpleBlockQueue* queue_; + SimpleBlockQueue* queue_; }; void AsyncGRPCServer::WaitClientGet(int count) { - for (int i = 0; i < count; ++i) { - var_get_queue_.Pop(); + int fetch_barriers = 0; + while (fetch_barriers < count) { + auto msg = var_get_queue_.Pop(); + if (msg.first == FETCH_BARRIER_MESSAGE) { + fetch_barriers++; + } } } diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h index e9402ff6aafe73684b57442c03ee4c573a902a46..b6666bcf96e484b0b17b935c0efb2930f19b19f2 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -77,7 +77,7 @@ class AsyncGRPCServer final : public sendrecv::SendRecvService::Service { const platform::DeviceContext *dev_ctx_; // received variable from RPC, operators fetch variable from this queue. SimpleBlockQueue var_recv_queue_; - SimpleBlockQueue var_get_queue_; + SimpleBlockQueue var_get_queue_; // condition of the sub program std::mutex barrier_mutex_; diff --git a/paddle/fluid/operators/detail/safe_ref.h b/paddle/fluid/operators/detail/safe_ref.h index 9cb5851deba6b16261d4499afcfb867d9d706498..48bdce740878ea486eda6821dc29885a3e480114 100644 --- a/paddle/fluid/operators/detail/safe_ref.h +++ b/paddle/fluid/operators/detail/safe_ref.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include "paddle/fluid/platform/enforce.h" + namespace paddle { namespace operators { namespace detail { diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc index f196fc9862d2374583d50820a6c3b63c866bf048..39117eeeb611b025c426938c60ddf82c6af232ca 100644 --- a/paddle/fluid/operators/detail/sendrecvop_utils.cc +++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc @@ -82,7 +82,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, DestroyCallback destroy_callback = [](void* backing) {}; void* buf = malloc(1024); - void* payload; + void* payload = nullptr; size_t payload_size; ProtoEncodeHelper e((char*)buf, 1024); e.WriteString(VarMsg::kVarnameFieldNumber, name); @@ -297,4 +297,4 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, } // namespace detail } // namespace operators -} // namespace paddle \ No newline at end of file +} // namespace paddle diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.h b/paddle/fluid/operators/detail/sendrecvop_utils.h index 5208091e54b4da2bb0265f84827ce23b57e954dc..4fa6aefd3e0b1bd45ac52b1eff3b29126d79f03a 100644 --- a/paddle/fluid/operators/detail/sendrecvop_utils.h +++ b/paddle/fluid/operators/detail/sendrecvop_utils.h @@ -32,6 +32,7 @@ namespace detail { #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" +#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" typedef void (*DestroyCallback)(void*); diff --git a/paddle/fluid/operators/detection_map_op.h b/paddle/fluid/operators/detection_map_op.h index a009e9dfce130bfd6c506c71b58a418a569bdf7a..8c15bfa36bfe72586cfcbdbd8efc4542253adaca 100644 --- a/paddle/fluid/operators/detection_map_op.h +++ b/paddle/fluid/operators/detection_map_op.h @@ -273,7 +273,6 @@ class DetectionMAPOpKernel : public framework::OpKernel { std::map>>& true_pos, std::map>>& false_pos, const int class_num) const { - constexpr T kEPS = static_cast(1e-6); const int* pos_count_data = input_pos_count.data(); for (int i = 0; i < class_num; ++i) { label_pos_count[i] = pos_count_data[i]; @@ -282,12 +281,11 @@ class DetectionMAPOpKernel : public framework::OpKernel { auto SetData = [](const framework::LoDTensor& pos_tensor, std::map>>& pos) { const T* pos_data = pos_tensor.data(); - auto pos_data_lod = pos_tensor.lod(); - for (size_t i = 0; i < pos_data_lod.size(); ++i) { - for (size_t j = pos_data_lod[0][i]; j < pos_data_lod[0][i + 1]; ++j) { + auto pos_data_lod = pos_tensor.lod()[0]; + for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) { + for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) { T score = pos_data[j * 2]; - int flag = 1; - if (pos_data[j * 2 + 1] < kEPS) flag = 0; + int flag = pos_data[j * 2 + 1]; pos[i].push_back(std::make_pair(score, flag)); } } diff --git a/paddle/fluid/operators/detection_output_op.cc b/paddle/fluid/operators/detection_output_op.cc deleted file mode 100644 index f7520475917ff23535f11ccfde0ee915112bba30..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detection_output_op.cc +++ /dev/null @@ -1,89 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -Indicesou may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/detection_output_op.h" -namespace paddle { -namespace operators { - -class DetectionOutputOpMaker : public framework::OpProtoAndCheckerMaker { - public: - DetectionOutputOpMaker(OpProto* proto, OpAttrChecker* op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("Loc", - "(Tensor) The input tensor of detection_output operator." - "The input predict locations" - "The format of input tensor is kNCHW. Where K is priorbox point " - "numbers," - "N is How many boxes are there on each point, " - "C is 4, H and W both are 1."); - AddInput("Conf", - "(Tensor) The input tensor of detection_output operator." - "The input priorbox confidence." - "The format of input tensor is kNCHW. Where K is priorbox point " - "numbers," - "N is How many boxes are there on each point, " - "C is the number of classes, H and W both are 1."); - AddInput("PriorBox", - "(Tensor) The input tensor of detection_output operator." - "The format of input tensor is the position and variance " - "of the boxes"); - AddOutput("Out", - "(Tensor) The output tensor of detection_output operator."); - AddAttr("background_label_id", "(int), The background class index."); - AddAttr("num_classes", "(int), The number of the classification."); - AddAttr("nms_threshold", - "(float), The Non-maximum suppression threshold."); - AddAttr("confidence_threshold", - "(float), The classification confidence threshold."); - AddAttr("top_k", "(int), The bbox number kept of the layer’s output."); - AddAttr("nms_top_k", - "(int), The bbox number kept of the NMS’s output."); - AddComment(R"DOC( - detection output for SSD(single shot multibox detector) - Apply the NMS to the output of network and compute the predict - bounding box location. The output’s shape of this layer could - be zero if there is no valid bounding box. - )DOC"); - } -}; - -class DetectionOutputOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Loc"), - "Input(X) of DetectionOutputOp" - "should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Conf"), - "Input(X) of DetectionOutputOp" - "should not be null."); - PADDLE_ENFORCE(ctx->HasInput("PriorBox"), - "Input(X) of DetectionOutputOp" - "should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of DetectionOutputOp should not be null."); - std::vector output_shape({1, 7}); - ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(detection_output, ops::DetectionOutputOp, - ops::DetectionOutputOpMaker); -REGISTER_OP_CPU_KERNEL( - detection_output, - ops::DetectionOutputKernel, - ops::DetectionOutputKernel); diff --git a/paddle/fluid/operators/detection_output_op.cu.cc b/paddle/fluid/operators/detection_output_op.cu.cc deleted file mode 100644 index 0f48765c9c67c1d3fa32b19d5e87b2acaa3c486a..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detection_output_op.cu.cc +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -Indicesou may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/detection_output_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - detection_output, - ops::DetectionOutputKernel, - ops::DetectionOutputKernel); diff --git a/paddle/fluid/operators/detection_output_op.h b/paddle/fluid/operators/detection_output_op.h deleted file mode 100644 index af9081c93436776b6ca6ee7139e340054111e440..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detection_output_op.h +++ /dev/null @@ -1,167 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - Indicesou may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/math/detection_util.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/operators/math/softmax.h" -#include "paddle/fluid/operators/strided_memcpy.h" -namespace paddle { -namespace operators { -template -inline void transpose_fun(const framework::ExecutionContext& context, - const framework::Tensor& src, - framework::Tensor* dst) { - int input_nums = src.dims()[0]; - int offset = 0; - for (int j = 0; j < input_nums; ++j) { - framework::Tensor in_p_tensor = src.Slice(j, j + 1); - std::vector shape_vec( - {in_p_tensor.dims()[0], in_p_tensor.dims()[1], in_p_tensor.dims()[3], - in_p_tensor.dims()[4], in_p_tensor.dims()[2]}); - framework::DDim shape(framework::make_ddim(shape_vec)); - framework::Tensor in_p_tensor_transpose; - in_p_tensor_transpose.mutable_data(shape, context.GetPlace()); - std::vector shape_axis({0, 1, 3, 4, 2}); - math::Transpose trans5; - trans5(context.template device_context(), in_p_tensor, - &in_p_tensor_transpose, shape_axis); - auto dst_stride = framework::stride(dst->dims()); - auto src_stride = framework::stride(in_p_tensor_transpose.dims()); - StridedMemcpy(context.device_context(), in_p_tensor_transpose.data(), - src_stride, in_p_tensor_transpose.dims(), dst_stride, - dst->data() + offset); - offset += in_p_tensor_transpose.dims()[4] * src_stride[4]; - } -} -template -class DetectionOutputKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const framework::Tensor* in_loc = context.Input("Loc"); - const framework::Tensor* in_conf = context.Input("Conf"); - const framework::Tensor* in_priorbox = - context.Input("PriorBox"); - auto* out = context.Output("Out"); - int num_classes = context.template Attr("num_classes"); - int top_k = context.template Attr("top_k"); - int nms_top_k = context.template Attr("nms_top_k"); - int background_label_id = context.template Attr("background_label_id"); - float nms_threshold = context.template Attr("nms_threshold"); - float confidence_threshold = - context.template Attr("confidence_threshold"); - size_t batch_size = in_conf->dims()[1]; - int conf_sum_size = in_conf->numel(); - // for softmax - std::vector conf_shape_softmax_vec( - {conf_sum_size / num_classes, num_classes}); - framework::DDim conf_shape_softmax( - framework::make_ddim(conf_shape_softmax_vec)); - // for knchw => nhwc - std::vector loc_shape_vec({1, in_loc->dims()[1], in_loc->dims()[3], - in_loc->dims()[4], - in_loc->dims()[2] * in_loc->dims()[0]}); - std::vector conf_shape_vec( - {1, in_conf->dims()[1], in_conf->dims()[3], in_conf->dims()[4], - in_conf->dims()[2] * in_conf->dims()[0]}); - framework::DDim loc_shape(framework::make_ddim(loc_shape_vec)); - framework::DDim conf_shape(framework::make_ddim(conf_shape_vec)); - framework::Tensor loc_tensor; - framework::Tensor conf_tensor; - loc_tensor.mutable_data(loc_shape, context.GetPlace()); - conf_tensor.mutable_data(conf_shape, context.GetPlace()); - // for cpu - framework::Tensor loc_cpu; - framework::Tensor conf_cpu; - framework::Tensor priorbox_cpu; - const T* priorbox_data = in_priorbox->data(); - transpose_fun(context, *in_loc, &loc_tensor); - transpose_fun(context, *in_conf, &conf_tensor); - conf_tensor.Resize(conf_shape_softmax); - math::SoftmaxFunctor()( - context.template device_context(), &conf_tensor, - &conf_tensor); - T* loc_data = loc_tensor.data(); - T* conf_data = conf_tensor.data(); - if (platform::is_gpu_place(context.GetPlace())) { - loc_cpu.mutable_data(loc_tensor.dims(), platform::CPUPlace()); - framework::TensorCopy(loc_tensor, platform::CPUPlace(), - context.device_context(), &loc_cpu); - loc_data = loc_cpu.data(); - conf_cpu.mutable_data(conf_tensor.dims(), platform::CPUPlace()); - framework::TensorCopy(conf_tensor, platform::CPUPlace(), - context.device_context(), &conf_cpu); - conf_data = conf_cpu.data(); - priorbox_cpu.mutable_data(in_priorbox->dims(), platform::CPUPlace()); - framework::TensorCopy(*in_priorbox, platform::CPUPlace(), - context.device_context(), &priorbox_cpu); - priorbox_data = priorbox_cpu.data(); - } - // get decode bboxes - size_t num_priors = in_priorbox->numel() / 8; - std::vector>> all_decoded_bboxes; - for (size_t n = 0; n < batch_size; ++n) { - std::vector> decoded_bboxes; - for (size_t i = 0; i < num_priors; ++i) { - size_t prior_offset = i * 8; - size_t loc_pred_offset = n * num_priors * 4 + i * 4; - std::vector> prior_bbox_vec; - math::GetBBoxFromPriorData(priorbox_data + prior_offset, 1, - prior_bbox_vec); - std::vector> prior_bbox_var; - math::GetBBoxVarFromPriorData(priorbox_data + prior_offset, 1, - prior_bbox_var); - std::vector loc_pred_data; - for (size_t j = 0; j < 4; ++j) - loc_pred_data.push_back(*(loc_data + loc_pred_offset + j)); - math::BBox bbox = math::DecodeBBoxWithVar( - prior_bbox_vec[0], prior_bbox_var[0], loc_pred_data); - decoded_bboxes.push_back(bbox); - } - all_decoded_bboxes.push_back(decoded_bboxes); - } - std::vector>> all_indices; - int num_kept = math::GetDetectionIndices( - conf_data, num_priors, num_classes, background_label_id, batch_size, - confidence_threshold, nms_top_k, nms_threshold, top_k, - all_decoded_bboxes, &all_indices); - - if (num_kept <= 0) { - std::vector out_shape_vec({0, 0}); - framework::DDim out_shape(framework::make_ddim(out_shape_vec)); - out->Resize(out_shape); - return; - } - std::vector out_shape_vec({num_kept, 7}); - framework::DDim out_shape(framework::make_ddim(out_shape_vec)); - out->mutable_data(out_shape, context.GetPlace()); - framework::Tensor out_cpu; - T* out_data = out->data(); - if (platform::is_gpu_place(context.GetPlace())) { - out_cpu.mutable_data(out->dims(), platform::CPUPlace()); - out_data = out_cpu.data(); - } - math::GetDetectionOutput(conf_data, num_kept, num_priors, num_classes, - batch_size, all_indices, all_decoded_bboxes, - out_data); - if (platform::is_gpu_place(context.GetPlace())) { - framework::TensorCopy(out_cpu, platform::CUDAPlace(), - context.device_context(), out); - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/elementwise_add_op.cc b/paddle/fluid/operators/elementwise_add_op.cc index e9068fcd50ba9309a37939788ca8f67f1f6e25cd..4aab54f60236ecc5fa7f70e22f1553c3bfe68198 100644 --- a/paddle/fluid/operators/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise_add_op.cc @@ -29,8 +29,11 @@ class ElementwiseAddOpMaker : public ElementwiseOpMaker { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(elementwise_add, ops::ElementwiseOp, ops::ElementwiseAddOpMaker, - elementwise_add_grad, ops::ElementwiseOpGrad); +REGISTER_OPERATOR(elementwise_add, ops::ElementwiseOp, + ops::ElementwiseAddOpMaker, ops::ElementwiseOpInferVarType, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(elementwise_add_grad, ops::ElementwiseOpGrad); + REGISTER_OP_CPU_KERNEL( elementwise_add, ops::ElementwiseAddKernel, diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h index fe31bbaed44fced68b7b51dd2c2031950ec4247d..f04d8d8fd82ed2336dff9c5b88808dc32de6630a 100644 --- a/paddle/fluid/operators/elementwise_op.h +++ b/paddle/fluid/operators/elementwise_op.h @@ -41,6 +41,16 @@ class ElementwiseOp : public framework::OperatorWithKernel { } }; +class ElementwiseOpInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto x_var = op_desc.Input("X")[0]; + auto out_var = op_desc.Output("Out")[0]; + block->Var(out_var)->SetType(block->Var(x_var)->GetType()); + } +}; + class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { public: ElementwiseOpMaker(OpProto* proto, OpAttrChecker* op_checker) diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 8e9923c87ce22ed229f78ef15430e50cab16c947..4253300788462a3704076fc79241a864f2f130a0 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -128,8 +128,8 @@ class ListenAndServOp : public framework::OperatorBase { } } if (exit_flag) { - rpc_service_->ShutDown(); rpc_service_->SetCond(1); + rpc_service_->ShutDown(); break; } try { @@ -148,7 +148,7 @@ class ListenAndServOp : public framework::OperatorBase { } rpc_service_->SetCond(1); // FIXME(typhoonzero): use another condition to sync wait clients get. - rpc_service_->WaitClientGet(ins.size()); + rpc_service_->WaitClientGet(fan_in); sparse_vars.clear(); } // while(true) } diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index 3acdca17afc2fea05fb81871e6e03d72691fe91e..50eeadab72e71f39325c5eda69e9a3c3e6517d7d 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -33,8 +33,16 @@ class LookupTableOp : public framework::OperatorWithKernel { auto table_dims = ctx->GetInputDim("W"); auto ids_dims = ctx->GetInputDim("Ids"); - PADDLE_ENFORCE_EQ(ids_dims.size(), 2); - PADDLE_ENFORCE_EQ(ids_dims[1], 1); + auto ids_var_type = ctx->GetInputsVarType("Ids").front(); + // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type + // is LoDTensor, this tensor contains the ids to be looked up in W + // and it must be a column vector with rank = 2 while the 2nd dimension + // size must be 1, when Ids's type is SelectedRows, the rows of Ids + // contains the ids to be looked up in W; + if (ids_var_type == framework::proto::VarType::LOD_TENSOR) { + PADDLE_ENFORCE_EQ(ids_dims.size(), 2); + PADDLE_ENFORCE_EQ(ids_dims[1], 1); + } ctx->SetOutputDim("Out", {ids_dims[0], table_dims[1]}); ctx->ShareLoD("Ids", /*->*/ "Out"); @@ -54,17 +62,22 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { LookupTableOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("W", - "An input represents embedding tensors, " + "(Tensor) The input represents embedding tensors, " "which is a learnable parameter."); - AddInput("Ids", - "An input with type int32 or int64 " - "contains the ids to be looked up in W. " - "Ids must be a column vector with rank = 2. " - "The 2nd dimension size must be 1."); - AddOutput("Out", "The lookup results, which have the same type as W."); + AddInput( + "Ids", + "(Tensor or SelectedRows) Ids's type can be Tensor or " + "SelectedRows, when Ids's type is Tensor, this tensor contains " + "the ids to be looked up in W and it must be a column vector with " + "rank = 2 while the 2nd dimension size must be 1; when Ids's type is " + "SelectedRows, the rows of Ids contains the ids to be looked up " + "in W."); + AddOutput("Out", + "(Tensor or SelectedRows) The lookup results, which have the " + "same type as W."); AddAttr("is_sparse", "(boolean, default false) " - "Sparse update") + "Sparse update.") .SetDefault(false); AddAttr("padding_idx", "(int64, default -1) " @@ -76,10 +89,15 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { Lookup Table Operator. This operator is used to perform lookups on the parameter W, -then concatenated into a dense tensor. +then concatenated into a dense or sparse tensor. + +The type of Ids(Input) is SelectedRows, Tensor or LoDTensor, when Ids's +type is SelectedRows, the rows of Ids contains the ids to be looked up in W; +when Ids's type is Tensor, this tensor contains the ids to be looked up in W +and it must be a column vector with rank = 2 while the 2nd dimension size must be 1, +at this time, Ids can carry the LoD (Level of Details) information, or not, and +the output only shares the LoD information with input Ids. -The input Ids can carry the LoD (Level of Details) information, -or not. And the output only shares the LoD information with input Ids. )DOC"); } diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 923340f46102d888f549c79684ae0ae2f78ed038..6d81fccd2059c511f71d403229e04587e553e93d 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -74,14 +74,32 @@ class LookupTableCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* table_t = context.Input("W"); - auto* ids_t = context.Input("Ids"); - auto* output_t = context.Output("Out"); int64_t padding_idx = context.Attr("padding_idx"); + auto* ids_var = context.InputVar("Ids"); + Tensor* output_t = context.Output("Out"); + + int64_t* ids; + int64_t K; + + // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type + // is LoDTensor, this tensor contains the ids to be looked up in W; + // when Ids's type is SelectedRows, the rows of Ids contains the + // ids to be looked up in W. + if (ids_var->IsType()) { + auto* ids_t = context.Input("Ids"); + ids = const_cast(ids_t->data()); + K = ids_t->numel(); + } else if (ids_var->IsType()) { + auto* ids_t = context.Input("Ids"); + ids = const_cast(ids_t->rows().CUDAData(context.GetPlace())); + K = ids_t->rows().size(); + output_t->Resize({K, table_t->dims()[1]}); + } else { + PADDLE_THROW("Unsupported Variable Type of Ids"); + } size_t N = table_t->dims()[0]; size_t D = table_t->dims()[1]; - size_t K = ids_t->numel(); - auto* ids = ids_t->data(); auto* table = table_t->data(); auto* output = output_t->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index d88b034e919f1127ac3c424e87e4a5f81a598dc8..c92ce78eeffb8f1517e61c6d6624d406e04d974d 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -22,6 +22,7 @@ limitations under the License. */ namespace paddle { namespace operators { +using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; using SelectedRows = framework::SelectedRows; @@ -29,25 +30,45 @@ template class LookupTableKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* table_t = context.Input("W"); // float tensor - auto* ids_t = context.Input("Ids"); // int tensor - auto* output_t = context.Output("Out"); // float tensor + auto* table_t = context.Input("W"); + auto* ids_var = context.InputVar("Ids"); + Tensor* output_t = context.Output("Out"); + + int64_t* ids; + int64_t ids_numel; + + // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type + // is LoDTensor, this tensor contains the ids to be looked up in W; + // when Ids's type is SelectedRows, the rows of Ids contains the + // ids to be looked up in W. + if (ids_var->IsType()) { + auto* ids_t = context.Input("Ids"); + ids = const_cast(ids_t->data()); + ids_numel = ids_t->numel(); + } else if (ids_var->IsType()) { + auto* ids_t = context.Input("Ids"); + ids = const_cast(ids_t->rows().data()); + ids_numel = ids_t->rows().size(); + output_t->Resize({ids_numel, table_t->dims()[1]}); + } else { + PADDLE_THROW("Unsupported Variable Type of Ids"); + } + int64_t padding_idx = context.Attr("padding_idx"); int N = table_t->dims()[0]; int D = table_t->dims()[1]; - auto* ids = ids_t->data(); auto* table = table_t->data(); auto* output = output_t->mutable_data(context.GetPlace()); if (padding_idx == -1) { - for (int64_t i = 0; i < ids_t->numel(); ++i) { + for (int64_t i = 0; i < ids_numel; ++i) { PADDLE_ENFORCE_LT(ids[i], N); PADDLE_ENFORCE_GE(ids[i], 0); memcpy(output + i * D, table + ids[i] * D, D * sizeof(T)); } } else { - for (int64_t i = 0; i < ids_t->numel(); ++i) { + for (int64_t i = 0; i < ids_numel; ++i) { if (ids[i] == padding_idx) { memset(output + i * D, 0, D * sizeof(T)); } else { diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index b0c213d637c244e4cbacbe75218537973efed047..692e85dcffa583abcb22a1629953badc67489efa 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -36,7 +36,7 @@ struct LRNFunctor { auto e_x = framework::EigenTensor::From(input); for (int m = 0; m < N; m++) { for (int i = 0; i < C; i++) { - for (int c = start; c <= end; c++) { + for (int c = start; c < end; c++) { int ch = i + c; if (ch >= 0 && ch < C) { auto s = e_mid.slice(Eigen::array({{m, i, 0, 0}}), @@ -92,7 +92,7 @@ struct LRNGradFunctor { Eigen::array({{1, 1, H, W}})); i_x_g = i_mid.pow(-beta) * i_out_g; - for (int c = start; c <= end; c++) { + for (int c = start; c < end; c++) { int ch = i + c; if (ch < 0 || ch >= C) { continue; diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index a181d802262d15b188060dae4330cec0e24714ab..fba1612d10f0494f4ab06fabdd0e799a74dafd53 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -38,7 +38,7 @@ math_library(lstm_compute DEPS activation_functions) math_library(math_function DEPS cblas) math_library(maxouting) math_library(pooling) -math_library(selected_rows_functor DEPS selected_rows) +math_library(selected_rows_functor DEPS selected_rows math_function) math_library(sequence2batch) math_library(sequence_padding) math_library(sequence_pooling DEPS math_function) diff --git a/paddle/fluid/operators/math/detection_util.h b/paddle/fluid/operators/math/detection_util.h deleted file mode 100644 index c31764cfaf5bbdfea2f3ed06f31f97965a8858ed..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/math/detection_util.h +++ /dev/null @@ -1,300 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once -#include -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/platform/device_context.h" - -namespace paddle { -namespace operators { -namespace math { -template -struct BBox { - BBox(T x_min, T y_min, T x_max, T y_max) - : x_min(x_min), - y_min(y_min), - x_max(x_max), - y_max(y_max), - is_difficult(false) {} - - BBox() {} - - T get_width() const { return x_max - x_min; } - - T get_height() const { return y_max - y_min; } - - T get_center_x() const { return (x_min + x_max) / 2; } - - T get_center_y() const { return (y_min + y_max) / 2; } - - T get_area() const { return get_width() * get_height(); } - - // coordinate of bounding box - T x_min; - T y_min; - T x_max; - T y_max; - // whether difficult object (e.g. object with heavy occlusion is difficult) - bool is_difficult; -}; -// KNCHW ==> NHWC -// template -template -void GetBBoxFromPriorData(const T* prior_data, const size_t num_bboxes, - std::vector>& bbox_vec); -template -void GetBBoxVarFromPriorData(const T* prior_data, const size_t num, - std::vector>& var_vec); -template -BBox DecodeBBoxWithVar(BBox& prior_bbox, - const std::vector& prior_bbox_var, - const std::vector& loc_pred_data); -template -bool SortScorePairDescend(const std::pair& pair1, - const std::pair& pair2); -template -bool SortScorePairDescend(const std::pair>& pair1, - const std::pair>& pair2); -template -T jaccard_overlap(const BBox& bbox1, const BBox& bbox2); - -template -void ApplyNmsFast(const std::vector>& bboxes, const T* conf_score_data, - size_t class_idx, size_t top_k, T conf_threshold, - T nms_threshold, size_t num_priors, size_t num_classes, - std::vector* indices); -template -int GetDetectionIndices( - const T* conf_data, const size_t num_priors, const size_t num_classes, - const size_t background_label_id, const size_t batch_size, - const T conf_threshold, const size_t nms_top_k, const T nms_threshold, - const size_t top_k, - const std::vector>>& all_decoded_bboxes, - std::vector>>* all_detection_indices); -template -BBox ClipBBox(const BBox& bbox); -template -void GetDetectionOutput( - const T* conf_data, const size_t num_kept, const size_t num_priors, - const size_t num_classes, const size_t batch_size, - const std::vector>>& all_indices, - const std::vector>>& all_decoded_bboxes, T* out_data); -template -void GetBBoxFromPriorData(const T* prior_data, const size_t num_bboxes, - std::vector>& bbox_vec) { - size_t out_offset = bbox_vec.size(); - bbox_vec.resize(bbox_vec.size() + num_bboxes); - for (size_t i = 0; i < num_bboxes; ++i) { - BBox bbox; - bbox.x_min = *(prior_data + i * 8); - bbox.y_min = *(prior_data + i * 8 + 1); - bbox.x_max = *(prior_data + i * 8 + 2); - bbox.y_max = *(prior_data + i * 8 + 3); - bbox_vec[out_offset + i] = bbox; - } -} -template -void GetBBoxVarFromPriorData(const T* prior_data, const size_t num, - std::vector>& var_vec) { - size_t out_offset = var_vec.size(); - var_vec.resize(var_vec.size() + num); - for (size_t i = 0; i < num; ++i) { - std::vector var; - var.push_back(*(prior_data + i * 8 + 4)); - var.push_back(*(prior_data + i * 8 + 5)); - var.push_back(*(prior_data + i * 8 + 6)); - var.push_back(*(prior_data + i * 8 + 7)); - var_vec[out_offset + i] = var; - } -} -template -BBox DecodeBBoxWithVar(BBox& prior_bbox, - const std::vector& prior_bbox_var, - const std::vector& loc_pred_data) { - T prior_bbox_width = prior_bbox.get_width(); - T prior_bbox_height = prior_bbox.get_height(); - T prior_bbox_center_x = prior_bbox.get_center_x(); - T prior_bbox_center_y = prior_bbox.get_center_y(); - - T decoded_bbox_center_x = - prior_bbox_var[0] * loc_pred_data[0] * prior_bbox_width + - prior_bbox_center_x; - T decoded_bbox_center_y = - prior_bbox_var[1] * loc_pred_data[1] * prior_bbox_height + - prior_bbox_center_y; - T decoded_bbox_width = - std::exp(prior_bbox_var[2] * loc_pred_data[2]) * prior_bbox_width; - T decoded_bbox_height = - std::exp(prior_bbox_var[3] * loc_pred_data[3]) * prior_bbox_height; - - BBox decoded_bbox; - decoded_bbox.x_min = decoded_bbox_center_x - decoded_bbox_width / 2; - decoded_bbox.y_min = decoded_bbox_center_y - decoded_bbox_height / 2; - decoded_bbox.x_max = decoded_bbox_center_x + decoded_bbox_width / 2; - decoded_bbox.y_max = decoded_bbox_center_y + decoded_bbox_height / 2; - - return decoded_bbox; -} -template -bool SortScorePairDescend(const std::pair& pair1, - const std::pair& pair2) { - return pair1.first > pair2.first; -} -template -T jaccard_overlap(const BBox& bbox1, const BBox& bbox2) { - if (bbox2.x_min > bbox1.x_max || bbox2.x_max < bbox1.x_min || - bbox2.y_min > bbox1.y_max || bbox2.y_max < bbox1.y_min) { - return 0.0; - } else { - T inter_x_min = std::max(bbox1.x_min, bbox2.x_min); - T inter_y_min = std::max(bbox1.y_min, bbox2.y_min); - T interX_max = std::min(bbox1.x_max, bbox2.x_max); - T interY_max = std::min(bbox1.y_max, bbox2.y_max); - - T inter_width = interX_max - inter_x_min; - T inter_height = interY_max - inter_y_min; - T inter_area = inter_width * inter_height; - - T bbox_area1 = bbox1.get_area(); - T bbox_area2 = bbox2.get_area(); - - return inter_area / (bbox_area1 + bbox_area2 - inter_area); - } -} - -template -void ApplyNmsFast(const std::vector>& bboxes, const T* conf_score_data, - size_t class_idx, size_t top_k, T conf_threshold, - T nms_threshold, size_t num_priors, size_t num_classes, - std::vector* indices) { - std::vector> scores; - for (size_t i = 0; i < num_priors; ++i) { - size_t conf_offset = i * num_classes + class_idx; - if (conf_score_data[conf_offset] > conf_threshold) - scores.push_back(std::make_pair(conf_score_data[conf_offset], i)); - } - std::stable_sort(scores.begin(), scores.end(), - SortScorePairDescend); - if (top_k > 0 && top_k < scores.size()) scores.resize(top_k); - while (scores.size() > 0) { - const size_t idx = scores.front().second; - bool keep = true; - for (size_t i = 0; i < indices->size(); ++i) { - if (keep) { - const size_t saved_idx = (*indices)[i]; - T overlap = jaccard_overlap(bboxes[idx], bboxes[saved_idx]); - keep = overlap <= nms_threshold; - } else { - break; - } - } - if (keep) indices->push_back(idx); - scores.erase(scores.begin()); - } -} -template -int GetDetectionIndices( - const T* conf_data, const size_t num_priors, const size_t num_classes, - const size_t background_label_id, const size_t batch_size, - const T conf_threshold, const size_t nms_top_k, const T nms_threshold, - const size_t top_k, - const std::vector>>& all_decoded_bboxes, - std::vector>>* all_detection_indices) { - int total_keep_num = 0; - for (size_t n = 0; n < batch_size; ++n) { - const std::vector>& decoded_bboxes = all_decoded_bboxes[n]; - size_t num_detected = 0; - std::map> indices; - size_t conf_offset = n * num_priors * num_classes; - for (size_t c = 0; c < num_classes; ++c) { - if (c == background_label_id) continue; - ApplyNmsFast(decoded_bboxes, conf_data + conf_offset, c, nms_top_k, - conf_threshold, nms_threshold, num_priors, num_classes, - &(indices[c])); - num_detected += indices[c].size(); - } - if (top_k > 0 && num_detected > top_k) { - // std::vector> score_index_pairs; - std::vector>> score_index_pairs; - for (size_t c = 0; c < num_classes; ++c) { - const std::vector& label_indices = indices[c]; - for (size_t i = 0; i < label_indices.size(); ++i) { - size_t idx = label_indices[i]; - score_index_pairs.push_back( - std::make_pair((conf_data + conf_offset)[idx * num_classes + c], - std::make_pair(c, idx))); - } - } - std::sort(score_index_pairs.begin(), score_index_pairs.end(), - SortScorePairDescend>); - score_index_pairs.resize(top_k); - std::map> new_indices; - for (size_t i = 0; i < score_index_pairs.size(); ++i) { - size_t label = score_index_pairs[i].second.first; - size_t idx = score_index_pairs[i].second.second; - new_indices[label].push_back(idx); - } - all_detection_indices->push_back(new_indices); - total_keep_num += top_k; - } else { - all_detection_indices->push_back(indices); - total_keep_num += num_detected; - } - } - return total_keep_num; -} -template -BBox ClipBBox(const BBox& bbox) { - T one = static_cast(1.0); - T zero = static_cast(0.0); - BBox clipped_bbox; - clipped_bbox.x_min = std::max(std::min(bbox.x_min, one), zero); - clipped_bbox.y_min = std::max(std::min(bbox.y_min, one), zero); - clipped_bbox.x_max = std::max(std::min(bbox.x_max, one), zero); - clipped_bbox.y_max = std::max(std::min(bbox.y_max, one), zero); - return clipped_bbox; -} -template -void GetDetectionOutput( - const T* conf_data, const size_t num_kept, const size_t num_priors, - const size_t num_classes, const size_t batch_size, - const std::vector>>& all_indices, - const std::vector>>& all_decoded_bboxes, T* out_data) { - size_t count = 0; - for (size_t n = 0; n < batch_size; ++n) { - for (std::map>::const_iterator it = - all_indices[n].begin(); - it != all_indices[n].end(); ++it) { - size_t label = it->first; - const std::vector& indices = it->second; - const std::vector>& decoded_bboxes = all_decoded_bboxes[n]; - for (size_t i = 0; i < indices.size(); ++i) { - size_t idx = indices[i]; - size_t conf_offset = n * num_priors * num_classes + idx * num_classes; - out_data[count * 7] = n; - out_data[count * 7 + 1] = label; - out_data[count * 7 + 2] = (conf_data + conf_offset)[label]; - BBox clipped_bbox = ClipBBox(decoded_bboxes[idx]); - out_data[count * 7 + 3] = clipped_bbox.x_min; - out_data[count * 7 + 4] = clipped_bbox.y_min; - out_data[count * 7 + 5] = clipped_bbox.x_max; - out_data[count * 7 + 6] = clipped_bbox.y_max; - ++count; - } - } - } -} -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index 36655508be2ea9e748333171073c7dc258de52f2..3abbcdb71d03eaf6f8eba3d97150d27ac5a5405e 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -45,6 +45,9 @@ void gemm( const half* h_B = reinterpret_cast(B); half* h_C = reinterpret_cast(C); + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53, + "cublas Hgemm requires GPU compute capability >= 53"); PADDLE_ENFORCE(platform::dynload::cublasHgemm( context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, h_A, lda, &h_beta, h_C, N)); @@ -106,6 +109,9 @@ void gemm( const half* h_B = reinterpret_cast(B); half* h_C = reinterpret_cast(C); + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53, + "cublas Hgemm requires GPU compute capability >= 53"); PADDLE_ENFORCE(platform::dynload::cublasHgemm( context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, h_A, lda, &h_beta, h_C, ldc)); @@ -251,6 +257,9 @@ void batched_gemm( const half* h_B = reinterpret_cast(B); half* h_C = reinterpret_cast(C); + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53, + "cublas Hgemm requires GPU compute capability >= 53"); PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched( context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount)); diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu index 442e62d563ebd40316d001914c93447c102cbf61..8982d9d066165a9da0461288685baa0c60e5f114 100644 --- a/paddle/fluid/operators/math/math_function_test.cu +++ b/paddle/fluid/operators/math/math_function_test.cu @@ -72,6 +72,11 @@ TEST(math_function, notrans_mul_trans_fp16) { CUDAPlace gpu_place(0); CUDADeviceContext context(gpu_place); + // fp16 GEMM in cublas requires GPU compute capability >= 53 + if (context.GetComputeCapability() < 53) { + return; + } + float16* input1_ptr = input1.mutable_data({2, 3}, cpu_place); fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); @@ -149,6 +154,11 @@ TEST(math_function, trans_mul_notrans_fp16) { CUDAPlace gpu_place(0); CUDADeviceContext context(gpu_place); + // fp16 GEMM in cublas requires GPU compute capability >= 53 + if (context.GetComputeCapability() < 53) { + return; + } + float16* input1_ptr = input1.mutable_data({2, 3}, cpu_place); fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); @@ -248,6 +258,11 @@ TEST(math_function, gemm_notrans_cublas_fp16) { CUDAPlace gpu_place(0); CUDADeviceContext context(gpu_place); + // fp16 GEMM in cublas requires GPU compute capability >= 53 + if (context.GetComputeCapability() < 53) { + return; + } + int m = 2; int n = 3; int k = 3; @@ -355,6 +370,11 @@ TEST(math_function, gemm_trans_cublas_fp16) { CUDAPlace gpu_place(0); CUDADeviceContext context(gpu_place); + // fp16 GEMM in cublas requires GPU compute capability >= 53 + if (context.GetComputeCapability() < 53) { + return; + } + int m = 2; int n = 3; int k = 3; diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index 38e93fdf15d99eb447948378a599891074c10fc5..34ea6a91ce7743462d378cf471a5ec3a12ca51d1 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -14,13 +14,86 @@ limitations under the License. */ #define EIGEN_USE_GPU +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/operators/math/softmax_impl.h" +#include "paddle/fluid/platform/cudnn_helper.h" namespace paddle { namespace operators { namespace math { +using Tensor = framework::Tensor; +using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; +using DataLayout = platform::DataLayout; +template +using CudnnDataType = platform::CudnnDataType; + +template +void SoftmaxCUDNNFunctor::operator()( + const platform::CUDADeviceContext& context, const framework::Tensor* X, + framework::Tensor* Y) { + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor xDesc; + ScopedTensorDescriptor yDesc; + std::vector cudnn_tensor_dims = framework::vectorize2int(X->dims()); + DataLayout layout = DataLayout::kNCHW; + if (cudnn_tensor_dims.size() == 5) { + layout = DataLayout::kNCDHW; + } + // NOTE(*) : cudnn softmax only support >= 4D Tensor, + // fill 1 at unused dims + if (cudnn_tensor_dims.size() <= 2) { + cudnn_tensor_dims.resize(4, 1); + } + cudnnTensorDescriptor_t cudnn_x_desc = + xDesc.descriptor(layout, cudnn_tensor_dims); + cudnnTensorDescriptor_t cudnn_y_desc = + xDesc.descriptor(layout, cudnn_tensor_dims); + PADDLE_ENFORCE(platform::dynload::cudnnSoftmaxForward( + context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType::kOne(), cudnn_x_desc, + X->data(), CudnnDataType::kZero(), cudnn_y_desc, + Y->mutable_data(context.GetPlace()))); +} + +template +void SoftmaxGradCUDNNFunctor::operator()( + const platform::CUDADeviceContext& context, const framework::Tensor* Y, + const framework::Tensor* YGrad, framework::Tensor* XGrad) { + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor yDesc; + ScopedTensorDescriptor dyDesc; + ScopedTensorDescriptor dxDesc; + std::vector cudnn_tensor_dims = framework::vectorize2int(Y->dims()); + DataLayout layout = DataLayout::kNCHW; + if (cudnn_tensor_dims.size() == 5) { + layout = DataLayout::kNCDHW; + } + // NOTE(*) : cudnn softmax only support >= 4D Tensor, + // fill 1 at unused dims + if (cudnn_tensor_dims.size() <= 2) { + cudnn_tensor_dims.resize(4, 1); + } + cudnnTensorDescriptor_t cudnn_y_desc = + yDesc.descriptor(layout, cudnn_tensor_dims); + cudnnTensorDescriptor_t cudnn_xgrad_desc = + dxDesc.descriptor(layout, cudnn_tensor_dims); + cudnnTensorDescriptor_t cudnn_ygrad_desc = + dyDesc.descriptor(layout, cudnn_tensor_dims); + PADDLE_ENFORCE(platform::dynload::cudnnSoftmaxBackward( + context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType::kOne(), cudnn_y_desc, + Y->data(), cudnn_ygrad_desc, YGrad->data(), + CudnnDataType::kZero(), cudnn_xgrad_desc, + XGrad->mutable_data(context.GetPlace()))); +} + +template class SoftmaxCUDNNFunctor; +template class SoftmaxCUDNNFunctor; +template class SoftmaxGradCUDNNFunctor; +template class SoftmaxGradCUDNNFunctor; + template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxGradFunctor; diff --git a/paddle/fluid/operators/math/softmax.h b/paddle/fluid/operators/math/softmax.h index 14b2690c2a4e764058270953214a07aee8053444..da1f0b672d3a5fb5da8f4d72892be21964bdbc0d 100644 --- a/paddle/fluid/operators/math/softmax.h +++ b/paddle/fluid/operators/math/softmax.h @@ -33,6 +33,23 @@ class SoftmaxGradFunctor { const framework::Tensor* y_grad, framework::Tensor* x_grad); }; +#ifdef PADDLE_WITH_CUDA +template +class SoftmaxCUDNNFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor* X, framework::Tensor* Y); +}; + +template +class SoftmaxGradCUDNNFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor* Y, const framework::Tensor* y_grad, + framework::Tensor* x_grad); +}; +#endif + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index e7bed2c39735b66c19e738c91f4977e46571143b..90af1e2d602ac039b4d98a69a889ff8b1b85ffc6 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -17,11 +17,14 @@ limitations under the License. */ namespace paddle { namespace operators { +using framework::OpKernelType; using framework::Tensor; -class MulOpShapeInference : public framework::InferShapeBase { +class MulOp : public framework::OperatorWithKernel { public: - void operator()(framework::InferShapeContext* ctx) const override { + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of MulOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of MulOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), @@ -122,7 +125,7 @@ or not. But the output only shares the LoD information with input $X$. } }; -class MulOpGrad : public framework::OperatorWithKernel { +class MulGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -156,10 +159,7 @@ class MulOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(mul, paddle::framework::OperatorWithKernel, ops::MulOpMaker, - ops::MulOpShapeInference, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(mul_grad, ops::MulOpGrad); +REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulGradOp); REGISTER_OP_CPU_KERNEL( mul, ops::MulKernel); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/mul_op.cu.cc b/paddle/fluid/operators/mul_op.cu.cc index 0667530e943856576ae8c9fe4856cb6aa1448e4e..757f9c3ee2665c7ac654659416fe8dd727dca16d 100644 --- a/paddle/fluid/operators/mul_op.cu.cc +++ b/paddle/fluid/operators/mul_op.cu.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/mul_op.h" +#include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - mul, ops::MulKernel); -REGISTER_OP_CUDA_KERNEL( - mul_grad, ops::MulGradKernel); +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel, + ops::MulKernel); +REGISTER_OP_CUDA_KERNEL(mul_grad, + ops::MulGradKernel); diff --git a/paddle/fluid/operators/mul_op.h b/paddle/fluid/operators/mul_op.h index 38311cf87265ad0f1f815734cbf69bd682d62e62..b1260d36ebe11f65529ac274c959479dcb38ee5f 100644 --- a/paddle/fluid/operators/mul_op.h +++ b/paddle/fluid/operators/mul_op.h @@ -48,7 +48,7 @@ class MulKernel : public framework::OpKernel { } math::matmul( context.template device_context(), x_matrix, false, - y_matrix, false, 1, z, 0); + y_matrix, false, static_cast(1), z, static_cast(0)); if (z_dim.size() != 2) { z->Resize(z_dim); } diff --git a/paddle/fluid/operators/nccl_op.cc b/paddle/fluid/operators/nccl_op.cc index 9185666c56c4621d42429c9cfdb079001c6336f1..5e4ed886b10bd48bf991ce84a9099611cf5d1d26 100644 --- a/paddle/fluid/operators/nccl_op.cc +++ b/paddle/fluid/operators/nccl_op.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" -#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" namespace paddle { namespace operators { @@ -105,19 +104,38 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel { " Input(Communicator) of AllReduce op input should not be NULL"); PADDLE_ENFORCE(ctx->HasOutput("Out"), " Output(Out) of AllReduce op output should not be NULL"); - - auto x_dims = ctx->GetInputsDim("X"); - std::string reduction = ctx->Attrs().Get("reduction"); PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" || reduction == "ncclMin" || reduction == "ncclMax"), "invalid reduction."); + auto x_dims = ctx->GetInputsDim("X"); ctx->SetOutputsDim("Out", x_dims); ctx->ShareLoD("X", /*->*/ "Out"); } }; +// AllReduceOp +class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + NCCLAllReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of AllReduce op"); + AddInput("Communicator", "Communicator for communicating between gpus"); + AddOutput("Out", "The output of AllReduce op"); + AddAttr("reduction", + "(string, default 'ncclSum') " + "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.") + .SetDefault("ncclSum"); + AddComment(R"DOC( +NCCLAllReduce Operator. + +AllReduce the input tensors. + +)DOC"); + } +}; + // ReduceOp class NCCLReduceOp : public framework::OperatorWithKernel { public: @@ -144,50 +162,6 @@ class NCCLReduceOp : public framework::OperatorWithKernel { } }; -// BcastOp -class NCCLBcastOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - " Input(X) of Bcast op input should not be NULL"); - PADDLE_ENFORCE(ctx->HasInput("Communicator"), - " Input(Communicator) of Bcast op input should not be NULL"); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - " Output(Out) of Bcast op output should not be NULL"); - - int root = ctx->Attrs().Get("root"); - PADDLE_ENFORCE(root != platform::kInvalidGPUId, "Bcast root must be set."); - - auto x_dims = ctx->GetInputsDim("X"); - ctx->SetOutputsDim("Out", x_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } -}; - -// AllreduceOp -class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { - public: - NCCLAllReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "The input of AllReduce op"); - AddInput("Communicator", "Communicator for communicating between gpus"); - AddOutput("Out", "The output of AllReduce op"); - AddAttr("reduction", - "(string, default 'ncclSum') " - "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.") - .SetDefault("ncclSum"); - AddComment(R"DOC( -NCCLAllReduce Operator. - -AllReduce the input tensors. - -)DOC"); - } -}; - // ReduceOp class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker { public: @@ -214,6 +188,29 @@ Reduce the tensors. } }; +// BcastOp +class NCCLBcastOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + " Input(X) of Bcast op input should not be NULL"); + PADDLE_ENFORCE(ctx->HasInput("Communicator"), + " Input(Communicator) of Bcast op input should not be NULL"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + " Output(Out) of Bcast op output should not be NULL"); + + int root = ctx->Attrs().Get("root"); + PADDLE_ENFORCE(root != platform::kInvalidGPUId, "Bcast root must be set."); + + auto x_dims = ctx->GetInputsDim("X"); + ctx->SetOutputsDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + // BcastOp class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker { public: diff --git a/paddle/fluid/operators/nccl_op.cu.cc b/paddle/fluid/operators/nccl_op.cu.cc index 683a520e99fe72875d52393a18463324a8b6c3f2..ad623e1fe0f8941615b671a0c20bd3637ae6d407 100644 --- a/paddle/fluid/operators/nccl_op.cu.cc +++ b/paddle/fluid/operators/nccl_op.cu.cc @@ -43,13 +43,12 @@ class NCCLAllReduceKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "This kernel only runs on GPU device."); - - auto ins = ctx.MultiInput("X"); - auto outs = ctx.MultiOutput("Out"); - + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto* comm = ctx.Input("Communicator"); std::string reduction = ctx.Attr("reduction"); - ncclRedOp_t reduction_op_ = ncclSum; + ncclRedOp_t reduction_op_ = ncclSum; if (reduction == "ncclMin") { reduction_op_ = ncclMin; } else if (reduction == "ncclMax") { @@ -61,30 +60,19 @@ class NCCLAllReduceKernel : public framework::OpKernel { } else { PADDLE_THROW("Invalid reduction. default ncclSum."); } - - auto* comm = ctx.Input("Communicator"); - - auto stream = ctx.cuda_device_context().stream(); - // device id int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); int idx = comm->GetCommId(gpu_id); - - for (size_t i = 0; i < ins.size(); ++i) { - VLOG(1) << "gpu : " - << " invoke allreduce. send " << ins[i]->numel() << " recv " - << outs[i]->numel(); - - PADDLE_ENFORCE(platform::dynload::ncclAllReduce( - ins[i]->data(), outs[i]->mutable_data(ctx.GetPlace()), - outs[i]->numel(), NCCLTypeWrapper::type, reduction_op_, - comm->comms().at(idx), stream)); - PADDLE_ENFORCE(cudaStreamSynchronize(stream)); - - VLOG(1) << "gpu : " - << " finished allreduce. send " << ins[i]->numel() << " recv " - << outs[i]->numel(); - } + VLOG(3) << "gpu : " + << " invoke allreduce. send " << x->numel() << " recv " + << out->numel(); + PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + x->data(), out->mutable_data(ctx.GetPlace()), out->numel(), + NCCLTypeWrapper::type, reduction_op_, comm->comms().at(idx), + ctx.cuda_device_context().stream())); + VLOG(3) << "gpu : " + << " finished allreduce. send " << x->numel() << " recv " + << out->numel(); } }; @@ -94,13 +82,13 @@ class NCCLReduceKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "This kernel only runs on GPU device."); - - auto ins = ctx.MultiInput("X"); // x0, x1, x2 - auto outs = ctx.MultiOutput("Out"); - + auto x = ctx.Input("X"); // x0, x1, x2 + auto out = ctx.Output("Out"); + auto* comm = ctx.Input("Communicator"); + int root = ctx.Attr("root"); std::string reduction = ctx.Attr("reduction"); - ncclRedOp_t reduction_op_ = ncclSum; + ncclRedOp_t reduction_op_ = ncclSum; if (reduction == "ncclMin") { reduction_op_ = ncclMin; } else if (reduction == "ncclMax") { @@ -112,40 +100,23 @@ class NCCLReduceKernel : public framework::OpKernel { } else { PADDLE_THROW("Invalid reduction. default ncclSum."); } - - int root = ctx.Attr("root"); - auto* comm = ctx.Input("Communicator"); - - auto stream = reinterpret_cast( - ctx.device_context()) - .stream(); // device id int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); int idx = comm->GetCommId(gpu_id); - - auto ins_names = ctx.Inputs("X"); - std::hash hasher; - for (size_t i = 0; i < ins.size(); ++i) { - if (root == platform::kInvalidGPUId) { - root = hasher(ins_names[i]) % comm->comms().size(); - } - T* recvbuffer = nullptr; - if (root == gpu_id) { - recvbuffer = outs[i]->mutable_data(ctx.GetPlace()); - } - - VLOG(1) << "gpu : " << gpu_id << " invoke reduce. send " - << ins[i]->numel() << " recv " << outs[i]->numel(); - - PADDLE_ENFORCE(platform::dynload::ncclReduce( - ins[i]->data(), recvbuffer, ins[i]->numel(), - NCCLTypeWrapper::type, reduction_op_, root, comm->comms().at(idx), - stream)); - PADDLE_ENFORCE(cudaStreamSynchronize(stream)); - - VLOG(1) << "gpu : " << gpu_id << " finished reduce. send " - << ins[i]->numel() << " recv " << outs[i]->numel(); + T* recvbuffer = nullptr; + if (root == gpu_id) { + recvbuffer = out->mutable_data(ctx.GetPlace()); + } else { + out->Resize(framework::make_ddim({0})); } + VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel() + << " recv " << out->numel(); + PADDLE_ENFORCE(platform::dynload::ncclReduce( + x->data(), recvbuffer, x->numel(), NCCLTypeWrapper::type, + reduction_op_, root, comm->comms().at(idx), + ctx.cuda_device_context().stream())); + VLOG(3) << "gpu : " << gpu_id << " finished reduce. send " << x->numel() + << " recv " << out->numel(); } }; @@ -155,47 +126,27 @@ class NCCLBcastKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "This kernel only runs on GPU device."); - int root = ctx.Attr("root"); - auto* comm = ctx.Input("Communicator"); - - auto stream = reinterpret_cast( - ctx.device_context()) - .stream(); // device id int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); int idx = comm->GetCommId(gpu_id); - if (idx == root) { - auto ins = ctx.MultiInput("X"); - for (size_t i = 0; i < ins.size(); ++i) { - VLOG(1) << "gpu : " << gpu_id << " invoke Bcast. send " - << ins[i]->numel(); - - VLOG(1) << " before ncclBcast"; - PADDLE_ENFORCE(platform::dynload::ncclBcast( - (void*)ins[i]->data(), ins[i]->numel(), NCCLTypeWrapper::type, - root, comm->comms().at(idx), stream)); - VLOG(1) << " after ncclBcast"; - PADDLE_ENFORCE(cudaStreamSynchronize(stream)); - - VLOG(1) << "gpu : " << gpu_id << " finished Bcast."; - } + auto* x = ctx.Input("X"); + VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel(); + PADDLE_ENFORCE(platform::dynload::ncclBcast( + (void*)x->data(), x->numel(), NCCLTypeWrapper::type, root, + comm->comms().at(idx), ctx.cuda_device_context().stream())); + VLOG(3) << "gpu : " << gpu_id << " finished Bcast."; } else { - auto outs = ctx.MultiOutput("Out"); - for (size_t i = 0; i < outs.size(); ++i) { - VLOG(1) << "gpu : " << gpu_id << " invoke Bcast. recv buffer " - << framework::product(outs[i]->dims()); - - PADDLE_ENFORCE(platform::dynload::ncclBcast( - outs[i]->mutable_data(ctx.GetPlace()), outs[i]->numel(), - NCCLTypeWrapper::type, root, comm->comms().at(idx), stream)); - PADDLE_ENFORCE(cudaStreamSynchronize(stream)); - - VLOG(1) << "gpu : " << gpu_id << " finished Bcast. recv " - << outs[i]->numel(); - } + auto* out = ctx.Output("Out"); + VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. recv buffer " + << framework::product(out->dims()); + PADDLE_ENFORCE(platform::dynload::ncclBcast( + out->mutable_data(ctx.GetPlace()), out->numel(), + NCCLTypeWrapper::type, root, comm->comms().at(idx), + ctx.cuda_device_context().stream())); + VLOG(3) << "gpu : " << gpu_id << " finished Bcast. recv " << out->numel(); } } }; diff --git a/paddle/fluid/operators/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl_op_test.cu.cc index b4021a5dacd80a042042eadf9f5d1a932f0f00a2..90f6f955cea51ded2dbb2bde459113458d7749a4 100644 --- a/paddle/fluid/operators/nccl_op_test.cu.cc +++ b/paddle/fluid/operators/nccl_op_test.cu.cc @@ -14,19 +14,15 @@ limitations under the License. */ #include #include -#include #include #include #include -#include #include -#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/init.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" @@ -41,26 +37,35 @@ USE_CUDA_ONLY_OP(ncclBcast); namespace f = paddle::framework; namespace p = paddle::platform; -static std::vector gpu_list; - // test data amount -const f::DDim kDims = {100, 100}; +const f::DDim kDims = {20, 20}; // nccl op common tester, init communicator. class NCCLTester : public ::testing::Test { public: virtual void SetUp() override { + int count = p::GetCUDADeviceCount(); + if (count <= 1) { + LOG(WARNING) + << "Cannot test multi-gpu nccl, because the CUDA device count is " + << count; + exit(0); + } + for (int i = 0; i < count; ++i) { + gpu_list_.emplace_back(i); + } + paddle::platform::CPUPlace cpu_place; - for (size_t i = 0; i < gpu_list.size(); ++i) { + for (size_t i = 0; i < gpu_list_.size(); ++i) { p::CUDAPlace place(i); - dev_ctxs.emplace_back(new p::CUDADeviceContext(place)); + dev_ctxs_.emplace_back(new p::CUDADeviceContext(place)); } NCCLInitOp(); } virtual void TearDown() override { - for (auto &device_context : dev_ctxs) { + for (auto &device_context : dev_ctxs_) { delete device_context; } } @@ -70,36 +75,40 @@ class NCCLTester : public ::testing::Test { std::unique_ptr op1(new f::OpDesc); op1->SetType("ncclInit"); + op1->SetInput("parallel_scopes", {"p_scopes"}); op1->SetOutput("Communicator", {"comm"}); - op1->SetAttr("gpus", {gpu_list}); - auto *var = g_scope.Var("comm"); + auto *var = g_scope_.Var("comm"); var->GetMutable(); + auto *scope_var = g_scope_.Var("p_scopes"); + auto *p_scopes = scope_var->GetMutable>(); + (*p_scopes).resize(gpu_list_.size()); + auto op = f::OpRegistry::CreateOp(*op1); VLOG(1) << "invoke NCCLInitOp."; - op->Run(g_scope, cpu_place); + op->Run(g_scope_, cpu_place); VLOG(1) << "NCCLInitOp finished."; } + int GetGPUData(int gpu_id) { return gpu_id + 42; } + template void PerThreadProgram(int gpu_id, const f::OpDesc &op_desc, f::Scope *scope) { - std::unique_lock lk(mu); + std::unique_lock lk(mu_); const f::OpDesc *op1 = &op_desc; p::CUDAPlace place(gpu_id); - auto &ctx = dev_ctxs.at(gpu_id); + auto &ctx = dev_ctxs_.at(gpu_id); auto *send_tensor = scope->Var("st")->GetMutable(); auto *recv_tensor = scope->Var("rt")->GetMutable(); if (!send_tensor->numel()) { - send_tensor->Resize(kDims); send_tensor->mutable_data(kDims, place); - std::vector send_vector(f::product(kDims), gpu_id); + std::vector send_vector(f::product(kDims), GetGPUData(gpu_id)); paddle::framework::TensorFromVector(send_vector, *ctx, send_tensor); - ctx->Wait(); VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel(); } @@ -118,30 +127,14 @@ class NCCLTester : public ::testing::Test { } public: - std::vector dev_ctxs; - f::Scope g_scope; - std::mutex mu; + std::vector dev_ctxs_; + f::Scope g_scope_; + std::mutex mu_; + std::vector gpu_list_; }; // ncclInitOp with desc -TEST(NCCL, ncclInitOp) { - std::unique_ptr op_desc(new f::OpDesc); - - op_desc->SetType("ncclInit"); - op_desc->SetOutput("Communicator", {"x1"}); - op_desc->SetAttr("gpus", {gpu_list}); - - f::Scope g_scope; - paddle::platform::CPUPlace cpu_place; - - auto *var = g_scope.Var("x1"); - var->GetMutable(); - - auto op = f::OpRegistry::CreateOp(*op_desc); - VLOG(1) << "invoke NCCLInitOp."; - op->Run(g_scope, cpu_place); - VLOG(1) << "NCCLInitOp finished."; -} +TEST_F(NCCLTester, ncclInitOp) {} // ncclAllReduceOp with desc TEST_F(NCCLTester, ncclAllReduceOp) { @@ -155,23 +148,25 @@ TEST_F(NCCLTester, ncclAllReduceOp) { std::vector ths; - for (size_t i = 0; i < gpu_list.size(); ++i) { - dev_scopes.emplace_back(&g_scope.NewScope()); - std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], + for (size_t i = 0; i < gpu_list_.size(); ++i) { + dev_scopes.emplace_back(&g_scope_.NewScope()); + std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list_[i], *op2.get(), dev_scopes[i]); ths.emplace_back(std::move(th)); } - for (size_t i = 0; i < gpu_list.size(); ++i) { + for (size_t i = 0; i < gpu_list_.size(); ++i) { ths[i].join(); } - // check results - float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); + float expected_result = 0.0; + for (int gpu_id : gpu_list_) { + expected_result = expected_result + GetGPUData(gpu_id); + } for (size_t i = 0; i < dev_scopes.size(); ++i) { p::CPUPlace cpu_place; - p::CUDAPlace gpu_place(gpu_list[i]); + p::CUDAPlace gpu_place(gpu_list_[i]); auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get(); auto *rt = recv_tensor.data(); @@ -180,12 +175,12 @@ TEST_F(NCCLTester, ncclAllReduceOp) { auto *ct = result_tensor->mutable_data(cpu_place); paddle::memory::Copy( - cpu_place, ct, p::CUDAPlace(gpu_list[i]), rt, + cpu_place, ct, p::CUDAPlace(gpu_list_[i]), rt, recv_tensor.numel() * sizeof(float), - static_cast(dev_ctxs[i])->stream()); + static_cast(dev_ctxs_[i])->stream()); for (int64_t j = 0; j < f::product(kDims); ++j) { - ASSERT_NEAR(ct[j], result, 1e-5); + ASSERT_NEAR(ct[j], expected_result, 1e-5); } } } @@ -204,22 +199,24 @@ TEST_F(NCCLTester, ncclReduceOp) { std::vector ths; - for (size_t i = 0; i < gpu_list.size(); ++i) { - dev_scopes.emplace_back(&g_scope.NewScope()); - std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], + for (size_t i = 0; i < gpu_list_.size(); ++i) { + dev_scopes.emplace_back(&g_scope_.NewScope()); + std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list_[i], *op2.get(), dev_scopes[i]); ths.emplace_back(std::move(th)); } - for (size_t i = 0; i < gpu_list.size(); ++i) { + for (size_t i = 0; i < gpu_list_.size(); ++i) { ths[i].join(); } - // check results on - float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); + float expected_result = 0.0; + for (int gpu_id : gpu_list_) { + expected_result = expected_result + GetGPUData(gpu_id); + } p::CPUPlace cpu_place; - p::CUDAPlace gpu_place(gpu_list[kRoot]); + p::CUDAPlace gpu_place(gpu_list_[kRoot]); auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get(); auto *rt = recv_tensor.data(); @@ -229,12 +226,12 @@ TEST_F(NCCLTester, ncclReduceOp) { auto *ct = result_tensor->mutable_data(cpu_place); paddle::memory::Copy( - cpu_place, ct, p::CUDAPlace(gpu_list[kRoot]), rt, + cpu_place, ct, p::CUDAPlace(gpu_list_[kRoot]), rt, recv_tensor.numel() * sizeof(float), - static_cast(dev_ctxs[kRoot])->stream()); + static_cast(dev_ctxs_[kRoot])->stream()); for (int64_t j = 0; j < f::product(kDims); ++j) { - ASSERT_NEAR(ct[j], result, 1e-5); + ASSERT_NEAR(ct[j], expected_result, 1e-5); } } @@ -252,23 +249,22 @@ TEST_F(NCCLTester, ncclBcastOp) { std::vector ths; - for (size_t i = 0; i < gpu_list.size(); ++i) { - dev_scopes.emplace_back(&g_scope.NewScope()); - std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list[i], + for (size_t i = 0; i < gpu_list_.size(); ++i) { + dev_scopes.emplace_back(&g_scope_.NewScope()); + std::thread th(&NCCLTester::PerThreadProgram, this, gpu_list_[i], *op2.get(), dev_scopes[i]); ths.emplace_back(std::move(th)); } - for (size_t i = 0; i < gpu_list.size(); ++i) { + for (size_t i = 0; i < gpu_list_.size(); ++i) { ths[i].join(); } const int idx = 1; - // check results on - float result = kRoot; + float result = GetGPUData(kRoot); p::CPUPlace cpu_place; - p::CUDAPlace gpu_place(gpu_list[idx]); + p::CUDAPlace gpu_place(gpu_list_[idx]); auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get(); auto *rt = recv_tensor.data(); @@ -277,42 +273,11 @@ TEST_F(NCCLTester, ncclBcastOp) { auto *ct = result_tensor->mutable_data(cpu_place); paddle::memory::Copy( - cpu_place, ct, p::CUDAPlace(gpu_list[idx]), rt, + cpu_place, ct, p::CUDAPlace(gpu_list_[idx]), rt, recv_tensor.numel() * sizeof(float), - static_cast(dev_ctxs[idx])->stream()); + static_cast(dev_ctxs_[idx])->stream()); for (int64_t j = 0; j < f::product(kDims); ++j) { ASSERT_NEAR(ct[j], result, 1e-5); } } - -int main(int argc, char **argv) { - // FIXME(tonyyang-svail): - // Due to the driver issue on our CI, disable for now - return 0; - const int dev_count = p::GetCUDADeviceCount(); - if (dev_count <= 1) { - LOG(WARNING) - << "Cannot test multi-gpu nccl, because the CUDA device count is " - << dev_count; - return 0; - } - - std::vector places; - - places.emplace_back(paddle::platform::CPUPlace()); - int count = paddle::platform::GetCUDADeviceCount(); - for (int i = 0; i < count; ++i) { - places.emplace_back(paddle::platform::CUDAPlace(i)); - gpu_list.emplace_back(i); - } - - VLOG(0) << " DeviceCount " << count; - paddle::platform::DeviceContextPool::Init(places); - - testing::InitGoogleTest(&argc, argv); - - // device context should be release before scope. - // otherwise driver will down. - return RUN_ALL_TESTS(); -} diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc index bf4d0476df32d7454d4064cb6ee454e6ad5d6fc5..4001b9a130348b4e3ea99f3017eae6d85e41fc6e 100644 --- a/paddle/fluid/operators/parallel_do_op.cc +++ b/paddle/fluid/operators/parallel_do_op.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { @@ -158,11 +159,14 @@ class ParallelDoOp : public framework::OperatorBase { auto &place = places[place_idx]; auto *cur_scope = sub_scopes[place_idx]; - workers.emplace_back(framework::Async([program, cur_scope, place, block] { - framework::Executor executor(place); - executor.Run(*program, cur_scope, block->ID(), - false /*create_local_scope*/); - })); + workers.emplace_back( + framework::Async([program, cur_scope, place, block, place_idx] { + // Give the thread an id to distinguish parallel block with same id. + platform::RecordThread rt(static_cast(place_idx) + 1); + framework::Executor executor(place); + executor.Run(*program, cur_scope, block->ID(), + false /*create_local_scope*/); + })); } for (auto &worker : workers) { worker.wait(); @@ -234,11 +238,14 @@ class ParallelDoGradOp : public framework::OperatorBase { auto *cur_scope = sub_scopes[i]; // execute - workers.emplace_back(framework::Async([program, cur_scope, place, block] { - framework::Executor executor(place); - executor.Run(*program, cur_scope, block->ID(), - false /*create_local_scope*/); - })); + workers.emplace_back( + framework::Async([program, cur_scope, place, block, i] { + // Give the thread an id to distinguish parallel block with same id. + platform::RecordThread rt(static_cast(i) + 1); + framework::Executor executor(place); + executor.Run(*program, cur_scope, block->ID(), + false /*create_local_scope*/); + })); } for (auto &worker : workers) { worker.wait(); diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc index 781d96981e4c033d9287ab3de9860dfd9fcd2875..39c862b03ad497dca5c38ccecff20be510ab60e5 100644 --- a/paddle/fluid/operators/pool_cudnn_op.cu.cc +++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc @@ -24,6 +24,8 @@ using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor; using DataLayout = platform::DataLayout; using PoolingMode = platform::PoolingMode; +template +using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; template class PoolCUDNNOpKernel : public framework::OpKernel { @@ -78,8 +80,7 @@ class PoolCUDNNOpKernel : public framework::OpKernel { // ------------------- cudnn pool algorithm --------------------- auto handle = ctx.cuda_device_context().cudnn_handle(); - T alpha = 1.0f, beta = 0.0f; - + ScalingParamType alpha = 1.0f, beta = 0.0f; PADDLE_ENFORCE(platform::dynload::cudnnPoolingForward( handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta, cudnn_output_desc, output_data)); @@ -144,8 +145,7 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { // ------------------- cudnn pool algorithm --------------------- auto handle = ctx.cuda_device_context().cudnn_handle(); - T alpha = 1.0f, beta = 0.0f; - + ScalingParamType alpha = 1.0f, beta = 0.0f; if (input_grad) { T *input_grad_data = input_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset input_grad. @@ -162,17 +162,19 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; -REGISTER_OP_KERNEL(pool2d, CUDNN, ::paddle::platform::CUDAPlace, +REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace, ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool2d_grad, CUDNN, ::paddle::platform::CUDAPlace, + ops::PoolCUDNNOpKernel, + ops::PoolCUDNNOpKernel); +REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace, ops::PoolCUDNNGradOpKernel, ops::PoolCUDNNGradOpKernel); -REGISTER_OP_KERNEL(pool3d, CUDNN, ::paddle::platform::CUDAPlace, +REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace, ops::PoolCUDNNOpKernel, ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool3d_grad, CUDNN, ::paddle::platform::CUDAPlace, +REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace, ops::PoolCUDNNGradOpKernel, ops::PoolCUDNNGradOpKernel); diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index d78da10016a0e2b1d9a0ca9f3dfe4e8009bbe61d..b144ec5f7d315cb340dcd94b4a519bfcfd2a0e66 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -124,11 +124,15 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType( } #endif + auto input_data_type = framework::ToDataType(ctx.Input("X")->type()); + if (input_data_type == framework::proto::VarType::FP16) { + PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN, + "float16 can only be used when CUDNN is used"); + } std::string data_format = ctx.Attr("data_format"); framework::DataLayout layout_ = framework::StringToDataLayout(data_format); - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - layout_, library_); + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_, + library_); } Pool2dOpMaker::Pool2dOpMaker(OpProto *proto, OpAttrChecker *op_checker) diff --git a/paddle/fluid/operators/prior_box_op.cc b/paddle/fluid/operators/prior_box_op.cc index be7898c22190339e0717317807b91e038f4949f6..7ba55437cb20f802cc12ceea7777d7d78bba62a6 100644 --- a/paddle/fluid/operators/prior_box_op.cc +++ b/paddle/fluid/operators/prior_box_op.cc @@ -111,7 +111,8 @@ class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker { }); AddAttr>( "max_sizes", - "(vector) List of max sizes of generated prior boxes."); + "(vector) List of max sizes of generated prior boxes.") + .SetDefault(std::vector{}); AddAttr>( "aspect_ratios", "(vector) List of aspect ratios of generated prior boxes."); diff --git a/paddle/fluid/operators/prior_box_op.h b/paddle/fluid/operators/prior_box_op.h index 0113d2f09af90eec32632180741db3a1fc40c724..18bb2deb6b5acf626dfb2883a5771d9d195d45c0 100644 --- a/paddle/fluid/operators/prior_box_op.h +++ b/paddle/fluid/operators/prior_box_op.h @@ -97,9 +97,6 @@ class PriorBoxOpKernel : public framework::OpKernel { boxes->mutable_data(ctx.GetPlace()); vars->mutable_data(ctx.GetPlace()); - T inv_img_width = 1.0 / img_width; - T inv_img_height = 1.0 / img_height; - auto e_boxes = framework::EigenTensor::From(*boxes); for (int h = 0; h < feature_height; ++h) { for (int w = 0; w < feature_width; ++w) { @@ -110,36 +107,30 @@ class PriorBoxOpKernel : public framework::OpKernel { for (size_t s = 0; s < min_sizes.size(); ++s) { auto min_size = min_sizes[s]; // first prior: aspect_ratio = 1, size = min_size - box_width = box_height = min_size; + box_width = box_height = min_size / 2.; // xmin - e_boxes(h, w, idx, 0) = (center_x - box_width * 0.5) * inv_img_width; + e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; // ymin - e_boxes(h, w, idx, 1) = - (center_y - box_height * 0.5) * inv_img_height; + e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; // xmax - e_boxes(h, w, idx, 2) = (center_x + box_width * 0.5) * inv_img_width; + e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; // ymax - e_boxes(h, w, idx, 3) = - (center_y + box_height * 0.5) * inv_img_height; + e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; idx++; if (max_sizes.size() > 0) { auto max_size = max_sizes[s]; // second prior: aspect_ratio = 1, // size = sqrt(min_size * max_size) - box_width = box_height = sqrt(min_size * max_size); + box_width = box_height = sqrt(min_size * max_size) / 2.; // xmin - e_boxes(h, w, idx, 0) = - (center_x - box_width * 0.5) * inv_img_width; + e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; // ymin - e_boxes(h, w, idx, 1) = - (center_y - box_height * 0.5) * inv_img_height; + e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; // xmax - e_boxes(h, w, idx, 2) = - (center_x + box_width * 0.5) * inv_img_width; + e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; // ymax - e_boxes(h, w, idx, 3) = - (center_y + box_height * 0.5) * inv_img_height; + e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; idx++; } @@ -149,20 +140,16 @@ class PriorBoxOpKernel : public framework::OpKernel { if (fabs(ar - 1.) < 1e-6) { continue; } - box_width = min_size * sqrt(ar); - box_height = min_size / sqrt(ar); + box_width = min_size * sqrt(ar) / 2.; + box_height = min_size / sqrt(ar) / 2.; // xmin - e_boxes(h, w, idx, 0) = - (center_x - box_width * 0.5) * inv_img_width; + e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; // ymin - e_boxes(h, w, idx, 1) = - (center_y - box_height * 0.5) * inv_img_height; + e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; // xmax - e_boxes(h, w, idx, 2) = - (center_x + box_width * 0.5) * inv_img_width; + e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; // ymax - e_boxes(h, w, idx, 3) = - (center_y + box_height * 0.5) * inv_img_height; + e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; idx++; } } diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt index 335c5b26a864381bf87a2824b78f521cdce063e4..744bd3b7ef71f83ad82979eb966369c2e9456a7d 100644 --- a/paddle/fluid/operators/reader/CMakeLists.txt +++ b/paddle/fluid/operators/reader/CMakeLists.txt @@ -1,6 +1,24 @@ cc_library(reader_op_registry SRCS reader_op_registry.cc DEPS operator op_registry reader) -op_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc DEPS reader_op_registry) -op_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc DEPS reader_op_registry) -op_library(create_batch_reader_op SRCS create_batch_reader_op.cc DEPS reader_op_registry) -op_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc DEPS reader_op_registry) -set(READER_LIBRARY create_random_data_generator_op create_shuffle_reader_op create_batch_reader_op create_double_buffer_reader_op PARENT_SCOPE) +set(LOCAL_READER_LIBS) + +function(reader_library TARGET_NAME) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + set(options "") + set(common_deps reader_op_registry) + cmake_parse_arguments(reader_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + op_library(${TARGET_NAME} SRCS ${reader_library_SRCS} DEPS ${common_deps} ${reader_library_DEPS}) + set(LOCAL_READER_LIBS + ${TARGET_NAME} + ${LOCAL_READER_LIBS} + PARENT_SCOPE) +endfunction() + +reader_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc) +reader_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc) +reader_library(create_batch_reader_op SRCS create_batch_reader_op.cc) +reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_op.cc) +reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc) +# Export local libraries to parent +set(READER_LIBRARY ${LOCAL_READER_LIBS} PARENT_SCOPE) diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index b6a0609a1e23195ececee0f16a69daa1c1c46ed8..d0de092947eb04a1b7d06dedea919f6b1094dd06 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -24,11 +24,31 @@ static constexpr size_t kDoubleBufferSize = 2; class DoubleBufferReader : public framework::DecoratedReader { public: - explicit DoubleBufferReader(ReaderBase* reader) - : DecoratedReader(reader), - buffer_(framework::MakeChannel>( - kDoubleBufferSize)) { - std::thread prefetch(&DoubleBufferReader::PrefetchThreadFunc, this); + struct Item { + Item() : ctx_(nullptr) {} + + std::vector payloads_; + platform::DeviceContext* ctx_; + }; + + explicit DoubleBufferReader( + ReaderBase* reader, platform::Place target_place = platform::CPUPlace()) + : DecoratedReader(reader), place_(target_place) { + for (size_t i = 0; i < kDoubleBufferSize; ++i) { + if (platform::is_gpu_place(place_)) { +#ifdef PADDLE_WITH_CUDA + ctxs_.emplace_back(new platform::CUDADeviceContext( + boost::get(place_))); +#endif + } + } + + start_thread(); + } + + void start_thread() { + buffer_ = framework::MakeChannel(kDoubleBufferSize); + std::thread prefetch([this] { PrefetchThreadFunc(); }); prefetch.detach(); } @@ -37,10 +57,15 @@ class DoubleBufferReader : public framework::DecoratedReader { ~DoubleBufferReader() { buffer_->Close(); } + bool HasNext() const override; + private: void PrefetchThreadFunc(); - framework::Channel>* buffer_; + framework::Channel* buffer_; + platform::Place place_; + std::vector> ctxs_; + mutable Item local_buffer_; }; class CreateDoubleBufferReaderOp : public framework::OperatorBase { @@ -54,7 +79,20 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { ->Get(); auto* out = scope.FindVar(Output("Out")) ->template GetMutable(); - out->Reset(new DoubleBufferReader(underlying_reader.Get())); + + auto place_str = Attr("place"); + platform::Place place; + if (place_str == "CPU") { + place = platform::CPUPlace(); + } else { + std::istringstream sin(place_str); + sin.seekg(std::string("CUDA:").size(), std::ios::beg); + size_t num; + sin >> num; + place = platform::CUDAPlace(static_cast(num)); + } + + out->Reset(new DoubleBufferReader(underlying_reader.Get(), place)); } }; @@ -69,41 +107,72 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase { It launches another thread to execute the 'underlying reader' asynchronously, which prevents reading process from blocking subsequent training. )DOC"); + std::unordered_set enum_range; + constexpr size_t kMaxCUDADevs = 128; + for (size_t i = 0; i < kMaxCUDADevs; ++i) { + enum_range.insert(string::Sprintf("CUDA:%d", i)); + } + enum_range.insert("CPU"); + AddAttr("place", "The double buffer place, default is CPU") + .SetDefault("CPU") + .InEnum({enum_range}); } }; void DoubleBufferReader::ReadNext(std::vector* out) { - out->clear(); - buffer_->Receive(out); + if (local_buffer_.payloads_.empty()) { + buffer_->Receive(&local_buffer_); + } + + *out = local_buffer_.payloads_; + local_buffer_.payloads_.clear(); + if (local_buffer_.ctx_) { + local_buffer_.ctx_->Wait(); + } } void DoubleBufferReader::ReInit() { reader_->ReInit(); buffer_->Close(); - // The existing prefetch thread will terminate for the buffer_ is closed. - buffer_ = framework::MakeChannel>( - kDoubleBufferSize); - std::thread prefetch(&DoubleBufferReader::PrefetchThreadFunc, this); - prefetch.detach(); + start_thread(); } void DoubleBufferReader::PrefetchThreadFunc() { VLOG(5) << "A new prefetch thread starts."; - while (true) { - std::vector batch; - reader_->ReadNext(&batch); - if (batch.empty()) { - // EOF - buffer_->Close(); - VLOG(5) << "Reached the end of the file. The prefetch thread terminates."; - break; + size_t gpu_ctx_offset = 0; + while (reader_->HasNext()) { + Item batch; + reader_->ReadNext(&batch.payloads_); + if (platform::is_gpu_place(place_)) { + std::vector gpu_batch; + auto& gpu_ctx = this->ctxs_[gpu_ctx_offset++]; + gpu_ctx_offset %= this->ctxs_.size(); + gpu_batch.resize(batch.payloads_.size()); + for (size_t i = 0; i < batch.payloads_.size(); ++i) { + framework::TensorCopy(batch.payloads_[i], place_, *gpu_ctx, + &gpu_batch[i]); + gpu_batch[i].set_lod(batch.payloads_[i].lod()); + } + batch.ctx_ = gpu_ctx.get(); + std::swap(gpu_batch, batch.payloads_); } + if (!buffer_->Send(&batch)) { VLOG(5) << "WARNING: The double buffer channel has been closed. The " "prefetch thread terminates."; break; } } + buffer_->Close(); +} + +bool DoubleBufferReader::HasNext() const { + if (local_buffer_.payloads_.empty()) { + bool ok = buffer_->Receive(&local_buffer_); + return ok; + } else { + return true; + } } } // namespace reader diff --git a/paddle/fluid/operators/reader/create_random_data_generator_op.cc b/paddle/fluid/operators/reader/create_random_data_generator_op.cc index 73c39b5da4484b27f75aeba3c8171c5ffed2398f..95d8674c08b63e872926ff8708d0c734da33684c 100644 --- a/paddle/fluid/operators/reader/create_random_data_generator_op.cc +++ b/paddle/fluid/operators/reader/create_random_data_generator_op.cc @@ -19,11 +19,11 @@ namespace operators { namespace reader { template -class RandomDataGenerator : public framework::FileReader { +class RandomDataGenerator : public framework::ReaderBase { public: RandomDataGenerator(const std::vector& shapes, float min, float max) - : FileReader(shapes), min_(min), max_(max) { + : framework::ReaderBase(), min_(min), max_(max), shapes_(shapes) { PADDLE_ENFORCE_LE( min, max, "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max); unsigned int seed = std::random_device()(); @@ -52,11 +52,14 @@ class RandomDataGenerator : public framework::FileReader { void ReInit() override { return; } + bool HasNext() const override { return true; } + private: float min_; float max_; std::minstd_rand engine_; std::uniform_real_distribution dist_; + std::vector shapes_; }; template diff --git a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c4aa29c7206dbd3fe6a99b2a6c5ac6f083621944 --- /dev/null +++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reader/reader_op_registry.h" +#include "paddle/fluid/recordio/scanner.h" + +namespace paddle { +namespace operators { +namespace reader { +class RecordIOFileReader : public framework::FileReader { + public: + explicit RecordIOFileReader(const std::string& filename, + const std::vector& dims) + : FileReader(dims), + scanner_(filename), + dev_ctx_(*platform::DeviceContextPool::Instance().Get( + platform::CPUPlace())) {} + + bool HasNext() const override { return scanner_.HasNext(); } + + void ReInit() override { scanner_.Reset(); } + + protected: + void ReadNextImpl(std::vector* out) override { + *out = framework::ReadFromRecordIO(scanner_, dev_ctx_); + } + + private: + recordio::Scanner scanner_; + const platform::DeviceContext& dev_ctx_; +}; + +class CreateRecordIOReaderOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + const auto& shape_concat = Attr>("shape_concat"); + const auto& ranks = Attr>("ranks"); + PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty()); + PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0), + int(shape_concat.size()), + "The accumulate of all ranks should be equal to the " + "shape concat's length."); + std::string filename = Attr("filename"); + + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + out->Reset( + new RecordIOFileReader(filename, RestoreShapes(shape_concat, ranks))); + } +}; + +class CreateRecordIOReaderOpMaker : public FileReaderMakerBase { + public: + CreateRecordIOReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) + : FileReaderMakerBase(op_proto, op_checker) { + AddAttr("filename", "The filename of record io reader"); + AddComment(R"DOC( + CreateRecordIOReader Operator + + Create a reader from a record io file + )DOC"); + } +}; + +} // namespace reader +} // namespace operators +} // namespace paddle + +namespace reader = paddle::operators::reader; + +REGISTER_FILE_READER_OPERATOR(create_recordio_file_reader, + reader::CreateRecordIOReaderOp, + reader::CreateRecordIOReaderOpMaker); + +REGISTER_FILE_READER(recordio, reader::RecordIOFileReader); diff --git a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc index 4dac3831109beeed660d32f08fb27c7adf62ac2b..70e2f587dc414a850ddc341b98f26ae54636755c 100644 --- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc +++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include "glog/logging.h" +#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/reader/reader_op_registry.h" namespace paddle { @@ -20,43 +23,53 @@ namespace reader { class ShuffleReader : public framework::DecoratedReader { public: - ShuffleReader(ReaderBase* reader, int buffer_size) - : DecoratedReader(reader), buffer_size_(buffer_size), iteration_pos_(0) { - buffer_.reserve(buffer_size); + ShuffleReader(ReaderBase* reader, size_t buffer_size, size_t seed = 0) + : DecoratedReader(reader), buffer_size_(buffer_size), seed_(seed) { + VLOG(10) << "Create shuffle reader of " << reader_; + if (seed_ == 0) { + std::random_device device; + seed_ = device(); + } + ReadIntoBuffers(); } - void ReadNext(std::vector* out) override; + void ReadNext(std::vector* out) override { + if (iteration_pos_ >= buffer_.size()) { + VLOG(10) << "Resetting shuffle buffer"; + ReadIntoBuffers(); + } + *out = buffer_[iteration_pos_++]; + } - private: - int buffer_size_; - std::vector> buffer_; - size_t iteration_pos_; -}; + bool HasNext() const override { + return iteration_pos_ < buffer_.size() || reader_->HasNext(); + } -void ShuffleReader::ReadNext(std::vector* out) { - if (iteration_pos_ >= buffer_.size()) { - // Reload buffer with new data + private: + void ReadIntoBuffers() { buffer_.clear(); buffer_.reserve(buffer_size_); - for (int i = 0; i < buffer_size_; ++i) { - buffer_.push_back(std::vector()); - reader_->ReadNext(&buffer_.back()); - if (buffer_.back().empty()) { - buffer_.pop_back(); + iteration_pos_ = 0; + PADDLE_ENFORCE(reader_->HasNext()); + for (size_t i = 0; i < buffer_size_; ++i) { + if (!reader_->HasNext()) { break; } + buffer_.emplace_back(); + reader_->ReadNext(&buffer_.back()); } - // TODO(fengjiayi): 'std::random_shuffle' can be very slow. It needs to be - // optimize. - std::random_shuffle(buffer_.begin(), buffer_.end()); - iteration_pos_ = 0; + std::mt19937 g(seed_); + std::shuffle(buffer_.begin(), buffer_.end(), g); + seed_ = g(); // update seed_; + VLOG(10) << "random buffer size = " << buffer_.size(); } - out->clear(); - if (!buffer_.empty()) { - std::swap(*out, buffer_[iteration_pos_++]); - } - // if buffer_ is empty, the 'out' will return as an empty vector. -} + + size_t buffer_size_; + std::vector> buffer_; + + size_t iteration_pos_; + size_t seed_; +}; class CreateShuffleReaderOp : public framework::OperatorBase { public: @@ -67,10 +80,10 @@ class CreateShuffleReaderOp : public framework::OperatorBase { const platform::Place& dev_place) const override { const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) ->Get(); - auto* out = scope.FindVar(Output("Out")) - ->template GetMutable(); - out->Reset( - new ShuffleReader(underlying_reader.Get(), Attr("buffer_size"))); + auto& var = detail::Ref(scope.FindVar(Output("Out"))); + var.GetMutable()->Reset( + new ShuffleReader(underlying_reader.Get(), + static_cast(Attr("buffer_size")))); } }; diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc index 7ea4f4b8d9feecac5bc2d0338bbbe9ab7a532040..0ba4f3854431742eb354f8c90eb395f5d7b32b2e 100644 --- a/paddle/fluid/operators/reader/reader_op_registry.cc +++ b/paddle/fluid/operators/reader/reader_op_registry.cc @@ -31,11 +31,16 @@ std::vector RestoreShapes(const std::vector& shape_concat, return res; } +std::unordered_map& FileReaderRegistry() { + static std::unordered_map regs; + return regs; +} + FileReaderMakerBase::FileReaderMakerBase( framework::OpProtoAndCheckerMaker::OpProto* op_proto, framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(op_proto, op_checker) { - AddOutput("Out", "(ReaderHolder) The created random reader."); + AddOutput("Out", "(ReaderHolder) The created random reader.").AsDuplicable(); AddAttr>("shape_concat", "The concat of all data's shapes."); AddAttr>( "ranks", @@ -49,6 +54,10 @@ FileReaderMakerBase::FileReaderMakerBase( } void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE( + !ctx->IsRuntime(), + "'FileReaderInferShape' should only be invoked during compile time."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "The output file reader should not be null."); const auto shape_concat = ctx->Attrs().Get>("shape_concat"); @@ -56,16 +65,14 @@ void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const { std::vector shapes = RestoreShapes(shape_concat, ranks); ctx->SetReaderDims("Out", shapes); - if (ctx->IsRuntime()) { - const auto lod_levels = ctx->Attrs().Get>("lod_levels"); - PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(), - "The number of 'lod_levels'(%d) doesn't match the number " - "of 'shapes'(%d).", - lod_levels.size(), shapes.size()); - framework::VarDesc* reader = - boost::get(ctx->GetOutputVarPtrs("Out")[0]); - reader->SetLoDLevels(lod_levels); - } + const auto lod_levels = ctx->Attrs().Get>("lod_levels"); + PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(), + "The number of 'lod_levels'(%d) doesn't match the number " + "of 'shapes'(%d).", + lod_levels.size(), shapes.size()); + framework::VarDesc* reader = + boost::get(ctx->GetOutputVarPtrs("Out")[0]); + reader->SetLoDLevels(lod_levels); } void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc, @@ -77,19 +84,21 @@ void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc, void DecoratedReaderInferShape::operator()( framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(!ctx->IsRuntime(), + "'DecoratedReaderInferShape' should only be invoked during " + "compile time."); + PADDLE_ENFORCE(ctx->HasInput("UnderlyingReader"), "Input(UnderlyingReader) should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "The output decorated reader should not be null."); ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader")); - if (ctx->IsRuntime()) { - framework::VarDesc* in_reader = boost::get( - ctx->GetInputVarPtrs("UnderlyingReader")[0]); - framework::VarDesc* out_reader = - boost::get(ctx->GetOutputVarPtrs("Out")[0]); - out_reader->SetLoDLevels(in_reader->GetLoDLevels()); - } + framework::VarDesc* in_reader = boost::get( + ctx->GetInputVarPtrs("UnderlyingReader")[0]); + framework::VarDesc* out_reader = + boost::get(ctx->GetOutputVarPtrs("Out")[0]); + out_reader->SetLoDLevels(in_reader->GetLoDLevels()); } void DecoratedReaderInferVarType::operator()( const framework::OpDesc& op_desc, framework::BlockDesc* block) const { diff --git a/paddle/fluid/operators/reader/reader_op_registry.h b/paddle/fluid/operators/reader/reader_op_registry.h index d1f0498f4692247cda72fbcbdd5070ddfaa11553..58f9b4ba35546571fd3b1d0c3ce128f18e248f01 100644 --- a/paddle/fluid/operators/reader/reader_op_registry.h +++ b/paddle/fluid/operators/reader/reader_op_registry.h @@ -21,6 +21,20 @@ namespace paddle { namespace operators { namespace reader { +using FileReaderCreator = std::function&)>; + +std::unordered_map& FileReaderRegistry(); + +template +int RegisterFileReader(const std::string& filetype) { + FileReaderRegistry()[filetype] = []( + const std::string& fn, const std::vector& dim) { + return new Reader(fn, dim); + }; + return 0; +} + extern std::vector RestoreShapes( const std::vector& shape_concat, const std::vector& ranks); @@ -73,3 +87,15 @@ class DecoratedReaderMakerBase : public framework::OpProtoAndCheckerMaker { paddle::operators::reader::DecoratedReaderInferShape, \ paddle::framework::EmptyGradOpMaker, \ paddle::operators::reader::DecoratedReaderInferVarType) + +#define REGISTER_FILE_READER(_filetype, _reader) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + _reg_file_reader_##_filetype, \ + "Must use REGISTER_FILE_READER in global namespace"); \ + int TouchFileReader##_filetype() { return 0; } \ + int _reg_file_reader_entry_##filetype = \ + paddle::operators::reader::RegisterFileReader<_reader>(#_filetype) + +#define USE_FILE_READER(filetype) \ + extern int TouchFileReader##filetype(); \ + static int _use_##filetype = TouchFileReader##filetype() diff --git a/paddle/fluid/operators/reduce_op.cc b/paddle/fluid/operators/reduce_op.cc index 69e8f8081e93cb74177eac1a57e0eaf284951e3f..7879367830216cdd875f9f95f95e2a88f282ac64 100644 --- a/paddle/fluid/operators/reduce_op.cc +++ b/paddle/fluid/operators/reduce_op.cc @@ -173,6 +173,15 @@ class ReduceMinOpMaker : public ReduceOpMaker { } }; +class ReduceProdOpMaker : public ReduceOpMaker { + public: + ReduceProdOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : ReduceOpMaker(proto, op_checker) { + SetComment("ReduceProd", "production"); + AddComment(comment_); + } +}; + } // namespace operators } // namespace paddle @@ -190,6 +199,9 @@ REGISTER_OP(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker, reduce_max_grad, REGISTER_OP(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker, reduce_min_grad, ops::ReduceGradOp); +REGISTER_OP(reduce_prod, ops::ReduceOp, ops::ReduceProdOpMaker, + reduce_prod_grad, ops::ReduceGradOp); + #define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor) \ REGISTER_OP_CPU_KERNEL(reduce_type, \ ops::ReduceKernel + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { + y.device(place) = x.prod(dim); + } +}; + +struct ProdGradFunctor { + template + void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, + const Dim& dim, int size) { + dx.device(place) = dy.broadcast(dim) * y.broadcast(dim) * x.inverse(); + } +}; + template class ReduceKernel : public framework::OpKernel { public: @@ -254,4 +270,5 @@ class ReduceGradKernel : public framework::OpKernel { __macro(reduce_sum, SumFunctor, SumGradFunctor); \ __macro(reduce_mean, MeanFunctor, MeanGradFunctor); \ __macro(reduce_max, MaxFunctor, MaxOrMinGradFunctor); \ - __macro(reduce_min, MinFunctor, MaxOrMinGradFunctor); + __macro(reduce_min, MinFunctor, MaxOrMinGradFunctor); \ + __macro(reduce_prod, ProdFunctor, ProdGradFunctor); diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc index 3fb8b56d2676f90ff7e1cefa46c459ee37f63ca8..d6fd6214711f4ee66b1daffa4db2e84aa7201e79 100644 --- a/paddle/fluid/operators/scatter_op.cc +++ b/paddle/fluid/operators/scatter_op.cc @@ -23,24 +23,24 @@ class ScatterOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Ref"), - "Input(Ref) of ScatterOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Index"), - "Input(Index) of ScatterOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ScatterOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Ids"), + "Input(Ids) of ScatterOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Updates"), "Input(Updates) of ScatterOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of ScatterOp should not be null."); auto updates_dims = ctx->GetInputDim("Updates"); - auto ref_dims = ctx->GetInputDim("Ref"); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Index").size(), 1, - "Update Index should be 1-D."); + auto ref_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Ids").size(), 1, + "Update Ids should be 1-D."); PADDLE_ENFORCE_EQ(ref_dims.size(), updates_dims.size(), - "Reference and Updates should have the same shape size"); + "Xerence and Updates should have the same shape size"); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Updates")[0], - ctx->GetInputDim("Index")[0], - "Updates and Index should have same batch-size."); + ctx->GetInputDim("Ids")[0], + "Updates and Ids should have same batch-size."); framework::DDim data_dim(updates_dims); for (int i = 1; i < data_dim.size(); ++i) { PADDLE_ENFORCE_EQ(data_dim[i], updates_dims[i]); @@ -52,7 +52,7 @@ class ScatterOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("Ref")->type()), + framework::ToDataType(ctx.Input("X")->type()), ctx.device_context()); } }; @@ -64,14 +64,14 @@ class ScatterGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { ctx->SetOutputDim(framework::GradVarName("Updates"), ctx->GetInputDim("Updates")); - ctx->SetOutputDim(framework::GradVarName("Ref"), ctx->GetInputDim("Ref")); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("Ref")->type()), + framework::ToDataType(ctx.Input("X")->type()), ctx.device_context()); } }; @@ -80,9 +80,8 @@ class ScatterOpMaker : public framework::OpProtoAndCheckerMaker { public: ScatterOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("Ref", "The source input of scatter op"); - AddInput("Index", - "The index input of scatter op where Ref will be updated"); + AddInput("X", "The source input of scatter op"); + AddInput("Ids", "The index input of scatter op where X will be updated"); AddInput("Updates", "The updated value of updates op"); AddOutput("Out", "The output of add op"); AddComment(R"DOC( @@ -91,8 +90,8 @@ Scatter Operator. This operator obtains output by updating the input on selected indices on the first axis: $$ -Out = Ref \\ -Out[Index] = Ref[Index] + Updates +Out = X \\ +Out[Ids] = X[Ids] + Updates $$ )DOC"); diff --git a/paddle/fluid/operators/scatter_op.cu b/paddle/fluid/operators/scatter_op.cu index bdabb29fa680f8f87873b4381acf0dbd2b6195d0..ef7d700659d8d713715a10910baf739954ba0786 100644 --- a/paddle/fluid/operators/scatter_op.cu +++ b/paddle/fluid/operators/scatter_op.cu @@ -25,14 +25,14 @@ class ScatterOpCUDAKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext &ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "This kernel only runs on GPU device."); - auto *Ref = ctx.Input("Ref"); - auto *Index = ctx.Input("Index"); + auto *X = ctx.Input("X"); + auto *Ids = ctx.Input("Ids"); auto *Updates = ctx.Input("Updates"); auto *Out = ctx.Output("Out"); - Out->ShareDataWith(*Ref); + Out->ShareDataWith(*X); - GPUScatterAssign(ctx.device_context(), *Updates, *Index, Out); + GPUScatterAssign(ctx.device_context(), *Updates, *Ids, Out); } }; @@ -42,16 +42,16 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext &ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "This kernel only runs on GPU device."); - auto *dRef = ctx.Output(framework::GradVarName("Ref")); + auto *dX = ctx.Output(framework::GradVarName("X")); auto *dUpdates = ctx.Output(framework::GradVarName("Updates")); - auto *Index = ctx.Input("Index"); + auto *Ids = ctx.Input("Ids"); auto *dOut = ctx.Input(framework::GradVarName("Out")); - // In place gradient: dRef = dO - dRef->ShareDataWith(*dOut); + // In place gradient: dX = dO + dX->ShareDataWith(*dOut); dUpdates->mutable_data(ctx.GetPlace()); - // Gradient by Gather: dUpdates = dO[Index] - GPUGather(ctx.device_context(), *dOut, *Index, dUpdates); + // Gradient by Gather: dUpdates = dO[Ids] + GPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); } }; diff --git a/paddle/fluid/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h index 3c6e7ece320229e1a311ef6d7a27387d40be3c2a..2151d8a9240fc88966533f4a07d5cf56b6c1c3bc 100644 --- a/paddle/fluid/operators/scatter_op.h +++ b/paddle/fluid/operators/scatter_op.h @@ -29,15 +29,15 @@ class ScatterOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext &ctx) const override { PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), "This kernel only runs on CPU."); - auto *Ref = ctx.Input("Ref"); - auto *Index = ctx.Input("Index"); + auto *X = ctx.Input("X"); + auto *Ids = ctx.Input("Ids"); auto *Updates = ctx.Input("Updates"); auto *Out = ctx.Output("Out"); - // In place output: Out = Ref, Out[Index] += Updates - Out->ShareDataWith(*Ref); + // In place output: Out = X, Out[Ids] += Updates + Out->ShareDataWith(*X); // Apply ScatterUpdate: Out[index] += Updates[:] - ScatterAssign(ctx.device_context(), *Updates, *Index, Out); + ScatterAssign(ctx.device_context(), *Updates, *Ids, Out); } }; @@ -47,16 +47,16 @@ class ScatterGradientOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext &ctx) const override { PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), "This kernel only runs on CPU."); - auto *dRef = ctx.Output(framework::GradVarName("Ref")); + auto *dX = ctx.Output(framework::GradVarName("X")); auto *dUpdates = ctx.Output(framework::GradVarName("Updates")); - auto *Index = ctx.Input("Index"); + auto *Ids = ctx.Input("Ids"); auto *dOut = ctx.Input(framework::GradVarName("Out")); - // In place gradient: dRef = dO - dRef->ShareDataWith(*dOut); + // In place gradient: dX = dO + dX->ShareDataWith(*dOut); dUpdates->mutable_data(ctx.GetPlace()); - // Gradient by Gather: dUpdates += dO[Index] - CPUGather(ctx.device_context(), *dOut, *Index, dUpdates); + // Gradient by Gather: dUpdates += dO[Ids] + CPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); } }; diff --git a/paddle/fluid/operators/select_op.cc b/paddle/fluid/operators/select_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8344a239df7b3fcbe91f91a17a3c5958013b55a6 --- /dev/null +++ b/paddle/fluid/operators/select_op.cc @@ -0,0 +1,414 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include "paddle/fluid/framework/channel.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/concurrency/channel_util.h" + +namespace paddle { +namespace operators { + +static constexpr char kX[] = "X"; +static constexpr char kCaseToExecute[] = "case_to_execute"; + +static constexpr char kCases[] = "cases"; +static constexpr char kCasesBlock[] = "sub_block"; + +class SelectOp : public framework::OperatorBase { + public: + SelectOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + + private: + enum class SelectOpCaseType { + DEFAULT = 0, + SEND = 1, + RECEIVE = 2, + }; + + struct SelectOpCase { + int caseIndex; + SelectOpCaseType caseType; + std::string channelName; + std::string varName; + + SelectOpCase() {} + + SelectOpCase(int caseIndex, SelectOpCaseType caseType, + std::string channelName, std::string varName) + : caseIndex(caseIndex), + caseType(caseType), + channelName(channelName), + varName(varName) {} + }; + + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + std::vector casesConfigs = + Attr>(kCases); + + framework::BlockDesc *casesBlock = + Attr(kCasesBlock); + + framework::Scope &casesBlockScope = scope.NewScope(); + + std::string caseToExecuteVarName = Input(kCaseToExecute); + framework::Variable *caseToExecuteVar = + casesBlockScope.FindVar(caseToExecuteVarName); + + // Construct cases from "conditional_block_op"(s) in the casesBlock + std::vector> cases = + ParseAndShuffleCases(&casesConfigs); + + // Get all unique channels involved in select + std::set channelsSet; + for (auto c : cases) { + if (!c->channelName.empty()) { + auto channelVar = scope.FindVar(c->channelName); + framework::ChannelHolder *ch = + channelVar->GetMutable(); + + if (channelsSet.find(ch) == channelsSet.end()) { + channelsSet.insert(ch); + } + } + } + + // Order all channels by their pointer address + std::vector channels(channelsSet.begin(), + channelsSet.end()); + std::sort(channels.begin(), channels.end()); + + // Poll all cases + int32_t caseToExecute = pollCases(&scope, &cases, channels); + + // At this point, the case to execute has already been determined, + // so we can proceed with executing the cases block + framework::LoDTensor *caseToExecuteTensor = + caseToExecuteVar->GetMutable(); + caseToExecuteTensor->data()[0] = caseToExecute; + + // Execute the cases block, only one case will be executed since we set the + // case_to_execute value to the index of the case we want to execute + framework::Executor executor(dev_place); + framework::ProgramDesc *program = casesBlock->Program(); + executor.Run(*program, &casesBlockScope, casesBlock->ID(), + false /*create_local_scope*/); + } + + /** + * Goes through all operators in the casesConfigs and processes + * "conditional_block" operators. These operators are mapped to our + * SelectOpCase objects. We randomize the case orders, and set the + * default case (if any exists) as the last case) + * @param casesBlock + * @return + */ + std::vector> ParseAndShuffleCases( + std::vector *casesConfigs) const { + std::vector> cases; + std::shared_ptr defaultCase; + + if (casesConfigs != nullptr) { + boost::char_delimiters_separator sep(false, ",", ""); + for (std::vector::iterator itr = casesConfigs->begin(); + itr < casesConfigs->end(); ++itr) { + std::string caseConfig = *itr; + boost::tokenizer<> tokens(caseConfig, sep); + + boost::tokenizer<>::iterator tok_iter = tokens.begin(); + PADDLE_ENFORCE(tok_iter != tokens.end(), "Cannot get case index"); + std::string caseIndexString = *tok_iter; + int caseIndex = std::stoi(caseIndexString); + + ++tok_iter; + PADDLE_ENFORCE(tok_iter != tokens.end(), "Cannot get case type"); + std::string caseTypeString = *tok_iter; + SelectOpCaseType caseType = (SelectOpCaseType)std::stoi(caseTypeString); + + std::string caseChannel; + std::string caseChannelVar; + + ++tok_iter; + if (caseType != SelectOpCaseType::DEFAULT) { + PADDLE_ENFORCE(tok_iter != tokens.end(), "Cannot get case channel"); + caseChannel = *tok_iter; + + ++tok_iter; + PADDLE_ENFORCE(tok_iter != tokens.end(), + "Cannot get case channel variable"); + caseChannelVar = *tok_iter; + } + + auto c = std::make_shared(caseIndex, caseType, + caseChannel, caseChannelVar); + + if (caseType == SelectOpCaseType::DEFAULT) { + PADDLE_ENFORCE(defaultCase == nullptr, + "Select can only contain one default case."); + defaultCase = c; + } else { + cases.push_back(c); + } + } + } + + // Randomly sort cases, with default case being last + std::random_shuffle(cases.begin(), cases.end()); + if (defaultCase != nullptr) { + cases.push_back(defaultCase); + } + + return cases; + } + + /** + * This method will recursively poll the cases and determines if any case + * condition is true. + * If none of the cases conditions are true (and there is no default case), + * then block + * the thread. The thread may be woken up by a channel operation, at which + * point we + * execute the case. + * @param scope + * @param cases + * @param channels + * @return + */ + int32_t pollCases(const framework::Scope *scope, + std::vector> *cases, + std::vector channels) const { + // Lock all involved channels + lockChannels(channels); + + std::atomic caseToExecute(-1); + + std::vector>::iterator it = cases->begin(); + while (it != cases->end()) { + std::shared_ptr c = *it; + + auto chVar = scope->FindVar(c->channelName); + framework::ChannelHolder *ch = + chVar->GetMutable(); + + switch (c->caseType) { + case SelectOpCaseType::SEND: + PADDLE_ENFORCE(!ch->IsClosed(), "Cannot send to a closed channel"); + if (ch->CanSend()) { + // We can send to channel directly, send the data to channel + // and execute case + auto chVar = scope->FindVar(c->varName); + concurrency::ChannelSend(ch, chVar); + caseToExecute = c->caseIndex; + } + break; + case SelectOpCaseType::RECEIVE: + if (ch->CanReceive()) { + // We can receive from channel directly, send the data to channel + // and execute case + auto chVar = scope->FindVar(c->varName); + concurrency::ChannelReceive(ch, chVar); + caseToExecute = c->caseIndex; + } + break; + case SelectOpCaseType::DEFAULT: + caseToExecute = c->caseIndex; + break; + } + + if (caseToExecute != -1) { + // We found a case to execute, stop looking at other case statements + break; + } + + ++it; + } + + if (caseToExecute == -1) { + // None of the cases are eligible to execute, enqueue current thread + // into all the sending/receiving queue of each involved channel + std::atomic completed(false); + std::recursive_mutex mutex; + std::unique_lock lock{mutex}; + // std::condition_variable_any selectCond; + auto selectCond = std::make_shared(); + + std::recursive_mutex callbackMutex; + pushThreadOnChannelQueues(scope, cases, selectCond, caseToExecute, + completed, callbackMutex); + + // TODO(thuan): Atomically unlock all channels and sleep current thread + unlockChannels(channels); + selectCond->wait(lock, [&completed]() { return completed.load(); }); + + // Select has been woken up by case operation + lockChannels(channels); + removeThreadOnChannelQueues(scope, cases); + + if (caseToExecute == -1) { + // Recursively poll cases, since we were woken up by a channel close + // TODO(thuan): Need to test if this is a valid case + unlockChannels(channels); + return pollCases(scope, cases, channels); + } + } + + // At this point, caseToExecute != -1, and we can proceed with executing + // the case block + unlockChannels(channels); + + return caseToExecute; + } + + void lockChannels(std::vector chs) const { + std::vector::iterator it = chs.begin(); + while (it != chs.end()) { + framework::ChannelHolder *ch = *it; + ch->Lock(); + ++it; + } + } + + void unlockChannels(std::vector chs) const { + std::vector::reverse_iterator it = chs.rbegin(); + while (it != chs.rend()) { + framework::ChannelHolder *ch = *it; + ch->Unlock(); + ++it; + } + } + + void pushThreadOnChannelQueues( + const framework::Scope *scope, + std::vector> *cases, + std::shared_ptr rCond, + std::atomic &caseToExecute, std::atomic &completed, + std::recursive_mutex &callbackMutex) const { + std::vector>::iterator it = cases->begin(); + while (it != cases->end()) { + std::shared_ptr c = *it; + + auto chVar = scope->FindVar(c->channelName); + framework::ChannelHolder *ch = + chVar->GetMutable(); + + std::function cb = + [&caseToExecute, &completed, &callbackMutex, + c](framework::ChannelAction channelAction) { + std::lock_guard lock{callbackMutex}; + + bool canProcess = false; + if (!completed) { + // If the channel wasn't closed, we set the caseToExecute index + // as this current case + if (channelAction != framework::ChannelAction::CLOSE) { + caseToExecute = c->caseIndex; + } + // This will allow our conditional variable to break out of wait + completed = true; + canProcess = true; + } + + return canProcess; + }; + + switch (c->caseType) { + case SelectOpCaseType::SEND: { + auto chOutputVar = scope->FindVar(c->varName); + concurrency::ChannelAddToSendQ(ch, this, chOutputVar, rCond, cb); + break; + } + case SelectOpCaseType::RECEIVE: { + auto chOutputVar = scope->FindVar(c->varName); + concurrency::ChannelAddToReceiveQ(ch, this, chOutputVar, rCond, cb); + break; + } + default: + break; + } + ++it; + } + } + + void removeThreadOnChannelQueues( + const framework::Scope *scope, + std::vector> *cases) const { + std::vector>::iterator it = cases->begin(); + while (it != cases->end()) { + std::shared_ptr c = *it; + + auto chVar = scope->FindVar(c->channelName); + framework::ChannelHolder *ch = + chVar->GetMutable(); + switch (c->caseType) { + case SelectOpCaseType::SEND: { + ch->RemoveFromSendQ(this); + break; + } + case SelectOpCaseType::RECEIVE: { + ch->RemoveFromReceiveQ(this); + break; + } + default: + break; + } + ++it; + } + } +}; + +class SelectOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SelectOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput(kX, + "A set of variables, which are required by operators inside the " + "cases of Select Op") + .AsDuplicable(); + AddInput(kCaseToExecute, + "(Int) The variable the sets the index of the case to execute, " + "after evaluating the channels being sent to and received from") + .AsDuplicable(); + AddAttr>(kCases, + "(String vector) Serialized list of" + "all cases in the select op. Each" + "case is serialized as: " + "',,,'" + "where type is 0 for default, 1 for" + "send, and 2 for receive" + "No channel and values are needed for" + "default cases."); + AddAttr(kCasesBlock, + "The cases block inside select_op"); + AddComment(R"DOC( +)DOC"); + } +}; + +// TODO(thuan): Implement Gradient Operator for SELECT_OP + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(select, paddle::operators::SelectOp, + paddle::framework::EmptyGradOpMaker, + paddle::operators::SelectOpMaker); diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index 8fdd08eae6b22cd57506d6e75182c1a7e2022562..443f40e803ea31c3961ed77842bd0775e0f74f35 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -88,6 +88,12 @@ class SendOp : public framework::OperatorBase { rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]); } PADDLE_ENFORCE(rpc_client->Wait()); + // tell pservers that current trainer have called fetch + for (auto& ep : endpoints) { + VLOG(3) << "send fetch barrier, ep: " << ep; + rpc_client->AsyncSendFetchBarrier(ep); + } + PADDLE_ENFORCE(rpc_client->Wait()); } } }; diff --git a/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc b/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..5661f4b42f37fed7f589c515e25fd66cfcede2c7 --- /dev/null +++ b/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc @@ -0,0 +1,105 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/softmax.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class SequenceSoftmaxCUDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + auto lod = x->lod(); + auto dims = x->dims(); + + const size_t level = lod.size() - 1; + PADDLE_ENFORCE_EQ(dims[0], static_cast(lod[level].back()), + "The first dimension of Input(X) should be equal to the " + "sum of all sequences' lengths."); + PADDLE_ENFORCE_EQ(dims[0], x->numel(), + "The width of each timestep in Input(X) of " + "SequenceSoftmaxOp should be 1."); + + out->mutable_data(ctx.GetPlace()); + for (int i = 0; i < static_cast(lod[level].size()) - 1; ++i) { + int start_pos = static_cast(lod[level][i]); + int end_pos = static_cast(lod[level][i + 1]); + Tensor x_i = x->Slice(start_pos, end_pos); + Tensor out_i = out->Slice(start_pos, end_pos); + + // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos) + framework::DDim dims_i = + // framework::make_ddim({1UL, end_pos - start_pos, 1UL, 1UL}); + framework::make_ddim({1UL, end_pos - start_pos}); + x_i.Resize(dims_i); + out_i.Resize(dims_i); + math::SoftmaxCUDNNFunctor()( + ctx.template device_context(), &x_i, + &out_i); + } + } +}; + +template +class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* out_grad = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); + auto* x_grad = ctx.Output(framework::GradVarName("X")); + + auto lod = x->lod(); + const size_t level = lod.size() - 1; + + x_grad->mutable_data(ctx.GetPlace()); + for (int i = 0; i < static_cast(lod[level].size()) - 1; ++i) { + int start_pos = static_cast(lod[level][i]); + int end_pos = static_cast(lod[level][i + 1]); + + Tensor out_i = out->Slice(start_pos, end_pos); + Tensor out_grad_i = out_grad->Slice(start_pos, end_pos); + Tensor x_grad_i = x_grad->Slice(start_pos, end_pos); + + // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos) + framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos}); + out_i.Resize(dims_i); + out_grad_i.Resize(dims_i); + x_grad_i.Resize(dims_i); + math::SoftmaxGradCUDNNFunctor()( + ctx.template device_context(), &out_i, + &out_grad_i, &x_grad_i); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_KERNEL(sequence_softmax, CUDNN, ::paddle::platform::CUDAPlace, + ops::SequenceSoftmaxCUDNNKernel, + ops::SequenceSoftmaxCUDNNKernel) +REGISTER_OP_KERNEL(sequence_softmax_grad, CUDNN, ::paddle::platform::CUDAPlace, + ops::SequenceSoftmaxGradCUDNNKernel, + ops::SequenceSoftmaxGradCUDNNKernel) diff --git a/paddle/fluid/operators/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_softmax_op.cc index 7e685eb3dc7b12ef38f06b37d99a1212cfbc992c..e8b4df04286d327f568f4c43886f9fcf89cc4a88 100644 --- a/paddle/fluid/operators/sequence_softmax_op.cc +++ b/paddle/fluid/operators/sequence_softmax_op.cc @@ -29,6 +29,29 @@ class SequenceSoftmaxOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->ShareLoD("X", /*->*/ "Out"); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + // choose cudnn kernel if the runtime supported. + bool use_cudnn = ctx.Attr("use_cudnn"); + bool runtime_cudnn_support = false; +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + auto& dev_ctx = + ctx.template device_context(); + runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false; + } +#endif + framework::LibraryType library_ = framework::LibraryType::kPlain; + if (use_cudnn && runtime_cudnn_support) { + library_ = framework::LibraryType::kCUDNN; + } + std::string data_format = ctx.Attr("data_format"); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + framework::StringToDataLayout(data_format), library_); + } }; class SequenceSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { @@ -41,6 +64,17 @@ class SequenceSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(LoDTensor) 1-D or 2-D output LoDTensor with the 2-nd dimension " "of length 1."); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); AddComment(R"DOC( Sequence Softmax Operator. @@ -91,6 +125,29 @@ class SequenceSoftmaxGradOp : public framework::OperatorWithKernel { ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + // choose cudnn kernel if the runtime supported. + bool use_cudnn = ctx.Attr("use_cudnn"); + bool runtime_cudnn_support = false; +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + auto& dev_ctx = + ctx.template device_context(); + runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false; + } +#endif + framework::LibraryType library_ = framework::LibraryType::kPlain; + if (use_cudnn && runtime_cudnn_support) { + library_ = framework::LibraryType::kCUDNN; + } + std::string data_format = ctx.Attr("data_format"); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + framework::StringToDataLayout(data_format), library_); + } }; } // namespace operators @@ -102,7 +159,9 @@ REGISTER_OP(sequence_softmax, ops::SequenceSoftmaxOp, ops::SequenceSoftmaxGradOp); REGISTER_OP_CPU_KERNEL( sequence_softmax, - ops::SequenceSoftmaxKernel); + ops::SequenceSoftmaxKernel, + ops::SequenceSoftmaxKernel); REGISTER_OP_CPU_KERNEL( sequence_softmax_grad, - ops::SequenceSoftmaxGradKernel); + ops::SequenceSoftmaxGradKernel, + ops::SequenceSoftmaxGradKernel); diff --git a/paddle/fluid/operators/sequence_softmax_op.cu.cc b/paddle/fluid/operators/sequence_softmax_op.cu.cc index 295c68c5b936d6522666a4cc4e621db6f5f5f3ed..57adea3a1b9dbcbb5787d005e4d3ec595f61d4b2 100644 --- a/paddle/fluid/operators/sequence_softmax_op.cu.cc +++ b/paddle/fluid/operators/sequence_softmax_op.cu.cc @@ -17,7 +17,10 @@ limitations under the License. */ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( sequence_softmax, - ops::SequenceSoftmaxKernel) + ops::SequenceSoftmaxKernel, + ops::SequenceSoftmaxKernel) REGISTER_OP_CUDA_KERNEL( sequence_softmax_grad, - ops::SequenceSoftmaxGradKernel); + ops::SequenceSoftmaxGradKernel, + ops::SequenceSoftmaxGradKernel); diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.cc b/paddle/fluid/operators/softmax_cudnn_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..47cb336d87f8627d86ac33d6ac32c04d5d93f753 --- /dev/null +++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc @@ -0,0 +1,62 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class SoftmaxCUDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* Out = context.Output("Out"); + + // allocate memory on device. + Out->mutable_data(context.GetPlace()); + + math::SoftmaxCUDNNFunctor()( + context.template device_context(), X, Out); + } +}; + +template +class SoftmaxGradCUDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* Out = context.Input("Out"); + auto* dOut = context.Input(framework::GradVarName("Out")); + auto* dX = context.Output(framework::GradVarName("X")); + + // allocate memory on device. + dX->mutable_data(context.GetPlace()); + + math::SoftmaxGradCUDNNFunctor()( + context.template device_context(), Out, + dOut, dX); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_KERNEL(softmax, CUDNN, ::paddle::platform::CUDAPlace, + ops::SoftmaxCUDNNKernel); +REGISTER_OP_KERNEL(softmax_grad, CUDNN, ::paddle::platform::CUDAPlace, + ops::SoftmaxGradCUDNNKernel); diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 09275ef290e8c78dc0902033e904cc4e7ccd7adb..1b63f8a499e5d20d2f10c3cd1024d1bcf78764d4 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -33,6 +33,29 @@ class SoftmaxOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", x_dims); ctx->ShareLoD("X", /*->*/ "Out"); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + // choose cudnn kernel if the runtime supported. + bool use_cudnn = ctx.Attr("use_cudnn"); + bool runtime_cudnn_support = false; +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + auto& dev_ctx = + ctx.template device_context(); + runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false; + } +#endif + framework::LibraryType library_ = framework::LibraryType::kPlain; + if (use_cudnn && runtime_cudnn_support) { + library_ = framework::LibraryType::kCUDNN; + } + std::string data_format = ctx.Attr("data_format"); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + framework::StringToDataLayout(data_format), library_); + } }; class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { @@ -43,6 +66,17 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { "The input tensor of softmax. " "2-D with shape [batch_size, input_feature_dimensions]."); AddOutput("Out", "The normalized values with the same shape as X."); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); AddComment(R"DOC( Softmax Operator. @@ -80,6 +114,29 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + // choose cudnn kernel if the runtime supported. + bool use_cudnn = ctx.Attr("use_cudnn"); + bool runtime_cudnn_support = false; +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + auto& dev_ctx = + ctx.template device_context(); + runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false; + } +#endif + framework::LibraryType library_ = framework::LibraryType::kPlain; + if (use_cudnn && runtime_cudnn_support) { + library_ = framework::LibraryType::kCUDNN; + } + std::string data_format = ctx.Attr("data_format"); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + framework::StringToDataLayout(data_format), library_); + } }; } // namespace operators diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 39b246a5bedb2819fc9b7fc407cfe03e59af0b68..8f7840cee1dd95a828fd4ac8815e335a5db47e3d 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -23,21 +23,21 @@ using Tensor = framework::Tensor; namespace { template -__global__ void CrossEntropyGrad(T* logit_grad, const T* loss_grad, - const int64_t* labels, const int batch_size, - const int class_num) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int sample_idx = tid / class_num; - - if (tid < batch_size) { - PADDLE_ASSERT(labels[sample_idx] >= 0 && labels[sample_idx] < class_num); - logit_grad[tid * class_num + labels[tid]] -= static_cast(1.); +__global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels, + const int batch_size, const int class_num) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < batch_size; + i += blockDim.x * gridDim.x) { + int idx = i * class_num + labels[i]; + logit_grad[idx] -= static_cast(1.); } +} - __syncthreads(); - - if (tid < batch_size * class_num) { - logit_grad[tid] *= loss_grad[sample_idx]; +template +__global__ void Scale(T* logit_grad, const T* loss_grad, const int num, + const int class_num) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; + i += blockDim.x * gridDim.x) { + logit_grad[i] *= loss_grad[i / class_num]; } } @@ -94,22 +94,22 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { const int batch_size = logit_grad->dims()[0]; const int class_num = logit_grad->dims()[1]; int block = 512; - int grid = (batch_size * class_num + block - 1) / block; + auto stream = context.cuda_device_context().stream(); if (context.Attr("soft_label")) { + int grid = (batch_size * class_num + block - 1) / block; const T* label_data = labels->data(); - SoftCrossEntropyGradientKernel< - T><<() - .stream()>>>(logit_grad_data, loss_grad_data, label_data, - batch_size, class_num); + SoftCrossEntropyGradientKernel<<>>( + logit_grad_data, loss_grad_data, label_data, batch_size, class_num); } else { + int grid = (batch_size + block - 1) / block; const int64_t* label_data = labels->data(); - CrossEntropyGrad< - T><<() - .stream()>>>(logit_grad_data, loss_grad_data, label_data, - batch_size, class_num); + CrossEntropyGrad<<>>( + logit_grad_data, label_data, batch_size, class_num); + int num = batch_size * class_num; + grid = (num + block - 1) / block; + Scale<<>>(logit_grad_data, loss_grad_data, num, + class_num); } } }; diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index a1a743d94e204ca506c994f8fabb6bbf8c22cea5..7eec6ab657723c6390dfa14a78d6c49a76f2a279 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -48,7 +48,6 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_ nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context) -nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context) cc_library(device_tracer SRCS device_tracer.cc DEPS profiler_proto ${GPU_CTX_DEPS}) cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index 1842ecd745e3f5cb75600ce00d89018f81682632..7e001ecc56173db76e8c576e7efd66f41192f292 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/macros.h" namespace paddle { @@ -80,6 +81,22 @@ enum class PoolingMode { template class CudnnDataType; +template <> +class CudnnDataType { + public: + static const cudnnDataType_t type = CUDNN_DATA_HALF; + // The scaling param type is float for HALF and FLOAT tensors + typedef const float ScalingParamType; + static ScalingParamType* kOne() { + static ScalingParamType v = 1.0; + return &v; + } + static ScalingParamType* kZero() { + static ScalingParamType v = 0.0; + return &v; + } +}; + template <> class CudnnDataType { public: @@ -289,7 +306,7 @@ inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) { use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace()); #ifdef PADDLE_WITH_CUDA if (use_cudnn) { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.device_context(); use_cudnn &= dev_ctx.cudnn_handle() != nullptr; } #endif diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index bb9fbd468f38fffc94107e321e777fc0e772fbe6..98b4178177b0a8bafd6fe34a92be2a07a2fbc5a7 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -127,6 +127,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { SetDeviceId(place_.device); + compute_capability = GetCUDAComputeCapability(place_.device); multi_process = GetCUDAMultiProcessors(place_.device); max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device); PADDLE_ENFORCE(cudaStreamCreate(&stream_)); @@ -162,6 +163,10 @@ void CUDADeviceContext::Wait() const { PADDLE_ENFORCE(cudaGetLastError()); } +int CUDADeviceContext::GetComputeCapability() const { + return compute_capability; +} + int CUDADeviceContext::GetMaxPhysicalThreadCount() const { return multi_process * max_threads_per_mp; } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index e779644190de1246cd650fbf91eeaeb03494643f..603b890af13b529c490c29112a73a09cc815d07a 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -79,6 +79,9 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Return place in the device context. */ Place GetPlace() const override; + /*! \brief Return compute capability in the device context. */ + int GetComputeCapability() const; + /*! \brief Return the max physical thread count in the device context */ int GetMaxPhysicalThreadCount() const; @@ -104,6 +107,7 @@ class CUDADeviceContext : public DeviceContext { cudnnHandle_t cudnn_handle_; cublasHandle_t cublas_handle_; + int compute_capability; int multi_process; int max_threads_per_mp; }; diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index 78e00d5420bbea40c9bea4be919ec4ce5ececdcb..3b4437f576e1c2e931a86ec6d5e823ec1f344c52 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -26,8 +26,14 @@ limitations under the License. */ namespace paddle { namespace platform { namespace { +// Current thread's id. Note, we don't distinguish nested threads +// for now. +thread_local int cur_thread_id = 0; +// Tracking the nested block stacks of each thread. +thread_local std::deque block_id_stack; +// Tracking the nested event stacks. +thread_local std::deque annotation_stack; -thread_local const char *cur_annotation = nullptr; std::once_flag tracer_once_flag; DeviceTracer *tracer = nullptr; } // namespace @@ -191,19 +197,19 @@ class DeviceTracerImpl : public DeviceTracer { correlations_[id] = anno; } - void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) { - if (!anno) { - // TODO(panyx0718): Currently, it doesn't support nested situation - // Up-level can be cleared by low-level and therefore get nullptr - // here. + void AddCPURecords(const std::string &anno, uint64_t start_ns, + uint64_t end_ns, int64_t device_id, int64_t thread_id) { + if (anno.empty()) { + VLOG(1) << "Empty timeline annotation."; return; } std::lock_guard l(trace_mu_); - cpu_records_.push_back(CPURecord{anno, start_ns, end_ns, 0}); + cpu_records_.push_back( + CPURecord{anno, start_ns, end_ns, device_id, thread_id}); } void AddMemRecords(const std::string &name, uint64_t start_ns, - uint64_t end_ns, uint32_t device_id, uint32_t stream_id, + uint64_t end_ns, int64_t device_id, int64_t stream_id, uint32_t correlation_id, uint64_t bytes) { // 0 means timestamp information could not be collected for the kernel. if (start_ns == 0 || end_ns == 0) { @@ -215,8 +221,8 @@ class DeviceTracerImpl : public DeviceTracer { stream_id, correlation_id, bytes}); } - void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id, - uint32_t stream_id, uint32_t correlation_id) { + void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id, + int64_t stream_id, uint32_t correlation_id) { // 0 means timestamp information could not be collected for the kernel. if (start == 0 || end == 0) { VLOG(3) << correlation_id << " cannot be traced"; @@ -270,27 +276,30 @@ class DeviceTracerImpl : public DeviceTracer { continue; } auto *event = profile_pb.add_events(); + event->set_type(proto::Event::GPUKernel); event->set_name(correlations_.at(r.correlation_id)); event->set_start_ns(r.start_ns); event->set_end_ns(r.end_ns); - event->set_stream_id(r.stream_id); + event->set_sub_device_id(r.stream_id); event->set_device_id(r.device_id); } for (const CPURecord &r : cpu_records_) { auto *event = profile_pb.add_events(); + event->set_type(proto::Event::CPU); event->set_name(r.name); event->set_start_ns(r.start_ns); event->set_end_ns(r.end_ns); - event->set_stream_id(r.thread_id); - event->set_device_id(-1); + event->set_sub_device_id(r.thread_id); + event->set_device_id(r.device_id); } for (const MemRecord &r : mem_records_) { auto *event = profile_pb.add_events(); + event->set_type(proto::Event::GPUKernel); event->set_name(r.name); event->set_start_ns(r.start_ns); event->set_end_ns(r.end_ns); - event->set_stream_id(r.stream_id); + event->set_sub_device_id(r.stream_id); event->set_device_id(r.device_id); event->mutable_memcopy()->set_bytes(r.bytes); } @@ -323,8 +332,9 @@ class DeviceTracerImpl : public DeviceTracer { if ((domain == CUPTI_CB_DOMAIN_DRIVER_API) && (cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)) { if (cbInfo->callbackSite == CUPTI_API_ENTER) { - const std::string anno = - cur_annotation ? cur_annotation : cbInfo->symbolName; + const std::string anno = !annotation_stack.empty() + ? annotation_stack.back() + : cbInfo->symbolName; tracer->AddAnnotation(cbInfo->correlationId, anno); } } else { @@ -351,14 +361,15 @@ class DeviceTracerDummy : public DeviceTracer { void AddAnnotation(uint64_t id, const std::string &anno) {} - void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {} + void AddCPURecords(const std::string &anno, uint64_t start_ns, + uint64_t end_ns, int64_t device_id, int64_t thread_id) {} void AddMemRecords(const std::string &name, uint64_t start_ns, - uint64_t end_ns, uint32_t device_id, uint32_t stream_id, + uint64_t end_ns, int64_t device_id, int64_t stream_id, uint32_t correlation_id, uint64_t bytes) {} - void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id, - uint32_t stream_id, uint32_t correlation_id) {} + void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id, + int64_t stream_id, uint32_t correlation_id) {} bool IsEnabled() { return false; } @@ -384,11 +395,28 @@ DeviceTracer *GetDeviceTracer() { return tracer; } -void SetCurAnnotation(const char *anno) { cur_annotation = anno; } +void SetCurAnnotation(const std::string &anno) { + annotation_stack.push_back(anno); +} + +void ClearCurAnnotation() { annotation_stack.pop_back(); } + +std::string CurAnnotation() { + if (annotation_stack.empty()) return ""; + return annotation_stack.back(); +} + +void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); } + +void ClearCurBlock() { block_id_stack.pop_back(); } + +int BlockDepth() { return block_id_stack.size(); } + +void SetCurThread(int thread_id) { cur_thread_id = thread_id; } -void ClearCurAnnotation() { cur_annotation = nullptr; } +void ClearCurThread() { cur_thread_id = 0; } -const char *CurAnnotation() { return cur_annotation; } +int CurThread() { return cur_thread_id; } } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h index 23f7cdbdffc9f48ac5555455bf745233c81dd0cb..deb3d23f786353b8e7a2f28d094e364158885a34 100644 --- a/paddle/fluid/platform/device_tracer.h +++ b/paddle/fluid/platform/device_tracer.h @@ -32,22 +32,23 @@ class DeviceTracer { struct KernelRecord { uint64_t start_ns; uint64_t end_ns; - uint32_t device_id; - uint32_t stream_id; + int64_t device_id; + int64_t stream_id; uint32_t correlation_id; }; struct CPURecord { std::string name; uint64_t start_ns; uint64_t end_ns; - uint64_t thread_id; + int64_t device_id; + int64_t thread_id; }; struct MemRecord { std::string name; uint64_t start_ns; uint64_t end_ns; - uint32_t device_id; - uint32_t stream_id; + int64_t device_id; + int64_t stream_id; uint32_t correlation_id; uint64_t bytes; }; @@ -64,18 +65,18 @@ class DeviceTracer { virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0; virtual void AddMemRecords(const std::string& name, uint64_t start_ns, - uint64_t end_ns, uint32_t device_id, - uint32_t stream_id, uint32_t correlation_id, + uint64_t end_ns, int64_t device_id, + int64_t stream_id, uint32_t correlation_id, uint64_t bytes) = 0; - virtual void AddCPURecords(const char* anno, uint64_t start_ns, - uint64_t end_ns) = 0; + virtual void AddCPURecords(const std::string& anno, uint64_t start_ns, + uint64_t end_ns, int64_t device_id, + int64_t thread_id) = 0; // Add a cuda kernel stats. `correlation_id` will be mapped to annotation // added before for human readability. - virtual void AddKernelRecords(uint64_t start, uint64_t end, - uint32_t device_id, uint32_t stream_id, - uint32_t correlation_id) = 0; + virtual void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id, + int64_t stream_id, uint32_t correlation_id) = 0; // Generate a proto after done (Disabled). virtual proto::Profile GenProfile(const std::string& profile_path) = 0; @@ -87,10 +88,18 @@ class DeviceTracer { DeviceTracer* GetDeviceTracer(); // Set a name for the cuda kernel operation being launched by the thread. -void SetCurAnnotation(const char* anno); +void SetCurAnnotation(const std::string& anno); // Clear the name after the operation is done. void ClearCurAnnotation(); // Current name of the operation being run in the thread. -const char* CurAnnotation(); +std::string CurAnnotation(); + +void SetCurBlock(int block_id); +void ClearCurBlock(); +int BlockDepth(); + +void SetCurThread(int thread_id); +void ClearCurThread(); +int CurThread(); } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index da4041bad0d82fe1c8c7a12fd0c7177e6dbddef3..dd70ff9ff574b32bc96a9e8255b1bf77a5cc84e4 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -33,6 +33,15 @@ int GetCUDADeviceCount() { return count; } +int GetCUDAComputeCapability(int id) { + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); + cudaDeviceProp device_prop; + PADDLE_ENFORCE(cudaGetDeviceProperties(&device_prop, id), + "cudaGetDeviceProperties failed in " + "paddle::platform::GetCUDAComputeCapability"); + return device_prop.major * 10 + device_prop.minor; +} + int GetCUDAMultiProcessors(int id) { PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); int count; diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h index c38ccf0f2ade1d2405177b541b33fd84283726ff..fa469fa77f5ca780da153cc87da8d04f239711f3 100644 --- a/paddle/fluid/platform/gpu_info.h +++ b/paddle/fluid/platform/gpu_info.h @@ -30,6 +30,9 @@ const std::string kEnvFractionGpuMemoryToUse = //! Get the total number of GPU devices in system. int GetCUDADeviceCount(); +//! Get the compute capability of the ith GPU (format: major * 10 + minor) +int GetCUDAComputeCapability(int i); + //! Get the MultiProcessors of the ith GPU. int GetCUDAMultiProcessors(int i); diff --git a/paddle/fluid/platform/nccl_test.cu b/paddle/fluid/platform/nccl_test.cu deleted file mode 100644 index 32a293796c09e5254c5eb48d11fa74617b3465ac..0000000000000000000000000000000000000000 --- a/paddle/fluid/platform/nccl_test.cu +++ /dev/null @@ -1,153 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "glog/logging.h" -#include "gtest/gtest.h" - -#include "paddle/fluid/framework/init.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/dynload/nccl.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" - -static int dev_count = 0; - -namespace paddle { -namespace platform { - -TEST(NCCL, init) { - std::vector comms; - comms.resize(dev_count); - PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr)); - - for (int i = 0; i < dev_count; ++i) { - dynload::ncclCommDestroy(comms[i]); - } -} - -template -struct PerThreadData { - thrust::device_vector send_buff; - thrust::device_vector recv_buff; - CUDADeviceContext dev_ctx; - - T* SendBuff() { return thrust::raw_pointer_cast(send_buff.data()); } - - T* RecvBuff() { return thrust::raw_pointer_cast(recv_buff.data()); } - - PerThreadData(int gpu_id, size_t size) : dev_ctx(CUDAPlace(gpu_id)) { - send_buff.resize(size); - for (size_t i = 0; i < size; ++i) { - send_buff[i] = static_cast(i); - } - recv_buff.resize(size); - } -}; - -static constexpr int ELEM_COUNT = 10000; - -TEST(NCCL, all_reduce) { - std::vector comms; - comms.resize(dev_count); - VLOG(1) << "Initializing ncclComm"; - dynload::ncclCommInitAll(comms.data(), dev_count, nullptr); - VLOG(1) << "ncclComm initialized"; - VLOG(1) << "Creating thread data"; - std::vector>> data; - data.reserve(dev_count); - for (int i = 0; i < dev_count; ++i) { - VLOG(1) << "Creating thread data for device " << i; - SetDeviceId(i); - data.emplace_back(new PerThreadData(i, ELEM_COUNT)); - } - VLOG(1) << "Thread data created"; - - VLOG(1) << "Check send_buf data"; - for (int i = 0; i < dev_count; ++i) { - VLOG(1) << "Check on device " << i; - SetDeviceId(i); - thrust::host_vector tmp = data[i]->send_buff; - for (size_t j = 0; j < tmp.size(); ++j) { - ASSERT_NEAR(static_cast(j), tmp[j], 1e-5); - } - } - - VLOG(1) << "Invoking ncclAllReduce"; - - dynload::ncclGroupStart(); - for (int i = 0; i < dev_count; ++i) { - VLOG(1) << "Invoking ncclAllReduce with device " << i; - SetDeviceId(i); - PADDLE_ENFORCE(dynload::ncclAllReduce( - data[i]->SendBuff(), data[i]->RecvBuff(), ELEM_COUNT, ncclDouble, - ncclSum, comms[i], data[i]->dev_ctx.stream())); - VLOG(1) << "Invoked ncclAllReduce for device " << i; - } - dynload::ncclGroupEnd(); - - VLOG(1) << "Invoked ncclAllReduce"; - - VLOG(1) << "Sync devices"; - for (int i = 0; i < dev_count; ++i) { - VLOG(1) << "Sync device " << i; - SetDeviceId(i); - data[i]->dev_ctx.Wait(); - } - VLOG(1) << "device synced"; - - for (int i = 0; i < dev_count; ++i) { - SetDeviceId(i); - VLOG(1) << "Checking vector on device " << i; - thrust::host_vector tmp = data[i]->recv_buff; - for (size_t j = 0; j < tmp.size(); ++j) { - auto elem = static_cast(j); - elem *= dev_count; - ASSERT_NEAR(tmp[j], elem, 1e-4); - } - } - - for (int i = 0; i < dev_count; ++i) { - dynload::ncclCommDestroy(comms[i]); - } -} -} // namespace platform -} // namespace paddle - -int main(int argc, char** argv) { - dev_count = paddle::platform::GetCUDADeviceCount(); - if (dev_count <= 1) { - LOG(WARNING) - << "Cannot test multi-gpu nccl, because the CUDA device count is " - << dev_count; - return 0; - } - - std::vector places; - - places.emplace_back(paddle::platform::CPUPlace()); - int count = paddle::platform::GetCUDADeviceCount(); - for (int i = 0; i < count; ++i) { - places.emplace_back(paddle::platform::CUDAPlace(i)); - } - - VLOG(0) << " DeviceCount " << count; - paddle::platform::DeviceContextPool::Init(places); - - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 28ef3e04b1c50e0d42eeb27608259c6449429da5..b25206ff35cc87dcdd363bc0de54530f629d73ed 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -147,19 +147,48 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) name_ = name; PushEvent(name_, dev_ctx_); // Maybe need the same push/pop behavior. - SetCurAnnotation(name_.c_str()); + SetCurAnnotation(name_); } RecordEvent::~RecordEvent() { if (g_state == ProfilerState::kDisabled) return; DeviceTracer* tracer = GetDeviceTracer(); if (tracer) { - tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec()); + tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(), + BlockDepth(), CurThread()); } ClearCurAnnotation(); PopEvent(name_, dev_ctx_); } +RecordBlock::RecordBlock(int block_id) : start_ns_(PosixInNsec()) { + if (g_state == ProfilerState::kDisabled) return; + SetCurBlock(block_id); + name_ = string::Sprintf("block_%d", block_id); +} + +RecordBlock::~RecordBlock() { + if (g_state == ProfilerState::kDisabled) return; + DeviceTracer* tracer = GetDeviceTracer(); + if (tracer) { + // We try to put all blocks at the same nested depth in the + // same timeline lane. and distinguish the using thread_id. + tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(), + CurThread()); + } + ClearCurBlock(); +} + +RecordThread::RecordThread(int thread_id) { + if (g_state == ProfilerState::kDisabled) return; + SetCurThread(thread_id); +} + +RecordThread::~RecordThread() { + if (g_state == ProfilerState::kDisabled) return; + ClearCurThread(); +} + void EnableProfiler(ProfilerState state) { PADDLE_ENFORCE(state != ProfilerState::kDisabled, "Can't enbale profling, since the input state is ", diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 3542ce6cda87e3b013d60393e4ba93da61921940..de9a5cc20d76bf84778e0933831f218abb66c465 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -118,6 +118,20 @@ struct RecordEvent { std::string full_name_; }; +struct RecordBlock { + explicit RecordBlock(int block_id); + ~RecordBlock(); + + private: + std::string name_; + uint64_t start_ns_; +}; + +struct RecordThread { + explicit RecordThread(int thread_id); + ~RecordThread(); +}; + // Return the event list of all threads. Assumed the returned value calls // event_lists, event_lists[i][j] represents the j-th Event of i-th thread. std::vector> GetAllEvents(); diff --git a/paddle/fluid/platform/profiler.proto b/paddle/fluid/platform/profiler.proto index 71b5a9b12ef4a045ebfd3ee3d06ee25032083ff5..7b42aa785ec6ad5731e3adee1e9f189127a826a1 100644 --- a/paddle/fluid/platform/profiler.proto +++ b/paddle/fluid/platform/profiler.proto @@ -18,12 +18,17 @@ package paddle.platform.proto; message MemCopy { optional uint64 bytes = 1; } message Event { + enum EventType { + CPU = 0; + GPUKernel = 1; + } + optional EventType type = 8; optional string name = 1; optional uint64 start_ns = 2; optional uint64 end_ns = 3; // When positive, it represents gpu id. When -1, it represents CPU. optional int64 device_id = 5; - optional uint32 stream_id = 6; + optional int64 sub_device_id = 6; optional MemCopy memcopy = 7; } diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index d62f34030894e2fa21925bbc44e24b4e7d738d15..8942b5c9430ffa4e499b0ad1d2b5acf6d18ec0ab 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,6 +1,6 @@ if(WITH_PYTHON) cc_library(paddle_pybind SHARED - SRCS pybind.cc exception.cc protobuf.cc const_value.cc + SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method ${GLOB_OP_LIB}) if(NOT APPLE AND NOT ANDROID) diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index b0a2497d919b65afbe5eeaf4fe47c19baa1aba1c..45a64f43846e79c27295e52c59dca6bdfaa120a3 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -161,6 +161,8 @@ void BindBlockDesc(py::module &m) { py::return_value_policy::reference) .def("prepend_op", &BlockDesc::PrependOp, py::return_value_policy::reference) + .def("insert_op", &BlockDesc::InsertOp, + py::return_value_policy::reference) .def("remove_op", &BlockDesc::RemoveOp) .def("var", [](BlockDesc &self, py::bytes byte_name) { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index ac7d1efb577505b70e10a70cdcfd3ed9c5fe1f5c..6c05442466f5f3d8e04a8f0a2206443b1007a107 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -26,16 +26,20 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/prune.h" +#include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/operators/cond_op.h" #include "paddle/fluid/operators/net_op.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/pybind/const_value.h" #include "paddle/fluid/pybind/exception.h" #include "paddle/fluid/pybind/pybind.h" +#include "paddle/fluid/pybind/recordio.h" #include "paddle/fluid/pybind/tensor_py.h" + #include "paddle/fluid/string/to_string.h" #ifdef PADDLE_WITH_CUDA @@ -100,12 +104,14 @@ PYBIND11_PLUGIN(core) { .def("set", PyCPUTensorSetFromArray) .def("set", PyCPUTensorSetFromArray) .def("set", PyCPUTensorSetFromArray) + .def("set", PyCPUTensorSetFromArray) #ifdef PADDLE_WITH_CUDA .def("set", PyCUDATensorSetFromArray) .def("set", PyCUDATensorSetFromArray) .def("set", PyCUDATensorSetFromArray) .def("set", PyCUDATensorSetFromArray) .def("set", PyCUDATensorSetFromArray) + .def("set", PyCUDATensorSetFromArray) #endif .def("shape", [](Tensor &self) { return vectorize(self.dims()); }) .def("set_float_element", TensorSetElement) @@ -217,8 +223,18 @@ All parameter, weight, gradient are variables in Paddle. [](Variable &self) -> operators::NetOp * { return self.GetMutable(); }, + py::return_value_policy::reference) + .def("get_reader", + [](Variable &self) -> framework::ReaderHolder * { + PADDLE_ENFORCE(self.IsType()); + return self.GetMutable(); + }, py::return_value_policy::reference); + py::class_(m, "Reader", "") + .def("has_next", &framework::ReaderHolder::HasNext) + .def("reset", &framework::ReaderHolder::ReInit); + py::class_(m, "Scope", "") .def("var", [](Scope &self, const std::string &name) -> Variable * { @@ -302,7 +318,6 @@ All parameter, weight, gradient are variables in Paddle. #endif }); // clang-format on - #ifdef PADDLE_WITH_CUDA py::class_(m, "Communicator").def(py::init<>()); #endif @@ -410,6 +425,12 @@ All parameter, weight, gradient are variables in Paddle. m.def("init_devices", &framework::InitDevices); m.def("is_compiled_with_cuda", IsCompiledWithCUDA); +#ifdef PADDLE_WITH_CUDA + m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool { + // Only GPUs with Compute Capability >= 53 support float16 + return platform::GetCUDAComputeCapability(place.device) >= 53; + }); +#endif m.def("set_feed_variable", framework::SetFeedVariable); m.def("get_fetch_variable", framework::GetFetchVariable); @@ -474,6 +495,8 @@ All parameter, weight, gradient are variables in Paddle. m.def("enable_profiler", platform::EnableProfiler); m.def("disable_profiler", platform::DisableProfiler); m.def("reset_profiler", platform::ResetProfiler); + + BindRecordIOWriter(m); return m.ptr(); } } // namespace pybind diff --git a/paddle/fluid/pybind/recordio.cc b/paddle/fluid/pybind/recordio.cc new file mode 100644 index 0000000000000000000000000000000000000000..16f8bfb1a2e3a840670594d3cc2970e690dce891 --- /dev/null +++ b/paddle/fluid/pybind/recordio.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/pybind/recordio.h" +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/recordio/writer.h" + +namespace paddle { +namespace pybind { + +class RecordIOWriter { + public: + RecordIOWriter(const std::string& filename, recordio::Compressor compressor, + size_t max_num_record) + : stream_(filename), writer_(&stream_, compressor, max_num_record) {} + + void AppendTensor(const framework::LoDTensor& tensor) { + tensors_.push_back(tensor); + } + + void CompleteAppendTensor() { + auto& ctx = + *platform::DeviceContextPool::Instance().Get(platform::CPUPlace()); + framework::WriteToRecordIO(writer_, tensors_, ctx); + tensors_.clear(); + } + + void Close() { + PADDLE_ENFORCE(tensors_.empty()); + writer_.Flush(); + stream_.close(); + } + + private: + std::vector tensors_; + std::ofstream stream_; + recordio::Writer writer_; +}; + +void BindRecordIOWriter(py::module& m) { + py::class_ writer(m, "RecordIOWriter", ""); + py::enum_(writer, "Compressor", "") + .value("Snappy", recordio::Compressor::kSnappy) + .value("NoCompress", recordio::Compressor::kNoCompress); + + writer + .def("__init__", + [](RecordIOWriter& self, const std::string& filename, + recordio::Compressor compressor, size_t max_num_record) { + new (&self) RecordIOWriter(filename, compressor, max_num_record); + }) + .def("append_tensor", &RecordIOWriter::AppendTensor) + .def("complete_append_tensor", &RecordIOWriter::CompleteAppendTensor) + .def("close", &RecordIOWriter::Close); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/recordio.h b/paddle/fluid/pybind/recordio.h new file mode 100644 index 0000000000000000000000000000000000000000..60e6a9e8595614b38375fca8c13d520739af9aaf --- /dev/null +++ b/paddle/fluid/pybind/recordio.h @@ -0,0 +1,26 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +extern void BindRecordIOWriter(py::module& m); +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 1b0916ea0370d95a0c7dd149ee3f7b294c5e2351..6f8c597f8e610594851c318c122563523e4e7ea6 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/float16.h" #include "pybind11/numpy.h" #include "pybind11/pybind11.h" @@ -71,27 +72,39 @@ struct CastToPyBufferImpl { paddle::platform::GpuMemcpyAsync( dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(), cudaMemcpyDeviceToHost, dev_ctx->stream()); + dev_ctx->Wait(); #else PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); #endif } else if (paddle::platform::is_cpu_place(tensor.place())) { dst_tensor = tensor; } - return py::buffer_info(dst_tensor.data(), sizeof(CUR_TYPE), - py::format_descriptor::format(), - (size_t)framework::arity(dst_tensor.dims()), - dims_outside, strides); + + if (std::type_index(typeid(CUR_TYPE)) == + std::type_index(typeid(platform::float16))) { + return py::buffer_info(dst_tensor.data(), sizeof(CUR_TYPE), + "e", /* np.dtype('e') == np.float16 */ + (size_t)framework::arity(dst_tensor.dims()), + dims_outside, strides); + } else { + return py::buffer_info(dst_tensor.data(), sizeof(CUR_TYPE), + py::format_descriptor::format(), + (size_t)framework::arity(dst_tensor.dims()), + dims_outside, strides); + } } else { constexpr bool less = I + 1 < std::tuple_size>::value; return CastToPyBufferImpl()(tensor); } } }; + } // namespace details + inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) { auto buffer_info = - details::CastToPyBufferImpl()( - tensor); + details::CastToPyBufferImpl()(tensor); return buffer_info; } @@ -136,6 +149,22 @@ void PyCPUTensorSetFromArray( std::memcpy(dst, array.data(), sizeof(T) * array.size()); } +template <> +void PyCPUTensorSetFromArray( + framework::Tensor &self, + py::array_t array, + paddle::platform::CPUPlace &place) { + std::vector dims; + dims.reserve(array.ndim()); + for (size_t i = 0; i < array.ndim(); ++i) { + dims.push_back((int)array.shape()[i]); + } + + self.Resize(framework::make_ddim(dims)); + auto *dst = self.mutable_data(place); + std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size()); +} + #ifdef PADDLE_WITH_CUDA template void PyCUDATensorSetFromArray( @@ -157,6 +186,28 @@ void PyCUDATensorSetFromArray( paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(), cudaMemcpyHostToDevice, dev_ctx->stream()); } + +template <> +void PyCUDATensorSetFromArray( + framework::Tensor &self, + py::array_t array, + paddle::platform::CUDAPlace &place) { + std::vector dims; + dims.reserve(array.ndim()); + for (size_t i = 0; i < array.ndim(); ++i) { + dims.push_back((int)array.shape()[i]); + } + + self.Resize(framework::make_ddim(dims)); + auto *dst = self.mutable_data(place); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto dev_ctx = + static_cast(pool.Get(place)); + paddle::platform::GpuMemcpyAsync(dst, array.data(), + sizeof(uint16_t) * array.size(), + cudaMemcpyHostToDevice, dev_ctx->stream()); +} #endif } // namespace pybind diff --git a/paddle/fluid/recordio/CMakeLists.txt b/paddle/fluid/recordio/CMakeLists.txt index e1e7c2cdb3d0c960d5cd408420b5aaead73e70d7..92e97a6c85d7c8f01c8473feb9772f2285d49673 100644 --- a/paddle/fluid/recordio/CMakeLists.txt +++ b/paddle/fluid/recordio/CMakeLists.txt @@ -3,4 +3,7 @@ cc_library(header SRCS header.cc) cc_test(header_test SRCS header_test.cc DEPS header) cc_library(chunk SRCS chunk.cc DEPS snappystream snappy header zlib) cc_test(chunk_test SRCS chunk_test.cc DEPS chunk) -cc_library(recordio DEPS chunk header) +cc_library(writer SRCS writer.cc DEPS chunk) +cc_library(scanner SRCS scanner.cc DEPS chunk) +cc_test(writer_scanner_test SRCS writer_scanner_test.cc DEPS writer scanner) +cc_library(recordio DEPS chunk header writer scanner) diff --git a/paddle/fluid/recordio/chunk.cc b/paddle/fluid/recordio/chunk.cc index 587fd375c38ca83e1c65cb3ccc20b3509b6348c7..187a6a4ea7bd9d3a8ae48fa262e18f71b0f7d20d 100644 --- a/paddle/fluid/recordio/chunk.cc +++ b/paddle/fluid/recordio/chunk.cc @@ -24,33 +24,52 @@ namespace paddle { namespace recordio { constexpr size_t kMaxBufSize = 1024; +/** + * Read Stream by a fixed sized buffer. + * @param in input stream + * @param limit read at most `limit` bytes from input stream. 0 means no limit + * @param callback A function object with (const char* buf, size_t size) -> void + * as its type. + */ template -static void ReadStreamByBuf(std::istream& in, int limit, Callback callback) { +static void ReadStreamByBuf(std::istream& in, size_t limit, Callback callback) { char buf[kMaxBufSize]; std::streamsize actual_size; size_t counter = 0; - do { - auto actual_max = - limit > 0 ? std::min(limit - counter, kMaxBufSize) : kMaxBufSize; - actual_size = in.readsome(buf, actual_max); + size_t actual_max; + while (!in.eof() || + (limit != 0 && counter >= limit)) { // End of file or reach limit + actual_max = + limit != 0 ? std::min(limit - counter, kMaxBufSize) : kMaxBufSize; + in.read(buf, actual_max); + actual_size = in.gcount(); if (actual_size == 0) { break; } callback(buf, actual_size); - if (limit > 0) { + if (limit != 0) { counter += actual_size; } - } while (actual_size == kMaxBufSize); + } + in.clear(); // unset eof state } +/** + * Copy stream in to another stream + */ static void PipeStream(std::istream& in, std::ostream& os) { ReadStreamByBuf( - in, -1, [&os](const char* buf, size_t len) { os.write(buf, len); }); + in, 0, [&os](const char* buf, size_t len) { os.write(buf, len); }); } -static uint32_t Crc32Stream(std::istream& in, int limit = -1) { - auto crc = crc32(0, nullptr, 0); + +/** + * Calculate CRC32 from an input stream. + */ +static uint32_t Crc32Stream(std::istream& in, size_t limit = 0) { + uint32_t crc = static_cast(crc32(0, nullptr, 0)); ReadStreamByBuf(in, limit, [&crc](const char* buf, size_t len) { - crc = crc32(crc, reinterpret_cast(buf), len); + crc = static_cast(crc32( + crc, reinterpret_cast(buf), static_cast(len))); }); return crc; } @@ -85,28 +104,29 @@ bool Chunk::Write(std::ostream& os, Compressor ct) const { compressed_stream.reset(); } - auto end_pos = sout.tellg(); + sout.seekg(0, std::ios::end); + uint32_t len = static_cast(sout.tellg()); sout.seekg(0, std::ios::beg); - uint32_t len = static_cast(end_pos - sout.tellg()); uint32_t crc = Crc32Stream(sout); - sout.seekg(0, std::ios::beg); - Header hdr(static_cast(records_.size()), crc, ct, len); hdr.Write(os); + sout.seekg(0, std::ios::beg); + sout.clear(); PipeStream(sout, os); return true; } -void Chunk::Parse(std::istream& sin) { +bool Chunk::Parse(std::istream& sin) { Header hdr; - hdr.Parse(sin); + bool ok = hdr.Parse(sin); + if (!ok) { + return ok; + } auto beg_pos = sin.tellg(); - auto crc = Crc32Stream(sin, hdr.CompressSize()); + uint32_t crc = Crc32Stream(sin, hdr.CompressSize()); PADDLE_ENFORCE_EQ(hdr.Checksum(), crc); - Clear(); - - sin.seekg(beg_pos, std::ios::beg); + sin.seekg(beg_pos, sin.beg); std::unique_ptr compressed_stream; switch (hdr.CompressType()) { case Compressor::kNoCompress: @@ -126,8 +146,10 @@ void Chunk::Parse(std::istream& sin) { std::string buf; buf.resize(rec_len); stream.read(&buf[0], rec_len); + PADDLE_ENFORCE_EQ(rec_len, stream.gcount()); Add(buf); } + return true; } } // namespace recordio diff --git a/paddle/fluid/recordio/chunk.h b/paddle/fluid/recordio/chunk.h index 0ba9c63abbe72e7a51ddb1af5f0d206aa9f6cc5b..bf20ebd455c26ddeebeeea8db04cf7103b0c085f 100644 --- a/paddle/fluid/recordio/chunk.h +++ b/paddle/fluid/recordio/chunk.h @@ -26,9 +26,9 @@ namespace recordio { class Chunk { public: Chunk() : num_bytes_(0) {} - void Add(std::string buf) { - records_.push_back(buf); + void Add(const std::string& buf) { num_bytes_ += buf.size(); + records_.emplace_back(buf); } // dump the chunk into w, and clears the chunk and makes it ready for // the next add invocation. @@ -37,10 +37,15 @@ public: records_.clear(); num_bytes_ = 0; } - void Parse(std::istream& sin); - size_t NumBytes() { return num_bytes_; } + + // returns true if ok, false if eof + bool Parse(std::istream& sin); + size_t NumBytes() const { return num_bytes_; } + size_t NumRecords() const { return records_.size(); } const std::string& Record(int i) const { return records_[i]; } + bool Empty() const { return records_.empty(); } + private: std::vector records_; // sum of record lengths in bytes. diff --git a/paddle/fluid/recordio/chunk_test.cc b/paddle/fluid/recordio/chunk_test.cc index a67ba32ed6ab8bda230d1414975c96a0be6d682b..1f0e36a14d373ca96167199d4582bc8f17290ae8 100644 --- a/paddle/fluid/recordio/chunk_test.cc +++ b/paddle/fluid/recordio/chunk_test.cc @@ -26,7 +26,7 @@ TEST(Chunk, SaveLoad) { ch.Add(std::string("123", 4)); std::stringstream ss; ch.Write(ss, Compressor::kNoCompress); - ch.Clear(); + ss.seekg(0); ch.Parse(ss); ASSERT_EQ(ch.NumBytes(), 10U); } diff --git a/paddle/fluid/recordio/header.cc b/paddle/fluid/recordio/header.cc index 3641caaa8981020519cbc31e5362348c02d3bbce..e50de15b7c2b480357f5f6c7daa2b4a676749679 100644 --- a/paddle/fluid/recordio/header.cc +++ b/paddle/fluid/recordio/header.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/recordio/header.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace recordio { @@ -26,23 +27,33 @@ Header::Header() Header::Header(uint32_t num, uint32_t sum, Compressor c, uint32_t cs) : num_records_(num), checksum_(sum), compressor_(c), compress_size_(cs) {} -void Header::Parse(std::istream& is) { +bool Header::Parse(std::istream& is) { + uint32_t magic; + size_t read_size = + is.readsome(reinterpret_cast(&magic), sizeof(uint32_t)); + if (read_size < sizeof(uint32_t)) { + return false; + } + PADDLE_ENFORCE_EQ(magic, kMagicNumber); + is.read(reinterpret_cast(&num_records_), sizeof(uint32_t)) .read(reinterpret_cast(&checksum_), sizeof(uint32_t)) .read(reinterpret_cast(&compressor_), sizeof(uint32_t)) .read(reinterpret_cast(&compress_size_), sizeof(uint32_t)); + return true; } void Header::Write(std::ostream& os) const { - os.write(reinterpret_cast(&num_records_), sizeof(uint32_t)) + os.write(reinterpret_cast(&kMagicNumber), sizeof(uint32_t)) + .write(reinterpret_cast(&num_records_), sizeof(uint32_t)) .write(reinterpret_cast(&checksum_), sizeof(uint32_t)) .write(reinterpret_cast(&compressor_), sizeof(uint32_t)) .write(reinterpret_cast(&compress_size_), sizeof(uint32_t)); } std::ostream& operator<<(std::ostream& os, Header h) { - os << h.NumRecords() << h.Checksum() - << static_cast(h.CompressType()) << h.CompressSize(); + os << "Header: " << h.NumRecords() << ", " << h.Checksum() << ", " + << static_cast(h.CompressType()) << ", " << h.CompressSize(); return os; } diff --git a/paddle/fluid/recordio/header.h b/paddle/fluid/recordio/header.h index cbd52642a668d1eaeeafb672e50af1a476975080..9200ac090de4514bef3704ac502039222eef2284 100644 --- a/paddle/fluid/recordio/header.h +++ b/paddle/fluid/recordio/header.h @@ -19,8 +19,6 @@ namespace paddle { namespace recordio { -// Default ChunkSize -constexpr size_t kDefaultMaxChunkSize = 32 * 1024 * 1024; // MagicNumber for memory checking constexpr uint32_t kMagicNumber = 0x01020304; @@ -44,7 +42,9 @@ public: Header(uint32_t num, uint32_t sum, Compressor ct, uint32_t cs); void Write(std::ostream& os) const; - void Parse(std::istream& is); + + // returns true if OK, false if eof + bool Parse(std::istream& is); uint32_t NumRecords() const { return num_records_; } uint32_t Checksum() const { return checksum_; } diff --git a/paddle/fluid/recordio/scanner.cc b/paddle/fluid/recordio/scanner.cc new file mode 100644 index 0000000000000000000000000000000000000000..d842f8fe5a4c9d1a2b564c738d97fffb02f3ccb5 --- /dev/null +++ b/paddle/fluid/recordio/scanner.cc @@ -0,0 +1,51 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/recordio/scanner.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace recordio { +Scanner::Scanner(std::unique_ptr &&stream) + : stream_(std::move(stream)) { + Reset(); +} + +Scanner::Scanner(const std::string &filename) { + stream_.reset(new std::ifstream(filename)); + Reset(); +} + +void Scanner::Reset() { + stream_->seekg(0, std::ios::beg); + ParseNextChunk(); +} + +std::string Scanner::Next() { + PADDLE_ENFORCE(!eof_, "StopIteration"); + auto rec = cur_chunk_.Record(offset_++); + if (offset_ == cur_chunk_.NumRecords()) { + ParseNextChunk(); + } + return rec; +} + +void Scanner::ParseNextChunk() { + eof_ = !cur_chunk_.Parse(*stream_); + offset_ = 0; +} + +bool Scanner::HasNext() const { return !eof_; } +} // namespace recordio +} // namespace paddle diff --git a/paddle/fluid/recordio/scanner.h b/paddle/fluid/recordio/scanner.h new file mode 100644 index 0000000000000000000000000000000000000000..f3f17b69f195ddd92f5a39ead9755a7b8e2dd329 --- /dev/null +++ b/paddle/fluid/recordio/scanner.h @@ -0,0 +1,44 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/recordio/chunk.h" +namespace paddle { +namespace recordio { + +class Scanner { +public: + explicit Scanner(std::unique_ptr&& stream); + + explicit Scanner(const std::string& filename); + + void Reset(); + + std::string Next(); + + bool HasNext() const; + +private: + std::unique_ptr stream_; + Chunk cur_chunk_; + size_t offset_; + bool eof_; + + void ParseNextChunk(); +}; +} // namespace recordio +} // namespace paddle diff --git a/paddle/fluid/recordio/writer.cc b/paddle/fluid/recordio/writer.cc new file mode 100644 index 0000000000000000000000000000000000000000..196d66edff8cc6000afcd74fb945c05dcab7106a --- /dev/null +++ b/paddle/fluid/recordio/writer.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/fluid/recordio/writer.h" +#include "paddle/fluid/platform/enforce.h" +namespace paddle { +namespace recordio { +void Writer::Write(const std::string& record) { + cur_chunk_.Add(record); + if (cur_chunk_.NumRecords() >= max_num_records_in_chunk_) { + Flush(); + } +} + +void Writer::Flush() { + cur_chunk_.Write(stream_, compressor_); + cur_chunk_.Clear(); +} + +Writer::~Writer() { + PADDLE_ENFORCE(cur_chunk_.Empty(), "Writer must be flushed when destroy."); +} + +} // namespace recordio +} // namespace paddle diff --git a/paddle/fluid/recordio/writer.h b/paddle/fluid/recordio/writer.h new file mode 100644 index 0000000000000000000000000000000000000000..0c478d507547b10b8ebaaf5e512557a5c8c13e65 --- /dev/null +++ b/paddle/fluid/recordio/writer.h @@ -0,0 +1,43 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/recordio/chunk.h" +namespace paddle { +namespace recordio { + +class Writer { +public: + Writer(std::ostream* sout, + Compressor compressor, + size_t max_num_records_in_chunk = 1000) + : stream_(*sout), + max_num_records_in_chunk_(max_num_records_in_chunk), + compressor_(compressor) {} + + void Write(const std::string& record); + + void Flush(); + + ~Writer(); + +private: + std::ostream& stream_; + size_t max_num_records_in_chunk_; + Chunk cur_chunk_; + Compressor compressor_; +}; + +} // namespace recordio +} // namespace paddle diff --git a/paddle/fluid/recordio/writer_scanner_test.cc b/paddle/fluid/recordio/writer_scanner_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..7e764f0d9439709ad101af2b8864dc0158bd359b --- /dev/null +++ b/paddle/fluid/recordio/writer_scanner_test.cc @@ -0,0 +1,69 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" + +#include +#include "paddle/fluid/recordio/scanner.h" +#include "paddle/fluid/recordio/writer.h" + +TEST(WriterScanner, Normal) { + std::stringstream* stream = new std::stringstream(); + + { + paddle::recordio::Writer writer(stream, + paddle::recordio::Compressor::kSnappy); + writer.Write("ABC"); + writer.Write("BCD"); + writer.Write("CDE"); + writer.Flush(); + } + + { + stream->seekg(0, std::ios::beg); + std::unique_ptr stream_ptr(stream); + paddle::recordio::Scanner scanner(std::move(stream_ptr)); + ASSERT_TRUE(scanner.HasNext()); + ASSERT_EQ(scanner.Next(), "ABC"); + ASSERT_EQ("BCD", scanner.Next()); + ASSERT_TRUE(scanner.HasNext()); + ASSERT_EQ("CDE", scanner.Next()); + ASSERT_FALSE(scanner.HasNext()); + } +} + +TEST(WriterScanner, TinyChunk) { + std::stringstream* stream = new std::stringstream(); + { + paddle::recordio::Writer writer( + stream, paddle::recordio::Compressor::kNoCompress, 2 /*max chunk num*/); + writer.Write("ABC"); + writer.Write("BCD"); + writer.Write("CDE"); + writer.Write("DEFG"); + writer.Flush(); + } + + { + stream->seekg(0, std::ios::beg); + std::unique_ptr stream_ptr(stream); + paddle::recordio::Scanner scanner(std::move(stream_ptr)); + ASSERT_TRUE(scanner.HasNext()); + ASSERT_EQ(scanner.Next(), "ABC"); + ASSERT_EQ(scanner.Next(), "BCD"); + ASSERT_EQ(scanner.Next(), "CDE"); + ASSERT_EQ(scanner.Next(), "DEFG"); + ASSERT_FALSE(scanner.HasNext()); + } +} \ No newline at end of file diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 6e24cbdd3f6a4f05c1691dc643d880f6f454429d..90c2dfbba78418fb7b731f5363017d70577b1ae5 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -1,27 +1,29 @@ - -file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py) -file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py) file(GLOB UTILS_PY_FILES . ./paddle/utils/*.py) -file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py) file(GLOB_RECURSE FLUID_PY_FILES ./paddle/fluid/ *.py) - set(PY_FILES paddle/__init__.py - ${TRAINER_PY_FILES} - ${HELPERS_PY_FILES} ${UTILS_PY_FILES} - ${V2_PY_FILES} ${FLUID_PY_FILES}) -add_custom_target(copy_paddle_master) +if(NOT WITH_FLUID) + file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py) + file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py) + file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py) + set(PY_FILES ${PY_FILES} + ${TRAINER_PY_FILES} + ${HELPERS_PY_FILES} + ${V2_PY_FILES}) -SET(COPY_PADDLE_MASTER "") -if(WITH_GOLANG) - SET(COPY_PADDLE_MASTER "copy_paddle_master") - add_custom_command(TARGET ${COPY_PADDLE_MASTER} - COMMAND cp ${paddle_master_LIB_PATH} ${PADDLE_SOURCE_DIR}/python/paddle/v2/master/ - ) - add_dependencies(copy_paddle_master paddle_master) -endif(WITH_GOLANG) + add_custom_target(copy_paddle_master) + + SET(COPY_PADDLE_MASTER "") + if(WITH_GOLANG) + SET(COPY_PADDLE_MASTER "copy_paddle_master") + add_custom_command(TARGET ${COPY_PADDLE_MASTER} + COMMAND cp ${paddle_master_LIB_PATH} ${PADDLE_SOURCE_DIR}/python/paddle/v2/master/ + ) + add_dependencies(copy_paddle_master paddle_master) + endif(WITH_GOLANG) +endif() set(MKL_SHARED_LIBS "") set(MKL_DEPENDS "") @@ -59,23 +61,28 @@ add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) -set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp paddle_pserver_main paddle_trainer paddle_merge_model ${MKL_DEPENDS}) -if(WITH_SWIG_PY) - list(APPEND paddle_python_deps python_api_wheel) +set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS}) +if(NOT WITH_FLUID) + set(paddle_python_deps ${paddle_python_deps} paddle_pserver_main paddle_trainer paddle_merge_model) + if(WITH_SWIG_PY) + list(APPEND paddle_python_deps python_api_wheel) + endif() endif() add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps}) set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/) if (WITH_TESTING) - add_subdirectory(paddle/trainer_config_helpers/tests) - if (WITH_SWIG_PY) - # enable v2 API unittest only when paddle swig api is compiled - add_subdirectory(paddle/v2/tests) - add_subdirectory(paddle/v2/reader/tests) - add_subdirectory(paddle/v2/plot/tests) - add_subdirectory(paddle/fluid/tests) + if(NOT WITH_FLUID) + add_subdirectory(paddle/trainer_config_helpers/tests) + if (WITH_SWIG_PY) + # enable v2 API unittest only when paddle swig api is compiled + add_subdirectory(paddle/v2/tests) + add_subdirectory(paddle/v2/reader/tests) + add_subdirectory(paddle/v2/plot/tests) + endif() endif() + add_subdirectory(paddle/fluid/tests) endif() install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR} DESTINATION opt/paddle/share/wheels diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 0df3fd0343dbdaee88192a402d990fdfc2235811..fcea28220485039c9daf3c5fa2688c31f9f34c42 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -35,11 +35,12 @@ from core import LoDTensor, CPUPlace, CUDAPlace from distribute_transpiler import DistributeTranspiler from distribute_transpiler_simple import SimpleDistributeTranspiler from concurrency import (Go, make_channel, channel_send, channel_recv, - channel_close) + channel_close, Select) import clip -from memory_optimization_transpiler import memory_optimize +from memory_optimization_transpiler import memory_optimize, release_memory import profiler import unique_name +import recordio_writer Tensor = LoDTensor @@ -63,8 +64,10 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + [ 'SimpleDistributeTranspiler', 'DistributeTranspiler', 'memory_optimize', + 'release_memory', 'profiler', 'unique_name', + 'recordio_writer', ] diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 09d137c9017bb4185308af94287fe0e8aa005505..7af6ed1463ab737e871da487f2a687301652ef2d 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -248,12 +248,15 @@ def _callback_lookup_(op): if o_argu in self.param_grad_names: allreduce_out_name = o_argu + "__nccl_all_reduce__" op_desc = _create_op_desc_( - "ncclAllReduce", { + "ncclReduce", + { "X": [o_argu], "Communicator": ['nccl_com__do_not_change_'] - }, {"Out": [allreduce_out_name]}, - {"reduction": "ncclSum"}) + }, + {"Out": [allreduce_out_name]}, + {"reduction": "ncclSum", + "root": 0}, ) block.desc.append_op().copy_from(op_desc) op_desc = _create_op_desc_( @@ -457,7 +460,8 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, "Out": [_append_grad_suffix_(loss.name)] }, {"shape": [1], "value": 1.0, - "dtype": loss.dtype}) + "dtype": loss.dtype, + "force_cpu": False}) root_block.desc.append_op().copy_from(op_desc) block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0])) diff --git a/python/paddle/fluid/concurrency.py b/python/paddle/fluid/concurrency.py index dec224fc886cd0739add0ebb6488625ef5063b8d..535e881c42f675198a2679cb7974af64b65cc194 100644 --- a/python/paddle/fluid/concurrency.py +++ b/python/paddle/fluid/concurrency.py @@ -12,17 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -from layers.control_flow import BlockGuard +from layers.control_flow import BlockGuard, Select from layer_helper import LayerHelper, unique_name from layers import fill_constant import core __all__ = [ - 'Go', - 'make_channel', - 'channel_send', - 'channel_recv', - 'channel_close', + 'Go', 'make_channel', 'channel_send', 'channel_recv', 'channel_close', + 'Select' ] @@ -198,7 +195,7 @@ def channel_recv(channel, return_value): ch = fluid.make_channel(dtype='int32', capacity=10) with fluid.Go(): - returned_value = fluid.channel_recv(ch, 'int32') + returned_value, return_status = fluid.channel_recv(ch, 'int32') # Code to send data through the channel. """ diff --git a/python/paddle/fluid/debuger.py b/python/paddle/fluid/debuger.py index b7a906654a7ed60d610fc2e9801bf6108e555dcb..97fa182c4007cc730c06e9f95259a2509e01ecdf 100644 --- a/python/paddle/fluid/debuger.py +++ b/python/paddle/fluid/debuger.py @@ -16,6 +16,7 @@ import sys import re from graphviz import GraphPreviewGenerator import proto.framework_pb2 as framework_pb2 +import paddle.fluid.core as core _vartype2str_ = [ "UNK", @@ -52,9 +53,11 @@ reprtpl = "{ttype} {name} ({reprs})" def repr_lodtensor(proto): - if not proto.lod_tensor: return - level = proto.lod_tensor.lod_level - reprs = repr_tensor(proto.lod_tensor.tensor) + if proto.type.type != framework_pb2.VarType.LOD_TENSOR: + return + + level = proto.type.lod_tensor.lod_level + reprs = repr_tensor(proto.type.lod_tensor.tensor) return reprtpl.format( ttype="LoDTensor" if level > 0 else "Tensor", name=proto.name, @@ -62,20 +65,24 @@ def repr_lodtensor(proto): def repr_selected_rows(proto): - if not proto.selected_rows: return + if proto.type.type != framework_pb2.VarType.SELECTED_ROWS: + return + return reprtpl.format( ttype="SelectedRows", name=proto.name, - reprs=repr_tensor(proto.selected_rows)) + reprs=repr_tensor(proto.type.selected_rows)) def repr_tensor_array(proto): - if not proto.tensor_array: return + if proto.type.type != framework_pb2.VarType.LOD_TENSOR_ARRAY: + return + return reprtpl.format( ttype="TensorArray", name=proto.name, - reprs="level=%d, %s" % (proto.tensor_array.lod_level, - repr_tensor(proto.lod_tensor))) + reprs="level=%d, %s" % (proto.type.tensor_array.lod_level, + repr_tensor(proto.type.lod_tensor.tensor))) type_handlers = [ @@ -119,6 +126,7 @@ def pprint_block_codes(block_desc, show_backward=False): def is_var_backward(var_desc): return "@GRAD" in var_desc.name + #print(type(block_desc)) if type(block_desc) is not framework_pb2.BlockDesc: block_desc = framework_pb2.BlockDesc.FromString( block_desc.serialize_to_string()) diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py index bb2ce4d45d5da6b2fbd097a94479f3696271e5ec..3d3a6c116eeb39fb7236d0e9707415cdd6b828bd 100644 --- a/python/paddle/fluid/distribute_transpiler.py +++ b/python/paddle/fluid/distribute_transpiler.py @@ -250,6 +250,8 @@ class DistributeTranspiler: def get_trainer_program(self): # remove optimize ops and add a send op to main_program self.program.global_block().delete_ops(self.optimize_ops) + # FIXME(typhoonzero): serialize once will fix error occurs when clone. + self.program.__str__() return self.program def get_pserver_program(self, endpoint): @@ -309,7 +311,8 @@ class DistributeTranspiler: for _, opt_op in enumerate(opt_op_on_pserver): if ufind.is_connected(op, opt_op): if self._is_opt_op(op): - self._append_pserver_ops(optimize_block, op, endpoint) + self._append_pserver_ops(optimize_block, op, endpoint, + default_main_program()) else: self._append_pserver_non_opt_ops(optimize_block, op) break @@ -520,7 +523,8 @@ class DistributeTranspiler: orig_var_name = varname[:suff_idx] return orig_var_name - def _append_pserver_ops(self, optimize_block, opt_op, endpoint): + def _append_pserver_ops(self, optimize_block, opt_op, endpoint, + origin_program): program = optimize_block.program pserver_block = program.global_block() new_inputs = dict() @@ -576,7 +580,17 @@ class DistributeTranspiler: elif key == "LearningRate": # leraning rate variable has already be created by non-optimize op, # don't create it once again. - new_inputs[key] = pserver_block.vars[opt_op.input(key)[0]] + lr_varname = opt_op.input(key)[0] + if pserver_block.vars.has_key(lr_varname): + new_inputs[key] = pserver_block.vars[opt_op.input(key)[0]] + else: + origin_var = origin_program.global_block().vars[lr_varname] + tmpvar = pserver_block.create_var( + name=origin_var.name, + persistable=origin_var.persistable, + dtype=origin_var.dtype, + shape=origin_var.shape) + new_inputs[key] = tmpvar for key in opt_op.input_names: new_shape = None diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index d14d6349b1bcf598e25bbeb9913d2d0da71a5054..70ecffd910a46570b5a8e576d88039fa5e22e726 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -487,7 +487,7 @@ class Operator(object): 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', 'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', 'ncclInit', 'channel_create', 'channel_close', - 'channel_send', 'channel_recv' + 'channel_send', 'channel_recv', 'select' } if type not in no_kernel_op_set: self.desc.infer_var_type(self.block.desc) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 35aa80a2ae9a6289665b581275fb86c3931fd7a8..1c0f1f6eb415b1c05c1052c1f52743a19c49f017 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -47,7 +47,7 @@ def is_parameter(var): def is_persistable(var): if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ - var.desc.type() == core.VarDesc.VarType.FETCH_LIST: + var.desc.type() == core.VarDesc.VarType.FETCH_LIST: return False return var.persistable diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 1bb1aa30ee1019c6f80eb64b6dc20459e7a3073b..02cd0a05a11d8d1d52d42c2b62799f1093d0abc2 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -16,7 +16,7 @@ import contextlib from layer_function_generator import autodoc from tensor import assign, fill_constant from .. import core -from ..framework import Program, Variable, Operator +from ..framework import Program, Variable, Operator, Block from ..layer_helper import LayerHelper, unique_name from ops import logical_and, logical_not, logical_or @@ -29,6 +29,7 @@ __all__ = [ 'WhileGuard', 'While', 'Switch', + 'Select', 'lod_rank_table', 'max_sequence_len', 'topk', @@ -1211,6 +1212,186 @@ class Switch(object): return True +class SelectCase(object): + DEFAULT = 0 + SEND = 1 + RECEIVE = 2 + + def __init__(self, + case_idx, + case_to_execute, + channel_action_fn=None, + channel=None, + value=None): + self.helper = LayerHelper('conditional_block') + self.main_program = self.helper.main_program + self.is_scalar_condition = True + + self.case_to_execute = case_to_execute + self.idx = case_idx + + # Since we aren't going to use the `channel_send` or `channel_recv` + # functions directly, we just need to capture the name. + self.action = (self.SEND + if channel_action_fn.__name__ == ('channel_send') else + self.RECEIVE) if channel_action_fn else (self.DEFAULT) + self.value = value + self.channel = channel + + def __enter__(self): + self.block = self.main_program.create_block() + + def construct_op(self): + main_program = self.helper.main_program + cases_block = main_program.current_block() + + inner_outputs = set() + input_set = set() + params = set() + + for op in self.block.ops: + # Iterate over all operators, get all the inputs + # and add as input to the SelectCase operator. + for iname in op.input_names: + for in_var_name in op.input(iname): + if in_var_name not in inner_outputs: + input_set.add(in_var_name) + + for oname in op.output_names: + for out_var_name in op.output(oname): + inner_outputs.add(out_var_name) + + param_list = [ + cases_block.var(each_name) for each_name in params + if each_name not in input_set + ] + + # Iterate over all operators, get all the outputs + # add to the output list of SelectCase operator only if + # they exist in the parent block. + out_vars = [] + for inner_out_name in inner_outputs: + if inner_out_name in cases_block.vars: + out_vars.append(cases_block.var(inner_out_name)) + + # First, create an op that will determine whether or not this is the + # conditional variable to execute. + should_execute_block = equal( + fill_constant( + shape=[1], dtype=core.VarDesc.VarType.INT32, value=self.idx), + self.case_to_execute) + + step_scope = cases_block.create_var( + type=core.VarDesc.VarType.STEP_SCOPES) + + cases_block.append_op( + type='conditional_block', + inputs={'X': [should_execute_block], + 'Params': param_list}, + outputs={'Out': out_vars, + 'Scope': [step_scope]}, + attrs={ + 'sub_block': self.block, + 'is_scalar_condition': self.is_scalar_condition + }) + + return '%s,%s,%s,%s' % (self.idx, self.action, self.channel.name + if self.channel else '', self.value.name + if self.value else '') + + def __exit__(self, exc_type, exc_val, exc_tb): + self.main_program.rollback() + if exc_type is not None: + return False # re-raise exception + return True + + +class Select(BlockGuard): + def __init__(self, name=None): + self.helper = LayerHelper('select', name=name) + self.cases = [] + + super(Select, self).__init__(self.helper.main_program) + self.case_to_execute = fill_constant( + shape=[1], dtype=core.VarDesc.VarType.INT32, value=-1) + + def __enter__(self): + super(Select, self).__enter__() + return self + + def case(self, channel_action_fn, channel, value): + """Create a new block for this condition. + """ + select_case = SelectCase( + len(self.cases), self.case_to_execute, channel_action_fn, channel, + value) + + self.cases.append(select_case) + + return select_case + + def default(self): + """Create a default case block for this condition. + """ + default_case = SelectCase(len(self.cases), self.case_to_execute) + + self.cases.append(default_case) + + return default_case + + def __exit__(self, exc_type, exc_val, exc_tb): + if exc_type is not None: + return False + + # Create a select op and another block to wrap its + # case blocks. + select_block = self.helper.main_program.current_block() + parent_block = self.helper.main_program.block(select_block.parent_idx) + + # Construct each case op, inside the newly created select block. + serialized_cases = [] + for case in self.cases: + serialized_cases.append(case.construct_op()) + + intermediate = set() + params = set() + + for case_block in select_block.ops: + if case_block.attrs and 'sub_block' in case_block.attrs: + for each_op in case_block.attrs['sub_block'].ops: + assert isinstance(each_op, Operator) + for iname in each_op.input_names: + for in_var_name in each_op.input(iname): + if in_var_name not in intermediate: + params.add(in_var_name) + + for oname in each_op.output_names: + for out_var_name in each_op.output(oname): + intermediate.add(out_var_name) + + # TODO(varunarora): Figure out if defining output is needed. + out_list = [ + parent_block.var(var_name) for var_name in parent_block.vars + if var_name in intermediate + ] + + X = [select_block.var_recursive(x_name) for x_name in params] + + # Needs to be used by `equal` inside the cases block. + X.append(self.case_to_execute) + + # Construct the select op. + parent_block.append_op( + type='select', + inputs={'X': X, + 'case_to_execute': self.case_to_execute}, + attrs={'sub_block': select_block, + 'cases': serialized_cases}, + outputs={}) + + return super(Select, self).__exit__(exc_type, exc_val, exc_tb) + + class IfElseBlockGuard(object): def __init__(self, is_true, ifelse): if not isinstance(ifelse, IfElse): diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index d326c5651fa39a81bfc9148cfc005c559b10cf1f..3ced35d6ced8ad48514f775e5586e0dc96ad69d1 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -129,8 +129,13 @@ def detection_output(loc, target_box=loc, code_type='decode_center_size') - nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype) + old_shape = scores.shape + scores = ops.reshape(x=scores, shape=(-1, old_shape[-1])) + scores = nn.softmax(input=scores) + scores = ops.reshape(x=scores, shape=old_shape) scores = nn.transpose(scores, perm=[0, 2, 1]) + + nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype) helper.append_op( type="multiclass_nms", inputs={'Scores': scores, @@ -561,16 +566,16 @@ def multi_box_head(inputs, base_size, num_classes, aspect_ratios, - min_ratio, - max_ratio, + min_ratio=None, + max_ratio=None, min_sizes=None, max_sizes=None, steps=None, step_w=None, step_h=None, offset=0.5, - variance=[0.1, 0.1, 0.1, 0.1], - flip=False, + variance=[0.1, 0.1, 0.2, 0.2], + flip=True, clip=False, kernel_size=1, pad=0, @@ -613,7 +618,7 @@ def multi_box_head(inputs, the inputs[i] will be automatically calculated. Default: None. offset(float): Prior boxes center offset. Default: 0.5 variance(list|tuple): the variances to be encoded in prior boxes. - Default:[0.1, 0.1, 0.1, 0.1]. + Default:[0.1, 0.1, 0.2, 0.2]. flip(bool): Whether to flip aspect ratios. Default:False. clip(bool): Whether to clip out-of-boundary boxes. Default: False. kernel_size(int): The kernel size of conv2d. Default: 1. @@ -667,6 +672,19 @@ def multi_box_head(inputs, helper = LayerHelper("prior_box", **locals()) dtype = helper.input_dtype() + attrs = { + 'min_sizes': min_sizes, + 'aspect_ratios': aspect_ratios, + 'variances': variance, + 'flip': flip, + 'clip': clip, + 'step_w': step_w, + 'step_h': step_h, + 'offset': offset + } + if len(max_sizes) > 0 and max_sizes[0] > 0: + attrs['max_sizes'] = max_sizes + box = helper.create_tmp_variable(dtype) var = helper.create_tmp_variable(dtype) helper.append_op( @@ -675,17 +693,7 @@ def multi_box_head(inputs, "Image": image}, outputs={"Boxes": box, "Variances": var}, - attrs={ - 'min_sizes': min_sizes, - 'max_sizes': max_sizes, - 'aspect_ratios': aspect_ratios, - 'variances': variance, - 'flip': flip, - 'clip': clip, - 'step_w': step_w, - 'step_h': step_h, - 'offset': offset - }) + attrs=attrs, ) return box, var def _reshape_with_axis_(input, axis=1): @@ -713,7 +721,7 @@ def multi_box_head(inputs, if num_layer <= 2: assert min_sizes is not None and max_sizes is not None assert len(min_sizes) == num_layer and len(max_sizes) == num_layer - else: + elif min_sizes is None and max_sizes is None: min_sizes = [] max_sizes = [] step = int(math.floor(((max_ratio - min_ratio)) / (num_layer - 2))) @@ -758,9 +766,6 @@ def multi_box_head(inputs, min_size = [min_size] if not _is_list_or_tuple_(max_size): max_size = [max_size] - if not (len(max_size) == len(min_size)): - raise ValueError( - 'the length of max_size and min_size should be equal.') aspect_ratio = [] if aspect_ratios is not None: @@ -778,7 +783,7 @@ def multi_box_head(inputs, num_boxes = box.shape[2] - # get box_loc + # get loc num_loc_output = num_boxes * 4 mbox_loc = nn.conv2d( input=input, @@ -795,7 +800,7 @@ def multi_box_head(inputs, mbox_loc_flatten = nn.reshape(mbox_loc, shape=new_shape) mbox_locs.append(mbox_loc_flatten) - # get conf_loc + # get conf num_conf_output = num_boxes * num_classes conf_loc = nn.conv2d( input=input, diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index af3ae54248a744e7e2fed8190aeeb0eb481cb315..9c91f395e7c9d7ca76c1a5cc310bc3bbc06daec9 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -13,11 +13,16 @@ # limitations under the License. from .. import core -from ..layer_helper import LayerHelper +from ..framework import convert_np_dtype_to_dtype_, default_main_program, default_startup_program +from ..unique_name import generate as unique_name from control_flow import BlockGuard from ..layer_helper import LayerHelper +from ..executor import global_scope -__all__ = ['data', 'BlockGuardServ', 'ListenAndServ', 'Send'] +__all__ = [ + 'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file', + 'read_file', 'create_shuffle_reader', 'create_double_buffer_reader' +] def data(name, @@ -224,3 +229,101 @@ def Recv(endpoints, get_vars): outputs={"Out": get_vars}, attrs={"endpoints": endpoints, "epmap": epmap}) + + +def monkey_patch_reader_methods(reader): + def __get_reader__(): + scope = global_scope() + var = scope.find_var(reader.name) + return var.get_reader() + + def eof(): + return not __get_reader__().has_next() + + def reset(): + return __get_reader__().reset() + + reader.eof = eof + reader.reset = reset + reader.stop_gradient = True + reader.persistable = True + return reader + + +def _copy_reader_var_(block, var): + new_var = block.create_var(name=var.name, type=core.VarDesc.VarType.READER) + new_var.desc.set_shapes(var.desc.shapes()) + new_var.desc.set_dtypes(var.desc.dtypes()) + new_var.persistable = True + return monkey_patch_reader_methods(new_var) + + +def open_recordio_file(filename, shapes, lod_levels, dtypes): + dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] + shape_concat = [] + ranks = [] + + for shape in shapes: + shape_concat.extend(shape) + ranks.append(len(shape)) + + var_name = unique_name('open_recordio_file') + + startup_blk = default_startup_program().current_block() + startup_var = startup_blk.create_var(name=var_name) + startup_blk.append_op( + type='create_recordio_file_reader', + outputs={'Out': [startup_var]}, + attrs={ + 'shape_concat': shape_concat, + 'lod_levels': lod_levels, + 'filename': filename, + 'ranks': ranks + }) + + startup_var.desc.set_dtypes(dtypes) + startup_var.persistable = True + return _copy_reader_var_(default_main_program().current_block(), + startup_var) + + +def __create_decorated_reader__(op_type, reader, attrs): + var_name = unique_name(op_type) + startup_blk = default_startup_program().current_block() + startup_var = startup_blk.create_var(name=var_name) + startup_blk.append_op( + type=op_type, + inputs={'UnderlyingReader': reader}, + outputs={'Out': [startup_var]}, + attrs=attrs) + startup_var.persistable = True + return _copy_reader_var_(default_main_program().current_block(), + startup_var) + + +def create_shuffle_reader(reader, buffer_size): + return __create_decorated_reader__('create_shuffle_reader', reader, + {'buffer_size': int(buffer_size)}) + + +def create_double_buffer_reader(reader, place=None): + attrs = dict() + if place is not None: + attrs['place'] = str(place).upper() + return __create_decorated_reader__('create_double_buffer_reader', reader, + attrs) + + +def read_file(file_obj): + helper = LayerHelper('read_file') + out = [ + helper.create_tmp_variable( + stop_gradient=True, dtype='float32') + for _ in range(len(file_obj.desc.shapes())) + ] + helper.append_op( + type='read', inputs={'Reader': [file_obj]}, outputs={'Out': out}) + if len(out) == 1: + return out[0] + else: + return out diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index bd79022a0c39cf18bd05d49ac62986d342a4ae06..35b01a79914b3427836d4abd51aa2e2eb471d517 100644 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -16,10 +16,7 @@ import cStringIO import functools import warnings -from .. import proto - -framework_pb2 = proto.framework_pb2 - +from ..proto import framework_pb2 from ..framework import OpProtoHolder, Variable from ..layer_helper import LayerHelper diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 67a6fd808480d2912b10869bef616b69cdd431fd..48d244f3f600e493b416f1a44dea451469784990 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -39,6 +39,8 @@ __all__ = [ 'sequence_conv', 'conv2d', 'sequence_pool', + 'sequence_softmax', + 'softmax', 'pool2d', 'batch_norm', 'beam_search_decode', @@ -49,6 +51,7 @@ __all__ = [ 'reduce_mean', 'reduce_max', 'reduce_min', + 'reduce_prod', 'sequence_first_step', 'sequence_last_step', 'dropout', @@ -85,13 +88,12 @@ def fc(input, **Fully Connected Layer** The fully connected layer can take multiple tensors as its inputs. It - creates a variable (one for each input tensor) called weights for each - input tensor, which represents a fully connected weight matrix from - each input unit to each output unit. The fully connected layer - multiplies each input tensor with its coresponding weight to produce - an output Tensor. If multiple input tensors are given, the results of - multiple multiplications will be sumed up. If bias_attr is not None, - a biases variable will be created and added to the output. Finally, + creates a variable called weights for each input tensor, which represents + a fully connected weight matrix from each input unit to each output unit. + The fully connected layer multiplies each input tensor with its coresponding + weight to produce an output Tensor. If multiple input tensors are given, + the results of multiple multiplications will be sumed up. If bias_attr is + not None, a bias variable will be created and added to the output. Finally, if activation is not None, it will be applied to the output as well. This process can be formulated as follows: @@ -110,44 +112,27 @@ def fc(input, * :math:`Out`: The output tensor. Args: - input(Variable|list): The input tensor(s) to the fully connected layer. - size(int): The number of output units in the fully connected layer. - num_flatten_dims(int): The fc layer can accept an input tensor with more - than two dimensions. If this happens, the - multidimensional tensor will first be flattened - into a 2-dimensional matrix. The parameter - `num_flatten_dims` determines how the input tensor - is flattened: the first `num_flatten_dims` - (inclusive, index starts from 1) dimensions will - be flatten to form the first dimension of the - final matrix (height of the matrix), and the rest - `rank(X) - num_flatten_dims` dimensions are - flattened to form the second dimension of the - final matrix (width of the matrix). For example, - suppose `X` is a 6-dimensional tensor with a shape - [2, 3, 4, 5, 6], and `num_flatten_dims` = 3. Then, - the flattened matrix will have a shape - [2 x 3 x 4, 5 x 6] = [24, 30]. By default, - `num_flatten_dims` is set to 1. - param_attr(ParamAttr|list): The parameter attribute for learnable - parameters/weights of the fully connected - layer. - param_initializer(ParamAttr|list): The initializer used for the - weight/parameter. If set None, - XavierInitializer() will be used. - bias_attr(ParamAttr|list): The parameter attribute for the bias parameter - for this layer. If set None, no bias will be - added to the output units. - bias_initializer(ParamAttr|list): The initializer used for the bias. - If set None, then ConstantInitializer() - will be used. - act(str): Activation to be applied to the output of the fully connected - layer. - name(str): Name/alias of the fully connected layer. - + input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of + the input tensor(s) is at least 2. + size(int): The number of output units in this layer. + num_flatten_dims (int, default 1): The fc layer can accept an input tensor with more than + two dimensions. If this happens, the multidimensional tensor will first be flattened + into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input + tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1) + dimensions will be flatten to form the first dimension of the final matrix (height of + the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to + form the second dimension of the final matrix (width of the matrix). For example, suppose + `X` is a 6-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3. + Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. + param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable + parameters/weights of this layer. + bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias + of this layer. If it is set to None, no bias will be added to the output units. + act (str, default None): Activation to be applied to the output of this layer. + name (str, default None): The name of this layer. Returns: - Variable: The output tensor variable. + A tensor variable storing the transformation result. Raises: ValueError: If rank of the input tensor is less than 2. @@ -1103,6 +1088,30 @@ def sequence_conv(input, return helper.append_activation(pre_act) +def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True): + helper = LayerHelper('sequence_softmax', **locals()) + dtype = helper.input_dtype() + softmax_out = helper.create_tmp_variable(dtype) + helper.append_op( + type="sequence_softmax", + inputs={"X": input}, + outputs={"Out": softmax_out}, + attrs={"use_cudnn": use_cudnn}) + return softmax_out + + +def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True): + helper = LayerHelper('softmax', **locals()) + dtype = helper.input_dtype() + softmax_out = helper.create_tmp_variable(dtype) + helper.append_op( + type="softmax", + inputs={"X": input}, + outputs={"Out": softmax_out}, + attrs={"use_cudnn": use_cudnn}) + return softmax_out + + def conv2d(input, num_filters, filter_size, @@ -2203,6 +2212,53 @@ def reduce_min(input, dim=None, keep_dim=False, name=None): return out +def reduce_prod(input, dim=None, keep_dim=False, name=None): + """ + Computes the product of tensor elements over the given dimension. + + Args: + input (Variable): The input variable which is a Tensor or LoDTensor. + dim (int|None): The dimension along which the product is performed. If + :attr:`None`, multipy all elements of :attr:`input` and return a + Tensor variable with a single element, otherwise must be in the + range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`, + the dimension to reduce is :math:`rank + dim`. + keep_dim (bool|False): Whether to reserve the reduced dimension in the + output Tensor. The result tensor will have one fewer dimension + than the :attr:`input` unless :attr:`keep_dim` is true. + name(str|None): A name for this layer(optional). If set None, the + layer will be named automatically. + + Returns: + Variable: The reduced Tensor variable. + + Examples: + .. code-block:: python + + # x is a Tensor variable with following elements: + # [[0.2, 0.3, 0.5, 0.9] + # [0.1, 0.2, 0.6, 0.7]] + # Each example is followed by the correspending output tensor. + fluid.layers.reduce_prod(x) # [0.0002268] + fluid.layers.reduce_prod(x, dim=0) # [0.02, 0.06, 0.3, 0.63] + fluid.layers.reduce_prod(x, dim=-1) # [0.027, 0.0084] + fluid.layers.reduce_prod(x, dim=1, + keep_dim=True) # [[0.027], [0.0084]] + """ + helper = LayerHelper('reduce_prod', **locals()) + out = helper.create_tmp_variable(dtype=helper.input_dtype()) + helper.append_op( + type='reduce_prod', + inputs={'X': input}, + outputs={'Out': out}, + attrs={ + 'dim': dim if dim != None else 0, + 'keep_dim': keep_dim, + 'reduce_all': True if dim == None else False + }) + return out + + def split(input, num_or_sections, dim=-1, name=None): """ Split the input tensor into multiple sub-tensors. diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 20dd1b47525f0ec32dc53c574b04820cb7d5941c..43a78a27c6ca945f39b61cc7491ff18792c61e71 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -58,8 +58,6 @@ __all__ = [ 'elementwise_pow', 'clip', 'clip_by_norm', - 'softmax', - 'sequence_softmax', 'logical_and', 'logical_or', 'logical_xor', @@ -69,6 +67,7 @@ __all__ = [ 'gaussian_random', 'gaussian_random_batch_size_like', 'cumsum', + 'scatter', ] + __activations__ for _OP in set(__all__): diff --git a/python/paddle/fluid/memory_optimization_transpiler.py b/python/paddle/fluid/memory_optimization_transpiler.py index 4fa2d03ef563b98b2eec576bf87d4b2e54ca0a36..41d1eca82e8b680977f44f1756c25c37340668a4 100644 --- a/python/paddle/fluid/memory_optimization_transpiler.py +++ b/python/paddle/fluid/memory_optimization_transpiler.py @@ -29,7 +29,10 @@ dtype_to_size = { core.VarDesc.VarType.BOOL: 1 } -sub_block_ops = ["while", "while_grad", "parallel_do", "parallel_do_grad"] +sub_block_ops = [ + "while", "while_grad", "parallel_do", "parallel_do_grad", + "conditional_block", "conditional_block_grad" +] PRINT_LOG = False @@ -122,36 +125,80 @@ class ControlFlowGraph(object): else: return block_desc.find_var_recursive(str(var_name)) - def memory_optimize(self): - def check_var_validity(block_desc, x, is_forward): - if str(x) == "@EMPTY@": - return False - if not self._has_var(block_desc, x, is_forward): - return False - if self._find_var(block_desc, x, is_forward).persistable(): - return False - if self._find_var( - block_desc, x, - is_forward).type() != core.VarDesc.VarType.LOD_TENSOR: - return False - if x in self._skip_opt: - return False - if not self._find_var(block_desc, x, is_forward).shape(): - return False - return True + def _check_var_validity(self, block_desc, x, is_forward): + if str(x) == "@EMPTY@": + return False + if not self._has_var(block_desc, x, is_forward): + return False + if self._find_var(block_desc, x, is_forward).persistable(): + return False + if self._find_var(block_desc, x, + is_forward).type() != core.VarDesc.VarType.LOD_TENSOR: + return False + if x in self._skip_opt: + return False + if not self._find_var(block_desc, x, is_forward).shape(): + return False + return True - self._build_graph() + def _update_skip_opt_set(self): + for i in range(self.op_size): + op = self._ops[i] + if op.type() == "fill_constant" and op.attr("force_cpu") == True: + self._skip_opt.update(op.output_arg_names()) + + def release_memory(self): self._dataflow_analyze() + self._update_skip_opt_set() + fwd_id = 0 + bwd_id = 0 + for i in range(self.op_size): + op = self._ops[i] + if op.type() in sub_block_ops: + continue + block_desc = op.block() + is_forward = i < self._forward_num + in_diff, out_diff = self._get_diff(self._live_in[i], + self._live_out[i]) + can_optimize = filter( + lambda x: self._check_var_validity(block_desc, x, is_forward), + in_diff) + if can_optimize: + index = i + fwd_id + 1 if is_forward else i - self._forward_num + bwd_id + 1 + delete_op = block_desc.insert_op(index) + delete_op.set_type("delete_var") + delete_op.set_input("X", can_optimize) + if is_forward: + fwd_id += 1 + else: + bwd_id += 1 + + def memory_optimize(self, level=0): + def compare_shape(x_shape, cache_shape, opt_level): + if opt_level == 0: + return x_shape == cache_shape + if opt_level == 1: + if (x_shape[0] == -1) ^ (cache_shape[0] == -1): + return False + x_size = abs(reduce(lambda x, y: x * y, x_shape)) + cache_size = abs(reduce(lambda x, y: x * y, cache_shape)) + if x_size <= cache_size: + return True + return False + + self._dataflow_analyze() + self._update_skip_opt_set() self.pool = [] for i in range(self.op_size): op = self._ops[i] if op.type() in sub_block_ops: continue block_desc = op.block() + self.current_block_desc = block_desc is_forward = i < self._forward_num if self.pool: defs_can_optimize = filter( - lambda x: check_var_validity(block_desc, x, is_forward), + lambda x: self._check_var_validity(block_desc, x, is_forward), self._defs[i]) out_pair = [ (x, self._find_var(block_desc, x, is_forward).shape()) @@ -164,7 +211,7 @@ class ControlFlowGraph(object): for index, cache_pair in enumerate(self.pool): cache_var = cache_pair[0] cache_shape = cache_pair[1] - if x_shape == cache_shape: + if compare_shape(x_shape, cache_shape, level): if self._has_var(block_desc, cache_var, is_forward): x_dtype = self._find_var(block_desc, x, is_forward).dtype() @@ -196,7 +243,7 @@ class ControlFlowGraph(object): in_diff, out_diff = self._get_diff(self._live_in[i], self._live_out[i]) can_optimize = filter( - lambda x: check_var_validity(block_desc, x, is_forward), + lambda x: self._check_var_validity(block_desc, x, is_forward), in_diff) if can_optimize: for var_name in can_optimize: @@ -270,7 +317,8 @@ def _get_cfgs(input_program): ([block_desc.op(i) for i in range(op_size)], op_size, set())) sub_block_pair = [("while", "while_grad"), ("parallel_do", - "parallel_do_grad")] + "parallel_do_grad"), + ("conditional_block", "conditional_block_grad")] ops_list.extend(_process_sub_block_pair(pdesc, sub_block_pair)) @@ -281,9 +329,15 @@ def _get_cfgs(input_program): return cfgs -def memory_optimize(input_program, print_log=False): +def memory_optimize(input_program, print_log=False, level=0): global PRINT_LOG PRINT_LOG = print_log cfgs = _get_cfgs(input_program) for cfg in cfgs: - cfg.memory_optimize() + cfg.memory_optimize(level) + + +def release_memory(input_program): + cfgs = _get_cfgs(input_program) + for cfg in cfgs: + cfg.release_memory() diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 1c12d53e4f352b7e4dea980301973a69665e49f9..8b8621469d856e63dfd9685d7cd3d4c1d2ada1ce 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -92,7 +92,10 @@ class Optimizer(object): # create learning rate variable for every parameter param = param_and_grad[0] param_lr = param.optimize_attr['learning_rate'] - return self.global_learning_rate() * param_lr + if param_lr == 1.0: + return self.global_learning_rate() + else: + return self.global_learning_rate() * param_lr def _create_accumulators(self, block, parameters): """Create all accumulators needed by the parameters @@ -220,6 +223,8 @@ class Optimizer(object): params_grads = append_backward(loss, parameter_list, no_grad_set, [error_clip_callback]) + params_grads = sorted(params_grads, key=lambda x: x[0].name) + params_grads = append_gradient_clip_ops(params_grads) # Add regularization if any diff --git a/python/paddle/fluid/recordio_writer.py b/python/paddle/fluid/recordio_writer.py new file mode 100644 index 0000000000000000000000000000000000000000..5accaacd5361165d30b92c71ae4fd62e23e44e07 --- /dev/null +++ b/python/paddle/fluid/recordio_writer.py @@ -0,0 +1,48 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import core +import contextlib + +__all__ = ['convert_reader_to_recordio_file'] + + +@contextlib.contextmanager +def create_recordio_writer(filename, + compressor=core.RecordIOWriter.Compressor.Snappy, + max_num_records=1000): + writer = core.RecordIOWriter(filename, compressor, max_num_records) + yield writer + writer.close() + + +def convert_reader_to_recordio_file( + filename, + reader_creator, + feeder, + compressor=core.RecordIOWriter.Compressor.Snappy, + max_num_records=1000, + feed_order=None): + if feed_order is None: + feed_order = feeder.feed_names + counter = 0 + with create_recordio_writer(filename, compressor, + max_num_records) as writer: + for batch in reader_creator(): + res = feeder.feed(batch) + for each in feed_order: + writer.append_tensor(res[each]) + writer.complete_append_tensor() + counter += 1 + return counter diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py index a29f9a208ebefc75b531030c9f0de9487f2b136c..604c6f9ab36c2332223d1ba943d67113922615b3 100644 --- a/python/paddle/fluid/regularizer.py +++ b/python/paddle/fluid/regularizer.py @@ -13,6 +13,7 @@ # limitations under the License. import framework +from . import core __all__ = [ 'append_regularization_ops', @@ -43,16 +44,20 @@ def append_regularization_ops(parameters_and_grads, regularization=None): """ params_and_grads = [] for param, grad in parameters_and_grads: + # If no gradient then we don't need to do anything + if grad is None: + params_and_grads.append((param, grad)) + continue + regularization_term = None if param.regularizer is not None: # Add variable for regularization term in grad block - regularization_term = param.regularizer(param, grad.block) + regularization_term = param.regularizer(param, grad, grad.block) elif regularization is not None: - regularization_term = regularization(param, grad.block) + regularization_term = regularization(param, grad, grad.block) - # If no gradient or no regularization specified, - # then we don't need to do anything - if grad is None or regularization_term is None: + # If no regularization specified, then we don't need to do anything + if regularization_term is None: params_and_grads.append((param, grad)) continue @@ -82,7 +87,7 @@ class WeightDecayRegularizer(object): def __init__(self): pass - def __call__(self, param, block): + def __call__(self, param, grad, block): """Add corresponding weight decay operations to the network """ raise NotImplementedError() @@ -102,7 +107,7 @@ class L2DecayRegularizer(WeightDecayRegularizer): super(L2DecayRegularizer, self).__init__() self._regularization_coeff = regularization_coeff - def __call__(self, param, block): + def __call__(self, param, grad, block): """Add L2 weight decay ops to network Adds L2 weight decay ops. @@ -117,8 +122,23 @@ class L2DecayRegularizer(WeightDecayRegularizer): """ assert isinstance(param, framework.Parameter) assert isinstance(block, framework.Block) + decay = block.create_var( dtype="float32", shape=param.shape, lod_level=param.lod_level) + + if grad.type == core.VarDesc.VarType.SELECTED_ROWS: + decay = block.create_var( + dtype="float32", + shape=param.shape, + type=core.VarDesc.VarType.SELECTED_ROWS) + block.append_op( + type='lookup_table', + inputs={'W': param, + 'Ids': grad}, + outputs={'Out': decay}, + attrs={'is_sparse': True}) + param = decay + # Append Op to calculate decay block.append_op( type='scale', @@ -141,7 +161,7 @@ class L1DecayRegularizer(WeightDecayRegularizer): super(L1DecayRegularizer, self).__init__() self._regularization_coeff = regularization_coeff - def __call__(self, param, block): + def __call__(self, param, grad, block): """Add L1 weight decay ops to network Adds L1 weight decay ops. @@ -158,6 +178,19 @@ class L1DecayRegularizer(WeightDecayRegularizer): assert isinstance(block, framework.Block) decay = block.create_var( dtype="float32", shape=param.shape, lod_level=param.lod_level) + + if grad.type == core.VarDesc.VarType.SELECTED_ROWS: + decay = block.create_var( + dtype="float32", + shape=param.shape, + type=core.VarDesc.VarType.SELECTED_ROWS) + block.append_op( + type='lookup_table', + inputs={'W': param, + 'Ids': grad}, + outputs={'Out': decay}, + attrs={'is_sparse': True}) + # Append sign op block.append_op( type='sign', inputs={"X": param}, outputs={"Out": decay}) diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py index caa9596a100de4f9364467690db1e80ee227c3c1..fa38bd3762423497b82c3b421b3a1db4cd87525b 100644 --- a/python/paddle/fluid/tests/book/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/test_machine_translation.py @@ -181,7 +181,10 @@ def train_main(use_cuda, is_sparse, is_local=True): cost = pd.cross_entropy(input=rnn_out, label=label) avg_cost = pd.mean(cost) - optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) + optimizer = fluid.optimizer.Adagrad( + learning_rate=1e-4, + regularization=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=0.1)) optimize_ops, params_grads = optimizer.minimize(avg_cost) train_data = paddle.batch( diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py index c9d2a5ecaab0669f308b5b9c5cf74d0212fa462a..ad79e96b958b36a06c8a3cc990dbe3608e32c9ac 100644 --- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py +++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py @@ -50,6 +50,7 @@ sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01) sgd_optimizer.minimize(avg_cost) fluid.memory_optimize(fluid.default_main_program(), print_log=True) +# fluid.release_memory(fluid.default_main_program()) BATCH_SIZE = 200 @@ -69,8 +70,6 @@ exe.run(fluid.default_startup_program()) PASS_NUM = 100 for pass_id in range(PASS_NUM): - fluid.io.save_persistables(exe, "./fit_a_line.model/") - fluid.io.load_persistables(exe, "./fit_a_line.model/") for data in train_reader(): avg_loss_value, = exe.run(fluid.default_main_program(), feed=feeder.feed(data), diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py index 80ff11f8d78b0a22fc6aefd722c9e6a2c23fbd5c..204669d7e6176e9e8250e8aebc2d10441fa24b67 100644 --- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py +++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py @@ -125,9 +125,10 @@ opts = optimizer.minimize(avg_cost) batch_size = fluid.layers.create_tensor(dtype='int64') batch_acc = fluid.layers.accuracy(input=predict, label=label, total=batch_size) -fluid.memory_optimize(fluid.default_main_program()) +# fluid.memory_optimize(fluid.default_main_program(), level=0) +fluid.release_memory(fluid.default_main_program()) -BATCH_SIZE = 128 +BATCH_SIZE = 16 PASS_NUM = 1 # fix the order of training data @@ -159,8 +160,7 @@ for pass_id in range(PASS_NUM): print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str( pass_acc)) # this model is slow, so if we can train two mini batch, we think it works properly. - - if i > 2: + if i > 0: exit(0) if math.isnan(float(loss)): sys.exit("got NaN loss, training failed.") diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py index 689a75afc7ccdf84142f5531a438e1f9af7af4ca..a24834a6f0b19d1265f6c8d7089d31583af82d1f 100644 --- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py +++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py @@ -105,7 +105,8 @@ def main(): optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) optimizer.minimize(avg_cost) - fluid.memory_optimize(fluid.default_main_program()) + # fluid.memory_optimize(fluid.default_main_program()) + fluid.release_memory(fluid.default_main_program()) # fix the order of training data train_data = paddle.batch( diff --git a/python/paddle/fluid/tests/test_concurrency.py b/python/paddle/fluid/tests/test_concurrency.py index 9f7bf63c5e017251e87af94690ff32c47c538c6b..924895a9afac610059bac5f617c49712441339cc 100644 --- a/python/paddle/fluid/tests/test_concurrency.py +++ b/python/paddle/fluid/tests/test_concurrency.py @@ -15,9 +15,9 @@ import unittest import paddle.fluid as fluid import paddle.fluid.core as core -from paddle.fluid import framework, unique_name +from paddle.fluid import framework, unique_name, layer_helper from paddle.fluid.executor import Executor -from paddle.fluid.layers import fill_constant +from paddle.fluid.layers import fill_constant, assign, While, elementwise_add, Print class TestRoutineOp(unittest.TestCase): @@ -86,8 +86,7 @@ class TestRoutineOp(unittest.TestCase): self.assertEqual(leftmost_data[0][0], n + 1) def _create_one_dim_tensor(self, value): - one_dim_tensor = fill_constant( - shape=[1], dtype=core.VarDesc.VarType.INT64, value=value) + one_dim_tensor = fill_constant(shape=[1], dtype='int', value=value) one_dim_tensor.stop_gradient = True return one_dim_tensor @@ -95,6 +94,180 @@ class TestRoutineOp(unittest.TestCase): return framework.default_main_program().current_block().create_var( name=unique_name.generate(name), type=type, dtype=dtype) + def _create_persistable_tensor(self, name, type, dtype): + return framework.default_main_program().current_block().create_var( + name=unique_name.generate(name), + type=type, + dtype=dtype, + persistable=True) + + def test_select(self): + with framework.program_guard(framework.Program()): + ch1 = fluid.make_channel( + dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1) + + result1 = self._create_tensor('return_value', + core.VarDesc.VarType.LOD_TENSOR, + core.VarDesc.VarType.FP64) + + input_value = fill_constant( + shape=[1], dtype=core.VarDesc.VarType.FP64, value=10) + + with fluid.Select() as select: + with select.case(fluid.channel_send, ch1, input_value): + # Execute something. + pass + + with select.default(): + pass + + # This should not block because we are using a buffered channel. + result1, status = fluid.channel_recv(ch1, result1) + fluid.channel_close(ch1) + + cpu = core.CPUPlace() + exe = Executor(cpu) + + result = exe.run(fetch_list=[result1]) + self.assertEqual(result[0][0], 10) + + def test_fibonacci(self): + """ + Mimics Fibonacci Go example: https://tour.golang.org/concurrency/5 + """ + with framework.program_guard(framework.Program()): + quit_ch_input_var = self._create_persistable_tensor( + 'quit_ch_input', core.VarDesc.VarType.LOD_TENSOR, + core.VarDesc.VarType.INT32) + quit_ch_input = fill_constant( + shape=[1], + dtype=core.VarDesc.VarType.INT32, + value=0, + out=quit_ch_input_var) + + result = self._create_persistable_tensor( + 'result', core.VarDesc.VarType.LOD_TENSOR, + core.VarDesc.VarType.INT32) + fill_constant( + shape=[1], + dtype=core.VarDesc.VarType.INT32, + value=0, + out=result) + + x = fill_constant( + shape=[1], dtype=core.VarDesc.VarType.INT32, value=0) + y = fill_constant( + shape=[1], dtype=core.VarDesc.VarType.INT32, value=1) + + while_cond = fill_constant( + shape=[1], dtype=core.VarDesc.VarType.BOOL, value=True) + + while_false = fill_constant( + shape=[1], dtype=core.VarDesc.VarType.BOOL, value=False) + + x_tmp = fill_constant( + shape=[1], dtype=core.VarDesc.VarType.INT32, value=0) + + def fibonacci(channel, quit_channel): + while_op = While(cond=while_cond) + with while_op.block(): + result2 = fill_constant( + shape=[1], dtype=core.VarDesc.VarType.INT32, value=0) + x_to_send_tmp = fill_constant( + shape=[1], dtype=core.VarDesc.VarType.INT32, value=0) + + # TODO(abhinav): Need to perform copy when doing a channel send. + # Once this is complete, we can remove these lines + assign(input=x, output=x_to_send_tmp) + + with fluid.Select() as select: + with select.case(fluid.channel_send, channel, + x_to_send_tmp): + assign(input=x, output=x_tmp) + assign(input=y, output=x) + assign(elementwise_add(x=x_tmp, y=y), output=y) + + with select.case(fluid.channel_recv, quit_channel, + result2): + # Quit + helper = layer_helper.LayerHelper('assign') + helper.append_op( + type='assign', + inputs={'X': [while_false]}, + outputs={'Out': [while_cond]}) + + ch1 = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR) + quit_ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR) + + with fluid.Go(): + for i in xrange(10): + fluid.channel_recv(ch1, result) + Print(result) + + fluid.channel_send(quit_ch, quit_ch_input) + + fibonacci(ch1, quit_ch) + + fluid.channel_close(ch1) + fluid.channel_close(quit_ch) + + cpu = core.CPUPlace() + exe = Executor(cpu) + + exe_result = exe.run(fetch_list=[result]) + self.assertEqual(exe_result[0][0], 34) + + def test_ping_pong(self): + """ + Mimics Ping Pong example: https://gobyexample.com/channel-directions + """ + with framework.program_guard(framework.Program()): + result = self._create_tensor('return_value', + core.VarDesc.VarType.LOD_TENSOR, + core.VarDesc.VarType.FP64) + + ping_result = self._create_tensor('ping_return_value', + core.VarDesc.VarType.LOD_TENSOR, + core.VarDesc.VarType.FP64) + + pong_result = self._create_tensor('pong_return_value', + core.VarDesc.VarType.LOD_TENSOR, + core.VarDesc.VarType.FP64) + + def ping(ch, message): + message_to_send_tmp = fill_constant( + shape=[1], dtype=core.VarDesc.VarType.FP64, value=0) + + assign(input=message, output=message_to_send_tmp) + fluid.channel_send(ch, message_to_send_tmp) + + def pong(ch1, ch2): + fluid.channel_recv(ch1, ping_result) + assign(input=ping_result, output=pong_result) + fluid.channel_send(ch2, pong_result) + + pings = fluid.make_channel( + dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1) + pongs = fluid.make_channel( + dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1) + + msg = fill_constant( + shape=[1], dtype=core.VarDesc.VarType.FP64, value=9) + + ping(pings, msg) + pong(pings, pongs) + + fluid.channel_recv(pongs, result) + + fluid.channel_close(pings) + fluid.channel_close(pongs) + + cpu = core.CPUPlace() + exe = Executor(cpu) + + exe_result = exe.run(fetch_list=[result]) + self.assertEqual(exe_result[0][0], 9) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/.gitignore b/python/paddle/fluid/tests/unittests/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..6b3fc2a83c649c28d21c9a8a0b35c2f2fa04f269 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/.gitignore @@ -0,0 +1 @@ +mnist.recordio diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index f96c2ca4f0593b6c2624d449304f23425c69ab93..0ad273c7161977e18f91f952fd3a9dc144bf73f0 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -11,7 +11,6 @@ list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com list(REMOVE_ITEM TEST_OPS test_nce) # IXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778 list(REMOVE_ITEM TEST_OPS test_recurrent_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152 list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957 -list(REMOVE_ITEM TEST_OPS test_detection_output_op) # FIXME: detection_output_op will be rewritten. This unittest should be list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not a test diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 26835336add6024d0ac43373d217dd0bc4ef706b..299ab8e51f017e1980a8b40e3830fc42b1ff7ccc 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -469,6 +469,28 @@ class OpTest(unittest.TestCase): tensor.set_lod(lod) return tensor + @staticmethod + def np_dtype_to_fluid_dtype(input): + """Change the dtype of float16 numpy array + + numpy float16 is binded to paddle::platform::float16 + in tensor_py.h via the help of uint16 data type since + the internal memory representation of float16 is + uint16_t in paddle and np.uint16 in numpy, which are + themselves binded together by pybind. + + Args: + input: input numpy array + + Returns: + input: The dtype of input will be changed to np.uint16 if + it is originally np.float16, such that the internal memory + of input will be reinterpreted as of dtype np.uint16. + """ + if input.dtype == np.float16: + input.dtype = np.uint16 + return input + def _get_gradient(self, input_to_check, place, output_names, no_grad_set): prog = Program() block = prog.global_block() diff --git a/python/paddle/fluid/tests/unittests/test_cast_op.py b/python/paddle/fluid/tests/unittests/test_cast_op.py index 8fb8d03828393ccfe57c0848d79b960c641ad39a..b8d3ed3aa3eb0e47e79f46cdf681a3b9cca46036 100644 --- a/python/paddle/fluid/tests/unittests/test_cast_op.py +++ b/python/paddle/fluid/tests/unittests/test_cast_op.py @@ -18,7 +18,7 @@ import numpy as np import paddle.fluid.core as core -class TestCastOp(op_test.OpTest): +class TestCastOp1(op_test.OpTest): def setUp(self): ipt = np.random.random(size=[10, 10]) self.inputs = {'X': ipt.astype('float32')} @@ -36,5 +36,36 @@ class TestCastOp(op_test.OpTest): self.check_grad(['X'], ['Out']) +class TestCastOp2(op_test.OpTest): + def setUp(self): + ipt = np.random.random(size=[10, 10]) + # numpy float16 is binded to fluid float16 via uint16 + self.inputs = {'X': ipt.astype('float16').view(np.uint16)} + self.outputs = {'Out': ipt.astype('float32')} + self.attrs = { + 'in_dtype': int(core.VarDesc.VarType.FP16), + 'out_dtype': int(core.VarDesc.VarType.FP32) + } + self.op_type = 'cast' + + def test_check_output(self): + self.check_output(atol=1e-3) + + +class TestCastOp3(op_test.OpTest): + def setUp(self): + ipt = np.random.random(size=[10, 10]) + self.inputs = {'X': ipt.astype('float32')} + self.outputs = {'Out': ipt.astype('float16')} + self.attrs = { + 'in_dtype': int(core.VarDesc.VarType.FP32), + 'out_dtype': int(core.VarDesc.VarType.FP16) + } + self.op_type = 'cast' + + def test_check_output(self): + self.check_output(atol=1e-3) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index a49fecf09509f7b1d9f758eebcf90bf9fbf7669f..4b6e3fb69a12095c77f343515fe3b6d1f3fccb14 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -63,9 +63,11 @@ def conv2d_forward_naive(input, filter, group, conv_param): class TestConv2dOp(OpTest): def setUp(self): + self.op_type = "conv2d" self.use_cudnn = False self.use_mkldnn = False - self.init_op_type() + self.dtype = np.float32 + self.init_kernel_type() self.init_group() self.init_dilation() self.init_test_case() @@ -75,12 +77,16 @@ class TestConv2dOp(OpTest): 'pad': self.pad, 'dilation': self.dilations } - input = np.random.random(self.input_size).astype("float32") - filter = np.random.random(self.filter_size).astype("float32") + + input = np.random.random(self.input_size).astype(self.dtype) + filter = np.random.random(self.filter_size).astype(self.dtype) output = conv2d_forward_naive(input, filter, self.groups, - conv2d_param).astype('float32') + conv2d_param).astype(self.dtype) - self.inputs = {'Input': input, 'Filter': filter} + self.inputs = { + 'Input': OpTest.np_dtype_to_fluid_dtype(input), + 'Filter': OpTest.np_dtype_to_fluid_dtype(filter) + } self.attrs = { 'strides': self.stride, 'paddings': self.pad, @@ -99,6 +105,8 @@ class TestConv2dOp(OpTest): self.check_output() def test_check_grad(self): + if self.dtype == np.float16: + return if self.use_cudnn: place = core.CUDAPlace(0) self.check_grad_with_place( @@ -111,6 +119,8 @@ class TestConv2dOp(OpTest): set(['Input', 'Filter']), 'Output', max_relative_error=0.02) def test_check_grad_no_filter(self): + if self.dtype == np.float16: + return if self.use_cudnn: place = core.CUDAPlace(0) self.check_grad_with_place( @@ -126,6 +136,8 @@ class TestConv2dOp(OpTest): no_grad_set=set(['Filter'])) def test_check_grad_no_input(self): + if self.dtype == np.float16: + return if self.use_cudnn: place = core.CUDAPlace(0) self.check_grad_with_place( @@ -154,8 +166,8 @@ class TestConv2dOp(OpTest): def init_group(self): self.groups = 1 - def init_op_type(self): - self.op_type = "conv2d" + def init_kernel_type(self): + pass class TestWithPad(TestConv2dOp): @@ -227,39 +239,105 @@ class TestWithInput1x1Filter1x1(TestConv2dOp): #----------------Conv2dCUDNN---------------- class TestCUDNN(TestConv2dOp): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "conv2d" + + +class TestFP16CUDNN(TestConv2dOp): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-2) class TestCUDNNWithPad(TestWithPad): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "conv2d" + + +class TestFP16CUDNNWithPad(TestWithPad): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-2) class TestCUDNNWithStride(TestWithStride): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "conv2d" + + +class TestFP16CUDNNWithStride(TestWithStride): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-2) class TestCUDNNWithGroup(TestWithGroup): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "conv2d" + + +class TestFP16CUDNNWithGroup(TestWithGroup): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-2) class TestCUDNNWith1x1(TestWith1x1): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "conv2d" + + +class TestFP16CUDNNWith1x1(TestWith1x1): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-2) class TestCUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "conv2d" + + +class TestFP16CUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-2) class TestDepthwiseConv(TestConv2dOp): @@ -295,21 +373,18 @@ class TestDepthwiseConv2(TestConv2dOp): #----------------Conv2dMKLDNN---------------- class TestMKLDNN(TestConv2dOp): - def init_op_type(self): + def init_kernel_type(self): self.use_mkldnn = True - self.op_type = "conv2d" class TestMKLDNNWithPad(TestWithPad): - def init_op_type(self): + def init_kernel_type(self): self.use_mkldnn = True - self.op_type = "conv2d" class TestMKLDNNWithStride(TestWithStride): - def init_op_type(self): + def init_kernel_type(self): self.use_mkldnn = True - self.op_type = "conv2d" if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_debugger.py b/python/paddle/fluid/tests/unittests/test_debugger.py new file mode 100644 index 0000000000000000000000000000000000000000..2b7bbf9218f9b8fd8f5b29ac3cbc2f9680f471eb --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_debugger.py @@ -0,0 +1,58 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid import debuger +from paddle.fluid.framework import Program + + +class TestDebugger(unittest.TestCase): + def test_debug_str(self): + p = Program() + b = p.current_block() + + #selected_rows + b.create_var( + name='selected_rows', + dtype="float32", + shape=[5, 10], + type=core.VarDesc.VarType.SELECTED_ROWS) + + #tensor array + b.create_var( + name='tensor_array', + shape=[5, 10], + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY) + + #operator + mul_x = b.create_parameter( + dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") + mul_y = b.create_var( + dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") + mul_out = b.create_var( + dtype="float32", shape=[5, 8], lod_level=0, name="mul.out") + b.append_op( + type="mul", + inputs={"X": mul_x, + "Y": mul_y}, + outputs={"Out": mul_out}, + attrs={"x_num_col_dims": 1}) + + print(debuger.pprint_program_codes(p.desc)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_detection_map_op.py b/python/paddle/fluid/tests/unittests/test_detection_map_op.py index f3197a623efb1730d27467c5650bc90f2762e7b2..a905a854ad157ffa3d7816dfbd445f3e344a1249 100644 --- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py +++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py @@ -166,8 +166,6 @@ class TestDetectionMAPOp(OpTest): elif not difficult: label_count[label] += 1 - true_pos = collections.defaultdict(list) - false_pos = collections.defaultdict(list) for (label, score, tp, fp) in tf_pos: true_pos[label].append([score, tp]) false_pos[label].append([score, fp]) diff --git a/python/paddle/fluid/tests/unittests/test_detection_output_op.py b/python/paddle/fluid/tests/unittests/test_detection_output_op.py deleted file mode 100644 index 94681319144ee3e0d51b57944f5692183578c01b..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/test_detection_output_op.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest -import numpy as np -from op_test import OpTest - - -class TestUnpoolOp(OpTest): - def setUp(self): - self.op_type = "detection_output" - self.init_test_case() - - #loc.shape ((1, 4, 4, 1, 1)) - #conf.shape ((1, 4, 2, 1, 1)) - - loc = np.array([[[[[0.1]], [[0.1]], [[0.1]], [[0.1]]], - [[[0.1]], [[0.1]], [[0.1]], [[0.1]]], - [[[0.1]], [[0.1]], [[0.1]], [[0.1]]], - [[[0.1]], [[0.1]], [[0.1]], [[0.1]]]]]) - conf = np.array([[[[[0.1]], [[0.9]]], [[[0.2]], [[0.8]]], - [[[0.3]], [[0.7]]], [[[0.4]], [[0.6]]]]]) - priorbox = np.array([ - 0.1, 0.1, 0.5, 0.5, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.6, 0.6, 0.1, - 0.1, 0.2, 0.2, 0.3, 0.3, 0.7, 0.7, 0.1, 0.1, 0.2, 0.2, 0.4, 0.4, - 0.8, 0.8, 0.1, 0.1, 0.2, 0.2 - ]) - - output = np.array([ - 0, 1, 0.68997443, 0.099959746, 0.099959746, 0.50804031, 0.50804031 - ]) - self.inputs = { - 'Loc': loc.astype('float32'), - 'Conf': conf.astype('float32'), - 'PriorBox': priorbox.astype('float32') - } - self.attrs = { - 'num_classes': self.num_classes, - 'top_k': self.top_k, - 'nms_top_k': self.nms_top_k, - 'background_label_id': self.background_label_id, - 'nms_threshold': self.nms_threshold, - 'confidence_threshold': self.confidence_threshold, - } - self.outputs = {'Out': output.astype('float32')} - - def test_check_output(self): - self.check_output() - - def init_test_case(self): - self.num_classes = 2 - self.top_k = 10 - self.nms_top_k = 20 - self.background_label_id = 0 - self.nms_threshold = 0.01 - self.confidence_threshold = 0.01 - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 6944cca394fbc1ddde09dfeb0bc82e357a3cd225..90d70aa39fdc4d4d3f9062eb6a3eb0cdd014acfc 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -220,7 +220,7 @@ class TestBook(unittest.TestCase): seq_data = layers.data( name='seq_data', shape=[10, 10], dtype='float32', lod_level=1) seq = layers.fc(input=seq_data, size=20) - self.assertIsNotNone(layers.sequence_softmax(x=seq)) + self.assertIsNotNone(layers.sequence_softmax(seq)) print(str(program)) def test_softmax(self): @@ -228,7 +228,7 @@ class TestBook(unittest.TestCase): with program_guard(program): data = layers.data(name='data', shape=[10], dtype='float32') hid = layers.fc(input=data, size=20) - self.assertIsNotNone(layers.softmax(x=hid)) + self.assertIsNotNone(layers.softmax(hid)) print(str(program)) def test_get_places(self): diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py index 00a6f7c237d58458ea083abf47dd09585cd6f235..6382e290eb30c621da64d5c600be6d8a7c6254f1 100644 --- a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py +++ b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py @@ -98,6 +98,9 @@ class TestLearningRateDecay(unittest.TestCase): exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) + + fluid.memory_optimize(fluid.default_main_program()) + for step in range(10): lr_val, = exe.run(fluid.default_main_program(), feed={}, diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py index 03a5bd24a133e703855400532517c293196b64f0..ed920ad388ff0e01887404e70fe82565b4cd28fa 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py @@ -15,6 +15,8 @@ import unittest import numpy as np from op_test import OpTest +import paddle.fluid.core as core +from paddle.fluid.op import Operator class TestLookupTableOp(OpTest): @@ -47,5 +49,52 @@ class TestLookupTableOpWithPadding(TestLookupTableOp): pass +class TestLookupTableIdsIsSelectedRows(OpTest): + def check_with_place(self, place): + scope = core.Scope() + + # create and initialize Variable + height = 10 + rows = [0, 4, 4, 7] + row_numel = 12 + + # create and initialize W Variable + W = scope.var('W').get_tensor() + W_array = np.full((height, row_numel), 1.0).astype("float32") + for i in range(height): + W_array[i] *= i + W.set(W_array, place) + + # create and initialize Ids Variable + ids_selected_rows = scope.var('Ids').get_selected_rows() + ids_selected_rows.set_height(len(rows)) + ids_selected_rows.set_rows(rows) + np_array = np.ones((len(rows), row_numel)).astype("float32") + ids_tensor = ids_selected_rows.get_tensor() + ids_tensor.set(np_array, place) + + # create Out Variable + Out = scope.var('Out').get_selected_rows() + + # create and run lookup_table operator + concat_rows_op = Operator("lookup_table", W='W', Ids='Ids', Out='Out') + concat_rows_op.run(scope, place) + + # get result from Out + Out_tensor = Out.get_tensor() + result_array = np.array(Out_tensor) + + # all(): return True if all elements of the iterable are true (or if the iterable is empty) + for idx, row in enumerate(rows): + assert (row == result_array[idx]).all() + + def test_concat_rows(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + for place in places: + self.check_with_place(place) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_lrn_op.py b/python/paddle/fluid/tests/unittests/test_lrn_op.py index 7f2352c5882ce36d8d681a737806f3ee0e3ace98..eaff45cbb2a58798e9d55149510bec72eea370cd 100644 --- a/python/paddle/fluid/tests/unittests/test_lrn_op.py +++ b/python/paddle/fluid/tests/unittests/test_lrn_op.py @@ -41,7 +41,7 @@ class TestLRNOp(OpTest): mid.fill(self.k) for m in range(0, self.N): for i in range(0, self.C): - for c in range(start, end + 1): + for c in range(start, end): ch = i + c if ch < 0 or ch >= self.C: continue diff --git a/python/paddle/fluid/tests/unittests/test_mul_op.py b/python/paddle/fluid/tests/unittests/test_mul_op.py index 9d1da420c7f70bd2a89d183a5f0a2b145f0ff475..40440bea1267112b84b66002a0bf921be3029265 100644 --- a/python/paddle/fluid/tests/unittests/test_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_mul_op.py @@ -14,6 +14,7 @@ import unittest import numpy as np +import paddle.fluid.core as core from op_test import OpTest @@ -69,5 +70,42 @@ class TestMulOp2(OpTest): ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) +class TestFP16MulOp1(OpTest): + def setUp(self): + self.op_type = "mul" + x = np.random.random((32, 84)).astype("float16") + y = np.random.random((84, 100)).astype("float16") + self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)} + self.outputs = {'Out': np.dot(x, y)} + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=1e-1) + + +class TestFP16MulOp2(OpTest): + def setUp(self): + self.op_type = "mul" + x = np.random.random((15, 4, 12, 10)).astype("float16") + y = np.random.random((4, 30, 8, 2, 9)).astype("float16") + self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)} + self.attrs = { + 'x_num_col_dims': 2, + 'y_num_col_dims': 2, + } + result = np.dot( + x.reshape(15 * 4, 12 * 10), y.reshape(4 * 30, 8 * 2 * 9)) + result = result.reshape(15, 4, 8, 2, 9) + self.outputs = {'Out': result} + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-1) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py index 9d87f4daa98da42fcc33aba4b51a4528343fb137..e775db1d10f4561b6fb90631757a25c9f74cb777 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer.py @@ -21,31 +21,43 @@ from paddle.fluid.backward import append_backward class TestOptimizer(unittest.TestCase): def test_sgd_optimizer(self): - init_program = framework.Program() - program = framework.Program() - block = program.global_block() - mul_x = block.create_parameter( - dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") - mul_y = block.create_var( - dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") - mul_out = block.create_var( - dtype="float32", shape=[5, 8], lod_level=0, name="mul.out") - mean_out = block.create_var( - dtype="float32", shape=[1], lod_level=0, name="mean.out") - block.append_op( - type="mul", - inputs={"X": mul_x, - "Y": mul_y}, - outputs={"Out": mul_out}, - attrs={"x_num_col_dims": 1}) - block.append_op( - type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) - sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01) - opts, _ = sgd_optimizer.minimize(mean_out, init_program) + def check_sgd_optimizer(optimizer_attr): + init_program = framework.Program() + program = framework.Program() + block = program.global_block() + mul_x = block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="mul.x", + optimize_attr=optimizer_attr) + mul_y = block.create_var( + dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") + mul_out = block.create_var( + dtype="float32", shape=[5, 8], lod_level=0, name="mul.out") + mean_out = block.create_var( + dtype="float32", shape=[1], lod_level=0, name="mean.out") + block.append_op( + type="mul", + inputs={"X": mul_x, + "Y": mul_y}, + outputs={"Out": mul_out}, + attrs={"x_num_col_dims": 1}) + block.append_op( + type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) + sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01) + opts, _ = sgd_optimizer.minimize(mean_out, init_program) + return opts + + opts = check_sgd_optimizer({'learning_rate': 1.1}) self.assertEqual(len(opts), 3) self.assertEqual([op.type for op in opts], ["fill_constant", "elementwise_mul", "sgd"]) + opts = check_sgd_optimizer({'learning_rate': 1.0}) + self.assertEqual(len(opts), 1) + self.assertEqual([op.type for op in opts], ["sgd"]) + class TestMomentumOptimizer(unittest.TestCase): class MockMomentum(optimizer.MomentumOptimizer): @@ -60,7 +72,11 @@ class TestMomentumOptimizer(unittest.TestCase): program = framework.Program() block = program.global_block() mul_x = block.create_parameter( - dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") + dtype="float32", + shape=[5, 10], + lod_level=0, + name="mul.x", + optimize_attr={'learning_rate': 1.1}) mul_y = block.create_var( dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") mul_out = block.create_var( @@ -110,7 +126,11 @@ class TestMomentumOptimizer(unittest.TestCase): program = framework.Program() block = program.global_block() mul_x = block.create_parameter( - dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") + dtype="float32", + shape=[5, 10], + lod_level=0, + name="mul.x", + optimize_attr={'learning_rate': 1.1}) mul_y = block.create_var( dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") mul_out = block.create_var( @@ -169,7 +189,11 @@ class TestAdagradOptimizer(unittest.TestCase): program = framework.Program() block = program.global_block() mul_x = block.create_parameter( - dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") + dtype="float32", + shape=[5, 10], + lod_level=0, + name="mul.x", + optimize_attr={'learning_rate': 1.1}) mul_y = block.create_var( dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") mul_out = block.create_var( @@ -229,7 +253,11 @@ class TestAdamOptimizer(unittest.TestCase): program = framework.Program() block = program.global_block() mul_x = block.create_parameter( - dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") + dtype="float32", + shape=[5, 10], + lod_level=0, + name="mul.x", + optimize_attr={'learning_rate': 1.1}) mul_y = block.create_var( dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") mul_out = block.create_var( @@ -292,7 +320,11 @@ class TestAdamaxOptimizer(unittest.TestCase): program = framework.Program() block = program.global_block() mul_x = block.create_parameter( - dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") + dtype="float32", + shape=[5, 10], + lod_level=0, + name="mul.x", + optimize_attr={'learning_rate': 1.1}) mul_y = block.create_var( dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") mul_out = block.create_var( @@ -352,7 +384,11 @@ class TestDecayedAdagradOptimizer(unittest.TestCase): program = framework.Program() block = program.global_block() mul_x = block.create_parameter( - dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") + dtype="float32", + shape=[5, 10], + lod_level=0, + name="mul.x", + optimize_attr={'learning_rate': 1.1}) mul_y = block.create_var( dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") mul_out = block.create_var( diff --git a/python/paddle/fluid/tests/unittests/test_parallel_op.py b/python/paddle/fluid/tests/unittests/test_parallel_op.py index 1a7551c57b26f576ab286e7b18177b9120261623..79bea148f9398152a02d70946cdc5fff1f47ba6b 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_op.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_op.py @@ -15,6 +15,7 @@ import unittest import paddle.fluid as fluid +import paddle.fluid.profiler as profiler import numpy @@ -60,20 +61,23 @@ class BaseParallelForTest(unittest.TestCase): feed=feed, fetch=fetch, place=gpu, - use_parallel=False) + use_parallel=False, + use_gpu=True) result_gpu_parallel = self._run_test_impl_( callback=callback, feed=feed, fetch=fetch, place=gpu, - use_parallel=True) + use_parallel=True, + use_gpu=True) result_gpu_nccl = self._run_test_impl_( callback=callback, feed=feed, fetch=fetch, place=gpu, use_parallel=True, - use_nccl=True) + use_nccl=True, + use_gpu=True) self._assert_same_(fetch, result_cpu, result_cpu_parallel, result_gpu, result_gpu_parallel, result_gpu_nccl) else: @@ -85,7 +89,8 @@ class BaseParallelForTest(unittest.TestCase): fetch, place, use_parallel=False, - use_nccl=False): + use_nccl=False, + use_gpu=False): """ Run a single test, returns the fetch values Args: @@ -132,7 +137,12 @@ class BaseParallelForTest(unittest.TestCase): exe = fluid.Executor(place) exe.run(startup) - return exe.run(main, feed=feed, fetch_list=fetch) + if use_gpu: + profile_type = 'GPU' + else: + profile_type = 'CPU' + with profiler.profiler(profile_type, 'total', '/tmp/profiler'): + return exe.run(main, feed=feed, fetch_list=fetch) def _assert_same_(self, fetch, *args): """ diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py index 964d78f1966aa10e36eeaabe943d44e002d50293..764fa575fba1615de3171e848890b3836e640849 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py @@ -78,20 +78,22 @@ def avg_pool2D_forward_naive(x, class TestPool2d_Op(OpTest): def setUp(self): + self.op_type = "pool2d" self.use_cudnn = False self.use_mkldnn = False + self.dtype = np.float32 self.init_test_case() self.init_global_pool() - self.init_op_type() + self.init_kernel_type() self.init_pool_type() self.init_ceil_mode() if self.global_pool: self.paddings = [0 for _ in range(len(self.paddings))] - input = np.random.random(self.shape).astype("float32") + input = np.random.random(self.shape).astype(self.dtype) output = self.pool2D_forward_naive(input, self.ksize, self.strides, self.paddings, self.global_pool, - self.ceil_mode).astype("float32") - self.inputs = {'X': input} + self.ceil_mode).astype(self.dtype) + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)} self.attrs = { 'strides': self.strides, @@ -105,7 +107,7 @@ class TestPool2d_Op(OpTest): 'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter } - self.outputs = {'Out': output.astype('float32')} + self.outputs = {'Out': output} def test_check_output(self): if self.use_cudnn: @@ -115,6 +117,8 @@ class TestPool2d_Op(OpTest): self.check_output() def test_check_grad(self): + if self.dtype == np.float16: + return if self.use_cudnn and self.pool_type != "max": place = core.CUDAPlace(0) self.check_grad_with_place( @@ -128,8 +132,8 @@ class TestPool2d_Op(OpTest): self.strides = [1, 1] self.paddings = [0, 0] - def init_op_type(self): - self.op_type = "pool2d" + def init_kernel_type(self): + pass def init_pool_type(self): self.pool_type = "avg" @@ -149,9 +153,6 @@ class TestCase1(TestPool2d_Op): self.strides = [1, 1] self.paddings = [0, 0] - def init_op_type(self): - self.op_type = "pool2d" - def init_pool_type(self): self.pool_type = "avg" self.pool2D_forward_naive = avg_pool2D_forward_naive @@ -167,9 +168,6 @@ class TestCase2(TestPool2d_Op): self.strides = [1, 1] self.paddings = [1, 1] - def init_op_type(self): - self.op_type = "pool2d" - def init_pool_type(self): self.pool_type = "avg" self.pool2D_forward_naive = avg_pool2D_forward_naive @@ -179,27 +177,18 @@ class TestCase2(TestPool2d_Op): class TestCase3(TestPool2d_Op): - def init_op_type(self): - self.op_type = "pool2d" - def init_pool_type(self): self.pool_type = "max" self.pool2D_forward_naive = max_pool2D_forward_naive class TestCase4(TestCase1): - def init_op_type(self): - self.op_type = "pool2d" - def init_pool_type(self): self.pool_type = "max" self.pool2D_forward_naive = max_pool2D_forward_naive class TestCase5(TestCase2): - def init_op_type(self): - self.op_type = "pool2d" - def init_pool_type(self): self.pool_type = "max" self.pool2D_forward_naive = max_pool2D_forward_naive @@ -207,39 +196,105 @@ class TestCase5(TestCase2): #--------------------test pool2d-------------------- class TestCUDNNCase1(TestPool2d_Op): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "pool2d" + + +class TestFP16CUDNNCase1(TestPool2d_Op): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=1e-3) class TestCUDNNCase2(TestCase1): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "pool2d" + + +class TestFP16CUDNNCase2(TestCase1): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=1e-3) class TestCUDNNCase3(TestCase2): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "pool2d" + + +class TestFP16CUDNNCase3(TestCase2): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=1e-3) class TestCUDNNCase4(TestCase3): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "pool2d" + + +class TestFP16CUDNNCase4(TestCase3): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=1e-3) class TestCUDNNCase5(TestCase4): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "pool2d" + + +class TestFP16CUDNNCase5(TestCase4): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=1e-3) class TestCUDNNCase6(TestCase5): - def init_op_type(self): + def init_kernel_type(self): self.use_cudnn = True - self.op_type = "pool2d" + + +class TestFP16CUDNNCase6(TestCase5): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=1e-3) class TestCeilModeCase1(TestCUDNNCase1): @@ -264,39 +319,33 @@ class TestCeilModeCase4(TestCase2): #--------------------test pool2d MKLDNN-------------------- class TestMKLDNNCase1(TestPool2d_Op): - def init_op_type(self): + def init_kernel_type(self): self.use_mkldnn = True - self.op_type = "pool2d" class TestMKLDNNCase2(TestCase1): - def init_op_type(self): + def init_kernel_type(self): self.use_mkldnn = True - self.op_type = "pool2d" class TestMKLDNNCase3(TestCase2): - def init_op_type(self): + def init_kernel_type(self): self.use_mkldnn = True - self.op_type = "pool2d" class TestMKLDNNCase4(TestCase3): - def init_op_type(self): + def init_kernel_type(self): self.use_mkldnn = True - self.op_type = "pool2d" class TestMKLDNNCase5(TestCase4): - def init_op_type(self): + def init_kernel_type(self): self.use_mkldnn = True - self.op_type = "pool2d" class TestMKLDNNCase6(TestCase5): - def init_op_type(self): + def init_kernel_type(self): self.use_mkldnn = True - self.op_type = "pool2d" if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py index 1da6b94eea30e65913ce713e0e5e355507534161..cf6fe14a86aa1ab6ea3f60ad15f33d708e9b803a 100644 --- a/python/paddle/fluid/tests/unittests/test_profiler.py +++ b/python/paddle/fluid/tests/unittests/test_profiler.py @@ -31,8 +31,22 @@ class TestProfiler(unittest.TestCase): with fluid.program_guard(main_program, startup_program): image = fluid.layers.data(name='x', shape=[784], dtype='float32') - hidden1 = fluid.layers.fc(input=image, size=128, act='relu') - hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu') + hidden1 = fluid.layers.fc(input=image, size=64, act='relu') + i = layers.zeros(shape=[1], dtype='int64') + counter = fluid.layers.zeros( + shape=[1], dtype='int64', force_cpu=True) + until = layers.fill_constant([1], dtype='int64', value=10) + data_arr = layers.array_write(hidden1, i) + cond = fluid.layers.less_than(x=counter, y=until) + while_op = fluid.layers.While(cond=cond) + with while_op.block(): + hidden_n = fluid.layers.fc(input=hidden1, size=64, act='relu') + layers.array_write(hidden_n, i, data_arr) + fluid.layers.increment(x=counter, value=1, in_place=True) + layers.less_than(x=counter, y=until, cond=cond) + + hidden_n = layers.array_read(data_arr, i) + hidden2 = fluid.layers.fc(input=hidden_n, size=64, act='relu') predict = fluid.layers.fc(input=hidden2, size=10, act='softmax') label = fluid.layers.data(name='y', shape=[1], dtype='int64') cost = fluid.layers.cross_entropy(input=predict, label=label) diff --git a/python/paddle/fluid/tests/unittests/test_recordio_reader.py b/python/paddle/fluid/tests/unittests/test_recordio_reader.py new file mode 100644 index 0000000000000000000000000000000000000000..24a0074d9b9621d902d12eb8cb29d9b65be22ed3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_recordio_reader.py @@ -0,0 +1,81 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import paddle.fluid as fluid +import paddle.v2 as paddle +import paddle.v2.dataset.mnist as mnist + + +class TestRecordIO(unittest.TestCase): + def setUp(self): + # Convert mnist to recordio file + with fluid.program_guard(fluid.Program(), fluid.Program()): + reader = paddle.batch(mnist.train(), batch_size=32) + feeder = fluid.DataFeeder( + feed_list=[ # order is image and label + fluid.layers.data( + name='image', shape=[784]), + fluid.layers.data( + name='label', shape=[1], dtype='int64'), + ], + place=fluid.CPUPlace()) + self.num_batches = fluid.recordio_writer.convert_reader_to_recordio_file( + './mnist.recordio', reader, feeder) + + def test_main(self, decorator_callback=None): + # use new program + with fluid.program_guard(fluid.Program(), fluid.Program()): + data_file = fluid.layers.open_recordio_file( + './mnist.recordio', + shapes=[[-1, 784], [-1, 1]], + lod_levels=[0, 0], + dtypes=['float32', 'int64']) + if decorator_callback is not None: + data_file = decorator_callback(data_file) + img, label = fluid.layers.read_file(data_file) + + hidden = fluid.layers.fc(input=img, size=100, act='tanh') + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + avg_loss = fluid.layers.mean(loss) + + fluid.optimizer.Adam(learning_rate=1e-3).minimize(avg_loss) + + if fluid.core.is_compiled_with_cuda(): + place = fluid.CUDAPlace(0) + else: + place = fluid.CPUPlace() + + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + avg_loss_np = [] + + # train a pass + batch_id = 0 + while not data_file.eof(): + tmp, = exe.run(fetch_list=[avg_loss]) + avg_loss_np.append(tmp) + batch_id += 1 + data_file.reset() + self.assertEqual(batch_id, self.num_batches) + self.assertLess(avg_loss_np[-1], avg_loss_np[0]) + + def test_shuffle_reader(self): + self.test_main(decorator_callback=lambda reader: fluid.layers.create_shuffle_reader(reader, buffer_size=200)) + + def test_double_buffer_reader(self): + self.test_main(decorator_callback=lambda reader: fluid.layers.create_double_buffer_reader(reader, + place='cuda:0' if fluid.core.is_compiled_with_cuda() else 'cpu')) diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index 5e656bddb7927b6e7935255c120e5e858505125a..9b0cc3534dc551e7fdf7ef8111cad1c172f8bfa4 100644 --- a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -70,6 +70,19 @@ class TestMinOp(OpTest): self.check_output() +class TestProdOp(OpTest): + def setUp(self): + self.op_type = "reduce_prod" + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} + self.outputs = {'Out': self.inputs['X'].prod(axis=0)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + class TestKeepDimReduce(OpTest): def setUp(self): self.op_type = "reduce_sum" diff --git a/python/paddle/fluid/tests/unittests/test_scatter_op.py b/python/paddle/fluid/tests/unittests/test_scatter_op.py index bb02a40d449860cf6c0576662e79a5e36e6e0635..fb1728743630b3ea908ae835444eff7fd71b72c8 100644 --- a/python/paddle/fluid/tests/unittests/test_scatter_op.py +++ b/python/paddle/fluid/tests/unittests/test_scatter_op.py @@ -25,7 +25,7 @@ class TestScatterOp(OpTest): updates_np = np.random.random((2, 3)).astype("float32") output_np = np.copy(ref_np) output_np[index_np] = updates_np - self.inputs = {'Ref': ref_np, 'Index': index_np, 'Updates': updates_np} + self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} self.outputs = {'Out': output_np} def test_check_output(self): diff --git a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py index 9e5c1e7a3d0bdf514de11e797d7139f577002c52..d6dc99bb3106feee33daa52bffb386f07cc16de5 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py @@ -16,11 +16,15 @@ import unittest import numpy as np from op_test import OpTest from test_softmax_op import stable_softmax +import paddle.fluid.core as core class TestSequenceSoftmaxOp(OpTest): def setUp(self): self.op_type = "sequence_softmax" + self.use_cudnn = False + self.init_op_type() + x = np.random.uniform(0.1, 1, (11, 1)).astype("float32") lod = [[0, 4, 5, 8, 11]] @@ -34,12 +38,31 @@ class TestSequenceSoftmaxOp(OpTest): self.inputs = {"X": (x, lod)} self.outputs = {"Out": out} + self.attrs = {'use_cudnn': self.use_cudnn, } + + def init_op_type(self): + pass def test_check_output(self): - self.check_output() + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-5) + else: + self.check_output() def test_check_grad(self): - self.check_grad(["X"], "Out", max_relative_error=0.01) + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ["X"], "Out", max_relative_error=0.01) + else: + self.check_grad(["X"], "Out", max_relative_error=0.01) + + +# ----------------cudnn Sequencesoftmax---------------- +class TestSequenceSoftmaxCUDNNOp(TestSequenceSoftmaxOp): + def init_op_type(self): + self.use_cudnn = True if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py index 8f8312edca7e2d98eb4e881f671c6afdda01c57a..4f20da2b926823db9e7ec92c95178b6d3d1feec9 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py @@ -15,6 +15,7 @@ import unittest import numpy as np from op_test import OpTest +import paddle.fluid.core as core def stable_softmax(x): @@ -27,18 +28,37 @@ def stable_softmax(x): class TestSoftmaxOp(OpTest): def setUp(self): self.op_type = "softmax" + self.use_cudnn = False self.inputs = { 'X': np.random.uniform(0.1, 1, [10, 10]).astype("float32") } self.outputs = { 'Out': np.apply_along_axis(stable_softmax, 1, self.inputs['X']) } + self.attrs = {'use_cudnn': self.use_cudnn, } + + def init_op_type(self): + pass def test_check_output(self): - self.check_output() + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-5) + else: + self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Out') + if self.use_cudnn: + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ["X"], "Out", max_relative_error=0.01) + else: + self.check_grad(["X"], "Out", max_relative_error=0.01) + + +class TestSoftmaxCUDNNOp(TestSoftmaxOp): + def init_op_type(self): + self.use_cudnn = True if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py index 889fea2ce66e64d439b51498722e571f48cd1f96..c0d9fc8f22a7c4f791d80a9cad87d003b5d54299 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py @@ -26,7 +26,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest): def setUp(self): self.op_type = "softmax_with_cross_entropy" - batch_size = 2 + batch_size = 41 class_num = 37 logits = np.random.uniform(0.1, 1.0, @@ -59,7 +59,7 @@ class TestSoftmaxWithCrossEntropyOp2(OpTest): def setUp(self): self.op_type = "softmax_with_cross_entropy" - batch_size = 2 + batch_size = 41 class_num = 37 logits = np.random.uniform(0.1, 1.0, diff --git a/python/setup.py.in b/python/setup.py.in index f830039a3af581d593d510326f15139377cb25f1..4cb5409524457b7bc5a99c88a0dbbfc8834923fa 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -62,20 +62,22 @@ write_version_py(filename='@PADDLE_SOURCE_DIR@/python/paddle/version.py') packages=['paddle', - 'paddle.proto', - 'paddle.trainer', - 'paddle.trainer_config_helpers', 'paddle.utils', - 'paddle.v2', - 'paddle.v2.dataset', - 'paddle.v2.reader', - 'paddle.v2.master', - 'paddle.v2.plot', 'paddle.fluid', 'paddle.fluid.proto', 'paddle.fluid.proto.profiler', - 'paddle.fluid.layers', - 'py_paddle'] + 'paddle.fluid.layers'] + +if '${WITH_FLUID}'== 'OFF': + packages+=['paddle.proto', + 'paddle.trainer', + 'paddle.trainer_config_helpers', + 'paddle.v2', + 'paddle.v2.dataset', + 'paddle.v2.reader', + 'paddle.v2.master', + 'paddle.v2.plot', + 'py_paddle'] with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f: setup_requires = f.read().splitlines() @@ -84,11 +86,29 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']: setup_requires+=['opencv-python'] # the prefix is sys.prefix which should always be usr -paddle_bin_dir = 'opt/paddle/bin' -paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer', - '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model', - '${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main', - '${PADDLE_BINARY_DIR}/paddle/scripts/paddle'] +paddle_bins = '' +if '${WITH_FLUID}'== 'OFF': + paddle_bin_dir = 'opt/paddle/bin' + paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer', + '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model', + '${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main', + '${PADDLE_BINARY_DIR}/paddle/scripts/paddle'] + +package_data={'paddle.fluid': ['core.so']} +if '${WITH_FLUID}'== 'OFF': + package_data['paddle.v2.master']=['libpaddle_master.so'] + package_data['py_paddle']=['*.py','_swig_paddle.so'] + +package_dir={ + '': '${CMAKE_CURRENT_SOURCE_DIR}', + # The paddle.fluid.proto will be generated while compiling. + # So that package points to other directory. + 'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform', + 'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework', +} +if '${WITH_FLUID}'== 'OFF': + package_dir['py_paddle']='${PADDLE_SOURCE_DIR}/paddle/py_paddle' + paddle_rt_lib_dir = 'lib' paddle_rt_libs = ['${WARPCTC_LIBRARIES}'] @@ -101,19 +121,8 @@ setup(name='${PACKAGE_NAME}', install_requires=setup_requires, packages=packages, ext_modules=[Extension('_foo', ['stub.cc'])], - package_data={ - 'paddle.v2.master': ['libpaddle_master.so'], - 'paddle.fluid': ['core.so'], - 'py_paddle':['*.py','_swig_paddle.so'] - }, - package_dir={ - '': '${CMAKE_CURRENT_SOURCE_DIR}', - # The paddle.fluid.proto will be generated while compiling. - # So that package points to other directory. - 'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform', - 'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework', - 'py_paddle': '${PADDLE_SOURCE_DIR}/paddle/py_paddle' - }, + package_data=package_data, + package_dir=package_dir, scripts=paddle_bins, data_files=[(paddle_rt_lib_dir, paddle_rt_libs)] ) diff --git a/tools/timeline.py b/tools/timeline.py index ee83a1baecdd4243bb6c546486a837393980fb65..f4083c824e7333a74661d096d4954609f767c83e 100644 --- a/tools/timeline.py +++ b/tools/timeline.py @@ -121,27 +121,34 @@ class Timeline(object): def _allocate_pids(self): for event in self._profile_pb.events: - if event.device_id not in self._devices: - pid = self._allocate_pid() - self._devices[event.device_id] = pid - if event.device_id >= 0: - self._chrome_trace.emit_pid("gpu:%s:stream:%d" % - (pid, event.stream_id), pid) - elif event.device_id == -1: - self._chrome_trace.emit_pid("cpu:thread_hash:%d" % - event.stream_id, pid) + if event.type == profiler_pb2.Event.CPU: + if (event.device_id, "CPU") not in self._devices: + pid = self._allocate_pid() + self._devices[(event.device_id, "CPU")] = pid + self._chrome_trace.emit_pid("cpu:block:%d" % + (event.device_id), pid) + elif event.type == profiler_pb2.Event.GPUKernel: + if (event.device_id, "GPUKernel") not in self._devices: + pid = self._allocate_pid() + self._devices[(event.device_id, "GPUKernel")] = pid + self._chrome_trace.emit_pid("gpu:%d" % (event.device_id), + pid) def _allocate_events(self): for event in self._profile_pb.events: - pid = self._devices[event.device_id] + if event.type == profiler_pb2.Event.CPU: + type = "CPU" + elif event.type == profiler_pb2.Event.GPUKernel: + type = "GPUKernel" + pid = self._devices[(event.device_id, type)] args = {'name': event.name} if event.memcopy.bytes > 0: args = {'mem_bytes': event.memcopy.bytes} # TODO(panyx0718): Chrome tracing only handles ms. However, some # ops takes micro-seconds. Hence, we keep the ns here. - self._chrome_trace.emit_region(event.start_ns, - (event.end_ns - event.start_ns) / - 1.0, pid, 0, 'Op', event.name, args) + self._chrome_trace.emit_region( + event.start_ns, (event.end_ns - event.start_ns) / 1.0, pid, + event.sub_device_id, 'Op', event.name, args) def generate_chrome_trace(self): self._allocate_pids()