diff --git a/Dockerfile b/Dockerfile index d99d3d182ef5cb4531ecaff999c048ce806eae80..164fe84904947bfc3cf71132b5fba04744460b26 100644 --- a/Dockerfile +++ b/Dockerfile @@ -32,7 +32,7 @@ RUN apt-get update && \ automake locales clang-format swig doxygen cmake \ liblapack-dev liblapacke-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \ - net-tools libtool && \ + net-tools libtool ccache && \ apt-get clean -y # Install Go and glide diff --git a/benchmark/cluster/vgg16/run_vgg_dist.sh b/benchmark/cluster/vgg16/run_vgg_dist.sh new file mode 100644 index 0000000000000000000000000000000000000000..8c0501439e9d5fa175f5aa9b62d286e690a10904 --- /dev/null +++ b/benchmark/cluster/vgg16/run_vgg_dist.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Update to point to the source file. +VGG_SRC="vgg16_fluid.py" + +export TRAINING_ROLE=PSERVER +export TRAINERS=2 +export POD_IP=127.0.0.1 +export PADDLE_INIT_PORT=6174 +MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 & + +# Need to wait for the ps to start first. +sleep 10 +echo "done start ps" + +export TRAINING_ROLE=TRAINER +export TRAINERS=2 +export POD_IP=127.0.0.1 +export PADDLE_INIT_PORT=6174 +CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 & +CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 & diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py index 8b29227cfab2a36d5b9f6d17b837b33da8d2a92e..6c7d2c10363d3e311dfae455f3dd8fcfc51077a0 100644 --- a/benchmark/cluster/vgg16/vgg16_fluid.py +++ b/benchmark/cluster/vgg16/vgg16_fluid.py @@ -200,18 +200,19 @@ def main(): num_samples += len(data) train_pass_acc.add(value=acc, weight=b_size) print( - "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s" - % (pass_id, iters, loss, acc, - len(data) / (time.time() - ts)) + "Task:%d Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, " + "Speed = %.2f img/s " % (args.task_index, pass_id, iters, + loss, acc, + len(data) / (time.time() - ts)) ) # The accuracy is the accumulation of batches, but not the current batch. 
pass_elapsed = time.time() - start_time pass_train_acc = train_pass_acc.eval() pass_test_acc = test(exe) - print( - "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n" - % (pass_id, num_samples / pass_elapsed, pass_train_acc, - pass_test_acc)) + print("Task:%d Pass = %d, Training performance = %f imgs/s, " + "Train accuracy = %f, Test accuracy = %f\n" % + (args.task_index, pass_id, num_samples / pass_elapsed, + pass_train_acc, pass_test_acc)) if args.local: # Parameter initialization @@ -239,8 +240,6 @@ def main(): t = fluid.DistributeTranspiler() t.transpile( - optimize_ops, - params_grads, trainer_id=args.task_index, pservers=args.ps_hosts, trainers=trainers) diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp index 24b0020636c0141b87dc80f5079f7342ec28157c..733d49cacfda17ad19b7bd7918be73c1fd14a64f 100644 --- a/paddle/capi/Matrix.cpp +++ b/paddle/capi/Matrix.cpp @@ -108,7 +108,7 @@ paddle_error paddle_matrix_get_row(paddle_matrix mat, paddle_error paddle_matrix_get_shape(paddle_matrix mat, uint64_t* height, uint64_t* width) { - if (mat == nullptr) return kPD_NULLPTR; + if (mat == nullptr || cast(mat)->mat == nullptr) return kPD_NULLPTR; if (height != nullptr) { *height = cast(mat)->mat->getHeight(); } diff --git a/paddle/cuda/include/hl_base.h b/paddle/cuda/include/hl_base.h index 6c4f09dacb47c431db2e2610a3e61390a82dcba0..402302a5bfa877be33bf3bbd6ba05c8f3e48c3ba 100644 --- a/paddle/cuda/include/hl_base.h +++ b/paddle/cuda/include/hl_base.h @@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifndef HL_BASE_H_ -#define HL_BASE_H_ +#pragma once #include @@ -207,8 +206,8 @@ typedef struct { #ifdef __NVCC__ -#include "cuda_runtime.h" -#include "hl_cuda.h" +#include +#include "paddle/cuda/include/hl_cuda.h" #include "paddle/utils/Logging.h" extern __thread bool g_sync_flag; @@ -228,6 +227,19 @@ extern __thread cudaStream_t default_stream; << "CUDA error: " << hl_get_device_error_string((size_t)err); \ } -#endif /* __NVCC__ */ +// __shfl has been deprecated as of CUDA 9.0. 
+#if CUDA_VERSION < 9000 +template +__forceinline__ __device__ T +__shfl_sync(unsigned, T val, int src_line, int width) { + return __shfl(val, src_line, width); +} -#endif /* HL_BASE_H_ */ +#define CREATE_SHFL_MASK(mask, predicate) mask = 0u; +#else +#define FULL_WARP_MASK 0xFFFFFFFF +#define CREATE_SHFL_MASK(mask, predicate) \ + mask = __ballot_sync(FULL_WARP_MASK, (predicate)) +#endif + +#endif // __NVCC__ diff --git a/paddle/cuda/src/hl_cuda_lstm.cu b/paddle/cuda/src/hl_cuda_lstm.cu index 38371366f8e2ad738974cd84a75926f72820e05f..e30fcddffdf99417a4b9b811a0b0cb0a12e79b99 100644 --- a/paddle/cuda/src/hl_cuda_lstm.cu +++ b/paddle/cuda/src/hl_cuda_lstm.cu @@ -341,12 +341,15 @@ void hl_lstm_parallel_forward(real *gateValue, } __device__ __forceinline__ void transpose_32x32(real a[], const int idx) { - int addr = idx % 32; + const int warp_size = 32; + int addr = idx % warp_size; + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, addr < warp_size); #pragma unroll for (int k = 1; k < 32; k++) { // rSrc[k] = __shfl_sync(rSrc[k], (threadIdx.x + k) % 32, 32); - addr = __shfl_sync(addr, (idx + 1) % 32, 32); - a[k] = __shfl_sync(a[k], addr, 32); + addr = __shfl_sync(mask, addr, (idx + 1) % 32, 32); + a[k] = __shfl_sync(mask, a[k], addr, 32); } #pragma unroll @@ -360,10 +363,11 @@ __device__ __forceinline__ void transpose_32x32(real a[], const int idx) { } addr = (32 - idx) % 32; + CREATE_SHFL_MASK(mask, idx % 32 < warp_size); #pragma unroll for (int k = 0; k < 32; k++) { - a[k] = __shfl_sync(a[k], addr, 32); - addr = __shfl_sync(addr, (idx + 31) % 32, 32); + a[k] = __shfl_sync(mask, a[k], addr, 32); + addr = __shfl_sync(mask, addr, (idx + 31) % 32, 32); } } diff --git a/paddle/cuda/src/hl_top_k.cu b/paddle/cuda/src/hl_top_k.cu index 94c9cceb2c37f5a9d7a1f903864f42f1a3ebbcdc..59ba552f560dab904d4983e0778ff57be9477c3e 100644 --- a/paddle/cuda/src/hl_top_k.cu +++ b/paddle/cuda/src/hl_top_k.cu @@ -244,13 +244,16 @@ __device__ __forceinline__ void blockReduce(Pair* shTopK, if (--beamSize == 0) break; __syncthreads(); + unsigned mask = 0u; + // CREATE_SHFL_MASK(mask, tid < len); + if (tid == maxId[0]) { if (beam < maxLength) { shTopK[tid] = topK[beam]; } } if (maxId[0] / 32 == warp) { - if (__shfl_sync(beam, (maxId[0]) % 32, 32) == maxLength) break; + if (__shfl_sync(mask, beam, (maxId[0]) % 32, 32) == maxLength) break; } } } diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index c2eb1c31b4f5625e662436e278a33c55b38bb004..daba9bf2dd861d6366723ed71eada66bc67c78f2 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -34,7 +34,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( const std::vector &places, const std::string &loss_var_name, const std::unordered_set ¶ms, - const std::vector &local_scopes, bool skip_scale_loss, + const std::vector &local_scopes, bool use_default_grad_scale, platform::NCCLContextMap *nccl_ctxs) : loss_var_name_(loss_var_name), places_(places), @@ -45,7 +45,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( const std::vector &places, const std::string &loss_var_name, const std::unordered_set ¶ms, - const std::vector &local_scopes, bool skip_scale_loss) + const std::vector &local_scopes, bool use_default_grad_scale) : loss_var_name_(loss_var_name), places_(places), local_scopes_(local_scopes) { @@ -53,7 +53,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( for (auto &p : params) { 
grad_names_.insert(GradVarName(p)); } - skip_scale_loss_ = skip_scale_loss; + use_default_grad_scale_ = use_default_grad_scale; } void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, @@ -126,8 +126,8 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( } else if (IsDistTrainOp(*op, send_op)) { CreateComputationalOps(&result, *op, 1); } else if (IsScaleLossOp(*op)) { - // user can customize loss@grad if skip_scale_loss_ - if (!skip_scale_loss_) { + // user can customize loss@grad if not use_default_grad_scale_ + if (use_default_grad_scale_) { CreateScaleLossGradOp(&result); } is_forwarding = false; diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index fa4d31bdc49da5d30340a710c950dcc8cd70180b..bad47458ef4cd1cd42e902341e8be66da5c210ed 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -41,7 +41,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::string &loss_var_name, const std::unordered_set ¶ms, const std::vector &local_scopes, - bool skip_scale_loss); + bool use_default_grad_scale); #endif std::unique_ptr Build(const ProgramDesc &program) const override; @@ -59,7 +59,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { #ifdef PADDLE_WITH_CUDA platform::NCCLContextMap *nccl_ctxs_; #endif - bool skip_scale_loss_; + bool use_default_grad_scale_; bool IsScaleLossOp(const OpDesc &op) const; diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc index 97ab98f09b1a902a942d9667bc7716a28b98d54c..77e5ec4c7dd14b7ebb6d606b8c401ee714259d40 100644 --- a/paddle/fluid/framework/lod_tensor_test.cc +++ b/paddle/fluid/framework/lod_tensor_test.cc @@ -255,11 +255,11 @@ TEST(LoDTensor, RecordIO) { std::unique_ptr stream_ptr(stream); recordio::Scanner scanner(std::move(stream_ptr)); auto tensors = ReadFromRecordIO(&scanner, ctx); - ASSERT_EQ(tensors.size(), 2); + ASSERT_EQ(tensors.size(), static_cast(2)); assert_tensor_ok(tensors[0]); assert_tensor_ok(tensors[1]); tensors = ReadFromRecordIO(&scanner, ctx); - ASSERT_EQ(tensors.size(), 2); + ASSERT_EQ(tensors.size(), static_cast(2)); assert_tensor_ok(tensors[0]); assert_tensor_ok(tensors[1]); } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 32576423a62a1a12f085d565e7ff267145bf979c..d70f26026c28867e592a9f8e37cc53e6c1d6d85e 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -93,6 +93,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { RunImpl(scope, place); } +bool OperatorBase::HasInputs(const std::string& name) const { + if (inputs_.find(name) != inputs_.end()) { + return true; + } else { + return false; + } +} + std::string OperatorBase::Input(const std::string& name) const { auto& ins = Inputs(name); PADDLE_ENFORCE_LE(ins.size(), 1UL, @@ -109,6 +117,14 @@ const std::vector& OperatorBase::Inputs( return it->second; } +bool OperatorBase::HasOutputs(const std::string& name) const { + if (outputs_.find(name) != outputs_.end()) { + return true; + } else { + return false; + } +} + std::string OperatorBase::Output(const std::string& name) const { auto& outs = Outputs(name); PADDLE_ENFORCE_LE(outs.size(), 1UL, @@ -220,13 +236,18 @@ void OperatorBase::CheckAllInputOutputSet() const { if (op_info == nullptr || op_info->proto_ == nullptr) return; for (auto& in : op_info->Proto().inputs()) { - 
PADDLE_ENFORCE(inputs_.find(in.name()) != inputs_.end(), - "Type %s's input %s is not set", Type(), in.name()); + if (!in.dispensable()) { + PADDLE_ENFORCE(inputs_.find(in.name()) != inputs_.end(), + "Operator %s's input, %s, is not set", Type(), in.name()); + } } for (auto& out : op_info->Proto().outputs()) { - PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(), - "Type %s's output %s is not set", Type(), out.name()); + if (!out.dispensable()) { + PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(), + "Operator %s's output, %s, is not set", Type(), + out.name()); + } } } @@ -332,6 +353,9 @@ class RuntimeInferShapeContext : public InferShapeContext { : op_(op), scope_(scope) {} bool HasInput(const std::string& name) const override { + if (!op_.HasInputs(name)) { + return false; + } auto& ins = Inputs(name); size_t length = ins.size(); if (length == 0) { @@ -345,6 +369,9 @@ class RuntimeInferShapeContext : public InferShapeContext { } bool HasOutput(const std::string& name) const override { + if (!op_.HasOutputs(name)) { + return false; + } auto& outs = Outputs(name); size_t length = outs.size(); if (length == 0) { @@ -358,6 +385,9 @@ class RuntimeInferShapeContext : public InferShapeContext { } bool HasInputs(const std::string& name) const override { + if (!op_.HasInputs(name)) { + return false; + } auto inputs = op_.Inputs(name); if (inputs.empty()) { return false; @@ -371,6 +401,9 @@ class RuntimeInferShapeContext : public InferShapeContext { } bool HasOutputs(const std::string& name) const override { + if (!op_.HasOutputs(name)) { + return false; + } auto outputs = op_.Outputs(name); if (outputs.empty()) { return false; diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 826cc57b725ab4b52e5d67ab82e939cbd62a8460..d373c48b1a75c5f75c7520b56f230bc2c146b174 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -105,6 +105,7 @@ class OperatorBase { const VariableNameMap& Inputs() const { return inputs_; } const VariableNameMap& Outputs() const { return outputs_; } + bool HasInputs(const std::string& name) const; //! Get a input with argument's name described in `op_proto` std::string Input(const std::string& name) const; //! Get a input which has multiple variables. @@ -112,6 +113,7 @@ class OperatorBase { //! Get all inputs variable names std::vector InputVars() const; + bool HasOutputs(const std::string& name) const; //! Get a output with argument's name described in `op_proto` std::string Output(const std::string& name) const; //! Get an output which has multiple variables. 
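Note on the HasInputs/HasOutputs additions above: the new guards let RuntimeInferShapeContext report "not present" for dispensable slots instead of throwing on a missing map entry. A minimal, self-contained sketch of that guard pattern follows; the class and method names here are illustrative only, not Paddle's actual operator API.

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical stand-in for an operator holding name -> variable-list maps.
class FakeOp {
 public:
  // Mirrors the idea of OperatorBase::HasInputs(): a pure membership test.
  bool HasInputs(const std::string& name) const {
    return inputs_.find(name) != inputs_.end();
  }
  // Returns false instead of throwing when the slot was never set, which is
  // what allows dispensable inputs to be skipped safely.
  bool HasNonEmptyInput(const std::string& name) const {
    if (!HasInputs(name)) return false;
    const auto& vars = inputs_.at(name);
    return !vars.empty() && !vars.front().empty();
  }
  std::unordered_map<std::string, std::vector<std::string>> inputs_;
};

int main() {
  FakeOp op;
  op.inputs_["X"] = {"x0"};
  std::cout << op.HasNonEmptyInput("X") << " "       // prints 1
            << op.HasNonEmptyInput("Bias") << "\n";  // prints 0, no throw
}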
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index de644e851999920251c762a75c050e8182e950c6..4712efeff68cf98a50b610acf78bb52d7519c520 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -58,7 +58,7 @@ ParallelExecutor::ParallelExecutor( const std::unordered_set &bcast_vars, const ProgramDesc &main_program, const std::string &loss_var_name, Scope *scope, const std::vector &local_scopes, bool allow_op_delay, - bool customize_scale_loss) + bool use_default_grad_scale) : member_(new ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; @@ -93,11 +93,11 @@ ParallelExecutor::ParallelExecutor( #ifdef PADDLE_WITH_CUDA details::MultiDevSSAGraphBuilder builder( member_->places_, loss_var_name, params, member_->local_scopes_, - customize_scale_loss, member_->nccl_ctxs_.get()); + use_default_grad_scale, member_->nccl_ctxs_.get()); #else details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name, params, member_->local_scopes_, - customize_scale_loss); + use_default_grad_scale); #endif auto graph = builder.Build(main_program); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 49da123d98181c3d3abcdd64d14c5583142eba58..ecd107d81f8f5bf5d8b899d0c07797114a7ab767 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -40,7 +40,7 @@ class ParallelExecutor { const ProgramDesc& main_program, const std::string& loss_var_name, Scope* scope, const std::vector& local_scopes, - bool allow_op_delay, bool customize_scale_loss); + bool allow_op_delay, bool use_default_grad_scale); ~ParallelExecutor(); diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index 794e7f743413b068119afd5df232bfc2bb91a8c7..56cf6693caf4529d6e157e6e9a0d5c27d05ee0c3 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -120,11 +120,11 @@ bool SelectedRows::HasKey(int64_t key) const { : true; } -std::vector SelectedRows::Get(std::vector keys, - framework::Tensor* value) const { +std::vector> SelectedRows::Get( + std::vector keys, framework::Tensor* value) const { PADDLE_ENFORCE(value->IsInitialized(), "The value tensor should be initialized."); - std::vector non_keys; + std::vector> non_keys_pair; int64_t value_width = value_->numel() / value_->dims()[0]; PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0], "output tensor should have the same shape with table " @@ -133,7 +133,7 @@ std::vector SelectedRows::Get(std::vector keys, for (size_t i = 0; i < keys.size(); ++i) { int64_t index = Index(keys[i]); if (index == -1) { - non_keys.push_back(keys[i]); + non_keys_pair.push_back(std::make_pair(keys[i], static_cast(i))); } else { framework::VisitDataType( framework::ToDataType(value_->type()), @@ -141,7 +141,7 @@ std::vector SelectedRows::Get(std::vector keys, index * value_width, value_width)); } } - return non_keys; + return non_keys_pair; } bool SelectedRows::Set(int64_t key, const framework::Tensor& value) { diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index d6c9507b1681855e759a4b1b9d3dddf6fcb2fc13..c27c927ee751c4392840bfb71f4814991b23a8c9 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include +#include #include #include "paddle/fluid/framework/lod_tensor.h" @@ -78,10 +79,11 @@ class SelectedRows { /* * @brief Get value by the key list, if the * - * @return a list of keys which does not exists in table + * @return a list of pair which contains the non-exists key and the index in + * the value */ - std::vector Get(std::vector keys, - framework::Tensor* tensor) const; + std::vector> Get(std::vector keys, + framework::Tensor* value) const; /* * @brief Set a key-value pair into the table. diff --git a/paddle/fluid/framework/selected_rows_test.cc b/paddle/fluid/framework/selected_rows_test.cc index 39fe6d92940606084c28eec1a4d6486cb58844ce..eefcaa5672c5a3debf162f5c8eda653408dcf221 100644 --- a/paddle/fluid/framework/selected_rows_test.cc +++ b/paddle/fluid/framework/selected_rows_test.cc @@ -59,7 +59,7 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) { ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims()); } -TEST_F(SelectedRowsTester, Table) { +TEST_F(SelectedRowsTester, SparseTable) { platform::CPUPlace cpu; SelectedRows table; // initialize a sparse table @@ -87,11 +87,11 @@ TEST_F(SelectedRowsTester, Table) { framework::Tensor get_value; get_value.mutable_data(framework::make_ddim({2, 100}), cpu); std::vector keys({non_key, key}); - auto non_keys = table.Get(keys, &get_value); + auto non_key_pairs = table.Get(keys, &get_value); ASSERT_EQ(get_value.data()[100], static_cast(10)); - ASSERT_EQ(non_keys.size(), static_cast(1)); - ASSERT_EQ(non_keys[0], non_key); + ASSERT_EQ(non_key_pairs.size(), static_cast(1)); + ASSERT_EQ(non_key_pairs[0].first, non_key); } } // namespace framework diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 82d8c3df4ece7e56a72b650f8ea58f3953af3b64..d6d4c2f8a2ced08ed8481e92e131f6e2bed9ec05 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -65,7 +65,7 @@ class TensorRTEngine : public EngineBase { // Initialize the inference network, so that TensorRT layers can add to this // network. void InitNetwork() { - infer_builder_.reset(createInferBuilder(logger_)); + infer_builder_.reset(createInferBuilder(&logger_)); infer_network_.reset(infer_builder_->createNetwork()); } // After finishing adding ops, freeze this network and creates the executation diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h index 796283d325ceb84c733eff5c119b808300bca069..2b402cce60762d774cd7b371e448b2b88794b6a8 100644 --- a/paddle/fluid/inference/tensorrt/helper.h +++ b/paddle/fluid/inference/tensorrt/helper.h @@ -46,13 +46,13 @@ const int kDataTypeSize[] = { // The following two API are implemented in TensorRT's header file, cannot load // from the dynamic library. So create our own implementation and directly // trigger the method from the dynamic library. 
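Returning to the SelectedRows::Get change above: the method now reports misses as (key, position-in-query) pairs rather than bare keys, so the caller knows exactly which output rows still need filling. Below is a minimal sketch of that lookup contract with hypothetical names and a plain hash map standing in for the sparse table.

#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <utility>
#include <vector>

// Fill *values for the keys found in `table`; return (key, index) pairs for
// the keys that are absent, in query order.
std::vector<std::pair<int64_t, int64_t>> Lookup(
    const std::unordered_map<int64_t, int64_t>& table,
    const std::vector<int64_t>& keys, std::vector<int64_t>* values) {
  std::vector<std::pair<int64_t, int64_t>> missing;
  values->assign(keys.size(), 0);
  for (size_t i = 0; i < keys.size(); ++i) {
    auto it = table.find(keys[i]);
    if (it == table.end()) {
      missing.emplace_back(keys[i], static_cast<int64_t>(i));
    } else {
      (*values)[i] = it->second;
    }
  }
  return missing;
}

int main() {
  std::unordered_map<int64_t, int64_t> table{{10, 100}};
  std::vector<int64_t> values;
  auto missing = Lookup(table, {7, 10}, &values);
  std::cout << "missing key " << missing[0].first
            << " at query index " << missing[0].second << "\n";
}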
-static nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger& logger) { +static nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger* logger) { return static_cast( - dy::createInferBuilder_INTERNAL(&logger, NV_TENSORRT_VERSION)); + dy::createInferBuilder_INTERNAL(logger, NV_TENSORRT_VERSION)); } -static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger& logger) { +static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) { return static_cast( - dy::createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION)); + dy::createInferRuntime_INTERNAL(logger, NV_TENSORRT_VERSION)); } // A logger for create TensorRT infer builder. @@ -80,7 +80,7 @@ class NaiveLogger : public nvinfer1::ILogger { return *x; } - virtual ~NaiveLogger() override {} + ~NaiveLogger() override {} }; } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/test_tensorrt.cc b/paddle/fluid/inference/tensorrt/test_tensorrt.cc index aed5b5e1a22cbed1256d4f28d0a8a4c29c6cc744..a07537985738ab0ad4092b794f3b62ba53dfa866 100644 --- a/paddle/fluid/inference/tensorrt/test_tensorrt.cc +++ b/paddle/fluid/inference/tensorrt/test_tensorrt.cc @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include +#include #include #include #include "NvInfer.h" -#include "cuda.h" -#include "cuda_runtime_api.h" #include "paddle/fluid/platform/dynload/tensorrt.h" namespace dy = paddle::platform::dynload; @@ -43,7 +43,7 @@ class Logger : public nvinfer1::ILogger { class ScopedWeights { public: - ScopedWeights(float value) : value_(value) { + explicit ScopedWeights(float value) : value_(value) { w.type = nvinfer1::DataType::kFLOAT; w.values = &value_; w.count = 1; @@ -58,13 +58,13 @@ class ScopedWeights { // The following two API are implemented in TensorRT's header file, cannot load // from the dynamic library. So create our own implementation and directly // trigger the method from the dynamic library. -nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger& logger) { +nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger* logger) { return static_cast( - dy::createInferBuilder_INTERNAL(&logger, NV_TENSORRT_VERSION)); + dy::createInferBuilder_INTERNAL(logger, NV_TENSORRT_VERSION)); } -nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger& logger) { +nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) { return static_cast( - dy::createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION)); + dy::createInferRuntime_INTERNAL(logger, NV_TENSORRT_VERSION)); } const char* kInputTensor = "input"; @@ -74,7 +74,7 @@ const char* kOutputTensor = "output"; nvinfer1::IHostMemory* CreateNetwork() { Logger logger; // Create the engine. 
- nvinfer1::IBuilder* builder = createInferBuilder(logger); + nvinfer1::IBuilder* builder = createInferBuilder(&logger); ScopedWeights weights(2.); ScopedWeights bias(3.); @@ -103,9 +103,9 @@ nvinfer1::IHostMemory* CreateNetwork() { return model; } -void Execute(nvinfer1::IExecutionContext& context, const float* input, +void Execute(nvinfer1::IExecutionContext* context, const float* input, float* output) { - const nvinfer1::ICudaEngine& engine = context.getEngine(); + const nvinfer1::ICudaEngine& engine = context->getEngine(); // Two binds, input and output ASSERT_EQ(engine.getNbBindings(), 2); const int input_index = engine.getBindingIndex(kInputTensor); @@ -119,7 +119,7 @@ void Execute(nvinfer1::IExecutionContext& context, const float* input, // Copy the input to the GPU, execute the network, and copy the output back. ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], input, sizeof(float), cudaMemcpyHostToDevice, stream)); - context.enqueue(1, buffers, stream, nullptr); + context->enqueue(1, buffers, stream, nullptr); ASSERT_EQ(0, cudaMemcpyAsync(output, buffers[output_index], sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); @@ -136,7 +136,7 @@ TEST(TensorrtTest, BasicFunction) { // Use the model to create an engine and an execution context. Logger logger; - nvinfer1::IRuntime* runtime = createInferRuntime(logger); + nvinfer1::IRuntime* runtime = createInferRuntime(&logger); nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(model->data(), model->size(), nullptr); model->destroy(); @@ -145,7 +145,7 @@ TEST(TensorrtTest, BasicFunction) { // Execute the network. float input = 1234; float output; - Execute(*context, &input, &output); + Execute(context, &input, &output); EXPECT_EQ(output, input * 2 + 3); // Destroy the engine. diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/batch_norm_mkldnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0e4a56d4a45a732cfcf43b09228bc0c44df5924c --- /dev/null +++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc @@ -0,0 +1,325 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "mkldnn.hpp" +#include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/fluid/platform/mkldnn_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using paddle::platform::MKLDNNDeviceContext; +using paddle::platform::MKLDNNMemDesc; +using mkldnn::memory; + +template +using EigenArrayMap = + Eigen::Map>; +template +using ConstEigenArrayMap = + Eigen::Map>; +template +using EigenVectorArrayMap = Eigen::Map>; +template +using ConstEigenVectorArrayMap = + Eigen::Map>; + +namespace { +template +struct bn_type_traits { + using op_type = T; + using op_desc = typename op_type::desc; + using op_prim = typename op_type::primitive_desc; +}; + +template +void copy_to_weights(T scale_begin, T scale_end, T shift_begin, T shift_end, + Container *c) { + auto it = std::begin(*c); + + std::copy(scale_begin, scale_end, std::inserter(*c, it)); + std::copy( + shift_begin, shift_end, + std::inserter(*c, std::next(it, std::distance(scale_begin, scale_end)))); +} + +template +void run_batch_norm_op(Args &&... args) { + Op batch_norm_op{args...}; + + std::vector pipeline; + pipeline.push_back(batch_norm_op); + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); +} + +template +inline void *cast_const_to_void(const T *t) { + return static_cast(const_cast(t)); +} +} // namespace + +template +class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto data_layout_str = ctx.Attr("data_layout"); + auto data_layout = framework::StringToDataLayout(data_layout_str); + PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW, + "MKLDNN batch normalization handles only NCHW data layout"); + + const float epsilon = ctx.Attr("epsilon"); + const float momentum = ctx.Attr("momentum"); + const bool is_test = ctx.Attr("is_test"); + + const auto *x = ctx.Input("X"); + const auto *mean = ctx.Input("Mean"); + const auto *variance = ctx.Input("Variance"); + + auto &dev_ctx = ctx.template device_context(); + auto mkldnn_engine = dev_ctx.GetEngine(); + + auto *y = ctx.Output("Y"); + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + auto *batch_mean = ctx.Output("SavedMean"); + auto *batch_variance = ctx.Output("SavedVariance"); + + const auto *scale = ctx.Input("Scale"); + const auto *shift = ctx.Input("Bias"); + + y->mutable_data(ctx.GetPlace()); + mean_out->mutable_data(ctx.GetPlace()); + variance_out->mutable_data(ctx.GetPlace()); + + if (!is_test) { + batch_mean->mutable_data(ctx.GetPlace()); + batch_variance->mutable_data(ctx.GetPlace()); + } + + auto propagation = is_test == true ? 
mkldnn::prop_kind::forward_scoring + : mkldnn::prop_kind::forward_training; + + auto dims = paddle::framework::vectorize2int(x->dims()); + + auto src_md = + MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); + auto dst_md = + MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); + + auto src_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine}; + auto dst_pd = mkldnn::memory::primitive_desc{dst_md, mkldnn_engine}; + + auto src = mkldnn::memory{src_pd, cast_const_to_void(x->data())}; + auto dst = mkldnn::memory{dst_pd, y->data()}; + + unsigned flags = mkldnn::use_scale_shift; + if (is_test) flags |= mkldnn::use_global_stats; + + using bn_fwd_types = bn_type_traits; + auto batch_norm_fwd_desc = + bn_fwd_types::op_desc{propagation, src_md, epsilon, flags}; + auto batch_norm_fwd_pd = + bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine}; + + const unsigned int ic = dims[1]; + + // MKLDNN requires a single piece of memory for scale and shift/bias data + const size_t scaleshift_size = 2 * ic; + std::vector scaleshift_data; + scaleshift_data.reserve(scaleshift_size); + + copy_to_weights(scale->data(), scale->data() + ic, shift->data(), + shift->data() + ic, &scaleshift_data); + + auto scaleshift_memory = mkldnn::memory{ + batch_norm_fwd_pd.weights_primitive_desc(), scaleshift_data.data()}; + + if (is_test) { + auto mean_memory = mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(), + cast_const_to_void(mean->data())}; + + auto variance_memory = + mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(), + cast_const_to_void(variance->data())}; + + run_batch_norm_op( + batch_norm_fwd_pd, src, (const mkldnn::primitive::at &)mean_memory, + (const mkldnn::primitive::at &)variance_memory, scaleshift_memory, + dst); + } else { + auto mean_memory = + mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(), + cast_const_to_void(batch_mean->data())}; + + auto variance_memory = + mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(), + cast_const_to_void(batch_variance->data())}; + + run_batch_norm_op(batch_norm_fwd_pd, src, + scaleshift_memory, dst, + mean_memory, variance_memory); + } + + if (!is_test) { + const unsigned int in = dims[0]; + const unsigned int sample_size = x->numel() / in / ic; + + // saved_xx is use just in this batch of data + EigenVectorArrayMap saved_mean_e( + batch_mean->mutable_data(ctx.GetPlace()), ic); + EigenVectorArrayMap saved_variance_e( + batch_variance->mutable_data(ctx.GetPlace()), ic); + saved_mean_e.setZero(); + saved_variance_e.setZero(); + + const unsigned int x_arr_size = in * ic; + ConstEigenArrayMap x_arr(x->data(), sample_size, x_arr_size); + for (unsigned int nc = 0; nc < x_arr_size; ++nc) { + saved_mean_e(nc % ic) += x_arr.col(nc).sum(); + } + saved_mean_e /= in * sample_size; + for (unsigned int nc = 0; nc < x_arr_size; ++nc) { + saved_variance_e(nc % ic) += + (x_arr.col(nc) - saved_mean_e(nc % ic)).matrix().squaredNorm(); + } + saved_variance_e /= in * sample_size; + + ConstEigenVectorArrayMap mean_arr{mean->data(), ic}; + ConstEigenVectorArrayMap variance_arr{variance->data(), ic}; + + EigenVectorArrayMap running_mean_arr( + mean_out->mutable_data(ctx.GetPlace()), ic); + EigenVectorArrayMap running_var_arr( + variance_out->mutable_data(ctx.GetPlace()), ic); + + auto one_minus_momentum = 1. 
- momentum; + running_mean_arr = + mean_arr * momentum + saved_mean_e * one_minus_momentum; + running_var_arr = + variance_arr * momentum + saved_variance_e * one_minus_momentum; + } + } +}; + +template +class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext &ctx) const override { + auto data_layout_str = ctx.Attr("data_layout"); + auto data_layout = framework::StringToDataLayout(data_layout_str); + PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW, + "MKLDNN batch normalization handles only NCHW data layout"); + + auto &dev_ctx = ctx.template device_context(); + auto mkldnn_engine = dev_ctx.GetEngine(); + + const float epsilon = ctx.Attr("epsilon"); + + const auto *x = ctx.Input("X"); + const auto *scale = ctx.Input("Scale"); + const auto *shift = ctx.Input("Bias"); + const auto *batch_mean = ctx.Input("SavedMean"); + const auto *batch_variance = ctx.Input("SavedVariance"); + + const auto *diff_y = ctx.Input(framework::GradVarName("Y")); + auto *diff_x = ctx.Output(framework::GradVarName("X")); + auto *diff_scale = ctx.Output(framework::GradVarName("Scale")); + auto *diff_shift = ctx.Output(framework::GradVarName("Bias")); + + diff_x->mutable_data(ctx.GetPlace()); + diff_scale->mutable_data(ctx.GetPlace()); + diff_shift->mutable_data(ctx.GetPlace()); + + auto dims = paddle::framework::vectorize2int(x->dims()); + unsigned flags = mkldnn::use_scale_shift | !mkldnn::use_global_stats; + + auto src_md = + MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); + auto dst_md = + MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); + auto diff_src_md = + MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); + auto diff_dst_md = + MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); + + using bn_bwd_types = bn_type_traits; + using bn_fwd_types = bn_type_traits; + + auto batch_norm_fwd_desc = bn_fwd_types::op_desc{ + mkldnn::prop_kind::forward_training, src_md, epsilon, flags}; + auto batch_norm_fwd_pd = + bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine}; + + auto batch_norm_bwd_desc = bn_bwd_types::op_desc{ + mkldnn::prop_kind::backward, diff_dst_md, dst_md, epsilon, flags}; + auto batch_norm_bwd_pd = bn_bwd_types::op_prim{ + batch_norm_bwd_desc, mkldnn_engine, batch_norm_fwd_pd}; + + auto src = mkldnn::memory{{src_md, mkldnn_engine}, + cast_const_to_void(x->data())}; + + auto mean = mkldnn::memory{batch_norm_bwd_pd.mean_primitive_desc(), + cast_const_to_void(batch_mean->data())}; + + auto variance = + mkldnn::memory{batch_norm_bwd_pd.variance_primitive_desc(), + cast_const_to_void(batch_variance->data())}; + + auto diff_dst = mkldnn::memory{{diff_dst_md, mkldnn_engine}, + cast_const_to_void(diff_y->data())}; + + const unsigned int ic = dims[1]; + + const size_t scaleshift_size = 2 * ic; + + std::vector scaleshift_data; + scaleshift_data.reserve(scaleshift_size); + copy_to_weights(scale->data(), scale->data() + ic, shift->data(), + shift->data() + ic, &scaleshift_data); + + auto scaleshift_memory = mkldnn::memory{ + batch_norm_bwd_pd.weights_primitive_desc(), scaleshift_data.data()}; + + std::vector diff_scaleshift_data; + diff_scaleshift_data.reserve(scaleshift_size); + copy_to_weights(diff_scale->data(), diff_scale->data() + ic, + diff_shift->data(), diff_shift->data() + ic, + &diff_scaleshift_data); + + auto diff_scaleshift_memory = + mkldnn::memory{batch_norm_bwd_pd.diff_weights_primitive_desc(), + diff_scaleshift_data.data()}; + + auto 
diff_src = mkldnn::memory{{diff_src_md, mkldnn_engine}, + static_cast(diff_x->data())}; + + run_batch_norm_op( + batch_norm_bwd_pd, src, mean, variance, diff_dst, scaleshift_memory, + diff_src, diff_scaleshift_memory); + + auto it = std::begin(diff_scaleshift_data); + std::copy(it, std::next(it, ic), diff_scale->data()); + std::copy(std::next(it, ic), std::end(diff_scaleshift_data), + diff_shift->data()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_KERNEL(batch_norm, MKLDNN, paddle::platform::CPUPlace, + ops::BatchNormMKLDNNOpKernel); +REGISTER_OP_KERNEL(batch_norm_grad, MKLDNN, paddle::platform::CPUPlace, + ops::BatchNormMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index c9939e8602ed341d37784ca292a55326899e8e65..b4bd40d0311bf10ec1fddabab2ee131fe02baf52 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -15,6 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/batch_norm_op.h" #include #include "paddle/fluid/framework/data_layout.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif namespace paddle { namespace operators { @@ -87,9 +90,13 @@ class BatchNormOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { auto input_data_type = framework::ToDataType(ctx.Input("X")->type()); - // For float or float16 input tensor, the type of the scale, bias, mean, - // and var tensors should both be float. + // By default, the type of the scale, bias, mean, + // and var tensors should both be float. (For float or float16 input tensor) + // or double (For double input tensor). auto bn_param_type = framework::proto::VarType::FP32; + if (input_data_type == framework::proto::VarType::FP64) { + bn_param_type = framework::proto::VarType::FP64; + } PADDLE_ENFORCE_EQ(bn_param_type, framework::ToDataType(ctx.Input("Scale")->type()), "Scale input should be of float type"); @@ -102,7 +109,18 @@ class BatchNormOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(bn_param_type, framework::ToDataType( ctx.Input("Variance")->type()), "Variance input should be of float type"); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); + + framework::LibraryType library_{framework::LibraryType::kPlain}; +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + } +#endif + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, + library_); } }; @@ -147,6 +165,9 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { "Variance of the current mini batch, " "will apply to output when training") .AsIntermediate(); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddComment(R"DOC( Batch Normalization. 
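Note on the MKL-DNN batch-norm kernels above: the library consumes scale and shift as a single contiguous buffer of length 2*C, which is what copy_to_weights() prepares. The following is a minimal host-only sketch of that packing, with plain std::vector inputs standing in for the scale and bias tensors; it is an illustration of the layout, not the actual kernel code.

#include <iostream>
#include <vector>

// Pack per-channel scale followed by per-channel shift into one buffer.
std::vector<float> PackScaleShift(const std::vector<float>& scale,
                                  const std::vector<float>& shift) {
  std::vector<float> packed;
  packed.reserve(scale.size() + shift.size());
  packed.insert(packed.end(), scale.begin(), scale.end());  // first C entries
  packed.insert(packed.end(), shift.begin(), shift.end());  // next C entries
  return packed;
}

int main() {
  auto packed = PackScaleShift({1.f, 2.f}, {0.1f, 0.2f});
  for (float v : packed) std::cout << v << " ";  // 1 2 0.1 0.2
  std::cout << "\n";
}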
@@ -345,8 +366,19 @@ class BatchNormGradOp : public framework::OperatorWithKernel { if (t == nullptr) { PADDLE_THROW("can't find Y@GRAD"); } - return framework::OpKernelType(framework::ToDataType(t->type()), - ctx.GetPlace()); + + framework::LibraryType library_{framework::LibraryType::kPlain}; +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + } +#endif + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + layout, library_); } }; @@ -470,6 +502,7 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker { op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); op->SetInput("Scale", Input("Scale")); + op->SetInput("Bias", Input("Bias")); op->SetInput("SavedMean", Output("SavedMean")); op->SetInput("SavedVariance", Output("SavedVariance")); @@ -492,8 +525,9 @@ REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp); REGISTER_OP_CPU_KERNEL( - batch_norm, - ops::BatchNormKernel); + batch_norm, ops::BatchNormKernel, + ops::BatchNormKernel); REGISTER_OP_CPU_KERNEL( batch_norm_grad, - ops::BatchNormGradKernel); + ops::BatchNormGradKernel, + ops::BatchNormGradKernel); diff --git a/paddle/fluid/operators/batch_norm_op.cu.cc b/paddle/fluid/operators/batch_norm_op.cu.cc index cb1927bc0f2eb735f0a3184df5f0f8fada2f9dca..550dd32d36767f90e880415bfffaf01aeb623609 100644 --- a/paddle/fluid/operators/batch_norm_op.cu.cc +++ b/paddle/fluid/operators/batch_norm_op.cu.cc @@ -287,6 +287,8 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( batch_norm, ops::BatchNormKernel, + ops::BatchNormKernel, ops::BatchNormKernel); REGISTER_OP_CUDA_KERNEL( - batch_norm_grad, ops::BatchNormGradKernel); + batch_norm_grad, ops::BatchNormGradKernel, + ops::BatchNormGradKernel); diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index 0e0622e290f42811c83c354d749ef32a2d9dcadb..2b2a9dc8319f964875371214168ce04cb67fc818 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -164,11 +164,13 @@ or not. But the output only shares the LoD information with input X. } // namespace paddle namespace ops = paddle::operators; +using CPUCtx = paddle::platform::CPUDeviceContext; + REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp); -REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel, - ops::CrossEntropyOpKernel); +REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel, + ops::CrossEntropyOpKernel); REGISTER_OP_CPU_KERNEL(cross_entropy_grad, - ops::CrossEntropyGradientOpKernel, - ops::CrossEntropyGradientOpKernel); + ops::CrossEntropyGradientOpKernel, + ops::CrossEntropyGradientOpKernel); diff --git a/paddle/fluid/operators/cross_entropy_op.cu b/paddle/fluid/operators/cross_entropy_op.cu index 6449149d4b55962e84baafffc0c2c03f8108866f..30dbd5bd3d39dd2992c3dd91364003bb7715a2eb 100644 --- a/paddle/fluid/operators/cross_entropy_op.cu +++ b/paddle/fluid/operators/cross_entropy_op.cu @@ -14,98 +14,11 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/cross_entropy_op.h" -namespace paddle { -namespace operators { - -namespace { - -template -__global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X, - const int64_t* label, const int N, - const int D) { - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; - i += blockDim.x * gridDim.x) { - int idx = i * D + label[i]; - dX[idx] = -dY[i] / X[idx]; - } -} - -template -__global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X, - const T* label, const int N, - const int D) { - int ids = blockIdx.x * blockDim.x + threadIdx.x; - if (ids < N * D) { - int row_ids = ids / D; - dX[ids] = -label[ids] * dY[row_ids] / X[ids]; - } -} -} // namespace - -template -class CrossEntropyOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "This kernel only runs on GPU device."); - const Tensor* x = ctx.Input("X"); - const Tensor* label = ctx.Input("Label"); - Tensor* y = ctx.Output("Y"); - y->mutable_data(ctx.GetPlace()); - - math::CrossEntropyFunctor()( - ctx.template device_context(), y, x, label, - ctx.Attr("soft_label")); - } -}; - -template -class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "This kernel only runs on GPU device."); - - const Tensor* x = ctx.Input("X"); - const Tensor* label = ctx.Input("Label"); - Tensor* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - - const T* dy_data = - ctx.Input(framework::GradVarName("Y"))->data(); - T* dx_data = dx->mutable_data(ctx.GetPlace()); - const T* x_data = x->data(); - - int64_t batch_size = x->dims()[0]; - int64_t class_num = x->dims()[1]; - - int block = 512; - int grid = (batch_size * class_num + block - 1) / block; - - auto& dev_ctx = ctx.template device_context(); - auto stream = dev_ctx.stream(); - - if (ctx.Attr("soft_label")) { - auto* label_data = label->data(); - SoftCrossEntropyGradientKernel<<>>( - dx_data, dy_data, x_data, label_data, batch_size, class_num); - } else { - math::SetConstant functor; - functor(dev_ctx, dx, 0); - auto* label_data = label->data(); - grid = (batch_size + block - 1) / block; - CrossEntropyGradientKernel<<>>( - dx_data, dy_data, x_data, label_data, batch_size, class_num); - } - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel, - ops::CrossEntropyOpCUDAKernel); +using CUDACtx = paddle::platform::CUDADeviceContext; +REGISTER_OP_CUDA_KERNEL(cross_entropy, + ops::CrossEntropyOpKernel, + ops::CrossEntropyOpKernel); REGISTER_OP_CUDA_KERNEL(cross_entropy_grad, - ops::CrossEntropyGradientOpCUDAKernel, - ops::CrossEntropyGradientOpCUDAKernel); + ops::CrossEntropyGradientOpKernel, + ops::CrossEntropyGradientOpKernel); diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h index 6da3a24dc89a85fe432b6350d3af7b0e84337c9d..19a2aec92b267ece94685ce34604b7d1cfa5d209 100644 --- a/paddle/fluid/operators/cross_entropy_op.h +++ b/paddle/fluid/operators/cross_entropy_op.h @@ -17,69 +17,106 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/for_range.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; -template -using EigenMatrix = framework::EigenMatrix; -template +template class CrossEntropyOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), - "This kernel only runs on CPU."); - const Tensor* x = ctx.Input("X"); - const Tensor* labels = ctx.Input("Label"); - Tensor* y = ctx.Output("Y"); + auto* x = ctx.Input("X"); + auto* labels = ctx.Input("Label"); + auto* y = ctx.Output("Y"); y->mutable_data(ctx.GetPlace()); - math::CrossEntropyFunctor()( - ctx.template device_context(), y, x, labels, + math::CrossEntropyFunctor()( + ctx.template device_context(), y, x, labels, ctx.Attr("soft_label")); } }; template +class XeSoftlabelGradFunctor { + public: + XeSoftlabelGradFunctor(T* dx, + const T* dy, // NOLINT + const T* x, // NOLINT + const T* label, // NOLINT + size_t num_classes) + : dx_(dx), dy_(dy), x_(x), label_(label), num_classes_(num_classes) {} + + HOSTDEVICE void operator()(size_t i) { + auto row_ids = i / num_classes_; + dx_[i] = -label_[i] * dy_[row_ids] / x_[i]; + } + + private: + T* dx_; + const T* dy_; + const T* x_; + const T* label_; + size_t num_classes_; +}; + +template +class XeGradFunctor { + public: + XeGradFunctor(T* dx, + const T* dy, // NOLINT + const T* x, // NOLINT + const int64_t* label, // NOLINT + size_t num_classes) + : dx_(dx), dy_(dy), x_(x), label_(label), num_classes_(num_classes) {} + + HOSTDEVICE void operator()(size_t sample_id) { + auto x_is_true_offset = sample_id * num_classes_ + label_[sample_id]; + for (size_t x_offset = sample_id * num_classes_; + x_offset < (sample_id + 1) * num_classes_; ++x_offset) { + dx_[x_offset] = x_offset != x_is_true_offset + ? 
static_cast(0) + : -dy_[sample_id] / x_[x_offset]; + } + } + + private: + T* dx_; + const T* dy_; + const T* x_; + const int64_t* label_; + size_t num_classes_; +}; + +template class CrossEntropyGradientOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), - "This kernel only runs on CPU."); - const Tensor* x = ctx.Input("X"); - const Tensor* dy = ctx.Input(framework::GradVarName("Y")); - const Tensor* label = ctx.Input("Label"); - Tensor* dx = ctx.Output(framework::GradVarName("X")); - T* dx_data = dx->mutable_data(ctx.GetPlace()); + auto* x = ctx.Input("X"); + auto* dy = ctx.Input(framework::GradVarName("Y")); + auto* label = ctx.Input("Label"); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dx_data = dx->mutable_data(ctx.GetPlace()); int64_t class_num = x->dims()[1]; if (ctx.Attr("soft_label")) { - auto x_mat = EigenMatrix::From(*x); - auto dy_mat = EigenMatrix::From(*dy); - auto lbl_mat = EigenMatrix::From(*label); - auto dx_mat = EigenMatrix::From(*dx); - - dx_mat.device(*ctx.template device_context() - .eigen_device()) = - -(lbl_mat * - dy_mat.broadcast(Eigen::DSizes(1, class_num)) / x_mat); + XeSoftlabelGradFunctor functor(dx_data, dy->data(), x->data(), + label->data(), + static_cast(class_num)); + platform::ForRange for_range( + ctx.template device_context(), + static_cast(dx->numel())); + for_range(functor); } else { - int64_t batch_size = x->dims()[0]; - const T* dy_data = dy->data(); - const T* x_data = x->data(); - const int64_t* label_data = label->data(); - - math::SetConstant functor; - functor(ctx.template device_context(), dx, 0); - - for (int64_t i = 0; i < batch_size; ++i) { - PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num); - int64_t index = i * class_num + label_data[i]; - dx_data[index] = math::TolerableValue()(-dy_data[i] / x_data[index]); - } + XeGradFunctor functor(dx_data, dy->data(), x->data(), + label->data(), + static_cast(class_num)); + platform::ForRange for_range( + ctx.template device_context(), + static_cast(dy->numel())); + for_range(functor); } } }; diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index 95f4738b4ff50852d9591719133ca650533bf848..e6ee28ea8d920ef80fead258a9bd0d5f6762c879 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -82,7 +82,9 @@ class RequestSend final : public RequestBase { virtual std::string GetReqName() { return request_->Varname(); } virtual void Process() { - queue_->Push(std::make_pair(request_->Varname(), request_)); + std::string var_name = GetReqName(); + VLOG(3) << "RequestSend " << var_name; + queue_->Push(std::make_pair(var_name, request_)); sendrecv::VoidMessage reply; responder_.Finish(reply, ::grpc::Status::OK, this); @@ -106,7 +108,7 @@ class RequestGet final : public RequestBase { responder_(&ctx_), scope_(scope), queue_(queue) { - int method_id = static_cast(detail::GrpcMethod::kGetVariable); + auto method_id = static_cast(detail::GrpcMethod::kGetVariable); service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_, cq_, this); } @@ -118,6 +120,7 @@ class RequestGet final : public RequestBase { virtual void Process() { // proc request. 
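Note on the cross-entropy rewrite above: expressing the gradient as a per-sample functor applied through ForRange is what lets one kernel body serve both CPU and GPU. A CPU-only sketch of that pattern follows; the helper and struct names are made up, and the hard-label gradient shown matches the XeGradFunctor logic above under the assumption of float data.

#include <cstddef>
#include <iostream>
#include <vector>

// On CPU, "for_range" is just a serial loop over indices; on GPU the same
// functor could be invoked from a kernel over the same index range.
template <typename Functor>
void CpuForRange(size_t n, Functor func) {
  for (size_t i = 0; i < n; ++i) func(i);
}

struct HardLabelGrad {
  float* dx;
  const float* dy;
  const float* x;
  const int* label;
  size_t num_classes;
  void operator()(size_t sample) const {
    size_t true_offset = sample * num_classes + label[sample];
    for (size_t o = sample * num_classes; o < (sample + 1) * num_classes; ++o)
      dx[o] = (o == true_offset) ? -dy[sample] / x[o] : 0.f;
  }
};

int main() {
  std::vector<float> x{0.2f, 0.8f}, dy{1.f}, dx(2);
  std::vector<int> label{1};
  CpuForRange(1, HardLabelGrad{dx.data(), dy.data(), x.data(), label.data(), 2});
  std::cout << dx[0] << " " << dx[1] << "\n";  // 0 -1.25
}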
std::string var_name = request_.varname(); + VLOG(3) << "RequestGet " << var_name; auto* var = scope_->FindVar(var_name); ::grpc::ByteBuffer reply; @@ -176,7 +179,7 @@ class RequestPrefetch final : public RequestBase { ::grpc::ByteBuffer reply; std::string var_name = request_->OutVarname(); - VLOG(3) << "prefetch var " << var_name; + VLOG(3) << "RequestPrefetch " << var_name; auto var_desc = program_->Block(0).FindVar(var_name); framework::Scope* local_scope = &scope_->NewScope(); auto* var = local_scope->FindVar(var_name); @@ -208,6 +211,11 @@ void AsyncGRPCServer::WaitClientGet(int count) { } } +void AsyncGRPCServer::WaitServerReady() { + std::unique_lock lock(this->mutex_ready_); + condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); +} + void AsyncGRPCServer::RunSyncUpdate() { ::grpc::ServerBuilder builder; builder.AddListeningPort(address_, ::grpc::InsecureServerCredentials(), @@ -241,6 +249,12 @@ void AsyncGRPCServer::RunSyncUpdate() { t_prefetch_.reset(new std::thread( std::bind(&AsyncGRPCServer::HandleRequest, this, cq_prefetch_.get(), "cq_prefetch", prefetch_register))); + + { + std::lock_guard lock(this->mutex_ready_); + ready_ = 1; + } + condition_ready_.notify_all(); // wait server server_->Wait(); t_send_->join(); @@ -307,18 +321,20 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq, bool ok = false; while (true) { - VLOG(3) << "HandleRequest for " << cq_name << " while in"; + VLOG(3) << "HandleRequest for " << cq_name << " wait Next"; if (!cq->Next(&tag, &ok)) { LOG(INFO) << cq_name << " CompletionQueue shutdown!"; break; } - VLOG(3) << "HandleRequest for " << cq_name << " while after Next"; + VLOG(3) << "HandleRequest for " << cq_name << " get Next"; PADDLE_ENFORCE(tag); + if (sync_mode_) { // FIXME(typhoonzero): de-couple the barriers with recv_op if (!is_shut_down_ && cq_name == "cq_get") WaitCond(1); if (!is_shut_down_ && cq_name == "cq_send") WaitCond(0); + VLOG(3) << "HandleRequest for " << cq_name << " after WaitCond"; } RequestBase* base = reinterpret_cast(tag); @@ -336,9 +352,9 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq, switch (base->Status()) { case PROCESS: { - VLOG(4) << cq_name << " PROCESS status:" << base->Status(); TryToRegisterNewOne(); base->Process(); + VLOG(4) << cq_name << " PROCESS status:" << base->Status(); break; } case FINISH: { diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h index 99b87b8c6cb3e597778b88c395e4abf400d82c39..7f9cae21ccca8dd51f9fbe98148d01a51ac6eb84 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -45,8 +45,9 @@ class RequestBase; class AsyncGRPCServer final { public: explicit AsyncGRPCServer(const std::string &address, bool sync_mode) - : address_(address), sync_mode_(sync_mode) {} + : address_(address), sync_mode_(sync_mode), ready_(0) {} + void WaitServerReady(); void RunSyncUpdate(); // functions to sync server barrier status. 
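Note on the readiness handshake added to AsyncGRPCServer above: RunSyncUpdate() sets a flag under a mutex and notifies, while WaitServerReady() blocks on a condition variable until the flag is set. A minimal standalone sketch of that handshake follows, using plain std::thread in place of the real server thread; names mirror the diff but this is not the actual server code.

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

std::mutex mutex_ready;
std::condition_variable condition_ready;
int ready = 0;

// Blocks until the server thread has flipped `ready`.
void WaitServerReady() {
  std::unique_lock<std::mutex> lock(mutex_ready);
  condition_ready.wait(lock, [] { return ready == 1; });
}

int main() {
  std::thread server([] {
    // ... build and start the RPC service here ...
    {
      std::lock_guard<std::mutex> lock(mutex_ready);
      ready = 1;
    }
    condition_ready.notify_all();
  });
  WaitServerReady();
  std::cout << "server is ready\n";
  server.join();
}

The predicate form of wait() also handles spurious wakeups, which is why the flag is kept alongside the condition variable rather than relying on notify alone.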
@@ -118,6 +119,10 @@ class AsyncGRPCServer final { framework::ProgramDesc *program_; framework::Executor *executor_; int selected_port_; + + std::mutex mutex_ready_; + std::condition_variable condition_ready_; + int ready_; }; }; // namespace detail diff --git a/paddle/fluid/operators/detail/serde_test.cc b/paddle/fluid/operators/detail/serde_test.cc index 221d2f4c5b30aef022a5d6b54cd657d1dec1f5a2..e9eaaf1cbcd07ed1c8d6fb0b025bc1f1500718fd 100644 --- a/paddle/fluid/operators/detail/serde_test.cc +++ b/paddle/fluid/operators/detail/serde_test.cc @@ -108,7 +108,7 @@ void RunSerdeTestSelectedRows(platform::Place place) { EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); } for (size_t i = 0; i < rows2->size(); ++i) { - EXPECT_EQ(rows_data2[i], i); + EXPECT_EQ(rows_data2[i], static_cast(i)); } EXPECT_EQ(slr2->height(), 1000); } diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h index 953aedc85064ee803ab02afd427a5a6f22096f94..8b052611f80ddf874ca48c1c58e13346528a834e 100644 --- a/paddle/fluid/operators/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise_op_function.h @@ -22,6 +22,7 @@ limitations under the License. */ #ifdef __NVCC__ #include #include +#include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/cuda_primitives.h" constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; #endif @@ -336,43 +337,6 @@ static void ElemwiseGradBroadcast1CPU(const T* x, const T* y, const T* out, } #ifdef __NVCC__ - -template -__device__ T reduceSum(T val, int tid, int len) { - // NOTE(zcd): The warp size should be taken from the - // parameters of the GPU but not specified as 32 simply. - // To make the reduceSum more efficiently, - // I use Warp-Level Parallelism and assume the Warp size - // is 32 which may be different for different GPU, - // but most card's warp size is 32. - const int warpSize = 32; - __shared__ T shm[warpSize]; - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, tid < len); - - for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += platform::__shfl_down_sync(mask, val, offset); - - if (tid < warpSize) shm[tid] = 0; - - __syncthreads(); - - if (tid % warpSize == 0) { - shm[tid / warpSize] = val; - } - __syncthreads(); - - CREATE_SHFL_MASK(mask, tid < warpSize); - - if (tid < warpSize) { - val = shm[tid]; - for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += platform::__shfl_down_sync(mask, val, offset); - } - - return val; -} - template static __global__ void ElemwiseGradBroadcast1CUDAKernel( const T* x, const T* y, const T* out, const T* dout, int h, int w, @@ -395,7 +359,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel( if (dy) { h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; - val = reduceSum(val, tid, h); + val = paddle::platform::reduceSum(val, tid, h); if (threadIdx.x == 0) { dy[j] = val; } @@ -472,7 +436,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel( if (dy) { int h = pre * post; h = h > ELEMWISE_MAX_BLOCK_DIM ? 
ELEMWISE_MAX_BLOCK_DIM : h; - val = reduceSum(val, tid, h); + val = paddle::platform::reduceSum(val, tid, h); if (threadIdx.x == 0) { dy[j] = val; } diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 57cff680ab89f2df7e71af4056ee06cdf330bbab..59b94511552874e1557d8a9d7a687af14f96c31c 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -45,20 +45,6 @@ static void split(const std::string &str, char sep, } } -static void AsyncExecuteBlock(framework::Executor *executor, - framework::ExecutorPrepareContext *prepared, - framework::Scope *scope) { - std::future future = framework::Async([&executor, &prepared, &scope]() { - try { - executor->RunPreparedContext(prepared, scope, false, false); - } catch (std::exception &e) { - LOG(ERROR) << "run sub program error " << e.what(); - } - }); - // TODO(qiao) maybe we can remove this - future.wait(); -} - static void ParallelExecuteBlocks( const std::vector ¶llel_blkids, framework::Executor *executor, const std::vector> @@ -80,12 +66,7 @@ static void ParallelExecuteBlocks( for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); } -static void SavePort(std::shared_ptr rpc_service) { - std::ofstream port_file; - port_file.open("/tmp/paddle.selected_port"); - port_file << rpc_service->GetSelectedPort(); - port_file.close(); -} +std::atomic_int ListenAndServOp::selected_port_{0}; ListenAndServOp::ListenAndServOp(const std::string &type, const framework::VariableNameMap &inputs, @@ -93,15 +74,27 @@ ListenAndServOp::ListenAndServOp(const std::string &type, const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} -int ListenAndServOp::GetSelectedPort() const { - return rpc_service_->GetSelectedPort(); -} - void ListenAndServOp::Stop() { rpc_service_->Push(LISTEN_TERMINATE_MESSAGE); server_thread_->join(); } +void ListenAndServOp::SavePort(const std::string &file_path) const { + // NOTE: default write file to /tmp/paddle.selected_port + selected_port_ = rpc_service_->GetSelectedPort(); + + std::ofstream port_file; + port_file.open(file_path); + port_file << selected_port_.load(); + port_file.close(); + VLOG(4) << "selected port written to " << file_path; +} + +void ListenAndServOp::WaitServerReady() { + while (selected_port_.load() == 0) { + } +} + void ListenAndServOp::RunSyncLoop(framework::Executor *executor, framework::ProgramDesc *program, framework::Scope *recv_scope, @@ -201,14 +194,40 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor, } // while(true) } +static void AsyncUpdateThread( + const std::string &var_name, const bool &exit_flag, + const std::shared_ptr &queue, + framework::Executor *executor, + framework::ExecutorPrepareContext *prepared) { + VLOG(3) << "update thread for " << var_name << " started"; + while (!exit_flag) { + const detail::ReceivedMessage v = queue->Pop(); + auto recv_var_name = v.first; + auto var = v.second->GetVar(); + if (var == nullptr) { + LOG(ERROR) << "Can not find server side var: " << recv_var_name; + PADDLE_THROW("Can not find server side var"); + } + auto fs = framework::Async([var_name, &executor, &v, prepared] { + try { + executor->RunPreparedContext(prepared, v.second->GetMutableLocalScope(), + false, false); + } catch (std::exception &e) { + LOG(ERROR) << "run sub program error " << e.what(); + } + }); + fs.wait(); + } +} + void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, - framework::ProgramDesc *program, - framework::Scope *recv_scope, - 
framework::BlockDesc *prefetch_block) const { + framework::ProgramDesc *program) const { VLOG(3) << "RunAsyncLoop in"; // grad name to block id std::unordered_map grad_to_block_id; std::unordered_map id_to_grad; + std::unordered_map> + grad_to_queue; auto grad_to_block_id_str = Attr>("grad_to_block_id"); @@ -220,6 +239,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, PADDLE_ENFORCE_EQ(grad_to_block_id.count(pieces[0]), 0); int block_id = std::stoi(pieces[1]); grad_to_block_id[pieces[0]] = block_id; + grad_to_queue[pieces[0]] = std::make_shared(); id_to_grad[block_id] = pieces[0]; } size_t num_blocks = program->Size(); @@ -238,8 +258,21 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, grad_to_prepared_ctx[id_to_grad[block_list[i]]] = optimize_prepared[i]; } - VLOG(3) << "RunAsyncLoop into while"; bool exit_flag = false; + + VLOG(3) << "start async optimize threads"; + std::vector> fs; + for (auto iter = grad_to_queue.begin(); iter != grad_to_queue.end(); iter++) { + std::string grad_name = iter->first; + VLOG(3) << "create async update thread for " << grad_name; + fs.push_back(framework::AsyncIO([grad_name, &exit_flag, &executor, + &grad_to_queue, &grad_to_prepared_ctx]() { + AsyncUpdateThread(grad_name, exit_flag, grad_to_queue[grad_name], + executor, grad_to_prepared_ctx[grad_name].get()); + })); + } + + VLOG(3) << "RunAsyncLoop into while"; while (!exit_flag) { const detail::ReceivedMessage v = rpc_service_->Get(); auto recv_var_name = v.first; @@ -249,13 +282,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, break; } else { VLOG(3) << "received grad: " << recv_var_name; - auto var = v.second->GetVar(); - if (var == nullptr) { - LOG(ERROR) << "Can not find server side var: " << recv_var_name; - PADDLE_THROW("Can not find server side var"); - } - AsyncExecuteBlock(executor, grad_to_prepared_ctx[recv_var_name].get(), - v.second->GetMutableLocalScope()); + grad_to_queue[recv_var_name]->Push(v); } if (exit_flag) { @@ -298,13 +325,17 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, // start the server listening after all member initialized. server_thread_.reset(new std::thread(RunServer, rpc_service_)); VLOG(3) << "wait server thread to become ready..."; - sleep(5); + rpc_service_->WaitServerReady(); + // Write to a file of server selected port for python use. - SavePort(rpc_service_); + std::string file_path = + string::Sprintf("/tmp/paddle.%d.selected_port", + static_cast(::getpid())); + SavePort(file_path); if (sync_mode) { RunSyncLoop(&executor, program, &recv_scope, prefetch_block); } else { - RunAsyncLoop(&executor, program, &recv_scope, prefetch_block); + RunAsyncLoop(&executor, program); } } diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h index 3cc0f3047733bea94daa310cd39cb0a4f44bef85..f52a55c5c2d6902df6cb7e0a0d7242c6e86dc786 100644 --- a/paddle/fluid/operators/listen_and_serv_op.h +++ b/paddle/fluid/operators/listen_and_serv_op.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include +#include #include #include @@ -39,26 +40,33 @@ class ListenAndServOp : public framework::OperatorBase { const framework::VariableNameMap& outputs, const framework::AttributeMap& attrs); - int GetSelectedPort() const; - void RunSyncLoop(framework::Executor* executor, framework::ProgramDesc* program, framework::Scope* recv_scope, framework::BlockDesc* prefetch_block) const; void RunAsyncLoop(framework::Executor* executor, - framework::ProgramDesc* program, - framework::Scope* recv_scope, - framework::BlockDesc* prefetch_block) const; + framework::ProgramDesc* program) const; + + void SavePort( + const std::string& file_path = "/tmp/paddle.selected_port") const; + + void WaitServerReady(); + + int GetSelectedPort() { return selected_port_; } void Stop() override; void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override; + static void ResetPort() { selected_port_ = 0; } + protected: mutable std::shared_ptr rpc_service_; mutable std::shared_ptr server_thread_; + // FIXME(wuyi): it's static so that the operator can be cloned. + static std::atomic_int selected_port_; }; } // namespace operators diff --git a/paddle/fluid/operators/lookup_sparse_table_op.cc b/paddle/fluid/operators/lookup_sparse_table_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f1839e456d66ab95fb4ccac933cf7b635c54f5a0 --- /dev/null +++ b/paddle/fluid/operators/lookup_sparse_table_op.cc @@ -0,0 +1,165 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +constexpr int64_t kNoPadding = -1; + +class LookupSparseTableInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of LookupSparseTableOp should not be null."); + auto shape_w = ctx->GetInputDim("W"); + auto shape_ids = ctx->GetInputDim("Ids"); + shape_w[0] = shape_ids.size(); + ctx->SetOutputDim("Out", shape_w); + } +}; + +class LookupSparseTableOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto out_var = scope.FindVar(Output("Out")); + auto w_var = scope.FindVar(Input("W")); + auto ids_var = scope.FindVar(Input("Ids")); + unsigned int seed = static_cast(Attr("seed")); + float min = Attr("min"); + float max = Attr("max"); + bool auto_grown_table = Attr("auto_grown_table"); + + PADDLE_ENFORCE(out_var->IsType(), + "The type of Out var should be LodTensor."); + PADDLE_ENFORCE(w_var->IsType(), + "The type of W var should be SelectedRows."); + PADDLE_ENFORCE(ids_var->IsType(), + "The type of Ids var should be LoDTensor."); + auto &ids_t = ids_var->Get(); + auto out_t = out_var->GetMutable(); + auto w_t = w_var->GetMutable(); + std::vector keys; + keys.resize(ids_t.numel()); + for (size_t i = 0; i < ids_t.numel(); ++i) { + keys[i] = ids_t.data()[i]; + } + + // TODO(Yancey1989): support CUDA Place for the sparse table + platform::CPUPlace cpu; + auto out_shape = w_t->value().dims(); + out_shape[0] = keys.size(); + out_t->Resize(out_shape); + out_t->mutable_data(cpu, w_t->value().type()); + PADDLE_ENFORCE_EQ(framework::ToDataType(w_t->value().type()), + framework::proto::VarType::FP32, + "The sparse table only support FP32"); + auto non_keys_pair = w_t->Get(keys, out_t); + if (!auto_grown_table) { + PADDLE_ENFORCE_EQ(non_keys_pair.size(), static_cast(0), + "there is some keys does exists in the sparse table."); + } + auto value_shape = w_t->value().dims(); + value_shape[0] = 1; + for (const auto &it : non_keys_pair) { + const auto key = it.first; + const auto index = it.second; + framework::Tensor value; + value.Resize(value_shape); + auto data = value.mutable_data(cpu); + + std::minstd_rand engine; + engine.seed(seed); + std::uniform_real_distribution dist(min, max); + int64_t size = value.numel(); + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(engine); + } + w_t->Set(key, value); + memory::Copy(cpu, out_t->mutable_data(cpu) + index * value.numel(), + cpu, value.data(), value.numel() * sizeof(float)); + } + } +}; + +class LookupSparseTableOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LookupSparseTableOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("W", + "(SelectedRows) The input represents embedding table, " + "which is a learnable parameter."); + AddInput("Ids", + "(LoDTensor) Ids's type should be LoDTensor" + "THe ids to be looked up in W."); + AddOutput("Out", + "(LoDTensor) The lookup results, which have the " + "same type as W."); + AddAttr("padding_idx", + "(int64, default -1) " + "If the value is -1, it makes no effect to lookup. 
" + "Otherwise the given value indicates padding the output " + "with zeros whenever lookup encounters it in Ids.") + .SetDefault(kNoPadding); + AddAttr("min", + "(float, default -1.0) " + "Minimum value of uniform random") + .SetDefault(-1.0f); + AddAttr("max", + "(float, default 1.0) " + "Maximun value of uniform random") + .SetDefault(1.0f); + AddAttr("seed", + "(int, default 0) " + "Random seed used for generating samples. " + "0 means use a seed generated by the system." + "Note that if seed is not 0, this operator will always " + "generate the same random numbers every time.") + .SetDefault(0); + AddAttr("auto_grown_table", + "(bool default false)" + "Whether create new value if for nonexistent key.") + .SetDefault(true); + AddComment(R"DOC( +Lookup Sprase Tablel Operator. + +This operator is used to perform lookup on parameter W, +then concatenated into a sparse tensor. + +The type of Ids(Input) is SelectedRows, the rows of Ids contains +the ids to be looked up in W; +if the Id is not in the sparse table, this operator will return a +random value and set the value into the table for the next looking up. + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(lookup_sparse_table, ops::LookupSparseTableOp, + ops::LookupSparseTableInferShape, + ops::LookupSparseTableOpMaker, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu index 6d2ba2bd0d653ecf83f9e2abc1413ae551dc8bb7..0de58d5fddd84d33f708c4c73e5a19dc2fe8a86b 100644 --- a/paddle/fluid/operators/math/cross_entropy.cu +++ b/paddle/fluid/operators/math/cross_entropy.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/cross_entropy.h" +#include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { @@ -30,66 +31,22 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label, } } -template -__device__ __forceinline__ T sum_single_warp(T val) { - val += platform::__shfl_down_sync(0, val, 16); - val += platform::__shfl_down_sync(0, val, 8); - val += platform::__shfl_down_sync(0, val, 4); - val += platform::__shfl_down_sync(0, val, 2); - val += platform::__shfl_down_sync(0, val, 1); - return val; -} - -// CUDA do not support dynamic arrary in template -// https://stackoverflow.com/questions/20497209 -template -struct SharedMemory { - // Ensure that we won't compile any un-specialized types - __device__ T* GetPointer() { return NULL; } -}; - -template <> -struct SharedMemory { - __device__ float* GetPointer() { - extern __shared__ float s_float[]; - return s_float; - } -}; - -template <> -struct SharedMemory { - __device__ double* GetPointer() { - extern __shared__ double s_double[]; - return s_double; - } -}; - template __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, const int class_num) { int tid = threadIdx.x; - SharedMemory d_sum_shared; - T* d_sum = d_sum_shared.GetPointer(); - d_sum[tid] = 0; + T val = 0; - int cur_idx = tid; - int next_idx = blockIdx.x * class_num + tid; - while (cur_idx < class_num) { - d_sum[tid] += - math::TolerableValue()(std::log(X[next_idx])) * label[next_idx]; - next_idx += blockDim.x; - cur_idx += blockDim.x; + int idx = blockIdx.x * class_num + tid; + int end = blockIdx.x * class_num + class_num; + for (; idx < end; idx += blockDim.x) { + val += math::TolerableValue()(std::log(X[idx])) * label[idx]; } - __syncthreads(); - for (unsigned int stride = blockDim.x >> 1; stride >= 32; stride >>= 1) { - if (tid < stride) d_sum[tid] += d_sum[tid + stride]; - __syncthreads(); + val = paddle::platform::reduceSum(val, tid, blockDim.x); + if (threadIdx.x == 0) { + Y[blockIdx.x] = -val; } - - T val = d_sum[tid]; - val = sum_single_warp(val); - if (tid == 0) Y[blockIdx.x] = -val; } } // namespace @@ -113,9 +70,7 @@ class CrossEntropyFunctor { ? 512 : pow(2, static_cast(std::log2(class_num))); - SoftCrossEntropyKernel<<< - batch_size, block, block * sizeof(T), - reinterpret_cast(ctx).stream()>>>( + SoftCrossEntropyKernel<<>>( loss_data, prob_data, label_data, class_num); } else { const int64_t* label_data = labels->data(); diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/fluid/operators/math/pooling.cc index 97a2e81c84c060a8be57db6274839ee39edf466c..b871851798e48e6b598cb4ab8e2e42db478a3820 100644 --- a/paddle/fluid/operators/math/pooling.cc +++ b/paddle/fluid/operators/math/pooling.cc @@ -11,8 +11,9 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "paddle/fluid/operators/math/pooling.h" +#include +#include namespace paddle { namespace operators { @@ -27,9 +28,10 @@ template class Pool2dFunctor { public: void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_process, framework::Tensor* output) { + const framework::Tensor& input, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, PoolProcess pool_process, + framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -63,11 +65,11 @@ class Pool2dFunctor { T ele = pool_process.initial(); for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - pool_process.compute(ele, input_data[h * input_width + w]); + pool_process.compute(input_data[h * input_width + w], &ele); } } int pool_size = (hend - hstart) * (wend - wstart); - pool_process.finalize(ele, (static_cast(pool_size))); + pool_process.finalize(static_cast(pool_size), &ele); output_data[ph * output_width + pw] = ele; } } @@ -86,13 +88,12 @@ class Pool2dFunctor { template class Pool2dGradFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_grad_process, - framework::Tensor* input_grad) { + void operator()( + const platform::CPUDeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output, const framework::Tensor& output_grad, + const std::vector& ksize, const std::vector& strides, + const std::vector& paddings, PoolProcess pool_grad_process, + framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -131,8 +132,8 @@ class Pool2dGradFunctor { input_data[h * input_width + w], output_data[ph * output_width + pw], output_grad_data[ph * output_width + pw], - input_grad_data[h * input_width + w], - static_cast(scale)); + static_cast(scale), + input_grad_data + h * input_width + w); } } } @@ -154,12 +155,11 @@ class Pool2dGradFunctor { template class MaxPool2dGradFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, std::vector& ksize, - std::vector& strides, std::vector& paddings, - framework::Tensor* input_grad) { + void operator()( + const platform::CPUDeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output, const framework::Tensor& output_grad, + const std::vector& ksize, const std::vector& strides, + const std::vector& paddings, framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -246,9 +246,10 @@ template class Pool3dFunctor { public: void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_process, framework::Tensor* output) { + const framework::Tensor& input, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, PoolProcess pool_process, + framework::Tensor* output) { const int 
batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -293,14 +294,14 @@ class Pool3dFunctor { for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { pool_process.compute( - ele, - input_data[(d * input_height + h) * input_width + w]); + input_data[(d * input_height + h) * input_width + w], + &ele); } } } int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - pool_process.finalize(ele, static_cast(pool_size)); + pool_process.finalize(static_cast(pool_size), &ele); output_data[output_idx] = ele; } } @@ -320,13 +321,12 @@ class Pool3dFunctor { template class Pool3dGradFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_grad_process, - framework::Tensor* input_grad) { + void operator()( + const platform::CPUDeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output, const framework::Tensor& output_grad, + const std::vector& ksize, const std::vector& strides, + const std::vector& paddings, PoolProcess pool_grad_process, + framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -379,8 +379,8 @@ class Pool3dGradFunctor { (pd * output_height + ph) * output_width + pw; pool_grad_process.compute( input_data[input_idx], output_data[output_idx], - output_grad_data[output_idx], - input_grad_data[input_idx], static_cast(scale)); + output_grad_data[output_idx], static_cast(scale), + input_grad_data + input_idx); } } } @@ -404,12 +404,11 @@ class Pool3dGradFunctor { template class MaxPool3dGradFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, std::vector& ksize, - std::vector& strides, std::vector& paddings, - framework::Tensor* input_grad) { + void operator()( + const platform::CPUDeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output, const framework::Tensor& output_grad, + const std::vector& ksize, const std::vector& strides, + const std::vector& paddings, framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -510,9 +509,10 @@ template class MaxPool2dWithIndexFunctor { public: void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - framework::Tensor* output, framework::Tensor* mask) { + const framework::Tensor& input, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* output, + framework::Tensor* mask) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -576,8 +576,9 @@ class MaxPool2dWithIndexGradFunctor { public: void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& output_grad, - const framework::Tensor& mask, std::vector& ksize, - std::vector& strides, std::vector& paddings, + const framework::Tensor& mask, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* 
input_grad) { const int batch_size = input_grad->dims()[0]; const int input_height = input_grad->dims()[2]; @@ -628,9 +629,10 @@ template class MaxPool3dWithIndexFunctor { public: void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - framework::Tensor* output, framework::Tensor* mask) { + const framework::Tensor& input, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* output, + framework::Tensor* mask) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -708,8 +710,9 @@ class MaxPool3dWithIndexGradFunctor { public: void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& output_grad, - const framework::Tensor& mask, std::vector& ksize, - std::vector& strides, std::vector& paddings, + const framework::Tensor& mask, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_depth = input_grad->dims()[2]; diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu index 267f8c409df301f9b1a8c68f337473198cf827f4..b1c76350d1724629bae175abf47e6671a1532242 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/fluid/operators/math/pooling.cu @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include +#include #include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -47,11 +49,11 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data, T ele = pool_process.initial(); for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - pool_process.compute(ele, input_data[h * input_width + w]); + pool_process.compute(input_data[h * input_width + w], &ele); } } int pool_size = (hend - hstart) * (wend - wstart); - pool_process.finalize(ele, (static_cast(pool_size))); + pool_process.finalize(static_cast(pool_size), &ele); output_data[index] = ele; } } @@ -96,8 +98,8 @@ __global__ void KernelPool2DGrad( int pool_size = (hend - hstart) * (wend - wstart); int output_sub_idx = ph * output_width + pw; pool_process.compute(input, output_data[output_sub_idx], - output_grad[output_sub_idx], gradient, - static_cast(1.0 / pool_size)); + output_grad[output_sub_idx], + static_cast(1.0 / pool_size), &gradient); } } input_grad[index] = gradient; @@ -158,9 +160,10 @@ template class Pool2dFunctor { public: void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_process, framework::Tensor* output) { + const framework::Tensor& input, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, PoolProcess pool_process, + framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -201,9 +204,11 @@ class Pool2dGradFunctor { void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, - const framework::Tensor& output_grad, std::vector& ksize, - std::vector& strides, std::vector& 
paddings, - PoolProcess pool_process, framework::Tensor* input_grad) { + const framework::Tensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, PoolProcess pool_process, + framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -246,8 +251,10 @@ class MaxPool2dGradFunctor { void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, - const framework::Tensor& output_grad, std::vector& ksize, - std::vector& strides, std::vector& paddings, + const framework::Tensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -340,12 +347,12 @@ __global__ void KernelPool3D(const int nthreads, const T* input_data, for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { pool_process.compute( - ele, input_data[(d * input_height + h) * input_width + w]); + input_data[(d * input_height + h) * input_width + w], &ele); } } } int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - pool_process.finalize(ele, static_cast(pool_size)); + pool_process.finalize(static_cast(pool_size), &ele); output_data[index] = ele; } } @@ -405,8 +412,8 @@ __global__ void KernelPool3DGrad( int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); int output_sub_idx = (pd * output_height + ph) * output_width + pw; pool_process.compute(input, output_data[output_sub_idx], - output_grad[output_sub_idx], gradient, - static_cast(1.0 / pool_size)); + output_grad[output_sub_idx], + static_cast(1.0 / pool_size), &gradient); } } } @@ -474,9 +481,10 @@ template class Pool3dFunctor { public: void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_process, framework::Tensor* output) { + const framework::Tensor& input, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, PoolProcess pool_process, + framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_depth = input.dims()[2]; @@ -525,9 +533,11 @@ class Pool3dGradFunctor { void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, - const framework::Tensor& output_grad, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_process, framework::Tensor* input_grad) { + const framework::Tensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, PoolProcess pool_process, + framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_depth = input.dims()[2]; @@ -578,8 +588,10 @@ class MaxPool3dGradFunctor { void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, - const framework::Tensor& output_grad, std::vector& ksize, - std::vector& strides, std::vector& paddings, + const framework::Tensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* input_grad) { const int batch_size = 
input.dims()[0]; const int input_channels = input.dims()[1]; @@ -736,9 +748,10 @@ template class MaxPool2dWithIndexFunctor { public: void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - framework::Tensor* output, framework::Tensor* mask) { + const framework::Tensor& input, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* output, + framework::Tensor* mask) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -779,8 +792,9 @@ class MaxPool2dWithIndexGradFunctor { public: void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& output_grad, - const framework::Tensor& mask, std::vector& ksize, - std::vector& strides, std::vector& paddings, + const framework::Tensor& mask, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_channels = input_grad->dims()[1]; @@ -937,9 +951,10 @@ template class MaxPool3dWithIndexFunctor { public: void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - framework::Tensor* output, framework::Tensor* mask) { + const framework::Tensor& input, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* output, + framework::Tensor* mask) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_depth = input.dims()[2]; @@ -987,8 +1002,9 @@ class MaxPool3dWithIndexGradFunctor { public: void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& output_grad, - const framework::Tensor& mask, std::vector& ksize, - std::vector& strides, std::vector& paddings, + const framework::Tensor& mask, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_channels = input_grad->dims()[1]; diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h index 74cb42f0d02086a6776b22d57832757ae3ffc470..2538d739cce95d1b2fc5b3f905af5e6d94cf7af5 100644 --- a/paddle/fluid/operators/math/pooling.h +++ b/paddle/fluid/operators/math/pooling.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" @@ -23,8 +24,8 @@ namespace operators { namespace math { #define FLT_MAX \ - __FLT_MAX__ // It might need to be placed in another file, but I'm still - // wondering where to put it. + __FLT_MAX__ // TODO(zcd) :It might need to be placed in another file, but I'm + // still wondering where to put it. /* * \brief Extracting simple operations from pooling. @@ -40,33 +41,33 @@ template class MaxPool { public: DEVICE inline T initial() { return static_cast(-FLT_MAX); } - DEVICE inline void compute(T& y, const T& x) { y = y > x ? y : x; } - DEVICE inline void finalize(T& y, const T& pool_field) {} + DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? 
*y : x; } + DEVICE inline void finalize(const T& pool_field, T* y) {} }; template class AvgPool { public: DEVICE inline T initial() { return static_cast(0); } - DEVICE inline void compute(T& y, const T& x) { y += x; } - DEVICE inline void finalize(T& y, const T& pool_field) { y /= pool_field; } + DEVICE inline void compute(const T& x, T* y) { *y += x; } + DEVICE inline void finalize(const T& pool_field, T* y) { *y /= pool_field; } }; template class MaxPoolGrad { public: - DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx, - T scale) { - dx += dy * (x == y); + DEVICE inline void compute(const T& x, const T& y, const T& dy, T scale, + T* dx) { + *dx += dy * (x == y); } }; template class AvgPoolGrad { public: - DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx, - T scale) { - dx += (scale * dy); + DEVICE inline void compute(const T& x, const T& y, const T& dy, T scale, + T* dx) { + *dx += (scale * dy); } }; @@ -88,8 +89,9 @@ template class Pool2dFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, - std::vector& ksize, std::vector& strides, - std::vector& paddings, PoolProcess pool_compute, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, PoolProcess pool_compute, framework::Tensor* output); }; @@ -98,9 +100,11 @@ class Pool2dGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, - const framework::Tensor& output_grad, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_compute, framework::Tensor* input_grad); + const framework::Tensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, PoolProcess pool_compute, + framework::Tensor* input_grad); }; template @@ -108,8 +112,10 @@ class MaxPool2dGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, - const framework::Tensor& output_grad, std::vector& ksize, - std::vector& strides, std::vector& paddings, + const framework::Tensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* input_grad); }; @@ -117,8 +123,9 @@ template class Pool3dFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, - std::vector& ksize, std::vector& strides, - std::vector& paddings, PoolProcess pool_compute, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, PoolProcess pool_compute, framework::Tensor* output); }; @@ -127,9 +134,11 @@ class Pool3dGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, - const framework::Tensor& output_grad, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_compute, framework::Tensor* input_grad); + const framework::Tensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, PoolProcess pool_compute, + framework::Tensor* input_grad); }; template @@ -137,8 +146,10 @@ class MaxPool3dGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, - const framework::Tensor& output_grad, std::vector& ksize, - std::vector& strides, std::vector& paddings, + const framework::Tensor& output_grad, + const 
std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* input_grad); }; @@ -153,8 +164,9 @@ template class MaxPool2dWithIndexFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, - std::vector& ksize, std::vector& strides, - std::vector& paddings, framework::Tensor* output, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* output, framework::Tensor* mask); }; @@ -163,8 +175,9 @@ class MaxPool2dWithIndexGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& output_grad, - const framework::Tensor& mask, std::vector& ksize, - std::vector& strides, std::vector& paddings, + const framework::Tensor& mask, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* input_grad); }; @@ -172,8 +185,9 @@ template class MaxPool3dWithIndexFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, - std::vector& ksize, std::vector& strides, - std::vector& paddings, framework::Tensor* output, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* output, framework::Tensor* mask); }; @@ -182,8 +196,9 @@ class MaxPool3dWithIndexGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& output_grad, - const framework::Tensor& mask, std::vector& ksize, - std::vector& strides, std::vector& paddings, + const framework::Tensor& mask, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* input_grad); }; diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc index 38bd3b99758555a24b3b8eb0de06cca8e424fcb2..d63c6c4ed55331235188c1c750468d4e75b9b7f2 100644 --- a/paddle/fluid/operators/math/sequence_padding.cc +++ b/paddle/fluid/operators/math/sequence_padding.cc @@ -22,7 +22,7 @@ template class PaddingLoDTensorFunctor { public: void operator()(const platform::CPUDeviceContext& context, - const framework::LoDTensor& seq, framework::Tensor& padding, + const framework::LoDTensor& seq, framework::Tensor* padding, bool norm_by_times) { auto lod = seq.lod(); PADDLE_ENFORCE_GT(lod.size(), 0UL, @@ -37,7 +37,7 @@ class PaddingLoDTensorFunctor { "The first dimension of LoDTensor seq should be " "equal to the sum of all sequences's length."); - auto padding_dims = padding.dims(); + auto padding_dims = padding->dims(); PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, "The input padding should be a 3-D Tensor of shape " "[max_sequence_length, num_sequences, sequence_width]."); @@ -58,7 +58,7 @@ class PaddingLoDTensorFunctor { "width of sequence in LoDTensor seq."); const T* seq_data = seq.data(); - T* padding_data = padding.data(); + T* padding_data = padding->data(); for (int64_t i = 0; i < max_sequence_length; ++i) { for (int64_t j = 0; j < num_sequences; ++j) { int64_t start_pos = abs_offset_lod[level][j]; @@ -84,16 +84,16 @@ template class UnpaddingLoDTensorFunctor { public: void operator()(const platform::CPUDeviceContext& context, - framework::LoDTensor& seq, const framework::Tensor& padding, + framework::LoDTensor* seq, const framework::Tensor& padding, bool norm_by_times) { - auto lod = seq.lod(); + auto lod = seq->lod(); PADDLE_ENFORCE_GT(lod.size(), 0UL, "The LoD of LoDTensor seq should not be null."); const size_t level = 0; framework::LoD abs_offset_lod = 
framework::ToAbsOffset(lod); - auto seq_dims = seq.dims(); + auto seq_dims = seq->dims(); PADDLE_ENFORCE_EQ(seq_dims[0], static_cast(abs_offset_lod[level].back()), "The first dimension of LoDTensor seq should be " @@ -114,13 +114,13 @@ class UnpaddingLoDTensorFunctor { "The second dimension of Tensor padding should be " "the number of sequences in LoDTensor seq."); - const int64_t sequence_width = seq.numel() / seq_dims[0]; + const int64_t sequence_width = seq->numel() / seq_dims[0]; PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, "The third dimension of Tensor padding should be the " "width of sequence in LoDTensor seq."); const T* padding_data = padding.data(); - T* seq_data = seq.data(); + T* seq_data = seq->data(); for (int64_t i = 0; i < num_sequences; ++i) { int64_t start_pos = abs_offset_lod[level][i]; int64_t sequence_length = abs_offset_lod[level][i + 1] - start_pos; diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu index c044e6fc32bab8f72a0dce45b4abdb1174a0d72f..0956a0c17d387f4a174c7ed4e9b1b1f816dcf4ae 100644 --- a/paddle/fluid/operators/math/sequence_padding.cu +++ b/paddle/fluid/operators/math/sequence_padding.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include "paddle/fluid/operators/math/sequence_padding.h" namespace paddle { @@ -61,7 +62,7 @@ template class PaddingLoDTensorFunctor { public: void operator()(const platform::CUDADeviceContext& context, - const framework::LoDTensor& seq, framework::Tensor& padding, + const framework::LoDTensor& seq, framework::Tensor* padding, bool norm_by_times) { auto lod = seq.lod(); PADDLE_ENFORCE_GT(lod.size(), 0UL, @@ -76,7 +77,7 @@ class PaddingLoDTensorFunctor { "The first dimension of LoDTensor seq should be " "equal to the sum of all sequences's length."); - auto padding_dims = padding.dims(); + auto padding_dims = padding->dims(); PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, "The input padding should be a 3-D Tensor of shape " "[max_sequence_length, num_sequences, sequence_width]."); @@ -97,8 +98,8 @@ class PaddingLoDTensorFunctor { "width of sequence in LoDTensor seq."); if (!norm_by_times && num_sequences == 1UL) { - TensorCopy(seq, context.GetPlace(), context, &padding); - padding.Resize(padding_dims); + TensorCopy(seq, context.GetPlace(), context, padding); + padding->Resize(padding_dims); return; } @@ -117,7 +118,7 @@ class PaddingLoDTensorFunctor { dim3 grid(grid_dim_x, grid_dim_y); const T* seq_data = seq.data(); - T* padding_data = padding.data(); + T* padding_data = padding->data(); if (norm_by_times) { SequencePaddingKernel<<>>( padding_data, const_cast(seq_data), @@ -136,16 +137,16 @@ template class UnpaddingLoDTensorFunctor { public: void operator()(const platform::CUDADeviceContext& context, - framework::LoDTensor& seq, const framework::Tensor& padding, + framework::LoDTensor* seq, const framework::Tensor& padding, bool norm_by_times) { - auto lod = seq.lod(); + auto lod = seq->lod(); PADDLE_ENFORCE_GT(lod.size(), 0UL, "The lod of LoDTensor seq should not be null."); const size_t level = 0; framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); - auto seq_dims = seq.dims(); + auto seq_dims = seq->dims(); PADDLE_ENFORCE_EQ(seq_dims[0], static_cast(abs_offset_lod[level].back()), "The first dimension of LoDTensor seq should be " @@ -166,14 +167,14 @@ class UnpaddingLoDTensorFunctor { "The second 
dimension of Tensor padding should be " "the number of sequences in LoDTensor seq."); - const int64_t sequence_width = seq.numel() / seq_dims[0]; + const int64_t sequence_width = seq->numel() / seq_dims[0]; PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, "The third dimension of Tensor padding should be the " "width of sequence in LoDTensor seq."); if (!norm_by_times && num_sequences == 1UL) { - TensorCopy(padding, context.GetPlace(), context, &seq); - seq.Resize(seq_dims); + TensorCopy(padding, context.GetPlace(), context, seq); + seq->Resize(seq_dims); return; } @@ -192,7 +193,7 @@ class UnpaddingLoDTensorFunctor { dim3 grid(grid_dim_x, grid_dim_y); const T* padding_data = padding.data(); - T* seq_data = seq.data(); + T* seq_data = seq->data(); if (norm_by_times) { SequencePaddingKernel<<>>( const_cast(padding_data), seq_data, diff --git a/paddle/fluid/operators/math/sequence_padding.h b/paddle/fluid/operators/math/sequence_padding.h index 17f044b9d6667ed6a45bf5a0c2362c351d2c2beb..b56e6db1ebdac1a00561c07845c03bb8fbd8d35a 100644 --- a/paddle/fluid/operators/math/sequence_padding.h +++ b/paddle/fluid/operators/math/sequence_padding.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/device_context.h" @@ -64,13 +65,13 @@ template class PaddingLoDTensorFunctor { public: void operator()(const DeviceContext& context, const framework::LoDTensor& seq, - framework::Tensor& padding, bool norm_by_times); + framework::Tensor* padding, bool norm_by_times); }; template class UnpaddingLoDTensorFunctor { public: - void operator()(const DeviceContext& context, framework::LoDTensor& seq, + void operator()(const DeviceContext& context, framework::LoDTensor* seq, const framework::Tensor& padding, bool norm_by_times); }; diff --git a/paddle/fluid/operators/math/sequence_padding_test.cc b/paddle/fluid/operators/math/sequence_padding_test.cc index e3d62144856eb30a64007b54c16719cc028c70f2..b9a1b9ae4d6e8c82aa782190d9e145e7a2b502be 100644 --- a/paddle/fluid/operators/math/sequence_padding_test.cc +++ b/paddle/fluid/operators/math/sequence_padding_test.cc @@ -54,12 +54,12 @@ void TestSequencePadding(const paddle::framework::LoD& lod, static_cast(sequence_width)}); padding.mutable_data(padding_dims, *place); paddle::operators::math::PaddingLoDTensorFunctor()( - *context, seq, padding, false); + *context, seq, &padding, false); seq_back.set_lod(lod); seq_back.mutable_data(seq_dims, *place); paddle::operators::math::UnpaddingLoDTensorFunctor()( - *context, seq_back, padding, false); + *context, &seq_back, padding, false); if (paddle::platform::is_cpu_place(*place)) { cpu_seq_back = seq_back; diff --git a/paddle/fluid/operators/momentum_op.cc b/paddle/fluid/operators/momentum_op.cc index 6c70970e15f0d63ebe2134c6bc8163339ba30e75..f13ec53905aa3d5b55b865c3514f36211c06a549 100644 --- a/paddle/fluid/operators/momentum_op.cc +++ b/paddle/fluid/operators/momentum_op.cc @@ -17,6 +17,8 @@ limitations under the License. 
*/ namespace paddle { namespace operators { +using Tensor = framework::Tensor; + class MomentumOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -50,6 +52,12 @@ class MomentumOp : public framework::OperatorWithKernel { ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("VelocityOut", param_dim); } + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input_data_type = + framework::ToDataType(ctx.Input("Param")->type()); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; class MomentumOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index c9fabc8d485b3bba2c8ae14b3616d0bdcae058a7..6903cf83b41a54b54382fac2cf58f7bfe192b55f 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -204,6 +204,8 @@ REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(mul_grad, ops::MulGradOp); REGISTER_OP_CPU_KERNEL( - mul, ops::MulKernel); + mul, ops::MulKernel, + ops::MulKernel); REGISTER_OP_CPU_KERNEL( - mul_grad, ops::MulGradKernel); + mul_grad, ops::MulGradKernel, + ops::MulGradKernel); diff --git a/paddle/fluid/operators/mul_op.cu.cc b/paddle/fluid/operators/mul_op.cu.cc index 757f9c3ee2665c7ac654659416fe8dd727dca16d..81f3e42bf412fa4d2cb48405f2f8ee49b6aa0b67 100644 --- a/paddle/fluid/operators/mul_op.cu.cc +++ b/paddle/fluid/operators/mul_op.cu.cc @@ -18,6 +18,8 @@ limitations under the License. */ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel, + ops::MulKernel, ops::MulKernel); REGISTER_OP_CUDA_KERNEL(mul_grad, - ops::MulGradKernel); + ops::MulGradKernel, + ops::MulGradKernel); diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu index dd8e62aca47a3b34a3788a43cc0043a887af818f..79d08cf3d1edbc158ef551c94330e688c87e6c1e 100644 --- a/paddle/fluid/operators/row_conv_op.cu +++ b/paddle/fluid/operators/row_conv_op.cu @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/row_conv_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/cuda_device_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/save_load_op_test.cc b/paddle/fluid/operators/save_load_op_test.cc index a7ba1e0ae1d22a22cf2943c9aaf0c394ef4ae326..74385ee47543e3f4887081c2225212996d3df3f1 100644 --- a/paddle/fluid/operators/save_load_op_test.cc +++ b/paddle/fluid/operators/save_load_op_test.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/float16.h" USE_NO_KERNEL_OP(save); USE_NO_KERNEL_OP(load); @@ -61,3 +62,35 @@ TEST(SaveLoadOp, CPU) { } } } + +TEST(SaveLoadFP16Op, CPU) { + paddle::framework::Scope scope; + paddle::platform::CPUPlace place; + + auto var = scope.Var("test_var"); + auto tensor = var->GetMutable(); + tensor->Resize({3, 10}); + + float* expect = tensor->mutable_data(place); + for (int64_t i = 0; i < tensor->numel(); ++i) { + expect[i] = static_cast(paddle::platform::float16(i)); + } + + paddle::framework::AttributeMap attrs; + attrs.insert({"file_path", std::string("tensor.save")}); + attrs.insert({"save_as_fp16", true}); + + auto save_op = paddle::framework::OpRegistry::CreateOp( + "save", {{"X", {"test_var"}}}, {}, attrs); + save_op->Run(scope, place); + + auto load_var = scope.Var("out_var"); + auto target = load_var->GetMutable(); + auto load_op = paddle::framework::OpRegistry::CreateOp( + "load", {}, {{"Out", {"out_var"}}}, attrs); + load_op->Run(scope, place); + paddle::platform::float16* actual = target->data(); + for (int64_t i = 0; i < tensor->numel(); ++i) { + EXPECT_EQ(expect[i], static_cast(actual[i])); + } +} diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index 4a715c4baab2da7b7af86ada22ee88a16b05a814..f45d07ed90d52d204e9a3a5c2efe6df6b27ebfe6 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" @@ -68,6 +69,7 @@ class SaveOp : public framework::OperatorBase { const platform::Place &place) const override { auto filename = Attr("file_path"); auto overwrite = Attr("overwrite"); + auto save_as_fp16 = Attr("save_as_fp16"); if (FileExists(filename) && !overwrite) { PADDLE_THROW("%s is existed, cannot save to it when overwrite=false", @@ -96,7 +98,18 @@ class SaveOp : public framework::OperatorBase { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); - framework::SerializeToStream(fout, tensor, dev_ctx); + auto in_dtype = framework::ToDataType(tensor.type()); + auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; + + if (in_dtype != out_dtype) { + auto in_kernel_type = framework::OpKernelType(in_dtype, place); + auto out_kernel_type = framework::OpKernelType(out_dtype, place); + framework::LoDTensor out; + framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out); + framework::SerializeToStream(fout, out, dev_ctx); + } else { + framework::SerializeToStream(fout, tensor, dev_ctx); + } } }; @@ -114,6 +127,12 @@ This operator will serialize and write a tensor variable to file on disk. "(boolean, default true)" "Overwrite the output file if exist") .SetDefault(true); + AddAttr("save_as_fp16", + "(boolean, default false)" + "If true, the tensor will be converted to float16 data " + "type and then saved. 
Otherwise, the tensor will be " + "directly saved without data type conversion.") + .SetDefault(false); AddAttr("file_path", "(string)" "The \"file_path\" where the variable will be saved.") diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index 1e938638c9182972a2ae2436166ff0aa49efd4be..7dcf33c989c3bcd905da8017ee36ec8ce8032911 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -35,7 +35,6 @@ class ScaleOp : public framework::OperatorWithKernel { } }; -template class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { public: ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker) @@ -47,9 +46,9 @@ Scale operator $$Out = scale*X$$ )DOC"); - AddAttr("scale", - "(float, default 1.0)" - "The scaling factor of the scale operator.") + AddAttr("scale", + "(float, default 1.0)" + "The scaling factor of the scale operator.") .SetDefault(1.0); } }; @@ -73,8 +72,7 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker { namespace ops = paddle::operators; -REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, - ops::ScaleGradMaker); +REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker); REGISTER_OP_CPU_KERNEL( scale, ops::ScaleKernel, ops::ScaleKernel, diff --git a/paddle/fluid/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc index d2e1f3cb2ff9c8254cd4815a0f8750966a6e161c..eb51f301bfe2a97c65dd1fec23ff5a44f3843b05 100644 --- a/paddle/fluid/operators/send_recv_op_test.cc +++ b/paddle/fluid/operators/send_recv_op_test.cc @@ -113,15 +113,15 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs, op->SetAttrMap(attrs); } -void StartServerNet(bool is_sparse) { +void StartServerNet(bool is_sparse, std::atomic *initialized) { f::Scope scope; p::CPUPlace place; + VLOG(4) << "before init tensor"; if (is_sparse) { InitSelectedRowsInScope(place, &scope); } else { InitTensorsInScope(place, &scope); } - // sub program run in listen_and_serv_op, for simple test we use sum f::ProgramDesc program; const auto &root_block = program.Block(0); @@ -129,7 +129,6 @@ void StartServerNet(bool is_sparse) { auto *prefetch_block = program.AppendBlock(root_block); // X for server side tensors, RX for received tensors, must be of same shape. 
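The remaining hunks in this test replace the fixed sleep(5)/sleep(3) waits with a two-stage handshake: StartServerNet flips a std::atomic<bool> once the listen_and_serv op has been constructed, and WaitServerReady() busy-waits until the static selected_port_ becomes non-zero. The sketch below reduces that pattern to a runnable C++ toy; the names op_created/selected_port and the port value are placeholders, not the Paddle test harness.

#include <atomic>
#include <cassert>
#include <thread>

std::atomic<bool> op_created{false};  // stage 1: the server op object exists
std::atomic<int> selected_port{0};    // stage 2: 0 means "not listening yet"

void ServerThread() {
  op_created = true;             // the operator has been constructed
  // ... bind to an OS-assigned port and start listening ...
  selected_port = 32768;         // publish the chosen port (arbitrary value here)
  // The real test would now block inside the server's Run();
  // this sketch simply returns so main() can join the thread.
}

int main() {
  std::thread server(ServerThread);
  while (!op_created) {                 // was: sleep() for a fixed interval
  }
  while (selected_port.load() == 0) {   // mirrors WaitServerReady()
  }
  assert(selected_port.load() == 32768);  // safe to build the endpoint string now
  server.join();
  return 0;
}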
AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block); - f::AttributeMap attrs; attrs.insert({"endpoint", std::string("127.0.0.1:0")}); attrs.insert({"Fanin", 1}); @@ -139,15 +138,22 @@ void StartServerNet(bool is_sparse) { attrs.insert({"PrefetchBlock", prefetch_block}); attrs.insert({"grad_to_block_id", std::vector({""})}); attrs.insert({"sync_mode", true}); + VLOG(4) << "before init op"; listen_and_serv_op = f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs); + *initialized = true; listen_and_serv_op->Run(scope, place); LOG(INFO) << "server exit"; } TEST(SendRecvOp, CPUDense) { - std::thread server_thread(StartServerNet, false); - sleep(5); // wait server to start + std::atomic initialized{false}; + std::thread server_thread(StartServerNet, false, &initialized); + while (!initialized) { + } + static_cast(listen_and_serv_op.get()) + ->WaitServerReady(); + // local net f::Scope scope; p::CPUPlace place; @@ -156,9 +162,11 @@ TEST(SendRecvOp, CPUDense) { scope.Var("RPC_CLIENT_VAR"); f::AttributeMap attrs; - selected_port = static_cast( - listen_and_serv_op.get()) - ->GetSelectedPort(); + auto *listen_and_serv_op_ptr = + static_cast( + listen_and_serv_op.get()); + ASSERT_TRUE(listen_and_serv_op_ptr != nullptr); + selected_port = listen_and_serv_op_ptr->GetSelectedPort(); std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port); attrs.insert({"endpoints", std::vector({endpoint})}); attrs.insert({"epmap", std::vector({endpoint})}); @@ -181,11 +189,21 @@ TEST(SendRecvOp, CPUDense) { listen_and_serv_op->Stop(); server_thread.join(); listen_and_serv_op.reset(nullptr); + paddle::operators::ListenAndServOp::ResetPort(); } TEST(SendRecvOp, CPUSparse) { - std::thread server_thread(StartServerNet, true); - sleep(3); // wait server to start + std::atomic initialized; + initialized = false; + std::thread server_thread(StartServerNet, true, &initialized); + while (!initialized) { + } + auto *listen_and_serv_op_ptr = + static_cast( + listen_and_serv_op.get()); + ASSERT_TRUE(listen_and_serv_op_ptr != nullptr); + listen_and_serv_op_ptr->WaitServerReady(); + // local net f::Scope scope; p::CPUPlace place; @@ -193,9 +211,7 @@ TEST(SendRecvOp, CPUSparse) { InitSelectedRowsInScope(place, &scope); scope.Var("RPC_CLIENT_VAR"); f::AttributeMap attrs; - selected_port = static_cast( - listen_and_serv_op.get()) - ->GetSelectedPort(); + selected_port = listen_and_serv_op_ptr->GetSelectedPort(); std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port); attrs.insert({"endpoints", std::vector({endpoint})}); attrs.insert({"epmap", std::vector({endpoint})}); @@ -226,4 +242,5 @@ TEST(SendRecvOp, CPUSparse) { listen_and_serv_op->Stop(); server_thread.join(); listen_and_serv_op.reset(); + paddle::operators::ListenAndServOp::ResetPort(); } diff --git a/paddle/fluid/operators/sgd_op.cc b/paddle/fluid/operators/sgd_op.cc index 06cb0550ad7d4ad0241a4f439ea9ac16d9714c38..bd04c60ffa5c1e5eb8d2051ce495ab6c685b14b5 100644 --- a/paddle/fluid/operators/sgd_op.cc +++ b/paddle/fluid/operators/sgd_op.cc @@ -48,6 +48,24 @@ class SGDOp : public framework::OperatorWithKernel { } }; +class SGDOpInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto input_var = op_desc.Input("Param")[0]; + for (auto& out_var : op_desc.Output("ParamOut")) { + if (block->FindRecursiveOrCreateVar(input_var).GetType() == + framework::proto::VarType::SELECTED_ROWS) { + 
block->FindRecursiveOrCreateVar(out_var).SetType( + framework::proto::VarType::SELECTED_ROWS); + } else { + block->FindRecursiveOrCreateVar(out_var).SetType( + framework::proto::VarType::LOD_TENSOR); + } + } + } +}; + class SGDOpMaker : public framework::OpProtoAndCheckerMaker { public: SGDOpMaker(OpProto* proto, OpAttrChecker* op_checker) @@ -74,5 +92,6 @@ $$param\_out = param - learning\_rate * grad$$ } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(sgd, ops::SGDOp, ops::SGDOpMaker); +REGISTER_OPERATOR(sgd, ops::SGDOp, ops::SGDOpMaker, + paddle::framework::EmptyGradOpMaker, ops::SGDOpInferVarType); REGISTER_OP_CPU_KERNEL(sgd, ops::SGDOpKernel, ops::SGDOpKernel); diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 2741ba95bcfc1db3d74e0fb8c3f6fddf7d5a2caa..aa7b192e327704c02a26c86cc208ebe8a5cd7ba5 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -164,7 +164,9 @@ REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad); REGISTER_OP_CPU_KERNEL( - softmax, ops::SoftmaxKernel); + softmax, ops::SoftmaxKernel, + ops::SoftmaxKernel); REGISTER_OP_CPU_KERNEL( softmax_grad, - ops::SoftmaxGradKernel); + ops::SoftmaxGradKernel, + ops::SoftmaxGradKernel); diff --git a/paddle/fluid/operators/softmax_op.cu.cc b/paddle/fluid/operators/softmax_op.cu.cc index 0c1f7cef7ab7b66358d80f6f0670e0d07536128c..5fb4f011d9b47cebc4a23bcce47eada825263343 100644 --- a/paddle/fluid/operators/softmax_op.cu.cc +++ b/paddle/fluid/operators/softmax_op.cu.cc @@ -19,6 +19,8 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( softmax, ops::SoftmaxKernel, + ops::SoftmaxKernel, ops::SoftmaxKernel); -REGISTER_OP_CUDA_KERNEL(softmax_grad, - ops::SoftmaxGradKernel); +REGISTER_OP_CUDA_KERNEL( + softmax_grad, ops::SoftmaxGradKernel, + ops::SoftmaxGradKernel); diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc index 2e4e8caed5327f4ca9038c376de2ec831354917e..942a5de3f90f20eabe691924a570b61509eccf76 100644 --- a/paddle/fluid/operators/top_k_op.cc +++ b/paddle/fluid/operators/top_k_op.cc @@ -75,4 +75,5 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(top_k, ops::TopkOp, ops::TopkOpMaker, paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL(top_k, - ops::TopkKernel); + ops::TopkKernel, + ops::TopkKernel); diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index d7f4d383ce0d9e1ff42fc12c96aaf0ceb532e5db..2ea9fd1d29936357bbb5f6819c813e4bd1ebe44c 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -318,4 +318,5 @@ class TopkOpCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel, + paddle::operators::TopkOpCUDAKernel); diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index acaefaacdaa593c090d81084fdc1b3665314833f..3b5cf68dd4f28d23e507058337fe55de9b88d3cd 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -116,11 +116,31 @@ uniform distribution. 
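// --- Aside on the VarTypeInference pattern (annotation, not part of the patch) ---
// SGDOpInferVarType above and UniformRandomOpVarTypeInference below apply the
// same rule. A condensed sketch with a hypothetical helper name; it reuses only
// the calls visible in this diff (FindRecursiveOrCreateVar / GetType / SetType)
// and assumes the same framework headers as the operator files:
void PropagateSelectedRowsOrLoDTensor(framework::BlockDesc* block,
                                      const std::string& ref_var,
                                      const std::vector<std::string>& out_vars) {
  auto ref_type = block->FindRecursiveOrCreateVar(ref_var).GetType();
  auto out_type = ref_type == framework::proto::VarType::SELECTED_ROWS
                      ? framework::proto::VarType::SELECTED_ROWS
                      : framework::proto::VarType::LOD_TENSOR;
  for (auto& name : out_vars) {
    block->FindRecursiveOrCreateVar(name).SetType(out_type);
  }
}
// For sgd the reference variable is Param and the outputs are ParamOut; for
// uniform_random the single output Out is checked against its own declared type.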
.SetDefault(framework::proto::VarType::FP32); } }; + +class UniformRandomOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto out_var_name = op_desc.Output("Out").front(); + if (block->FindRecursiveOrCreateVar(out_var_name).GetType() == + framework::proto::VarType::SELECTED_ROWS) { + block->FindRecursiveOrCreateVar(out_var_name) + .SetType(framework::proto::VarType::SELECTED_ROWS); + } else { + block->FindRecursiveOrCreateVar(out_var_name) + .SetType(framework::proto::VarType::LOD_TENSOR); + } + } +}; + } // namespace operators } // namespace paddle -REGISTER_OP_WITHOUT_GRADIENT(uniform_random, paddle::operators::UniformRandomOp, - paddle::operators::UniformRandomOpMaker); +REGISTER_OPERATOR(uniform_random, paddle::operators::UniformRandomOp, + paddle::operators::UniformRandomOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::operators::UniformRandomOpVarTypeInference); + REGISTER_OP_CPU_KERNEL(uniform_random, paddle::operators::CPUUniformRandomKernel, paddle::operators::CPUUniformRandomKernel); diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h index 85131d002595f7681e4bec4135e28fe49cf842fb..705cc894c06b207f4e4e45fc771c04fa3cbdf6d5 100644 --- a/paddle/fluid/operators/warpctc_op.h +++ b/paddle/fluid/operators/warpctc_op.h @@ -162,7 +162,7 @@ class WarpCTCKernel : public framework::OpKernel { static_cast(sequence_width)}); warpctc_logits.mutable_data(warpctc_logits_dims, ctx.GetPlace()); math::PaddingLoDTensorFunctor()( - ctx.template device_context(), *logits, warpctc_logits, + ctx.template device_context(), *logits, &warpctc_logits, false); const T* warpctc_logits_data = warpctc_logits.data(); @@ -217,7 +217,7 @@ class WarpCTCGradKernel : public framework::OpKernel { logits_grad->mutable_data(ctx.GetPlace()); bool norm_by_times = ctx.Attr("norm_by_times"); math::UnpaddingLoDTensorFunctor()( - ctx.template device_context(), *logits_grad, + ctx.template device_context(), logits_grad, *warpctc_grad, norm_by_times); const T* loss_grad_data = loss_grad->data(); diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h new file mode 100644 index 0000000000000000000000000000000000000000..7cfeaab35b8c52225ff6e6cc2cdb8296621b30d9 --- /dev/null +++ b/paddle/fluid/platform/cuda_device_function.h @@ -0,0 +1,74 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include + +namespace paddle { +namespace platform { + +// __shfl_down and __shfl have been deprecated as of CUDA 9.0. 
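// (annotation, not part of the patch) Strategy of the block that follows: on
// toolkits older than 9.0 the *_sync intrinsics do not exist, so template shims
// forward __shfl_down_sync and __shfl_sync to the legacy __shfl_down/__shfl and
// the mask argument is ignored (CREATE_SHFL_MASK just yields 0u). On CUDA 9.0+
// the native intrinsics are used and CREATE_SHFL_MASK builds a real mask with
// __ballot_sync, so callers such as reduceSum can be written once against the
// new masked interface.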
+#if CUDA_VERSION < 9000 +template +__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) { + return __shfl_down(val, delta); +} + +template +__forceinline__ __device__ T __shfl_sync(unsigned, T val, int src_line, + int width) { + return __shfl(val, src_line, width); +} +#define CREATE_SHFL_MASK(mask, predicate) mask = 0u; +#else +#define FULL_WARP_MASK 0xFFFFFFFF +#define CREATE_SHFL_MASK(mask, predicate) \ + mask = __ballot_sync(FULL_WARP_MASK, (predicate)) +#endif + +template +__device__ T reduceSum(T val, int tid, int len) { + // NOTE(zcd): The warp size should be taken from the + // parameters of the GPU but not specified as 32 simply. + // To make the reduceSum more efficiently, + // I use Warp-Level Parallelism and assume the Warp size + // is 32 which may be different for different GPU, + // but most card's warp size is 32. + const int warpSize = 32; + __shared__ T shm[warpSize]; + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, tid < len); + + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += platform::__shfl_down_sync(mask, val, offset); + + if (tid < warpSize) shm[tid] = 0; + + if (tid % warpSize == 0) { + shm[tid / warpSize] = val; + } + __syncthreads(); + + CREATE_SHFL_MASK(mask, tid < warpSize); + + if (tid < warpSize) { + val = shm[tid]; + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += platform::__shfl_down_sync(mask, val, offset); + } + return val; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h index 46b97043ab3cf36498c34798ef63cefce2301333..8758af0804ae08fec6fa64d7387f197f046ce20e 100644 --- a/paddle/fluid/platform/cuda_primitives.h +++ b/paddle/fluid/platform/cuda_primitives.h @@ -66,22 +66,5 @@ CUDA_ATOMIC_WRAPPER(Add, double) { } #endif -// __shfl_down has been deprecated as of CUDA 9.0. -#if CUDA_VERSION < 9000 -template -__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) { - return __shfl_down(val, delta); -} -#define CREATE_SHFL_MASK(mask, predicate) mask = 0u; -#else -template -__forceinline__ __device__ T __shfl_down_sync(unsigned mask, T val, int delta) { - return __shfl_down(mask, val, delta); -} -#define FULL_WARP_MASK 0xFFFFFFFF -#define CREATE_SHFL_MASK(mask, predicate) \ - mask = __ballot_sync(FULL_WARP_MASK, (predicate)) -#endif - } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index b07427c8f6903e0100ca9a478881444d86501bcc..428d9ebceaabd987261c1dcd6e66faf044b702c0 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -18,7 +18,6 @@ limitations under the License. */ #include #include #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/profiler.pb.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index b20b514fcdd0b41fefa0933bc2d22645e7d4b6d6..c925686f8382da1758fb7cdc048253290ef69513 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -502,11 +502,11 @@ All parameter, weight, gradient are variables in Paddle. 
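// --- Aside on the warp reduction above (annotation, not part of the patch) ---
// reduceSum in cuda_device_function.h folds values inside a warp with
// __shfl_down_sync and then combines per-warp partial sums through shared
// memory. A minimal, self-contained kernel showing just the warp-level step in
// its native CUDA 9.0+ form; the kernel and its names are illustrative:
__global__ void WarpSum(const float* in, float* out) {
  float val = in[threadIdx.x];
  unsigned mask = __ballot_sync(0xFFFFFFFFu, threadIdx.x < 32);
  for (int offset = 16; offset > 0; offset /= 2) {
    val += __shfl_down_sync(mask, val, offset);  // fold upper lanes onto lower ones
  }
  if (threadIdx.x == 0) {
    *out = val;  // lane 0 now holds the sum of all 32 inputs
  }
}
// Launch as WarpSum<<<1, 32>>>(d_in, d_out). reduceSum calls the intrinsic as
// platform::__shfl_down_sync, so on pre-9.0 toolkits the shim above maps the
// same call back to the legacy __shfl_down.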
const std::unordered_set &bcast_vars, const ProgramDesc &main_program, const std::string &loss_var_name, Scope *scope, std::vector &local_scopes, - bool allow_op_delay, bool customize_loss_grad) { - new (&self) ParallelExecutor(num_threads, use_event, places, - params, bcast_vars, main_program, - loss_var_name, scope, local_scopes, - allow_op_delay, customize_loss_grad); + bool allow_op_delay, bool use_default_grad_scale) { + new (&self) ParallelExecutor( + num_threads, use_event, places, params, bcast_vars, + main_program, loss_var_name, scope, local_scopes, + allow_op_delay, use_default_grad_scale); }) .def("bcast_params", &ParallelExecutor::BCastParamsToGPUs) // NOTE: even we return a vec* to Python use reference policy. diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index dcd711a33ff3a35fdd51d11f54a3343a0bb491c9..93b09ed6922b32a5531224acc470daf0d97f95bd 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -107,7 +107,7 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) { return self.data()[offset]; } else { std::shared_ptr dst(new framework::Tensor); - framework::TensorCopy(self, platform::CPUPlace(), dst.get()); + framework::TensorCopySync(self, platform::CPUPlace(), dst.get()); return dst->data()[offset]; } } @@ -117,9 +117,9 @@ template void TensorSetElement(framework::Tensor *self, size_t offset, T elem) { if (platform::is_gpu_place(self->place())) { std::shared_ptr dst(new framework::Tensor); - framework::TensorCopy(*self, platform::CPUPlace(), dst.get()); + framework::TensorCopySync(*self, platform::CPUPlace(), dst.get()); dst->data()[offset] = elem; - framework::TensorCopy(*dst.get(), self->place(), self); + framework::TensorCopySync(*dst.get(), self->place(), self); } else if (platform::is_cpu_place(self->place())) { self->data()[offset] = elem; diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 53455fd86041f87d4050eba89f37c3435fed1ae1..1595cc9e8aad4d143ca62f84f812dbc791dc1d26 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -40,6 +40,7 @@ function print_usage() { ${BLUE}capi${NONE}: generate paddle CAPI package ${BLUE}fluid_inference_lib${NONE}: deploy fluid inference library ${BLUE}check_style${NONE}: run code style check + ${BLUE}cicheck${NONE}: run CI tasks " } @@ -453,6 +454,8 @@ function gen_capi_package() { } function gen_fluid_inference_lib() { + mkdir -p ${PADDLE_ROOT}/build + cd ${PADDLE_ROOT}/build if [ ${WITH_C_API:-OFF} == "OFF" ] ; then cat <void + reader: + parallel: True if use multi-CPUs or multi-GPUs + feed_order: Feeding order of reader. None will following the defining + order in program + + Returns: + + """ + if parallel: + raise NotImplementedError( + "Parallel Executor version of trainer is not implemented") + + self._train_by_executor(num_epochs, event_handler, reader, feed_order) def test(self, reader): pass + + def _get_scope_from_params(self, params): + """ + Get Scope from parameter object. + Args: + params(Parameter|None): The parameter object instance. Could be None. + + Returns: New scope if params is None. Or params.scope() + NOTE: This method is WIP. Not fully implemented. 
+ """ + if params is None: + return core.Scope() # new scope when params is None + else: + raise NotImplementedError("Not implemented right now.") + + @staticmethod + def _check_and_get_place(place): + """ + Check the type of place or get the default place + Args: + place(None|core.CUDAPlace|core.CPUPlace): the place that trainer will be executed on. + + Raises: + TypeError if the type mismatched. + + Returns: + the original place if it is not None. + if fluid is compiled with CUDA, returns CUDAPlace(0) by default. + Otherwise returns CPUPlace by default. + """ + if place is None: + if core.is_compiled_with_cuda(): + return core.CUDAPlace(0) + else: + return core.CPUPlace() + else: + if not isinstance(place, core.CUDAPlace) and not isinstance( + place, core.CPUPlace): + raise TypeError("Place should be either CUDAPlace or CPUPlace") + return place + + @contextlib.contextmanager + def _prog_and_scope_guard(self): + with framework.program_guard( + main_program=self.train_program, + startup_program=self.startup_program): + with executor.scope_guard(self.scope): + yield + + def _train_by_executor(self, num_epochs, event_handler, reader, feed_order): + """ + Train by Executor and single device. + + Args: + num_epochs: + event_handler: + reader: + feed_order: + + Returns: + + """ + with self._prog_and_scope_guard(): + exe = executor.Executor(self.place) + if feed_order is None: + feed_var_list = [ + var + for var in self.train_program.global_block( + ).vars.itervalues() + if hasattr(var, 'is_data') and var.is_data + ] + else: + feed_var_list = [ + self.train_program.global_block().var(var_name) + for var_name in feed_order + ] + + feeder = data_feeder.DataFeeder( + feed_list=feed_var_list, place=self.place) + for epoch_id in range(num_epochs): + event_handler(BeginEpochEvent(epoch_id)) + for step_id, data in enumerate(reader()): + event_handler(BeginStepEvent(epoch_id, step_id)) + exe.run(feed=feeder.feed(data), fetch_list=[]) + event_handler(EndStepEvent(epoch_id, step_id)) + event_handler(EndEpochEvent(epoch_id))