Commit a6edeb39 authored by Yu Yang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/clean_blas

@@ -32,7 +32,7 @@ RUN apt-get update && \
     automake locales clang-format swig doxygen cmake  \
     liblapack-dev liblapacke-dev \
     clang-3.8 llvm-3.8 libclang-3.8-dev \
-    net-tools libtool && \
+    net-tools libtool ccache && \
     apt-get clean -y

 # Install Go and glide
......
#!/bin/bash
# Update to point to the source file.
VGG_SRC="vgg16_fluid.py"
export TRAINING_ROLE=PSERVER
export TRAINERS=2
export POD_IP=127.0.0.1
export PADDLE_INIT_PORT=6174
MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 &
# Need to wait for the ps to start first.
sleep 10
echo "done start ps"
export TRAINING_ROLE=TRAINER
export TRAINERS=2
export POD_IP=127.0.0.1
export PADDLE_INIT_PORT=6174
CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 &
CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 &
@@ -200,18 +200,19 @@ def main():
                 num_samples += len(data)
                 train_pass_acc.add(value=acc, weight=b_size)
-                print(
-                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
-                    % (pass_id, iters, loss, acc,
-                       len(data) / (time.time() - ts))
+                print(
+                    "Task:%d Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
+                    "Speed = %.2f img/s " % (args.task_index, pass_id, iters,
+                                             loss, acc,
+                                             len(data) / (time.time() - ts))
                 )  # The accuracy is the accumulation of batches, but not the current batch.

             pass_elapsed = time.time() - start_time
             pass_train_acc = train_pass_acc.eval()
             pass_test_acc = test(exe)
-            print(
-                "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
-                % (pass_id, num_samples / pass_elapsed, pass_train_acc,
-                   pass_test_acc))
+            print("Task:%d Pass = %d, Training performance = %f imgs/s, "
+                  "Train accuracy = %f, Test accuracy = %f\n" %
+                  (args.task_index, pass_id, num_samples / pass_elapsed,
+                   pass_train_acc, pass_test_acc))

     if args.local:
         # Parameter initialization
@@ -239,8 +240,6 @@ def main():
         t = fluid.DistributeTranspiler()
         t.transpile(
-            optimize_ops,
-            params_grads,
             trainer_id=args.task_index,
             pservers=args.ps_hosts,
             trainers=trainers)
......
@@ -108,7 +108,7 @@ paddle_error paddle_matrix_get_row(paddle_matrix mat,
 paddle_error paddle_matrix_get_shape(paddle_matrix mat,
                                      uint64_t* height,
                                      uint64_t* width) {
-  if (mat == nullptr) return kPD_NULLPTR;
+  if (mat == nullptr || cast(mat)->mat == nullptr) return kPD_NULLPTR;
   if (height != nullptr) {
     *height = cast(mat)->mat->getHeight();
   }
......
@@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#ifndef HL_BASE_H_
-#define HL_BASE_H_
+#pragma once

 #include <cstddef>
@@ -207,8 +206,8 @@ typedef struct {

 #ifdef __NVCC__
-#include "cuda_runtime.h"
-#include "hl_cuda.h"
+#include <cuda_runtime.h>
+#include "paddle/cuda/include/hl_cuda.h"
 #include "paddle/utils/Logging.h"

 extern __thread bool g_sync_flag;
@@ -228,6 +227,19 @@ extern __thread cudaStream_t default_stream;
     << "CUDA error: " << hl_get_device_error_string((size_t)err); \
   }

-#endif /* __NVCC__ */
+// __shfl has been deprecated as of CUDA 9.0.
+#if CUDA_VERSION < 9000
+template <typename T>
+__forceinline__ __device__ T
+__shfl_sync(unsigned, T val, int src_line, int width) {
+  return __shfl(val, src_line, width);
+}

-#endif /* HL_BASE_H_ */
+#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
+#else
+#define FULL_WARP_MASK 0xFFFFFFFF
+#define CREATE_SHFL_MASK(mask, predicate) \
+  mask = __ballot_sync(FULL_WARP_MASK, (predicate))
+#endif
+
+#endif  // __NVCC__
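The block above gives every build a uniform mask story: on CUDA 9.0+, CREATE_SHFL_MASK ballots the active lanes, while older toolkits get a dummy mask and the 3-argument __shfl behind the 4-argument name. Below is a minimal sketch, not part of this commit, of how a caller is expected to use the pair; the kernel name, the include path, and the assumption of a full 32-lane warp are mine.

#include <cstdio>
#include <cuda_runtime.h>
#include "hl_base.h"  // assumed include path for the macros/shim above

// Butterfly warp all-reduce: after the loop every lane holds the warp sum.
__global__ void warp_sum_demo(float* out) {
  int lane = threadIdx.x % 32;
  float val = static_cast<float>(lane);

  unsigned mask = 0u;
  // CUDA >= 9.0: __ballot_sync over the predicate; CUDA < 9.0: stays 0u and
  // is ignored by the __shfl_sync compatibility shim.
  CREATE_SHFL_MASK(mask, lane < 32);

  for (int offset = 16; offset > 0; offset >>= 1) {
    val += __shfl_sync(mask, val, lane ^ offset, 32);
  }
  if (lane == 0) *out = val;  // 0 + 1 + ... + 31 = 496
}

int main() {
  float* d_out = nullptr;
  float h_out = 0.f;
  cudaMalloc(&d_out, sizeof(float));
  warp_sum_demo<<<1, 32>>>(d_out);
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("warp sum = %f\n", h_out);  // expect 496
  cudaFree(d_out);
  return 0;
}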
@@ -341,12 +341,15 @@ void hl_lstm_parallel_forward(real *gateValue,
 }

 __device__ __forceinline__ void transpose_32x32(real a[], const int idx) {
-  int addr = idx % 32;
+  const int warp_size = 32;
+  int addr = idx % warp_size;
+  unsigned mask = 0u;
+  CREATE_SHFL_MASK(mask, addr < warp_size);
 #pragma unroll
   for (int k = 1; k < 32; k++) {
     // rSrc[k] = __shfl_sync(rSrc[k], (threadIdx.x + k) % 32, 32);
-    addr = __shfl_sync(addr, (idx + 1) % 32, 32);
-    a[k] = __shfl_sync(a[k], addr, 32);
+    addr = __shfl_sync(mask, addr, (idx + 1) % 32, 32);
+    a[k] = __shfl_sync(mask, a[k], addr, 32);
   }

 #pragma unroll
@@ -360,10 +363,11 @@ __device__ __forceinline__ void transpose_32x32(real a[], const int idx) {
   }

   addr = (32 - idx) % 32;
+  CREATE_SHFL_MASK(mask, idx % 32 < warp_size);
 #pragma unroll
   for (int k = 0; k < 32; k++) {
-    a[k] = __shfl_sync(a[k], addr, 32);
-    addr = __shfl_sync(addr, (idx + 31) % 32, 32);
+    a[k] = __shfl_sync(mask, a[k], addr, 32);
+    addr = __shfl_sync(mask, addr, (idx + 31) % 32, 32);
   }
 }
......
@@ -244,13 +244,16 @@ __device__ __forceinline__ void blockReduce(Pair* shTopK,
     if (--beamSize == 0) break;
     __syncthreads();

+    unsigned mask = 0u;
+    // CREATE_SHFL_MASK(mask, tid < len);
+
     if (tid == maxId[0]) {
       if (beam < maxLength) {
         shTopK[tid] = topK[beam];
       }
     }
     if (maxId[0] / 32 == warp) {
-      if (__shfl_sync(beam, (maxId[0]) % 32, 32) == maxLength) break;
+      if (__shfl_sync(mask, beam, (maxId[0]) % 32, 32) == maxLength) break;
     }
   }
 }
......
@@ -34,7 +34,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
     const std::vector<platform::Place> &places,
     const std::string &loss_var_name,
     const std::unordered_set<std::string> &params,
-    const std::vector<Scope *> &local_scopes, bool skip_scale_loss,
+    const std::vector<Scope *> &local_scopes, bool use_default_grad_scale,
     platform::NCCLContextMap *nccl_ctxs)
     : loss_var_name_(loss_var_name),
       places_(places),
@@ -45,7 +45,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
     const std::vector<platform::Place> &places,
     const std::string &loss_var_name,
     const std::unordered_set<std::string> &params,
-    const std::vector<Scope *> &local_scopes, bool skip_scale_loss)
+    const std::vector<Scope *> &local_scopes, bool use_default_grad_scale)
     : loss_var_name_(loss_var_name),
       places_(places),
       local_scopes_(local_scopes) {
@@ -53,7 +53,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
   for (auto &p : params) {
     grad_names_.insert(GradVarName(p));
   }
-  skip_scale_loss_ = skip_scale_loss;
+  use_default_grad_scale_ = use_default_grad_scale;
 }

 void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
@@ -126,8 +126,8 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
     } else if (IsDistTrainOp(*op, send_op)) {
       CreateComputationalOps(&result, *op, 1);
     } else if (IsScaleLossOp(*op)) {
-      // user can customize loss@grad if skip_scale_loss_
-      if (!skip_scale_loss_) {
+      // user can customize loss@grad if not use_default_grad_scale_
+      if (use_default_grad_scale_) {
         CreateScaleLossGradOp(&result);
       }
       is_forwarding = false;
......
@@ -41,7 +41,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
                           const std::string &loss_var_name,
                           const std::unordered_set<std::string> &params,
                           const std::vector<Scope *> &local_scopes,
-                          bool skip_scale_loss);
+                          bool use_default_grad_scale);
 #endif

   std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
@@ -59,7 +59,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
 #ifdef PADDLE_WITH_CUDA
   platform::NCCLContextMap *nccl_ctxs_;
 #endif
-  bool skip_scale_loss_;
+  bool use_default_grad_scale_;

   bool IsScaleLossOp(const OpDesc &op) const;
......
@@ -255,11 +255,11 @@ TEST(LoDTensor, RecordIO) {
     std::unique_ptr<std::istream> stream_ptr(stream);
     recordio::Scanner scanner(std::move(stream_ptr));
     auto tensors = ReadFromRecordIO(&scanner, ctx);
-    ASSERT_EQ(tensors.size(), 2);
+    ASSERT_EQ(tensors.size(), static_cast<size_t>(2));
     assert_tensor_ok(tensors[0]);
     assert_tensor_ok(tensors[1]);

     tensors = ReadFromRecordIO(&scanner, ctx);
-    ASSERT_EQ(tensors.size(), 2);
+    ASSERT_EQ(tensors.size(), static_cast<size_t>(2));
     assert_tensor_ok(tensors[0]);
     assert_tensor_ok(tensors[1]);
   }
......
@@ -93,6 +93,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
   RunImpl(scope, place);
 }

+bool OperatorBase::HasInputs(const std::string& name) const {
+  if (inputs_.find(name) != inputs_.end()) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
 std::string OperatorBase::Input(const std::string& name) const {
   auto& ins = Inputs(name);
   PADDLE_ENFORCE_LE(ins.size(), 1UL,
@@ -109,6 +117,14 @@ const std::vector<std::string>& OperatorBase::Inputs(
   return it->second;
 }

+bool OperatorBase::HasOutputs(const std::string& name) const {
+  if (outputs_.find(name) != outputs_.end()) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
 std::string OperatorBase::Output(const std::string& name) const {
   auto& outs = Outputs(name);
   PADDLE_ENFORCE_LE(outs.size(), 1UL,
@@ -220,13 +236,18 @@ void OperatorBase::CheckAllInputOutputSet() const {
   if (op_info == nullptr || op_info->proto_ == nullptr) return;

   for (auto& in : op_info->Proto().inputs()) {
-    PADDLE_ENFORCE(inputs_.find(in.name()) != inputs_.end(),
-                   "Type %s's input %s is not set", Type(), in.name());
+    if (!in.dispensable()) {
+      PADDLE_ENFORCE(inputs_.find(in.name()) != inputs_.end(),
+                     "Operator %s's input, %s, is not set", Type(), in.name());
+    }
   }

   for (auto& out : op_info->Proto().outputs()) {
-    PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(),
-                   "Type %s's output %s is not set", Type(), out.name());
+    if (!out.dispensable()) {
+      PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(),
+                     "Operator %s's output, %s, is not set", Type(),
+                     out.name());
+    }
   }
 }
@@ -332,6 +353,9 @@ class RuntimeInferShapeContext : public InferShapeContext {
       : op_(op), scope_(scope) {}

   bool HasInput(const std::string& name) const override {
+    if (!op_.HasInputs(name)) {
+      return false;
+    }
     auto& ins = Inputs(name);
     size_t length = ins.size();
     if (length == 0) {
@@ -345,6 +369,9 @@ class RuntimeInferShapeContext : public InferShapeContext {
   }

   bool HasOutput(const std::string& name) const override {
+    if (!op_.HasOutputs(name)) {
+      return false;
+    }
     auto& outs = Outputs(name);
     size_t length = outs.size();
     if (length == 0) {
@@ -358,6 +385,9 @@ class RuntimeInferShapeContext : public InferShapeContext {
   }

   bool HasInputs(const std::string& name) const override {
+    if (!op_.HasInputs(name)) {
+      return false;
+    }
     auto inputs = op_.Inputs(name);
     if (inputs.empty()) {
       return false;
@@ -371,6 +401,9 @@ class RuntimeInferShapeContext : public InferShapeContext {
   }

   bool HasOutputs(const std::string& name) const override {
+    if (!op_.HasOutputs(name)) {
+      return false;
+    }
     auto outputs = op_.Outputs(name);
     if (outputs.empty()) {
       return false;
......
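The checks above now skip enforcement for proto inputs and outputs marked dispensable, and the HasInput/HasOutput overrides return false for names that were never bound instead of failing inside Inputs()/Outputs(). A simplified, standalone sketch of that guard pattern follows; ToyOp and its members are illustrative stand-ins, not Paddle classes.

#include <cassert>
#include <string>
#include <unordered_map>
#include <vector>

// Toy stand-in for the OperatorBase input map; "Bias" plays the role of a
// dispensable (optional) input the caller never bound.
struct ToyOp {
  std::unordered_map<std::string, std::vector<std::string>> inputs_;

  // Mirrors OperatorBase::HasInputs: a pure existence check, no throwing.
  bool HasInputs(const std::string& name) const {
    return inputs_.find(name) != inputs_.end();
  }

  // Mirrors RuntimeInferShapeContext::HasInput after this change: bail out
  // early when the slot is absent instead of letting the lookup fail.
  bool HasInput(const std::string& name) const {
    if (!HasInputs(name)) return false;
    const auto& ins = inputs_.at(name);
    return ins.size() == 1 && !ins[0].empty();
  }
};

int main() {
  ToyOp op;
  op.inputs_["X"] = {"x0"};
  assert(op.HasInput("X"));
  assert(!op.HasInput("Bias"));  // optional input left unset: false, no error
  return 0;
}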
@@ -105,6 +105,7 @@ class OperatorBase {
   const VariableNameMap& Inputs() const { return inputs_; }
   const VariableNameMap& Outputs() const { return outputs_; }

+  bool HasInputs(const std::string& name) const;
   //! Get a input with argument's name described in `op_proto`
   std::string Input(const std::string& name) const;
   //! Get a input which has multiple variables.
@@ -112,6 +113,7 @@ class OperatorBase {
   //! Get all inputs variable names
   std::vector<std::string> InputVars() const;

+  bool HasOutputs(const std::string& name) const;
   //! Get a output with argument's name described in `op_proto`
   std::string Output(const std::string& name) const;
   //! Get an output which has multiple variables.
......
@@ -58,7 +58,7 @@ ParallelExecutor::ParallelExecutor(
     const std::unordered_set<std::string> &bcast_vars,
     const ProgramDesc &main_program, const std::string &loss_var_name,
     Scope *scope, const std::vector<Scope *> &local_scopes, bool allow_op_delay,
-    bool customize_scale_loss)
+    bool use_default_grad_scale)
     : member_(new ParallelExecutorPrivate(places)) {
   member_->global_scope_ = scope;
@@ -93,11 +93,11 @@ ParallelExecutor::ParallelExecutor(
 #ifdef PADDLE_WITH_CUDA
   details::MultiDevSSAGraphBuilder builder(
       member_->places_, loss_var_name, params, member_->local_scopes_,
-      customize_scale_loss, member_->nccl_ctxs_.get());
+      use_default_grad_scale, member_->nccl_ctxs_.get());
 #else
   details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name,
                                            params, member_->local_scopes_,
-                                           customize_scale_loss);
+                                           use_default_grad_scale);
 #endif
   auto graph = builder.Build(main_program);
......
@@ -40,7 +40,7 @@ class ParallelExecutor {
                    const ProgramDesc& main_program,
                    const std::string& loss_var_name, Scope* scope,
                    const std::vector<Scope*>& local_scopes,
-                   bool allow_op_delay, bool customize_scale_loss);
+                   bool allow_op_delay, bool use_default_grad_scale);

   ~ParallelExecutor();
......
@@ -120,11 +120,11 @@ bool SelectedRows::HasKey(int64_t key) const {
              : true;
 }

-std::vector<int64_t> SelectedRows::Get(std::vector<int64_t> keys,
-                                       framework::Tensor* value) const {
+std::vector<std::pair<int64_t, int64_t>> SelectedRows::Get(
+    std::vector<int64_t> keys, framework::Tensor* value) const {
   PADDLE_ENFORCE(value->IsInitialized(),
                  "The value tensor should be initialized.");
-  std::vector<int64_t> non_keys;
+  std::vector<std::pair<int64_t, int64_t>> non_keys_pair;
   int64_t value_width = value_->numel() / value_->dims()[0];
   PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0],
                     "output tensor should have the same shape with table "
@@ -133,7 +133,7 @@ std::vector<int64_t> SelectedRows::Get(std::vector<int64_t> keys,
   for (size_t i = 0; i < keys.size(); ++i) {
     int64_t index = Index(keys[i]);
     if (index == -1) {
-      non_keys.push_back(keys[i]);
+      non_keys_pair.push_back(std::make_pair(keys[i], static_cast<int64_t>(i)));
     } else {
       framework::VisitDataType(
           framework::ToDataType(value_->type()),
@@ -141,7 +141,7 @@ std::vector<int64_t> SelectedRows::Get(std::vector<int64_t> keys,
                                      index * value_width, value_width));
     }
   }
-  return non_keys;
+  return non_keys_pair;
 }

 bool SelectedRows::Set(int64_t key, const framework::Tensor& value) {
......
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once

 #include <algorithm>
+#include <utility>
 #include <vector>

 #include "paddle/fluid/framework/lod_tensor.h"
@@ -78,10 +79,11 @@ class SelectedRows {
   /*
    * @brief Get value by the key list, if the
    *
-   * @return a list of keys which does not exists in table
+   * @return a list of pair which contains the non-exists key and the index in
+   * the value
    */
-  std::vector<int64_t> Get(std::vector<int64_t> keys,
-                           framework::Tensor* tensor) const;
+  std::vector<std::pair<int64_t, int64_t>> Get(std::vector<int64_t> keys,
+                                               framework::Tensor* value) const;

   /*
    * @brief Set a key-value pair into the table.
......
@@ -59,7 +59,7 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
   ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims());
 }

-TEST_F(SelectedRowsTester, Table) {
+TEST_F(SelectedRowsTester, SparseTable) {
   platform::CPUPlace cpu;
   SelectedRows table;
   // initialize a sparse table
@@ -87,11 +87,11 @@ TEST_F(SelectedRowsTester, Table) {
   framework::Tensor get_value;
   get_value.mutable_data<float>(framework::make_ddim({2, 100}), cpu);
   std::vector<int64_t> keys({non_key, key});
-  auto non_keys = table.Get(keys, &get_value);
+  auto non_key_pairs = table.Get(keys, &get_value);

   ASSERT_EQ(get_value.data<float>()[100], static_cast<float>(10));
-  ASSERT_EQ(non_keys.size(), static_cast<size_t>(1));
-  ASSERT_EQ(non_keys[0], non_key);
+  ASSERT_EQ(non_key_pairs.size(), static_cast<size_t>(1));
+  ASSERT_EQ(non_key_pairs[0].first, non_key);
 }

 }  // namespace framework
......
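A hedged usage sketch of the new Get signature: each returned pair carries a key missing from the table together with the row index of the output tensor that still needs to be filled. The table and tensor setup mirror the test above; fetch_row_from_pserver is a hypothetical helper, not something this commit provides.

// Sketch (not from this commit); assumes the includes/namespaces of
// selected_rows_test.cc above and a `table` populated as in the test.
void FillFromTable(const framework::SelectedRows& table) {
  framework::Tensor value;
  value.mutable_data<float>(framework::make_ddim({2, 100}),
                            platform::CPUPlace());

  std::vector<int64_t> keys{6, 10};        // 10 exists, 6 does not (as above)
  auto missing = table.Get(keys, &value);  // present rows are copied into value

  for (const auto& kv : missing) {
    int64_t key = kv.first;   // key absent from the table
    int64_t row = kv.second;  // row of `value` left unfilled for that key
    // e.g. fetch_row_from_pserver(key, value.data<float>() + row * 100);
    (void)key;
    (void)row;
  }
}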
@@ -65,7 +65,7 @@ class TensorRTEngine : public EngineBase {
   // Initialize the inference network, so that TensorRT layers can add to this
   // network.
   void InitNetwork() {
-    infer_builder_.reset(createInferBuilder(logger_));
+    infer_builder_.reset(createInferBuilder(&logger_));
     infer_network_.reset(infer_builder_->createNetwork());
   }
   // After finishing adding ops, freeze this network and creates the executation
......
@@ -46,13 +46,13 @@ const int kDataTypeSize[] = {
 // The following two API are implemented in TensorRT's header file, cannot load
 // from the dynamic library. So create our own implementation and directly
 // trigger the method from the dynamic library.
-static nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger& logger) {
+static nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger* logger) {
   return static_cast<nvinfer1::IBuilder*>(
-      dy::createInferBuilder_INTERNAL(&logger, NV_TENSORRT_VERSION));
+      dy::createInferBuilder_INTERNAL(logger, NV_TENSORRT_VERSION));
 }
-static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger& logger) {
+static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) {
   return static_cast<nvinfer1::IRuntime*>(
-      dy::createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION));
+      dy::createInferRuntime_INTERNAL(logger, NV_TENSORRT_VERSION));
 }

 // A logger for create TensorRT infer builder.
@@ -80,7 +80,7 @@ class NaiveLogger : public nvinfer1::ILogger {
     return *x;
   }

-  virtual ~NaiveLogger() override {}
+  ~NaiveLogger() override {}
 };

 }  // namespace tensorrt
......
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <cuda.h>
+#include <cuda_runtime_api.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 #include "NvInfer.h"
-#include "cuda.h"
-#include "cuda_runtime_api.h"
 #include "paddle/fluid/platform/dynload/tensorrt.h"

 namespace dy = paddle::platform::dynload;
@@ -43,7 +43,7 @@ class Logger : public nvinfer1::ILogger {

 class ScopedWeights {
  public:
-  ScopedWeights(float value) : value_(value) {
+  explicit ScopedWeights(float value) : value_(value) {
     w.type = nvinfer1::DataType::kFLOAT;
     w.values = &value_;
     w.count = 1;
@@ -58,13 +58,13 @@ class ScopedWeights {
 // The following two API are implemented in TensorRT's header file, cannot load
 // from the dynamic library. So create our own implementation and directly
 // trigger the method from the dynamic library.
-nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger& logger) {
+nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger* logger) {
   return static_cast<nvinfer1::IBuilder*>(
-      dy::createInferBuilder_INTERNAL(&logger, NV_TENSORRT_VERSION));
+      dy::createInferBuilder_INTERNAL(logger, NV_TENSORRT_VERSION));
 }
-nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger& logger) {
+nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) {
   return static_cast<nvinfer1::IRuntime*>(
-      dy::createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION));
+      dy::createInferRuntime_INTERNAL(logger, NV_TENSORRT_VERSION));
 }

 const char* kInputTensor = "input";
@@ -74,7 +74,7 @@ const char* kOutputTensor = "output";
 nvinfer1::IHostMemory* CreateNetwork() {
   Logger logger;
   // Create the engine.
-  nvinfer1::IBuilder* builder = createInferBuilder(logger);
+  nvinfer1::IBuilder* builder = createInferBuilder(&logger);
   ScopedWeights weights(2.);
   ScopedWeights bias(3.);
@@ -103,9 +103,9 @@ nvinfer1::IHostMemory* CreateNetwork() {
   return model;
 }

-void Execute(nvinfer1::IExecutionContext& context, const float* input,
+void Execute(nvinfer1::IExecutionContext* context, const float* input,
              float* output) {
-  const nvinfer1::ICudaEngine& engine = context.getEngine();
+  const nvinfer1::ICudaEngine& engine = context->getEngine();
   // Two binds, input and output
   ASSERT_EQ(engine.getNbBindings(), 2);
   const int input_index = engine.getBindingIndex(kInputTensor);
@@ -119,7 +119,7 @@ void Execute(nvinfer1::IExecutionContext& context, const float* input,
   // Copy the input to the GPU, execute the network, and copy the output back.
   ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], input, sizeof(float),
                                cudaMemcpyHostToDevice, stream));
-  context.enqueue(1, buffers, stream, nullptr);
+  context->enqueue(1, buffers, stream, nullptr);
   ASSERT_EQ(0, cudaMemcpyAsync(output, buffers[output_index], sizeof(float),
                                cudaMemcpyDeviceToHost, stream));
   cudaStreamSynchronize(stream);
@@ -136,7 +136,7 @@ TEST(TensorrtTest, BasicFunction) {

   // Use the model to create an engine and an execution context.
   Logger logger;
-  nvinfer1::IRuntime* runtime = createInferRuntime(logger);
+  nvinfer1::IRuntime* runtime = createInferRuntime(&logger);
   nvinfer1::ICudaEngine* engine =
       runtime->deserializeCudaEngine(model->data(), model->size(), nullptr);
   model->destroy();
@@ -145,7 +145,7 @@ TEST(TensorrtTest, BasicFunction) {
   // Execute the network.
   float input = 1234;
   float output;
-  Execute(*context, &input, &output);
+  Execute(context, &input, &output);
   EXPECT_EQ(output, input * 2 + 3);

   // Destroy the engine.
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "mkldnn.hpp"
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using paddle::platform::MKLDNNDeviceContext;
using paddle::platform::MKLDNNMemDesc;
using mkldnn::memory;
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
namespace {
template <typename T>
struct bn_type_traits {
using op_type = T;
using op_desc = typename op_type::desc;
using op_prim = typename op_type::primitive_desc;
};
template <typename T, typename Container>
void copy_to_weights(T scale_begin, T scale_end, T shift_begin, T shift_end,
Container *c) {
auto it = std::begin(*c);
std::copy(scale_begin, scale_end, std::inserter(*c, it));
std::copy(
shift_begin, shift_end,
std::inserter(*c, std::next(it, std::distance(scale_begin, scale_end))));
}
template <typename Op, typename... Args>
void run_batch_norm_op(Args &&... args) {
Op batch_norm_op{args...};
std::vector<mkldnn::primitive> pipeline;
pipeline.push_back(batch_norm_op);
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
template <typename T>
inline void *cast_const_to_void(const T *t) {
return static_cast<void *>(const_cast<T *>(t));
}
} // namespace
template <typename T>
class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto data_layout_str = ctx.Attr<std::string>("data_layout");
auto data_layout = framework::StringToDataLayout(data_layout_str);
PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW,
"MKLDNN batch normalization handles only NCHW data layout");
const float epsilon = ctx.Attr<float>("epsilon");
const float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test");
const auto *x = ctx.Input<Tensor>("X");
const auto *mean = ctx.Input<Tensor>("Mean");
const auto *variance = ctx.Input<Tensor>("Variance");
auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
auto mkldnn_engine = dev_ctx.GetEngine();
auto *y = ctx.Output<Tensor>("Y");
auto *mean_out = ctx.Output<Tensor>("MeanOut");
auto *variance_out = ctx.Output<Tensor>("VarianceOut");
auto *batch_mean = ctx.Output<Tensor>("SavedMean");
auto *batch_variance = ctx.Output<Tensor>("SavedVariance");
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *shift = ctx.Input<Tensor>("Bias");
y->mutable_data<T>(ctx.GetPlace());
mean_out->mutable_data<T>(ctx.GetPlace());
variance_out->mutable_data<T>(ctx.GetPlace());
if (!is_test) {
batch_mean->mutable_data<T>(ctx.GetPlace());
batch_variance->mutable_data<T>(ctx.GetPlace());
}
auto propagation = is_test == true ? mkldnn::prop_kind::forward_scoring
: mkldnn::prop_kind::forward_training;
auto dims = paddle::framework::vectorize2int(x->dims());
auto src_md =
MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
auto dst_md =
MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
auto src_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine};
auto dst_pd = mkldnn::memory::primitive_desc{dst_md, mkldnn_engine};
auto src = mkldnn::memory{src_pd, cast_const_to_void(x->data<T>())};
auto dst = mkldnn::memory{dst_pd, y->data<T>()};
unsigned flags = mkldnn::use_scale_shift;
if (is_test) flags |= mkldnn::use_global_stats;
using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
auto batch_norm_fwd_desc =
bn_fwd_types::op_desc{propagation, src_md, epsilon, flags};
auto batch_norm_fwd_pd =
bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine};
const unsigned int ic = dims[1];
// MKLDNN requires a single piece of memory for scale and shift/bias data
const size_t scaleshift_size = 2 * ic;
std::vector<T> scaleshift_data;
scaleshift_data.reserve(scaleshift_size);
copy_to_weights(scale->data<T>(), scale->data<T>() + ic, shift->data<T>(),
shift->data<T>() + ic, &scaleshift_data);
auto scaleshift_memory = mkldnn::memory{
batch_norm_fwd_pd.weights_primitive_desc(), scaleshift_data.data()};
if (is_test) {
auto mean_memory = mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(),
cast_const_to_void(mean->data<T>())};
auto variance_memory =
mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(),
cast_const_to_void(variance->data<T>())};
run_batch_norm_op<typename bn_fwd_types::op_type>(
batch_norm_fwd_pd, src, (const mkldnn::primitive::at &)mean_memory,
(const mkldnn::primitive::at &)variance_memory, scaleshift_memory,
dst);
} else {
auto mean_memory =
mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(),
cast_const_to_void(batch_mean->data<T>())};
auto variance_memory =
mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(),
cast_const_to_void(batch_variance->data<T>())};
run_batch_norm_op<bn_fwd_types::op_type>(batch_norm_fwd_pd, src,
scaleshift_memory, dst,
mean_memory, variance_memory);
}
if (!is_test) {
const unsigned int in = dims[0];
const unsigned int sample_size = x->numel() / in / ic;
// saved_xx is use just in this batch of data
EigenVectorArrayMap<T> saved_mean_e(
batch_mean->mutable_data<T>(ctx.GetPlace()), ic);
EigenVectorArrayMap<T> saved_variance_e(
batch_variance->mutable_data<T>(ctx.GetPlace()), ic);
saved_mean_e.setZero();
saved_variance_e.setZero();
const unsigned int x_arr_size = in * ic;
ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, x_arr_size);
for (unsigned int nc = 0; nc < x_arr_size; ++nc) {
saved_mean_e(nc % ic) += x_arr.col(nc).sum();
}
saved_mean_e /= in * sample_size;
for (unsigned int nc = 0; nc < x_arr_size; ++nc) {
saved_variance_e(nc % ic) +=
(x_arr.col(nc) - saved_mean_e(nc % ic)).matrix().squaredNorm();
}
saved_variance_e /= in * sample_size;
ConstEigenVectorArrayMap<T> mean_arr{mean->data<T>(), ic};
ConstEigenVectorArrayMap<T> variance_arr{variance->data<T>(), ic};
EigenVectorArrayMap<T> running_mean_arr(
mean_out->mutable_data<T>(ctx.GetPlace()), ic);
EigenVectorArrayMap<T> running_var_arr(
variance_out->mutable_data<T>(ctx.GetPlace()), ic);
auto one_minus_momentum = 1. - momentum;
running_mean_arr =
mean_arr * momentum + saved_mean_e * one_minus_momentum;
running_var_arr =
variance_arr * momentum + saved_variance_e * one_minus_momentum;
}
}
};
template <typename T>
class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext &ctx) const override {
auto data_layout_str = ctx.Attr<std::string>("data_layout");
auto data_layout = framework::StringToDataLayout(data_layout_str);
PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW,
"MKLDNN batch normalization handles only NCHW data layout");
auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
auto mkldnn_engine = dev_ctx.GetEngine();
const float epsilon = ctx.Attr<float>("epsilon");
const auto *x = ctx.Input<Tensor>("X");
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *shift = ctx.Input<Tensor>("Bias");
const auto *batch_mean = ctx.Input<Tensor>("SavedMean");
const auto *batch_variance = ctx.Input<Tensor>("SavedVariance");
const auto *diff_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
auto *diff_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *diff_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *diff_shift = ctx.Output<Tensor>(framework::GradVarName("Bias"));
diff_x->mutable_data<T>(ctx.GetPlace());
diff_scale->mutable_data<T>(ctx.GetPlace());
diff_shift->mutable_data<T>(ctx.GetPlace());
auto dims = paddle::framework::vectorize2int(x->dims());
unsigned flags = mkldnn::use_scale_shift | !mkldnn::use_global_stats;
auto src_md =
MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
auto dst_md =
MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
auto diff_src_md =
MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
auto diff_dst_md =
MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;
using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
auto batch_norm_fwd_desc = bn_fwd_types::op_desc{
mkldnn::prop_kind::forward_training, src_md, epsilon, flags};
auto batch_norm_fwd_pd =
bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine};
auto batch_norm_bwd_desc = bn_bwd_types::op_desc{
mkldnn::prop_kind::backward, diff_dst_md, dst_md, epsilon, flags};
auto batch_norm_bwd_pd = bn_bwd_types::op_prim{
batch_norm_bwd_desc, mkldnn_engine, batch_norm_fwd_pd};
auto src = mkldnn::memory{{src_md, mkldnn_engine},
cast_const_to_void(x->data<T>())};
auto mean = mkldnn::memory{batch_norm_bwd_pd.mean_primitive_desc(),
cast_const_to_void(batch_mean->data<T>())};
auto variance =
mkldnn::memory{batch_norm_bwd_pd.variance_primitive_desc(),
cast_const_to_void(batch_variance->data<T>())};
auto diff_dst = mkldnn::memory{{diff_dst_md, mkldnn_engine},
cast_const_to_void(diff_y->data<T>())};
const unsigned int ic = dims[1];
const size_t scaleshift_size = 2 * ic;
std::vector<T> scaleshift_data;
scaleshift_data.reserve(scaleshift_size);
copy_to_weights(scale->data<T>(), scale->data<T>() + ic, shift->data<T>(),
shift->data<T>() + ic, &scaleshift_data);
auto scaleshift_memory = mkldnn::memory{
batch_norm_bwd_pd.weights_primitive_desc(), scaleshift_data.data()};
std::vector<T> diff_scaleshift_data;
diff_scaleshift_data.reserve(scaleshift_size);
copy_to_weights(diff_scale->data<T>(), diff_scale->data<T>() + ic,
diff_shift->data<T>(), diff_shift->data<T>() + ic,
&diff_scaleshift_data);
auto diff_scaleshift_memory =
mkldnn::memory{batch_norm_bwd_pd.diff_weights_primitive_desc(),
diff_scaleshift_data.data()};
auto diff_src = mkldnn::memory{{diff_src_md, mkldnn_engine},
static_cast<void *>(diff_x->data<T>())};
run_batch_norm_op<bn_bwd_types::op_type>(
batch_norm_bwd_pd, src, mean, variance, diff_dst, scaleshift_memory,
diff_src, diff_scaleshift_memory);
auto it = std::begin(diff_scaleshift_data);
std::copy(it, std::next(it, ic), diff_scale->data<T>());
std::copy(std::next(it, ic), std::end(diff_scaleshift_data),
diff_shift->data<T>());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(batch_norm, MKLDNN, paddle::platform::CPUPlace,
ops::BatchNormMKLDNNOpKernel<float>);
REGISTER_OP_KERNEL(batch_norm_grad, MKLDNN, paddle::platform::CPUPlace,
ops::BatchNormMKLDNNGradOpKernel<float>);
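MKL-DNN consumes scale and shift as a single contiguous weights buffer, which is what copy_to_weights assembles before creating scaleshift_memory above. A standalone sketch of the resulting layout, using made-up channel values and plain std::vector rather than the MKL-DNN types:

#include <cassert>
#include <vector>

int main() {
  const int ic = 3;  // number of channels; value is illustrative
  std::vector<float> scale{1.f, 2.f, 3.f};
  std::vector<float> shift{0.1f, 0.2f, 0.3f};

  // Same layout copy_to_weights produces: scale for every channel first,
  // then shift for every channel, back-to-back in a buffer of size 2 * ic.
  std::vector<float> scaleshift;
  scaleshift.reserve(2 * ic);
  scaleshift.insert(scaleshift.end(), scale.begin(), scale.end());
  scaleshift.insert(scaleshift.end(), shift.begin(), shift.end());

  assert(scaleshift.size() == static_cast<size_t>(2 * ic));
  assert(scaleshift[0] == 1.f);    // scale of channel 0
  assert(scaleshift[ic] == 0.1f);  // shift of channel 0
  return 0;
}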
@@ -15,6 +15,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/batch_norm_op.h"
 #include <string>
 #include "paddle/fluid/framework/data_layout.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif

 namespace paddle {
 namespace operators {
@@ -87,9 +90,13 @@ class BatchNormOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext &ctx) const override {
     auto input_data_type =
         framework::ToDataType(ctx.Input<Tensor>("X")->type());
-    // For float or float16 input tensor, the type of the scale, bias, mean,
-    // and var tensors should both be float.
+    // By default, the type of the scale, bias, mean,
+    // and var tensors should both be float. (For float or float16 input tensor)
+    // or double (For double input tensor).
     auto bn_param_type = framework::proto::VarType::FP32;
+    if (input_data_type == framework::proto::VarType::FP64) {
+      bn_param_type = framework::proto::VarType::FP64;
+    }
     PADDLE_ENFORCE_EQ(bn_param_type,
                       framework::ToDataType(ctx.Input<Tensor>("Scale")->type()),
                       "Scale input should be of float type");
@@ -102,7 +109,18 @@ class BatchNormOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(bn_param_type, framework::ToDataType(
                                          ctx.Input<Tensor>("Variance")->type()),
                       "Variance input should be of float type");
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+#ifdef PADDLE_WITH_MKLDNN
+    if (library_ == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kMKLDNN;
+    }
+#endif
+    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
+                                   library_);
   }
 };
@@ -147,6 +165,9 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
               "Variance of the current mini batch, "
               "will apply to output when training")
         .AsIntermediate();
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
     AddComment(R"DOC(
 Batch Normalization.
@@ -345,8 +366,19 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
     if (t == nullptr) {
       PADDLE_THROW("can't find Y@GRAD");
     }
-    return framework::OpKernelType(framework::ToDataType(t->type()),
-                                   ctx.GetPlace());
+
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+#ifdef PADDLE_WITH_MKLDNN
+    if (library_ == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kMKLDNN;
+    }
+#endif
+    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
+        layout, library_);
   }
 };
@@ -470,6 +502,7 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
     op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
     op->SetInput("Scale", Input("Scale"));
+    op->SetInput("Bias", Input("Bias"));
     op->SetInput("SavedMean", Output("SavedMean"));
     op->SetInput("SavedVariance", Output("SavedVariance"));
@@ -492,8 +525,9 @@ REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
 REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp);
 REGISTER_OP_CPU_KERNEL(
-    batch_norm,
-    ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>);
+    batch_norm, ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::BatchNormKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     batch_norm_grad,
-    ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, double>);
@@ -287,6 +287,8 @@ namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
     batch_norm, ops::BatchNormKernel<plat::CUDADeviceContext, float>,
+    ops::BatchNormKernel<plat::CUDADeviceContext, double>,
     ops::BatchNormKernel<plat::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
-    batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>);
+    batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>,
+    ops::BatchNormGradKernel<plat::CUDADeviceContext, double>);
@@ -164,11 +164,13 @@ or not. But the output only shares the LoD information with input X.
 }  // namespace paddle

 namespace ops = paddle::operators;
+using CPUCtx = paddle::platform::CPUDeviceContext;
+
 REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp);
-REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<float>,
-                       ops::CrossEntropyOpKernel<double>);
+REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<CPUCtx, float>,
+                       ops::CrossEntropyOpKernel<CPUCtx, double>);
 REGISTER_OP_CPU_KERNEL(cross_entropy_grad,
-                       ops::CrossEntropyGradientOpKernel<float>,
-                       ops::CrossEntropyGradientOpKernel<double>);
+                       ops::CrossEntropyGradientOpKernel<CPUCtx, float>,
+                       ops::CrossEntropyGradientOpKernel<CPUCtx, double>);
@@ -14,98 +14,11 @@ limitations under the License. */

 #include "paddle/fluid/operators/cross_entropy_op.h"
namespace paddle {
namespace operators {
namespace {
template <typename T>
__global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
const int64_t* label, const int N,
const int D) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
i += blockDim.x * gridDim.x) {
int idx = i * D + label[i];
dX[idx] = -dY[i] / X[idx];
}
}
template <typename T>
__global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
const T* label, const int N,
const int D) {
int ids = blockIdx.x * blockDim.x + threadIdx.x;
if (ids < N * D) {
int row_ids = ids / D;
dX[ids] = -label[ids] * dY[row_ids] / X[ids];
}
}
} // namespace
template <typename T>
class CrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"This kernel only runs on GPU device.");
const Tensor* x = ctx.Input<Tensor>("X");
const Tensor* label = ctx.Input<Tensor>("Label");
Tensor* y = ctx.Output<Tensor>("Y");
y->mutable_data<T>(ctx.GetPlace());
math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()(
ctx.template device_context<platform::CUDADeviceContext>(), y, x, label,
ctx.Attr<bool>("soft_label"));
}
};
template <typename T>
class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"This kernel only runs on GPU device.");
const Tensor* x = ctx.Input<Tensor>("X");
const Tensor* label = ctx.Input<Tensor>("Label");
Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
const T* dy_data =
ctx.Input<Tensor>(framework::GradVarName("Y"))->data<T>();
T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
const T* x_data = x->data<T>();
int64_t batch_size = x->dims()[0];
int64_t class_num = x->dims()[1];
int block = 512;
int grid = (batch_size * class_num + block - 1) / block;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto stream = dev_ctx.stream();
if (ctx.Attr<bool>("soft_label")) {
auto* label_data = label->data<T>();
SoftCrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
dx_data, dy_data, x_data, label_data, batch_size, class_num);
} else {
math::SetConstant<platform::CUDADeviceContext, T> functor;
functor(dev_ctx, dx, 0);
auto* label_data = label->data<int64_t>();
grid = (batch_size + block - 1) / block;
CrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
dx_data, dy_data, x_data, label_data, batch_size, class_num);
}
}
};
} // namespace operators
} // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel<float>,
-                        ops::CrossEntropyOpCUDAKernel<double>);
+using CUDACtx = paddle::platform::CUDADeviceContext;
+REGISTER_OP_CUDA_KERNEL(cross_entropy,
+                        ops::CrossEntropyOpKernel<CUDACtx, float>,
+                        ops::CrossEntropyOpKernel<CUDACtx, double>);
 REGISTER_OP_CUDA_KERNEL(cross_entropy_grad,
-                        ops::CrossEntropyGradientOpCUDAKernel<float>,
-                        ops::CrossEntropyGradientOpCUDAKernel<double>);
+                        ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
+                        ops::CrossEntropyGradientOpKernel<CUDACtx, double>);
@@ -17,69 +17,106 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/cross_entropy.h"
 #include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/for_range.h"

 namespace paddle {
 namespace operators {

 using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;

-template <typename T>
+template <typename DeviceContext, typename T>
 class CrossEntropyOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
-                   "This kernel only runs on CPU.");
-    const Tensor* x = ctx.Input<Tensor>("X");
-    const Tensor* labels = ctx.Input<Tensor>("Label");
-    Tensor* y = ctx.Output<Tensor>("Y");
+    auto* x = ctx.Input<Tensor>("X");
+    auto* labels = ctx.Input<Tensor>("Label");
+    auto* y = ctx.Output<Tensor>("Y");
     y->mutable_data<T>(ctx.GetPlace());

-    math::CrossEntropyFunctor<platform::CPUDeviceContext, T>()(
-        ctx.template device_context<platform::CPUDeviceContext>(), y, x, labels,
+    math::CrossEntropyFunctor<DeviceContext, T>()(
+        ctx.template device_context<DeviceContext>(), y, x, labels,
         ctx.Attr<bool>("soft_label"));
   }
 };

 template <typename T>
class XeSoftlabelGradFunctor {
public:
XeSoftlabelGradFunctor(T* dx,
const T* dy, // NOLINT
const T* x, // NOLINT
const T* label, // NOLINT
size_t num_classes)
: dx_(dx), dy_(dy), x_(x), label_(label), num_classes_(num_classes) {}
HOSTDEVICE void operator()(size_t i) {
auto row_ids = i / num_classes_;
dx_[i] = -label_[i] * dy_[row_ids] / x_[i];
}
private:
T* dx_;
const T* dy_;
const T* x_;
const T* label_;
size_t num_classes_;
};
template <typename T>
class XeGradFunctor {
public:
XeGradFunctor(T* dx,
const T* dy, // NOLINT
const T* x, // NOLINT
const int64_t* label, // NOLINT
size_t num_classes)
: dx_(dx), dy_(dy), x_(x), label_(label), num_classes_(num_classes) {}
HOSTDEVICE void operator()(size_t sample_id) {
auto x_is_true_offset = sample_id * num_classes_ + label_[sample_id];
for (size_t x_offset = sample_id * num_classes_;
x_offset < (sample_id + 1) * num_classes_; ++x_offset) {
dx_[x_offset] = x_offset != x_is_true_offset
? static_cast<T>(0)
: -dy_[sample_id] / x_[x_offset];
}
}
private:
T* dx_;
const T* dy_;
const T* x_;
const int64_t* label_;
size_t num_classes_;
};
template <typename DeviceContext, typename T>
class CrossEntropyGradientOpKernel : public framework::OpKernel<T> { class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), auto* x = ctx.Input<Tensor>("X");
"This kernel only runs on CPU."); auto* dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
const Tensor* x = ctx.Input<Tensor>("X"); auto* label = ctx.Input<Tensor>("Label");
const Tensor* dy = ctx.Input<Tensor>(framework::GradVarName("Y")); auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
const Tensor* label = ctx.Input<Tensor>("Label"); auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
int64_t class_num = x->dims()[1]; int64_t class_num = x->dims()[1];
if (ctx.Attr<bool>("soft_label")) { if (ctx.Attr<bool>("soft_label")) {
auto x_mat = EigenMatrix<T>::From(*x); XeSoftlabelGradFunctor<T> functor(dx_data, dy->data<T>(), x->data<T>(),
auto dy_mat = EigenMatrix<T>::From(*dy); label->data<T>(),
auto lbl_mat = EigenMatrix<T>::From(*label); static_cast<size_t>(class_num));
auto dx_mat = EigenMatrix<T>::From(*dx); platform::ForRange<DeviceContext> for_range(
ctx.template device_context<DeviceContext>(),
dx_mat.device(*ctx.template device_context<platform::CPUDeviceContext>() static_cast<size_t>(dx->numel()));
.eigen_device()) = for_range(functor);
-(lbl_mat *
dy_mat.broadcast(Eigen::DSizes<int64_t, 2>(1, class_num)) / x_mat);
} else { } else {
int64_t batch_size = x->dims()[0]; XeGradFunctor<T> functor(dx_data, dy->data<T>(), x->data<T>(),
const T* dy_data = dy->data<T>(); label->data<int64_t>(),
const T* x_data = x->data<T>(); static_cast<size_t>(class_num));
const int64_t* label_data = label->data<int64_t>(); platform::ForRange<DeviceContext> for_range(
ctx.template device_context<DeviceContext>(),
math::SetConstant<platform::CPUDeviceContext, T> functor; static_cast<size_t>(dy->numel()));
functor(ctx.template device_context<platform::CPUDeviceContext>(), dx, 0); for_range(functor);
for (int64_t i = 0; i < batch_size; ++i) {
PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num);
int64_t index = i * class_num + label_data[i];
dx_data[index] = math::TolerableValue<T>()(-dy_data[i] / x_data[index]);
}
} }
} }
}; };
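For reference, the two functors above simply spell out the analytic gradient of the cross-entropy loss element by element; a short sketch of the math they encode, using the same symbols as the code (x for the predicted probabilities, dy for the upstream gradient):

  Soft labels:  y_i = -\sum_j \mathrm{label}_{ij} \log x_{ij}
                \;\Rightarrow\; \partial y_i / \partial x_{ij} = -\mathrm{label}_{ij} / x_{ij},
  so XeSoftlabelGradFunctor writes dx_{ij} = -\mathrm{label}_{ij}\, dy_i / x_{ij}.

  Hard labels:  y_i = -\log x_{i, l_i}
                \;\Rightarrow\; \partial y_i / \partial x_{ij} = -[j = l_i] / x_{ij},
  so XeGradFunctor writes dx_{ij} = -dy_i / x_{ij} at j = l_i and 0 elsewhere.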
......
...@@ -82,7 +82,9 @@ class RequestSend final : public RequestBase { ...@@ -82,7 +82,9 @@ class RequestSend final : public RequestBase {
virtual std::string GetReqName() { return request_->Varname(); } virtual std::string GetReqName() { return request_->Varname(); }
virtual void Process() { virtual void Process() {
queue_->Push(std::make_pair(request_->Varname(), request_)); std::string var_name = GetReqName();
VLOG(3) << "RequestSend " << var_name;
queue_->Push(std::make_pair(var_name, request_));
sendrecv::VoidMessage reply; sendrecv::VoidMessage reply;
responder_.Finish(reply, ::grpc::Status::OK, this); responder_.Finish(reply, ::grpc::Status::OK, this);
...@@ -106,7 +108,7 @@ class RequestGet final : public RequestBase { ...@@ -106,7 +108,7 @@ class RequestGet final : public RequestBase {
responder_(&ctx_), responder_(&ctx_),
scope_(scope), scope_(scope),
queue_(queue) { queue_(queue) {
int method_id = static_cast<int>(detail::GrpcMethod::kGetVariable); auto method_id = static_cast<int>(detail::GrpcMethod::kGetVariable);
service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_, service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_,
cq_, this); cq_, this);
} }
...@@ -118,6 +120,7 @@ class RequestGet final : public RequestBase { ...@@ -118,6 +120,7 @@ class RequestGet final : public RequestBase {
virtual void Process() { virtual void Process() {
// proc request. // proc request.
std::string var_name = request_.varname(); std::string var_name = request_.varname();
VLOG(3) << "RequestGet " << var_name;
auto* var = scope_->FindVar(var_name); auto* var = scope_->FindVar(var_name);
::grpc::ByteBuffer reply; ::grpc::ByteBuffer reply;
...@@ -176,7 +179,7 @@ class RequestPrefetch final : public RequestBase { ...@@ -176,7 +179,7 @@ class RequestPrefetch final : public RequestBase {
::grpc::ByteBuffer reply; ::grpc::ByteBuffer reply;
std::string var_name = request_->OutVarname(); std::string var_name = request_->OutVarname();
VLOG(3) << "prefetch var " << var_name; VLOG(3) << "RequestPrefetch " << var_name;
auto var_desc = program_->Block(0).FindVar(var_name); auto var_desc = program_->Block(0).FindVar(var_name);
framework::Scope* local_scope = &scope_->NewScope(); framework::Scope* local_scope = &scope_->NewScope();
auto* var = local_scope->FindVar(var_name); auto* var = local_scope->FindVar(var_name);
...@@ -208,6 +211,11 @@ void AsyncGRPCServer::WaitClientGet(int count) { ...@@ -208,6 +211,11 @@ void AsyncGRPCServer::WaitClientGet(int count) {
} }
} }
void AsyncGRPCServer::WaitServerReady() {
std::unique_lock<std::mutex> lock(this->mutex_ready_);
condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
}
void AsyncGRPCServer::RunSyncUpdate() { void AsyncGRPCServer::RunSyncUpdate() {
::grpc::ServerBuilder builder; ::grpc::ServerBuilder builder;
builder.AddListeningPort(address_, ::grpc::InsecureServerCredentials(), builder.AddListeningPort(address_, ::grpc::InsecureServerCredentials(),
...@@ -241,6 +249,12 @@ void AsyncGRPCServer::RunSyncUpdate() { ...@@ -241,6 +249,12 @@ void AsyncGRPCServer::RunSyncUpdate() {
t_prefetch_.reset(new std::thread( t_prefetch_.reset(new std::thread(
std::bind(&AsyncGRPCServer::HandleRequest, this, cq_prefetch_.get(), std::bind(&AsyncGRPCServer::HandleRequest, this, cq_prefetch_.get(),
"cq_prefetch", prefetch_register))); "cq_prefetch", prefetch_register)));
{
std::lock_guard<std::mutex> lock(this->mutex_ready_);
ready_ = 1;
}
condition_ready_.notify_all();
// wait server // wait server
server_->Wait(); server_->Wait();
t_send_->join(); t_send_->join();
...@@ -307,18 +321,20 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq, ...@@ -307,18 +321,20 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
bool ok = false; bool ok = false;
while (true) { while (true) {
VLOG(3) << "HandleRequest for " << cq_name << " while in"; VLOG(3) << "HandleRequest for " << cq_name << " wait Next";
if (!cq->Next(&tag, &ok)) { if (!cq->Next(&tag, &ok)) {
LOG(INFO) << cq_name << " CompletionQueue shutdown!"; LOG(INFO) << cq_name << " CompletionQueue shutdown!";
break; break;
} }
VLOG(3) << "HandleRequest for " << cq_name << " while after Next"; VLOG(3) << "HandleRequest for " << cq_name << " get Next";
PADDLE_ENFORCE(tag); PADDLE_ENFORCE(tag);
if (sync_mode_) { if (sync_mode_) {
// FIXME(typhoonzero): de-couple the barriers with recv_op // FIXME(typhoonzero): de-couple the barriers with recv_op
if (!is_shut_down_ && cq_name == "cq_get") WaitCond(1); if (!is_shut_down_ && cq_name == "cq_get") WaitCond(1);
if (!is_shut_down_ && cq_name == "cq_send") WaitCond(0); if (!is_shut_down_ && cq_name == "cq_send") WaitCond(0);
VLOG(3) << "HandleRequest for " << cq_name << " after WaitCond";
} }
RequestBase* base = reinterpret_cast<RequestBase*>(tag); RequestBase* base = reinterpret_cast<RequestBase*>(tag);
...@@ -336,9 +352,9 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq, ...@@ -336,9 +352,9 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
switch (base->Status()) { switch (base->Status()) {
case PROCESS: { case PROCESS: {
VLOG(4) << cq_name << " PROCESS status:" << base->Status();
TryToRegisterNewOne(); TryToRegisterNewOne();
base->Process(); base->Process();
VLOG(4) << cq_name << " PROCESS status:" << base->Status();
break; break;
} }
case FINISH: { case FINISH: {
......
...@@ -45,8 +45,9 @@ class RequestBase; ...@@ -45,8 +45,9 @@ class RequestBase;
class AsyncGRPCServer final { class AsyncGRPCServer final {
public: public:
explicit AsyncGRPCServer(const std::string &address, bool sync_mode) explicit AsyncGRPCServer(const std::string &address, bool sync_mode)
: address_(address), sync_mode_(sync_mode) {} : address_(address), sync_mode_(sync_mode), ready_(0) {}
void WaitServerReady();
void RunSyncUpdate(); void RunSyncUpdate();
// functions to sync server barrier status. // functions to sync server barrier status.
...@@ -118,6 +119,10 @@ class AsyncGRPCServer final { ...@@ -118,6 +119,10 @@ class AsyncGRPCServer final {
framework::ProgramDesc *program_; framework::ProgramDesc *program_;
framework::Executor *executor_; framework::Executor *executor_;
int selected_port_; int selected_port_;
std::mutex mutex_ready_;
std::condition_variable condition_ready_;
int ready_;
}; };
}; // namespace detail }; // namespace detail
......
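The new ready_/mutex_ready_/condition_ready_ members and WaitServerReady() replace the fixed sleep in listen_and_serv_op with an explicit readiness handshake. A minimal standalone sketch of that pattern, with a hypothetical ReadyFlag class that is not part of the Paddle code:

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

// Hypothetical illustration of the readiness handshake; not Paddle code.
class ReadyFlag {
 public:
  // Called by the server thread once the port is bound and queues are running.
  void MarkReady() {
    {
      std::lock_guard<std::mutex> lock(mutex_);
      ready_ = 1;
    }
    condition_.notify_all();
  }
  // Called by the launching thread; blocks until MarkReady() has run.
  void Wait() {
    std::unique_lock<std::mutex> lock(mutex_);
    condition_.wait(lock, [this] { return ready_ == 1; });
  }

 private:
  std::mutex mutex_;
  std::condition_variable condition_;
  int ready_ = 0;
};

int main() {
  ReadyFlag flag;
  std::thread server([&flag] {
    // ... build the gRPC server, start the completion-queue threads ...
    flag.MarkReady();
  });
  flag.Wait();  // returns only after the server thread signalled readiness
  std::cout << "server is ready" << std::endl;
  server.join();
  return 0;
}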
...@@ -108,7 +108,7 @@ void RunSerdeTestSelectedRows(platform::Place place) { ...@@ -108,7 +108,7 @@ void RunSerdeTestSelectedRows(platform::Place place) {
EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); EXPECT_FLOAT_EQ(tensor_data2[i], 32.7);
} }
for (size_t i = 0; i < rows2->size(); ++i) { for (size_t i = 0; i < rows2->size(); ++i) {
EXPECT_EQ(rows_data2[i], i); EXPECT_EQ(rows_data2[i], static_cast<int64_t>(i));
} }
EXPECT_EQ(slr2->height(), 1000); EXPECT_EQ(slr2->height(), 1000);
} }
......
...@@ -22,6 +22,7 @@ limitations under the License. */ ...@@ -22,6 +22,7 @@ limitations under the License. */
#ifdef __NVCC__ #ifdef __NVCC__
#include <cuda.h> #include <cuda.h>
#include <thrust/iterator/iterator_adaptor.h> #include <thrust/iterator/iterator_adaptor.h>
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
#endif #endif
...@@ -336,43 +337,6 @@ static void ElemwiseGradBroadcast1CPU(const T* x, const T* y, const T* out, ...@@ -336,43 +337,6 @@ static void ElemwiseGradBroadcast1CPU(const T* x, const T* y, const T* out,
} }
#ifdef __NVCC__ #ifdef __NVCC__
template <typename T>
__device__ T reduceSum(T val, int tid, int len) {
// NOTE(zcd): The warp size should be taken from the
// parameters of the GPU but not specified as 32 simply.
// To make the reduceSum more efficiently,
// I use Warp-Level Parallelism and assume the Warp size
// is 32 which may be different for different GPU,
// but most card's warp size is 32.
const int warpSize = 32;
__shared__ T shm[warpSize];
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, tid < len);
for (int offset = warpSize / 2; offset > 0; offset /= 2)
val += platform::__shfl_down_sync(mask, val, offset);
if (tid < warpSize) shm[tid] = 0;
__syncthreads();
if (tid % warpSize == 0) {
shm[tid / warpSize] = val;
}
__syncthreads();
CREATE_SHFL_MASK(mask, tid < warpSize);
if (tid < warpSize) {
val = shm[tid];
for (int offset = warpSize / 2; offset > 0; offset /= 2)
val += platform::__shfl_down_sync(mask, val, offset);
}
return val;
}
template <typename T, typename DX_OP, typename DY_OP> template <typename T, typename DX_OP, typename DY_OP>
static __global__ void ElemwiseGradBroadcast1CUDAKernel( static __global__ void ElemwiseGradBroadcast1CUDAKernel(
const T* x, const T* y, const T* out, const T* dout, int h, int w, const T* x, const T* y, const T* out, const T* dout, int h, int w,
...@@ -395,7 +359,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel( ...@@ -395,7 +359,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
if (dy) { if (dy) {
h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
val = reduceSum(val, tid, h); val = paddle::platform::reduceSum(val, tid, h);
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
dy[j] = val; dy[j] = val;
} }
...@@ -472,7 +436,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel( ...@@ -472,7 +436,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
if (dy) { if (dy) {
int h = pre * post; int h = pre * post;
h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
val = reduceSum(val, tid, h); val = paddle::platform::reduceSum(val, tid, h);
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
dy[j] = val; dy[j] = val;
} }
......
...@@ -45,20 +45,6 @@ static void split(const std::string &str, char sep, ...@@ -45,20 +45,6 @@ static void split(const std::string &str, char sep,
} }
} }
static void AsyncExecuteBlock(framework::Executor *executor,
framework::ExecutorPrepareContext *prepared,
framework::Scope *scope) {
std::future<void> future = framework::Async([&executor, &prepared, &scope]() {
try {
executor->RunPreparedContext(prepared, scope, false, false);
} catch (std::exception &e) {
LOG(ERROR) << "run sub program error " << e.what();
}
});
// TODO(qiao) maybe we can remove this
future.wait();
}
static void ParallelExecuteBlocks( static void ParallelExecuteBlocks(
const std::vector<size_t> &parallel_blkids, framework::Executor *executor, const std::vector<size_t> &parallel_blkids, framework::Executor *executor,
const std::vector<std::shared_ptr<framework::ExecutorPrepareContext>> const std::vector<std::shared_ptr<framework::ExecutorPrepareContext>>
...@@ -80,12 +66,7 @@ static void ParallelExecuteBlocks( ...@@ -80,12 +66,7 @@ static void ParallelExecuteBlocks(
for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
} }
static void SavePort(std::shared_ptr<detail::AsyncGRPCServer> rpc_service) { std::atomic_int ListenAndServOp::selected_port_{0};
std::ofstream port_file;
port_file.open("/tmp/paddle.selected_port");
port_file << rpc_service->GetSelectedPort();
port_file.close();
}
ListenAndServOp::ListenAndServOp(const std::string &type, ListenAndServOp::ListenAndServOp(const std::string &type,
const framework::VariableNameMap &inputs, const framework::VariableNameMap &inputs,
...@@ -93,15 +74,27 @@ ListenAndServOp::ListenAndServOp(const std::string &type, ...@@ -93,15 +74,27 @@ ListenAndServOp::ListenAndServOp(const std::string &type,
const framework::AttributeMap &attrs) const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
int ListenAndServOp::GetSelectedPort() const {
return rpc_service_->GetSelectedPort();
}
void ListenAndServOp::Stop() { void ListenAndServOp::Stop() {
rpc_service_->Push(LISTEN_TERMINATE_MESSAGE); rpc_service_->Push(LISTEN_TERMINATE_MESSAGE);
server_thread_->join(); server_thread_->join();
} }
void ListenAndServOp::SavePort(const std::string &file_path) const {
// NOTE: default write file to /tmp/paddle.selected_port
selected_port_ = rpc_service_->GetSelectedPort();
std::ofstream port_file;
port_file.open(file_path);
port_file << selected_port_.load();
port_file.close();
VLOG(4) << "selected port written to " << file_path;
}
void ListenAndServOp::WaitServerReady() {
while (selected_port_.load() == 0) {
}
}
void ListenAndServOp::RunSyncLoop(framework::Executor *executor, void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
framework::ProgramDesc *program, framework::ProgramDesc *program,
framework::Scope *recv_scope, framework::Scope *recv_scope,
...@@ -201,14 +194,40 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor, ...@@ -201,14 +194,40 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
} // while(true) } // while(true)
} }
static void AsyncUpdateThread(
const std::string &var_name, const bool &exit_flag,
const std::shared_ptr<detail::ReceivedQueue> &queue,
framework::Executor *executor,
framework::ExecutorPrepareContext *prepared) {
VLOG(3) << "update thread for " << var_name << " started";
while (!exit_flag) {
const detail::ReceivedMessage v = queue->Pop();
auto recv_var_name = v.first;
auto var = v.second->GetVar();
if (var == nullptr) {
LOG(ERROR) << "Can not find server side var: " << recv_var_name;
PADDLE_THROW("Can not find server side var");
}
auto fs = framework::Async([var_name, &executor, &v, prepared] {
try {
executor->RunPreparedContext(prepared, v.second->GetMutableLocalScope(),
false, false);
} catch (std::exception &e) {
LOG(ERROR) << "run sub program error " << e.what();
}
});
fs.wait();
}
}
void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
framework::ProgramDesc *program, framework::ProgramDesc *program) const {
framework::Scope *recv_scope,
framework::BlockDesc *prefetch_block) const {
VLOG(3) << "RunAsyncLoop in"; VLOG(3) << "RunAsyncLoop in";
// grad name to block id // grad name to block id
std::unordered_map<std::string, int32_t> grad_to_block_id; std::unordered_map<std::string, int32_t> grad_to_block_id;
std::unordered_map<int32_t, std::string> id_to_grad; std::unordered_map<int32_t, std::string> id_to_grad;
std::unordered_map<std::string, std::shared_ptr<detail::ReceivedQueue>>
grad_to_queue;
auto grad_to_block_id_str = auto grad_to_block_id_str =
Attr<std::vector<std::string>>("grad_to_block_id"); Attr<std::vector<std::string>>("grad_to_block_id");
...@@ -220,6 +239,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, ...@@ -220,6 +239,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
PADDLE_ENFORCE_EQ(grad_to_block_id.count(pieces[0]), 0); PADDLE_ENFORCE_EQ(grad_to_block_id.count(pieces[0]), 0);
int block_id = std::stoi(pieces[1]); int block_id = std::stoi(pieces[1]);
grad_to_block_id[pieces[0]] = block_id; grad_to_block_id[pieces[0]] = block_id;
grad_to_queue[pieces[0]] = std::make_shared<detail::ReceivedQueue>();
id_to_grad[block_id] = pieces[0]; id_to_grad[block_id] = pieces[0];
} }
size_t num_blocks = program->Size(); size_t num_blocks = program->Size();
...@@ -238,8 +258,21 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, ...@@ -238,8 +258,21 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
grad_to_prepared_ctx[id_to_grad[block_list[i]]] = optimize_prepared[i]; grad_to_prepared_ctx[id_to_grad[block_list[i]]] = optimize_prepared[i];
} }
VLOG(3) << "RunAsyncLoop into while";
bool exit_flag = false; bool exit_flag = false;
VLOG(3) << "start async optimize threads";
std::vector<std::future<void>> fs;
for (auto iter = grad_to_queue.begin(); iter != grad_to_queue.end(); iter++) {
std::string grad_name = iter->first;
VLOG(3) << "create async update thread for " << grad_name;
fs.push_back(framework::AsyncIO([grad_name, &exit_flag, &executor,
&grad_to_queue, &grad_to_prepared_ctx]() {
AsyncUpdateThread(grad_name, exit_flag, grad_to_queue[grad_name],
executor, grad_to_prepared_ctx[grad_name].get());
}));
}
VLOG(3) << "RunAsyncLoop into while";
while (!exit_flag) { while (!exit_flag) {
const detail::ReceivedMessage v = rpc_service_->Get(); const detail::ReceivedMessage v = rpc_service_->Get();
auto recv_var_name = v.first; auto recv_var_name = v.first;
...@@ -249,13 +282,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, ...@@ -249,13 +282,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
break; break;
} else { } else {
VLOG(3) << "received grad: " << recv_var_name; VLOG(3) << "received grad: " << recv_var_name;
auto var = v.second->GetVar(); grad_to_queue[recv_var_name]->Push(v);
if (var == nullptr) {
LOG(ERROR) << "Can not find server side var: " << recv_var_name;
PADDLE_THROW("Can not find server side var");
}
AsyncExecuteBlock(executor, grad_to_prepared_ctx[recv_var_name].get(),
v.second->GetMutableLocalScope());
} }
if (exit_flag) { if (exit_flag) {
...@@ -298,13 +325,17 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -298,13 +325,17 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
// start the server listening after all member initialized. // start the server listening after all member initialized.
server_thread_.reset(new std::thread(RunServer, rpc_service_)); server_thread_.reset(new std::thread(RunServer, rpc_service_));
VLOG(3) << "wait server thread to become ready..."; VLOG(3) << "wait server thread to become ready...";
sleep(5); rpc_service_->WaitServerReady();
// Write to a file of server selected port for python use. // Write to a file of server selected port for python use.
SavePort(rpc_service_); std::string file_path =
string::Sprintf("/tmp/paddle.%d.selected_port",
static_cast<int>(::getpid()));
SavePort(file_path);
if (sync_mode) { if (sync_mode) {
RunSyncLoop(&executor, program, &recv_scope, prefetch_block); RunSyncLoop(&executor, program, &recv_scope, prefetch_block);
} else { } else {
RunAsyncLoop(&executor, program, &recv_scope, prefetch_block); RunAsyncLoop(&executor, program);
} }
} }
......
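The reworked RunAsyncLoop gives every gradient variable its own ReceivedQueue plus a dedicated update thread, so the receive loop only routes messages and updates for different variables no longer serialize behind a single executor call. A simplified, self-contained sketch of that per-key dispatch pattern (hypothetical BlockingQueue and string payloads standing in for detail::ReceivedQueue and the real messages):

#include <condition_variable>
#include <iostream>
#include <map>
#include <memory>
#include <mutex>
#include <queue>
#include <string>
#include <thread>
#include <vector>

// Hypothetical stand-in for detail::ReceivedQueue: a simple blocking queue.
template <typename T>
class BlockingQueue {
 public:
  void Push(T v) {
    {
      std::lock_guard<std::mutex> lock(mu_);
      queue_.push(std::move(v));
    }
    cv_.notify_one();
  }
  T Pop() {
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [this] { return !queue_.empty(); });
    T v = std::move(queue_.front());
    queue_.pop();
    return v;
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  std::queue<T> queue_;
};

int main() {
  // One queue and one consumer thread per gradient name, mirroring grad_to_queue.
  const std::vector<std::string> grads = {"w1@GRAD", "w2@GRAD"};
  std::map<std::string, std::shared_ptr<BlockingQueue<std::string>>> grad_to_queue;
  std::vector<std::thread> workers;
  for (const std::string& grad : grads) {
    auto queue = std::make_shared<BlockingQueue<std::string>>();
    grad_to_queue[grad] = queue;
    workers.emplace_back([grad, queue] {
      for (int i = 0; i < 2; ++i) {  // the real loop runs until an exit flag flips
        std::cout << "apply update for " << grad << ": " << queue->Pop() << "\n";
      }
    });
  }
  // The receive loop only routes each message to the queue of its gradient.
  for (int step = 0; step < 2; ++step) {
    for (const std::string& grad : grads) {
      grad_to_queue[grad]->Push("step" + std::to_string(step));
    }
  }
  for (auto& worker : workers) worker.join();
  return 0;
}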
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <stdint.h> #include <stdint.h>
#include <atomic>
#include <ostream> #include <ostream>
#include <string> #include <string>
...@@ -39,26 +40,33 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -39,26 +40,33 @@ class ListenAndServOp : public framework::OperatorBase {
const framework::VariableNameMap& outputs, const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs); const framework::AttributeMap& attrs);
int GetSelectedPort() const;
void RunSyncLoop(framework::Executor* executor, void RunSyncLoop(framework::Executor* executor,
framework::ProgramDesc* program, framework::ProgramDesc* program,
framework::Scope* recv_scope, framework::Scope* recv_scope,
framework::BlockDesc* prefetch_block) const; framework::BlockDesc* prefetch_block) const;
void RunAsyncLoop(framework::Executor* executor, void RunAsyncLoop(framework::Executor* executor,
framework::ProgramDesc* program, framework::ProgramDesc* program) const;
framework::Scope* recv_scope,
framework::BlockDesc* prefetch_block) const; void SavePort(
const std::string& file_path = "/tmp/paddle.selected_port") const;
void WaitServerReady();
int GetSelectedPort() { return selected_port_; }
void Stop() override; void Stop() override;
void RunImpl(const framework::Scope& scope, void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override; const platform::Place& dev_place) const override;
static void ResetPort() { selected_port_ = 0; }
protected: protected:
mutable std::shared_ptr<detail::AsyncGRPCServer> rpc_service_; mutable std::shared_ptr<detail::AsyncGRPCServer> rpc_service_;
mutable std::shared_ptr<std::thread> server_thread_; mutable std::shared_ptr<std::thread> server_thread_;
// FIXME(wuyi): it's static so that the operator can be cloned.
static std::atomic_int selected_port_;
}; };
} // namespace operators } // namespace operators
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
constexpr int64_t kNoPadding = -1;
class LookupSparseTableInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of LookupSparseTableOp should not be null.");
auto shape_w = ctx->GetInputDim("W");
auto shape_ids = ctx->GetInputDim("Ids");
shape_w[0] = shape_ids.size();
ctx->SetOutputDim("Out", shape_w);
}
};
class LookupSparseTableOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
auto out_var = scope.FindVar(Output("Out"));
auto w_var = scope.FindVar(Input("W"));
auto ids_var = scope.FindVar(Input("Ids"));
unsigned int seed = static_cast<unsigned int>(Attr<int>("seed"));
float min = Attr<float>("min");
float max = Attr<float>("max");
bool auto_grown_table = Attr<bool>("auto_grown_table");
PADDLE_ENFORCE(out_var->IsType<framework::LoDTensor>(),
"The type of Out var should be LodTensor.");
PADDLE_ENFORCE(w_var->IsType<framework::SelectedRows>(),
"The type of W var should be SelectedRows.");
PADDLE_ENFORCE(ids_var->IsType<framework::LoDTensor>(),
"The type of Ids var should be LoDTensor.");
auto &ids_t = ids_var->Get<framework::LoDTensor>();
auto out_t = out_var->GetMutable<framework::LoDTensor>();
auto w_t = w_var->GetMutable<framework::SelectedRows>();
std::vector<int64_t> keys;
keys.resize(ids_t.numel());
for (size_t i = 0; i < ids_t.numel(); ++i) {
keys[i] = ids_t.data<int64_t>()[i];
}
// TODO(Yancey1989): support CUDA Place for the sparse table
platform::CPUPlace cpu;
auto out_shape = w_t->value().dims();
out_shape[0] = keys.size();
out_t->Resize(out_shape);
out_t->mutable_data(cpu, w_t->value().type());
PADDLE_ENFORCE_EQ(framework::ToDataType(w_t->value().type()),
framework::proto::VarType::FP32,
"The sparse table only support FP32");
auto non_keys_pair = w_t->Get(keys, out_t);
if (!auto_grown_table) {
PADDLE_ENFORCE_EQ(non_keys_pair.size(), static_cast<size_t>(0),
"there is some keys does exists in the sparse table.");
}
auto value_shape = w_t->value().dims();
value_shape[0] = 1;
for (const auto &it : non_keys_pair) {
const auto key = it.first;
const auto index = it.second;
framework::Tensor value;
value.Resize(value_shape);
auto data = value.mutable_data<float>(cpu);
std::minstd_rand engine;
engine.seed(seed);
std::uniform_real_distribution<float> dist(min, max);
int64_t size = value.numel();
for (int64_t i = 0; i < size; ++i) {
data[i] = dist(engine);
}
w_t->Set(key, value);
memory::Copy(cpu, out_t->mutable_data<float>(cpu) + index * value.numel(),
cpu, value.data<float>(), value.numel() * sizeof(float));
}
}
};
class LookupSparseTableOpMaker : public framework::OpProtoAndCheckerMaker {
public:
LookupSparseTableOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("W",
"(SelectedRows) The input represents embedding table, "
"which is a learnable parameter.");
AddInput("Ids",
"(LoDTensor) Ids's type should be LoDTensor"
"THe ids to be looked up in W.");
AddOutput("Out",
"(LoDTensor) The lookup results, which have the "
"same type as W.");
AddAttr<int64_t>("padding_idx",
"(int64, default -1) "
"If the value is -1, it makes no effect to lookup. "
"Otherwise the given value indicates padding the output "
"with zeros whenever lookup encounters it in Ids.")
.SetDefault(kNoPadding);
AddAttr<float>("min",
"(float, default -1.0) "
"Minimum value of uniform random")
.SetDefault(-1.0f);
AddAttr<float>("max",
"(float, default 1.0) "
"Maximun value of uniform random")
.SetDefault(1.0f);
AddAttr<int>("seed",
"(int, default 0) "
"Random seed used for generating samples. "
"0 means use a seed generated by the system."
"Note that if seed is not 0, this operator will always "
"generate the same random numbers every time.")
.SetDefault(0);
AddAttr<bool>("auto_grown_table",
"(bool default false)"
"Whether create new value if for nonexistent key.")
.SetDefault(true);
AddComment(R"DOC(
Lookup Sparse Table Operator.
This operator is used to perform a lookup on the parameter W,
and the looked-up rows are concatenated into a dense output tensor.
The type of Ids(Input) is LoDTensor; it contains
the ids to be looked up in W.
If an id is not in the sparse table, this operator will return a
random value and set that value into the table for the next lookup.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(lookup_sparse_table, ops::LookupSparseTableOp,
ops::LookupSparseTableInferShape,
ops::LookupSparseTableOpMaker,
paddle::framework::EmptyGradOpMaker);
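As a rough illustration of the behaviour this operator implements (return the stored row for a known id; for an unknown id, fill a row from a uniform distribution and insert it so the next lookup finds the same value), here is a hedged sketch in which a plain std::unordered_map stands in for SelectedRows; LookupOrGrow is a hypothetical helper, not part of the operator:

#include <cstdint>
#include <iostream>
#include <random>
#include <unordered_map>
#include <vector>

// Toy sparse table: id -> embedding row. A stand-in, not the SelectedRows API.
using Table = std::unordered_map<int64_t, std::vector<float>>;

std::vector<float> LookupOrGrow(Table* table, int64_t key, int dim,
                                unsigned int seed, float min, float max) {
  auto it = table->find(key);
  if (it != table->end()) return it->second;  // known id: return the stored row
  // Unknown id: initialize a fresh row from a uniform distribution, as the op does.
  std::minstd_rand engine(seed);
  std::uniform_real_distribution<float> dist(min, max);
  std::vector<float> row(dim);
  for (auto& v : row) v = dist(engine);
  (*table)[key] = row;  // grow the table so the next lookup sees the same row
  return row;
}

int main() {
  Table table;
  auto first = LookupOrGrow(&table, /*key=*/42, /*dim=*/4, /*seed=*/0, -1.0f, 1.0f);
  auto second = LookupOrGrow(&table, 42, 4, 0, -1.0f, 1.0f);
  std::cout << "rows equal: " << (first == second) << std::endl;  // prints 1
  return 0;
}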
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle { namespace paddle {
...@@ -30,66 +31,22 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label, ...@@ -30,66 +31,22 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
} }
} }
template <typename T>
__device__ __forceinline__ T sum_single_warp(T val) {
val += platform::__shfl_down_sync(0, val, 16);
val += platform::__shfl_down_sync(0, val, 8);
val += platform::__shfl_down_sync(0, val, 4);
val += platform::__shfl_down_sync(0, val, 2);
val += platform::__shfl_down_sync(0, val, 1);
return val;
}
// CUDA do not support dynamic arrary in template
// https://stackoverflow.com/questions/20497209
template <typename T>
struct SharedMemory {
// Ensure that we won't compile any un-specialized types
__device__ T* GetPointer() { return NULL; }
};
template <>
struct SharedMemory<float> {
__device__ float* GetPointer() {
extern __shared__ float s_float[];
return s_float;
}
};
template <>
struct SharedMemory<double> {
__device__ double* GetPointer() {
extern __shared__ double s_double[];
return s_double;
}
};
template <typename T> template <typename T>
__global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
const int class_num) { const int class_num) {
int tid = threadIdx.x; int tid = threadIdx.x;
SharedMemory<T> d_sum_shared; T val = 0;
T* d_sum = d_sum_shared.GetPointer();
d_sum[tid] = 0;
int cur_idx = tid; int idx = blockIdx.x * class_num + tid;
int next_idx = blockIdx.x * class_num + tid; int end = blockIdx.x * class_num + class_num;
while (cur_idx < class_num) { for (; idx < end; idx += blockDim.x) {
d_sum[tid] += val += math::TolerableValue<T>()(std::log(X[idx])) * label[idx];
math::TolerableValue<T>()(std::log(X[next_idx])) * label[next_idx];
next_idx += blockDim.x;
cur_idx += blockDim.x;
} }
__syncthreads();
for (unsigned int stride = blockDim.x >> 1; stride >= 32; stride >>= 1) { val = paddle::platform::reduceSum(val, tid, blockDim.x);
if (tid < stride) d_sum[tid] += d_sum[tid + stride]; if (threadIdx.x == 0) {
__syncthreads(); Y[blockIdx.x] = -val;
} }
T val = d_sum[tid];
val = sum_single_warp<T>(val);
if (tid == 0) Y[blockIdx.x] = -val;
} }
} // namespace } // namespace
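The rewritten SoftCrossEntropyKernel assigns one thread block per sample i and lets each thread accumulate a block-strided share of

  v_i = \sum_j \mathrm{label}_{ij} \log X_{ij}, \qquad Y_i = -v_i,

before paddle::platform::reduceSum combines the per-thread partial sums, so the extern shared-memory buffer and the hand-rolled tree reduction of the old version are no longer needed.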
...@@ -113,9 +70,7 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> { ...@@ -113,9 +70,7 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
? 512 ? 512
: pow(2, static_cast<int>(std::log2(class_num))); : pow(2, static_cast<int>(std::log2(class_num)));
SoftCrossEntropyKernel<T><<< SoftCrossEntropyKernel<T><<<batch_size, block, 0, ctx.stream()>>>(
batch_size, block, block * sizeof(T),
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
loss_data, prob_data, label_data, class_num); loss_data, prob_data, label_data, class_num);
} else { } else {
const int64_t* label_data = labels->data<int64_t>(); const int64_t* label_data = labels->data<int64_t>();
......
...@@ -11,8 +11,9 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,8 +11,9 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/operators/math/pooling.h"
#include <algorithm>
#include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -27,9 +28,10 @@ template <typename PoolProcess, typename T> ...@@ -27,9 +28,10 @@ template <typename PoolProcess, typename T>
class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> { class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
public: public:
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, std::vector<int>& ksize, const framework::Tensor& input, const std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings, const std::vector<int>& strides,
PoolProcess pool_process, framework::Tensor* output) { const std::vector<int>& paddings, PoolProcess pool_process,
framework::Tensor* output) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_height = input.dims()[2]; const int input_height = input.dims()[2];
const int input_width = input.dims()[3]; const int input_width = input.dims()[3];
...@@ -63,11 +65,11 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> { ...@@ -63,11 +65,11 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
T ele = pool_process.initial(); T ele = pool_process.initial();
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
pool_process.compute(ele, input_data[h * input_width + w]); pool_process.compute(input_data[h * input_width + w], &ele);
} }
} }
int pool_size = (hend - hstart) * (wend - wstart); int pool_size = (hend - hstart) * (wend - wstart);
pool_process.finalize(ele, (static_cast<T>(pool_size))); pool_process.finalize(static_cast<T>(pool_size), &ele);
output_data[ph * output_width + pw] = ele; output_data[ph * output_width + pw] = ele;
} }
} }
...@@ -86,13 +88,12 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> { ...@@ -86,13 +88,12 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
template <typename PoolProcess, class T> template <typename PoolProcess, class T>
class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> { class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
public: public:
void operator()(const platform::CPUDeviceContext& context, void operator()(
const framework::Tensor& input, const platform::CPUDeviceContext& context, const framework::Tensor& input,
const framework::Tensor& output, const framework::Tensor& output, const framework::Tensor& output_grad,
const framework::Tensor& output_grad, std::vector<int>& ksize, const std::vector<int>& ksize, const std::vector<int>& strides,
std::vector<int>& strides, std::vector<int>& paddings, const std::vector<int>& paddings, PoolProcess pool_grad_process,
PoolProcess pool_grad_process, framework::Tensor* input_grad) {
framework::Tensor* input_grad) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_height = input.dims()[2]; const int input_height = input.dims()[2];
const int input_width = input.dims()[3]; const int input_width = input.dims()[3];
...@@ -131,8 +132,8 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> { ...@@ -131,8 +132,8 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
input_data[h * input_width + w], input_data[h * input_width + w],
output_data[ph * output_width + pw], output_data[ph * output_width + pw],
output_grad_data[ph * output_width + pw], output_grad_data[ph * output_width + pw],
input_grad_data[h * input_width + w], static_cast<T>(scale),
static_cast<T>(scale)); input_grad_data + h * input_width + w);
} }
} }
} }
...@@ -154,12 +155,11 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> { ...@@ -154,12 +155,11 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
template <class T> template <class T>
class MaxPool2dGradFunctor<platform::CPUDeviceContext, T> { class MaxPool2dGradFunctor<platform::CPUDeviceContext, T> {
public: public:
void operator()(const platform::CPUDeviceContext& context, void operator()(
const framework::Tensor& input, const platform::CPUDeviceContext& context, const framework::Tensor& input,
const framework::Tensor& output, const framework::Tensor& output, const framework::Tensor& output_grad,
const framework::Tensor& output_grad, std::vector<int>& ksize, const std::vector<int>& ksize, const std::vector<int>& strides,
std::vector<int>& strides, std::vector<int>& paddings, const std::vector<int>& paddings, framework::Tensor* input_grad) {
framework::Tensor* input_grad) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_height = input.dims()[2]; const int input_height = input.dims()[2];
const int input_width = input.dims()[3]; const int input_width = input.dims()[3];
...@@ -246,9 +246,10 @@ template <typename PoolProcess, class T> ...@@ -246,9 +246,10 @@ template <typename PoolProcess, class T>
class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> { class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
public: public:
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, std::vector<int>& ksize, const framework::Tensor& input, const std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings, const std::vector<int>& strides,
PoolProcess pool_process, framework::Tensor* output) { const std::vector<int>& paddings, PoolProcess pool_process,
framework::Tensor* output) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_depth = input.dims()[2]; const int input_depth = input.dims()[2];
const int input_height = input.dims()[3]; const int input_height = input.dims()[3];
...@@ -293,14 +294,14 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> { ...@@ -293,14 +294,14 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
pool_process.compute( pool_process.compute(
ele, input_data[(d * input_height + h) * input_width + w],
input_data[(d * input_height + h) * input_width + w]); &ele);
} }
} }
} }
int pool_size = int pool_size =
(dend - dstart) * (hend - hstart) * (wend - wstart); (dend - dstart) * (hend - hstart) * (wend - wstart);
pool_process.finalize(ele, static_cast<T>(pool_size)); pool_process.finalize(static_cast<T>(pool_size), &ele);
output_data[output_idx] = ele; output_data[output_idx] = ele;
} }
} }
...@@ -320,13 +321,12 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> { ...@@ -320,13 +321,12 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
template <typename PoolProcess, class T> template <typename PoolProcess, class T>
class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> { class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
public: public:
void operator()(const platform::CPUDeviceContext& context, void operator()(
const framework::Tensor& input, const platform::CPUDeviceContext& context, const framework::Tensor& input,
const framework::Tensor& output, const framework::Tensor& output, const framework::Tensor& output_grad,
const framework::Tensor& output_grad, std::vector<int>& ksize, const std::vector<int>& ksize, const std::vector<int>& strides,
std::vector<int>& strides, std::vector<int>& paddings, const std::vector<int>& paddings, PoolProcess pool_grad_process,
PoolProcess pool_grad_process, framework::Tensor* input_grad) {
framework::Tensor* input_grad) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_depth = input.dims()[2]; const int input_depth = input.dims()[2];
const int input_height = input.dims()[3]; const int input_height = input.dims()[3];
...@@ -379,8 +379,8 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> { ...@@ -379,8 +379,8 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
(pd * output_height + ph) * output_width + pw; (pd * output_height + ph) * output_width + pw;
pool_grad_process.compute( pool_grad_process.compute(
input_data[input_idx], output_data[output_idx], input_data[input_idx], output_data[output_idx],
output_grad_data[output_idx], output_grad_data[output_idx], static_cast<T>(scale),
input_grad_data[input_idx], static_cast<T>(scale)); input_grad_data + input_idx);
} }
} }
} }
...@@ -404,12 +404,11 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> { ...@@ -404,12 +404,11 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
template <class T> template <class T>
class MaxPool3dGradFunctor<platform::CPUDeviceContext, T> { class MaxPool3dGradFunctor<platform::CPUDeviceContext, T> {
public: public:
void operator()(const platform::CPUDeviceContext& context, void operator()(
const framework::Tensor& input, const platform::CPUDeviceContext& context, const framework::Tensor& input,
const framework::Tensor& output, const framework::Tensor& output, const framework::Tensor& output_grad,
const framework::Tensor& output_grad, std::vector<int>& ksize, const std::vector<int>& ksize, const std::vector<int>& strides,
std::vector<int>& strides, std::vector<int>& paddings, const std::vector<int>& paddings, framework::Tensor* input_grad) {
framework::Tensor* input_grad) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_depth = input.dims()[2]; const int input_depth = input.dims()[2];
const int input_height = input.dims()[3]; const int input_height = input.dims()[3];
...@@ -510,9 +509,10 @@ template <typename T1, typename T2> ...@@ -510,9 +509,10 @@ template <typename T1, typename T2>
class MaxPool2dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> { class MaxPool2dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
public: public:
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, std::vector<int>& ksize, const framework::Tensor& input, const std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings, const std::vector<int>& strides,
framework::Tensor* output, framework::Tensor* mask) { const std::vector<int>& paddings, framework::Tensor* output,
framework::Tensor* mask) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_height = input.dims()[2]; const int input_height = input.dims()[2];
const int input_width = input.dims()[3]; const int input_width = input.dims()[3];
...@@ -576,8 +576,9 @@ class MaxPool2dWithIndexGradFunctor<platform::CPUDeviceContext, T1, T2> { ...@@ -576,8 +576,9 @@ class MaxPool2dWithIndexGradFunctor<platform::CPUDeviceContext, T1, T2> {
public: public:
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& output_grad, const framework::Tensor& output_grad,
const framework::Tensor& mask, std::vector<int>& ksize, const framework::Tensor& mask, const std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings, const std::vector<int>& strides,
const std::vector<int>& paddings,
framework::Tensor* input_grad) { framework::Tensor* input_grad) {
const int batch_size = input_grad->dims()[0]; const int batch_size = input_grad->dims()[0];
const int input_height = input_grad->dims()[2]; const int input_height = input_grad->dims()[2];
...@@ -628,9 +629,10 @@ template <typename T1, typename T2> ...@@ -628,9 +629,10 @@ template <typename T1, typename T2>
class MaxPool3dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> { class MaxPool3dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
public: public:
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, std::vector<int>& ksize, const framework::Tensor& input, const std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings, const std::vector<int>& strides,
framework::Tensor* output, framework::Tensor* mask) { const std::vector<int>& paddings, framework::Tensor* output,
framework::Tensor* mask) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_depth = input.dims()[2]; const int input_depth = input.dims()[2];
const int input_height = input.dims()[3]; const int input_height = input.dims()[3];
...@@ -708,8 +710,9 @@ class MaxPool3dWithIndexGradFunctor<platform::CPUDeviceContext, T1, T2> { ...@@ -708,8 +710,9 @@ class MaxPool3dWithIndexGradFunctor<platform::CPUDeviceContext, T1, T2> {
public: public:
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& output_grad, const framework::Tensor& output_grad,
const framework::Tensor& mask, std::vector<int>& ksize, const framework::Tensor& mask, const std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings, const std::vector<int>& strides,
const std::vector<int>& paddings,
framework::Tensor* input_grad) { framework::Tensor* input_grad) {
const int batch_size = input_grad->dims()[0]; const int batch_size = input_grad->dims()[0];
const int input_depth = input_grad->dims()[2]; const int input_depth = input_grad->dims()[2];
......
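Two conventions change throughout these pooling functors: ksize/strides/paddings are now taken as const std::vector<int>&, and the PoolProcess callbacks write through an output pointer (compute(x, &ele), finalize(pool_size, &ele)) instead of mutating a non-const reference. A small standalone sketch of an average-pool style PoolProcess under the new convention (a simplified stand-in, not the exact definition from pooling.h):

#include <iostream>

// Simplified stand-in for a PoolProcess functor under the new pointer-output
// convention; not the exact definition from pooling.h.
template <typename T>
struct AvgPoolProcess {
  T initial() const { return static_cast<T>(0); }
  // Accumulate one input element into the running value *y.
  void compute(const T& x, T* y) const { *y += x; }
  // Divide by the window size once the whole window has been scanned.
  void finalize(const T& pool_size, T* y) const { *y /= pool_size; }
};

int main() {
  AvgPoolProcess<float> pool;
  const float window[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  float ele = pool.initial();
  for (float v : window) pool.compute(v, &ele);
  pool.finalize(static_cast<float>(4), &ele);
  std::cout << "avg = " << ele << std::endl;  // prints avg = 2.5
  return 0;
}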
...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <algorithm>
#include <vector>
#include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/operators/math/pooling.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
...@@ -47,11 +49,11 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data, ...@@ -47,11 +49,11 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data,
T ele = pool_process.initial(); T ele = pool_process.initial();
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
pool_process.compute(ele, input_data[h * input_width + w]); pool_process.compute(input_data[h * input_width + w], &ele);
} }
} }
int pool_size = (hend - hstart) * (wend - wstart); int pool_size = (hend - hstart) * (wend - wstart);
pool_process.finalize(ele, (static_cast<T>(pool_size))); pool_process.finalize(static_cast<T>(pool_size), &ele);
output_data[index] = ele; output_data[index] = ele;
} }
} }
...@@ -96,8 +98,8 @@ __global__ void KernelPool2DGrad( ...@@ -96,8 +98,8 @@ __global__ void KernelPool2DGrad(
int pool_size = (hend - hstart) * (wend - wstart); int pool_size = (hend - hstart) * (wend - wstart);
int output_sub_idx = ph * output_width + pw; int output_sub_idx = ph * output_width + pw;
pool_process.compute(input, output_data[output_sub_idx], pool_process.compute(input, output_data[output_sub_idx],
output_grad[output_sub_idx], gradient, output_grad[output_sub_idx],
static_cast<T>(1.0 / pool_size)); static_cast<T>(1.0 / pool_size), &gradient);
} }
} }
input_grad[index] = gradient; input_grad[index] = gradient;
...@@ -158,9 +160,10 @@ template <typename PoolProcess, typename T> ...@@ -158,9 +160,10 @@ template <typename PoolProcess, typename T>
class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> { class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
public: public:
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, std::vector<int>& ksize, const framework::Tensor& input, const std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings, const std::vector<int>& strides,
PoolProcess pool_process, framework::Tensor* output) { const std::vector<int>& paddings, PoolProcess pool_process,
framework::Tensor* output) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1]; const int input_channels = input.dims()[1];
const int input_height = input.dims()[2]; const int input_height = input.dims()[2];
...@@ -201,9 +204,11 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> { ...@@ -201,9 +204,11 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, const framework::Tensor& input,
const framework::Tensor& output, const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize, const framework::Tensor& output_grad,
std::vector<int>& strides, std::vector<int>& paddings, const std::vector<int>& ksize,
PoolProcess pool_process, framework::Tensor* input_grad) { const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_process,
framework::Tensor* input_grad) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1]; const int input_channels = input.dims()[1];
const int input_height = input.dims()[2]; const int input_height = input.dims()[2];
...@@ -246,8 +251,10 @@ class MaxPool2dGradFunctor<platform::CUDADeviceContext, T> { ...@@ -246,8 +251,10 @@ class MaxPool2dGradFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, const framework::Tensor& input,
const framework::Tensor& output, const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize, const framework::Tensor& output_grad,
std::vector<int>& strides, std::vector<int>& paddings, const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings,
framework::Tensor* input_grad) { framework::Tensor* input_grad) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1]; const int input_channels = input.dims()[1];
...@@ -340,12 +347,12 @@ __global__ void KernelPool3D(const int nthreads, const T* input_data, ...@@ -340,12 +347,12 @@ __global__ void KernelPool3D(const int nthreads, const T* input_data,
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
pool_process.compute( pool_process.compute(
ele, input_data[(d * input_height + h) * input_width + w]); input_data[(d * input_height + h) * input_width + w], &ele);
} }
} }
} }
int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
pool_process.finalize(ele, static_cast<T>(pool_size)); pool_process.finalize(static_cast<T>(pool_size), &ele);
output_data[index] = ele; output_data[index] = ele;
} }
} }
...@@ -405,8 +412,8 @@ __global__ void KernelPool3DGrad( ...@@ -405,8 +412,8 @@ __global__ void KernelPool3DGrad(
int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
int output_sub_idx = (pd * output_height + ph) * output_width + pw; int output_sub_idx = (pd * output_height + ph) * output_width + pw;
pool_process.compute(input, output_data[output_sub_idx], pool_process.compute(input, output_data[output_sub_idx],
output_grad[output_sub_idx], gradient, output_grad[output_sub_idx],
static_cast<T>(1.0 / pool_size)); static_cast<T>(1.0 / pool_size), &gradient);
} }
} }
} }
...@@ -474,9 +481,10 @@ template <typename PoolProcess, class T> ...@@ -474,9 +481,10 @@ template <typename PoolProcess, class T>
class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> { class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
public: public:
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, std::vector<int>& ksize, const framework::Tensor& input, const std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings, const std::vector<int>& strides,
PoolProcess pool_process, framework::Tensor* output) { const std::vector<int>& paddings, PoolProcess pool_process,
framework::Tensor* output) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1]; const int input_channels = input.dims()[1];
const int input_depth = input.dims()[2]; const int input_depth = input.dims()[2];
...@@ -525,9 +533,11 @@ class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> { ...@@ -525,9 +533,11 @@ class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, const framework::Tensor& input,
const framework::Tensor& output, const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize, const framework::Tensor& output_grad,
std::vector<int>& strides, std::vector<int>& paddings, const std::vector<int>& ksize,
PoolProcess pool_process, framework::Tensor* input_grad) { const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_process,
framework::Tensor* input_grad) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1]; const int input_channels = input.dims()[1];
const int input_depth = input.dims()[2]; const int input_depth = input.dims()[2];
...@@ -578,8 +588,10 @@ class MaxPool3dGradFunctor<platform::CUDADeviceContext, T> { ...@@ -578,8 +588,10 @@ class MaxPool3dGradFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, const framework::Tensor& input,
const framework::Tensor& output, const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize, const framework::Tensor& output_grad,
std::vector<int>& strides, std::vector<int>& paddings, const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings,
framework::Tensor* input_grad) { framework::Tensor* input_grad) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1]; const int input_channels = input.dims()[1];
...@@ -736,9 +748,10 @@ template <typename T1, typename T2> ...@@ -736,9 +748,10 @@ template <typename T1, typename T2>
class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> { class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
public: public:
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, std::vector<int>& ksize, const framework::Tensor& input, const std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings, const std::vector<int>& strides,
framework::Tensor* output, framework::Tensor* mask) { const std::vector<int>& paddings, framework::Tensor* output,
framework::Tensor* mask) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1]; const int input_channels = input.dims()[1];
const int input_height = input.dims()[2]; const int input_height = input.dims()[2];
...@@ -779,8 +792,9 @@ class MaxPool2dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> { ...@@ -779,8 +792,9 @@ class MaxPool2dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
public: public:
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& output_grad, const framework::Tensor& output_grad,
const framework::Tensor& mask, std::vector<int>& ksize, const framework::Tensor& mask, const std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings, const std::vector<int>& strides,
const std::vector<int>& paddings,
framework::Tensor* input_grad) { framework::Tensor* input_grad) {
const int batch_size = input_grad->dims()[0]; const int batch_size = input_grad->dims()[0];
const int input_channels = input_grad->dims()[1]; const int input_channels = input_grad->dims()[1];
...@@ -937,9 +951,10 @@ template <typename T1, typename T2> ...@@ -937,9 +951,10 @@ template <typename T1, typename T2>
class MaxPool3dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> { class MaxPool3dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
public: public:
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, std::vector<int>& ksize, const framework::Tensor& input, const std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings, const std::vector<int>& strides,
framework::Tensor* output, framework::Tensor* mask) { const std::vector<int>& paddings, framework::Tensor* output,
framework::Tensor* mask) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1]; const int input_channels = input.dims()[1];
const int input_depth = input.dims()[2]; const int input_depth = input.dims()[2];
...@@ -987,8 +1002,9 @@ class MaxPool3dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> { ...@@ -987,8 +1002,9 @@ class MaxPool3dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
public: public:
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& output_grad, const framework::Tensor& output_grad,
const framework::Tensor& mask, std::vector<int>& ksize, const framework::Tensor& mask, const std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings, const std::vector<int>& strides,
const std::vector<int>& paddings,
framework::Tensor* input_grad) { framework::Tensor* input_grad) {
const int batch_size = input_grad->dims()[0]; const int batch_size = input_grad->dims()[0];
const int input_channels = input_grad->dims()[1]; const int input_channels = input_grad->dims()[1];
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
...@@ -23,8 +24,8 @@ namespace operators { ...@@ -23,8 +24,8 @@ namespace operators {
namespace math { namespace math {
#define FLT_MAX \ #define FLT_MAX \
__FLT_MAX__ // It might need to be placed in another file, but I'm still __FLT_MAX__ // TODO(zcd): It might need to be placed in another file, but I'm
// wondering where to put it. // still wondering where to put it.
/* /*
* \brief Extracting simple operations from pooling. * \brief Extracting simple operations from pooling.
...@@ -40,33 +41,33 @@ template <class T> ...@@ -40,33 +41,33 @@ template <class T>
class MaxPool { class MaxPool {
public: public:
DEVICE inline T initial() { return static_cast<T>(-FLT_MAX); } DEVICE inline T initial() { return static_cast<T>(-FLT_MAX); }
DEVICE inline void compute(T& y, const T& x) { y = y > x ? y : x; } DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; }
DEVICE inline void finalize(T& y, const T& pool_field) {} DEVICE inline void finalize(const T& pool_field, T* y) {}
}; };
template <class T> template <class T>
class AvgPool { class AvgPool {
public: public:
DEVICE inline T initial() { return static_cast<T>(0); } DEVICE inline T initial() { return static_cast<T>(0); }
DEVICE inline void compute(T& y, const T& x) { y += x; } DEVICE inline void compute(const T& x, T* y) { *y += x; }
DEVICE inline void finalize(T& y, const T& pool_field) { y /= pool_field; } DEVICE inline void finalize(const T& pool_field, T* y) { *y /= pool_field; }
}; };
template <class T> template <class T>
class MaxPoolGrad { class MaxPoolGrad {
public: public:
DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx, DEVICE inline void compute(const T& x, const T& y, const T& dy, T scale,
T scale) { T* dx) {
dx += dy * (x == y); *dx += dy * (x == y);
} }
}; };
template <class T> template <class T>
class AvgPoolGrad { class AvgPoolGrad {
public: public:
DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx, DEVICE inline void compute(const T& x, const T& y, const T& dy, T scale,
T scale) { T* dx) {
dx += (scale * dy); *dx += (scale * dy);
} }
}; };
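For reference, a minimal sketch of how an inner pooling loop drives the pointer-based PoolProcess interface introduced above; the buffers and bounds (input_data, output_data, out_idx, h_start, h_end) are placeholders for illustration and are not part of this diff:

  // Accumulate one output element with MaxPool<float>; results are written
  // through the T* output parameter of compute()/finalize().
  paddle::operators::math::MaxPool<float> pool_process;
  float ele = pool_process.initial();  // -FLT_MAX for max pooling
  for (int idx = h_start; idx < h_end; ++idx) {
    pool_process.compute(input_data[idx], &ele);
  }
  pool_process.finalize(static_cast<float>(h_end - h_start), &ele);  // no-op for max pooling
  output_data[out_idx] = ele;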
...@@ -88,8 +89,9 @@ template <typename DeviceContext, typename PoolProcess, typename T> ...@@ -88,8 +89,9 @@ template <typename DeviceContext, typename PoolProcess, typename T>
class Pool2dFunctor { class Pool2dFunctor {
public: public:
void operator()(const DeviceContext& context, const framework::Tensor& input, void operator()(const DeviceContext& context, const framework::Tensor& input,
std::vector<int>& ksize, std::vector<int>& strides, const std::vector<int>& ksize,
std::vector<int>& paddings, PoolProcess pool_compute, const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_compute,
framework::Tensor* output); framework::Tensor* output);
}; };
...@@ -98,9 +100,11 @@ class Pool2dGradFunctor { ...@@ -98,9 +100,11 @@ class Pool2dGradFunctor {
public: public:
void operator()(const DeviceContext& context, const framework::Tensor& input, void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& output, const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize, const framework::Tensor& output_grad,
std::vector<int>& strides, std::vector<int>& paddings, const std::vector<int>& ksize,
PoolProcess pool_compute, framework::Tensor* input_grad); const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_compute,
framework::Tensor* input_grad);
}; };
template <typename DeviceContext, class T> template <typename DeviceContext, class T>
...@@ -108,8 +112,10 @@ class MaxPool2dGradFunctor { ...@@ -108,8 +112,10 @@ class MaxPool2dGradFunctor {
public: public:
void operator()(const DeviceContext& context, const framework::Tensor& input, void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& output, const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize, const framework::Tensor& output_grad,
std::vector<int>& strides, std::vector<int>& paddings, const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings,
framework::Tensor* input_grad); framework::Tensor* input_grad);
}; };
...@@ -117,8 +123,9 @@ template <typename DeviceContext, typename PoolProcess, typename T> ...@@ -117,8 +123,9 @@ template <typename DeviceContext, typename PoolProcess, typename T>
class Pool3dFunctor { class Pool3dFunctor {
public: public:
void operator()(const DeviceContext& context, const framework::Tensor& input, void operator()(const DeviceContext& context, const framework::Tensor& input,
std::vector<int>& ksize, std::vector<int>& strides, const std::vector<int>& ksize,
std::vector<int>& paddings, PoolProcess pool_compute, const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_compute,
framework::Tensor* output); framework::Tensor* output);
}; };
...@@ -127,9 +134,11 @@ class Pool3dGradFunctor { ...@@ -127,9 +134,11 @@ class Pool3dGradFunctor {
public: public:
void operator()(const DeviceContext& context, const framework::Tensor& input, void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& output, const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize, const framework::Tensor& output_grad,
std::vector<int>& strides, std::vector<int>& paddings, const std::vector<int>& ksize,
PoolProcess pool_compute, framework::Tensor* input_grad); const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_compute,
framework::Tensor* input_grad);
}; };
template <typename DeviceContext, class T> template <typename DeviceContext, class T>
...@@ -137,8 +146,10 @@ class MaxPool3dGradFunctor { ...@@ -137,8 +146,10 @@ class MaxPool3dGradFunctor {
public: public:
void operator()(const DeviceContext& context, const framework::Tensor& input, void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& output, const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize, const framework::Tensor& output_grad,
std::vector<int>& strides, std::vector<int>& paddings, const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings,
framework::Tensor* input_grad); framework::Tensor* input_grad);
}; };
...@@ -153,8 +164,9 @@ template <typename DeviceContext, typename T1, typename T2> ...@@ -153,8 +164,9 @@ template <typename DeviceContext, typename T1, typename T2>
class MaxPool2dWithIndexFunctor { class MaxPool2dWithIndexFunctor {
public: public:
void operator()(const DeviceContext& context, const framework::Tensor& input, void operator()(const DeviceContext& context, const framework::Tensor& input,
std::vector<int>& ksize, std::vector<int>& strides, const std::vector<int>& ksize,
std::vector<int>& paddings, framework::Tensor* output, const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* output,
framework::Tensor* mask); framework::Tensor* mask);
}; };
...@@ -163,8 +175,9 @@ class MaxPool2dWithIndexGradFunctor { ...@@ -163,8 +175,9 @@ class MaxPool2dWithIndexGradFunctor {
public: public:
void operator()(const DeviceContext& context, void operator()(const DeviceContext& context,
const framework::Tensor& output_grad, const framework::Tensor& output_grad,
const framework::Tensor& mask, std::vector<int>& ksize, const framework::Tensor& mask, const std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings, const std::vector<int>& strides,
const std::vector<int>& paddings,
framework::Tensor* input_grad); framework::Tensor* input_grad);
}; };
...@@ -172,8 +185,9 @@ template <typename DeviceContext, typename T1, typename T2> ...@@ -172,8 +185,9 @@ template <typename DeviceContext, typename T1, typename T2>
class MaxPool3dWithIndexFunctor { class MaxPool3dWithIndexFunctor {
public: public:
void operator()(const DeviceContext& context, const framework::Tensor& input, void operator()(const DeviceContext& context, const framework::Tensor& input,
std::vector<int>& ksize, std::vector<int>& strides, const std::vector<int>& ksize,
std::vector<int>& paddings, framework::Tensor* output, const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* output,
framework::Tensor* mask); framework::Tensor* mask);
}; };
...@@ -182,8 +196,9 @@ class MaxPool3dWithIndexGradFunctor { ...@@ -182,8 +196,9 @@ class MaxPool3dWithIndexGradFunctor {
public: public:
void operator()(const DeviceContext& context, void operator()(const DeviceContext& context,
const framework::Tensor& output_grad, const framework::Tensor& output_grad,
const framework::Tensor& mask, std::vector<int>& ksize, const framework::Tensor& mask, const std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings, const std::vector<int>& strides,
const std::vector<int>& paddings,
framework::Tensor* input_grad); framework::Tensor* input_grad);
}; };
......
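A minimal call sketch for the revised Pool2dFunctor signature, assuming dev_ctx, in_tensor and out_tensor (a CPUDeviceContext and two prepared framework::Tensor objects) already exist; those names are placeholders, not part of this diff:

  using paddle::operators::math::AvgPool;
  // ksize/strides/paddings are now passed by const reference; the output is a pointer.
  const std::vector<int> ksize{2, 2}, strides{2, 2}, paddings{0, 0};
  paddle::operators::math::Pool2dFunctor<paddle::platform::CPUDeviceContext,
                                         AvgPool<float>, float>
      pool2d_forward;
  pool2d_forward(dev_ctx, in_tensor, ksize, strides, paddings, AvgPool<float>(),
                 &out_tensor);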
...@@ -22,7 +22,7 @@ template <typename T> ...@@ -22,7 +22,7 @@ template <typename T>
class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> { class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
public: public:
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
const framework::LoDTensor& seq, framework::Tensor& padding, const framework::LoDTensor& seq, framework::Tensor* padding,
bool norm_by_times) { bool norm_by_times) {
auto lod = seq.lod(); auto lod = seq.lod();
PADDLE_ENFORCE_GT(lod.size(), 0UL, PADDLE_ENFORCE_GT(lod.size(), 0UL,
...@@ -37,7 +37,7 @@ class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> { ...@@ -37,7 +37,7 @@ class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
"The first dimension of LoDTensor seq should be " "The first dimension of LoDTensor seq should be "
"equal to the sum of all sequences's length."); "equal to the sum of all sequences's length.");
auto padding_dims = padding.dims(); auto padding_dims = padding->dims();
PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL,
"The input padding should be a 3-D Tensor of shape " "The input padding should be a 3-D Tensor of shape "
"[max_sequence_length, num_sequences, sequence_width]."); "[max_sequence_length, num_sequences, sequence_width].");
...@@ -58,7 +58,7 @@ class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> { ...@@ -58,7 +58,7 @@ class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
"width of sequence in LoDTensor seq."); "width of sequence in LoDTensor seq.");
const T* seq_data = seq.data<T>(); const T* seq_data = seq.data<T>();
T* padding_data = padding.data<T>(); T* padding_data = padding->data<T>();
for (int64_t i = 0; i < max_sequence_length; ++i) { for (int64_t i = 0; i < max_sequence_length; ++i) {
for (int64_t j = 0; j < num_sequences; ++j) { for (int64_t j = 0; j < num_sequences; ++j) {
int64_t start_pos = abs_offset_lod[level][j]; int64_t start_pos = abs_offset_lod[level][j];
...@@ -84,16 +84,16 @@ template <typename T> ...@@ -84,16 +84,16 @@ template <typename T>
class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T> { class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
public: public:
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
framework::LoDTensor& seq, const framework::Tensor& padding, framework::LoDTensor* seq, const framework::Tensor& padding,
bool norm_by_times) { bool norm_by_times) {
auto lod = seq.lod(); auto lod = seq->lod();
PADDLE_ENFORCE_GT(lod.size(), 0UL, PADDLE_ENFORCE_GT(lod.size(), 0UL,
"The LoD of LoDTensor seq should not be null."); "The LoD of LoDTensor seq should not be null.");
const size_t level = 0; const size_t level = 0;
framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
auto seq_dims = seq.dims(); auto seq_dims = seq->dims();
PADDLE_ENFORCE_EQ(seq_dims[0], PADDLE_ENFORCE_EQ(seq_dims[0],
static_cast<int64_t>(abs_offset_lod[level].back()), static_cast<int64_t>(abs_offset_lod[level].back()),
"The first dimension of LoDTensor seq should be " "The first dimension of LoDTensor seq should be "
...@@ -114,13 +114,13 @@ class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T> { ...@@ -114,13 +114,13 @@ class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
"The second dimension of Tensor padding should be " "The second dimension of Tensor padding should be "
"the number of sequences in LoDTensor seq."); "the number of sequences in LoDTensor seq.");
const int64_t sequence_width = seq.numel() / seq_dims[0]; const int64_t sequence_width = seq->numel() / seq_dims[0];
PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
"The third dimension of Tensor padding should be the " "The third dimension of Tensor padding should be the "
"width of sequence in LoDTensor seq."); "width of sequence in LoDTensor seq.");
const T* padding_data = padding.data<T>(); const T* padding_data = padding.data<T>();
T* seq_data = seq.data<T>(); T* seq_data = seq->data<T>();
for (int64_t i = 0; i < num_sequences; ++i) { for (int64_t i = 0; i < num_sequences; ++i) {
int64_t start_pos = abs_offset_lod[level][i]; int64_t start_pos = abs_offset_lod[level][i];
int64_t sequence_length = abs_offset_lod[level][i + 1] - start_pos; int64_t sequence_length = abs_offset_lod[level][i + 1] - start_pos;
......
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <algorithm>
#include "paddle/fluid/operators/math/sequence_padding.h" #include "paddle/fluid/operators/math/sequence_padding.h"
namespace paddle { namespace paddle {
...@@ -61,7 +62,7 @@ template <typename T> ...@@ -61,7 +62,7 @@ template <typename T>
class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> { class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
public: public:
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::LoDTensor& seq, framework::Tensor& padding, const framework::LoDTensor& seq, framework::Tensor* padding,
bool norm_by_times) { bool norm_by_times) {
auto lod = seq.lod(); auto lod = seq.lod();
PADDLE_ENFORCE_GT(lod.size(), 0UL, PADDLE_ENFORCE_GT(lod.size(), 0UL,
...@@ -76,7 +77,7 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> { ...@@ -76,7 +77,7 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
"The first dimension of LoDTensor seq should be " "The first dimension of LoDTensor seq should be "
"equal to the sum of all sequences's length."); "equal to the sum of all sequences's length.");
auto padding_dims = padding.dims(); auto padding_dims = padding->dims();
PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL,
"The input padding should be a 3-D Tensor of shape " "The input padding should be a 3-D Tensor of shape "
"[max_sequence_length, num_sequences, sequence_width]."); "[max_sequence_length, num_sequences, sequence_width].");
...@@ -97,8 +98,8 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> { ...@@ -97,8 +98,8 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
"width of sequence in LoDTensor seq."); "width of sequence in LoDTensor seq.");
if (!norm_by_times && num_sequences == 1UL) { if (!norm_by_times && num_sequences == 1UL) {
TensorCopy(seq, context.GetPlace(), context, &padding); TensorCopy(seq, context.GetPlace(), context, padding);
padding.Resize(padding_dims); padding->Resize(padding_dims);
return; return;
} }
...@@ -117,7 +118,7 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> { ...@@ -117,7 +118,7 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
dim3 grid(grid_dim_x, grid_dim_y); dim3 grid(grid_dim_x, grid_dim_y);
const T* seq_data = seq.data<T>(); const T* seq_data = seq.data<T>();
T* padding_data = padding.data<T>(); T* padding_data = padding->data<T>();
if (norm_by_times) { if (norm_by_times) {
SequencePaddingKernel<T, 1, 1><<<grid, threads, 0, context.stream()>>>( SequencePaddingKernel<T, 1, 1><<<grid, threads, 0, context.stream()>>>(
padding_data, const_cast<T*>(seq_data), padding_data, const_cast<T*>(seq_data),
...@@ -136,16 +137,16 @@ template <typename T> ...@@ -136,16 +137,16 @@ template <typename T>
class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> { class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
public: public:
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
framework::LoDTensor& seq, const framework::Tensor& padding, framework::LoDTensor* seq, const framework::Tensor& padding,
bool norm_by_times) { bool norm_by_times) {
auto lod = seq.lod(); auto lod = seq->lod();
PADDLE_ENFORCE_GT(lod.size(), 0UL, PADDLE_ENFORCE_GT(lod.size(), 0UL,
"The lod of LoDTensor seq should not be null."); "The lod of LoDTensor seq should not be null.");
const size_t level = 0; const size_t level = 0;
framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
auto seq_dims = seq.dims(); auto seq_dims = seq->dims();
PADDLE_ENFORCE_EQ(seq_dims[0], PADDLE_ENFORCE_EQ(seq_dims[0],
static_cast<int64_t>(abs_offset_lod[level].back()), static_cast<int64_t>(abs_offset_lod[level].back()),
"The first dimension of LoDTensor seq should be " "The first dimension of LoDTensor seq should be "
...@@ -166,14 +167,14 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> { ...@@ -166,14 +167,14 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
"The second dimension of Tensor padding should be " "The second dimension of Tensor padding should be "
"the number of sequences in LoDTensor seq."); "the number of sequences in LoDTensor seq.");
const int64_t sequence_width = seq.numel() / seq_dims[0]; const int64_t sequence_width = seq->numel() / seq_dims[0];
PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
"The third dimension of Tensor padding should be the " "The third dimension of Tensor padding should be the "
"width of sequence in LoDTensor seq."); "width of sequence in LoDTensor seq.");
if (!norm_by_times && num_sequences == 1UL) { if (!norm_by_times && num_sequences == 1UL) {
TensorCopy(padding, context.GetPlace(), context, &seq); TensorCopy(padding, context.GetPlace(), context, seq);
seq.Resize(seq_dims); seq->Resize(seq_dims);
return; return;
} }
...@@ -192,7 +193,7 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> { ...@@ -192,7 +193,7 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
dim3 grid(grid_dim_x, grid_dim_y); dim3 grid(grid_dim_x, grid_dim_y);
const T* padding_data = padding.data<T>(); const T* padding_data = padding.data<T>();
T* seq_data = seq.data<T>(); T* seq_data = seq->data<T>();
if (norm_by_times) { if (norm_by_times) {
SequencePaddingKernel<T, 1, 0><<<grid, threads, 0, context.stream()>>>( SequencePaddingKernel<T, 1, 0><<<grid, threads, 0, context.stream()>>>(
const_cast<T*>(padding_data), seq_data, const_cast<T*>(padding_data), seq_data,
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <algorithm>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
...@@ -64,13 +65,13 @@ template <typename DeviceContext, typename T> ...@@ -64,13 +65,13 @@ template <typename DeviceContext, typename T>
class PaddingLoDTensorFunctor { class PaddingLoDTensorFunctor {
public: public:
void operator()(const DeviceContext& context, const framework::LoDTensor& seq, void operator()(const DeviceContext& context, const framework::LoDTensor& seq,
framework::Tensor& padding, bool norm_by_times); framework::Tensor* padding, bool norm_by_times);
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class UnpaddingLoDTensorFunctor { class UnpaddingLoDTensorFunctor {
public: public:
void operator()(const DeviceContext& context, framework::LoDTensor& seq, void operator()(const DeviceContext& context, framework::LoDTensor* seq,
const framework::Tensor& padding, bool norm_by_times); const framework::Tensor& padding, bool norm_by_times);
}; };
......
...@@ -54,12 +54,12 @@ void TestSequencePadding(const paddle::framework::LoD& lod, ...@@ -54,12 +54,12 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
static_cast<int64_t>(sequence_width)}); static_cast<int64_t>(sequence_width)});
padding.mutable_data<T>(padding_dims, *place); padding.mutable_data<T>(padding_dims, *place);
paddle::operators::math::PaddingLoDTensorFunctor<DeviceContext, T>()( paddle::operators::math::PaddingLoDTensorFunctor<DeviceContext, T>()(
*context, seq, padding, false); *context, seq, &padding, false);
seq_back.set_lod(lod); seq_back.set_lod(lod);
seq_back.mutable_data<T>(seq_dims, *place); seq_back.mutable_data<T>(seq_dims, *place);
paddle::operators::math::UnpaddingLoDTensorFunctor<DeviceContext, T>()( paddle::operators::math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
*context, seq_back, padding, false); *context, &seq_back, padding, false);
if (paddle::platform::is_cpu_place(*place)) { if (paddle::platform::is_cpu_place(*place)) {
cpu_seq_back = seq_back; cpu_seq_back = seq_back;
......
...@@ -17,6 +17,8 @@ limitations under the License. */ ...@@ -17,6 +17,8 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
class MomentumOp : public framework::OperatorWithKernel { class MomentumOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -50,6 +52,12 @@ class MomentumOp : public framework::OperatorWithKernel { ...@@ -50,6 +52,12 @@ class MomentumOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("ParamOut", param_dim);
ctx->SetOutputDim("VelocityOut", param_dim); ctx->SetOutputDim("VelocityOut", param_dim);
} }
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
auto input_data_type =
framework::ToDataType(ctx.Input<Tensor>("Param")->type());
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
}; };
class MomentumOpMaker : public framework::OpProtoAndCheckerMaker { class MomentumOpMaker : public framework::OpProtoAndCheckerMaker {
......
...@@ -204,6 +204,8 @@ REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker, ...@@ -204,6 +204,8 @@ REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(mul_grad, ops::MulGradOp); REGISTER_OPERATOR(mul_grad, ops::MulGradOp);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>); mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>,
ops::MulKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
mul_grad, ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>); mul_grad, ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::MulGradKernel<paddle::platform::CPUDeviceContext, double>);
...@@ -18,6 +18,8 @@ limitations under the License. */ ...@@ -18,6 +18,8 @@ limitations under the License. */
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform; namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<plat::CUDADeviceContext, float>, REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<plat::CUDADeviceContext, float>,
ops::MulKernel<plat::CUDADeviceContext, double>,
ops::MulKernel<plat::CUDADeviceContext, plat::float16>); ops::MulKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(mul_grad, REGISTER_OP_CUDA_KERNEL(mul_grad,
ops::MulGradKernel<plat::CUDADeviceContext, float>); ops::MulGradKernel<plat::CUDADeviceContext, float>,
ops::MulGradKernel<plat::CUDADeviceContext, double>);
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/row_conv_op.h" #include "paddle/fluid/operators/row_conv_op.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_device_function.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/float16.h"
USE_NO_KERNEL_OP(save); USE_NO_KERNEL_OP(save);
USE_NO_KERNEL_OP(load); USE_NO_KERNEL_OP(load);
...@@ -61,3 +62,35 @@ TEST(SaveLoadOp, CPU) { ...@@ -61,3 +62,35 @@ TEST(SaveLoadOp, CPU) {
} }
} }
} }
TEST(SaveLoadFP16Op, CPU) {
paddle::framework::Scope scope;
paddle::platform::CPUPlace place;
auto var = scope.Var("test_var");
auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
tensor->Resize({3, 10});
float* expect = tensor->mutable_data<float>(place);
for (int64_t i = 0; i < tensor->numel(); ++i) {
expect[i] = static_cast<float>(paddle::platform::float16(i));
}
paddle::framework::AttributeMap attrs;
attrs.insert({"file_path", std::string("tensor.save")});
attrs.insert({"save_as_fp16", true});
auto save_op = paddle::framework::OpRegistry::CreateOp(
"save", {{"X", {"test_var"}}}, {}, attrs);
save_op->Run(scope, place);
auto load_var = scope.Var("out_var");
auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
auto load_op = paddle::framework::OpRegistry::CreateOp(
"load", {}, {{"Out", {"out_var"}}}, attrs);
load_op->Run(scope, place);
paddle::platform::float16* actual = target->data<paddle::platform::float16>();
for (int64_t i = 0; i < tensor->numel(); ++i) {
EXPECT_EQ(expect[i], static_cast<float>(actual[i]));
}
}
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include <numeric> #include <numeric>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -68,6 +69,7 @@ class SaveOp : public framework::OperatorBase { ...@@ -68,6 +69,7 @@ class SaveOp : public framework::OperatorBase {
const platform::Place &place) const override { const platform::Place &place) const override {
auto filename = Attr<std::string>("file_path"); auto filename = Attr<std::string>("file_path");
auto overwrite = Attr<bool>("overwrite"); auto overwrite = Attr<bool>("overwrite");
auto save_as_fp16 = Attr<bool>("save_as_fp16");
if (FileExists(filename) && !overwrite) { if (FileExists(filename) && !overwrite) {
PADDLE_THROW("%s is existed, cannot save to it when overwrite=false", PADDLE_THROW("%s is existed, cannot save to it when overwrite=false",
...@@ -96,7 +98,18 @@ class SaveOp : public framework::OperatorBase { ...@@ -96,7 +98,18 @@ class SaveOp : public framework::OperatorBase {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place); auto &dev_ctx = *pool.Get(place);
framework::SerializeToStream(fout, tensor, dev_ctx); auto in_dtype = framework::ToDataType(tensor.type());
auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
if (in_dtype != out_dtype) {
auto in_kernel_type = framework::OpKernelType(in_dtype, place);
auto out_kernel_type = framework::OpKernelType(out_dtype, place);
framework::LoDTensor out;
framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
framework::SerializeToStream(fout, out, dev_ctx);
} else {
framework::SerializeToStream(fout, tensor, dev_ctx);
}
} }
}; };
...@@ -114,6 +127,12 @@ This operator will serialize and write a tensor variable to file on disk. ...@@ -114,6 +127,12 @@ This operator will serialize and write a tensor variable to file on disk.
"(boolean, default true)" "(boolean, default true)"
"Overwrite the output file if exist") "Overwrite the output file if exist")
.SetDefault(true); .SetDefault(true);
AddAttr<bool>("save_as_fp16",
"(boolean, default false)"
"If true, the tensor will be converted to float16 data "
"type and then saved. Otherwise, the tensor will be "
"directly saved without data type conversion.")
.SetDefault(false);
AddAttr<std::string>("file_path", AddAttr<std::string>("file_path",
"(string)" "(string)"
"The \"file_path\" where the variable will be saved.") "The \"file_path\" where the variable will be saved.")
......
...@@ -35,7 +35,6 @@ class ScaleOp : public framework::OperatorWithKernel { ...@@ -35,7 +35,6 @@ class ScaleOp : public framework::OperatorWithKernel {
} }
}; };
template <typename AttrType>
class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker) ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
...@@ -47,9 +46,9 @@ Scale operator ...@@ -47,9 +46,9 @@ Scale operator
$$Out = scale*X$$ $$Out = scale*X$$
)DOC"); )DOC");
AddAttr<AttrType>("scale", AddAttr<float>("scale",
"(float, default 1.0)" "(float, default 1.0)"
"The scaling factor of the scale operator.") "The scaling factor of the scale operator.")
.SetDefault(1.0); .SetDefault(1.0);
} }
}; };
...@@ -73,8 +72,7 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker { ...@@ -73,8 +72,7 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker<float>, REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker);
ops::ScaleGradMaker);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
scale, ops::ScaleKernel<paddle::platform::CPUDeviceContext, float>, scale, ops::ScaleKernel<paddle::platform::CPUDeviceContext, float>,
ops::ScaleKernel<paddle::platform::CPUDeviceContext, double>, ops::ScaleKernel<paddle::platform::CPUDeviceContext, double>,
......
...@@ -113,15 +113,15 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs, ...@@ -113,15 +113,15 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs,
op->SetAttrMap(attrs); op->SetAttrMap(attrs);
} }
void StartServerNet(bool is_sparse) { void StartServerNet(bool is_sparse, std::atomic<bool> *initialized) {
f::Scope scope; f::Scope scope;
p::CPUPlace place; p::CPUPlace place;
VLOG(4) << "before init tensor";
if (is_sparse) { if (is_sparse) {
InitSelectedRowsInScope(place, &scope); InitSelectedRowsInScope(place, &scope);
} else { } else {
InitTensorsInScope(place, &scope); InitTensorsInScope(place, &scope);
} }
// sub program run in listen_and_serv_op, for simple test we use sum // sub program run in listen_and_serv_op, for simple test we use sum
f::ProgramDesc program; f::ProgramDesc program;
const auto &root_block = program.Block(0); const auto &root_block = program.Block(0);
...@@ -129,7 +129,6 @@ void StartServerNet(bool is_sparse) { ...@@ -129,7 +129,6 @@ void StartServerNet(bool is_sparse) {
auto *prefetch_block = program.AppendBlock(root_block); auto *prefetch_block = program.AppendBlock(root_block);
// X for server side tensors, RX for received tensors, must be of same shape. // X for server side tensors, RX for received tensors, must be of same shape.
AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block); AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block);
f::AttributeMap attrs; f::AttributeMap attrs;
attrs.insert({"endpoint", std::string("127.0.0.1:0")}); attrs.insert({"endpoint", std::string("127.0.0.1:0")});
attrs.insert({"Fanin", 1}); attrs.insert({"Fanin", 1});
...@@ -139,15 +138,22 @@ void StartServerNet(bool is_sparse) { ...@@ -139,15 +138,22 @@ void StartServerNet(bool is_sparse) {
attrs.insert({"PrefetchBlock", prefetch_block}); attrs.insert({"PrefetchBlock", prefetch_block});
attrs.insert({"grad_to_block_id", std::vector<std::string>({""})}); attrs.insert({"grad_to_block_id", std::vector<std::string>({""})});
attrs.insert({"sync_mode", true}); attrs.insert({"sync_mode", true});
VLOG(4) << "before init op";
listen_and_serv_op = listen_and_serv_op =
f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs); f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs);
*initialized = true;
listen_and_serv_op->Run(scope, place); listen_and_serv_op->Run(scope, place);
LOG(INFO) << "server exit"; LOG(INFO) << "server exit";
} }
TEST(SendRecvOp, CPUDense) { TEST(SendRecvOp, CPUDense) {
std::thread server_thread(StartServerNet, false); std::atomic<bool> initialized{false};
sleep(5); // wait server to start std::thread server_thread(StartServerNet, false, &initialized);
while (!initialized) {
}
static_cast<paddle::operators::ListenAndServOp *>(listen_and_serv_op.get())
->WaitServerReady();
// local net // local net
f::Scope scope; f::Scope scope;
p::CPUPlace place; p::CPUPlace place;
...@@ -156,9 +162,11 @@ TEST(SendRecvOp, CPUDense) { ...@@ -156,9 +162,11 @@ TEST(SendRecvOp, CPUDense) {
scope.Var("RPC_CLIENT_VAR"); scope.Var("RPC_CLIENT_VAR");
f::AttributeMap attrs; f::AttributeMap attrs;
selected_port = static_cast<paddle::operators::ListenAndServOp *>( auto *listen_and_serv_op_ptr =
listen_and_serv_op.get()) static_cast<paddle::operators::ListenAndServOp *>(
->GetSelectedPort(); listen_and_serv_op.get());
ASSERT_TRUE(listen_and_serv_op_ptr != nullptr);
selected_port = listen_and_serv_op_ptr->GetSelectedPort();
std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port); std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port);
attrs.insert({"endpoints", std::vector<std::string>({endpoint})}); attrs.insert({"endpoints", std::vector<std::string>({endpoint})});
attrs.insert({"epmap", std::vector<std::string>({endpoint})}); attrs.insert({"epmap", std::vector<std::string>({endpoint})});
...@@ -181,11 +189,21 @@ TEST(SendRecvOp, CPUDense) { ...@@ -181,11 +189,21 @@ TEST(SendRecvOp, CPUDense) {
listen_and_serv_op->Stop(); listen_and_serv_op->Stop();
server_thread.join(); server_thread.join();
listen_and_serv_op.reset(nullptr); listen_and_serv_op.reset(nullptr);
paddle::operators::ListenAndServOp::ResetPort();
} }
TEST(SendRecvOp, CPUSparse) { TEST(SendRecvOp, CPUSparse) {
std::thread server_thread(StartServerNet, true); std::atomic<bool> initialized;
sleep(3); // wait server to start initialized = false;
std::thread server_thread(StartServerNet, true, &initialized);
while (!initialized) {
}
auto *listen_and_serv_op_ptr =
static_cast<paddle::operators::ListenAndServOp *>(
listen_and_serv_op.get());
ASSERT_TRUE(listen_and_serv_op_ptr != nullptr);
listen_and_serv_op_ptr->WaitServerReady();
// local net // local net
f::Scope scope; f::Scope scope;
p::CPUPlace place; p::CPUPlace place;
...@@ -193,9 +211,7 @@ TEST(SendRecvOp, CPUSparse) { ...@@ -193,9 +211,7 @@ TEST(SendRecvOp, CPUSparse) {
InitSelectedRowsInScope(place, &scope); InitSelectedRowsInScope(place, &scope);
scope.Var("RPC_CLIENT_VAR"); scope.Var("RPC_CLIENT_VAR");
f::AttributeMap attrs; f::AttributeMap attrs;
selected_port = static_cast<paddle::operators::ListenAndServOp *>( selected_port = listen_and_serv_op_ptr->GetSelectedPort();
listen_and_serv_op.get())
->GetSelectedPort();
std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port); std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port);
attrs.insert({"endpoints", std::vector<std::string>({endpoint})}); attrs.insert({"endpoints", std::vector<std::string>({endpoint})});
attrs.insert({"epmap", std::vector<std::string>({endpoint})}); attrs.insert({"epmap", std::vector<std::string>({endpoint})});
...@@ -226,4 +242,5 @@ TEST(SendRecvOp, CPUSparse) { ...@@ -226,4 +242,5 @@ TEST(SendRecvOp, CPUSparse) {
listen_and_serv_op->Stop(); listen_and_serv_op->Stop();
server_thread.join(); server_thread.join();
listen_and_serv_op.reset(); listen_and_serv_op.reset();
paddle::operators::ListenAndServOp::ResetPort();
} }
...@@ -48,6 +48,24 @@ class SGDOp : public framework::OperatorWithKernel { ...@@ -48,6 +48,24 @@ class SGDOp : public framework::OperatorWithKernel {
} }
}; };
class SGDOpInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
auto input_var = op_desc.Input("Param")[0];
for (auto& out_var : op_desc.Output("ParamOut")) {
if (block->FindRecursiveOrCreateVar(input_var).GetType() ==
framework::proto::VarType::SELECTED_ROWS) {
block->FindRecursiveOrCreateVar(out_var).SetType(
framework::proto::VarType::SELECTED_ROWS);
} else {
block->FindRecursiveOrCreateVar(out_var).SetType(
framework::proto::VarType::LOD_TENSOR);
}
}
}
};
class SGDOpMaker : public framework::OpProtoAndCheckerMaker { class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
SGDOpMaker(OpProto* proto, OpAttrChecker* op_checker) SGDOpMaker(OpProto* proto, OpAttrChecker* op_checker)
...@@ -74,5 +92,6 @@ $$param\_out = param - learning\_rate * grad$$ ...@@ -74,5 +92,6 @@ $$param\_out = param - learning\_rate * grad$$
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(sgd, ops::SGDOp, ops::SGDOpMaker); REGISTER_OPERATOR(sgd, ops::SGDOp, ops::SGDOpMaker,
paddle::framework::EmptyGradOpMaker, ops::SGDOpInferVarType);
REGISTER_OP_CPU_KERNEL(sgd, ops::SGDOpKernel<float>, ops::SGDOpKernel<double>); REGISTER_OP_CPU_KERNEL(sgd, ops::SGDOpKernel<float>, ops::SGDOpKernel<double>);
...@@ -164,7 +164,9 @@ REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, ...@@ -164,7 +164,9 @@ REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad); REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
softmax, ops::SoftmaxKernel<paddle::platform::CPUDeviceContext, float>); softmax, ops::SoftmaxKernel<paddle::platform::CPUDeviceContext, float>,
ops::SoftmaxKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
softmax_grad, softmax_grad,
ops::SoftmaxGradKernel<paddle::platform::CPUDeviceContext, float>); ops::SoftmaxGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::SoftmaxGradKernel<paddle::platform::CPUDeviceContext, double>);
...@@ -19,6 +19,8 @@ namespace ops = paddle::operators; ...@@ -19,6 +19,8 @@ namespace ops = paddle::operators;
namespace plat = paddle::platform; namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
softmax, ops::SoftmaxKernel<plat::CUDADeviceContext, float>, softmax, ops::SoftmaxKernel<plat::CUDADeviceContext, float>,
ops::SoftmaxKernel<plat::CUDADeviceContext, double>,
ops::SoftmaxKernel<plat::CUDADeviceContext, plat::float16>); ops::SoftmaxKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(softmax_grad, REGISTER_OP_CUDA_KERNEL(
ops::SoftmaxGradKernel<plat::CUDADeviceContext, float>); softmax_grad, ops::SoftmaxGradKernel<plat::CUDADeviceContext, float>,
ops::SoftmaxGradKernel<plat::CUDADeviceContext, double>);
...@@ -75,4 +75,5 @@ namespace ops = paddle::operators; ...@@ -75,4 +75,5 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(top_k, ops::TopkOp, ops::TopkOpMaker, REGISTER_OPERATOR(top_k, ops::TopkOp, ops::TopkOpMaker,
paddle::framework::EmptyGradOpMaker); paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(top_k, REGISTER_OP_CPU_KERNEL(top_k,
ops::TopkKernel<paddle::platform::CPUPlace, float>); ops::TopkKernel<paddle::platform::CPUPlace, float>,
ops::TopkKernel<paddle::platform::CPUPlace, double>);
...@@ -318,4 +318,5 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> { ...@@ -318,4 +318,5 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel<float>); REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel<float>,
paddle::operators::TopkOpCUDAKernel<double>);
...@@ -116,11 +116,31 @@ uniform distribution. ...@@ -116,11 +116,31 @@ uniform distribution.
.SetDefault(framework::proto::VarType::FP32); .SetDefault(framework::proto::VarType::FP32);
} }
}; };
class UniformRandomOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
auto out_var_name = op_desc.Output("Out").front();
if (block->FindRecursiveOrCreateVar(out_var_name).GetType() ==
framework::proto::VarType::SELECTED_ROWS) {
block->FindRecursiveOrCreateVar(out_var_name)
.SetType(framework::proto::VarType::SELECTED_ROWS);
} else {
block->FindRecursiveOrCreateVar(out_var_name)
.SetType(framework::proto::VarType::LOD_TENSOR);
}
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OP_WITHOUT_GRADIENT(uniform_random, paddle::operators::UniformRandomOp, REGISTER_OPERATOR(uniform_random, paddle::operators::UniformRandomOp,
paddle::operators::UniformRandomOpMaker); paddle::operators::UniformRandomOpMaker,
paddle::framework::EmptyGradOpMaker,
paddle::operators::UniformRandomOpVarTypeInference);
REGISTER_OP_CPU_KERNEL(uniform_random, REGISTER_OP_CPU_KERNEL(uniform_random,
paddle::operators::CPUUniformRandomKernel<float>, paddle::operators::CPUUniformRandomKernel<float>,
paddle::operators::CPUUniformRandomKernel<double>); paddle::operators::CPUUniformRandomKernel<double>);
......
...@@ -162,7 +162,7 @@ class WarpCTCKernel : public framework::OpKernel<T> { ...@@ -162,7 +162,7 @@ class WarpCTCKernel : public framework::OpKernel<T> {
static_cast<int64_t>(sequence_width)}); static_cast<int64_t>(sequence_width)});
warpctc_logits.mutable_data<T>(warpctc_logits_dims, ctx.GetPlace()); warpctc_logits.mutable_data<T>(warpctc_logits_dims, ctx.GetPlace());
math::PaddingLoDTensorFunctor<DeviceContext, T>()( math::PaddingLoDTensorFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), *logits, warpctc_logits, ctx.template device_context<DeviceContext>(), *logits, &warpctc_logits,
false); false);
const T* warpctc_logits_data = warpctc_logits.data<T>(); const T* warpctc_logits_data = warpctc_logits.data<T>();
...@@ -217,7 +217,7 @@ class WarpCTCGradKernel : public framework::OpKernel<T> { ...@@ -217,7 +217,7 @@ class WarpCTCGradKernel : public framework::OpKernel<T> {
logits_grad->mutable_data<T>(ctx.GetPlace()); logits_grad->mutable_data<T>(ctx.GetPlace());
bool norm_by_times = ctx.Attr<bool>("norm_by_times"); bool norm_by_times = ctx.Attr<bool>("norm_by_times");
math::UnpaddingLoDTensorFunctor<DeviceContext, T>()( math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), *logits_grad, ctx.template device_context<DeviceContext>(), logits_grad,
*warpctc_grad, norm_by_times); *warpctc_grad, norm_by_times);
const T* loss_grad_data = loss_grad->data<T>(); const T* loss_grad_data = loss_grad->data<T>();
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cuda.h>
namespace paddle {
namespace platform {
// __shfl_down and __shfl have been deprecated as of CUDA 9.0.
#if CUDA_VERSION < 9000
template <typename T>
__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
return __shfl_down(val, delta);
}
template <typename T>
__forceinline__ __device__ T __shfl_sync(unsigned, T val, int src_line,
int width) {
return __shfl(val, src_line, width);
}
#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
#else
#define FULL_WARP_MASK 0xFFFFFFFF
#define CREATE_SHFL_MASK(mask, predicate) \
mask = __ballot_sync(FULL_WARP_MASK, (predicate))
#endif
template <typename T>
__device__ T reduceSum(T val, int tid, int len) {
// NOTE(zcd): The warp size should be taken from the device
// properties of the GPU rather than hard-coded as 32.
// To make reduceSum more efficient, warp-level parallelism is used
// here, assuming a warp size of 32; this may differ on some GPUs,
// but most cards use a warp size of 32.
const int warpSize = 32;
__shared__ T shm[warpSize];
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, tid < len);
for (int offset = warpSize / 2; offset > 0; offset /= 2)
val += platform::__shfl_down_sync(mask, val, offset);
  if (tid < warpSize) shm[tid] = 0;
  // Make sure the zero-initialization of shm is visible to all threads before
  // the per-warp partial sums are written into it.
  __syncthreads();
  if (tid % warpSize == 0) {
shm[tid / warpSize] = val;
}
__syncthreads();
CREATE_SHFL_MASK(mask, tid < warpSize);
if (tid < warpSize) {
val = shm[tid];
for (int offset = warpSize / 2; offset > 0; offset /= 2)
val += platform::__shfl_down_sync(mask, val, offset);
}
return val;
}
} // namespace platform
} // namespace paddle
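A minimal usage sketch for reduceSum and the shuffle wrappers above; the kernel below is hypothetical and only illustrates the calling convention (one value per thread, the thread id, and the number of valid elements):

  // Block-wide sum of the first `len` inputs; thread 0 writes the result.
  __global__ void BlockSumKernel(const float* in, float* out, int len) {
    int tid = threadIdx.x;
    float val = (tid < len) ? in[tid] : 0.0f;
    val = paddle::platform::reduceSum(val, tid, len);
    if (tid == 0) {
      *out = val;
    }
  }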
...@@ -66,22 +66,5 @@ CUDA_ATOMIC_WRAPPER(Add, double) { ...@@ -66,22 +66,5 @@ CUDA_ATOMIC_WRAPPER(Add, double) {
} }
#endif #endif
// __shfl_down has been deprecated as of CUDA 9.0.
#if CUDA_VERSION < 9000
template <typename T>
__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
return __shfl_down(val, delta);
}
#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
#else
template <typename T>
__forceinline__ __device__ T __shfl_down_sync(unsigned mask, T val, int delta) {
return __shfl_down(mask, val, delta);
}
#define FULL_WARP_MASK 0xFFFFFFFF
#define CREATE_SHFL_MASK(mask, predicate) \
mask = __ballot_sync(FULL_WARP_MASK, (predicate))
#endif
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -18,7 +18,6 @@ limitations under the License. */ ...@@ -18,7 +18,6 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/profiler.pb.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
......
...@@ -502,11 +502,11 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -502,11 +502,11 @@ All parameter, weight, gradient are variables in Paddle.
const std::unordered_set<std::string> &bcast_vars, const std::unordered_set<std::string> &bcast_vars,
const ProgramDesc &main_program, const std::string &loss_var_name, const ProgramDesc &main_program, const std::string &loss_var_name,
Scope *scope, std::vector<Scope *> &local_scopes, Scope *scope, std::vector<Scope *> &local_scopes,
bool allow_op_delay, bool customize_loss_grad) { bool allow_op_delay, bool use_default_grad_scale) {
new (&self) ParallelExecutor(num_threads, use_event, places, new (&self) ParallelExecutor(
params, bcast_vars, main_program, num_threads, use_event, places, params, bcast_vars,
loss_var_name, scope, local_scopes, main_program, loss_var_name, scope, local_scopes,
allow_op_delay, customize_loss_grad); allow_op_delay, use_default_grad_scale);
}) })
.def("bcast_params", &ParallelExecutor::BCastParamsToGPUs) .def("bcast_params", &ParallelExecutor::BCastParamsToGPUs)
// NOTE: even we return a vec<Scope*>* to Python use reference policy. // NOTE: even we return a vec<Scope*>* to Python use reference policy.
......
...@@ -107,7 +107,7 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) { ...@@ -107,7 +107,7 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) {
return self.data<T>()[offset]; return self.data<T>()[offset];
} else { } else {
std::shared_ptr<framework::Tensor> dst(new framework::Tensor); std::shared_ptr<framework::Tensor> dst(new framework::Tensor);
framework::TensorCopy(self, platform::CPUPlace(), dst.get()); framework::TensorCopySync(self, platform::CPUPlace(), dst.get());
return dst->data<T>()[offset]; return dst->data<T>()[offset];
} }
} }
...@@ -117,9 +117,9 @@ template <typename T> ...@@ -117,9 +117,9 @@ template <typename T>
void TensorSetElement(framework::Tensor *self, size_t offset, T elem) { void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
if (platform::is_gpu_place(self->place())) { if (platform::is_gpu_place(self->place())) {
std::shared_ptr<framework::Tensor> dst(new framework::Tensor); std::shared_ptr<framework::Tensor> dst(new framework::Tensor);
framework::TensorCopy(*self, platform::CPUPlace(), dst.get()); framework::TensorCopySync(*self, platform::CPUPlace(), dst.get());
dst->data<T>()[offset] = elem; dst->data<T>()[offset] = elem;
framework::TensorCopy(*dst.get(), self->place(), self); framework::TensorCopySync(*dst.get(), self->place(), self);
} else if (platform::is_cpu_place(self->place())) { } else if (platform::is_cpu_place(self->place())) {
self->data<T>()[offset] = elem; self->data<T>()[offset] = elem;
......
...@@ -40,6 +40,7 @@ function print_usage() { ...@@ -40,6 +40,7 @@ function print_usage() {
${BLUE}capi${NONE}: generate paddle CAPI package ${BLUE}capi${NONE}: generate paddle CAPI package
${BLUE}fluid_inference_lib${NONE}: deploy fluid inference library ${BLUE}fluid_inference_lib${NONE}: deploy fluid inference library
${BLUE}check_style${NONE}: run code style check ${BLUE}check_style${NONE}: run code style check
${BLUE}cicheck${NONE}: run CI tasks
" "
} }
...@@ -453,6 +454,8 @@ function gen_capi_package() { ...@@ -453,6 +454,8 @@ function gen_capi_package() {
} }
function gen_fluid_inference_lib() { function gen_fluid_inference_lib() {
mkdir -p ${PADDLE_ROOT}/build
cd ${PADDLE_ROOT}/build
if [ ${WITH_C_API:-OFF} == "OFF" ] ; then if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
cat <<EOF cat <<EOF
======================================== ========================================
...@@ -503,6 +506,13 @@ function main() { ...@@ -503,6 +506,13 @@ function main() {
check_style) check_style)
check_style check_style
;; ;;
cicheck)
cmake_gen ${PYTHON_ABI:-""}
build
run_test
gen_capi_package
gen_fluid_inference_lib
;;
*) *)
print_usage print_usage
exit 0 exit 0
......
...@@ -21,8 +21,7 @@ import executor ...@@ -21,8 +21,7 @@ import executor
from executor import * from executor import *
import trainer import trainer
from trainer import Trainer from trainer import *
from trainer import Event
import inferencer import inferencer
from inferencer import Inferencer from inferencer import Inferencer
......
...@@ -137,8 +137,6 @@ def split_dense_variable(var_list, ...@@ -137,8 +137,6 @@ def split_dense_variable(var_list,
class DistributeTranspiler: class DistributeTranspiler:
def transpile(self, def transpile(self,
optimize_ops,
params_grads,
trainer_id, trainer_id,
program=None, program=None,
pservers="127.0.0.1:6174", pservers="127.0.0.1:6174",
...@@ -169,11 +167,6 @@ class DistributeTranspiler: ...@@ -169,11 +167,6 @@ class DistributeTranspiler:
4. append ops that should run on current server instance. 4. append ops that should run on current server instance.
5. add listen_and_serv op 5. add listen_and_serv op
:param optimize_ops: op list of optimization, should be the
return value of Optimizer.minimize
:type optimize_ops: list
:param params_grads: list of tuple(weight, gradient)
:type params_grads: list
:param trainer_id: one unique id for each trainer in a job. :param trainer_id: one unique id for each trainer in a job.
:type trainer_id: int :type trainer_id: int
:param program: program to transpile, default is default_main_program :param program: program to transpile, default is default_main_program
...@@ -194,7 +187,6 @@ class DistributeTranspiler: ...@@ -194,7 +187,6 @@ class DistributeTranspiler:
program = default_main_program() program = default_main_program()
self.origin_program = program self.origin_program = program
self.trainer_num = trainers self.trainer_num = trainers
self.optimize_ops = optimize_ops
self.sync_mode = sync_mode self.sync_mode = sync_mode
# TODO(typhoonzero): currently trainer_id is fetched from cluster system # TODO(typhoonzero): currently trainer_id is fetched from cluster system
# like Kubernetes, we should port this to use etcd later when developing # like Kubernetes, we should port this to use etcd later when developing
...@@ -202,6 +194,7 @@ class DistributeTranspiler: ...@@ -202,6 +194,7 @@ class DistributeTranspiler:
self.trainer_id = trainer_id self.trainer_id = trainer_id
pserver_endpoints = pservers.split(",") pserver_endpoints = pservers.split(",")
self.pserver_endpoints = pserver_endpoints self.pserver_endpoints = pserver_endpoints
self.optimize_ops, params_grads = self._get_optimize_pass()
# process lookup_table_op # process lookup_table_op
# 1. check all lookup_table_op is distributed # 1. check all lookup_table_op is distributed
...@@ -408,11 +401,8 @@ class DistributeTranspiler: ...@@ -408,11 +401,8 @@ class DistributeTranspiler:
# HACK: optimization global ops only used to scale beta1 and beta2 # HACK: optimization global ops only used to scale beta1 and beta2
# replace it with dependency engine. # replace it with dependency engine.
for op in self.optimize_ops: for op in self.optimize_ops:
if op.type == "scale": if self._is_adam_connected_op(op):
for in_name in op.input_arg_names: global_ops.append(op)
if in_name.startswith("beta1_pow_acc") or \
in_name.startswith("beta2_pow_acc"):
global_ops.append(op)
def __append_optimize_op__(op, block, grad_to_block_id): def __append_optimize_op__(op, block, grad_to_block_id):
if self._is_opt_op(op): if self._is_opt_op(op):
...@@ -661,7 +651,7 @@ class DistributeTranspiler: ...@@ -661,7 +651,7 @@ class DistributeTranspiler:
shape=trainer_out.shape, shape=trainer_out.shape,
dtype=trainer_out.dtype) dtype=trainer_out.dtype)
prefetch_block.append_op( prefetch_block.append_op(
type=LOOKUP_TABLE_TYPE, type="lookup_sparse_table",
inputs={'Ids': pserver_ids, inputs={'Ids': pserver_ids,
"W": table_var}, "W": table_var},
outputs={"Out": pserver_out}, outputs={"Out": pserver_out},
...@@ -685,9 +675,14 @@ class DistributeTranspiler: ...@@ -685,9 +675,14 @@ class DistributeTranspiler:
# STEP: create table optimize block # STEP: create table optimize block
# create table param and grad var in pserver program # create table param and grad var in pserver program
param_var = _clone_var( origin_param_var = self.origin_program.global_block().vars[
pserver_program.global_block(), self.table_name]
self.origin_program.global_block().vars[self.table_name]) param_var = pserver_program.global_block().create_var(
name=origin_param_var.name,
shape=origin_param_var.shape,
dtype=origin_param_var.dtype,
type=core.VarDesc.VarType.SELECTED_ROWS,
persistable=True)
grad_var = _clone_var( grad_var = _clone_var(
pserver_program.global_block(), pserver_program.global_block(),
self.origin_program.global_block().vars[framework.grad_var_name( self.origin_program.global_block().vars[framework.grad_var_name(
...@@ -1142,3 +1137,32 @@ class DistributeTranspiler: ...@@ -1142,3 +1137,32 @@ class DistributeTranspiler:
# we only need to append op for once # we only need to append op for once
break break
return lr_ops return lr_ops
def _get_optimize_pass(self):
block = self.origin_program.global_block()
opt_ops = []
params_grads = []
for op in block.ops:
if self._is_opt_op(op):
opt_ops.append(op)
params_grads.append((self.origin_program.global_block().var(
op.input("Param")[0]),
self.origin_program.global_block().var(
op.input("Grad")[0])))
elif self._is_adam_connected_op(op):
opt_ops.append(op)
else:
pass
return opt_ops, params_grads
def _is_adam_connected_op(self, op):
"""
A hack function to determine whether the input operator
is connected to an optimize operator.
"""
if op.type == "scale":
for in_name in op.input_arg_names:
if in_name.startswith("beta1_pow_acc") or \
in_name.startswith("beta2_pow_acc"):
return True
return False
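For reference, a minimal sketch of calling transpile() under the new signature, which now derives the optimize ops and params_grads internally via _get_optimize_pass(); the endpoints and role handling below are placeholders modeled on the test updates in this commit, and get_trainer_program() is assumed to be available:

import os
import paddle.fluid as fluid

t = fluid.DistributeTranspiler()
# optimize_ops and params_grads no longer need to be passed in
t.transpile(
    trainer_id=int(os.getenv("PADDLE_INIT_TRAINER_ID", "0")),
    pservers="127.0.0.1:6174",
    trainers=2)

if os.getenv("TRAINING_ROLE", "TRAINER") == "PSERVER":
    # parameter-server side: program and startup program for this endpoint
    pserver_prog = t.get_pserver_program("127.0.0.1:6174")
    pserver_startup = t.get_startup_program("127.0.0.1:6174", pserver_prog)
else:
    # trainer side: trimmed main program with send/recv ops inserted
    trainer_prog = t.get_trainer_program()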
...@@ -50,8 +50,6 @@ def data(name, ...@@ -50,8 +50,6 @@ def data(name,
dtype(int|float): The type of data : float32, float_16, int etc dtype(int|float): The type of data : float32, float_16, int etc
type(VarType): The output type. By default it is LOD_TENSOR. type(VarType): The output type. By default it is LOD_TENSOR.
lod_level(int): The LoD Level. 0 means the input data is not a sequence. lod_level(int): The LoD Level. 0 means the input data is not a sequence.
main_program(Program): Name of the main program that calls this
startup_program(Program): Name of the startup program
stop_gradient(bool): A boolean that mentions whether gradient should flow. stop_gradient(bool): A boolean that mentions whether gradient should flow.
Returns: Returns:
...@@ -74,13 +72,15 @@ def data(name, ...@@ -74,13 +72,15 @@ def data(name,
if append_batch_size: if append_batch_size:
shape = [-1] + shape # append batch size as -1 shape = [-1] + shape # append batch size as -1
return helper.create_global_variable( data_var = helper.create_global_variable(
name=name, name=name,
shape=shape, shape=shape,
dtype=dtype, dtype=dtype,
type=type, type=type,
stop_gradient=stop_gradient, stop_gradient=stop_gradient,
lod_level=lod_level) lod_level=lod_level)
data_var.is_data = True
return data_var
class BlockGuardServ(BlockGuard): class BlockGuardServ(BlockGuard):
...@@ -168,7 +168,9 @@ class ListenAndServ(object): ...@@ -168,7 +168,9 @@ class ListenAndServ(object):
'endpoint': self.endpoint, 'endpoint': self.endpoint,
'Fanin': self.fan_in, 'Fanin': self.fan_in,
'OptimizeBlock': current_block, 'OptimizeBlock': current_block,
'PrefetchBlock': empty_block 'PrefetchBlock': empty_block,
'sync_mode': True,  # async mode is not supported in layers yet
'grad_to_block_id': [""]
}) })
......
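An illustrative sketch of what the new is_data flag enables: variables created by fluid.layers.data can now be discovered generically, which is how the Trainer added later in this commit builds its default feed list; the variable names here are hypothetical:

import paddle.fluid as fluid

img = fluid.layers.data(name='img', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')

# collect all feed variables without naming them explicitly
feed_vars = [
    var for var in fluid.default_main_program().global_block().vars.values()
    if getattr(var, 'is_data', False)
]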
...@@ -169,7 +169,9 @@ def monkey_patch_variable(): ...@@ -169,7 +169,9 @@ def monkey_patch_variable():
# a*b == b*a. Do not need to reverse explicitly # a*b == b*a. Do not need to reverse explicitly
("__rmul__", "elementwise_mul", False), ("__rmul__", "elementwise_mul", False),
("__div__", "elementwise_div", False), ("__div__", "elementwise_div", False),
("__truediv__", "elementwise_div", False),
("__rdiv__", "elementwise_div", True), ("__rdiv__", "elementwise_div", True),
("__rtruediv__", "elementwise_div", True),
("__pow__", "elementwise_pow", False), ("__pow__", "elementwise_pow", False),
("__rpow__", "elementwise_pow", True), ("__rpow__", "elementwise_pow", True),
# for logical compare # for logical compare
......
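With __truediv__ and __rtruediv__ registered, division on a Variable also lowers to elementwise_div under Python 3's true-division semantics; a small sketch with a hypothetical input:

import paddle.fluid as fluid

pixels = fluid.layers.data(name='pixels', shape=[3, 32, 32], dtype='float32')
# both expressions create elementwise_div ops, regardless of whether the
# interpreter dispatches to __div__/__rdiv__ or __truediv__/__rtruediv__
normalized = pixels / 255.0
reciprocal = 1.0 / (pixels + 1.0)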
...@@ -1496,6 +1496,7 @@ def batch_norm(input, ...@@ -1496,6 +1496,7 @@ def batch_norm(input,
bias_attr=None, bias_attr=None,
data_layout='NCHW', data_layout='NCHW',
in_place=False, in_place=False,
use_mkldnn=False,
name=None, name=None,
moving_mean_name=None, moving_mean_name=None,
moving_variance_name=None, moving_variance_name=None,
...@@ -1574,9 +1575,12 @@ def batch_norm(input, ...@@ -1574,9 +1575,12 @@ def batch_norm(input,
"SavedMean": saved_mean, "SavedMean": saved_mean,
"SavedVariance": saved_variance "SavedVariance": saved_variance
}, },
attrs={"momentum": momentum, attrs={
"epsilon": epsilon, "momentum": momentum,
"is_test": is_test}) "epsilon": epsilon,
"is_test": is_test,
"use_mkldnn": use_mkldnn
})
return helper.append_activation(batch_norm_out) return helper.append_activation(batch_norm_out)
......
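A minimal sketch of passing the new use_mkldnn flag through the Python layer; the surrounding network is a placeholder:

import paddle.fluid as fluid

image = fluid.layers.data(name='image', shape=[3, 224, 224], dtype='float32')
conv = fluid.layers.conv2d(input=image, num_filters=16, filter_size=3)
# use_mkldnn is forwarded into the batch_norm op attributes added above
bn = fluid.layers.batch_norm(input=conv, use_mkldnn=True)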
...@@ -28,7 +28,8 @@ from contextlib import contextmanager ...@@ -28,7 +28,8 @@ from contextlib import contextmanager
__all__ = [ __all__ = [
'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad',
'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer', 'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'Adadelta', 'ModelAverage' 'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'Adadelta', 'ModelAverage',
'Optimizer'
] ]
......
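Exporting Optimizer makes type checks like the one performed by the new Trainer available to user code; a short sketch:

import paddle.fluid as fluid

sgd = fluid.optimizer.SGD(learning_rate=0.001)
# Optimizer is now part of the module's public API
assert isinstance(sgd, fluid.optimizer.Optimizer)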
...@@ -30,7 +30,7 @@ class ParallelExecutor(object): ...@@ -30,7 +30,7 @@ class ParallelExecutor(object):
num_threads=None, num_threads=None,
allow_op_delay=False, allow_op_delay=False,
share_vars_from=None, share_vars_from=None,
customize_loss_grad=False): use_default_grad_scale=True):
""" """
ParallelExecutor can run program in parallel. ParallelExecutor can run program in parallel.
...@@ -46,6 +46,11 @@ class ParallelExecutor(object): ...@@ -46,6 +46,11 @@ class ParallelExecutor(object):
improve performance in some cases, default False. improve performance in some cases, default False.
share_vars_from(ParallelExecutor, default None): If provided, share_vars_from(ParallelExecutor, default None): If provided,
it will share variables from the specified ParallelExecutor. it will share variables from the specified ParallelExecutor.
use_default_grad_scale(bool, default True): If set to True, a default
scale value equal to `1./device_count` is multiplied into the
gradients of each device before the scaled gradients are
aggregated. Otherwise, a customized scale value should be fed
to the network.
Returns: Returns:
A ParallelExecutor object. A ParallelExecutor object.
...@@ -124,7 +129,7 @@ class ParallelExecutor(object): ...@@ -124,7 +129,7 @@ class ParallelExecutor(object):
scope, scope,
local_scopes, local_scopes,
allow_op_delay, allow_op_delay,
customize_loss_grad) use_default_grad_scale)
self.scope = scope self.scope = scope
def run(self, fetch_list, feed=None, feed_dict=None): def run(self, fetch_list, feed=None, feed_dict=None):
......
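A hedged usage sketch of the renamed flag; apart from use_default_grad_scale, the constructor arguments (use_cuda, loss_name) and the tiny network are assumptions made for illustration, and the example assumes a CUDA build:

import paddle.fluid as fluid

img = fluid.layers.data(name='img', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
prediction = fluid.layers.fc(input=img, size=10, act='softmax')
avg_cost = fluid.layers.mean(fluid.layers.cross_entropy(input=prediction, label=label))
fluid.optimizer.SGD(learning_rate=0.01).minimize(avg_cost)

place = fluid.CUDAPlace(0)
fluid.Executor(place).run(fluid.default_startup_program())

# keep the default 1./device_count gradient scaling; pass False to feed a
# customized scale value to the network instead
pe = fluid.ParallelExecutor(
    use_cuda=True, loss_name=avg_cost.name, use_default_grad_scale=True)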
...@@ -80,12 +80,7 @@ def train(use_cuda, save_dirname, is_local): ...@@ -80,12 +80,7 @@ def train(use_cuda, save_dirname, is_local):
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile( t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
optimize_ops,
params_grads,
trainer_id,
pservers=pserver_endpoints,
trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
pserver_prog = t.get_pserver_program(current_endpoint) pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint, pserver_startup = t.get_startup_program(current_endpoint,
......
...@@ -189,12 +189,7 @@ def train(net_type, use_cuda, save_dirname, is_local): ...@@ -189,12 +189,7 @@ def train(net_type, use_cuda, save_dirname, is_local):
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile( t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
optimize_ops,
params_grads,
trainer_id,
pservers=pserver_endpoints,
trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
pserver_prog = t.get_pserver_program(current_endpoint) pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint, pserver_startup = t.get_startup_program(current_endpoint,
......
...@@ -259,12 +259,7 @@ def train(use_cuda, save_dirname=None, is_local=True): ...@@ -259,12 +259,7 @@ def train(use_cuda, save_dirname=None, is_local=True):
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile( t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
optimize_ops,
params_grads,
trainer_id,
pservers=pserver_endpoints,
trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
pserver_prog = t.get_pserver_program(current_endpoint) pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint, pserver_startup = t.get_startup_program(current_endpoint,
......
...@@ -231,12 +231,7 @@ def train_main(use_cuda, is_sparse, is_local=True): ...@@ -231,12 +231,7 @@ def train_main(use_cuda, is_sparse, is_local=True):
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile( t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
optimize_ops,
params_grads,
trainer_id,
pservers=pserver_endpoints,
trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
pserver_prog = t.get_pserver_program(current_endpoint) pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint, pserver_startup = t.get_startup_program(current_endpoint,
......
...@@ -162,12 +162,7 @@ def train(nn_type, ...@@ -162,12 +162,7 @@ def train(nn_type,
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile( t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
optimize_ops,
params_grads,
trainer_id,
pservers=pserver_endpoints,
trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
pserver_prog = t.get_pserver_program(current_endpoint) pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint, pserver_startup = t.get_startup_program(current_endpoint,
......
...@@ -261,12 +261,7 @@ def train(use_cuda, save_dirname, is_local=True): ...@@ -261,12 +261,7 @@ def train(use_cuda, save_dirname, is_local=True):
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile( t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
optimize_ops,
params_grads,
trainer_id,
pservers=pserver_endpoints,
trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
pserver_prog = t.get_pserver_program(current_endpoint) pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint, pserver_startup = t.get_startup_program(current_endpoint,
......
...@@ -213,12 +213,7 @@ def train(word_dict, ...@@ -213,12 +213,7 @@ def train(word_dict,
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile( t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
optimize_ops,
params_grads,
trainer_id,
pservers=pserver_endpoints,
trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
pserver_prog = t.get_pserver_program(current_endpoint) pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint, pserver_startup = t.get_startup_program(current_endpoint,
......
...@@ -145,12 +145,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True): ...@@ -145,12 +145,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
training_role = os.getenv("TRAINING_ROLE", "TRAINER") training_role = os.getenv("TRAINING_ROLE", "TRAINER")
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
t.transpile( t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
optimize_ops,
params_grads,
trainer_id,
pservers=pserver_endpoints,
trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
pserver_prog = t.get_pserver_program(current_endpoint) pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint, pserver_startup = t.get_startup_program(current_endpoint,
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.fluid as fluid
import numpy as np
import math
import sys
from functools import partial
PASS_NUM = 100
EMBED_SIZE = 32
HIDDEN_SIZE = 256
N = 5
BATCH_SIZE = 32
def create_random_lodtensor(lod, place, low, high):
# The range of data elements is [low, high]
data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
res = fluid.LoDTensor()
res.set(data, place)
res.set_lod([lod])
return res
word_dict = paddle.dataset.imikolov.build_dict()
dict_size = len(word_dict)
def inference_network(is_sparse):
first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64')
embed_first = fluid.layers.embedding(
input=first_word,
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_second = fluid.layers.embedding(
input=second_word,
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_third = fluid.layers.embedding(
input=third_word,
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_forth = fluid.layers.embedding(
input=forth_word,
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
concat_embed = fluid.layers.concat(
input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
hidden1 = fluid.layers.fc(input=concat_embed,
size=HIDDEN_SIZE,
act='sigmoid')
predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax')
return predict_word
def train_network(is_sparse):
next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
predict_word = inference_network(is_sparse)
cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
avg_cost = fluid.layers.mean(cost)
return avg_cost
def train(use_cuda, is_sparse, save_path):
train_reader = paddle.batch(
paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
def event_handler(event):
print type(event)
if isinstance(event, fluid.EndEpochEvent):
avg_cost = trainer.test(reader=paddle.dataset.imikolov.test(
word_dict, N))
if avg_cost < 5.0:
trainer.params.save(save_path)
return
if math.isnan(avg_cost):
sys.exit("got NaN loss, training failed.")
trainer = fluid.Trainer(
partial(train_network, is_sparse),
fluid.optimizer.SGD(learning_rate=0.001),
place=place)
trainer.train(
reader=train_reader, num_epochs=100, event_handler=event_handler)
def infer(use_cuda, save_path):
params = fluid.Params(save_path)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(inference_network, params, place=place)
lod = [0, 1]
first_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1)
second_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1)
third_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1)
fourth_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1)
result = inferencer.infer({
'firstw': first_word,
'secondw': second_word,
'thirdw': third_word,
'forthw': fourth_word
})
print(result)
def main(use_cuda, is_sparse):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "word2vec.inference.model"
train(use_cuda, is_sparse, save_path)
infer(use_cuda, save_path)
if __name__ == '__main__':
for use_cuda in (False, True):
for is_sparse in (False, True):
main(use_cuda=use_cuda, is_sparse=is_sparse)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle.fluid.core as core
from paddle.fluid.op import Operator
import paddle.fluid as fluid
from op_test import OpTest
from paddle.fluid.framework import grad_var_name
from test_batch_norm_op import TestBatchNormOpInference, TestBatchNormOpTraining, _reference_training, _reference_grad
class TestMKLDNNBatchNormOpTraining(TestBatchNormOpTraining):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_formats = ["NCHW"]
def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance,
epsilon, momentum, shape, data_layout):
# run forward
y, saved_mean, saved_variance = _reference_training(
x, scale, bias, epsilon, data_layout)
mean_out = saved_mean * (1. - momentum) + momentum * mean
variance_out = saved_variance * (1. - momentum) + momentum * variance
# run backward
x_grad, scale_grad, bias_grad = _reference_grad(
x, y_grad, scale, saved_mean, saved_variance, epsilon, data_layout)
return y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad
class TestMKLDNNBatchNormOpInference(TestBatchNormOpInference):
def init_kernel_type(self):
self.use_mkldnn = True
def test_check_output(self):
place = core.CPUPlace()
data_format = "NCHW"
self.check_with_place(place, data_format, self.dtype, [2, 3, 4, 5])
if __name__ == '__main__':
unittest.main()
...@@ -158,6 +158,8 @@ def set_output_grad(scope, outputs, place, feed_dict=None): ...@@ -158,6 +158,8 @@ def set_output_grad(scope, outputs, place, feed_dict=None):
class TestBatchNormOpInference(unittest.TestCase): class TestBatchNormOpInference(unittest.TestCase):
def setUp(self): def setUp(self):
self.dtype = np.float32 self.dtype = np.float32
self.use_mkldnn = False
self.init_kernel_type()
def __assert_close(self, tensor, np_array, msg, atol=1e-4): def __assert_close(self, tensor, np_array, msg, atol=1e-4):
self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg) self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
...@@ -230,6 +232,7 @@ class TestBatchNormOpInference(unittest.TestCase): ...@@ -230,6 +232,7 @@ class TestBatchNormOpInference(unittest.TestCase):
# attrs # attrs
is_test=True, is_test=True,
data_layout=data_layout, data_layout=data_layout,
use_mkldnn=self.use_mkldnn,
epsilon=epsilon) epsilon=epsilon)
batch_norm_op.run(scope, place) batch_norm_op.run(scope, place)
...@@ -254,10 +257,15 @@ class TestBatchNormOpInference(unittest.TestCase): ...@@ -254,10 +257,15 @@ class TestBatchNormOpInference(unittest.TestCase):
[2, 3, 4, 5]) [2, 3, 4, 5])
self.check_with_place(place, data_format, self.dtype, [2, 3]) self.check_with_place(place, data_format, self.dtype, [2, 3])
def init_kernel_type(self):
pass
class TestFP16BatchNormOpInference(TestBatchNormOpInference): class TestFP16BatchNormOpInference(TestBatchNormOpInference):
def setUp(self): def setUp(self):
self.dtype = np.float16 self.dtype = np.float16
self.use_mkldnn = False
self.init_kernel_type()
def test_check_output(self): def test_check_output(self):
places = [] places = []
...@@ -274,9 +282,28 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): ...@@ -274,9 +282,28 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference):
class TestBatchNormOpTraining(unittest.TestCase): class TestBatchNormOpTraining(unittest.TestCase):
def setUp(self):
self.use_mkldnn = False
self.data_formats = ["NCHW", "NHWC"]
self.init_kernel_type()
def __assert_close(self, tensor, np_array, msg, atol=1e-4): def __assert_close(self, tensor, np_array, msg, atol=1e-4):
np.allclose(np.array(tensor), np_array, atol=atol) np.allclose(np.array(tensor), np_array, atol=atol)
def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance,
epsilon, momentum, shape, data_layout):
# run forward
y, saved_mean, var_ref = _reference_training(x, scale, bias, epsilon,
data_layout)
mean_out = saved_mean * (1. - momentum) + momentum * mean
variance_out = var_ref * (1. - momentum) + momentum * variance
saved_variance = 1. / np.sqrt(var_ref + epsilon)
# run backward
x_grad, scale_grad, bias_grad = _reference_grad(
x, y_grad, scale, saved_mean, var_ref, epsilon, data_layout)
return y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad
def test_forward_backward(self): def test_forward_backward(self):
def test_with_place(place, data_layout, shape): def test_with_place(place, data_layout, shape):
# attr # attr
...@@ -295,16 +322,11 @@ class TestBatchNormOpTraining(unittest.TestCase): ...@@ -295,16 +322,11 @@ class TestBatchNormOpTraining(unittest.TestCase):
mean = np.zeros(scale_shape).astype(np.float32) mean = np.zeros(scale_shape).astype(np.float32)
variance = np.ones(scale_shape).astype(np.float32) variance = np.ones(scale_shape).astype(np.float32)
# run forward
y, saved_mean, var_ref = _reference_training(x, scale, bias,
epsilon, data_layout)
mean_out = saved_mean * (1. - momentum) + momentum * mean
variance_out = var_ref * (1. - momentum) + momentum * variance
saved_variance = 1. / np.sqrt(var_ref + epsilon)
# run backward
y_grad = np.random.random_sample(shape).astype(np.float32) y_grad = np.random.random_sample(shape).astype(np.float32)
x_grad, scale_grad, bias_grad = _reference_grad(
x, y_grad, scale, saved_mean, var_ref, epsilon, data_layout) y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad = self.ref_forward_backward(
x, y_grad, scale, bias, mean, variance, epsilon, momentum,
shape, data_layout)
var_dict = locals() var_dict = locals()
var_dict['y@GRAD'] = y_grad var_dict['y@GRAD'] = y_grad
...@@ -344,7 +366,8 @@ class TestBatchNormOpTraining(unittest.TestCase): ...@@ -344,7 +366,8 @@ class TestBatchNormOpTraining(unittest.TestCase):
"momentum": momentum, "momentum": momentum,
"epsilon": epsilon, "epsilon": epsilon,
"is_test": False, "is_test": False,
"data_layout": data_layout "data_layout": data_layout,
"use_mkldnn": self.use_mkldnn
}) })
block.create_var(name='y@GRAD', dtype='float32', shape=y.shape) block.create_var(name='y@GRAD', dtype='float32', shape=y.shape)
...@@ -387,13 +410,17 @@ class TestBatchNormOpTraining(unittest.TestCase): ...@@ -387,13 +410,17 @@ class TestBatchNormOpTraining(unittest.TestCase):
print "op test forward passed: ", str(place), data_layout print "op test forward passed: ", str(place), data_layout
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
for place in places: for place in places:
for data_format in ["NCHW", "NHWC"]: for data_format in self.data_formats:
test_with_place(place, data_format, [2, 3, 4, 5]) test_with_place(place, data_format, [2, 3, 4, 5])
def init_kernel_type(self):
pass
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -34,7 +34,7 @@ class TestSendOp(unittest.TestCase): ...@@ -34,7 +34,7 @@ class TestSendOp(unittest.TestCase):
p.start() p.start()
time.sleep(10) time.sleep(10)
with open("/tmp/paddle.selected_port", "r") as fn: with open("/tmp/paddle.%d.selected_port" % p.pid, "r") as fn:
selected_port = int(fn.readlines()[0]) selected_port = int(fn.readlines()[0])
self.init_client(place, selected_port) self.init_client(place, selected_port)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
import paddle.fluid.core as core
from paddle.fluid.op import Operator
def output_hist(out):
hist, _ = np.histogram(out, range=(-5, 10))
hist = hist.astype("float32")
hist /= float(out.size)
prob = 0.1 * np.ones((10))
return hist, prob
class TestLookupSpraseTable(OpTest):
def check_with_place(self, place):
scope = core.Scope()
# create and initialize Id Variable
ids = scope.var("Ids").get_tensor()
ids_array = np.array([0, 2, 3, 5, 100]).astype("int64")
ids.set(ids_array, place)
# create and initialize W Variable
rows = [0, 1, 2, 3, 4, 5, 6]
row_numel = 10000
w_selected_rows = scope.var('W').get_selected_rows()
w_selected_rows.set_height(len(rows))
w_selected_rows.set_rows(rows)
w_array = np.ones((len(rows), row_numel)).astype("float32")
for i in range(len(rows)):
w_array[i] *= i
w_tensor = w_selected_rows.get_tensor()
w_tensor.set(w_array, place)
# create Out Variable
out_tensor = scope.var('Out').get_tensor()
# create and run lookup_table operator
lookup_table = Operator(
"lookup_sparse_table",
W='W',
Ids='Ids',
Out='Out',
min=-5.0,
max=10.0,
seed=10)
lookup_table.run(scope, place)
# get result from Out
result_array = np.array(out_tensor)
# all(): return True if all elements of the iterable are true (or if the iterable is empty)
for idx, row in enumerate(ids_array[:-2]):
assert (row == result_array[idx]).all()
# check the random value
hist, prob = output_hist(result_array[-1])
self.assertTrue(
np.allclose(
hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
def test_w_is_selected_rows(self):
places = [core.CPUPlace()]
# currently only support CPU
for place in places:
self.check_with_place(place)
if __name__ == "__main__":
unittest.main()
...@@ -12,44 +12,200 @@ ...@@ -12,44 +12,200 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import core
import framework
import executor
import data_feeder
import contextlib
# The optimizer module shares its name with a Trainer.__init__ parameter, so import it as opt_module.
import optimizer as opt_module
__all__ = [ __all__ = [
'Event',
'Trainer', 'Trainer',
'BeginEpochEvent',
'EndEpochEvent',
'BeginStepEvent',
'EndStepEvent',
] ]
class Event(object): class BeginEpochEvent(object):
BEGIN_EPOCH = 0 def __init__(self, epoch_id):
END_EPOCH = 1 self.epoch = epoch_id
BEGIN_STEP = 2
END_STEP = 3
class EndEpochEvent(object):
def __init__(self, epoch_id):
self.epoch = epoch_id
def __init__(self):
self.step = 0 class BeginStepEvent(object):
self.epoch = 0 def __init__(self, epoch_id, step_id):
self.type = Event.BEGIN_EPOCH self.epoch = epoch_id
self.step = step_id
class EndStepEvent(object):
def __init__(self, epoch_id, step_id):
self.epoch = epoch_id
self.step = step_id
class Trainer(object): class Trainer(object):
"""
Args:
network_func(callable): A function which will return the loss. The loss must be a scalar.
optimizer(optimizer.Optimizer): The optimizer should be an instance of Optimizer.
params: The parameter object. Not fully supported yet.
place: The device place of this trainer.
"""
def __init__(self, network_func, optimizer, params=None, place=None): def __init__(self, network_func, optimizer, params=None, place=None):
# 1. we need to generate a framework.Program by calling # 1. we need to generate a framework.Program by calling
# network_func. Reference: fluid.program_guard in # network_func. Reference: fluid.program_guard in
# test_word2vec.py # test_word2vec.py
self.scope = self._get_scope_from_params(params)
self.startup_program = framework.Program()
self.train_program = framework.Program()
with framework.program_guard(self.train_program, self.startup_program):
loss = network_func()
if not isinstance(optimizer, opt_module.Optimizer):
raise TypeError(
"The optimizer should be an instance of Optimizer")
optimizer.minimize(loss)
self.place = Trainer._check_and_get_place(place)
# 2. move the default_main_program to self.program and run the # 2. move the default_main_program to self.program and run the
# default_startup program on an empty core.Scope() # default_startup program on an empty core.Scope()
# Run startup program
if params is None:
exe = executor.Executor(self.place)
exe.run(self.startup_program, scope=self.scope)
# 3. call self.params.add_vars with the initialized scope, it # 3. call self.params.add_vars with the initialized scope, it
# will add the new vars of the initialized scope into # will add the new vars of the initialized scope into
# self.params. # self.params.
self.network_func = network_func # TODO(yuyang): This depends on parameters implementation.
self.optimizer = optimizer
self.params = params
self.place = place
# TODO(helin): support distributed training # TODO(helin): support distributed training
def train(self, reader, num_epochs, event_handler): def train(self,
pass num_epochs,
event_handler,
reader=None,
parallel=False,
feed_order=None):
"""
Train the model.
Args:
num_epochs: The number of epochs. An epoch processes all data in the reader once.
event_handler: The event handler. A callable that receives an event object.
reader: A reader creator that yields the training data.
parallel: True to train with multiple CPUs or GPUs.
feed_order: Feeding order of the reader. None will follow the defining
order in the program.
Returns:
"""
if parallel:
raise NotImplementedError(
"Parallel Executor version of trainer is not implemented")
self._train_by_executor(num_epochs, event_handler, reader, feed_order)
def test(self, reader): def test(self, reader):
pass pass
def _get_scope_from_params(self, params):
"""
Get Scope from parameter object.
Args:
params(Parameter|None): The parameter object instance. Could be None.
Returns: A new scope if params is None, otherwise params.scope().
NOTE: This method is WIP. Not fully implemented.
"""
if params is None:
return core.Scope() # new scope when params is None
else:
raise NotImplementedError("Not implemented right now.")
@staticmethod
def _check_and_get_place(place):
"""
Check the type of place or get the default place
Args:
place(None|core.CUDAPlace|core.CPUPlace): the place that trainer will be executed on.
Raises:
TypeError if the type mismatched.
Returns:
the original place if it is not None.
If fluid is compiled with CUDA, returns CUDAPlace(0) by default;
otherwise returns CPUPlace by default.
"""
if place is None:
if core.is_compiled_with_cuda():
return core.CUDAPlace(0)
else:
return core.CPUPlace()
else:
if not isinstance(place, core.CUDAPlace) and not isinstance(
place, core.CPUPlace):
raise TypeError("Place should be either CUDAPlace or CPUPlace")
return place
@contextlib.contextmanager
def _prog_and_scope_guard(self):
with framework.program_guard(
main_program=self.train_program,
startup_program=self.startup_program):
with executor.scope_guard(self.scope):
yield
def _train_by_executor(self, num_epochs, event_handler, reader, feed_order):
"""
Train by Executor and single device.
Args:
num_epochs: The number of epochs to train.
event_handler: The event handler callable.
reader: A reader creator that yields the training data.
feed_order: Feeding order of the reader, or None to use variables marked is_data.
Returns:
"""
with self._prog_and_scope_guard():
exe = executor.Executor(self.place)
if feed_order is None:
feed_var_list = [
var
for var in self.train_program.global_block(
).vars.itervalues()
if hasattr(var, 'is_data') and var.is_data
]
else:
feed_var_list = [
self.train_program.global_block().var(var_name)
for var_name in feed_order
]
feeder = data_feeder.DataFeeder(
feed_list=feed_var_list, place=self.place)
for epoch_id in range(num_epochs):
event_handler(BeginEpochEvent(epoch_id))
for step_id, data in enumerate(reader()):
event_handler(BeginStepEvent(epoch_id, step_id))
exe.run(feed=feeder.feed(data), fetch_list=[])
event_handler(EndStepEvent(epoch_id, step_id))
event_handler(EndEpochEvent(epoch_id))
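A minimal usage sketch of the new event classes with the Trainer above; the network, reader, and feed names are placeholders modeled on the examples elsewhere in this commit:

import paddle
import paddle.fluid as fluid

def network():
    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    y_predict = fluid.layers.fc(input=x, size=1)
    return fluid.layers.mean(fluid.layers.square_error_cost(input=y_predict, label=y))

def event_handler(event):
    # events are now plain classes instead of the removed Event enum
    if isinstance(event, fluid.EndStepEvent):
        print("epoch %d, step %d finished" % (event.epoch, event.step))
    elif isinstance(event, fluid.EndEpochEvent):
        print("epoch %d finished" % event.epoch)

trainer = fluid.Trainer(network, fluid.optimizer.SGD(learning_rate=0.01))
trainer.train(
    num_epochs=1,
    event_handler=event_handler,
    reader=paddle.batch(paddle.dataset.uci_housing.train(), batch_size=20),
    feed_order=['x', 'y'])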