Commit 7141debe authored by: D dzhwinter

add cudnn back. staged.

Parent 09409bad
@@ -299,16 +299,19 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
   std::unique_ptr<ExecutorPrepareContext> ctx(
       new ExecutorPrepareContext(program, block_id));
   VLOG(3) << "after create prepare";
   // PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
   VLOG(3) << "before create op_desc";
   auto& block = program.Block(block_id);
-  VLOG(3) << "create before" << ctx->ops_.size() << " " << block.AllOps().size();
+  VLOG(3) << "create before" << ctx->ops_.size() << " "
+          << block.AllOps().size();
   int counter = 0;
   for (auto& op_desc : block.AllOps()) {
     ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
-    VLOG(3) << "create op " << "index " << ++counter << " type " << op_desc->Type();
+    VLOG(3) << "create op "
+            << "index " << ++counter << " type " << op_desc->Type();
   }
-  VLOG(3) << "create finished" << ctx->ops_.size() << " " << block.AllOps().size();
+  VLOG(3) << "create finished" << ctx->ops_.size() << " "
+          << block.AllOps().size();
   return ctx;
 }
@@ -320,22 +323,25 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
   for (auto& bid : block_ids) {
     VLOG(3) << "block id" << bid;
     auto* ctx = new ExecutorPrepareContext(program, bid);
-    //PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
+    // PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
     auto& block = program.Block(bid);
     int counter = 0;
-    VLOG(3) << "create before" << ctx->ops_.size() << " " << block.AllOps().size();
+    VLOG(3) << "create before" << ctx->ops_.size() << " "
+            << block.AllOps().size();
     for (auto& op_desc : block.AllOps()) {
       ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
-      VLOG(3) << "create op " << "index " << ++counter << " type " << op_desc->Type();
+      VLOG(3) << "create op "
+              << "index " << ++counter << " type " << op_desc->Type();
     }
-    VLOG(3) << "create finished" << ctx->ops_.size() << " " << block.AllOps().size();
+    VLOG(3) << "create finished" << ctx->ops_.size() << " "
+            << block.AllOps().size();
     result.push_back(std::shared_ptr<ExecutorPrepareContext>(ctx));
   }
   return result;
 }
-// void CheckResult(const std::string op_type, ExecutorPrepareContext* ctx, Scope* local_scope) {
+// void CheckResult(const std::string op_type, ExecutorPrepareContext* ctx,
+// Scope* local_scope) {
 //   VLOG(3) << "before checking result";
 //   auto& dev_ctx = *platform::DeviceContextPool::Instance().Get(place_);
 //   std::vector<std::string> outputs;
@@ -343,7 +349,8 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
 //   bool found = false;
 //   framework::OpDesc* myop = nullptr;
 //   for(auto& op : block.AllOps()) {
-//     if(op->Type() == "load_combine" || op->Type() == "fetch" || op->Type() == "feed") return;
+//     if(op->Type() == "load_combine" || op->Type() == "fetch" || op->Type() ==
+//     "feed") return;
 //     if (op->Type() == op_type) {
 //       found = true;
 //       myop = op;
@@ -370,7 +377,8 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
 //     for(size_t i=0; i < check.numel(); ++i) {
 //       sum += check.data<float>()[i];
 //     }
-//     VLOG(3) << "op " << op->Type() << " output var " << var_name << " sum " << sum;
+//     VLOG(3) << "op " << op->Type() << " output var " << var_name << " sum "
+//     << sum;
 //   VLOG(3) << "after checking result";
 // }
@@ -389,11 +397,14 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   VLOG(3) << "Scope ptr " << local_scope;
   for (auto& op : ctx->ops_) {
     op->Run(*local_scope, place_);
     // CheckResult(op->Type(), ctx, local_scope);
-    if (FLAGS_benchmark) {
-      VLOG(2) << "Memory used after operator " + op->Type() + " running: "
-              << memory::memory_usage(place_);
-    }
+    // if (FLAGS_benchmark) {
+    //   VLOG(2) << "Memory used after operator " + op->Type() + " running: "
+    //           << memory::memory_usage(place_);
+    // }
+    VLOG(2) << "Memory used after operator " + op->Type() + " running: "
+            << memory::memory_usage(place_);
+    // platform::DeviceContextPool::Instance().Get(place_)->Wait();
   }
   platform::DeviceContextPool::Instance().Get(place_)->Wait();
@@ -403,13 +414,15 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   // auto& block = ctx->prog_.Block(0);
   // for(auto& op : block.AllOps()) {
-  //   if(op->Type() == "load_combine" || op->Type() == "fetch" || op->Type() == "feed") continue;
+  //   if(op->Type() == "load_combine" || op->Type() == "fetch" || op->Type() ==
+  //   "feed") continue;
   //   // for(auto& real_op : ctx->ops_) {
   //   //   if(real_op->Type() == op->Type()) {
-  //   //     VLOG(3) << real_op->Type() << " " <<place_ << " " << real_op->DebugStringEx(local_scope);
+  //   //     VLOG(3) << real_op->Type() << " " <<place_ << " " <<
+  //   real_op->DebugStringEx(local_scope);
   //   //   }
   //   // }
   //   //VLOG(3) << "start op output" << op->Type();
   //   for(auto var_name: op->InputArgumentNames()) {
   //     auto* var = local_scope->Var(var_name);
@@ -418,19 +431,21 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   //     auto* tensor = var->GetMutable<framework::LoDTensor>();
   //     framework::Tensor check;
   //     VLOG(3) << "before tensor copy";
   //     framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check);
   //     VLOG(3) << "after tensor copy";
   //     float sum = .0;
   //     for(size_t i=0; i < check.numel(); ++i) {
-  //       if(std::type_index(check.type()) == std::type_index(typeid(int64_t))) {
+  //       if(std::type_index(check.type()) == std::type_index(typeid(int64_t)))
+  //       {
   //         sum += static_cast<float>(check.data<int64_t>()[i]);
   //       } else {
   //         sum += check.data<float>()[i];
   //       }
   //     }
-  //     VLOG(3) << "op " << op->Type() << " input var " << var_name << " sum " << sum;
+  //     VLOG(3) << "op " << op->Type() << " input var " << var_name << " sum "
+  //     << sum;
   //   }
   //   VLOG(3) << "op " << op->Type() << "input finished";
@@ -442,23 +457,28 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   //       framework::Tensor check;
   //       VLOG(3) << "before tensor copy";
   //       if(op->Type() == "batch_norm" && platform::is_gpu_place(place_)) {
-  //         VLOG(3) << "op " << op->Type() << " output var " << var_name << " " << tensor->numel();
+  //         VLOG(3) << "op " << op->Type() << " output var " << var_name << " "
+  //         << tensor->numel();
   //         tensor->mutable_data<float>(place_);
-  //         framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check);
+  //         framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx,
+  //         &check);
   //       } else {
-  //         framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check);
+  //         framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx,
+  //         &check);
   //       }
   //       VLOG(3) << "after tensor copy";
   //       float sum = .0;
   //       for(size_t i=0; i < check.numel(); ++i) {
-  //         if(std::type_index(check.type()) == std::type_index(typeid(int64_t))) {
+  //         if(std::type_index(check.type()) == std::type_index(typeid(int64_t)))
+  //         {
   //           sum += static_cast<float>(check.data<int64_t>()[i]);
   //         } else {
   //           sum += check.data<float>()[i];
   //         }
   //       }
-  //       VLOG(3) << "op " << op->Type() << " output var " << var_name << " sum " << sum;
+  //       VLOG(3) << "op " << op->Type() << " output var " << var_name << " sum "
+  //       << sum;
   //     }
   //   }
...
@@ -50,19 +50,41 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   const std::vector<std::string> &Outputs(
       const std::string &name) const override;
+  void ShareDim(const std::string &in, const std::string &out, size_t i = 0,
+                size_t j = 0) override {
+    PADDLE_ENFORCE_LT(i, Inputs(in).size());
+    PADDLE_ENFORCE_LT(j, Outputs(out).size());
+    const std::string &input_n = Inputs(in)[i];
+    const std::string &output_n = Outputs(out)[j];
+
+    PADDLE_ENFORCE(input_n != framework::kEmptyVarName, "The %s[%d] is @EMPTY@",
+                   in, i);
+    PADDLE_ENFORCE(output_n != framework::kEmptyVarName,
+                   "The %s[%d] is @EMPTY@", out, j);
+
+    auto *in_var = block_.FindVarRecursive(input_n);
+    auto *out_var = block_.FindVarRecursive(output_n);
+    PADDLE_ENFORCE(in_var->GetType() == out_var->GetType(),
+                   "The type of %s and %s is not the same.", input_n, output_n);
+
+    SetDim(output_n, GetDim(input_n));
+  }
+
   void ShareLoD(const std::string &in, const std::string &out, size_t i = 0,
                 size_t j = 0) const override {
     PADDLE_ENFORCE_LT(i, Inputs(in).size());
     PADDLE_ENFORCE_LT(j, Outputs(out).size());
+    PADDLE_ENFORCE(Inputs(in)[i] != framework::kEmptyVarName,
+                   "The %s[%d] is @EMPTY@", in, i);
+    PADDLE_ENFORCE(Outputs(out)[j] != framework::kEmptyVarName,
+                   "The %s[%d] is @EMPTY@", out, j);
     auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
     auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
     if (in_var->GetType() != proto::VarType::LOD_TENSOR) {
       VLOG(3) << "input " << in << " is not LodTensor";
       return;
     }
+    PADDLE_ENFORCE_EQ(in_var->GetType(), proto::VarType::LOD_TENSOR,
+                      "The %d-th output of Output(%s) must be LoDTensor.", j,
+                      out);
     out_var->SetLoDLevel(in_var->GetLoDLevel());
   }
@@ -441,7 +463,10 @@ static void InitInferShapeFuncs() {
   for (auto &kern_pair : OperatorWithKernel::AllOpKernels()) {
     auto op_type = kern_pair.first;
-    auto &op_info = info_map.at(op_type);
+    auto it = info_map.find(op_type);
+    PADDLE_ENFORCE(it != info_map.end(), "%s has not been registered",
+                   op_type);
+    auto &op_info = it->second;
     auto op = static_cast<OperatorWithKernel *>(op_info.Creator()(
         "", VariableNameMap{}, VariableNameMap{}, AttributeMap{}));
     if (op_info.infer_shape_) {  // infer_shape has been registered.
@@ -490,20 +515,14 @@ void OpDesc::InferShape(const BlockDesc &block) const {
 }
 void OpDesc::InferVarType(BlockDesc *block) const {
-  // There are a few places that var type can be set.
-  // When VarDesc is created, default set to LOD_TENSOR.
-  // When output variable is created, default is defaut set to LOD_TENSOR.
-  // We limit here to be the only place that operator defines its customized
-  // var type inference. Hence, we don't do any "default" setting here.
   auto &info = OpInfoMap::Instance().Get(this->Type());
   if (info.infer_var_type_) {
     info.infer_var_type_(*this, block);
-  } else {
-    // all output type is LoDTensor by default
-    VLOG(10) << this->Type()
-             << " has not registered InferVarType. Set output variables to "
-                "LOD_TENSOR";
-    for (auto &out_pair : this->outputs_) {
-      for (auto &out_var_name : out_pair.second) {
-        block->FindRecursiveOrCreateVar(out_var_name)
-            .SetType(proto::VarType::LOD_TENSOR);
-      }
-    }
   }
 }
...
@@ -100,16 +100,6 @@ class OpDesc {
   std::vector<std::string> InputNames() const { return MapKeys(inputs_); }
   std::vector<std::string> OutputNames() const { return MapKeys(outputs_); }
-  void SetInputMap(const VariableNameMap &input) {
-    this->inputs_ = input;
-    this->need_update_ = true;
-  }
-
-  void SetOutputMap(const VariableNameMap &output) {
-    this->outputs_ = output;
-    this->need_update_ = true;
-  }
   const VariableNameMap &Inputs() const { return inputs_; }
   const VariableNameMap &Outputs() const { return outputs_; }
...
@@ -62,7 +62,7 @@ static DDim GetDims(const Scope& scope, const std::string& name,
   if (var->IsType<LoDTensor>()) {
     const LoDTensor& tensor = var->Get<LoDTensor>();
-    if (!tensor.IsInitialized()) {
+    if (UNLIKELY(!tensor.IsInitialized())) {
       return DDim({-1});
     }
     return tensor.dims();
@@ -91,13 +91,13 @@ static std::string GetDtype(const Scope& scope, const std::string& name) {
   if (var->IsType<LoDTensor>()) {
     const LoDTensor& tensor = var->Get<LoDTensor>();
-    if (!tensor.IsInitialized()) {
+    if (UNLIKELY(!tensor.IsInitialized())) {
       return "";
     }
     return DataTypeToString(ToDataType(tensor.type()));
   } else if (var->IsType<SelectedRows>()) {
     auto tensor = var->Get<SelectedRows>().value();
-    if (!tensor.IsInitialized()) {
+    if (UNLIKELY(!tensor.IsInitialized())) {
       return "uninited";
     } else {
       return DataTypeToString(ToDataType(tensor.type()));
@@ -130,7 +130,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
   if (var->IsType<LoDTensor>()) {
     const LoDTensor& tensor = var->Get<LoDTensor>();
-    if (!tensor.IsInitialized()) {
+    if (UNLIKELY(!tensor.IsInitialized())) {
       return default_lod;
     }
     return tensor.lod();
@@ -149,11 +149,13 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
     platform::SetDeviceId(dev_id);
 #endif
   }
-  VLOG(3) << "start pool";
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  platform::RecordEvent record_event(Type(), pool.Get(place));
-  VLOG(3) << "start RunImpl";
+  // The profile has a process-wide mutex, results in serious performance issue
+  // in concurrency scenerio. Here use an `if` to fix this issue.
+  // Please not remove the `if`, ask @Superjomn if there are any concern.
   RunImpl(scope, place);
   VLOG(3) << place << " " << DebugStringEx(&scope);
 }
@@ -206,7 +208,6 @@ const std::vector<std::string>& OperatorBase::Outputs(
 }
 std::string OperatorBase::DebugStringEx(const Scope* scope) const {
-  VLOG(3) << this->Type() << " scope ptr " << scope;
   std::stringstream ss;
   ss << "Op(" << type_ << "), inputs:{";
   for (auto it = inputs_.begin(); it != inputs_.end();) {
@@ -470,35 +471,35 @@ class RuntimeInferShapeContext : public InferShapeContext {
       : op_(op), scope_(scope) {}
   bool HasInput(const std::string& name) const override {
-    if (!op_.HasInputs(name)) {
+    // has only one input
+    const auto& ins = op_.Inputs();
+    auto it = ins.find(name);
+    if (it == ins.end()) {
       return false;
     }
-    auto& ins = Inputs(name);
-    size_t length = ins.size();
-    if (length == 0) {
+    const auto& in = it->second;
+    if (in.size() == 0 || in[0] == kEmptyVarName) {
      return false;
     }
-    PADDLE_ENFORCE_EQ(length, 1UL,
+    PADDLE_ENFORCE_EQ(in.size(), 1UL,
                       "Input %s should not have more than one inputs", name);
-    auto ipt = ins[0];
-    auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
-    return var != nullptr;
+    return scope_.FindVar(in[0]) != nullptr;
   }
   bool HasOutput(const std::string& name) const override {
-    if (!op_.HasOutputs(name)) {
+    // has only one output
+    const auto& outs = op_.Outputs();
+    auto it = outs.find(name);
+    if (it == outs.end()) {
       return false;
     }
-    auto& outs = Outputs(name);
-    size_t length = outs.size();
-    if (length == 0) {
+    const auto& out = it->second;
+    if (out.size() == 0 || out[0] == kEmptyVarName) {
       return false;
     }
-    PADDLE_ENFORCE_EQ(length, 1UL,
-                      "Output %s should not have more than one inputs", name);
-    auto ipt = outs[0];
-    auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
-    return var != nullptr;
+    PADDLE_ENFORCE_EQ(out.size(), 1UL,
+                      "Output %s should not have more than one outputs", name);
+    return scope_.FindVar(out[0]) != nullptr;
   }
   bool HasInputs(const std::string& name) const override {
@@ -545,13 +546,45 @@ class RuntimeInferShapeContext : public InferShapeContext {
     return op_.Outputs(name);
   }
-  void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
-                size_t j = 0) const override {
+  void ShareDim(const std::string& in, const std::string& out, size_t i = 0,
+                size_t j = 0) override {
     PADDLE_ENFORCE_LT(i, Inputs(in).size());
     PADDLE_ENFORCE_LT(j, Outputs(out).size());
-    Variable* in_var = scope_.FindVar(Inputs(in)[i]);
-    Variable* out_var = scope_.FindVar(Outputs(out)[j]);
+    const std::string& input_n = Inputs(in)[i];
+    const std::string& output_n = Outputs(out)[j];
+    Variable* in_var = scope_.FindVar(input_n);
+    Variable* out_var = scope_.FindVar(output_n);
+    PADDLE_ENFORCE(in_var->Type() == out_var->Type(),
+                   "The type of %s and %s is not the same.", output_n,
+                   GetDim(input_n));
+
+    if (in_var->IsType<framework::SelectedRows>()) {
+      auto& in_sele_rows = in_var->Get<framework::SelectedRows>();
+      auto out_sele_rows = out_var->GetMutable<framework::SelectedRows>();
+      out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims());
+      out_sele_rows->set_rows(in_sele_rows.rows());
+      out_sele_rows->set_height(in_sele_rows.height());
+    } else if (in_var->IsType<framework::LoDTensor>()) {
+      auto& in_lod_tensor = in_var->Get<framework::LoDTensor>();
+      auto* out_lod_tensor = out_var->GetMutable<framework::LoDTensor>();
+      out_lod_tensor->Resize(in_lod_tensor.dims());
+    } else {
+      PADDLE_THROW(
+          "Currently, the input type of ShareDim only can be LoDTensor "
+          "or SelectedRows.");
+    }
+  }
+
+  void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
+                size_t j = 0) const override {
+    const std::vector<std::string>& inputs = Inputs(in);
+    const std::vector<std::string>& outputs = Outputs(out);
+    PADDLE_ENFORCE_LT(i, inputs.size());
+    PADDLE_ENFORCE_LT(j, outputs.size());
+    Variable* in_var = scope_.FindVar(inputs.at(i));
     if (!in_var->IsType<LoDTensor>()) return;
+    Variable* out_var = scope_.FindVar(outputs.at(j));
     PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
                    "The %d-th output of Output(%s) must be LoDTensor.", j, out);
     auto in_tensor = in_var->Get<LoDTensor>();
@@ -579,20 +612,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
     out_tensor->set_layout(in_tensor.layout());
   }
-  void ShareLayout(const std::string& in, const std::string& out, size_t i = 0,
-                   size_t j = 0) const {
-    PADDLE_ENFORCE_LT(i, Inputs(in).size());
-    PADDLE_ENFORCE_LT(j, Outputs(out).size());
-    Variable* in_var = scope_.FindVar(Inputs(in)[i]);
-    Variable* out_var = scope_.FindVar(Outputs(out)[j]);
-    if (!in_var->IsType<LoDTensor>()) return;
-    PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
-                   "The %d-th output of Output(%s) must be LoDTensor.", j, out);
-    auto in_tensor = in_var->Get<LoDTensor>();
-    auto* out_tensor = out_var->GetMutable<LoDTensor>();
-    out_tensor->set_layout(in_tensor.layout());
-  }
   bool IsRuntime() const override { return true; }
  protected:
@@ -663,16 +682,12 @@ static void CheckTensorNANOrInf(const std::string& name,
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
   RuntimeInferShapeContext infer_shape_ctx(*this, scope);
-  VLOG(3) << "start Infershape";
   this->InferShape(&infer_shape_ctx);
-  VLOG(3) << "Infershape Pass";
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto* dev_ctx = pool.Get(place);
   // check if op[type] has kernel registered.
-  VLOG(3) << "Start Kernels";
   auto& all_op_kernels = AllOpKernels();
-  VLOG(3) << "Kernel map finish";
   auto kernels_iter = all_op_kernels.find(type_);
   if (kernels_iter == all_op_kernels.end()) {
     PADDLE_THROW(
@@ -690,7 +705,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   auto expected_kernel_key =
       this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
-  VLOG(3) << "expected_kernel_key: " << expected_kernel_key;
+  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
   auto kernel_iter = kernels.find(expected_kernel_key);
 #ifdef PADDLE_WITH_MKLDNN
...
@@ -56,6 +56,9 @@ class InferShapeContext {
   virtual const std::vector<std::string> &Outputs(
       const std::string &name) const = 0;
+  virtual void ShareDim(const std::string &in, const std::string &out,
+                        size_t i = 0, size_t j = 0) = 0;
+
   virtual void ShareLoD(const std::string &in, const std::string &out,
                         size_t i = 0, size_t j = 0) const = 0;
...
@@ -112,11 +112,11 @@ bool NativePaddlePredictor::Init(
   auto &block = inference_program_->Block(0);
   for (auto *op_desc : block.AllOps()) {
-    if (op_desc->HasAttr("use_cudnn")) {
-      op_desc->SetAttr("use_cudnn", false);
-    }
+    // if (op_desc->HasAttr("use_cudnn")) {
+    //   op_desc->SetAttr("use_cudnn", false);
+    // }
     if (op_desc->HasAttr("workspace_size_MB")) {
-      op_desc->SetAttr("workspace_size_MB", 0);
+      op_desc->SetAttr("workspace_size_MB", 1024);
     }
   }
...
@@ -27,8 +27,8 @@ NativeConfig GetConfig() {
   NativeConfig config;
   // config.model_dir = FLAGS_dirname;
-  config.prog_file = "hs_lb_without_bn/__model__";
-  config.param_file = "hs_lb_without_bn/__params__";
+  config.prog_file = "hs_lb_without_bn_cudnn/__model__";
+  config.param_file = "hs_lb_without_bn_cudnn/__params__";
   // config.prog_file = "hs_lb_without_bn_cuda/__model__";
   // config.param_file = "hs_lb_without_bn_cuda/__params__";
   config.fraction_of_gpu_memory = 0.0;
@@ -106,7 +106,7 @@ void test_naive(int batch_size) {
   std::cout << "batch: " << batch_size
             << " predict cost: " << time_diff(time1, time2) / steps << "ms"
             << std::endl;
   std::cout << outputs.size() << std::endl;
   int64_t* data_o = static_cast<int64_t*>(outputs[0].data.data());
   int64_t sum_out = 0;
   for (size_t j = 0; j < outputs[0].data.length() / sizeof(int64_t); ++j) {
...
@@ -21,12 +21,12 @@
 #include <fstream>
 #include <iostream>
 #include <thread>  // NOLINT
+#include <utility>
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #define ASSERT_TRUE(x) x
 #define ASSERT_EQ(x, y) assert(x == y)
 // DEFINE_string(dirname, "./LB_icnet_model",
 //               "Directory of the inference model.");
 namespace paddle {
@@ -34,7 +34,7 @@ NativeConfig GetConfig() {
   NativeConfig config;
   config.prog_file = "./hs_lb_without_bn_cuda/__model__";
   config.param_file = "./hs_lb_without_bn_cuda/__params__";
-  config.fraction_of_gpu_memory = 0.5;
+  config.fraction_of_gpu_memory = 0.0;
   config.use_gpu = true;
   config.device = 0;
   return config;
@@ -54,7 +54,7 @@ void test_naive(int batch_size, std::string model_path) {
   int height = 449;
   int width = 581;
   std::vector<float> data;
-  for(int i=0; i < 3 * height * width; ++i) {
+  for (int i = 0; i < 3 * height * width; ++i) {
     data.push_back(0.0);
   }
@@ -86,47 +86,61 @@ void test_naive(int batch_size, std::string model_path) {
   //   in_img.close();
   //   std::cout << "sum: " << sum_n << std::endl;
   PaddleTensor tensor;
   tensor.shape = std::vector<int>({batch_size, 3, height, width});
   tensor.data.Resize(sizeof(float) * batch_size * 3 * height * width);
-  std::copy(data.begin(), data.end(),
-            static_cast<float*>(tensor.data.data()));
+  std::copy(data.begin(), data.end(), static_cast<float*>(tensor.data.data()));
   tensor.dtype = PaddleDType::FLOAT32;
   std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
-  constexpr int num_jobs = 2;  // each job run 1 batch
+  constexpr int num_jobs = 5;  // each job run 1 batch
   std::vector<std::thread> threads;
+  // using PtrPred = std::vector<std::unique_ptr<PaddlePredictor>>;
+  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+  for (int tid = 0; tid < num_jobs; ++tid) {
+    auto& pred = CreatePaddlePredictor<NativeConfig>(config);
+    predictors.emplace_back(std::move(pred));
+  }
+
+  using namespace std::chrono_literals;
+  // std::this_thread::sleep_for(std::chrono::seconds(20));
+  std::cout << "before start predict";
+  int epoches = 100000;
   for (int tid = 0; tid < num_jobs; ++tid) {
     threads.emplace_back([&, tid]() {
+      // auto predictor = CreatePaddlePredictor<NativeConfig>(config);
+      auto& predictor = predictors[tid];
+      // auto& predictor = predictors[tid];
+      // auto predictor = preds[tid];
+      // std::this_thread::sleep_for(std::chrono::seconds(20));
       PaddleTensor tensor_out;
       std::vector<PaddleTensor> outputs(1, tensor_out);
-      auto predictor = CreatePaddlePredictor<NativeConfig>(config);
-      for (size_t i = 0; i < 1000; i++) {
+      for (size_t i = 0; i < epoches; i++) {
         ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
         VLOG(0) << "tid : " << tid << " run: " << i << "finished";
-        //std::cout <<"tid : " << tid << " run: " << i << "finished" << std::endl;
+        // std::cout <<"tid : " << tid << " run: " << i << "finished" <<
+        // std::endl;
         ASSERT_EQ(outputs.size(), 1UL);
         // int64_t* data_o = static_cast<int64_t*>(outputs[0].data.data());
        // int64_t sum_out = 0;
         // for (size_t j = 0; j < outputs[0].data.length() / sizeof(int64_t);
         //      ++j) {
         //   sum_out += data_o[j];
         // }
         // std::cout << "tid : " << tid << "pass : " << i << " " << sum_out
         //           << std::endl;
       }
     });
   }
   for (int i = 0; i < num_jobs; ++i) {
     threads[i].join();
   }
 }
 // }
 }  // namespace paddle
 int main(int argc, char** argv) {
   paddle::test_naive(1 << 0, "");
   return 0;
 }
@@ -11,7 +11,8 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#define GLOG_NO_ABBREVIATED_SEVERITIES
+#define GOOGLE_GLOG_DLL_DECL
 #include "paddle/fluid/memory/detail/buddy_allocator.h"
 #include "glog/logging.h"
...
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#define GLOG_NO_ABBREVIATED_SEVERITIES
+#define GOOGLE_GLOG_DLL_DECL
 #include "glog/logging.h"
 #include "paddle/fluid/memory/detail/memory_block.h"
 #include "paddle/fluid/platform/assert.h"
...
@@ -50,7 +50,7 @@ class TopkOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X", "(Tensor) The input of Topk op");
-    AddOutput("Out", "(Tensor) The output tensor of Topk op");
+    AddOutput("Out", "(Tensor) The output tensor of Topk op").Reuse("X");
     AddOutput("Indices", "(Tensor) The indices of Topk elements of input");
     AddComment(R"DOC(
 Top K operator
...
@@ -256,65 +256,36 @@ __device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
  * 3. go to the second setp, until one thread's topk value is null;
  * 4. go to the first setp, until get the topk value.
  */
 template <typename T, int MaxLength, int BlockSize>
 __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
-                             const T* src, int lds, int dim, int k,
-                             int grid_dim, int num) {
+                             const T* src, int lds, int dim, int k) {
   __shared__ Pair<T> sh_topk[BlockSize];
+  __shared__ int maxid[BlockSize / 2];
   const int tid = threadIdx.x;
   const int warp = threadIdx.x / 32;
+  output += blockIdx.x * output_stride;
+  indices += blockIdx.x * k;
-  const int bid = blockIdx.x;
-  for (int i = bid; i < num; i += grid_dim) {
-    int top_num = k;
-    __shared__ int maxid[BlockSize / 2];
-    T* out = output + i * output_stride;
-    int64_t* inds = indices + i * k;
-    Pair<T> topk[MaxLength];
-    int beam = MaxLength;
-    Pair<T> max;
-    bool is_empty = false;
-    bool firststep = true;
-    for (int j = 0; j < MaxLength; j++) {
-      topk[j].set(-INFINITY, -1);
-    }
-    while (top_num) {
-      ThreadGetTopK<T, MaxLength, BlockSize>(
-          topk, &beam, k, src + i * lds, &firststep, &is_empty, &max, dim, tid);
-      sh_topk[tid] = topk[0];
-      BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &out, &inds,
-                                           &beam, &top_num, tid, warp);
-    }
-  }
-}
-
-inline static int GetDesiredBlockDim(int dim) {
-  if (dim > 128) {
-    return 256;
-  } else if (dim > 64) {
-    return 128;
-  } else if (dim > 32) {
-    return 64;
-  } else {
-    return 32;
-  }
-}
-
-#define FIXED_BLOCK_DIM_BASE(dim, ...) \
-  case (dim): {                        \
-    constexpr auto kBlockDim = (dim);  \
-    __VA_ARGS__;                       \
-  } break
-
-#define FIXED_BLOCK_DIM(...)                \
-  FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \
-  FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \
-  FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__)
+  Pair<T> topk[MaxLength];
+  int beam = MaxLength;
+  Pair<T> max;
+  bool is_empty = false;
+  bool firststep = true;
+
+  for (int k = 0; k < MaxLength; k++) {
+    topk[k].set(-INFINITY, -1);
+  }
+  while (k) {
+    ThreadGetTopK<T, MaxLength, BlockSize>(topk, &beam, k,
+                                           src + blockIdx.x * lds, &firststep,
+                                           &is_empty, &max, dim, tid);
+
+    sh_topk[tid] = topk[0];
+    BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &output,
+                                         &indices, &beam, &k, tid, warp);
+  }
+}
 template <typename T>
 class TopkOpCUDAKernel : public framework::OpKernel<T> {
  public:
@@ -327,38 +298,30 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
     size_t k = static_cast<int>(ctx.Attr<int>("k"));
     const T* input_data = input->data<T>();
     T* output_data = output->mutable_data<T>(ctx.GetPlace());
     // FIXME(typhoonzero): data is always converted to type T?
     int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
-    framework::DDim inputdims = input->dims();
-    const size_t input_height = framework::product(
-        framework::slice_ddim(inputdims, 0, inputdims.size() - 1));
-    const size_t input_width = inputdims[inputdims.size() - 1];
+    size_t input_height = input->dims()[0];
+    size_t input_width = input->dims()[1];
     if (k > input_width) k = input_width;
     // NOTE: pass lds and dim same to input width.
     // NOTE: old matrix implementation of stride is different to eigen.
     // TODO(typhoonzero): refine this kernel.
-    const int kMaxHeight = 2048;
-    int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
-    auto& dev_ctx = ctx.cuda_device_context();
-    switch (GetDesiredBlockDim(input_width)) {
-      FIXED_BLOCK_DIM(
-          KeMatrixTopK<T, 5,
-                       kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
-              output_data, k, indices_data, input_data, input_width,
-              input_width, static_cast<int>(k), gridx, input_height));
-      default:
-        PADDLE_THROW("Error");
-    }
+    dim3 threads(256, 1);
+    dim3 grid(input_height, 1);
+
+    KeMatrixTopK<T, 5, 256><<<
+        grid, threads, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
+                              ctx.device_context())
+                              .stream()>>>(
+        output_data, output->dims()[1], indices_data, input_data, input_width,
+        input_width, static_cast<int>(k));
   }
 };
-#undef FIXED_BLOCK_DIM_BASE
-#undef FIXED_BLOCK_DIM
 }  // namespace operators
 }  // namespace paddle
...
@@ -34,6 +34,7 @@ class TopkKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     // Get the top k elements of each row of input tensor
+    // FIXME: only deal with matrix(2d tensor).
     auto* input = ctx.Input<Tensor>("X");
     auto* output = ctx.Output<Tensor>("Out");
     auto* indices = ctx.Output<Tensor>("Indices");
@@ -43,6 +44,8 @@ class TopkKernel : public framework::OpKernel<T> {
     T* output_data = output->mutable_data<T>(ctx.GetPlace());
     int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
+    auto eg_input = EigenMatrix<T>::From(*input);
+
     // reshape input to a flattern matrix(like flat_inner_dims)
     framework::DDim inputdims = input->dims();
     const size_t row = framework::product(
@@ -50,7 +53,7 @@ class TopkKernel : public framework::OpKernel<T> {
     const size_t col = inputdims[inputdims.size() - 1];
     Eigen::DSizes<int, 2> flat2dims(row, col);
     // NOTE: eigen shape doesn't affect paddle tensor.
-    auto eg_input = EigenMatrix<T>::Reshape(*input, inputdims.size() - 1);
+    eg_input.reshape(flat2dims);
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for
...
@@ -27,6 +27,12 @@ ENDIF()
 cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS})
 cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
+set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64")
+set(MYDEPS ${MYDEPS} libcmt shlwapi)
+set(MYDEPS ${MYDEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX})
+set(MYDEPS ${MYDEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX})
+set(MYDEPS ${MYDEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX})
+
 nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce)
 cc_library(place SRCS place.cc DEPS enforce boost)
@@ -58,6 +64,7 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_
 cc_test(init_test SRCS init_test.cc DEPS device_context)
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
+target_link_libraries(cudnn_helper_test ${MYDEPS})
 nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
...
@@ -68,7 +68,14 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
     }                                                                \
   } while (false)
 #else
-#define CUDNN_ENFORCE(condition)
+// windows
+#define CUDNN_ENFORCE(condition)                                     \
+  do {                                                               \
+    cudnnStatus_t status = condition;                                \
+    if (status != CUDNN_STATUS_SUCCESS) {                            \
+      std::cerr << ::paddle::platform::cudnnGetErrorString(status);  \
+    }                                                                \
+  } while (false)
 #endif
 enum class DataLayout {  // Not use
...
@@ -127,7 +127,7 @@ struct EOFException : public std::exception {
 #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
 #else
 // there is no equivalent intrinsics in msvc.
-#define UNLIKELY(condition) (condition == 0)
+#define UNLIKELY(condition) ((condition) == 0)
 #endif
 template <typename... Args>
@@ -309,7 +309,6 @@ inline void throw_on_error(T e) {
 #define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \
   __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__)
 #define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \
   do {                                      \
     if (UNLIKELY(nullptr == (__VAL))) {     \
@@ -330,26 +329,26 @@ inline void throw_on_error(T e) {
     }                                       \
   } while (0)
 #else
-#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) ((__VAL0)==(__VAL1))
-#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) ((__VAL0)!=(__VAL1))
-#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) ((__VAL0)>(__VAL1))
-#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) ((__VAL0)>=(__VAL1))
-#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) ((__VAL0)<(__VAL1))
-#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) ((__VAL0)<=(__VAL1))
+#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) ((__VAL0) == (__VAL1))
+#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) ((__VAL0) != (__VAL1))
+#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) ((__VAL0) > (__VAL1))
+#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) ((__VAL0) >= (__VAL1))
+#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) ((__VAL0) < (__VAL1))
+#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) ((__VAL0) <= (__VAL1))
 #define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \
   do {                                                                 \
     if (!((__VAL0)__CMP(__VAL1))) {                                    \
       PADDLE_THROW("Windows disable the enforce. Enforce failed.");    \
     }                                                                  \
-  } while(0)
+  } while (0)
 #define PADDLE_ENFORCE_NOT_NULL(__VAL1, ...)                       \
   do {                                                             \
     if (nullptr == (__VAL1)) {                                     \
       PADDLE_THROW("Windows disable the enforce. Enforce failed"); \
     }                                                              \
-  } while(0)
+  } while (0)
 #endif  // !_WIN32
 }  // namespace platform
 }  // namespace paddle