Commit 09409bad authored by: D dzhwinter

staged. test speed=49ms in 1080.

Parent 468467f3
@@ -397,72 +397,72 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   }
   platform::DeviceContextPool::Instance().Get(place_)->Wait();
-  VLOG(3) << "start checking";
-  auto& dev_ctx = *platform::DeviceContextPool::Instance().Get(place_);
-  std::vector<std::string> outputs;
-  auto& block = ctx->prog_.Block(0);
-  for(auto& op : block.AllOps()) {
-    if(op->Type() == "load_combine" || op->Type() == "fetch" || op->Type() == "feed") continue;
-    // for(auto& real_op : ctx->ops_) {
-    //   if(real_op->Type() == op->Type()) {
-    //     VLOG(3) << real_op->Type() << " " <<place_ << " " << real_op->DebugStringEx(local_scope);
-    //   }
-    // }
-    //VLOG(3) << "start op output" << op->Type();
-    for(auto var_name: op->InputArgumentNames()) {
-      auto* var = local_scope->Var(var_name);
-      auto* var_desc = block.FindVar(var_name);
-      if (var_desc->Persistable()) continue;
-      auto* tensor = var->GetMutable<framework::LoDTensor>();
-      framework::Tensor check;
-      VLOG(3) << "before tensor copy";
-      framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check);
-      VLOG(3) << "after tensor copy";
-      float sum = .0;
-      for(size_t i=0; i < check.numel(); ++i) {
-        if(std::type_index(check.type()) == std::type_index(typeid(int64_t))) {
-          sum += static_cast<float>(check.data<int64_t>()[i]);
-        } else {
-          sum += check.data<float>()[i];
-        }
-      }
-      VLOG(3) << "op " << op->Type() << " input var " << var_name << " sum " << sum;
-    }
-    VLOG(3) << "op " << op->Type() << "input finished";
-    for(auto var_name: op->OutputArgumentNames()) {
-      auto* var = local_scope->Var(var_name);
-      auto* var_desc = block.FindVar(var_name);
-      if (var_desc->Persistable()) continue;
-      auto* tensor = var->GetMutable<framework::LoDTensor>();
-      framework::Tensor check;
-      VLOG(3) << "before tensor copy";
-      if(op->Type() == "batch_norm" && platform::is_gpu_place(place_)) {
-        VLOG(3) << "op " << op->Type() << " output var " << var_name << " " << tensor->numel();
-        tensor->mutable_data<float>(place_);
-        framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check);
-      } else {
-        framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check);
-      }
-      VLOG(3) << "after tensor copy";
-      float sum = .0;
-      for(size_t i=0; i < check.numel(); ++i) {
-        if(std::type_index(check.type()) == std::type_index(typeid(int64_t))) {
-          sum += static_cast<float>(check.data<int64_t>()[i]);
-        } else {
-          sum += check.data<float>()[i];
-        }
-      }
-      VLOG(3) << "op " << op->Type() << " output var " << var_name << " sum " << sum;
-    }
-  }
-  VLOG(3) << "after checking result";
+  // VLOG(3) << "start checking";
+  // auto& dev_ctx = *platform::DeviceContextPool::Instance().Get(place_);
+  // std::vector<std::string> outputs;
+  // auto& block = ctx->prog_.Block(0);
+  // for(auto& op : block.AllOps()) {
+  //   if(op->Type() == "load_combine" || op->Type() == "fetch" || op->Type() == "feed") continue;
+  //   // for(auto& real_op : ctx->ops_) {
+  //   //   if(real_op->Type() == op->Type()) {
+  //   //     VLOG(3) << real_op->Type() << " " <<place_ << " " << real_op->DebugStringEx(local_scope);
+  //   //   }
+  //   // }
+  //   //VLOG(3) << "start op output" << op->Type();
+  //   for(auto var_name: op->InputArgumentNames()) {
+  //     auto* var = local_scope->Var(var_name);
+  //     auto* var_desc = block.FindVar(var_name);
+  //     if (var_desc->Persistable()) continue;
+  //     auto* tensor = var->GetMutable<framework::LoDTensor>();
+  //     framework::Tensor check;
+  //     VLOG(3) << "before tensor copy";
+  //     framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check);
+  //     VLOG(3) << "after tensor copy";
+  //     float sum = .0;
+  //     for(size_t i=0; i < check.numel(); ++i) {
+  //       if(std::type_index(check.type()) == std::type_index(typeid(int64_t))) {
+  //         sum += static_cast<float>(check.data<int64_t>()[i]);
+  //       } else {
+  //         sum += check.data<float>()[i];
+  //       }
+  //     }
+  //     VLOG(3) << "op " << op->Type() << " input var " << var_name << " sum " << sum;
+  //   }
+  //   VLOG(3) << "op " << op->Type() << "input finished";
+  //   for(auto var_name: op->OutputArgumentNames()) {
+  //     auto* var = local_scope->Var(var_name);
+  //     auto* var_desc = block.FindVar(var_name);
+  //     if (var_desc->Persistable()) continue;
+  //     auto* tensor = var->GetMutable<framework::LoDTensor>();
+  //     framework::Tensor check;
+  //     VLOG(3) << "before tensor copy";
+  //     if(op->Type() == "batch_norm" && platform::is_gpu_place(place_)) {
+  //       VLOG(3) << "op " << op->Type() << " output var " << var_name << " " << tensor->numel();
+  //       tensor->mutable_data<float>(place_);
+  //       framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check);
+  //     } else {
+  //       framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check);
+  //     }
+  //     VLOG(3) << "after tensor copy";
+  //     float sum = .0;
+  //     for(size_t i=0; i < check.numel(); ++i) {
+  //       if(std::type_index(check.type()) == std::type_index(typeid(int64_t))) {
+  //         sum += static_cast<float>(check.data<int64_t>()[i]);
+  //       } else {
+  //         sum += check.data<float>()[i];
+  //       }
+  //     }
+  //     VLOG(3) << "op " << op->Type() << " output var " << var_name << " sum " << sum;
+  //   }
+  // }
+  // VLOG(3) << "after checking result";
   if (local_scope != scope) {
     scope->DeleteScope(local_scope);
......
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <algorithm>
+#include <fstream>
 #include <map>
 #include <set>
 #include <sstream>
@@ -88,6 +89,7 @@ bool NativePaddlePredictor::Init(
     VLOG(3) << config_.model_dir;
     inference_program_ = paddle::inference::Load(executor_.get(), scope_.get(),
                                                  config_.model_dir);
     VLOG(3) << "load model finish";
   } else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
     // All parameters are saved in a single file.
@@ -100,6 +102,31 @@ bool NativePaddlePredictor::Init(
     VLOG(3) << "scope_";
     inference_program_ = paddle::inference::Load(
         executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
+    // VLOG(3) << "modify the program!";
+    // {
+    //   std::ofstream ofs("program.txt", std::ios::out);
+    //   std::string s = inference_program_->Proto()->SerializeAsString();
+    //   ofs.write(s.data(), s.size());
+    //   ofs.close();
+    // }
+    auto &block = inference_program_->Block(0);
+    for (auto *op_desc : block.AllOps()) {
+      if (op_desc->HasAttr("use_cudnn")) {
+        op_desc->SetAttr("use_cudnn", false);
+      }
+      if (op_desc->HasAttr("workspace_size_MB")) {
+        op_desc->SetAttr("workspace_size_MB", 0);
+      }
+    }
+    // {
+    //   std::ofstream ofs("after_program.txt", std::ios::out);
+    //   std::string s = inference_program_->Proto()->SerializeAsString();
+    //   ofs.write(s.data(), s.size());
+    //   ofs.close();
+    // }
     VLOG(3) << "load program finish";
   } else {
     LOG(ERROR) << "fail to load inference model.";
@@ -308,7 +335,8 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
   VLOG(3) << "before check";
   // PADDLE_ENFORCE_GT(
   //     config.fraction_of_gpu_memory, 0.f,
-  //     "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
+  //     "fraction_of_gpu_memory in the config should be set to range (0.,
+  //     1.]");
   VLOG(3) << "failed on first";
   PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
   VLOG(3) << "after flags";
......
@@ -77,7 +77,7 @@ add_executable(real_data_icnet_tester real_data_icnet_tester.cc)
 # add_library(${DEMO_NAME} SHARED ${DEMO_NAME}.cc)
 # add_executable(test test.cc)
-# add_executable(thread_icnet_test thread_icnet_test.cc)
+add_executable(thread_icnet_test thread_icnet_test.cc)
 if(WITH_MKL)
   include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
@@ -130,6 +130,5 @@ target_link_libraries(real_data_icnet_tester ${DEPS})
 # target_link_libraries(${DEMO_NAME} ${DEPS})
 # target_link_libraries(test ${DEMO_NAME} )
-# target_link_libraries(thread_icnet_test ${DEPS})
+target_link_libraries(thread_icnet_test ${DEPS})
 # target_compile_definitions(${DEMO_NAME} PRIVATE "API_DEFINITION")
@@ -25,10 +25,13 @@ namespace paddle {
 NativeConfig GetConfig() {
   NativeConfig config;
   // config.model_dir = FLAGS_dirname;
-  config.prog_file= "hs_lb_without_bn/__model__";
-  config.param_file= "hs_lb_without_bn/__params__";
-  config.fraction_of_gpu_memory = 0.8;
+  config.prog_file = "hs_lb_without_bn/__model__";
+  config.param_file = "hs_lb_without_bn/__params__";
+  // config.prog_file = "hs_lb_without_bn_cuda/__model__";
+  // config.param_file = "hs_lb_without_bn_cuda/__params__";
+  config.fraction_of_gpu_memory = 0.0;
   config.use_gpu = true;
   config.device = 0;
   return config;
@@ -43,8 +46,7 @@ double time_diff(Time t1, Time t2) {
   return counter.count() / 1000.0;
 }
-void test_naive(int batch_size){
+void test_naive(int batch_size) {
   NativeConfig config = GetConfig();
   auto predictor = CreatePaddlePredictor<NativeConfig>(config);
   int height = 449;
@@ -62,7 +64,7 @@ void test_naive(int batch_size){
   // size_t file_num = all_files.size();
   infile.close();
   // =============read file list =============
-  for (size_t f_k = 0; f_k < 1; f_k ++) {
+  for (size_t f_k = 0; f_k < 1; f_k++) {
     std::ifstream in_img(all_files[f_k]);
     std::cout << all_files[f_k] << std::endl;
     float temp_v;
@@ -82,7 +84,8 @@ void test_naive(int batch_size){
     PaddleTensor tensor;
     tensor.shape = std::vector<int>({batch_size, 3, height, width});
     tensor.data.Resize(sizeof(float) * batch_size * 3 * height * width);
-    std::copy(data.begin(), data.end(), static_cast<float*>(tensor.data.data()));
+    std::copy(data.begin(), data.end(),
+              static_cast<float*>(tensor.data.data()));
     tensor.dtype = PaddleDType::FLOAT32;
     std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
     PaddleTensor tensor_out;
@@ -91,17 +94,20 @@ void test_naive(int batch_size){
     // predictor->Run(paddle_tensor_feeds, &outputs, batch_size);
     std::cout << "start predict123:" << std::endl;
     auto time1 = time();
-    for(size_t i = 0; i < 1; i++) {
+    int steps = 100;
+    for (size_t i = 0; i < steps; i++) {
+      if (i == 5) time1 = time();
       predictor->Run(paddle_tensor_feeds, &outputs, batch_size);
     }
     auto time2 = time();
     std::ofstream ofresult("naive_test_result.txt", std::ios::app);
-    std::cout <<"batch: " << batch_size << " predict cost: " << time_diff(time1, time2) / 1000.0 << "ms" << std::endl;
+    std::cout << "batch: " << batch_size
+              << " predict cost: " << time_diff(time1, time2) / steps << "ms"
+              << std::endl;
     std::cout << outputs.size() << std::endl;
-    int64_t * data_o = static_cast<int64_t*>(outputs[0].data.data());
+    int64_t* data_o = static_cast<int64_t*>(outputs[0].data.data());
     int64_t sum_out = 0;
     for (size_t j = 0; j < outputs[0].data.length() / sizeof(int64_t); ++j) {
       ofresult << std::to_string(data_o[j]) << " ";
@@ -116,7 +122,7 @@ void test_naive(int batch_size){
 }  // namespace paddle
 int main(int argc, char** argv) {
   // google::ParseCommandLineFlags(&argc, &argv, true);
-  paddle::test_naive(1<<0);
+  paddle::test_naive(1 << 0);
   return 0;
 }
@@ -20,22 +20,21 @@
 #include <chrono>
 #include <fstream>
 #include <iostream>
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include <thread>  // NOLINT
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
 #define ASSERT_TRUE(x) x
 #define ASSERT_EQ(x, y) assert(x == y)
-namespace paddle {
 // DEFINE_string(dirname, "./LB_icnet_model",
 //               "Directory of the inference model.");
+namespace paddle {
 NativeConfig GetConfig() {
   NativeConfig config;
-  config.prog_file= "./dzh_lb/__model__";
-  config.param_file= "./dzh_lb/__params__";
-  config.fraction_of_gpu_memory = 0.08;
+  config.prog_file = "./hs_lb_without_bn_cuda/__model__";
+  config.param_file = "./hs_lb_without_bn_cuda/__params__";
+  config.fraction_of_gpu_memory = 0.5;
   config.use_gpu = true;
   config.device = 0;
   return config;
@@ -50,56 +49,84 @@ double time_diff(Time t1, Time t2) {
   return counter.count() / 1000.0;
 }
-void test_naive(int batch_size, std::string model_path){
-  PaddlePredictor* pres[2];
+void test_naive(int batch_size, std::string model_path) {
   NativeConfig config = GetConfig();
-  // config.model_dir = model_path;
-  auto predictor0 = CreatePaddlePredictor<NativeConfig>(config);
-  auto predictor1 = CreatePaddlePredictor<NativeConfig>(config);
-  pres[0] = predictor0.get();
-  pres[1] = predictor1.get();
   int height = 449;
   int width = 581;
   std::vector<float> data;
-  for (int i = 0; i < 3 * height * width; i++) {
-    data.push_back(0);
+  for(int i=0; i < 3 * height * width; ++i) {
+    data.push_back(0.0);
   }
+  // read data
+  // std::ifstream infile("new_file.list");
+  // std::string temp_s;
+  // std::vector<std::string> all_files;
+  // while (!infile.eof()) {
+  //   infile >> temp_s;
+  //   all_files.push_back(temp_s);
+  // }
+  // // size_t file_num = all_files.size();
+  // infile.close();
+  // // =============read file list =============
+  // for (size_t f_k = 0; f_k < 1; f_k++) {
+  //   std::ifstream in_img(all_files[f_k]);
+  //   std::cout << all_files[f_k] << std::endl;
+  //   float temp_v;
+  //   float sum_n = 0.0;
+  //   std::vector<float> data;
+  //   while (!in_img.eof()) {
+  //     in_img >> temp_v;
+  //     data.push_back(float(temp_v));
+  //     sum_n += temp_v;
+  //   }
+  //   in_img.close();
+  //   std::cout << "sum: " << sum_n << std::endl;
   PaddleTensor tensor;
   tensor.shape = std::vector<int>({batch_size, 3, height, width});
   tensor.data.Resize(sizeof(float) * batch_size * 3 * height * width);
-  std::copy(data.begin(), data.end(), static_cast<float*>(tensor.data.data()));
+  std::copy(data.begin(), data.end(),
+            static_cast<float*>(tensor.data.data()));
   tensor.dtype = PaddleDType::FLOAT32;
   std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
-  constexpr int num_jobs = 5;  // each job run 1 batch
+  constexpr int num_jobs = 2;  // each job run 1 batch
   std::vector<std::thread> threads;
   for (int tid = 0; tid < num_jobs; ++tid) {
     threads.emplace_back([&, tid]() {
-      auto predictor = pres[tid];
-      std::vector<PaddleTensor> local_outputs;
-      for(size_t i = 0; i < 1000; i++) {
-        ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &local_outputs));
-        std::cout << "run: " << tid << std::endl;
+      PaddleTensor tensor_out;
+      std::vector<PaddleTensor> outputs(1, tensor_out);
+      auto predictor = CreatePaddlePredictor<NativeConfig>(config);
+      for (size_t i = 0; i < 1000; i++) {
+        ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
+        VLOG(0) << "tid : " << tid << " run: " << i << "finished";
+        //std::cout <<"tid : " << tid << " run: " << i << "finished" << std::endl;
+        ASSERT_EQ(outputs.size(), 1UL);
+        // int64_t* data_o = static_cast<int64_t*>(outputs[0].data.data());
+        // int64_t sum_out = 0;
+        // for (size_t j = 0; j < outputs[0].data.length() / sizeof(int64_t);
+        //      ++j) {
+        //   sum_out += data_o[j];
+        // }
+        // std::cout << "tid : " << tid << "pass : " << i << " " << sum_out
+        //           << std::endl;
       }
-      ASSERT_EQ(local_outputs.size(), 1UL);
     });
   }
   for (int i = 0; i < num_jobs; ++i) {
     threads[i].join();
   }
 }
-// }
-//TEST(alexnet, naive) {
-//  test_naive(1 << 0, "./trt_models/vgg19");
-//}
 }  // namespace paddle
 int main(int argc, char** argv) {
   paddle::test_naive(1 << 0, "");
+  return 0;
 }
@@ -163,7 +163,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     VLOG(3) << "after get workspace";
     // Allocate on GPU memory
     platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-    workspace_size_in_bytes = 1024;
+    // workspace_size_in_bytes = 1024;
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
     VLOG(3) << "allocate memory";
     // ------------------- cudnn conv forward ---------------------
@@ -324,7 +324,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     // Already on GPU
     void* cudnn_workspace = nullptr;
     platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-    workspace_size_in_bytes = 1024;
+    //workspace_size_in_bytes = 1024;
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
     // ------------------- cudnn conv backward data ---------------------
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
......
@@ -62,18 +62,18 @@ class LoadCombineOp : public framework::OperatorBase {
     VLOG(3) << "before deserialization";
     // Get data from fin to tensor
     DeserializeFromStream(fin, tensor, dev_ctx);
-    VLOG(3) << "after deserialization";
-    framework::Tensor check;
-    framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check);
-    float sum = .0;
-    for(size_t i=0; i < check.numel(); ++i) {
-      if(std::type_index(check.type()) == std::type_index(typeid(int64_t))) {
-        sum += static_cast<float>(check.data<int64_t>()[i]);
-      } else {
-        sum += check.data<float>()[i];
-      }
-    }
-    VLOG(3) << "sum result" << sum;
+    // VLOG(3) << "after deserialization";
+    // framework::Tensor check;
+    // framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check);
+    // float sum = .0;
+    // for(size_t i=0; i < check.numel(); ++i) {
+    //   if(std::type_index(check.type()) == std::type_index(typeid(int64_t))) {
+    //     sum += static_cast<float>(check.data<int64_t>()[i]);
+    //   } else {
+    //     sum += check.data<float>()[i];
+    //   }
+    // }
+    // VLOG(3) << "sum result" << sum;
     auto in_dtype = framework::ToDataType(tensor->type());
     auto out_dtype =
         load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
......
@@ -50,7 +50,7 @@ class TopkOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X", "(Tensor) The input of Topk op");
-    AddOutput("Out", "(Tensor) The output tensor of Topk op").Reuse("X");
+    AddOutput("Out", "(Tensor) The output tensor of Topk op");
     AddOutput("Indices", "(Tensor) The indices of Topk elements of input");
     AddComment(R"DOC(
 Top K operator
......
@@ -256,36 +256,65 @@ __device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
  * 3. go to the second setp, until one thread's topk value is null;
  * 4. go to the first setp, until get the topk value.
  */
 template <typename T, int MaxLength, int BlockSize>
 __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
-                             const T* src, int lds, int dim, int k) {
+                             const T* src, int lds, int dim, int k,
+                             int grid_dim, int num) {
   __shared__ Pair<T> sh_topk[BlockSize];
-  __shared__ int maxid[BlockSize / 2];
   const int tid = threadIdx.x;
   const int warp = threadIdx.x / 32;
-  output += blockIdx.x * output_stride;
-  indices += blockIdx.x * k;
+  const int bid = blockIdx.x;
+  for (int i = bid; i < num; i += grid_dim) {
+    int top_num = k;
+    __shared__ int maxid[BlockSize / 2];
+    T* out = output + i * output_stride;
+    int64_t* inds = indices + i * k;
   Pair<T> topk[MaxLength];
   int beam = MaxLength;
   Pair<T> max;
   bool is_empty = false;
   bool firststep = true;
-  for (int k = 0; k < MaxLength; k++) {
-    topk[k].set(-INFINITY, -1);
+  for (int j = 0; j < MaxLength; j++) {
+    topk[j].set(-INFINITY, -1);
   }
-  while (k) {
-    ThreadGetTopK<T, MaxLength, BlockSize>(topk, &beam, k,
-                                           src + blockIdx.x * lds, &firststep,
-                                           &is_empty, &max, dim, tid);
+  while (top_num) {
+    ThreadGetTopK<T, MaxLength, BlockSize>(
+        topk, &beam, k, src + i * lds, &firststep, &is_empty, &max, dim, tid);
     sh_topk[tid] = topk[0];
-    BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &output,
-                                         &indices, &beam, &k, tid, warp);
+    BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &out, &inds,
+                                         &beam, &top_num, tid, warp);
+  }
   }
 }
+inline static int GetDesiredBlockDim(int dim) {
+  if (dim > 128) {
+    return 256;
+  } else if (dim > 64) {
+    return 128;
+  } else if (dim > 32) {
+    return 64;
+  } else {
+    return 32;
+  }
+}
+#define FIXED_BLOCK_DIM_BASE(dim, ...) \
+  case (dim): {                        \
+    constexpr auto kBlockDim = (dim);  \
+    __VA_ARGS__;                       \
+  } break
+#define FIXED_BLOCK_DIM(...)                \
+  FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \
+  FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \
+  FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__)
 template <typename T>
 class TopkOpCUDAKernel : public framework::OpKernel<T> {
  public:
@@ -298,30 +327,38 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
     size_t k = static_cast<int>(ctx.Attr<int>("k"));
     const T* input_data = input->data<T>();
     T* output_data = output->mutable_data<T>(ctx.GetPlace());
     // FIXME(typhoonzero): data is always converted to type T?
     int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
-    size_t input_height = input->dims()[0];
-    size_t input_width = input->dims()[1];
+    framework::DDim inputdims = input->dims();
+    const size_t input_height = framework::product(
+        framework::slice_ddim(inputdims, 0, inputdims.size() - 1));
+    const size_t input_width = inputdims[inputdims.size() - 1];
     if (k > input_width) k = input_width;
     // NOTE: pass lds and dim same to input width.
     // NOTE: old matrix implementation of stride is different to eigen.
     // TODO(typhoonzero): refine this kernel.
-    dim3 threads(256, 1);
-    dim3 grid(input_height, 1);
-    KeMatrixTopK<T, 5, 256><<<
-        grid, threads, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
-                              ctx.device_context())
-                              .stream()>>>(
-        output_data, output->dims()[1], indices_data, input_data, input_width,
-        input_width, static_cast<int>(k));
+    const int kMaxHeight = 2048;
+    int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
+    auto& dev_ctx = ctx.cuda_device_context();
+    switch (GetDesiredBlockDim(input_width)) {
+      FIXED_BLOCK_DIM(
+          KeMatrixTopK<T, 5,
+                       kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
+              output_data, k, indices_data, input_data, input_width,
+              input_width, static_cast<int>(k), gridx, input_height));
+      default:
+        PADDLE_THROW("Error");
+    }
   }
 };
+#undef FIXED_BLOCK_DIM_BASE
+#undef FIXED_BLOCK_DIM
 }  // namespace operators
 }  // namespace paddle
......
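The switch-plus-macro dispatch added above can be tried outside of Paddle. Below is a minimal, CPU-only sketch of the same FIXED_BLOCK_DIM pattern, assuming an illustrative RunWithBlockDim<kBlockDim>() stand-in for the real KeMatrixTopK launch and made-up widths; it is not part of this commit, only a demonstration of how the runtime width picks a compile-time block size.

#include <cstdio>
#include <initializer_list>

template <int BlockDim>
void RunWithBlockDim(int width) {
  // Stand-in for a kernel launch such as KeMatrixTopK<T, 5, BlockDim><<<...>>>.
  std::printf("width %d -> block dim %d\n", width, BlockDim);
}

inline int GetDesiredBlockDim(int dim) {
  if (dim > 128) return 256;
  if (dim > 64) return 128;
  if (dim > 32) return 64;
  return 32;
}

#define FIXED_BLOCK_DIM_BASE(dim, ...) \
  case (dim): {                        \
    constexpr int kBlockDim = (dim);   \
    __VA_ARGS__;                       \
  } break

#define FIXED_BLOCK_DIM(...)                \
  FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \
  FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \
  FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__);  \
  FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__)

int main() {
  for (int width : {20, 100, 500}) {
    // The macro expands to one case per supported block size, so kBlockDim is
    // a compile-time constant inside each case and usable as a template argument.
    switch (GetDesiredBlockDim(width)) {
      FIXED_BLOCK_DIM(RunWithBlockDim<kBlockDim>(width));
      default:
        return 1;  // unreachable for the widths above
    }
  }
  return 0;
}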
@@ -34,7 +34,6 @@ class TopkKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     // Get the top k elements of each row of input tensor
-    // FIXME: only deal with matrix(2d tensor).
     auto* input = ctx.Input<Tensor>("X");
     auto* output = ctx.Output<Tensor>("Out");
     auto* indices = ctx.Output<Tensor>("Indices");
@@ -44,8 +43,6 @@ class TopkKernel : public framework::OpKernel<T> {
     T* output_data = output->mutable_data<T>(ctx.GetPlace());
     int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
-    auto eg_input = EigenMatrix<T>::From(*input);
     // reshape input to a flattern matrix(like flat_inner_dims)
     framework::DDim inputdims = input->dims();
     const size_t row = framework::product(
@@ -53,7 +50,7 @@ class TopkKernel : public framework::OpKernel<T> {
     const size_t col = inputdims[inputdims.size() - 1];
     Eigen::DSizes<int, 2> flat2dims(row, col);
     // NOTE: eigen shape doesn't affect paddle tensor.
-    eg_input.reshape(flat2dims);
+    auto eg_input = EigenMatrix<T>::Reshape(*input, inputdims.size() - 1);
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for
......
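Both the CUDA and CPU kernels now flatten an n-dimensional input the same way before running top-k slice by slice: the row count is the product of all leading dimensions and the column count is the last dimension. A small standalone sketch of that shape arithmetic, with an illustrative 4-D shape and no Paddle API:

#include <cstdio>
#include <vector>

int main() {
  // Hypothetical 4-D input shape; any rank >= 1 works the same way.
  std::vector<long long> dims = {2, 3, 4, 5};
  long long row = 1;
  for (size_t i = 0; i + 1 < dims.size(); ++i) row *= dims[i];  // product of leading dims
  long long col = dims.back();                                  // last dimension
  // Top-k then runs independently over each of the `row` slices of length `col`.
  std::printf("viewed as %lld x %lld\n", row, col);
  return 0;
}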