Commit 06180779 authored by T tensor-tang

Merge remote-tracking branch 'ups/develop' into refine/ut/lac

......@@ -213,9 +213,11 @@ include(configure) # add paddle env configuration
if(WITH_GPU)
include(cuda)
include(tensorrt)
endif()
if(WITH_MKL OR WITH_MKLML)
include(external/anakin)
elseif()
set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in GPU only now." FORCE)
set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in MKL only now." FORCE)
endif()
include(generic) # simplify cmake module
......
......@@ -16,16 +16,6 @@ set(ANAKIN_LIBRARY ${ANAKIN_INSTALL_DIR})
set(ANAKIN_SHARED_LIB ${ANAKIN_LIBRARY}/libanakin.so)
set(ANAKIN_SABER_LIB ${ANAKIN_LIBRARY}/libanakin_saber_common.so)
# TODO(luotao): ANAKIN_MODLE_URL etc will move to demo ci later.
set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
set(ANAKIN_MODLE_URL "${INFERENCE_URL}/mobilenet_v2.anakin.bin")
set(ANAKIN_RNN_MODLE_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn.anakin2.model.bin")
set(ANAKIN_RNN_DATA_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn_data.txt")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}")
execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL} -N")
execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_MODLE_URL} -N")
execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_DATA_URL} -N")
include_directories(${ANAKIN_INCLUDE})
include_directories(${ANAKIN_INCLUDE}/saber/)
include_directories(${ANAKIN_INCLUDE}/saber/core/)
......@@ -48,6 +38,11 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
-Wno-reorder
-Wno-error=cpp)
if(WITH_GPU)
set(CMAKE_ARGS_PREFIX -DUSE_GPU_PLACE=YES -DCUDNN_ROOT=${CUDNN_ROOT} -DCUDNN_INCLUDE_DIR=${CUDNN_INCLUDE_DIR})
else()
set(CMAKE_ARGS_PREFIX -DUSE_GPU_PLACE=NO)
endif()
ExternalProject_Add(
extern_anakin
${EXTERNAL_PROJECT_LOG_ARGS}
......@@ -56,13 +51,11 @@ ExternalProject_Add(
GIT_TAG "9424277cf9ae180a14aff09560d3cd60a49c76d2"
PREFIX ${ANAKIN_SOURCE_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DUSE_GPU_PLACE=YES
CMAKE_ARGS ${CMAKE_ARGS_PREFIX}
-DUSE_X86_PLACE=YES
-DBUILD_WITH_UNIT_TEST=NO
-DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
-DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml
-DCUDNN_ROOT=${CUDNN_ROOT}
-DCUDNN_INCLUDE_DIR=${CUDNN_INCLUDE_DIR}
-DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR}
......
......@@ -145,7 +145,7 @@ copy(memory_lib
set(inference_deps paddle_fluid_shared paddle_fluid)
set(module "inference/api")
if (WITH_ANAKIN AND WITH_GPU)
if (WITH_ANAKIN AND WITH_MKL)
copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
SRCS
${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api
......
......@@ -822,6 +822,14 @@ pad
.. autofunction:: paddle.fluid.layers.pad
:noindex:
.. _api_fluid_layers_pad_constant_like:
pad_constant_like
-----------------
.. autofunction:: paddle.fluid.layers.pad_constant_like
:noindex:
.. _api_fluid_layers_label_smooth:
label_smooth
......@@ -1145,6 +1153,14 @@ sigmoid
.. autofunction:: paddle.fluid.layers.sigmoid
:noindex:
.. _api_fluid_layers_hsigmoid:
hsigmoid
--------
.. autofunction:: paddle.fluid.layers.hsigmoid
:noindex:
.. _api_fluid_layers_logsigmoid:
logsigmoid
......
......@@ -104,6 +104,7 @@ visualDL --logdir=scratch_log --port=8080
# Visit http://127.0.0.1:8080
```
If you see `TypeError: __init__() got an unexpected keyword argument 'file'`, it is because your protobuf version is below 3.5; running `pip install --upgrade protobuf` will fix it.
If you still run into installation problems inside a virtual environment, try the following approaches.
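A quick way to confirm the installed protobuf version before upgrading (a minimal check added here for convenience; not part of the original guide):

```python
# Print the installed protobuf version; anything below 3.5 triggers the
# TypeError mentioned above and should be upgraded.
import google.protobuf
print(google.protobuf.__version__)
```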
......
......@@ -43,6 +43,7 @@ paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list',
paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.Trainer.__init__ ArgSpec(args=['self', 'train_func', 'optimizer_func', 'param_path', 'place', 'parallel', 'checkpoint_config'], varargs=None, keywords=None, defaults=(None, None, False, None))
paddle.fluid.Trainer.save_inference_model ArgSpec(args=['self', 'param_path', 'feeded_var_names', 'target_var_indexes'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Trainer.save_params ArgSpec(args=['self', 'param_path'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Trainer.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Trainer.test ArgSpec(args=['self', 'reader', 'feed_order'], varargs=None, keywords=None, defaults=None)
......
......@@ -13,13 +13,10 @@
// limitations under the License.
#include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h"
#include <string>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/api/helper.h"
namespace paddle {
namespace framework {
......
......@@ -11,6 +11,7 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h"
#include <string>
#include "paddle/fluid/framework/lod_tensor.h"
......
......@@ -85,7 +85,7 @@ void GraphPatternDetector::operator()(Graph* graph,
LOG(INFO) << "detect " << subgraphs.size() << " subgraph matches the pattern";
int id = 0;
for (auto& g : subgraphs) {
LOG(INFO) << "optimizing #" << id++ << " subgraph";
VLOG(3) << "optimizing #" << id++ << " subgraph";
handler(g, graph);
}
}
......
......@@ -50,20 +50,37 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
Dot dot;
std::vector<Dot::Attr> op_attrs({Dot::Attr("style", "filled"),
Dot::Attr("shape", "box"),
Dot::Attr("fillcolor", "red")});
std::vector<Dot::Attr> var_attrs({Dot::Attr("style", "filled,rounded"),
// Dot::Attr("shape", "diamond"),
Dot::Attr("fillcolor", "yellow")});
std::vector<Dot::Attr> marked_op_attrs({Dot::Attr("style", "filled"),
Dot::Attr("shape", "box"),
Dot::Attr("fillcolor", "lightgray")});
std::vector<Dot::Attr> marked_var_attrs(
{Dot::Attr("style", "filled,rounded"),
// Dot::Attr("shape", "diamond"),
Dot::Attr("fillcolor", "lightgray")});
const std::vector<Dot::Attr> op_attrs({
Dot::Attr("style", "rounded,filled,bold"), //
Dot::Attr("shape", "box"), //
Dot::Attr("color", "#303A3A"), //
Dot::Attr("fontcolor", "#ffffff"), //
Dot::Attr("width", "1.3"), //
Dot::Attr("height", "0.84"), //
Dot::Attr("fontname", "Arial"), //
});
const std::vector<Dot::Attr> arg_attrs({
Dot::Attr("shape", "box"), //
Dot::Attr("style", "rounded,filled,bold"), //
Dot::Attr("fontname", "Arial"), //
Dot::Attr("fillcolor", "#999999"), //
Dot::Attr("color", "#dddddd"), //
});
const std::vector<Dot::Attr> param_attrs({
Dot::Attr("shape", "box"), //
Dot::Attr("style", "rounded,filled,bold"), //
Dot::Attr("fontname", "Arial"), //
Dot::Attr("color", "#148b97"), //
Dot::Attr("fontcolor", "#ffffff"), //
});
const std::vector<Dot::Attr> marked_op_attrs(
{Dot::Attr("style", "rounded,filled,bold"), Dot::Attr("shape", "box"),
Dot::Attr("fillcolor", "yellow")});
const std::vector<Dot::Attr> marked_var_attrs(
{Dot::Attr("style", "filled,rounded"), Dot::Attr("shape", "box"),
Dot::Attr("fillcolor", "yellow")});
auto marked_nodes = ConsumeMarkedNodes(graph.get());
// Create nodes
......@@ -74,9 +91,17 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
marked_nodes.count(n) ? marked_op_attrs : op_attrs;
dot.AddNode(node_id, attr, node_id);
} else if (n->IsVar()) {
decltype(op_attrs) attr =
marked_nodes.count(n) ? marked_var_attrs : var_attrs;
dot.AddNode(node_id, attr, node_id);
decltype(op_attrs)* attr;
if (marked_nodes.count(n)) {
attr = &marked_var_attrs;
} else if (const_cast<Node*>(n)->Var() &&
const_cast<Node*>(n)->Var()->Persistable()) {
attr = &param_attrs;
} else {
attr = &arg_attrs;
}
dot.AddNode(node_id, *attr, node_id);
}
node2dot[n] = node_id;
}
......
......@@ -6,6 +6,7 @@ cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits
analyzer.cc
helper.cc
# passes
analysis_pass.cc
fluid_to_data_flow_graph_pass.cc
data_flow_graph_to_fluid_pass.cc
dfg_graphviz_draw_pass.cc
......@@ -99,12 +100,17 @@ inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz")
set(TEXT_CLASSIFICATION_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/text_classification_data.txt.tar.gz")
set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification" CACHE PATH "Text Classification model and data root." FORCE)
if (NOT EXISTS ${TEXT_CLASSIFICATION_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} "text-classification-Senta.tar.gz")
inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_DATA_URL} "text_classification_data.txt.tar.gz")
endif()
inference_analysis_test(test_text_classification SRCS test_text_classification.cc
inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta)
ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta
--infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt
--topn=1 # Just run top 1 batch.
)
......@@ -12,4 +12,4 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/pass.h"
#include "paddle/fluid/inference/analysis/analysis_pass.h"
......@@ -28,10 +28,10 @@ namespace paddle {
namespace inference {
namespace analysis {
class Pass {
class AnalysisPass {
public:
Pass() = default;
virtual ~Pass() = default;
AnalysisPass() = default;
virtual ~AnalysisPass() = default;
// Mutable Pass.
virtual bool Initialize(Argument *argument) { return false; }
// Readonly Pass.
......@@ -42,23 +42,16 @@ class Pass {
virtual bool Finalize() { return false; }
// Get a Pass appropriate to print the Node this pass operates on.
virtual Pass *CreatePrinterPass(std::ostream &os,
const std::string &banner) const {
virtual AnalysisPass *CreatePrinterPass(std::ostream &os,
const std::string &banner) const {
return nullptr;
}
// Create a debugger Pass that draws the DFG with the graphviz toolkit.
virtual Pass *CreateGraphvizDebugerPass() const { return nullptr; }
virtual AnalysisPass *CreateGraphvizDebugerPass() const { return nullptr; }
virtual void Run() { LOG(FATAL) << "not valid"; }
// Run on a single Node.
virtual void Run(Node *x) { LOG(FATAL) << "not valid"; }
// Run on a single Function.
virtual void Run(Function *x) { LOG(FATAL) << "not valid"; }
// Run on a single FunctionBlock.
virtual void Run(FunctionBlock *x) { LOG(FATAL) << "not valid"; }
// Run on a single DataFlowGraph.
virtual void Run(DataFlowGraph *x) { LOG(FATAL) << "not valid"; }
virtual void Run(DataFlowGraph *x) = 0;
// Human-readable short representation.
virtual std::string repr() const = 0;
......@@ -66,29 +59,8 @@ class Pass {
virtual std::string description() const { return "No DOC"; }
};
// NodePass process on any Node types.
class NodePass : public Pass {
public:
virtual void Run(Node *node) = 0;
};
// NodePass process on any Function node types.
class FunctionPass : public Pass {
public:
virtual void Run(Function *node) = 0;
};
// NodePass process on any FunctionBlock node types.
class FunctionBlockPass : public Pass {
public:
virtual void Run(FunctionBlock *node) = 0;
};
// GraphPass processes on any GraphType.
class DataFlowGraphPass : public Pass {
public:
virtual void Run(DataFlowGraph *graph) = 0;
};
class DataFlowGraphPass : public AnalysisPass {};
} // namespace analysis
} // namespace inference
......
......@@ -15,6 +15,7 @@
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <string>
#include <vector>
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
......@@ -58,7 +59,7 @@ class DfgPassManagerImpl final : public DfgPassManager {
std::string description() const override { return "DFG pass manager."; }
private:
void AddPass(const std::string& name, Pass* pass) {
void AddPass(const std::string& name, AnalysisPass* pass) {
VLOG(3) << "Adding pass " << name;
Register(name, pass);
AddGraphvizDebugerPass(pass);
......@@ -87,7 +88,7 @@ class DfgPassManagerImpl final : public DfgPassManager {
}
// Add the graphviz debuger pass if the parent pass has one.
void AddGraphvizDebugerPass(Pass* pass) {
void AddGraphvizDebugerPass(AnalysisPass* pass) {
auto* debuger_pass = pass->CreateGraphvizDebugerPass();
if (debuger_pass) {
Register(debuger_pass->repr(), debuger_pass);
......@@ -106,7 +107,6 @@ void Analyzer::Run(Argument* argument) {
}
}
passes.push_back("graph_viz_pass");
// Ugly support fluid-to-ir-pass
argument->Set(kFluidToIrPassesAttr, new std::vector<std::string>(passes));
for (auto& x : data_) {
......
......@@ -38,8 +38,9 @@ limitations under the License. */
#include <gflags/gflags.h>
#include <string>
#include <vector>
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/flags.h"
#include "paddle/fluid/inference/analysis/pass.h"
#include "paddle/fluid/inference/analysis/pass_manager.h"
namespace paddle {
......
......@@ -16,6 +16,7 @@
#include <google/protobuf/text_format.h>
#include <gtest/gtest.h>
#include <thread> // NOLINT
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
......@@ -24,12 +25,12 @@
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN");
DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN");
DEFINE_int32(batch_size, 10, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
namespace paddle {
namespace inference {
......@@ -220,39 +221,6 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
}
}
std::string DescribeTensor(const PaddleTensor &tensor) {
std::stringstream os;
os << "Tensor [" << tensor.name << "]\n";
os << " - type: ";
switch (tensor.dtype) {
case PaddleDType::FLOAT32:
os << "float32";
break;
case PaddleDType::INT64:
os << "int64";
break;
default:
os << "unset";
}
os << '\n';
os << " - shape: " << to_string(tensor.shape) << '\n';
os << " - lod: ";
for (auto &l : tensor.lod) {
os << to_string(l) << "; ";
}
os << "\n";
os << " - data: ";
int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
[](int a, int b) { return a * b; });
for (int i = 0; i < dim; i++) {
os << static_cast<float *>(tensor.data.data())[i] << " ";
}
os << '\n';
return os.str();
}
} // namespace
const float ditu_rnn_target_data[] = {
......@@ -266,11 +234,29 @@ const float ditu_rnn_target_data[] = {
10.7286, 12.0595, 10.6672, 0, 0, 0, 0, 0,
93.5771, 3.84641, 0, 0, 0, 0, 0, 0,
169.426, 0, 0, 0, 0, 0, 0, 0};
void CompareResult(const std::vector<PaddleTensor> &outputs,
const std::vector<PaddleTensor> &base_outputs) {
PADDLE_ENFORCE_GT(outputs.size(), 0);
PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
for (size_t i = 0; i < outputs.size(); i++) {
auto &out = outputs[i];
auto &base_out = base_outputs[i];
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; });
size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
1, [](int a, int b) { return a * b; });
PADDLE_ENFORCE_EQ(size, size1);
PADDLE_ENFORCE_GT(size, 0);
float *data = static_cast<float *>(out.data.data());
float *base_data = static_cast<float *>(base_out.data.data());
for (size_t i = 0; i < size; i++) {
EXPECT_NEAR(data[i], base_data[i], 1e-3);
}
}
}
// Test with a really complicated model.
void TestDituRNNPrediction(const std::string &model_path,
const std::string &data_path, int batch_size,
bool use_analysis, bool activate_ir,
int num_times = 1) {
void TestDituRNNPrediction(bool use_analysis, bool activate_ir,
int num_threads) {
AnalysisConfig config;
config.prog_file = FLAGS_infer_ditu_rnn_model + "/__model__";
config.param_file = FLAGS_infer_ditu_rnn_model + "/param";
......@@ -281,6 +267,8 @@ void TestDituRNNPrediction(const std::string &model_path,
PADDLE_ENFORCE(config.ir_mode ==
AnalysisConfig::IrPassMode::kExclude); // default
config.ir_passes.clear(); // Do not exclude any pass.
int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat;
auto base_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
......@@ -288,40 +276,55 @@ void TestDituRNNPrediction(const std::string &model_path,
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
std::vector<PaddleTensor> input_slots;
DataRecord data(data_path, batch_size);
DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
// Prepare inputs.
PrepareInputs(&input_slots, &data, batch_size);
std::vector<PaddleTensor> outputs, base_outputs;
base_predictor->Run(input_slots, &base_outputs);
Timer timer;
timer.tic();
for (int i = 0; i < num_times; i++) {
predictor->Run(input_slots, &outputs);
}
LOG(INFO) << "===========profile result===========";
LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << num_times
<< ", latency: " << timer.toc() / num_times << "ms";
LOG(INFO) << "=====================================";
PADDLE_ENFORCE_GT(outputs.size(), 0);
PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
for (size_t i = 0; i < outputs.size(); i++) {
auto &out = outputs[i];
auto &base_out = base_outputs[i];
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; });
size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
1, [](int a, int b) { return a * b; });
PADDLE_ENFORCE_EQ(size, size1);
PADDLE_ENFORCE_GT(size, 0);
float *data = static_cast<float *>(out.data.data());
float *base_data = static_cast<float *>(base_out.data.data());
for (size_t j = 0; j < size; j++) {
EXPECT_NEAR(data[j], base_data[j], 1e-3);
if (num_threads == 1) {
// Prepare inputs.
Timer timer;
timer.tic();
for (int i = 0; i < num_times; i++) {
predictor->Run(input_slots, &outputs);
}
PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times);
CompareResult(outputs, base_outputs);
} else {
std::vector<std::thread> threads;
std::vector<std::unique_ptr<PaddlePredictor>> predictors;
// TODO(yanchunwei): Bug here, the analyzer phase can't be parallelized
// because AttentionLSTM's hard-coded node id will be damaged.
for (int tid = 0; tid < num_threads; ++tid) {
predictors.emplace_back(
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config));
}
for (int tid = 0; tid < num_threads; ++tid) {
threads.emplace_back([&, tid]() {
// Each thread should have local input_slots and outputs.
std::vector<PaddleTensor> input_slots;
DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
PrepareInputs(&input_slots, &data, batch_size);
std::vector<PaddleTensor> outputs;
Timer timer;
timer.tic();
for (int i = 0; i < num_times; i++) {
predictors[tid]->Run(input_slots, &outputs);
}
PrintTime(batch_size, num_times, num_threads, tid,
timer.toc() / num_times);
CompareResult(outputs, base_outputs);
});
}
for (int i = 0; i < num_threads; ++i) {
threads[i].join();
}
}
LOG(INFO) << "=====================================";
if (use_analysis && activate_ir) {
AnalysisPredictor *analysis_predictor =
......@@ -350,25 +353,26 @@ void TestDituRNNPrediction(const std::string &model_path,
}
}
// Directly infer with the original model.
TEST(Analyzer, DituRNN_without_analysis) {
TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
FLAGS_batch_size, false, false, FLAGS_repeat);
// Inference with analysis and IR, easy for profiling independently.
TEST(Analyzer, DituRNN) {
TestDituRNNPrediction(true, true, FLAGS_num_threads);
}
// Inference with the original model with the analysis turned on, the analysis
// module will transform the program to a data flow graph.
TEST(Analyzer, DituRNN_with_analysis) {
LOG(INFO) << "ditu rnn with analysis";
TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
FLAGS_batch_size, true, false, FLAGS_repeat);
}
// Inference with analysis and IR. The IR module will fuse some large kernels.
TEST(Analyzer, DituRNN_with_analysis_with_IR) {
LOG(INFO) << "ditu rnn with analysis and IR fuse";
TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
FLAGS_batch_size, true, true, FLAGS_repeat);
// Other unit-tests of DituRNN, testing different combinations of use_analysis,
// activate_ir and multi-threading.
TEST(Analyzer, DituRNN_tests) {
int num_threads[2] = {1, 4};
for (auto i : num_threads) {
// Directly infer with the original model.
TestDituRNNPrediction(false, false, i);
// Inference with the original model with the analysis turned on, the analysis
// module will transform the program to a data flow graph.
TestDituRNNPrediction(true, false, i);
// Inference with analysis and IR. The IR module will fuse some large
// kernels.
TestDituRNNPrediction(true, true, i);
}
}
} // namespace analysis
......
......@@ -12,19 +12,23 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <gflags/gflags.h>
#include <glog/logging.h> // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
#include <gtest/gtest.h>
#include <fstream>
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/api/timer.h"
DEFINE_string(infer_model, "", "Directory of the inference model.");
DEFINE_string(infer_data, "", "Path of the dataset.");
DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(repeat, 1, "How many times to repeat run.");
DEFINE_int32(topn, -1, "Run top n batches of data to save time");
namespace paddle {
......@@ -44,41 +48,67 @@ void PrintTime(const double latency, const int bs, const int repeat) {
LOG(INFO) << "=====================================";
}
void Main(int batch_size) {
// Three sequence inputs.
std::vector<PaddleTensor> input_slots(1);
// one batch starts
// data --
int64_t data0[] = {0, 1, 2};
for (auto &input : input_slots) {
input.data.Reset(data0, sizeof(data0));
input.shape = std::vector<int>({3, 1});
// dtype --
input.dtype = PaddleDType::INT64;
// LoD --
input.lod = std::vector<std::vector<size_t>>({{0, 3}});
struct DataReader {
DataReader(const std::string &path) : file(new std::ifstream(path)) {}
bool NextBatch(PaddleTensor *tensor, int batch_size) {
PADDLE_ENFORCE_EQ(batch_size, 1);
std::string line;
tensor->lod.clear();
tensor->lod.emplace_back(std::vector<size_t>({0}));
std::vector<int64_t> data;
for (int i = 0; i < batch_size; i++) {
if (!std::getline(*file, line)) return false;
inference::split_to_int64(line, ' ', &data);
}
tensor->lod.front().push_back(data.size());
tensor->data.Resize(data.size() * sizeof(int64_t));
memcpy(tensor->data.data(), data.data(), data.size() * sizeof(int64_t));
tensor->shape.clear();
tensor->shape.push_back(data.size());
tensor->shape.push_back(1);
return true;
}
std::unique_ptr<std::ifstream> file;
};
void Main(int batch_size) {
// shape --
// Create Predictor --
AnalysisConfig config;
config.model_dir = FLAGS_infer_model;
config.use_gpu = false;
config.enable_ir_optim = true;
config.ir_passes.push_back("fc_lstm_fuse_pass");
auto predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
std::vector<PaddleTensor> input_slots(1);
// one batch starts
// data --
auto &input = input_slots[0];
input.dtype = PaddleDType::INT64;
inference::Timer timer;
double sum = 0;
std::vector<PaddleTensor> output_slots;
for (int i = 0; i < FLAGS_repeat; i++) {
timer.tic();
CHECK(predictor->Run(input_slots, &output_slots));
sum += timer.toc();
int num_batches = 0;
for (int t = 0; t < FLAGS_repeat; t++) {
DataReader reader(FLAGS_infer_data);
while (reader.NextBatch(&input, FLAGS_batch_size)) {
if (FLAGS_topn > 0 && num_batches > FLAGS_topn) break;
timer.tic();
CHECK(predictor->Run(input_slots, &output_slots));
sum += timer.toc();
++num_batches;
}
}
PrintTime(sum, batch_size, FLAGS_repeat);
PrintTime(sum, batch_size, num_batches);
// Get output
LOG(INFO) << "get outputs " << output_slots.size();
......@@ -100,10 +130,3 @@ void Main(int batch_size) {
TEST(text_classification, basic) { Main(FLAGS_batch_size); }
} // namespace paddle
USE_PASS(fc_fuse_pass);
USE_PASS(seq_concat_fc_fuse_pass);
USE_PASS(fc_lstm_fuse_pass);
USE_PASS(graph_viz_pass);
USE_PASS(infer_clean_graph_pass);
USE_PASS(attention_lstm_fuse_pass);
......@@ -263,7 +263,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
};
} // namespace
Pass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
AnalysisPass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
FLAGS_IA_graphviz_log_root,
"data_flow_graph_to_fluid_graphviz_debugger"));
......
......@@ -21,8 +21,8 @@
#include <string>
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/inference/analysis/pass.h"
namespace paddle {
namespace inference {
......@@ -42,7 +42,7 @@ class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
return "Transform a DFG to a Fluid ProgramDesc";
}
Pass *CreateGraphvizDebugerPass() const override;
AnalysisPass *CreateGraphvizDebugerPass() const override;
protected:
// Add a Fluid Op into the ProgramDesc.
......
......@@ -21,8 +21,8 @@ limitations under the License. */
#include <fstream>
#include <string>
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/dot.h"
#include "paddle/fluid/inference/analysis/pass.h"
namespace paddle {
namespace inference {
......
......@@ -66,7 +66,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
};
}
Pass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const {
AnalysisPass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const {
return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
FLAGS_IA_graphviz_log_root, "fluid-to-dfg-debuger"));
}
......
......@@ -22,8 +22,8 @@
#include <string>
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/inference/analysis/pass.h"
namespace paddle {
namespace inference {
......@@ -46,7 +46,7 @@ class FluidToDataFlowGraphPass final : public DataFlowGraphPass {
return "transform a fluid ProgramDesc to a data flow graph.";
}
Pass *CreateGraphvizDebugerPass() const override;
AnalysisPass *CreateGraphvizDebugerPass() const override;
private:
framework::proto::ProgramDesc const *desc_;
......
......@@ -14,15 +14,17 @@
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/flags.h"
#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
#include "paddle/fluid/inference/analysis/pass.h"
namespace paddle {
namespace inference {
namespace analysis {
using namespace framework;
static const char kFluidToIrPassesAttr[] = "__fluid_to_ir_passes__";
......@@ -48,7 +50,8 @@ class FluidToIrPass final : public DataFlowGraphPass {
ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path);
// Load program.
auto program = LoadProgramDesc(*argument->fluid_model_program_path);
argument->origin_program_desc.reset(new proto::ProgramDesc(program));
argument->origin_program_desc.reset(
new framework::proto::ProgramDesc(program));
// Create main data flow graph.
if (!argument->main_dfg) {
argument->main_dfg.reset(new DataFlowGraph);
......@@ -78,12 +81,13 @@ class FluidToIrPass final : public DataFlowGraphPass {
IRPassManager ir_passes(argument_->Get<ProgramDesc>("ir_program_desc"),
nullptr);
// Pass the scope from analysis to IR if needed.
if (argument_->Has(ir::kParamScopeAttr)) {
if (argument_->Has(framework::ir::kParamScopeAttr)) {
// Here the address is passed; note that IR doesn't own the scope, so
// the real scope in analysis should live during the IR phase.
ir_passes.graph().Set(
ir::kParamScopeAttr,
new Scope *(&argument_->Get<Scope>(ir::kParamScopeAttr)));
framework::ir::kParamScopeAttr,
new framework::Scope *(&argument_->Get<framework::Scope>(
framework::ir::kParamScopeAttr)));
}
if (FLAGS_IA_enable_ir) {
......@@ -95,12 +99,12 @@ class FluidToIrPass final : public DataFlowGraphPass {
PADDLE_ENFORCE(argument_->main_dfg.get());
argument_->main_dfg->Build(ir_passes.graph());
// inherit the arguments from ir.
if (ir_passes.graph().Has(ir::kFuseStatisAttr)) {
if (ir_passes.graph().Has(framework::ir::kFuseStatisAttr)) {
argument_->Set(
ir::kFuseStatisAttr,
framework::ir::kFuseStatisAttr,
new std::unordered_map<std::string, int>(
ir_passes.graph().Get<std::unordered_map<std::string, int>>(
ir::kFuseStatisAttr)));
framework::ir::kFuseStatisAttr)));
}
}
......@@ -112,7 +116,7 @@ class FluidToIrPass final : public DataFlowGraphPass {
private:
// Load parameters from a single file or from a directory.
bool LoadParams(Scope *scope, const std::string &dir,
bool LoadParams(framework::Scope *scope, const std::string &dir,
const std::string &prog_file, const std::string &param_file);
private:
......
......@@ -19,7 +19,7 @@
#pragma once
#include <string>
#include "paddle/fluid/inference/analysis/pass.h"
#include "paddle/fluid/inference/analysis/analysis_pass.h"
namespace paddle {
namespace inference {
......
......@@ -40,17 +40,6 @@ void DfgPassManager::RunAll() {
}
}
void NodePassManager::RunAll() {
PADDLE_ENFORCE(argument_);
PADDLE_ENFORCE(argument_->main_dfg.get());
auto trait = GraphTraits<DataFlowGraph>(*argument_->main_dfg).nodes_in_DFS();
for (auto& node : trait) {
for (auto& pass : data_) {
pass->Run(&node);
}
}
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -33,7 +33,7 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/pass.h"
#include "paddle/fluid/inference/analysis/analysis_pass.h"
namespace paddle {
namespace inference {
......@@ -43,7 +43,7 @@ namespace analysis {
* PassManager is the base class for all pass managers, a pass manager has
* several Pass-es registered, and execute them in the linear order.
*/
class PassManager : public OrderedRegistry<Pass> {
class PassManager : public OrderedRegistry<AnalysisPass> {
public:
PassManager() = default;
// Call all the passes' Initialize methods. The desc and data_flow_graph are
......@@ -89,18 +89,6 @@ class DfgPassManager : public PassManager {
virtual ~DfgPassManager() = default;
};
/*
* A pass manager that process a Node each time.
*/
class NodePassManager : public PassManager {
public:
NodePassManager() = default;
void RunAll() override;
virtual ~NodePassManager() = default;
};
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -34,28 +34,6 @@ class TestDfgPassManager final : public DfgPassManager {
std::string description() const override { return "test doc"; }
};
class TestNodePassManager final : public NodePassManager {
public:
virtual ~TestNodePassManager() = default;
std::string repr() const override { return "test-node-pass-manager"; }
std::string description() const override { return "test doc"; }
};
class TestNodePass final : public NodePass {
public:
virtual ~TestNodePass() = default;
bool Initialize(Argument* argument) override { return true; }
void Run(Node* node) override {
LOG(INFO) << "- Processing node " << node->repr();
}
std::string repr() const override { return "test-node"; }
std::string description() const override { return "some doc"; }
};
TEST(PassManager, DFG_pass_manager) {
TestDfgPassManager manager;
DFG_GraphvizDrawPass::Config config("./", "dfg.dot");
......@@ -71,19 +49,6 @@ TEST(PassManager, DFG_pass_manager) {
manager.RunAll();
}
TEST(PassManager, Node_pass_manager) {
Argument argument(FLAGS_inference_model_dir);
// Pre-process: initialize the DFG with the ProgramDesc first.
FluidToDataFlowGraphPass pass0;
pass0.Initialize(&argument);
pass0.Run(argument.main_dfg.get());
TestNodePassManager manager;
manager.Register("test-node-pass", new TestNodePass);
ASSERT_TRUE(manager.Initialize(&argument));
manager.RunAll();
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -68,7 +68,7 @@ class DfgDebuggerPass : public DFG_GraphvizDrawPass {
}
};
Pass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const {
AnalysisPass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const {
DFG_GraphvizDrawPass::Config config(FLAGS_IA_graphviz_log_root,
"tensorrt_marked_node");
return new DfgDebuggerPass(config);
......
......@@ -20,7 +20,7 @@
#pragma once
#include <string>
#include "paddle/fluid/inference/analysis/pass.h"
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
namespace paddle {
......@@ -48,7 +48,7 @@ class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass {
return "tensorrt sub-graph mark pass";
}
Pass* CreateGraphvizDebugerPass() const override;
AnalysisPass* CreateGraphvizDebugerPass() const override;
bool Finalize() override;
private:
......
......@@ -15,8 +15,8 @@ limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/node.h"
#include "paddle/fluid/inference/analysis/pass.h"
#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
namespace paddle {
......
......@@ -44,20 +44,7 @@ function(inference_api_test TARGET_NAME)
endfunction(inference_api_test)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor)
cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api
analysis
ir_pass_manager
pass
fc_fuse_pass
fc_lstm_fuse_pass
fc_gru_fuse_pass
seq_concat_fc_fuse_pass
graph_viz_pass
infer_clean_graph_pass
graph_pattern_detector
infer_clean_graph_pass
attention_lstm_fuse_pass
)
cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis)
cc_test(test_paddle_inference_api
SRCS api_tester.cc
......@@ -74,7 +61,7 @@ cc_library(paddle_inference_tensorrt_subgraph_engine
inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
endif()
if (WITH_ANAKIN AND WITH_GPU) # only needed in CI
if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
# compile the libinference_anakin_api.a and anakin.so.
cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml)
cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
......@@ -84,12 +71,24 @@ if (WITH_ANAKIN AND WITH_GPU) # only needed in CI
anakin_target(inference_anakin_api)
anakin_target(inference_anakin_api_shared)
if (WITH_TESTING)
cc_test(api_anakin_engine_tester SRCS api_anakin_engine_tester.cc
ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin
DEPS inference_anakin_api_shared dynload_cuda SERIAL)
# TODO(luotao): ANAKIN_MODLE_URL etc will move to demo ci later.
set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
set(ANAKIN_RNN_MODLE_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn.anakin2.model.bin")
set(ANAKIN_RNN_DATA_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn_data.txt")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}")
execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_MODLE_URL} -N")
execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_DATA_URL} -N")
if(WITH_GPU)
set(anakin_test_extra_deps dynload_cuda)
set(ANAKIN_MODLE_URL "${INFERENCE_URL}/mobilenet_v2.anakin.bin")
execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL} -N")
cc_test(api_anakin_engine_tester SRCS api_anakin_engine_tester.cc
ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin
DEPS inference_anakin_api_shared ${anakin_test_extra_deps} SERIAL)
endif()
cc_test(api_anakin_engine_rnn_tester SRCS api_anakin_engine_rnn_tester.cc
ARGS --model=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin
--datapath=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn_data.txt
DEPS inference_anakin_api_shared dynload_cuda SERIAL)
DEPS inference_anakin_api_shared ${anakin_test_extra_deps} SERIAL)
endif(WITH_TESTING)
endif()
......@@ -48,7 +48,6 @@ bool AnalysisPredictor::Init(
} else {
place_ = paddle::platform::CPUPlace();
}
PADDLE_ENFORCE(!parent_scope);
if (parent_scope) {
scope_ = parent_scope;
sub_scope_ = &(parent_scope->NewScope());
......
......@@ -193,7 +193,9 @@ PaddleInferenceAnakinPredictor<Target>::Clone() {
return std::move(cls);
}
#ifdef PADDLE_WITH_CUDA
template class PaddleInferenceAnakinPredictor<anakin::NV>;
#endif
template class PaddleInferenceAnakinPredictor<anakin::X86>;
// A factory to help create different predictors.
......@@ -202,10 +204,15 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
AnakinConfig, PaddleEngineKind::kAnakin>(const AnakinConfig &config) {
VLOG(3) << "Anakin Predictor create.";
if (config.target_type == AnakinConfig::NVGPU) {
#ifdef PADDLE_WITH_CUDA
VLOG(3) << "Anakin Predictor create on [ NVIDIA GPU ].";
std::unique_ptr<PaddlePredictor> x(
new PaddleInferenceAnakinPredictor<anakin::NV>(config));
return x;
#else
LOG(ERROR) << "AnakinConfig::NVGPU cannot be used in a CPU-only environment";
return nullptr;
#endif
} else if (config.target_type == AnakinConfig::X86) {
VLOG(3) << "Anakin Predictor create on [ Intel X86 ].";
std::unique_ptr<PaddlePredictor> x(
......
......@@ -14,6 +14,7 @@
#pragma once
#include <glog/logging.h>
#include <sys/time.h>
#include <algorithm>
#include <numeric>
......@@ -88,5 +89,45 @@ static void TensorAssignData(PaddleTensor *tensor,
}
}
std::string DescribeTensor(const PaddleTensor &tensor) {
std::stringstream os;
os << "Tensor [" << tensor.name << "]\n";
os << " - type: ";
switch (tensor.dtype) {
case PaddleDType::FLOAT32:
os << "float32";
break;
case PaddleDType::INT64:
os << "int64";
break;
default:
os << "unset";
}
os << '\n';
os << " - shape: " << to_string(tensor.shape) << '\n';
os << " - lod: ";
for (auto &l : tensor.lod) {
os << to_string(l) << "; ";
}
os << "\n";
os << " - data: ";
int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
[](int a, int b) { return a * b; });
for (int i = 0; i < dim; i++) {
os << static_cast<float *>(tensor.data.data())[i] << " ";
}
os << '\n';
return os.str();
}
void PrintTime(int batch_size, int repeat, int num_threads, int tid,
double latency) {
LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << repeat
<< ", threads: " << num_threads << ", thread id: " << tid
<< ", latency: " << latency << "ms";
}
} // namespace inference
} // namespace paddle
......@@ -119,7 +119,8 @@ struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> {
const framework::Tensor& last_scale,
const framework::Tensor& iter, const int window_size,
framework::Tensor* scales_arr, framework::Tensor* out_scale) {
auto& gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
const auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
T* scale_arr = scales_arr->mutable_data<T>(gpu_place);
T* out_scale_data = out_scale->mutable_data<T>(gpu_place);
......
......@@ -157,6 +157,116 @@ class FlattenGradOp : public framework::OperatorBase {
}
};
// FIXME(zcd): flatten2 adds an intermediate output(XShape) based on flatten,
// the XShape is used to carry the shape and lod of X which will be used in
// flatten_grad, in this way, the framework can reuse the memory of X
// immediately after the flatten2_op finishes.
// Considering compatibility issues, we could not fix flatten2_op
class Flatten2OpInferShape : public FlattenOpInferShape {
public:
void operator()(framework::InferShapeContext *ctx) const override {
FlattenOpInferShape::operator()(ctx);
PADDLE_ENFORCE(ctx->HasOutput("XShape"),
"Output (XShape) of Flatten op should not be null.");
const auto &in_dims = ctx->GetInputDim("X");
std::vector<int64_t> xshape_dims(in_dims.size() + 1);
xshape_dims[0] = 0;
for (int i = 0; i < in_dims.size(); ++i) {
xshape_dims[i + 1] = in_dims[i];
}
ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
ctx->ShareLoD("X", "XShape");
}
};
class Flatten2Op : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto &axis = Attr<int>("axis");
auto in_dims =
scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
const auto &out_dims = FlattenOpInferShape::GetOutputShape(axis, in_dims);
framework::AttributeMap attrs;
attrs["shape"] = out_dims;
attrs["inplace"] = false;
// Invoke Reshape Op
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape2", {{"X", {Input("X")}}, {"Shape", {}}},
{{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}}, attrs);
reshape_op->Run(scope, place);
}
};
class Flatten2OpMaker : public FlattenOpMaker {
public:
void Make() override {
FlattenOpMaker::Make();
AddOutput("XShape",
"XShape is just used to store the shape and lod of X, which will "
"be used in FlattenGradOp.")
.AsIntermediate();
}
};
class Flatten2GradOpMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDesc();
grad_op->SetType("flatten2_grad");
grad_op->SetInput("XShape", Output("XShape"));
grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
class Flatten2GradInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
PADDLE_ENFORCE(context->HasInput("XShape"),
"Input(XShape) shouldn't be null.");
PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) shouldn't be null.");
auto xshape_dims = context->GetInputDim("XShape");
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
context->SetOutputDim(framework::GradVarName("X"), x_dims);
context->ShareLoD("XShape", framework::GradVarName("X"));
}
};
class Flatten2GradOp : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto dx_name = Output(framework::GradVarName("X"));
auto dout_name = Input(framework::GradVarName("Out"));
auto xshape_name = Input("XShape");
auto xshape_dims =
scope.FindVar(xshape_name)->Get<framework::LoDTensor>().dims();
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(x_dims);
attrs["inplace"] = false;
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape2", {{"X", {dout_name}}, {"Shape", {}}},
{{"Out", {dx_name}}, {"XShape", {xshape_name}}}, attrs);
reshape_op->Run(scope, place);
}
};
} // namespace operators
} // namespace paddle
......@@ -167,3 +277,8 @@ REGISTER_OPERATOR(flatten, ops::FlattenOp, ops::FlattenOpMaker,
ops::FlattenOpInferShape,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp, ops::FlattenGradInferShape);
REGISTER_OPERATOR(flatten2, ops::Flatten2Op, ops::Flatten2OpMaker,
ops::Flatten2OpInferShape, ops::Flatten2GradOpMaker);
REGISTER_OPERATOR(flatten2_grad, ops::Flatten2GradOp,
ops::Flatten2GradInferShape);
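The FIXME comment above describes the XShape bookkeeping shared by flatten2 (and, later in this commit, reshape2, squeeze2 and transpose2): the forward op emits the original dims of X prefixed with a 0, and the grad op slices that leading entry off to recover the shape without keeping X itself alive. A plain-Python sketch of just that bookkeeping (illustrative only; not Paddle code):

```python
# The forward op records [0, *x_dims] as XShape; the corresponding *_grad op
# drops the leading 0 to recover the original dims of X, mirroring
# slice_ddim(xshape_dims, 1, xshape_dims.size()) in the C++ above.
x_dims = (2, 3, 4)
xshape_dims = (0,) + x_dims      # what Flatten2OpInferShape emits
recovered = xshape_dims[1:]      # what Flatten2GradInferShape reads back
assert recovered == x_dims
```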
......@@ -30,14 +30,7 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
"Input(WeightX) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasInput("WeightH"),
"Input(WeightH) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("XX"), "Output(XX) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
"Output(ReorderedH0) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
"Output(BatchedInput) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"),
"Output(BatchedOut) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
"Output(Hidden) of GRU should not be null.");
......@@ -80,15 +73,20 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
}
framework::DDim out_dims({x_dims[0], frame_size});
ctx->SetOutputDim("Hidden", out_dims);
ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
ctx->SetOutputDim("BatchedOut", out_dims);
ctx->ShareLoD("X", "Hidden");
int xx_width;
if (ctx->Attrs().Get<bool>("use_seq")) {
xx_width = wx_dims[1];
} else {
xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
"Output(ReorderedH0) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
"Output(BatchedInput) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"),
"Output(BatchedOut) of GRU should not be null.");
ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
ctx->SetOutputDim("BatchedOut", out_dims);
}
ctx->SetOutputDim("XX", {x_dims[0], xx_width});
ctx->ShareLoD("X", "XX");
......
......@@ -67,27 +67,27 @@ template <typename T, int BlockDim>
__global__ void LayerNormForward(const T *x, const T *scale, const T *bias,
T *y, T *mean, T *var, float epsilon,
int feature_size) {
using BlockReduce = cub::BlockReduce<PairForLayerNorm<T>, BlockDim>;
using BlockReduce = cub::BlockReduce<PairForLayerNorm<double>, BlockDim>;
__shared__ typename BlockReduce::TempStorage temp_storage;
int beg_idx = blockIdx.x * feature_size + threadIdx.x;
int end_idx = (blockIdx.x + 1) * feature_size;
// Step 1: Reduce to calculate mean and var
T mean_val = static_cast<T>(0);
T var_val = static_cast<T>(0);
double mean_val = 0;
double var_val = 0;
for (int i = beg_idx; i < end_idx; i += BlockDim) {
T tmp = x[i];
mean_val += tmp;
var_val += (tmp * tmp);
}
auto pair = BlockReduce(temp_storage)
.Reduce(PairForLayerNorm<T>(mean_val, var_val),
PairForLayerNormAddFunctor<T>());
.Reduce(PairForLayerNorm<double>(mean_val, var_val),
PairForLayerNormAddFunctor<double>());
if (threadIdx.x == 0) {
auto tmp = pair.first_ / feature_size;
mean[blockIdx.x] = tmp;
var[blockIdx.x] = pair.second_ / feature_size - tmp * tmp;
mean[blockIdx.x] = static_cast<T>(tmp);
var[blockIdx.x] = static_cast<T>(pair.second_ / feature_size - tmp * tmp);
}
__syncthreads();
mean_val = mean[blockIdx.x];
......
......@@ -246,6 +246,88 @@ class ReshapeGradKernel {
}
};
// FIXME(zcd): reshape2 adds an intermediate output(XShape) based on reshape,
// the XShape is used to carry the shape and lod of X which will be used in
// reshape_grad, in this way, the framework can reuse the memory of X
// immediately after the reshape_op finishes.
// Considering compatibility issues, we could not fix reshape_op
class Reshape2Op : public ReshapeOp {
public:
Reshape2Op(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: ReshapeOp(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
ReshapeOp::InferShape(ctx);
PADDLE_ENFORCE(ctx->HasOutput("XShape"),
"Output(XShape) of ReshapeOp should not be null.");
const auto &x_dims = ctx->GetInputDim("X");
std::vector<int64_t> xshape_dims(x_dims.size() + 1);
xshape_dims[0] = 0;
for (int i = 0; i < x_dims.size(); ++i) {
xshape_dims[i + 1] = x_dims[i];
}
ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
ctx->ShareLoD("X", /*->*/ "XShape");
}
};
class Reshape2OpMaker : public ReshapeOpMaker {
public:
void Make() override {
ReshapeOpMaker::Make();
AddOutput("XShape",
"XShape is just used to store the shape and lod of X, which will "
"be used in FlattenGradOp.")
.AsIntermediate();
}
};
class Reshape2GradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDesc();
grad_op->SetType("reshape2_grad");
grad_op->SetInput("XShape", Output("XShape"));
grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
class Reshape2GradOp : public framework::OperatorWithKernel {
public:
Reshape2GradOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("XShape"), "Input(XShape) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) shouldn't be null.");
auto xshape_dims = ctx->GetInputDim("XShape");
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->ShareLoD("XShape", framework::GradVarName("X"));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(
ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))
->type()),
ctx.device_context());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
......@@ -261,6 +343,17 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
ops::ReshapeGradKernel, int64_t,
ops::ReshapeGradKernel);
REGISTER_OPERATOR(reshape2, ops::Reshape2Op, ops::Reshape2OpMaker,
ops::Reshape2GradMaker);
REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp);
REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
ops::ReshapeKernel, int, ops::ReshapeKernel,
int64_t, ops::ReshapeKernel);
REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
double, ops::ReshapeGradKernel, int,
ops::ReshapeGradKernel, int64_t,
ops::ReshapeGradKernel);
#ifdef PADDLE_WITH_CUDA
REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
ops::ReshapeKernel, int, ops::ReshapeKernel,
......@@ -269,4 +362,11 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
double, ops::ReshapeGradKernel, int,
ops::ReshapeGradKernel, int64_t,
ops::ReshapeGradKernel);
REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
ops::ReshapeKernel, int, ops::ReshapeKernel,
int64_t, ops::ReshapeKernel);
REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
double, ops::ReshapeGradKernel, int,
ops::ReshapeGradKernel, int64_t,
ops::ReshapeGradKernel);
#endif
......@@ -126,15 +126,15 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault({});
AddComment(R"DOC(
Squeeze Operator.
Remove single-dimensional entries from the shape of a tensor.
Takes a parameter axes with a list of axes to squeeze.
If axes is not provided, all the single dimensions will be removed from the shape.
Remove single-dimensional entries from the shape of a tensor.
Takes a parameter axes with a list of axes to squeeze.
If axes is not provided, all the single dimensions will be removed from the shape.
If an axis is selected with shape entry not equal to one, an error is raised.
Examples:
Case 1:
Given
Given
X.shape = (1, 3, 1, 5)
and
axes = [0]
......@@ -144,7 +144,7 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
Case 2:
Given
X.shape = (1, 3, 1, 5)
and
and
axes = []
we get:
Out.shape = (3, 5)
......@@ -181,6 +181,113 @@ class SqueezeGradOp : public framework::OperatorBase {
}
};
// FIXME(zcd): squeeze2 adds an intermediate output(XShape) based on squeeze,
// the XShape is used to carry the shape and lod of X which will be used in
// squeeze_grad, in this way, the framework can reuse the memory of X
// immediately after the squeeze2_op finishes.
// Considering compatibility issues, we could not fix squeeze2_op
class Squeeze2OpMaker : public SqueezeOpMaker {
public:
void Make() override {
SqueezeOpMaker::Make();
AddOutput("XShape",
"XShape is just used to store the shape and lod of X, which will "
"be used in SqueezeGradOp.")
.AsIntermediate();
}
};
class Squeeze2OpInferShape : public SqueezeOpInferShape {
public:
void operator()(framework::InferShapeContext *ctx) const override {
SqueezeOpInferShape::operator()(ctx);
PADDLE_ENFORCE(ctx->HasOutput("XShape"),
"Output(XShape) of Squeeze operator should not be null.");
const auto &x_dims = ctx->GetInputDim("X");
std::vector<int64_t> xshape_dims(x_dims.size() + 1);
xshape_dims[0] = 0;
for (int i = 0; i < x_dims.size(); ++i) {
xshape_dims[i + 1] = x_dims[i];
}
ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
ctx->ShareLoD("X", /*->*/ "XShape");
}
};
class Squeeze2Op : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto &axes = Attr<std::vector<int>>("axes");
auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
auto out_dims = Squeeze2OpInferShape::GetOutputShape(axes, x_dims);
framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(out_dims);
// Invoke Reshape Op
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape2", {{"X", {Input("X")}}, {"Shape", {}}},
{{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}}, attrs);
reshape_op->Run(scope, place);
}
};
class Squeeze2GradOpMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDesc();
grad_op->SetType("squeeze2_grad");
grad_op->SetInput("XShape", Output("XShape"));
grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
class Squeeze2GradInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
PADDLE_ENFORCE(context->HasInput("XShape"),
"Input(XShape) shouldn't be null.");
PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) shouldn't be null.");
auto xshape_dims = context->GetInputDim("XShape");
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
context->SetOutputDim(framework::GradVarName("X"), x_dims);
context->ShareLoD("XShape", framework::GradVarName("X"));
}
};
class Squeeze2GradOp : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto dx_name = Output(framework::GradVarName("X"));
auto dout_name = Input(framework::GradVarName("Out"));
auto xshape_name = Input("XShape");
auto xshape_dims =
scope.FindVar(xshape_name)->Get<framework::LoDTensor>().dims();
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(x_dims);
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape2", {{"X", {dout_name}}, {"Shape", {}}},
{{"Out", {dx_name}}, {"XShape", {xshape_name}}}, attrs);
reshape_op->Run(scope, place);
}
};
} // namespace operators
} // namespace paddle
......@@ -192,3 +299,8 @@ REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker,
ops::SqueezeOpInferShape,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp, ops::SqueezeGradInferShape);
REGISTER_OPERATOR(squeeze2, ops::Squeeze2Op, ops::Squeeze2OpMaker,
ops::Squeeze2OpInferShape, ops::Squeeze2GradOpMaker);
REGISTER_OPERATOR(squeeze2_grad, ops::Squeeze2GradOp,
ops::Squeeze2GradInferShape);
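The squeeze2 path registered above is normally reached through the Python wrapper `fluid.layers.squeeze`, which this change rewires to append a `squeeze2` op (see the layer diff further down). A minimal runtime sketch, assuming the post-merge Python API; the variable name `'x'` is illustrative only:

```python
import numpy as np
import paddle.fluid as fluid

# squeeze now lowers to squeeze2, which also emits an intermediate XShape output.
x = fluid.layers.data(
    name='x', shape=[1, 3, 1, 5], dtype='float32', append_batch_size=False)
out = fluid.layers.squeeze(input=x, axes=[0, 2])

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
res, = exe.run(fluid.default_main_program(),
               feed={'x': np.random.rand(1, 3, 1, 5).astype('float32')},
               fetch_list=[out])
print(res.shape)  # expected: (3, 5); XShape stays internal and is never fetched
```

Note that XShape only carries the shape and LoD of X (its leading dimension is pinned to 0 in InferShape), so it does not hold real tensor data in the forward pass.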
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/transpose_op.h"
#include <string>
#include <vector>
namespace paddle {
......@@ -24,7 +25,7 @@ class TransposeOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
auto x_dims = ctx->GetInputDim("X");
......@@ -90,7 +91,7 @@ The behavior of this operator is similar to how `numpy.transpose` works.
2 &5
\end{pmatrix}$$
- Given an input tensor with shape $(N, C, H, W)$ and the `axes` is
- Given an input tensor with shape $(N, C, H, W)$ and the `axes` is
$[0, 2, 3, 1]$, then shape of the output tensor will be: $(N, H, W, C)$.
)DOC");
......@@ -101,7 +102,7 @@ class TransposeOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null");
......@@ -113,6 +114,93 @@ class TransposeOpGrad : public framework::OperatorWithKernel {
}
};
// FIXME(zcd): transpose2 adds an intermediate output(XShape) based on
// transpose, the XShape is used to carry the shape and lod of X which
// will be used in transpose_grad. In this way, the framework can reuse
// the memory of X immediately after the transpose2_op is finished.
// Considering compatibility issues, we could not fix transpose2_op
class Transpose2Op : public TransposeOp {
public:
Transpose2Op(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: TransposeOp(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
TransposeOp::InferShape(ctx);
PADDLE_ENFORCE(ctx->HasOutput("XShape"),
"Output(XShape) should not be null");
const auto &in_dims = ctx->GetInputDim("X");
std::vector<int64_t> x_shape_dim(in_dims.size() + 1);
x_shape_dim[0] = 0;
for (int i = 0; i < in_dims.size(); ++i) {
x_shape_dim[i + 1] = in_dims[i];
}
ctx->SetOutputDim("XShape", framework::make_ddim(x_shape_dim));
ctx->ShareLoD("X", /*->*/ "XShape");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
ctx.device_context());
}
};
class Transpose2OpMaker : public TransposeOpMaker {
public:
void Make() override {
TransposeOpMaker::Make();
AddOutput("XShape", "(Tensor)The output tensor.").AsIntermediate();
}
};
class Transpose2GradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDesc();
grad_op->SetType("transpose2_grad");
grad_op->SetInput("XShape", Output("XShape"));
grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
class Transpose2OpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("XShape"), "Input(XShape) should not be null");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null");
if (ctx->HasOutput(framework::GradVarName("X"))) {
auto xshape_dim = ctx->GetInputDim("XShape");
auto x_shape_dim =
framework::slice_ddim(xshape_dim, 1, xshape_dim.size());
ctx->SetOutputDim(framework::GradVarName("X"), x_shape_dim);
ctx->ShareLoD("XShape", framework::GradVarName("X"));
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(
ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))
->type()),
ctx.device_context());
}
};
} // namespace operators
} // namespace paddle
......@@ -120,8 +208,20 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(transpose, ops::TransposeOp, ops::TransposeOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad);
REGISTER_OP_CPU_KERNEL(
transpose, ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
transpose_grad,
ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OPERATOR(transpose2, ops::Transpose2Op, ops::Transpose2OpMaker,
ops::Transpose2GradMaker);
REGISTER_OPERATOR(transpose2_grad, ops::Transpose2OpGrad);
REGISTER_OP_CPU_KERNEL(
transpose2,
ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
transpose2_grad,
ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>);
......@@ -21,3 +21,10 @@ REGISTER_OP_CUDA_KERNEL(
REGISTER_OP_CUDA_KERNEL(
transpose_grad,
ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>);
REGISTER_OP_CUDA_KERNEL(
transpose2,
ops::TransposeKernel<paddle::platform::CUDADeviceContext, float>);
REGISTER_OP_CUDA_KERNEL(
transpose2_grad,
ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>);
......@@ -127,13 +127,13 @@ class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
});
AddComment(R"DOC(
Unsqueeze Operator.
Insert single-dimensional entries to the shape of a tensor.
Takes one required argument axes, a list of dimensions that will be inserted.
Dimension indices in axes are as seen in the output tensor.
For example:
Given a tensor such that tensor with shape [3, 4, 5],
Insert single-dimensional entries to the shape of a tensor.
Takes one required argument axes, a list of dimensions that will be inserted.
Dimension indices in axes are as seen in the output tensor.
For example:
Given a tensor with shape [3, 4, 5],
then Unsqueeze(tensor, axes=[0, 4]) has shape [1, 3, 4, 5, 1]
)DOC");
}
......@@ -168,6 +168,112 @@ class UnsqueezeGradOp : public framework::OperatorBase {
}
};
// FIXME(zcd): unsqueeze2 adds an intermediate output(XShape) based on
// unsqueeze, the XShape is used to carry the shape and lod of X which
// will be used in unsqueeze_grad. In this way, the framework can reuse
// the memory of X immediately after the unsqueeze2_op is finished.
// Considering compatibility issues, we could not fix unsqueeze2_op
class Unsqueeze2OpInferShape : public UnsqueezeOpInferShape {
public:
void operator()(framework::InferShapeContext *ctx) const override {
UnsqueezeOpInferShape::operator()(ctx);
PADDLE_ENFORCE(ctx->HasOutput("XShape"),
"Output(XShape) of Unsqueeze operator should not be null.");
const auto &x_dims = ctx->GetInputDim("X");
std::vector<int64_t> xshape_dims(x_dims.size() + 1);
xshape_dims[0] = 0;
for (int i = 0; i < x_dims.size(); ++i) {
xshape_dims[i + 1] = x_dims[i];
}
ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
ctx->ShareLoD("X", /*->*/ "XShape");
}
};
class Unsqueeze2OpMaker : public UnsqueezeOpMaker {
public:
void Make() override {
UnsqueezeOpMaker::Make();
AddOutput("XShape",
"XShape is just used to store the shape and lod of X, which will "
"be used in UnsqueezeGradOp.")
.AsIntermediate();
}
};
class Unsqueeze2Op : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto &axes = Attr<std::vector<int>>("axes");
auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
auto out_dims = Unsqueeze2OpInferShape::GetOutputShape(axes, x_dims);
framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(out_dims);
// Invoke Reshape op.
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape2", {{"X", {Input("X")}}, {"Shape", {}}},
{{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}}, attrs);
reshape_op->Run(scope, place);
}
};
class Unsqueeze2GradOpMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDesc();
grad_op->SetType("unsqueeze2_grad");
grad_op->SetInput("XShape", Output("XShape"));
grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
class Unsqueeze2GradInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
PADDLE_ENFORCE(context->HasInput("XShape"),
"Input(XShape) shouldn't be null.");
PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) shouldn't be null.");
auto xshape_dims = context->GetInputDim("XShape");
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
context->SetOutputDim(framework::GradVarName("X"), x_dims);
context->ShareLoD("XShape", framework::GradVarName("X"));
}
};
class Unsqueeze2GradOp : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto dx_name = Output(framework::GradVarName("X"));
auto dout_name = Input(framework::GradVarName("Out"));
auto xshape_name = Input("XShape");
auto xshape_dims =
scope.FindVar(xshape_name)->Get<framework::LoDTensor>().dims();
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(x_dims);
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape2", {{"X", {dout_name}}, {"Shape", {}}},
{{"Out", {dx_name}}, {"XShape", {xshape_name}}}, attrs);
reshape_op->Run(scope, place);
}
};
} // namespace operators
} // namespace paddle
......@@ -180,3 +286,8 @@ REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, ops::UnsqueezeOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp,
ops::UnsqueezeGradInferShape);
REGISTER_OPERATOR(unsqueeze2, ops::Unsqueeze2Op, ops::Unsqueeze2OpMaker,
ops::Unsqueeze2OpInferShape, ops::Unsqueeze2GradOpMaker);
REGISTER_OPERATOR(unsqueeze2_grad, ops::Unsqueeze2GradOp,
ops::Unsqueeze2GradInferShape);
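The same pattern applies to unsqueeze2. Mirroring the example from the operator's DOC string ([3, 4, 5] with axes=[0, 4] becomes [1, 3, 4, 5, 1]), a short sketch with the post-merge Python wrapper:

```python
import paddle.fluid as fluid

x = fluid.layers.data(
    name='x', shape=[3, 4, 5], dtype='float32', append_batch_size=False)
out = fluid.layers.unsqueeze(input=x, axes=[0, 4])  # lowered to unsqueeze2 after this change
print(out.shape)  # expected: (1, 3, 4, 5, 1), matching the DOC example above
```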
......@@ -121,6 +121,12 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
if (nullptr == dso_handle) {
LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " ("
<< dlerror() << ")";
if (dlPath.find("nccl") != std::string::npos) {
std::cout
<< "You may need to install 'nccl2' from NVIDIA official website: "
<< "https://developer.nvidia.com/nccl/nccl-download"
<< "before install PaddlePaddle" << std::endl;
}
dlPath = dso_name;
dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags);
}
......
......@@ -115,6 +115,7 @@ function cmake_gen() {
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_CONTRIB=${WITH_CONTRIB:-ON}
-DWITH_INFERENCE=${WITH_INFERENCE:-ON}
-DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
-DPY_VERSION=${PY_VERSION:-2.7}
========================================
......@@ -144,6 +145,7 @@ EOF
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
-DWITH_INFERENCE=${WITH_INFERENCE:-ON} \
-DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
-DPY_VERSION=${PY_VERSION:-2.7}
}
......@@ -498,7 +500,7 @@ EOF
EOF
if [[ ${WITH_GPU} == "ON" ]]; then
NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.1.2-1+cuda${CUDA_MAJOR} libnccl-dev=2.1.2-1+cuda${CUDA_MAJOR} &&"
NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.2.13-1+cuda${CUDA_MAJOR} libnccl-dev=2.2.13-1+cuda${CUDA_MAJOR} &&"
else
NCCL_DEPS=""
fi
......
......@@ -104,7 +104,7 @@ def batch_images_from_tar(data_file,
pickle.dump(
output,
open('%s/batch_%d' % (out_path, file_id), 'wb'),
protocol=pickle.HIGHEST_PROTOCOL)
protocol=2)
file_id += 1
data = []
labels = []
......@@ -113,9 +113,7 @@ def batch_images_from_tar(data_file,
output['label'] = labels
output['data'] = data
pickle.dump(
output,
open('%s/batch_%d' % (out_path, file_id), 'wb'),
protocol=pickle.HIGHEST_PROTOCOL)
output, open('%s/batch_%d' % (out_path, file_id), 'wb'), protocol=2)
with open(meta_file, 'a') as meta:
for file in os.listdir(out_path):
......
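Switching from `pickle.HIGHEST_PROTOCOL` to an explicit `protocol=2` reads as a Python 2/3 compatibility measure: under Python 3 the highest protocol is 3 or above, which Python 2 cannot load, while protocol 2 is understood by both. A small illustration (the file name is made up):

```python
import pickle

batch = {'label': [0, 1], 'data': [b'img0', b'img1']}
with open('batch_0', 'wb') as f:
    # Protocol 2 is the newest protocol Python 2 can still read, so batch
    # files written by Python 3 remain loadable from Python 2.
    pickle.dump(batch, f, protocol=2)

with open('batch_0', 'rb') as f:
    restored = pickle.load(f)
```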
......@@ -4025,10 +4025,12 @@ def transpose(x, perm, name=None):
helper = LayerHelper('transpose', **locals())
out = helper.create_tmp_variable(x.dtype)
x_shape = helper.create_tmp_variable(x.dtype)
helper.append_op(
type='transpose',
type='transpose2',
inputs={'X': [x]},
outputs={'Out': [out]},
outputs={'Out': [out],
'XShape': [x_shape]},
attrs={'axis': perm})
return out
......@@ -4520,13 +4522,15 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
"Each dimension size given in shape must not be negtive "
"except one unknown dimension.")
helper = LayerHelper("reshape", **locals())
helper = LayerHelper("reshape2", **locals())
out = helper.create_tmp_variable(dtype=x.dtype)
x_shape = helper.create_tmp_variable(dtype=x.dtype)
helper.append_op(
type="reshape",
type="reshape2",
inputs=inputs,
attrs={"shape": shape},
outputs={"Out": out})
outputs={"Out": out,
"XShape": x_shape})
return helper.append_activation(out)
......@@ -4570,11 +4574,13 @@ def squeeze(input, axes, name=None):
"""
helper = LayerHelper("squeeze", **locals())
out = helper.create_tmp_variable(dtype=input.dtype)
x_shape = helper.create_tmp_variable(dtype=input.dtype)
helper.append_op(
type="squeeze",
type="squeeze2",
inputs={"X": input},
attrs={"axes": axes},
outputs={"Out": out})
outputs={"Out": out,
"XShape": x_shape})
return out
......@@ -4605,11 +4611,13 @@ def unsqueeze(input, axes, name=None):
"""
helper = LayerHelper("unsqueeze", **locals())
out = helper.create_tmp_variable(dtype=input.dtype)
x_shape = helper.create_tmp_variable(dtype=input.dtype)
helper.append_op(
type="unsqueeze",
type="unsqueeze2",
inputs={"X": input},
attrs={"axes": axes},
outputs={"Out": out})
outputs={"Out": out,
"XShape": x_shape})
return out
......@@ -5811,10 +5819,12 @@ def flatten(x, axis=1, name=None):
raise ValueError("The axis should be a int, and in range [0, rank(x)]")
out = helper.create_tmp_variable(x.dtype)
x_shape = helper.create_tmp_variable(x.dtype)
helper.append_op(
type='flatten',
type='flatten2',
inputs={"X": x},
outputs={'Out': out},
outputs={'Out': out,
'XShape': x_shape},
attrs={"axis": axis})
return out
......
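All five Python wrappers above (transpose, reshape, squeeze, unsqueeze, flatten) follow the same recipe: create an extra temporary x_shape variable, switch the appended op to its *2 variant, and route the new XShape output into that temporary while still returning only `out` to the caller, so the public signatures are unchanged. For instance, with flatten (a sketch assuming the post-merge API):

```python
import paddle.fluid as fluid

x = fluid.layers.data(
    name='x', shape=[2, 3, 4, 5], dtype='float32', append_batch_size=False)
y = fluid.layers.flatten(x, axis=2)  # appends flatten2 with Out and XShape
print(y.shape)  # expected: (6, 20) -- dims before axis merged, dims from axis on merged
```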
......@@ -47,14 +47,14 @@ def train_program():
loss = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_loss = fluid.layers.mean(loss)
return avg_loss
return [avg_loss, y_predict]
def optimizer_func():
return fluid.optimizer.SGD(learning_rate=0.001)
def train(use_cuda, train_program, params_dirname):
def train(use_cuda, train_program, params_dirname, inference_model_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer(
......@@ -74,6 +74,8 @@ def train(use_cuda, train_program, params_dirname):
'''
if params_dirname is not None:
trainer.save_params(params_dirname)
trainer.save_inference_model(inference_model_dirname,
['x'], [1])
trainer.stop()
trainer.train(
......@@ -99,15 +101,55 @@ def infer(use_cuda, inference_program, params_dirname=None):
print("infer results: ", results[0])
def infer_by_saved_model(use_cuda, save_dirname=None):
if save_dirname is None:
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
inference_scope = fluid.core.Scope()
with fluid.scope_guard(inference_scope):
# Use fluid.io.load_inference_model to obtain the inference program desc,
# the feed_target_names (the names of variables that will be feeded
# data using feed operators), and the fetch_targets (variables that
# we want to obtain data from using fetch operators).
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
# The input's dimension should be 2-D and the second dim is 13
# The input data should be >= 0
batch_size = 10
test_reader = paddle.batch(
paddle.dataset.uci_housing.test(), batch_size=batch_size)
test_data = next(test_reader())
test_feat = numpy.array(
[data[0] for data in test_data]).astype("float32")
test_label = numpy.array(
[data[1] for data in test_data]).astype("float32")
assert feed_target_names[0] == 'x'
results = exe.run(inference_program,
feed={feed_target_names[0]: numpy.array(test_feat)},
fetch_list=fetch_targets)
print("infer shape: ", results[0].shape)
print("infer results: ", results[0])
print("ground truth: ", test_label)
def main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
# Directory for saving the trained model
params_dirname = "fit_a_line.inference.model"
params_dirname = "fit_a_line.model"
inference_model_dirname = "fit_a_line.inference_model"
train(use_cuda, train_program, params_dirname)
train(use_cuda, train_program, params_dirname, inference_model_dirname)
infer(use_cuda, inference_program, params_dirname)
infer_by_saved_model(use_cuda, inference_model_dirname)
class TestFitALine(unittest.TestCase):
......
......@@ -36,6 +36,7 @@ import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid import core
from test_dist_base import TestDistRunnerBase, runtime_main
import paddle.compat as cpt
from paddle.compat import long_type
import hashlib
......@@ -315,8 +316,9 @@ def pad_batch_data(insts,
"""
return_list = []
max_len = max(len(inst) for inst in insts)
num_token = reduce(lambda x, y: x + y,
[len(inst) for inst in insts]) if return_num_token else 0
num_token = six.moves.reduce(
lambda x, y: x + y,
[len(inst) for inst in insts]) if return_num_token else 0
# Any token included in dict can be used to pad, since the paddings' loss
# will be masked out by weights and make no effect on parameter gradients.
inst_data = np.array(
......@@ -328,7 +330,7 @@ def pad_batch_data(insts,
return_list += [inst_weight.astype("float32").reshape([-1, 1])]
else: # position data
inst_pos = np.array([
range(1, len(inst) + 1) + [0] * (max_len - len(inst))
list(range(1, len(inst) + 1)) + [0] * (max_len - len(inst))
for inst in insts
])
return_list += [inst_pos.astype("int64").reshape([-1, 1])]
......@@ -385,10 +387,11 @@ def prepare_batch_input(insts, data_input_names, src_pad_idx, trg_pad_idx,
return_num_token=True)
data_input_dict = dict(
zip(data_input_names, [
src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos,
trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
]))
list(
zip(data_input_names, [
src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos,
trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
])))
return data_input_dict, np.asarray([num_token], dtype="float32")
......@@ -561,7 +564,7 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
np.log(TrainTaskConfig.label_smooth_eps / (
ModelHyperParams.trg_vocab_size - 1) + 1e-20))
init = False
for pass_id in xrange(TrainTaskConfig.pass_num):
for pass_id in six.moves.xrange(TrainTaskConfig.pass_num):
pass_start_time = time.time()
for batch_id, data in enumerate(train_data()):
if batch_id >= 5:
......@@ -587,11 +590,11 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
ModelHyperParams.eos_idx, ModelHyperParams.n_head,
ModelHyperParams.d_model)
total_num_token += num_token
feed_kv_pairs = data_input_dict.items()
feed_kv_pairs = list(data_input_dict.items())
if TrainTaskConfig.local:
feed_kv_pairs += {
feed_kv_pairs += list({
lr_scheduler.learning_rate.name: lr_rate
}.items()
}.items())
feed_list.append(dict(feed_kv_pairs))
if not init:
......@@ -873,6 +876,7 @@ class DataReader(object):
f = tarfile.open(fpaths[0], "r")
for line in f.extractfile(tar_fname):
line = cpt.to_text(line)
fields = line.strip("\n").split(self._field_delimiter)
if (not self._only_src and len(fields) == 2) or (
self._only_src and len(fields) == 1):
......@@ -882,8 +886,9 @@ class DataReader(object):
if not os.path.isfile(fpath):
raise IOError("Invalid file: %s" % fpath)
with open(fpath, "r") as f:
with open(fpath, "rb") as f:
for line in f:
line = cpt.to_text(line)
fields = line.strip("\n").split(self._field_delimiter)
if (not self._only_src and len(fields) == 2) or (
self._only_src and len(fields) == 1):
......@@ -892,8 +897,9 @@ class DataReader(object):
@staticmethod
def load_dict(dict_path, reverse=False):
word_dict = {}
with open(dict_path, "r") as fdict:
with open(dict_path, "rb") as fdict:
for idx, line in enumerate(fdict):
line = cpt.to_text(line)
if reverse:
word_dict[idx] = line.strip("\n")
else:
......@@ -1034,7 +1040,7 @@ def multi_head_attention(queries,
# size of the input as the output dimension size.
return layers.reshape(
x=trans_x,
shape=map(int, [0, 0, trans_x.shape[2] * trans_x.shape[3]]))
shape=list(map(int, [0, 0, trans_x.shape[2] * trans_x.shape[3]])))
def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
"""
......
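Most edits in this test follow a single Python 2-to-3 pattern: `dict.items()`, `zip()`, `map()` and `range()` return lazy views or iterators under Python 3, `reduce()` moved into `functools`, and `xrange()` is gone, so the code either wraps results in `list()` or goes through `six.moves`, and byte lines read from files are converted with `cpt.to_text`. A condensed illustration of the idioms being applied:

```python
import six

pairs = list({'a': 1}.items())            # items() is a view in Python 3
pairs += list({'b': 2}.items())           # so concatenation needs an explicit list()

total = six.moves.reduce(lambda x, y: x + y, [1, 2, 3])  # reduce lives in functools on Python 3
positions = list(range(1, 4)) + [0] * 2   # range() no longer returns a list
for i in six.moves.xrange(3):             # xrange() replacement that works on both
    pass
```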
......@@ -249,7 +249,7 @@ class OpTest(unittest.TestCase):
outs, _ = self._calc_output(place)
return outs
def _calc_output(self, place, parallel=False):
def _calc_output(self, place, parallel=False, no_check_set=None):
program = Program()
block = program.global_block()
......@@ -273,6 +273,8 @@ class OpTest(unittest.TestCase):
# if not, fill the fetch_list by the user configured outputs in test.
if len(fetch_list) == 0:
for var_name, var in six.iteritems(outputs):
if no_check_set is not None and var_name in no_check_set:
continue
if isinstance(var, list):
for v in var:
fetch_list.append(v)
......@@ -291,11 +293,17 @@ class OpTest(unittest.TestCase):
return_numpy=False)
return outs, fetch_list
def check_output_with_place(self, place, atol, equal_nan=False):
outs, fetch_list = self._calc_output(place)
def check_output_with_place(self,
place,
atol,
no_check_set=None,
equal_nan=False):
outs, fetch_list = self._calc_output(place, no_check_set=no_check_set)
for out_name, out_dup in Operator.get_op_outputs(self.op_type):
if out_name not in self.outputs:
continue
if no_check_set is not None and out_name in no_check_set:
continue
def find_actual(target_name, fetch_list):
found = [
......@@ -360,10 +368,10 @@ class OpTest(unittest.TestCase):
places.append(core.CUDAPlace(0))
return places
def check_output(self, atol=1e-5, equal_nan=False):
def check_output(self, atol=1e-5, no_check_set=None, equal_nan=False):
places = self._get_places()
for place in places:
self.check_output_with_place(place, atol, equal_nan)
self.check_output_with_place(place, atol, no_check_set, equal_nan)
def check_output_customized(self, checker):
places = self._get_places()
......
......@@ -55,6 +55,7 @@ class TestDistRunnerBase(object):
pserver_prog = t.get_pserver_program(args.current_endpoint)
startup_prog = t.get_startup_program(args.current_endpoint,
pserver_prog)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_prog)
......@@ -147,6 +148,8 @@ def runtime_main(test_class):
import paddle.compat as cpt
import socket
from contextlib import closing
class TestDistBase(unittest.TestCase):
......@@ -156,13 +159,19 @@ class TestDistBase(unittest.TestCase):
def setUp(self):
self._trainers = 2
self._pservers = 2
self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
self._find_free_port(), self._find_free_port())
self._python_interp = "python"
self._sync_mode = True
self._mem_opt = False
self._use_reduce = False
self._setup_config()
def _find_free_port(self):
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
s.bind(('', 0))
return s.getsockname()[1]
def start_pserver(self, model_file, check_error_log):
ps0_ep, ps1_ep = self._ps_endpoints.split(",")
ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist"
......
......@@ -22,14 +22,17 @@ from op_test import OpTest
class TestFlattenOp(OpTest):
def setUp(self):
self.op_type = "flatten"
self.op_type = "flatten2"
self.init_test_case()
self.inputs = {"X": np.random.random(self.in_shape).astype("float32")}
self.init_attrs()
self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
self.outputs = {
"Out": self.inputs["X"].reshape(self.new_shape),
"XShape": np.random.random(self.in_shape).astype("float32")
}
def test_check_output(self):
self.check_output()
self.check_output(no_check_set=["XShape"])
def test_check_grad(self):
self.check_grad(["X"], "Out")
......
......@@ -53,12 +53,11 @@ class TestFusionLSTMOp(OpTest):
self.M = 8
self.D = 16
self.has_initial_state = False
self.use_peepholes = False
self.is_reverse = False
self.act_gate = 'sigmoid'
self.act_cell = 'tanh'
self.act_cand = 'tanh'
self.use_peepholes = False
self.use_seq = False
self.set_conf()
T = sum(self.lod[0])
......@@ -108,7 +107,6 @@ class TestFusionLSTMOp(OpTest):
}
self.attrs = {
'use_peepholes': self.use_peepholes,
'use_seq': self.use_seq,
'is_reverse': self.is_reverse,
'gate_activation': self.act_gate,
'cell_activation': self.act_cell,
......@@ -178,50 +176,18 @@ class TestFusionLSTMOpPeepholesReverse(TestFusionLSTMOp):
self.is_reverse = True
class TestFusionLSTMOpPoopholesBS1(TestFusionLSTMOp):
class TestFusionLSTMOpPeepholesInitReverse(TestFusionLSTMOp):
def set_conf(self):
self.use_peepholes = True
self.lod = [[3]]
self.D = 16
class TestFusionLSTMOpSeqInit(TestFusionLSTMOp):
def set_conf(self):
self.use_seq = True
self.has_initial_state = True
class TestFusionLSTMOpSeqReverse(TestFusionLSTMOp):
def set_conf(self):
self.use_seq = True
self.is_reverse = True
class TestFusionLSTMOpSeqInitReverse(TestFusionLSTMOp):
def set_conf(self):
self.use_seq = True
self.has_initial_state = True
self.is_reverse = True
class TestFusionLSTMOpSeqPeepholes(TestFusionLSTMOp):
class TestFusionLSTMOpPeepholesBS1(TestFusionLSTMOp):
def set_conf(self):
self.use_seq = True
self.use_peepholes = True
class TestFusionLSTMOpSeqPeepholesInit(TestFusionLSTMOp):
def set_conf(self):
self.use_seq = True
self.use_peepholes = True
self.has_initial_state = True
class TestFusionLSTMOpSeqPeepholesReverse(TestFusionLSTMOp):
def set_conf(self):
self.use_seq = True
self.use_peepholes = True
self.is_reverse = True
self.lod = [[2]]
self.D = 8
if __name__ == '__main__':
......
......@@ -85,6 +85,7 @@ class TestFetchOp(unittest.TestCase):
assert not math.isnan(np.sum(ret[i])) and \
not math.isinf(np.sum(ret[i]))
@unittest.skip(reason="CI timeout")
def test_fetch_op(self):
tst_reader = paddle.batch(flowers.test(use_xmap=False), batch_size=16)
tst_reader_iter = tst_reader()
......@@ -139,6 +140,7 @@ class TestFeedParallel(unittest.TestCase):
if batch_id == 2:
break
@unittest.skip(reason="CI timeout")
def test_feed_op(self):
os.environ['CPU_NUM'] = str(4)
if core.is_compiled_with_cuda():
......
......@@ -16,6 +16,7 @@ from __future__ import print_function
import unittest
import numpy as np
import six
from op_test import OpTest
......@@ -62,17 +63,20 @@ class PReluTest(OpTest):
# TODO(minqiyang): Resume these test cases after fixing Python3 CI job issues
# class TestCase1(PReluTest):
# def initTestCase(self):
# self.attrs = {'mode': "all"}
if six.PY2:
# class TestCase2(PReluTest):
# def initTestCase(self):
# self.attrs = {'mode': "channel"}
class TestCase1(PReluTest):
def initTestCase(self):
self.attrs = {'mode': "all"}
class TestCase2(PReluTest):
def initTestCase(self):
self.attrs = {'mode': "channel"}
class TestCase3(PReluTest):
def initTestCase(self):
self.attrs = {'mode': "element"}
# class TestCase3(PReluTest):
# def initTestCase(self):
# self.attrs = {'mode': "element"}
if __name__ == "__main__":
unittest.main()
......@@ -22,106 +22,39 @@ from op_test import OpTest
class TestReshapeOp(OpTest):
def setUp(self):
ori_shape = (2, 25)
new_shape = (5, 10)
self.op_type = "reshape"
self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
self.attrs = {"shape": new_shape}
self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Out")
class TestReshapeOpDimInfer1(OpTest):
def setUp(self):
ori_shape = (5, 10)
new_shape = (5, -1, 5)
self.op_type = "reshape"
self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
self.attrs = {"shape": new_shape}
self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Out")
class TestReshapeOpDimInfer2(OpTest):
def setUp(self):
ori_shape = (2, 2, 6)
new_shape = (2, 0, 3, -1)
infered_shape = (2, 2, 3, -1)
self.op_type = "reshape"
self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
self.attrs = {"shape": new_shape}
self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Out")
class TestReshapeOpInplace(OpTest):
def setUp(self):
ori_shape = (2, 25)
new_shape = (5, 10)
self.op_type = "reshape"
self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
self.attrs = {"shape": new_shape}
self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Out")
class TestReshapeOpDimInferInplace1(OpTest):
def setUp(self):
ori_shape = (5, 10)
new_shape = (5, -1, 5)
self.init_data()
self.op_type = "reshape2"
self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
self.attrs = {"shape": self.new_shape}
self.outputs = {
"Out": self.inputs["X"].reshape(self.infered_shape),
'XShape': np.random.random(self.ori_shape).astype("float32")
}
self.op_type = "reshape"
self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
self.attrs = {"shape": new_shape}
self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
def init_data(self):
self.ori_shape = (2, 25)
self.new_shape = (5, 10)
self.infered_shape = (5, 10)
def test_check_output(self):
self.check_output()
self.check_output(no_check_set=['XShape'])
def test_check_grad(self):
self.check_grad(["X"], "Out")
class TestReshapeOpDimInferInplace2(OpTest):
def setUp(self):
ori_shape = (2, 2, 6)
new_shape = (2, 0, 3, -1)
infered_shape = (2, 2, 3, -1)
self.op_type = "reshape"
self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
self.attrs = {"shape": new_shape}
self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)}
class TestReshapeOpDimInfer1(TestReshapeOp):
def init_data(self):
self.ori_shape = (5, 10)
self.new_shape = (5, -1, 5)
self.infered_shape = (5, -1, 5)
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Out")
class TestReshapeOpDimInfer2(TestReshapeOp):
def init_data(self):
self.ori_shape = (2, 2, 6)
self.new_shape = (2, 0, 3, -1)
self.infered_shape = (2, 2, 3, -1)
class TestReshapeOpWithInputShape(OpTest):
......@@ -130,20 +63,23 @@ class TestReshapeOpWithInputShape(OpTest):
new_shape = (0, -1, 5)
actual_shape = (2, 3, 5)
self.op_type = "reshape"
self.op_type = "reshape2"
self.inputs = {
"X": np.random.random(ori_shape).astype("float32"),
"Shape": np.array(
actual_shape, dtype="int32")
}
self.attrs = {"shape": new_shape}
self.outputs = {"Out": self.inputs["X"].reshape(actual_shape)}
self.outputs = {
"Out": self.inputs["X"].reshape(actual_shape),
'XShape': np.random.random(ori_shape).astype("float32")
}
def test_check_output(self):
self.check_output()
self.check_output(no_check_set=['XShape'])
def test_check_grad(self):
self.check_grad(["X"], "Out")
self.check_grad(["X"], "Out", sum_outputs=["Out"])
if __name__ == "__main__":
......
......@@ -23,14 +23,17 @@ from op_test import OpTest
# Correct: General.
class TestSqueezeOp(OpTest):
def setUp(self):
self.op_type = "squeeze"
self.op_type = "squeeze2"
self.init_test_case()
self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
self.init_attrs()
self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
self.outputs = {
"Out": self.inputs["X"].reshape(self.new_shape),
"XShape": np.random.random(self.ori_shape).astype("float32")
}
def test_check_output(self):
self.check_output()
self.check_output(no_check_set=['XShape'])
def test_check_grad(self):
self.check_grad(["X"], "Out")
......
......@@ -22,16 +22,19 @@ from op_test import OpTest
class TestTransposeOp(OpTest):
def setUp(self):
self.initTestCase()
self.op_type = "transpose"
self.op_type = "transpose2"
self.inputs = {'X': np.random.random(self.shape).astype("float32")}
self.attrs = {'axis': list(self.axis)}
self.outputs = {'Out': self.inputs['X'].transpose(self.axis)}
self.outputs = {
'XShape': np.random.random(self.shape).astype("float32"),
'Out': self.inputs['X'].transpose(self.axis)
}
def test_check_output(self):
self.check_output()
self.check_output(no_check_set=['XShape'])
def test_check_grad(self):
self.check_grad(['X'], 'Out')
self.check_grad(['X'], 'Out', sum_outputs=['Out'])
def initTestCase(self):
self.shape = (3, 4)
......
......@@ -24,13 +24,16 @@ from op_test import OpTest
class TestUnsqueezeOp(OpTest):
def setUp(self):
self.init_test_case()
self.op_type = "unsqueeze"
self.op_type = "unsqueeze2"
self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
self.init_attrs()
self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
self.outputs = {
"Out": self.inputs["X"].reshape(self.new_shape),
"XShape": np.random.random(self.ori_shape).astype("float32")
}
def test_check_output(self):
self.check_output()
self.check_output(no_check_set=["XShape"])
def test_check_grad(self):
self.check_grad(["X"], "Out")
......
......@@ -431,6 +431,28 @@ class Trainer(object):
exe = executor.Executor(self.place)
io.save_persistables(exe, dirname=param_path)
def save_inference_model(self, param_path, feeded_var_names,
target_var_indexes):
"""
Save the model for C++ inference into :code:`param_path`.
Args:
param_path(str): The path to save parameters.
feeded_var_names(list(str)): The names of the vars that need
to be fed before running the inference program.
target_var_indexes(list(int)): The indexes of the target vars
within the list returned by the train_func.
Returns:
None
"""
with self._prog_and_scope_guard():
exe = executor.Executor(self.place)
target_vars = [
self.train_func_outputs[index] for index in target_var_indexes
]
io.save_inference_model(param_path, feeded_var_names, target_vars,
exe)
@contextlib.contextmanager
def _prog_and_scope_guard(self):
with framework.program_guard(
......
......@@ -153,7 +153,7 @@ def block_to_code(block, block_idx):
indent += 1
# sort all vars
all_vars = sorted(block.vars.iteritems(), key=lambda x: x[0])
all_vars = sorted(six.iteritems(block.vars), key=lambda x: x[0])
for var in all_vars:
print("{}{}".format(get_indent_space(indent), variable_to_code(var[1])))
......
......@@ -300,7 +300,7 @@ class DistributeTranspiler(object):
input_deps = grad_name_to_send_dummy_out.values()
program.global_block().append_op(
type="send_barrier",
inputs={"X": input_deps},
inputs={"X": list(input_deps)},
outputs={"Out": send_barrier_out},
attrs={
"endpoints": pserver_endpoints,
......@@ -401,7 +401,7 @@ class DistributeTranspiler(object):
Args:
recv_vars (list): Variable list to recv for current trainer_id
eplist (list): A list of strings indicating
eplist (list): A list of strings indicating
Returns:
Program: trainer side startup program.
......@@ -455,7 +455,7 @@ class DistributeTranspiler(object):
if len(splited_var) <= 1:
continue
# NOTE: if enable memory optimization, origin vars maybe removed.
if startup_program.global_block().vars.has_key(varname):
if varname in startup_program.global_block().vars:
orig_param = startup_program.global_block().vars[varname]
else:
origin_param_var = self.origin_program.global_block().vars[
......@@ -690,7 +690,7 @@ class DistributeTranspiler(object):
Args:
endpoint (str): current pserver endpoint.
Returns:
tuple: (main_program, startup_program), of type "Program"
"""
......@@ -713,7 +713,7 @@ class DistributeTranspiler(object):
endpoint (str): current pserver endpoint.
pserver_program (Program): deprecated, call get_pserver_program first.
startup_program (Program): deprecated, should pass startup_program
when initializing
when initializing
Returns:
Program: parameter server side startup program.
......