From 2f82d72ede17822f52a789e92afca6f8112bc44e Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 17 Oct 2016 07:17:08 +0000 Subject: [PATCH] Fix bug in yield dictionary in DataProvider. (#197) * Fix bug in yield dictionary in DataProvider. * Also make virtualenv work in Paddle. --- CMakeLists.txt | 2 +- cmake/util.cmake | 17 ++++++++ demo/mnist/data/get_mnist_data.sh | 0 demo/mnist/mnist_provider.py | 19 ++++----- demo/mnist/vgg_16_mnist.py | 1 + .../ui/data_provider/mnist_provider.dict.py | 10 ++--- doc_cn/ui/data_provider/pydataprovider2.rst | 2 - .../gserver/dataproviders/PyDataProvider2.cpp | 3 +- paddle/gserver/tests/test_PyDataProvider2.cpp | 2 +- paddle/gserver/tests/test_PyDataProvider2.py | 2 +- paddle/utils/.gitignore | 1 + paddle/utils/CMakeLists.txt | 6 ++- paddle/utils/PythonUtil.cpp | 31 ++++++++++---- paddle/utils/PythonUtil.h | 2 + paddle/utils/enable_virtualenv.py | 10 +++++ python/paddle/trainer/PyDataProvider2.py | 24 ++++++----- python/paddle/trainer/config_parser.py | 4 ++ .../paddle/trainer_config_helpers/networks.py | 42 ++++++++++++++----- 18 files changed, 126 insertions(+), 52 deletions(-) mode change 100644 => 100755 demo/mnist/data/get_mnist_data.sh create mode 100644 paddle/utils/.gitignore create mode 100644 paddle/utils/enable_virtualenv.py diff --git a/CMakeLists.txt b/CMakeLists.txt index b85709f807b..4613155f770 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8) project(paddle CXX C) set(PADDLE_MAJOR_VERSION 0) set(PADDLE_MINOR_VERSION 8) -set(PADDLE_PATCH_VERSION 0b1) +set(PADDLE_PATCH_VERSION 0b2) set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION}) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake") diff --git a/cmake/util.cmake b/cmake/util.cmake index d776c3ae499..0fa36f070cc 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -184,3 +184,20 @@ macro(add_paddle_culib TARGET_NAME) cuda_add_library(${TARGET_NAME} STATIC ${ARGN}) set(CUDA_NVCC_FLAGS ${NVCC_FLAG}) endmacro() + + +# Creates C resources file from files in given resource file +function(create_resources res_file output) + # Create empty output file + file(WRITE ${output} "") + # Get short filename + string(REGEX MATCH "([^/]+)$" filename ${res_file}) + # Replace filename spaces & extension separator for C compatibility + string(REGEX REPLACE "\\.| |-" "_" filename ${filename}) + # Read hex data from file + file(READ ${res_file} filedata HEX) + # Convert hex data for C compatibility + string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," filedata ${filedata}) + # Append data to output file + file(APPEND ${output} "const unsigned char ${filename}[] = {${filedata}};\nconst unsigned ${filename}_size = sizeof(${filename});\n") +endfunction() diff --git a/demo/mnist/data/get_mnist_data.sh b/demo/mnist/data/get_mnist_data.sh old mode 100644 new mode 100755 diff --git a/demo/mnist/mnist_provider.py b/demo/mnist/mnist_provider.py index 0f14ded2dce..32af29730a7 100644 --- a/demo/mnist/mnist_provider.py +++ b/demo/mnist/mnist_provider.py @@ -2,10 +2,10 @@ from paddle.trainer.PyDataProvider2 import * # Define a py data provider -@provider(input_types=[ - dense_vector(28 * 28), - integer_value(10) -]) +@provider(input_types={ + 'pixel': dense_vector(28 * 28), + 'label': integer_value(10) +}) def process(settings, filename): # settings is not used currently. imgf = filename + "-images-idx3-ubyte" labelf = filename + "-labels-idx1-ubyte" @@ -14,20 +14,19 @@ def process(settings, filename): # settings is not used currently. f.read(16) l.read(8) - + # Define number of samples for train/test if "train" in filename: n = 60000 else: n = 10000 - + for i in range(n): label = ord(l.read(1)) pixels = [] - for j in range(28*28): + for j in range(28 * 28): pixels.append(float(ord(f.read(1))) / 255.0) - yield { "pixel": pixels, 'label': label } - + yield {"pixel": pixels, 'label': label} + f.close() l.close() - \ No newline at end of file diff --git a/demo/mnist/vgg_16_mnist.py b/demo/mnist/vgg_16_mnist.py index ad0a4de3215..45a45bb061a 100644 --- a/demo/mnist/vgg_16_mnist.py +++ b/demo/mnist/vgg_16_mnist.py @@ -47,6 +47,7 @@ predict = small_vgg(input_image=img, if not is_predict: lbl = data_layer(name="label", size=label_size) + inputs(img, lbl) outputs(classification_cost(input=predict, label=lbl)) else: outputs(predict) diff --git a/doc_cn/ui/data_provider/mnist_provider.dict.py b/doc_cn/ui/data_provider/mnist_provider.dict.py index 4eab5b1fd3b..bf13b56372b 100644 --- a/doc_cn/ui/data_provider/mnist_provider.dict.py +++ b/doc_cn/ui/data_provider/mnist_provider.dict.py @@ -2,10 +2,10 @@ from paddle.trainer.PyDataProvider2 import * # Define a py data provider -@provider(input_types=[ - dense_vector(28 * 28), - integer_value(10) -]) +@provider(input_types={ + 'pixel': dense_vector(28 * 28), + 'label': integer_value(10) +}) def process(settings, filename): # settings is not used currently. f = open(filename, 'r') # open one of training file @@ -20,6 +20,6 @@ def process(settings, filename): # settings is not used currently. pixels_float.append(float(each_pixel_str)) # give data to paddle. - yield { "pixel": pixels_float, 'label': int(label) } + yield {"pixel": pixels_float, 'label': int(label)} f.close() # close file diff --git a/doc_cn/ui/data_provider/pydataprovider2.rst b/doc_cn/ui/data_provider/pydataprovider2.rst index 9e1d8c531f5..80b40084d8f 100644 --- a/doc_cn/ui/data_provider/pydataprovider2.rst +++ b/doc_cn/ui/data_provider/pydataprovider2.rst @@ -141,8 +141,6 @@ DataProvider创建的时候执行。这个初始化函数具有如下参数: 是一个batch size,但是有时为了计算均衡性,可以将一条数据设置成多个batch size * cache 是数据缓存的策略,参考 `cache`_ * init_hook 是初始化时调用的函数,参考 `init_hook`_ -* use_dynamic_order 如果是true的话,可以返回一个dict,key是data_layer的名字,value是特征值。同时,也可以 - 返回一个list或者tuple。如果是false的话,只能够返回list或者tuple * check 设置成true的话,会根据input_types检查数据的合法性。 * check_fail_continue 如果设置成true的话,即使在check中数据不合法,也会扔到这条数据,继续训练。 如果 check是false的话,没有作用。 diff --git a/paddle/gserver/dataproviders/PyDataProvider2.cpp b/paddle/gserver/dataproviders/PyDataProvider2.cpp index 2f9a1223c6e..e3e472ac166 100644 --- a/paddle/gserver/dataproviders/PyDataProvider2.cpp +++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp @@ -246,8 +246,7 @@ private: PyObjectPtr && kwargs) { LOG(INFO) << "loading dataprovider " << model <<"::" << className; - PyObjectPtr module(PyImport_ImportModule(model.c_str())); - CHECK_PY(module) << "Cannot imort module " << model.c_str(); + PyObjectPtr module = py::import(model); PyObjectPtr moduleDict(PyModule_GetDict(module.get())); CHECK_PY(moduleDict) << "Invoke module.__dict__ error"; PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(), diff --git a/paddle/gserver/tests/test_PyDataProvider2.cpp b/paddle/gserver/tests/test_PyDataProvider2.cpp index e75e53ab7f4..6bf1e329251 100644 --- a/paddle/gserver/tests/test_PyDataProvider2.cpp +++ b/paddle/gserver/tests/test_PyDataProvider2.cpp @@ -117,7 +117,7 @@ TEST(PyDataProvider2, index_no_seq) { } TEST(PyDataProvider2, init_hook) { - paddle::PyObjectPtr pickle(PyImport_ImportModule("pickle")); + paddle::PyObjectPtr pickle = paddle::py::import("pickle"); paddle::PyObjectPtr globals( PyModule_GetDict(PyImport_AddModule("__main__"))); PyDict_SetItemString(globals.get(), "pickle", pickle.get()); diff --git a/paddle/gserver/tests/test_PyDataProvider2.py b/paddle/gserver/tests/test_PyDataProvider2.py index 145fe85cff7..71c3335231e 100644 --- a/paddle/gserver/tests/test_PyDataProvider2.py +++ b/paddle/gserver/tests/test_PyDataProvider2.py @@ -86,7 +86,7 @@ def test_can_over_batch_size(setting, filename): yield [random.randint(0, 100 - 1) for _ in xrange(seq_len)] -@provider(input_types=[index_slot(10), index_slot(10)]) +@provider(input_types={'input1':index_slot(10), 'input2': index_slot(10)}) def test_input_order(setting, filename): for _ in xrange(1000): yield { diff --git a/paddle/utils/.gitignore b/paddle/utils/.gitignore new file mode 100644 index 00000000000..f2cfd740941 --- /dev/null +++ b/paddle/utils/.gitignore @@ -0,0 +1 @@ +enable_virtualenv.c diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt index 0557b01e36f..45240b5002a 100644 --- a/paddle/utils/CMakeLists.txt +++ b/paddle/utils/CMakeLists.txt @@ -2,6 +2,9 @@ file(GLOB UTIL_HEADERS . *.h) file(GLOB UTIL_SOURCES . *.cpp) +create_resources(enable_virtualenv.py enable_virtualenv.c) +set(UTIL_RES enable_virtualenv.c) + if(APPLE) file(GLOB UTIL_ARCH_SOURCES . arch/osx/*.cpp) else() @@ -9,7 +12,8 @@ else() endif() add_library(paddle_utils STATIC ${UTIL_SOURCES} - ${UTIL_ARCH_SOURCES}) + ${UTIL_ARCH_SOURCES} + ${UTIL_RES}) add_style_check_target(paddle_utils ${UTIL_HEADERS}) add_style_check_target(paddle_utils ${UTIL_SOURCES} ${UTIL_ARCH_SOURCES}) diff --git a/paddle/utils/PythonUtil.cpp b/paddle/utils/PythonUtil.cpp index 78c3a80674f..90e5093f96e 100644 --- a/paddle/utils/PythonUtil.cpp +++ b/paddle/utils/PythonUtil.cpp @@ -77,11 +77,18 @@ static std::recursive_mutex g_pyMutex; PyGuard::PyGuard() : guard_(g_pyMutex) {} -static void printPyErrorStack(std::ostream& os, bool withEndl = false) { +static void printPyErrorStack(std::ostream& os, bool withEndl = false, + bool withPyPath = true) { PyObject * ptype, *pvalue, *ptraceback; PyErr_Fetch(&ptype, &pvalue, &ptraceback); PyErr_NormalizeException(&ptype, &pvalue, &ptraceback); PyErr_Clear(); + if (withPyPath) { + os << "Current PYTHONPATH: " << py::repr(PySys_GetObject(strdup("path"))); + if (withEndl) { + os << std::endl; + } + } PyTracebackObject* obj = (PyTracebackObject*)ptraceback; os << "Python Error: " << PyString_AsString(PyObject_Str(ptype)) @@ -114,10 +121,7 @@ PyObjectPtr callPythonFuncRetPyObj(const std::string& moduleName, const std::string& funcName, const std::vector& args) { PyGuard guard; - PyObjectPtr pyModuleName(PyString_FromString(moduleName.c_str())); - CHECK_PY(pyModuleName) << "Import PyModule failed" << moduleName; - PyObjectPtr pyModule(PyImport_Import(pyModuleName.get())); - CHECK_PY(pyModule) << "Import Python Module"<< moduleName << " failed."; + PyObjectPtr pyModule = py::import(moduleName); PyObjectPtr pyFunc(PyObject_GetAttrString(pyModule.get(), funcName.c_str())); CHECK_PY(pyFunc) << "GetAttrString failed."; PyObjectPtr pyArgs(PyTuple_New(args.size())); @@ -143,7 +147,7 @@ PyObjectPtr createPythonClass( const std::vector& args, const std::map& kwargs) { PyGuard guard; - PyObjectPtr pyModule(PyImport_ImportModule(moduleName.c_str())); + PyObjectPtr pyModule = py::import(moduleName); LOG(INFO) << "createPythonClass moduleName.c_str:" << moduleName.c_str(); CHECK_PY(pyModule) << "Import module " << moduleName << " failed."; PyObjectPtr pyDict(PyModule_GetDict(pyModule.get())); @@ -181,18 +185,29 @@ std::string getPyCallStack() { printPyErrorStack(os, true); return os.str(); } + +PyObjectPtr import(const std::string &moduleName) { + auto module = PyImport_ImportModule(moduleName.c_str()); + CHECK_PY(module) << "Import " << moduleName << "Error"; + return PyObjectPtr(module); +} + } // namespace py #endif - +extern "C" { +extern const char enable_virtualenv_py[]; +} void initPython(int argc, char** argv) { #ifndef PADDLE_NO_PYTHON Py_SetProgramName(argv[0]); Py_Initialize(); PySys_SetArgv(argc, argv); - // python blocks SIGINT. Need to enable it. signal(SIGINT, SIG_DFL); + + // Manually activate virtualenv when user is using virtualenv + PyRun_SimpleString(enable_virtualenv_py); #endif } diff --git a/paddle/utils/PythonUtil.h b/paddle/utils/PythonUtil.h index db02d1252b4..00fc177022a 100644 --- a/paddle/utils/PythonUtil.h +++ b/paddle/utils/PythonUtil.h @@ -87,6 +87,8 @@ PyObjectPtr createPythonClass(const std::string& moduleName, CHECK((x) != nullptr) << ::paddle::py::getPyCallStack() namespace py { +PyObjectPtr import(const std::string& moduleName); + /** * Cast a PyLong or PyInt to int type T. * @tparam T return type. diff --git a/paddle/utils/enable_virtualenv.py b/paddle/utils/enable_virtualenv.py new file mode 100644 index 00000000000..99d822a4145 --- /dev/null +++ b/paddle/utils/enable_virtualenv.py @@ -0,0 +1,10 @@ +import os + +def __activate_virtual_env__(): + __path__ = os.getenv('VIRTUAL_ENV') + if __path__ is None: + return + __script__ = os.path.join(__path__, 'bin', 'activate_this.py') + execfile(__script__, {'__file__': __script__}) + +__activate_virtual_env__() diff --git a/python/paddle/trainer/PyDataProvider2.py b/python/paddle/trainer/PyDataProvider2.py index 34f5dd41b7e..53409b746d8 100644 --- a/python/paddle/trainer/PyDataProvider2.py +++ b/python/paddle/trainer/PyDataProvider2.py @@ -208,7 +208,6 @@ def provider(input_types=None, should_shuffle=None, pool_size=-1, calc_batch_size=None, cache=CacheType.NO_CACHE, check=False, check_fail_continue=False, - use_dynamic_order=True, init_hook=None, **kwargs): """ Provider decorator. Use it to make a function into PyDataProvider2 object. @@ -228,9 +227,15 @@ def provider(input_types=None, should_shuffle=None, pool_size=-1, The configuration of data provider should be setup by\: :param input_types: Specify the input types, can also be set in init_hook. - It is a list of InputType object. For example, input_types= \ - [dense_vector(9), integer_value(2)]. - :type input_types: list|tuple + It could be a list of InputType object. For example, + input_types=[dense_vector(9), integer_value(2)]. Or user + can set a dict of InputType object, which key is + data_layer's name. For example, input_types=\ + {'img': img_features, 'label': label}. when using dict of + InputType, user could yield a dict of feature values, which + key is also data_layer's name. + + :type input_types: list|tuple|dict :param should_shuffle: True if data should shuffle. Pass None means shuffle when is training and not to shuffle when is testing. @@ -281,12 +286,6 @@ def provider(input_types=None, should_shuffle=None, pool_size=-1, drop the wrong format data when it is True. Has no effect when check set to False. :type check_fail_continue: bool - - :param use_dynamic_order: Allow provider to yield a dictionary object, whose - key is a input data layer name, and value is the - feature value. The tuples are still allowed when - use_dynmaic_order is True. - :type use_dynamic_order: bool """ def __wrapper__(generator): @@ -340,6 +339,11 @@ def provider(input_types=None, should_shuffle=None, pool_size=-1, assert self.slots is not None assert self.generator is not None + use_dynamic_order = False + if isinstance(self.slots, dict): # reorder input_types + self.slots = [self.slots[ipt] for ipt in self.input_order] + use_dynamic_order = True + if len(self.slots) == 1: self.generator = SingleSlotWrapper(self.generator) diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 18f0b1b4e49..c1e74c7a2d8 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -216,6 +216,10 @@ def Inputs(*args): if g_current_submodel is g_root_submodel: g_config.model_config.input_layer_names.append(name) +@config_func +def HasInputsSet(): + return len(g_config.model_config.input_layer_names) != 0 + # Define the name of the output layers of the NeuralNetwork. # Usually the output is simply the cost layer. diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index c54ec309698..d8f96195020 100644 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -30,7 +30,7 @@ __all__ = ['sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool", 'lstmemory_unit', 'small_vgg', 'img_conv_group', 'vgg_16_network', 'gru_unit', 'gru_group', 'simple_gru', 'simple_attention', 'text_conv_pool', - 'bidirectional_lstm', 'outputs'] + 'bidirectional_lstm', 'inputs', 'outputs'] ###################################################### @@ -372,8 +372,8 @@ def small_vgg(input_image, num_channels, num_classes): tmp = __vgg__(tmp, 128, 2, [0.4, 0]) tmp = __vgg__(tmp, 256, 3, [0.4, 0.4, 0]) tmp = __vgg__(tmp, 512, 3, [0.4, 0.4, 0]) - tmp = img_pool_layer(input = tmp, stride = 2, - pool_size = 2, pool_type = MaxPooling()) + tmp = img_pool_layer(input=tmp, stride=2, + pool_size=2, pool_type=MaxPooling()) tmp = dropout_layer(input=tmp, dropout_rate=0.5) tmp = fc_layer(input=tmp, size=512, layer_attr=ExtraAttr(drop_rate=0.5), act=LinearActivation()) @@ -745,7 +745,6 @@ def gru_group(input, gru_bias_attr=None, act=None, gate_act=None, gru_layer_attr=None): - """ gru_group is a recurrent layer group version Gated Recurrent Unit. It does exactly the same calculation as the grumemory layer does. A promising @@ -919,12 +918,12 @@ def bidirectional_lstm(input, size, name=None, return_seq=False, fw = simple_lstm(name='%s_fw' % name, input=input, size=size, **dict((k[len('fwd_'):], v) for k, v in args.iteritems() - if k.startswith('fwd_'))) + if k.startswith('fwd_'))) bw = simple_lstm(name="%s_bw" % name, input=input, size=size, reverse=True, **dict((k[len('bwd_'):], v) for k, v in args.iteritems() - if k.startswith('bwd_'))) + if k.startswith('bwd_'))) if return_seq: return concat_layer(name=name, input=[fw, bw], layer_attr=concat_attr, @@ -1052,14 +1051,30 @@ def dropout_layer(input, dropout_rate, name=None): layer_attr=ExtraAttr(drop_rate=dropout_rate)) -def outputs(layers, *args): +def inputs(layers, *args): + """ + Declare the inputs of network. The order of input should be as same as + the data provider's return order. + + :param layers: Input Layers. + :type layers: list|tuple|LayerOutput. + :return: """ - Declare the end of network. Currently it will only calculate the - input/output order of network. It will calculate the predict network or - train network's output automatically. + if isinstance(layers, LayerOutput) or isinstance(layers, basestring): + layers = [layers] + if len(args) != 0: + layers.extend(args) - :param layers: + Inputs(*[l.name for l in layers]) + + +def outputs(layers, *args): + """ + Declare the outputs of network. If user have not defined the inputs of + network, this method will calculate the input order by dfs travel. + + :param layers: Output layers. :type layers: list|tuple|LayerOutput :return: """ @@ -1093,6 +1108,11 @@ def outputs(layers, *args): layers.extend(args) assert len(layers) > 0 + + if HasInputsSet(): # input already set + Outputs(*[l.name for l in layers]) + return # just return outputs. + if len(layers) != 1: logger.warning("`outputs` routine try to calculate network's" " inputs and outputs order. It might not work well." -- GitLab