add git ignore

217ed14f · Zeyu Chen · 83aba534 · 217ed14f · 217ed14f · 217ed14f
5 changed file
--- a/.gitignore
+++ b/.gitignore
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
--- a/paddle_hub/module.py
+++ b/paddle_hub/module.py
@@ -17,21 +17,19 @@ from __future__ import division
 from __future__ import print_function

 import paddle.fluid as fluid
-import paddle_hub as hub
 import numpy as np
 import tempfile
 import os

 from collections import defaultdict
-from paddle_hub.downloader import download_and_uncompress
+from downloader import download_and_uncompress

-__all__ = ["Module", "ModuleSpec"]
+__all__ = ["Module", "ModuleDesc"]


 class Module(object):
    def __init__(self, module_url):
        # donwload module
-        #module_dir = downloader.download_and_uncompress(module_url)
        module_dir = download_and_uncompress(module_url)

        # load paddle inference model
@@ -49,7 +47,9 @@ class Module(object):
        print(self.fetch_targets)

        # load assets
-        # self._load_assets(module_dir)
+        self.dict = defaultdict(int)
+        self.dict.setdefault(0)
+        self._load_assets(module_dir)

    #TODO(ZeyuChen): Need add register more signature to execute different
    # implmentation
@@ -60,7 +60,7 @@ class Module(object):
        # if it's NLP word embedding task, then do words preprocessing
        # if it's image classification or image feature task do the other works

-        # if it's 
+        # if it's
        word_ids_lod_tensor = self._process_input(inputs)
        np_words_id = np.array(word_ids_lod_tensor)
        print("word_ids_lod_tensor\n", np_words_id)
@@ -71,9 +71,9 @@ class Module(object):
            fetch_list=self.fetch_targets,
            return_numpy=False)  # return_numpy=Flase is important

-        print(self.feed_target_names)
-        print(self.fetch_targets)
-        # np_result = np.array(results[0])
+        print("module fetch_target_names", self.feed_target_names)
+        print("module fetch_targets", self.fetch_targets)
+        np_result = np.array(results[0])

        return np_result

@@ -128,8 +128,6 @@ class Module(object):

    # load assets folder
    def _load_assets(self, module_dir):
-        self.dict = defaultdict(int)
-        self.dict.setdefault(0)
        assets_dir = os.path.join(module_dir, "assets")
        tokens_path = os.path.join(assets_dir, "tokens.txt")
        word_id = 0
@@ -153,11 +151,6 @@ class Module(object):
            os.makedirs(path)


-class ModuleImpl(object):
-    def get_signature_name():
-        pass
-
-
 class ModuleDesc(object):
    def __init__(self):
        pass
@@ -165,22 +158,25 @@ class ModuleDesc(object):
    @staticmethod
    def _mkdir(path):
        if not os.path.exists(path):
-            print("mkdir", path)
            os.makedirs(path)

    @staticmethod
-    def save_dict(path, word_dict, dict_name="dict.txt"):
+    def save_dict(path, word_dict, dict_name):
+        """ Save dictionary for NLP module
+        """
        ModuleDesc._mkdir(path)
-        with open(os.path.join(path, "tokens.txt"), "w") as fo:
+        with open(os.path.join(path, dict_name), "w") as fo:
            print("tokens.txt path", os.path.join(path, "tokens.txt"))
            dict_str = "\n".join(word_dict)
            fo.write(dict_str)

    @staticmethod
-    def save_module_dict(module_path, word_dict):
+    def save_module_dict(module_path, word_dict, dict_name="dict.txt"):
+        """ Save dictionary for NLP module
+        """
        assets_path = os.path.join(module_path, "assets")
        print("save_module_dict", assets_path)
-        ModuleDesc.save_dict(assets_path, word_dict)
+        ModuleDesc.save_dict(assets_path, word_dict, dict_name)
        pass



--- a/paddle_hub/test_module.py
+++ b/paddle_hub/test_module.py
+#   Copyright (c) 2019  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from module import *
+
+
+class TestModule(unittest.TestCase):
+    def test_word2vec_module_usage(self):
+        module_link = "http://paddlehub.cdn.bcebos.com/word2vec/w2v_saved_inference_module.tar.gz"
+        module = Module(module_link)
+        inputs = [["it", "is", "new"], ["hello", "world"]]
+        tensor = module._process_input(inputs)
+        print(tensor)
+        result = module(inputs)
+        print(result)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/paddle_hub/utils.py
+++ b/paddle_hub/utils.py
+#   Copyright (c) 2019  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ["mkdir"]
+
+
+def mkdir(path):
+    """ the same as the shell command mkdir -p "
+    """
+    if not os.path.exists(path):
+        os.makedirs(path)
--- a/test_export_n_load_module.py
+++ b/test_export_n_load_module.py
+# coding: utf-8
+#   Copyright (c) 2019  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import paddle.fluid as fluid
+import paddle
+import paddle_hub as hub
+import unittest
+import os
+
+EMBED_SIZE = 16
+HIDDEN_SIZE = 256
+N = 5
+BATCH_SIZE = 64
+PASS_NUM = 1
+
+word_dict = paddle.dataset.imikolov.build_dict()
+dict_size = len(word_dict)
+data = paddle.dataset.imikolov.train(word_dict, N)
+
+_MOCK_DATA = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]
+
+
+def mock_data():
+    for d in _MOCK_DATA:
+        yield d
+
+
+#batch_reader = paddle.batch(mock_data, BATCH_SIZE)
+batch_reader = paddle.batch(data, BATCH_SIZE)
+batch_size = 0
+for d in batch_reader():
+    batch_size += 1
+print("imikolov simple dataset batch_size =", batch_size)
+
+
+def module_fn(trainable=False):
+    # create word input
+    words = fluid.layers.data(
+        name="words", shape=[1], lod_level=1, dtype="int64")
+
+    # create embedding
+    # emb_name = "{}:embedding".format(module_scope)
+    emb_name = "embedding"
+    emb_param_attr = fluid.ParamAttr(name=emb_name, trainable=trainable)
+    word_emb = fluid.layers.embedding(
+        input=words,
+        size=[dict_size, EMBED_SIZE],
+        dtype='float32',
+        is_sparse=True,
+        param_attr=emb_param_attr)
+
+    # return feeder and fetch_list
+    return words, word_emb
+
+
+def word2vec(words, is_sparse, trainable=True):
+    emb_param_attr = fluid.ParamAttr(name="embedding", trainable=trainable)
+    embed_first = fluid.layers.embedding(
+        input=words[0],
+        size=[dict_size, EMBED_SIZE],
+        dtype='float32',
+        is_sparse=is_sparse,
+        param_attr=emb_param_attr)
+    embed_second = fluid.layers.embedding(
+        input=words[1],
+        size=[dict_size, EMBED_SIZE],
+        dtype='float32',
+        is_sparse=is_sparse,
+        param_attr=emb_param_attr)
+    embed_third = fluid.layers.embedding(
+        input=words[2],
+        size=[dict_size, EMBED_SIZE],
+        dtype='float32',
+        is_sparse=is_sparse,
+        param_attr=emb_param_attr)
+    embed_fourth = fluid.layers.embedding(
+        input=words[3],
+        size=[dict_size, EMBED_SIZE],
+        dtype='float32',
+        is_sparse=is_sparse,
+        param_attr=emb_param_attr)
+
+    concat_emb = fluid.layers.concat(
+        input=[embed_first, embed_second, embed_third, embed_fourth], axis=1)
+    hidden1 = fluid.layers.fc(input=concat_emb, size=HIDDEN_SIZE, act='sigmoid')
+    predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax')
+
+    # declare later than predict word
+    next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
+
+    cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
+    avg_cost = fluid.layers.mean(cost)
+
+    return predict_word, avg_cost
+
+
+def train(use_cuda=False):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
+    second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
+    third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
+    forth_word = fluid.layers.data(name='fourthw', shape=[1], dtype='int64')
+    next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
+
+    word_list = [first_word, second_word, third_word, forth_word, next_word]
+    predict_word, avg_cost = word2vec(word_list, is_sparse=True)
+
+    main_program = fluid.default_main_program()
+    startup_program = fluid.default_startup_program()
+
+    sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=1e-3)
+
+    sgd_optimizer.minimize(avg_cost)
+    exe = fluid.Executor(place)
+    exe.run(startup_program)  # initialization
+
+    # fluid.io.load_vars(
+    #     executor=exe,
+    #     dirname="./w2v_model",
+    #     vars=[main_program.global_block().var("embedding")])
+    # 也可使用predicate方式搜索变量
+    step = 0
+    for epoch in range(0, PASS_NUM):
+        for mini_batch in batch_reader():
+            # print("mini_batch", mini_batch)
+            # 定义输入变量
+            feed_var_list = [
+                main_program.global_block().var("firstw"),
+                main_program.global_block().var("secondw"),
+                main_program.global_block().var("thirdw"),
+                main_program.global_block().var("fourthw"),
+                main_program.global_block().var("nextw")
+            ]
+            feeder = fluid.DataFeeder(feed_list=feed_var_list, place=place)
+            cost = exe.run(
+                main_program,
+                feed=feeder.feed(mini_batch),
+                fetch_list=[avg_cost])
+            step += 1
+            if step % 100 == 0:
+                print("Epoch={} Step={} Cost={}".format(epoch, step, cost[0]))
+
+    model_dir = "./w2v_model"
+    # save part of model
+    var_list_to_saved = [main_program.global_block().var("embedding")]
+    print("saving model to %s" % model_dir)
+    fluid.io.save_vars(
+        executor=exe, dirname=model_dir + "_save_vars", vars=var_list_to_saved)
+
+    # save the whole model
+    fluid.io.save_persistables(
+        executor=exe, dirname=model_dir + "_save_persistables")
+
+    saved_model_path = "w2v_saved_inference_model"
+    # save inference model including feed and fetch variable info
+    fluid.io.save_inference_model(
+        dirname=saved_model_path,
+        feeded_var_names=["firstw", "secondw", "thirdw", "fourthw"],
+        target_vars=[predict_word],
+        executor=exe)
+
+    dictionary = []
+    for w in word_dict:
+        if isinstance(w, bytes):
+            w = w.decode("ascii")
+        dictionary.append(w)
+
+    # save word dict to assets folder
+    hub.ModuleDesc.save_module_dict(
+        module_path=saved_model_path, word_dict=dictionary)
+
+
+def test_save_module(use_cuda=False):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    exe = fluid.Executor(place)
+    main_program = fluid.Program()
+    startup_program = fluid.Program()
+    exe = fluid.Executor(place)
+
+    with fluid.program_guard(main_program, startup_program):
+        words, word_emb = module_fn()
+        exe.run(startup_program)
+        # load inference embedding parameters
+        saved_model_path = "./w2v_saved_inference_model"
+        fluid.io.load_inference_model(executor=exe, dirname=saved_model_path)
+
+        feed_var_list = [main_program.global_block().var("words")]
+        feeder = fluid.DataFeeder(feed_list=feed_var_list, place=place)
+        results = exe.run(
+            main_program,
+            feed=feeder.feed([[[1, 2, 3, 4, 5]]]),
+            fetch_list=[word_emb],
+            return_numpy=False)
+
+        np_result = np.array(results[0])
+        print(np_result)
+
+        saved_module_path = "./test/word2vec_inference_module"
+        fluid.io.save_inference_model(
+            dirname=saved_module_path,
+            feeded_var_names=["words"],
+            target_vars=[word_emb],
+            executor=exe)
+
+        dictionary = []
+        for w in word_dict:
+            if isinstance(w, bytes):
+                w = w.decode("ascii")
+            dictionary.append(w)
+        # save word dict to assets folder
+        hub.ModuleDesc.save_module_dict(
+            module_path=saved_module_path, word_dict=dictionary)
+
+
+def test_load_module(use_cuda=False):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(fluid.CPUPlace())
+    saved_module_path = "./test/word2vec_inference_module"
+    [inference_program, feed_target_names,
+     fetch_targets] = fluid.io.load_inference_model(
+         saved_module_path, executor=exe)
+
+    # Sequence input in Paddle must be LOD Tensor, so we need to convert them inside Module
+    word_ids = [[1, 2, 3, 4, 5]]
+    lod = [[5]]
+    word_ids_lod_tensor = fluid.create_lod_tensor(word_ids, lod, place)
+
+    results = exe.run(
+        inference_program,
+        feed={feed_target_names[0]: word_ids_lod_tensor},
+        fetch_list=fetch_targets,
+        return_numpy=False)
+
+    print(feed_target_names)
+    print(fetch_targets)
+    np_result = np.array(results[0])
+    print(np_result)
+
+
+if __name__ == "__main__":
+    use_cuda = True
+    train(use_cuda)
+    test_save_module()
+    test_load_module()