fix the bug that `print_signature.py` cannot get all the public apis (#33423)

* 增加方法获取和执行全量api的示例代码进行测试
* start the sample code test for gpu
* should import paddle separately
* add a stdout handler, the default is stderr. the paddle_build.sh will catch the stdout content.
* add RUN_ON_DEVICE into the requires set
* if codeblock['required'] is empty, use the RUN_ON_DEVICE instead
* set the threads to 16 http://agroup.baidu.com/paddlepaddle-org-cn/md/article/4036225
* 设置默认日志级别为INFO级别
* using the logic from gen_doc.py
* using modulelist to get all the apis
* as we don't care which name is the shortest, fetch the first name in the all_names list
* the new list from project
* 先不启用gpu测试，先把print_signature获取不到全部API的问题解决了

fix the bug that `print_signature.py` cannot get all the public apis (#33423)
* 增加方法获取和执行全量api的示例代码进行测试
* start the sample code test for gpu
* should import paddle separately
* add a stdout handler, the default is stderr. the paddle_build.sh will catch the stdout content.
* add RUN_ON_DEVICE into the requires set
* if codeblock['required'] is empty, use the RUN_ON_DEVICE instead
* set the threads to 16 http://agroup.baidu.com/paddlepaddle-org-cn/md/article/4036225
* 设置默认日志级别为INFO级别
* using the logic from gen_doc.py
* using modulelist to get all the apis
* as we don't care which name is the shortest, fetch the first name in the all_names list
* the new list from project
* 先不启用gpu测试，先把print_signature获取不到全部API的问题解决了
afa4bf51 · Ren Wei (任卫) · GitHub · f89a7b55 · afa4bf51 · afa4bf51
Showing 3 changed files with 189 additions and 20 deletions

paddle/scripts/paddle_build.sh paddle/scripts/paddle_build.sh +13 -5

tools/print_signatures.py tools/print_signatures.py +131 -4

tools/sampcd_processor.py tools/sampcd_processor.py +45 -11

未找到文件。
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -2009,12 +2009,16 @@ function build_document_preview() {
    sh /paddle/tools/document_preview.sh ${PORT}
 }
+# origin name: example
-function example() {
+function exec_samplecode_test() {
    pip install ${PADDLE_ROOT}/build/python/dist/*.whl
    paddle version
    cd ${PADDLE_ROOT}/tools
-    python sampcd_processor.py cpu;example_error=$?
+    if [ "$1" = "cpu" ] ; then
+        python sampcd_processor.py cpu; example_error=$?
+    elif [ "$1" = "gpu" ] ; then
+        python sampcd_processor.py --threads=16 --full-test gpu; example_error=$?
+    fi
    if [ "$example_error" != "0" ];then
      echo "Code instance execution failed" >&2
      exit 5
@@ -2127,7 +2131,7 @@ function main() {
        check_sequence_op_unittest
        generate_api_spec ${PYTHON_ABI:-""} "PR"
        set +e
-        example_info=$(example)
+        example_info=$(exec_samplecode_test cpu)
        example_code=$?
        summary_check_problems $check_style_code $example_code "$check_style_info" "$example_info"
        assert_api_spec_approvals
@@ -2286,7 +2290,11 @@ function main() {
        build_document_preview
        ;;
      api_example)
-        example
+        example_info=$(exec_samplecode_test cpu)
+        example_code=$?
+        check_style_code=0
+        check_style_info=
+        summary_check_problems $check_style_code $example_code "$check_style_info" "$example_info"
        ;;
      test_op_benchmark)
        test_op_benchmark

--- a/tools/print_signatures.py
+++ b/tools/print_signatures.py
@@ -27,11 +27,25 @@ import pydoc
 import hashlib
 import platform
 import functools
+import pkgutil
+import logging
+import paddle
 member_dict = collections.OrderedDict()
 visited_modules = set()
+logger = logging.getLogger()
+if logger.handlers:
+    # we assume the first handler is the one we want to configure
+    console = logger.handlers[0]
+else:
+    console = logging.StreamHandler(sys.stderr)
+    logger.addHandler(console)
+console.setFormatter(
+    logging.Formatter(
+        "%(asctime)s - %(funcName)s:%(lineno)d - %(levelname)s - %(message)s"))
 def md5(doc):
    try:
@@ -199,11 +213,124 @@ def visit_all_module(mod):
                visit_member(mod.__name__, instance)
+# all from gen_doc.py
+api_info_dict = {}  # used by get_all_api
+# step 1: walk through the paddle package to collect all the apis in api_info_dict
+def get_all_api(root_path='paddle', attr="__all__"):
+    """
+    Walk through the paddle package and collect the apis its modules export.
+
+    Only sub-modules that are already present in ``sys.modules`` are
+    processed; the top-level ``paddle`` module is processed last.
+
+    Args:
+        root_path: unused; kept so existing callers keep working.
+        attr: name of the module attribute that lists the exported apis.
+
+    Returns:
+        list: one full name for every distinct object in ``api_info_dict``.
+    """
+    api_counter = 0
+    for _, name, _ in pkgutil.walk_packages(
+            path=paddle.__path__, prefix=paddle.__name__ + '.'):
+        try:
+            if name in sys.modules:
+                m = sys.modules[name]
+            else:
+                # importlib.import_module(name)
+                # Not imported yet: the lookup only probes whether the dotted
+                # name is reachable from `paddle`; the module itself is
+                # skipped.  NOTE(review): confirm that skipping is intended.
+                eval(name)
+                continue
+        except AttributeError:
+            logger.warning("AttributeError occurred when `eval(%s)`", name)
+        else:
+            api_counter += process_module(m, attr)
+    api_counter += process_module(paddle, attr)
+    logger.info('%s: collected %d apis, %d distinct apis.', attr, api_counter,
+                len(api_info_dict))
+    # `all_names` is a set and cannot be indexed; any alias is acceptable, so
+    # take the smallest one to keep the result stable between runs.
+    return [min(api_info['all_names']) for api_info in api_info_dict.values()]
+def insert_api_into_dict(full_name, gen_doc_anno=None):
+    """
+    Register the object named by ``full_name`` in ``api_info_dict``.
+
+    Entries are keyed by ``id()`` of the resolved object, so every name that
+    resolves to the same object is gathered in that entry's ``all_names`` set.
+
+    Args:
+        full_name: dotted name of the api; it is resolved with ``eval``, so it
+            must come from a trusted source such as a module's ``__all__``.
+        gen_doc_anno: optional annotation, stored under ``gen_doc_anno`` only
+            when the entry is created.
+
+    Return:
+        api_info object or None
+    """
+    try:
+        obj = eval(full_name)
+    except AttributeError:
+        logger.warning("AttributeError occurred when `id(eval(%s))`", full_name)
+        return None
+    except Exception:
+        # Deliberately best-effort, but no longer a bare `except:` so that
+        # KeyboardInterrupt and SystemExit still propagate.
+        logger.warning("Exception occurred when `id(eval(%s))`", full_name)
+        return None
+
+    fc_id = id(obj)
+    logger.debug("adding %s to api_info_dict.", full_name)
+    if fc_id in api_info_dict:
+        api_info_dict[fc_id]["all_names"].add(full_name)
+    else:
+        api_info_dict[fc_id] = {
+            "all_names": {full_name},
+            "id": fc_id,
+            "object": obj,
+            "type": type(obj).__name__,
+        }
+        docstr = inspect.getdoc(obj)
+        if docstr:
+            api_info_dict[fc_id]["docstring"] = inspect.cleandoc(docstr)
+        if gen_doc_anno:
+            api_info_dict[fc_id]["gen_doc_anno"] = gen_doc_anno
+    return api_info_dict[fc_id]
+# step 1 fill field : `id` & `all_names`, type, docstring
+def process_module(m, attr="__all__"):
+    """
+    Register every api that module ``m`` lists in its ``attr`` attribute.
+
+    Names starting with an underscore or containing a comma are ignored.
+    When a registered api is a class, its members that do not start with an
+    underscore and that have a ``__name__`` are registered as well, annotated
+    with 'class_method'.
+
+    Args:
+        m: the module to inspect.
+        attr: name of the attribute holding the exported api names.
+
+    Returns:
+        int: how many names were successfully inserted.
+    """
+    if not hasattr(m, attr):
+        return 0
+
+    inserted = 0
+    # the export list may contain duplicates, hence the set
+    for short_name in set(getattr(m, attr)):
+        # the comma check guards against entries such as
+        # `paddle.dataset.conll05.test, get_dict`, which cannot be evaluated
+        if short_name[0] == '_' or ',' in short_name:
+            continue
+        full_name = m.__name__ + "." + short_name
+        info = insert_api_into_dict(full_name)
+        if info is None:
+            continue
+        inserted += 1
+        if not inspect.isclass(info['object']):
+            continue
+        for member_name, member in inspect.getmembers(info['object']):
+            if member_name.startswith("_") or not hasattr(member,
+                                                           '__name__'):
+                continue
+            member_info = insert_api_into_dict(full_name + '.' + member_name,
+                                               'class_method')
+            if member_info is not None:
+                inserted += 1
+    return inserted
+def get_all_api_from_modulelist():
+    """
+    Run ``visit_all_module`` on a fixed list of paddle modules.
+
+    Returns:
+        the module-level ``member_dict``.
+    """
+    public_modules = (
+        paddle,
+        paddle.amp,
+        paddle.nn,
+        paddle.nn.functional,
+        paddle.nn.initializer,
+        paddle.nn.utils,
+        paddle.static,
+        paddle.static.nn,
+        paddle.io,
+        paddle.jit,
+        paddle.metric,
+        paddle.distribution,
+        paddle.optimizer,
+        paddle.optimizer.lr,
+        paddle.regularizer,
+        paddle.text,
+        paddle.utils,
+        paddle.utils.download,
+        paddle.utils.profiler,
+        paddle.utils.cpp_extension,
+        paddle.sysconfig,
+        paddle.vision,
+        paddle.distributed,
+        paddle.distributed.fleet,
+        paddle.distributed.fleet.utils,
+        paddle.distributed.parallel,
+        paddle.distributed.utils,
+        paddle.callbacks,
+        paddle.hub,
+        paddle.autograd,
+    )
+    for module in public_modules:
+        visit_all_module(module)
+    return member_dict
 if __name__ == '__main__':
-    import paddle
+    # modules = sys.argv[1].split(",")
-    modules = sys.argv[1].split(",")
+    # for m in modules:
-    for m in modules:
+    #    visit_all_module(importlib.import_module(m))
-        visit_all_module(importlib.import_module(m))
+    get_all_api_from_modulelist()
    for name in member_dict:
        print(name, member_dict[name])
--- a/tools/sampcd_processor.py
+++ b/tools/sampcd_processor.py
@@ -39,14 +39,13 @@ if logger.handlers:
    console = logger.handlers[
        0]  # we assume the first handler is the one we want to configure
 else:
-    console = logging.StreamHandler()
+    console = logging.StreamHandler(stream=sys.stderr)
    logger.addHandler(console)
 console.setFormatter(logging.Formatter("%(message)s"))
 RUN_ON_DEVICE = 'cpu'
 SAMPLE_CODE_TEST_CAPACITY = set()
 GPU_ID = 0
-methods = []
 whl_error = []
 API_DEV_SPEC_FN = 'paddle/fluid/API_DEV.spec'
 API_PR_SPEC_FN = 'paddle/fluid/API_PR.spec'
@@ -247,13 +246,15 @@ def is_required_match(requirestr, cbtitle='not-specified'):
        False - not match
        None - skipped  # trick
    """
-    global SAMPLE_CODE_TEST_CAPACITY  # readonly
+    global SAMPLE_CODE_TEST_CAPACITY, RUN_ON_DEVICE  # readonly
    requires = set(['cpu'])
    if requirestr:
        for r in requirestr.split(','):
            rr = r.strip().lower()
            if rr:
                requires.add(rr)
+    else:
+        requires.add(RUN_ON_DEVICE)
    if 'skip' in requires or 'skiptest' in requires:
        logger.info('%s: skipped', cbtitle)
        return None
@@ -283,8 +284,8 @@ def insert_codes_into_codeblock(codeblock, apiname='not-specified'):
        cpu_str = '\nimport os\nos.environ["CUDA_VISIBLE_DEVICES"] = ""\n'
        gpu_str = '\nimport os\nos.environ["CUDA_VISIBLE_DEVICES"] = "{}"\n'.format(
            GPU_ID)
-        if 'required' in codeblock:
+        if 'required' in codeblock and codeblock['required']:
-            if codeblock['required'] is None or codeblock['required'] == 'cpu':
+            if codeblock['required'] == 'cpu':
                inserted_codes_f = cpu_str
            elif codeblock['required'] == 'gpu':
                inserted_codes_f = gpu_str
@@ -426,20 +427,25 @@ stdout: %s
    return result, tfname, msg, end_time - start_time
-def get_filenames():
+def get_filenames(full_test=False):
    '''
    this function will get the sample code files that pending for check.
+    Args:
+        full_test: the full apis or the increment
    Returns:
        dict: the sample code files pending for check .
    '''
-    global methods  # write
    global whl_error
    import paddle
    whl_error = []
-    get_incrementapi()
+    if full_test:
+        get_full_api()
+    else:
+        get_incrementapi()
    all_sample_code_filenames = {}
    with open(API_DIFF_SPEC_FN) as f:
        for line in f.readlines():
@@ -472,8 +478,9 @@ def get_api_md5(path):
        api_md5(dict): key is the api's real fullname, value is the md5sum.
    """
    api_md5 = {}
-    API_spec = '%s/%s' % (os.path.abspath(os.path.join(os.getcwd(), "..")),
+    API_spec = os.path.abspath(os.path.join(os.getcwd(), "..", path))
-                          path)
+    if not os.path.isfile(API_spec):
+        return api_md5
    pat = re.compile(r'\((paddle[^,]+)\W*document\W*([0-9a-z]{32})')
    patArgSpec = re.compile(
        r'^(paddle[^,]+)\s+\(ArgSpec.*document\W*([0-9a-z]{32})')
@@ -487,6 +494,28 @@ def get_api_md5(path):
    return api_md5
+def get_full_api():
+    """
+    Write the full api list to the file named by ``API_DIFF_SPEC_FN``.
+
+    The names are the keys of the dict returned by
+    ``print_signatures.get_all_api_from_modulelist``, one name per line.
+    """
+    from print_signatures import get_all_api_from_modulelist
+
+    all_members = get_all_api_from_modulelist()
+    with open(API_DIFF_SPEC_FN, 'w') as spec_file:
+        content = "\n".join(all_members.keys())
+        spec_file.write(content)
+def get_full_api_by_walk():
+    """
+    Write the full api list to the file named by ``API_DIFF_SPEC_FN``.
+
+    The names come from ``print_signatures.get_all_api``, one name per line.
+    """
+    from print_signatures import get_all_api
+
+    api_names = get_all_api()
+    with open(API_DIFF_SPEC_FN, 'w') as spec_file:
+        content = "\n".join(api_names)
+        spec_file.write(content)
 def get_incrementapi():
    '''
    this function will get the apis that difference between API_DEV.spec and API_PR.spec.
@@ -526,6 +555,7 @@ def parse_args():
    #                     help='Use CPU mode (overrides --gpu)')
    # parser.add_argument('--gpu', dest='gpu_mode', action="store_true")
    parser.add_argument('--debug', dest='debug', action="store_true")
+    parser.add_argument('--full-test', dest='full_test', action="store_true")
    parser.add_argument('mode', type=str, help='run on device', default='cpu')
    for item in arguments:
        parser.add_argument(
@@ -545,6 +575,8 @@ if __name__ == '__main__':
    args = parse_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
+    else:
+        logger.setLevel(logging.INFO)
    if args.logf:
        logfHandler = logging.FileHandler(args.logf)
        logfHandler.setFormatter(
@@ -573,7 +605,7 @@ if __name__ == '__main__':
    else:
        os.mkdir(SAMPLECODE_TEMPDIR)
-    filenames = get_filenames()
+    filenames = get_filenames(args.full_test)
    if len(filenames) == 0 and len(whl_error) == 0:
        logger.info("-----API_PR.spec is the same as API_DEV.spec-----")
        exit(0)
@@ -593,6 +625,8 @@ if __name__ == '__main__':
    if not args.debug:
        shutil.rmtree(SAMPLECODE_TEMPDIR)
+    stdout_handler = logging.StreamHandler(stream=sys.stdout)
+    logger.addHandler(stdout_handler)
    logger.info("----------------End of the Check--------------------")
    if len(whl_error) != 0:
        logger.info("%s is not in whl.", whl_error)