提交 718033e1 编写于 作者: T tensor-tang

Merge remote-tracking branch 'ups/develop' into refine/op/peephole

...@@ -104,6 +104,7 @@ visualDL --logdir=scratch_log --port=8080 ...@@ -104,6 +104,7 @@ visualDL --logdir=scratch_log --port=8080
# 访问 http://127.0.0.1:8080 # 访问 http://127.0.0.1:8080
``` ```
如果出现`TypeError: __init__() got an unexpected keyword argument 'file'`, 是因为protobuf不是3.5以上,运行`pip install --upgrade protobuf`就能解决。
如果在虚拟环境下仍然遇到安装问题,请尝试以下方法。 如果在虚拟环境下仍然遇到安装问题,请尝试以下方法。
......
...@@ -500,7 +500,7 @@ EOF ...@@ -500,7 +500,7 @@ EOF
EOF EOF
if [[ ${WITH_GPU} == "ON" ]]; then if [[ ${WITH_GPU} == "ON" ]]; then
NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.1.2-1+cuda${CUDA_MAJOR} libnccl-dev=2.1.2-1+cuda${CUDA_MAJOR} &&" NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.2.13-1+cuda${CUDA_MAJOR} libnccl-dev=2.2.13-1+cuda${CUDA_MAJOR} &&"
else else
NCCL_DEPS="" NCCL_DEPS=""
fi fi
......
...@@ -104,7 +104,7 @@ def batch_images_from_tar(data_file, ...@@ -104,7 +104,7 @@ def batch_images_from_tar(data_file,
pickle.dump( pickle.dump(
output, output,
open('%s/batch_%d' % (out_path, file_id), 'wb'), open('%s/batch_%d' % (out_path, file_id), 'wb'),
protocol=pickle.HIGHEST_PROTOCOL) protocol=2)
file_id += 1 file_id += 1
data = [] data = []
labels = [] labels = []
...@@ -113,9 +113,7 @@ def batch_images_from_tar(data_file, ...@@ -113,9 +113,7 @@ def batch_images_from_tar(data_file,
output['label'] = labels output['label'] = labels
output['data'] = data output['data'] = data
pickle.dump( pickle.dump(
output, output, open('%s/batch_%d' % (out_path, file_id), 'wb'), protocol=2)
open('%s/batch_%d' % (out_path, file_id), 'wb'),
protocol=pickle.HIGHEST_PROTOCOL)
with open(meta_file, 'a') as meta: with open(meta_file, 'a') as meta:
for file in os.listdir(out_path): for file in os.listdir(out_path):
......
...@@ -55,6 +55,7 @@ class TestDistRunnerBase(object): ...@@ -55,6 +55,7 @@ class TestDistRunnerBase(object):
pserver_prog = t.get_pserver_program(args.current_endpoint) pserver_prog = t.get_pserver_program(args.current_endpoint)
startup_prog = t.get_startup_program(args.current_endpoint, startup_prog = t.get_startup_program(args.current_endpoint,
pserver_prog) pserver_prog)
place = fluid.CPUPlace() place = fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(startup_prog) exe.run(startup_prog)
...@@ -147,6 +148,8 @@ def runtime_main(test_class): ...@@ -147,6 +148,8 @@ def runtime_main(test_class):
import paddle.compat as cpt import paddle.compat as cpt
import socket
from contextlib import closing
class TestDistBase(unittest.TestCase): class TestDistBase(unittest.TestCase):
...@@ -156,13 +159,19 @@ class TestDistBase(unittest.TestCase): ...@@ -156,13 +159,19 @@ class TestDistBase(unittest.TestCase):
def setUp(self): def setUp(self):
self._trainers = 2 self._trainers = 2
self._pservers = 2 self._pservers = 2
self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124" self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
self._find_free_port(), self._find_free_port())
self._python_interp = "python" self._python_interp = "python"
self._sync_mode = True self._sync_mode = True
self._mem_opt = False self._mem_opt = False
self._use_reduce = False self._use_reduce = False
self._setup_config() self._setup_config()
def _find_free_port(self):
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
s.bind(('', 0))
return s.getsockname()[1]
def start_pserver(self, model_file, check_error_log): def start_pserver(self, model_file, check_error_log):
ps0_ep, ps1_ep = self._ps_endpoints.split(",") ps0_ep, ps1_ep = self._ps_endpoints.split(",")
ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist" ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist"
......
...@@ -85,6 +85,7 @@ class TestFetchOp(unittest.TestCase): ...@@ -85,6 +85,7 @@ class TestFetchOp(unittest.TestCase):
assert not math.isnan(np.sum(ret[i])) and \ assert not math.isnan(np.sum(ret[i])) and \
not math.isinf(np.sum(ret[i])) not math.isinf(np.sum(ret[i]))
@unittest.skip(reason="CI timeout")
def test_fetch_op(self): def test_fetch_op(self):
tst_reader = paddle.batch(flowers.test(use_xmap=False), batch_size=16) tst_reader = paddle.batch(flowers.test(use_xmap=False), batch_size=16)
tst_reader_iter = tst_reader() tst_reader_iter = tst_reader()
...@@ -139,6 +140,7 @@ class TestFeedParallel(unittest.TestCase): ...@@ -139,6 +140,7 @@ class TestFeedParallel(unittest.TestCase):
if batch_id == 2: if batch_id == 2:
break break
@unittest.skip(reason="CI timeout")
def test_feed_op(self): def test_feed_op(self):
os.environ['CPU_NUM'] = str(4) os.environ['CPU_NUM'] = str(4)
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册