未验证 提交 27acc6c3 编写于 作者: A Allen Guo 提交者: GitHub

[IPU] update to popart v2.5.0 (#42552)

* update to popart v2.5.0

* use a specific version of sdk2.5.0
上级 c4bed7e4
......@@ -80,8 +80,8 @@ class Stat : public StatBase {
while (prev_value < current_value &&
!peak_value_.compare_exchange_weak(prev_value, current_value)) {
}
VLOG(8) << "Update peak_value, after update, peak_value = " << peak_value_
<< " , current value = " << current_value;
VLOG(8) << "Update peak_value, after update, peak_value = "
<< peak_value_.load() << " , current value = " << current_value;
}
}
......
......@@ -341,21 +341,26 @@ IpuStrategy::IpuStrategy() {
return std::to_string(popart_options.partialsTypeMatMuls == "half");
});
RegisterSetter(
container_options, "dot_checks",
[&](const std::pair<std::string, std::string>& p) {
std::uint64_t value = std::stoul(p.first);
popart_options.dotChecks.insert(static_cast<popart::DotCheck>(value));
});
// Setter for the "dot_checks" container option. Since the popart v2.5.0
// update, dot checks are identified by stage *name* strings rather than by
// numeric popart::DotCheck enum values (the pre-update setter above shows
// the old numeric form).
RegisterSetter(container_options, "dot_checks",
[&](const std::pair<std::string, std::string>& p) {
// Whitelist of dot-check stage names accepted by popart;
// anything else is rejected up front with a clear error.
std::vector<std::string> valid_dot{"Fwd0", "Fwd1", "Bwd0",
"PreAlias", "Final"};
if (std::find(valid_dot.begin(), valid_dot.end(), p.first) ==
valid_dot.end()) {
PADDLE_THROW(platform::errors::InvalidArgument(
"Unknown dot check: %s", p.first));
}
// popart_options.dotChecks is a set of strings in this popart
// version, so the validated name is inserted directly.
popart_options.dotChecks.insert(p.first);
});
RegisterGetter(
vector_options_getter, options_type, "dot_checks", "vector", [&]() {
std::vector<std::string> res;
for (auto x : popart_options.dotChecks) {
res.push_back(std::to_string(static_cast<std::uint64_t>(x)));
}
return res;
});
// Getter for the "dot_checks" option: returns the configured popart
// dot-check stage names (e.g. "Fwd0", "Final") as a vector of strings.
RegisterGetter(vector_options_getter, options_type, "dot_checks", "vector",
               [&]() {
                 std::vector<std::string> res;
                 // Iterate by const reference: the original `auto x` copied
                 // every std::string out of the set only to copy it again
                 // into `res` (clang-tidy performance-for-range-copy).
                 for (const auto& x : popart_options.dotChecks) {
                   res.push_back(x);
                 }
                 return res;
               });
RegisterSetter(container_options, "hardware_instrumentations",
[&](const std::pair<std::string, std::string>& p) {
......@@ -516,6 +521,21 @@ void IpuStrategy::SetTensorLocation(const std::string& tensor,
}
}
// Sets one boolean field of popart's ReplicatedCollectivesSettings.
// `opt` must be one of:
//   - "prepare_schedule_for_merging_collectives"
//   - "merge_all_reduce_collectives"
// Any other name raises InvalidArgument.
void IpuStrategy::SetReplicatedCollectivesSettings(const std::string& opt,
                                                   bool value) {
  VLOG(10) << "Set Replica Setting " << opt << " to " << value;
  if (opt == "prepare_schedule_for_merging_collectives") {
    popart_options.replicatedCollectivesSettings
        .prepareScheduleForMergingCollectives = value;
  } else if (opt == "merge_all_reduce_collectives") {
    popart_options.replicatedCollectivesSettings.mergeAllReduceCollectives =
        value;
  } else {
    // Fixed stray space inside the quoted option name ("' %s'" -> "'%s'").
    PADDLE_THROW(platform::errors::InvalidArgument(
        "Unknown option '%s' for replicated collectives settings", opt));
  }
}
void IpuStrategy::SetAccumulateOuterFragmentSettings(
const std::uint64_t& schedule, const std::vector<int>& values) {
VLOG(10) << "SetAccumulateOuterFragmentSettings schedule:" << schedule;
......
......@@ -118,6 +118,7 @@ class IpuStrategy {
const std::string &value);
void SetTensorLocation(const std::string &tensor, const std::string &option,
std::uint64_t value);
void SetReplicatedCollectivesSettings(const std::string &opt, bool value);
void SetAccumulateOuterFragmentSettings(const std::uint64_t &schedule,
const std::vector<int> &values);
void AddCustomOp(const std::string &paddle_op, const std::string &popart_op,
......
......@@ -4394,6 +4394,12 @@ All parameter, weight, gradient are variables in Paddle.
option_name, option.first.cast<std::string>(),
option.second.cast<std::uint64_t>());
}
} else if (option_name == "replicated_collectives_settings") {
for (auto option : element.second.cast<py::dict>()) {
self.SetReplicatedCollectivesSettings(
option.first.cast<std::string>(),
option.second.cast<bool>());
}
} else if (option_name == "accumulate_outer_fragment") {
for (auto option : element.second.cast<py::dict>()) {
std::vector<int> values;
......
......@@ -27,12 +27,13 @@ class TestIpuStrategy(unittest.TestCase):
ipu_strategy = paddle.static.IpuStrategy()
all_option_names = ipu_strategy._ipu_strategy.get_all_option_names()
skip_options = []
skip_options.append(
'mean_accumulation_and_replication_reduction_strategy')
skip_options.append('random_seed')
for option_name in all_option_names:
if option_name in skip_options:
continue
option = ipu_strategy._ipu_strategy.get_option(option_name)
option_type = option['type']
option_value = option['value']
......@@ -67,7 +68,7 @@ class TestIpuStrategy(unittest.TestCase):
def test_set_other_options(self):
ipu_strategy = paddle.static.IpuStrategy()
options = {}
options['dot_checks'] = ['0', '1', '2', '3']
options['dot_checks'] = ['Fwd0', 'Fwd1', 'Bwd0', 'PreAlias', "Final"]
options['engine_options'] = {
'debug.allowOutOfMemory': 'true',
'autoReport.directory': 'path',
......@@ -76,7 +77,12 @@ class TestIpuStrategy(unittest.TestCase):
options['random_seed'] = 1234
for k, v in options.items():
ipu_strategy.set_options({k: v})
assert v == ipu_strategy.get_option(k), f"set {k} to {v} failed "
if (isinstance(v, list)):
assert v.sort() == ipu_strategy.get_option(k).sort(
), f"set {k} to {v} failed "
else:
assert v == ipu_strategy.get_option(
k), f"set {k} to {v} failed "
# The custom compile-progress logger must accept 2 ints: (progress, total).
# Fixed typo in the printed message: "progrss" -> "progress".
logger = lambda progress, total: print(f"compile progress: {progress}/{total}")
......
......@@ -148,6 +148,36 @@ class TestReplicaInference(TestBase):
}
class TestReplicaCollectiveInference(TestBase):
    """Inference test with 2 replicated graphs and the replicated-collectives
    merging options enabled (new in the popart v2.5.0 update)."""

    def set_attrs(self):
        # Pure replication: no pipelining and no gradient accumulation.
        # The replicated_collectives_settings keys are forwarded to
        # IpuStrategy::SetReplicatedCollectivesSettings on the C++ side.
        self.ipu_options = {
            "batches_per_step": 1,
            "enable_pipelining": False,
            "enable_gradient_accumulation": False,
            "accumulation_factor": 1,
            "enable_replicated_graphs": True,
            "replicated_graph_count": 2,
            "accumulate_outer_fragment": {
                0: []
            },
            "replicated_collectives_settings": {
                "prepare_schedule_for_merging_collectives": True,
                "merge_all_reduce_collectives": True
            }
        }
        self.cpu_bs = 1
        self.ipu_bs = 1

    def set_data_feed(self):
        # The IPU feed tiles the single image once per replica so every
        # replica receives identical data.
        np_image = np.random.rand(1, 3, 10, 10).astype(np.float32)
        self.feed_cpu = {"image": np_image}
        self.feed_ipu = {
            "image":
            np.tile(np_image,
                    [self.ipu_options['replicated_graph_count'], 1, 1, 1])
        }
class TestPipelineInference(TestBase):
def set_attrs(self):
self.ipu_options = {
......@@ -190,6 +220,36 @@ class TestTrainBase(TestBase):
class TestReplicaTrain(TestTrainBase):
    """Training test with 2 replicated graphs (no pipelining)."""

    def set_attrs(self):
        self.ipu_options = {
            "batches_per_step": 1,
            "enable_pipelining": False,
            "enable_gradient_accumulation": False,
            "accumulation_factor": 1,
            "enable_replicated_graphs": True,
            "replicated_graph_count": 2
        }
        # CPU batch size 2 matches the total IPU throughput:
        # 2 replicas x ipu_bs of 1.
        self.cpu_bs = 2
        self.ipu_bs = 1
        self.optimizer = 'sgd'

    def set_data_feed(self):
        # Same image tiled per CPU batch entry / per IPU replica.
        np_image = np.random.rand(1, 3, 10, 10).astype(np.float32)
        self.feed_cpu = {"image": np.tile(np_image, [self.cpu_bs, 1, 1, 1])}
        self.feed_ipu = {
            "image":
            np.tile(np_image,
                    [self.ipu_options['replicated_graph_count'], 1, 1, 1])
        }

    def test(self):
        cpu_outputs = self._test_base(False)
        # Keep every 2nd IPU output so the compared slice lines up with the
        # CPU results — presumably one entry per replica given
        # replicated_graph_count == 2; TODO(review): confirm the layout.
        ipu_outputs = self._test_base(True)[::2]
        self.assertTrue(np.allclose(cpu_outputs, ipu_outputs, atol=self.atol))
class TestReplicaCollectiveTrain(TestTrainBase):
def set_attrs(self):
self.ipu_options = {
"batches_per_step": 1,
......@@ -198,6 +258,13 @@ class TestReplicaTrain(TestTrainBase):
"accumulation_factor": 1,
"enable_replicated_graphs": True,
"replicated_graph_count": 2,
"accumulate_outer_fragment": {
0: []
},
"replicated_collectives_settings": {
"prepare_schedule_for_merging_collectives": True,
"merge_all_reduce_collectives": True
}
}
self.cpu_bs = 2
self.ipu_bs = 1
......
......@@ -6,7 +6,7 @@
# run a container
# docker run --ulimit memlock=-1:-1 --net=host --cap-add=IPC_LOCK --device=/dev/infiniband/ --ipc=host --rm -it paddlepaddle/paddle:latest-dev-ipu bash
FROM graphcore/poplar:2.3.0
FROM graphcore/poplar:poplar-extbaidu:2.5.0-ubuntu-18.04-20220407
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
# ENV variables
......@@ -25,6 +25,7 @@ RUN apt-get update && apt-get install -y curl wget vim git unzip unrar tar xz-ut
bison graphviz libjpeg-dev zlib1g zlib1g-dev automake locales swig net-tools libtool module-init-tools numactl libnuma-dev \
openssl libffi-dev pciutils libblas-dev gfortran libblas3 liblapack-dev liblapack3 default-jre screen tmux gdb lldb gcc g++
RUN apt-get update && apt-get install -y rdma-core librdmacm1
# -y is required: docker build is non-interactive, so apt-get's confirmation
# prompt would otherwise abort (or hang) the installation of libspdlog-dev.
RUN apt-get update && apt-get install -y libspdlog-dev
# Downgrade gcc&&g++
WORKDIR /usr/bin
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册