#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Copyright (c) 2019 Huawei Technologies Co., Ltd.
# A-Tune is licensed under the Mulan PSL v2.
# You can use this software according to the terms and conditions of the Mulan PSL v2.
# You may obtain a copy of Mulan PSL v2 at:
#     http://license.coscl.org.cn/MulanPSL2
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
# PURPOSE.
# See the Mulan PSL v2 for more details.
# Create: 2019-10-29

"""
This class is used to find optimal settings and generate an optimized profile.
"""

import logging
import numbers
import multiprocessing
import collections.abc
import numpy as np
import analysis.engine.utils.utils as utils
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

from skopt import Optimizer as baseOpt
from skopt.utils import normalize_dimensions
from skopt.utils import cook_estimator

from analysis.optimizer.abtest_tuning_manager import ABtestTuningManager
from analysis.optimizer.knob_sampling_manager import KnobSamplingManager
from analysis.optimizer.tpe_optimizer import TPEOptimizer
from analysis.optimizer.weighted_ensemble_feature_selector import WeightedEnsembleFeatureSelector

LOGGER = logging.getLogger(__name__)


class Optimizer(multiprocessing.Process):
    """find optimal settings and generate optimized profile"""

    def __init__(self, name, params, child_conn, prj_name, engine="bayes", max_eval=50, sel_feature=False,
                 x0=None, y0=None, n_random_starts=20, split_count=5, noise=0.00001 ** 2):
        super(Optimizer, self).__init__(name=name)
        self.knobs = params
        self.child_conn = child_conn
        self.project_name = prj_name
        self.engine = engine
        self.noise = noise
        self.max_eval = int(max_eval)
        self.split_count = split_count
        self.sel_feature = sel_feature
        self.x_ref = x0
        self.y_ref = y0
        if self.x_ref is not None and len(self.x_ref) == 1:
            ref_x, _ = self.transfer()
            self.ref = ref_x[0]
        else:
            self.ref = []
        self._n_random_starts = 20 if n_random_starts is None else n_random_starts

    def build_space(self):
        """build space"""
        objective_params_list = []
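        # Each knob becomes one dimension of the search space: discrete knobs
        # contribute a list of candidate values, continuous knobs a (min, max) tuple.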
        for i, p_nob in enumerate(self.knobs):
            if p_nob['type'] == 'discrete':
                items = self.handle_discrete_data(p_nob, i)
                objective_params_list.append(items)
            elif p_nob['type'] == 'continuous':
                r_range = p_nob['range']
                if r_range is None or len(r_range) != 2:
                    raise ValueError("the range of {} must contain exactly 2 values"
                                     .format(p_nob['name']))
                if p_nob['dtype'] == 'int':
                    try:
                        r_range[0] = int(r_range[0])
                        r_range[1] = int(r_range[1])
                    except ValueError:
                        raise ValueError("the range value of {} is not an integer value"
                                         .format(p_nob['name']))
                elif p_nob['dtype'] == 'float':
                    try:
                        r_range[0] = float(r_range[0])
                        r_range[1] = float(r_range[1])
                    except ValueError:
                        raise ValueError("the range value of {} is not a float value"
                                         .format(p_nob['name']))

                if len(self.ref) > 0:
                    if self.ref[i] < r_range[0] or self.ref[i] > r_range[1]:
                        raise ValueError("the ref value of {} is out of range"
                                         .format(p_nob['name']))
                objective_params_list.append((r_range[0], r_range[1]))
            else:
                raise ValueError("the type of {} is not supported".format(p_nob['name']))
        return objective_params_list

    def handle_discrete_data(self, p_nob, index):
        """handle discrete data"""
        if p_nob['dtype'] == 'int':
            items = p_nob['items']
            if items is None:
                items = []
            r_range = p_nob['range']
            step = 1
            if 'step' in p_nob.keys():
                step = 1 if p_nob['step'] < 1 else p_nob['step']
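            # 'range' holds flat [start, end, start, end, ...] pairs; expand each
            # pair into candidate values using the configured step (a trailing
            # unpaired value is ignored).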
            if r_range is not None:
                length = len(r_range) if len(r_range) % 2 == 0 else len(r_range) - 1
                for i in range(0, length, 2):
                    items.extend(list(np.arange(r_range[i], r_range[i + 1] + 1, step=step)))
            items = list(set(items))
            if len(self.ref) > 0:
                try:
                    ref_value = int(self.ref[index])
                except ValueError:
                    raise ValueError("the ref value of {} is not an integer value"
                                     .format(p_nob['name']))
                if ref_value not in items:
                    items.append(ref_value)
            return items
        if p_nob['dtype'] == 'float':
            items = p_nob['items']
            if items is None:
                items = []
            r_range = p_nob['range']
            step = 0.1
            if 'step' in p_nob.keys():
                step = 0.1 if p_nob['step'] <= 0 else p_nob['step']
            if r_range is not None:
                length = len(r_range) if len(r_range) % 2 == 0 else len(r_range) - 1
                for i in range(0, length, 2):
                    items.extend(list(np.arange(r_range[i], r_range[i + 1], step=step)))
            items = list(set(items))
            if len(self.ref) > 0:
                try:
                    ref_value = float(self.ref[index])
                except ValueError:
                    raise ValueError("the ref value of {} is not a float value"
                                     .format(p_nob['name']))
                if ref_value not in items:
                    items.append(ref_value)
            return items
        if p_nob['dtype'] == 'string':
            items = p_nob['options']
            if len(self.ref) > 0:
                try:
                    ref_value = str(self.ref[index])
                except ValueError:
                    raise ValueError("the ref value of {} is not a string value"
                                     .format(p_nob['name']))
                if ref_value not in items:
                    items.append(ref_value)
            return items
        raise ValueError("the dtype of {} is not supported".format(p_nob['name']))

    @staticmethod
    def feature_importance(options, performance, labels):
        """feature importance"""
        options = StandardScaler().fit_transform(options)
        lasso = Lasso()
        lasso.fit(options, performance)
        result = zip(lasso.coef_, labels)
        total_sum = sum(map(abs, lasso.coef_))
        if total_sum == 0:
            return ", ".join("%s: 0" % label for label in labels)
        result = sorted(result, key=lambda x: -np.abs(x[0]))
        rank = ", ".join("%s: %s%%" % (label, round(coef * 100 / total_sum, 2))
                         for coef, label in result)
        return rank

    def _get_value_from_knobs(self, kv):
        x_each = []
        for p_nob in self.knobs:
            if p_nob['name'] not in kv.keys():
                raise ValueError("the param {} is not in the x0 ref".format(p_nob['name']))
            if p_nob['dtype'] == 'int':
                x_each.append(int(kv[p_nob['name']]))
            elif p_nob['dtype'] == 'float':
                x_each.append(float(kv[p_nob['name']]))
            else:
                x_each.append(kv[p_nob['name']])
        return x_each

    def transfer(self):
        """transfer ref x0 to int, y0 to float"""
        list_ref_x = []
        list_ref_y = []
        if self.x_ref is None or self.y_ref is None:
            return (list_ref_x, list_ref_y)

        for x_value in self.x_ref:
            kv = {}
            if len(x_value) != len(self.knobs):
                raise ValueError("x0 must have the same length as knobs")

            for val in x_value:
                params = val.split("=")
                if len(params) != 2:
                    raise ValueError("the param format of {} is not correct".format(params))
                kv[params[0]] = params[1]

            ref_x = self._get_value_from_knobs(kv)
            if len(ref_x) != len(self.knobs):
                raise ValueError("the tuning parameters must have the same length as knobs")
            list_ref_x.append(ref_x)
        list_ref_y = [float(y) for y in self.y_ref]
        return (list_ref_x, list_ref_y)

    def run(self):
        """start the tuning process"""

        def objective(var):
            """objective method receive the benchmark result and send the next parameters"""
            iter_result = {}
            option = []
            for i, knob in enumerate(self.knobs):
                params[knob['name']] = var[i]
                if knob['dtype'] == 'string':
                    option.append(knob['options'].index(var[i]))
                else:
                    option.append(var[i])

            iter_result["param"] = params
            self.child_conn.send(iter_result)
            result = self.child_conn.recv()
            x_num = 0.0
            eval_list = result.split(',')
            for value in eval_list:
                num = float(value)
                x_num = x_num + num
            options.append(option)
            performance.append(x_num)
            return x_num

        utils.change_file_name()

        params = {}
        options = []
        performance = []
        labels = []
        estimator = None

        parameters = ""
        for knob in self.knobs:
            parameters += knob["name"] + ","
        utils.add_data_to_file(parameters[:-1], "w", self.project_name)

        try:
            if self.engine in ('random', 'forest', 'gbrt', 'bayes', 'extraTrees'):
                params_space = self.build_space()
                ref_x, ref_y = self.transfer()
                if len(ref_x) == 0:
                    if len(self.ref) == 0:
                        ref_x = None
                    else:
                        ref_x = self.ref
                    ref_y = None
                if ref_x is not None and not isinstance(ref_x[0], (list, tuple)):
                    ref_x = [ref_x]
                LOGGER.info('x0: %s', ref_x)
                LOGGER.info('y0: %s', ref_y)

                if ref_x is not None and isinstance(ref_x[0], (list, tuple)):
                    self._n_random_starts = 0 if len(ref_x) >= self._n_random_starts \
                        else self._n_random_starts - len(ref_x) + 1

                LOGGER.info('n_random_starts parameter is: %d', self._n_random_starts)
                LOGGER.info("Running performance evaluation.......")
                if self.engine == 'random':
                    estimator = 'dummy'
                elif self.engine == 'forest':
                    estimator = 'RF'
                elif self.engine == 'extraTrees':
                    estimator = 'ET'
                elif self.engine == 'gbrt':
                    estimator = 'GBRT'
                elif self.engine == 'bayes':
                    params_space = normalize_dimensions(params_space)
                    estimator = cook_estimator("GP", space=params_space, noise=self.noise)

                LOGGER.info("base_estimator is: %s", estimator)
                optimizer = baseOpt(
                    dimensions=params_space,
                    n_random_starts=self._n_random_starts,
                    random_state=1,
                    base_estimator=estimator
                )
                n_calls = self.max_eval
                # User suggested points at which to evaluate the objective first
                if ref_x and ref_y is None:
                    ref_y = list(map(objective, ref_x))
                    LOGGER.info("ref_y is: %s", ref_y)
                    n_calls -= len(ref_y)

                # Pass user suggested initialisation points to the optimizer
                if ref_x:
                    if not isinstance(ref_y, (collections.abc.Iterable, numbers.Number)):
                        raise ValueError("`ref_y` should be an iterable or a scalar, "
                                         "got %s" % type(ref_y))
                    if len(ref_x) != len(ref_y):
                        raise ValueError("`ref_x` and `ref_y` should "
                                         "have the same length")
                    LOGGER.info("ref_x: %s", ref_x)
                    LOGGER.info("ref_y: %s", ref_y)
                    ret = optimizer.tell(ref_x, ref_y)

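                # Main ask/tell loop: ask() proposes the next candidate, the
                # objective runs the benchmark, and tell() updates the surrogate model.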
                for i in range(n_calls):
                    next_x = optimizer.ask()
                    LOGGER.info("next_x: %s", next_x)
                    LOGGER.info("Running performance evaluation.......")
                    next_y = objective(next_x)
                    LOGGER.info("next_y: %s", next_y)
                    ret = optimizer.tell(next_x, next_y)
                    LOGGER.info("finish (ref_x, ref_y) tell")

                    data = ""
                    for element in next_x:
                        data += str(element) + ","
                    data += str(abs(next_y))
                    utils.add_data_to_file(data, "a", self.project_name)

                utils.add_data_to_file("END", "a", self.project_name)

            elif self.engine == 'abtest':
                abtuning_manager = ABtestTuningManager(self.knobs, self.child_conn,
                                                       self.split_count)
                options, performance = abtuning_manager.do_abtest_tuning_abtest()
                params = abtuning_manager.get_best_params()
                # convert string option into index
                options = abtuning_manager.get_options_index(options)
            elif self.engine == 'lhs':
                knobsampling_manager = KnobSamplingManager(self.knobs, self.child_conn,
                                                           self.max_eval, self.split_count)
                options = knobsampling_manager.get_knob_samples()
                performance = knobsampling_manager.do_knob_sampling_test(options)
                params = knobsampling_manager.get_best_params(options, performance)
                options = knobsampling_manager.get_options_index(options)
            elif self.engine == 'tpe':
                tpe_opt = TPEOptimizer(self.knobs, self.child_conn, self.max_eval)
                best_params = tpe_opt.tpe_minimize_tuning()
                final_param = {}
                final_param["finished"] = True
                final_param["param"] = best_params
                self.child_conn.send(final_param)
                return best_params
            LOGGER.info("Minimization procedure has been completed.")
        except ValueError as value_error:
            LOGGER.error('Value Error: %s', repr(value_error))
            self.child_conn.send(value_error)
            return None
        except RuntimeError as runtime_error:
            LOGGER.error('Runtime Error: %s', repr(runtime_error))
            self.child_conn.send(runtime_error)
            return None
        except Exception as err:
            LOGGER.error('Unexpected Error: %s', repr(err))
            self.child_conn.send(Exception("Unexpected Error:", repr(err)))
            return None

        for i, knob in enumerate(self.knobs):
            if estimator is not None:
                params[knob['name']] = ret.x[i]
            labels.append(knob['name'])

        LOGGER.info("Optimized result: %s", params)
        LOGGER.info("The optimized profile has been generated.")
        final_param = {}
        if self.sel_feature is True:
            wefs = WeightedEnsembleFeatureSelector()
            rank = wefs.get_ensemble_feature_importance(options, performance, labels)
            final_param["rank"] = rank
            LOGGER.info("The feature importances of current evaluation are: %s", rank)

        final_param["param"] = params
        final_param["finished"] = True
        self.child_conn.send(final_param)
        return params

    def stop_process(self):
        """stop process"""
        self.child_conn.close()
        self.terminate()
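

# A minimal usage sketch (illustrative assumption only, not part of this
# module's API): the engine normally creates the process and drives the
# benchmark loop over the other end of the pipe. The knob definition, project
# name, and score value below are made up for illustration.
#
#     parent_conn, child_conn = multiprocessing.Pipe()
#     knobs = [{'name': 'vm.swappiness', 'type': 'continuous',
#               'dtype': 'int', 'range': [0, 100]}]
#     opt = Optimizer('tuning', knobs, child_conn, 'demo_project',
#                     engine='bayes', max_eval=10)
#     opt.start()
#     while True:
#         msg = parent_conn.recv()          # {'param': {...}} or an Exception
#         if isinstance(msg, Exception) or msg.get('finished'):
#             break
#         parent_conn.send('0.95')          # benchmark result, comma-separated floats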