#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Copyright (c) 2019 Huawei Technologies Co., Ltd.
# A-Tune is licensed under the Mulan PSL v2.
# You can use this software according to the terms and conditions of the Mulan PSL v2.
# You may obtain a copy of Mulan PSL v2 at:
#     http://license.coscl.org.cn/MulanPSL2
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
# PURPOSE.
# See the Mulan PSL v2 for more details.
# Create: 2019-10-29

"""
This class is used to find optimal settings and generate optimized profile.
"""

import collections
import collections.abc
import logging
import multiprocessing
import numbers

import numpy as np
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from skopt import Optimizer as baseOpt
from skopt.utils import normalize_dimensions

from analysis.optimizer.abtest_tuning_manager import ABtestTuningManager
from analysis.optimizer.knob_sampling_manager import KnobSamplingManager
from analysis.optimizer.tpe_optimizer import TPEOptimizer
from analysis.optimizer.weighted_ensemble_feature_selector import WeightedEnsembleFeatureSelector

LOGGER = logging.getLogger(__name__)


class Optimizer(multiprocessing.Process):
    """find optimal settings and generate optimized profile"""

    def __init__(self, name, params, child_conn, engine="bayes",
                 max_eval=50, x0=None, y0=None, n_random_starts=20, split_count=5):
        """Set up the tuning process.

        :param name: process name passed to multiprocessing.Process
        :param params: list of knob definition dicts to tune
        :param child_conn: pipe endpoint used to exchange params/results
        :param engine: tuning engine name (bayes/random/forest/gbrt/abtest/lhs/tpe)
        :param max_eval: maximum number of benchmark evaluations
        :param x0: optional list of reference configurations ("name=value" strings)
        :param y0: optional benchmark results matching x0
        :param n_random_starts: random exploration rounds before the model kicks in
        :param split_count: sample split count used by abtest/lhs engines
        """
        super().__init__(name=name)
        self.knobs = params
        self.child_conn = child_conn
        self.engine = engine
        self.max_eval = int(max_eval)
        self.split_count = split_count
        self.x0 = x0
        self.y0 = y0
        self._n_random_starts = n_random_starts if n_random_starts is not None else 20
        # A single x0 entry doubles as the reference configuration; transfer()
        # converts its "name=value" strings into typed values.
        self.ref = []
        if self.x0 is not None and len(self.x0) == 1:
            ref_x, _ = self.transfer()
            self.ref = ref_x[0]

    def build_space(self):
Z
Zhipeng Xie 已提交
58
        """build space"""
D
dogsheng 已提交
59
        objective_params_list = []
60
        for i, p_nob in enumerate(self.knobs):
Z
Zhipeng Xie 已提交
61
            if p_nob['type'] == 'discrete':
62
                items = self.handle_discrete_data(p_nob, i)
63
                objective_params_list.append(items)
Z
Zhipeng Xie 已提交
64 65
            elif p_nob['type'] == 'continuous':
                r_range = p_nob['range']
66 67
                if r_range is None or len(r_range) != 2:
                    raise ValueError("the item of the scope value of {} must be 2"
68
                                     .format(p_nob['name']))
69 70 71 72 73
                if p_nob['dtype'] == 'int':
                    try:
                        r_range[0] = int(r_range[0])
                        r_range[1] = int(r_range[1])
                    except ValueError:
74
                        raise ValueError("the range value of {} is not an integer value"
75 76 77 78 79 80
                                 .format(p_nob['name']))
                elif p_nob['dtype'] == 'float':
                    try:
                        r_range[0] = float(r_range[0])
                        r_range[1] = float(r_range[1])
                    except ValueError:
81
                        raise ValueError("the range value of {} is not an float value"
82 83
                                 .format(p_nob['name']))

84 85 86
                if len(self.ref) > 0:
                    if self.ref[i] < r_range[0] or self.ref[i] > r_range[1]:
                        raise ValueError("the ref value of {} is out of range".format(p_nob['name']))
Z
Zhipeng Xie 已提交
87
                objective_params_list.append((r_range[0], r_range[1]))
88 89
            else:
                raise ValueError("the type of {} is not supported".format(p_nob['name']))
D
dogsheng 已提交
90 91
        return objective_params_list

92
    def handle_discrete_data(self, p_nob, index):
93 94 95 96 97 98 99 100
        """handle discrete data"""
        if p_nob['dtype'] == 'int':
            items = p_nob['items']
            if items is None:
                items = []
            r_range = p_nob['range']
            step = 1
            if 'step' in p_nob.keys():
101
                step = 1 if p_nob['step'] < 1 else p_nob['step']
102
            if r_range is not None:
103 104
                length = len(r_range) if len(r_range) % 2 == 0 else len(r_range) - 1
                for i in range(0, length, 2):
105 106
                    items.extend(list(np.arange(r_range[i], r_range[i + 1] + 1, step=step)))
            items = list(set(items))
107 108 109 110 111 112 113 114
            if len(self.ref) > 0:
                try:
                    ref_value = int(self.ref[index])
                except ValueError:
                    raise ValueError("the ref value of {} is not an integer value"
                                     .format(p_nob['name']))
                if ref_value not in items:
                    items.append(ref_value)
115
            return items
116 117 118 119 120 121 122 123 124 125 126 127 128
        if p_nob['dtype'] == 'float':
            items = p_nob['items']
            if items is None:
                items = []
            r_range = p_nob['range']
            step = 0.1
            if 'step' in p_nob.keys():
                step = 0.1 if p_nob['step'] <= 0 else p_nob['step']
            if r_range is not None:
                length = len(r_range) if len(r_range) % 2 == 0 else len(r_range) - 1
                for i in range(0, length, 2):
                    items.extend(list(np.arange(r_range[i], r_range[i + 1], step=step)))
            items = list(set(items))
129 130 131 132 133 134 135 136
            if len(self.ref) > 0:
                try:
                    ref_value = float(self.ref[index])
                except ValueError:
                    raise ValueError("the ref value of {} is not a float value"
                                     .format(p_nob['name']))
                if ref_value not in items:
                    items.append(ref_value)
137
            return items
138 139
        if p_nob['dtype'] == 'string':
            items = p_nob['options']
140 141 142 143 144 145 146 147 148
            if len(self.ref) > 0:
                try:
                    ref_value = str(self.ref[index])
                except ValueError:
                    raise ValueError("the ref value of {} is not a string value"
                                     .format(p_nob['name']))
                if ref_value not in items:
                    items.append(ref_value)
            return items
149 150
        raise ValueError("the dtype of {} is not supported".format(p_nob['name']))

151 152 153 154 155 156 157
    @staticmethod
    def feature_importance(options, performance, labels):
        """Rank knob importance via Lasso regression on standardized options.

        :param options: 2-D array-like of evaluated option vectors
        :param performance: benchmark result per option vector
        :param labels: knob names matching the option columns
        :returns: comma-separated "name: percent%" string, largest first
        """
        scaled = StandardScaler().fit_transform(options)
        lasso = Lasso()
        lasso.fit(scaled, performance)
        total_sum = sum(map(abs, lasso.coef_))
        # All-zero coefficients: Lasso found no signal, report flat ranking.
        if total_sum == 0:
            return ", ".join("%s: 0" % label for label in labels)
        pairs = sorted(zip(lasso.coef_, labels), key=lambda item: -np.abs(item[0]))
        return ", ".join("%s: %s%%" % (label, round(coef * 100 / total_sum, 2))
                         for coef, label in pairs)

    def _get_value_from_knobs(self, kv):
167 168 169 170
        x_each = []
        for p_nob in self.knobs:
            if p_nob['name'] not in kv.keys():
                raise ValueError("the param {} is not in the x0 ref".format(p_nob['name']))
171
            if p_nob['dtype'] == 'int':
172
                x_each.append(int(kv[p_nob['name']]))
173 174 175 176
            elif p_nob['dtype'] == 'float':
                x_each.append(float(kv[p_nob['name']]))
            else:
                x_each.append(kv[p_nob['name']])
177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196
        return x_each

    def transfer(self):
        """transfer ref x0 to int, y0 to float"""
        list_ref_x = []
        list_ref_y = []
        if self.x0 is None or self.y0 is None:
            return (list_ref_x, list_ref_y)

        for xValue in self.x0:
            kv = {}
            if len(xValue) != len(self.knobs):
                raise ValueError("x0 is not the same length with knobs")

            for i, val in enumerate(xValue):
                params = val.split("=")
                if len(params) != 2:
                    raise ValueError("the param format of {} is not correct".format(params))
                kv[params[0]] = params[1]

197
            ref_x = self._get_value_from_knobs(kv)
198 199 200 201 202 203
            if len(ref_x) != len(self.knobs):
                raise ValueError("tuning parameter is not the same length with knobs")
            list_ref_x.append(ref_x)
        list_ref_y = [float(y) for y in self.y0]
        return (list_ref_x, list_ref_y)

D
dogsheng 已提交
204
    def run(self):
        """Start the tuning process.

        Dispatches on self.engine: skopt-based engines (random/forest/gbrt/
        bayes) run an ask/tell loop; abtest/lhs/tpe delegate to their
        managers. Candidate parameters are sent to the parent over
        self.child_conn and benchmark results are read back from it; the
        final message carries the best params plus a feature-importance rank.

        Fix: ``collections.Iterable`` was removed in Python 3.10 — use
        ``collections.abc.Iterable`` instead.

        :returns: best params dict, or None if an error was sent to the parent
        """
        def objective(var):
            """objective method receive the benchmark result and send the next parameters"""
            iter_result = {}
            option = []
            for i, knob in enumerate(self.knobs):
                params[knob['name']] = var[i]
                # record string options by index so they can be used numerically
                if knob['dtype'] == 'string':
                    option.append(knob['options'].index(var[i]))
                else:
                    option.append(var[i])

            iter_result["param"] = params
            self.child_conn.send(iter_result)
            result = self.child_conn.recv()
            # benchmark result is a comma-separated list of numbers; sum them
            x_num = 0.0
            eval_list = result.split(',')
            for value in eval_list:
                num = float(value)
                x_num = x_num + num
            options.append(option)
            performance.append(x_num)
            return x_num

        params = {}
        options = []
        performance = []
        labels = []
        # 'DIY' marks the engines that pick best params themselves (abtest/lhs)
        estimator = 'DIY'
        try:
            if self.engine == 'random' or self.engine == 'forest' or \
                    self.engine == 'gbrt' or self.engine == 'bayes':
                params_space = self.build_space()
                ref_x, ref_y = self.transfer()
                if len(ref_x) == 0:
                    if len(self.ref) == 0:
                        ref_x = None
                    else:
                        ref_x = self.ref
                    ref_y = None
                # normalize a single point into a list of points
                if ref_x is not None and not isinstance(ref_x[0], (list, tuple)):
                    ref_x = [ref_x]
                LOGGER.info('x0: %s', ref_x)
                LOGGER.info('y0: %s', ref_y)

                # shrink the random-exploration budget by the suggested points
                if ref_x is not None and isinstance(ref_x[0], (list, tuple)):
                    self._n_random_starts = 0 if len(ref_x) >= self._n_random_starts \
                            else self._n_random_starts - len(ref_x) + 1

                LOGGER.info('n_random_starts parameter is: %d', self._n_random_starts)
                LOGGER.info("Running performance evaluation.......")
                if self.engine == 'random':
                    estimator = 'dummy'
                elif self.engine == 'forest':
                    estimator = 'RF'
                elif self.engine == 'gbrt':
                    estimator = 'GBRT'
                elif self.engine == 'bayes':
                    estimator = 'GP'
                    # GP needs dimensions normalized to a unit hypercube
                    params_space = normalize_dimensions(params_space)

                LOGGER.info("base_estimator is: %s", estimator)
                optimizer = baseOpt(
                    dimensions=params_space,
                    n_random_starts=self._n_random_starts,
                    random_state=1,
                    base_estimator=estimator
                )
                n_calls = self.max_eval
                # User suggested points at which to evaluate the objective first
                if ref_x and ref_y is None:
                    ref_y = list(map(objective, ref_x))
                    LOGGER.info("ref_y is: %s", ref_y)
                    n_calls -= len(ref_y)

                # Pass user suggested initialisation points to the optimizer
                if ref_x:
                    if not (isinstance(ref_y, collections.abc.Iterable) or
                            isinstance(ref_y, numbers.Number)):
                        raise ValueError(
                            "`ref_y` should be an iterable or a scalar, got %s" % type(ref_y))
                    if len(ref_x) != len(ref_y):
                        raise ValueError("`ref_x` and `ref_y` should have the same length")
                    LOGGER.info("ref_x: %s", ref_x)
                    LOGGER.info("ref_y: %s", ref_y)
                    ret = optimizer.tell(ref_x, ref_y)

                for i in range(n_calls):
                    next_x = optimizer.ask()
                    LOGGER.info("next_x: %s", next_x)
                    LOGGER.info("Running performance evaluation.......")
                    next_y = objective(next_x)
                    LOGGER.info("next_y: %s", next_y)
                    ret = optimizer.tell(next_x, next_y)
                    LOGGER.info("finish (ref_x, ref_y) tell")
            elif self.engine == 'abtest':
                abtuning_manager = ABtestTuningManager(self.knobs, self.child_conn, self.split_count)
                options, performance = abtuning_manager.do_abtest_tuning_abtest()
                params = abtuning_manager.get_best_params()
                options = abtuning_manager.get_options_index(options) # convert string option into index
            elif self.engine == 'lhs':
                knobsampling_manager = KnobSamplingManager(self.knobs, self.child_conn, self.max_eval, self.split_count)
                options = knobsampling_manager.get_knob_samples()
                performance = knobsampling_manager.do_knob_sampling_test(options)
                params = knobsampling_manager.get_best_params(options, performance)
                options = knobsampling_manager.get_options_index(options)
            elif self.engine == 'tpe':
                # TPE computes its own ranking; report and return immediately
                tpe_opt = TPEOptimizer(self.knobs, self.child_conn, self.max_eval)
                best_params = tpe_opt.tpe_minimize_tuning()
                final_param = {}
                final_param["finished"] = True
                final_param["param"] = best_params
                self.child_conn.send(final_param)
                return best_params
            LOGGER.info("Minimization procedure has been completed.")
        except ValueError as value_error:
            LOGGER.error('Value Error: %s', repr(value_error))
            self.child_conn.send(value_error)
            return None
        except RuntimeError as runtime_error:
            LOGGER.error('Runtime Error: %s', repr(runtime_error))
            self.child_conn.send(runtime_error)
            return None
        except Exception as e:
            # broad catch is deliberate: the parent must always get a message,
            # otherwise it would block forever on recv()
            LOGGER.error('Unexpected Error: %s', repr(e))
            self.child_conn.send(Exception("Unexpected Error:", repr(e)))
            return None

        for i, knob in enumerate(self.knobs):
            # skopt engines: take the best point from the final tell() result
            if estimator != "DIY":
                params[knob['name']] = ret.x[i]
            labels.append(knob['name'])

        LOGGER.info("Optimized result: %s", params)
        LOGGER.info("The optimized profile has been generated.")
        final_param = {}
        wefs = WeightedEnsembleFeatureSelector()
        rank = wefs.get_ensemble_feature_importance(options, performance, labels)

        final_param["param"] = params
        final_param["rank"] = rank
        final_param["finished"] = True
        self.child_conn.send(final_param)
        LOGGER.info("The feature importances of current evaluation are: %s", rank)
        return params

    def stop_process(self):
        """stop process"""
        # Release this side of the IPC pipe first, then terminate the
        # tuning process itself (multiprocessing.Process.terminate).
        self.child_conn.close()
        self.terminate()