#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Copyright (c) 2019 Huawei Technologies Co., Ltd.
# A-Tune is licensed under the Mulan PSL v2.
# You can use this software according to the terms and conditions of the Mulan PSL v2.
# You may obtain a copy of Mulan PSL v2 at:
#     http://license.coscl.org.cn/MulanPSL2
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
# PURPOSE.
# See the Mulan PSL v2 for more details.
# Create: 2019-10-29

"""
This class is used to find optimal settings and generate an optimized profile.
"""

import logging
import numbers
import multiprocessing
import collections.abc

import numpy as np
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

from skopt import Optimizer as baseOpt
from skopt.utils import normalize_dimensions

from analysis.optimizer.abtest_tuning_manager import ABtestTuningManager
from analysis.optimizer.knob_sampling_manager import KnobSamplingManager
from analysis.optimizer.tpe_optimizer import TPEOptimizer
from analysis.optimizer.weighted_ensemble_feature_selector import WeightedEnsembleFeatureSelector

LOGGER = logging.getLogger(__name__)
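
# Minimal usage sketch for the driver side (an illustration, not part of this
# module; the knob definition and run_benchmark() below are hypothetical):
#
#     parent_conn, child_conn = multiprocessing.Pipe()
#     knobs = [{'name': 'vm.swappiness', 'type': 'continuous', 'dtype': 'int',
#               'range': [0, 100], 'ref': 60}]
#     opt = Optimizer("optimizer", knobs, child_conn, engine="bayes", max_eval=10)
#     opt.start()
#     while True:
#         msg = parent_conn.recv()
#         if isinstance(msg, Exception) or msg.get("finished"):
#             break
#         parent_conn.send(str(run_benchmark(msg["param"])))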


class Optimizer(multiprocessing.Process):
    """find optimal settings and generate optimized profile"""

    def __init__(self, name, params, child_conn, engine="bayes",
                 max_eval=50, x0=None, y0=None, n_random_starts=20, split_count=5):
        super(Optimizer, self).__init__(name=name)
        self.knobs = params
        self.child_conn = child_conn
        self.engine = engine
        self.max_eval = int(max_eval)
        self.split_count = split_count
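        # encoded baseline ("ref") value of every knob, filled in by build_space()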
        self.ref = []
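        # x0/y0: optional warm-start data, i.e. previously evaluated parameter
        # sets and the benchmark scores they produced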
        self.x0 = x0
        self.y0 = y0
        self._n_random_starts = 20 if n_random_starts is None else n_random_starts

    def build_space(self):
        """build space"""
        objective_params_list = []
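        # translate every knob definition into a search dimension: a list of
        # candidate items for discrete knobs, a (low, high) pair for continuous ones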
        for p_nob in self.knobs:
            if p_nob['type'] == 'discrete':
                items = self.handle_discrete_data(p_nob)
                objective_params_list.append(items)
            elif p_nob['type'] == 'continuous':
                r_range = p_nob['range']
                if r_range is None or len(r_range) != 2:
                    raise ValueError("the range of {} must contain exactly two values"
                                     .format(p_nob['name']))
                if p_nob['dtype'] == 'int':
                    try:
                        ref_value = int(p_nob['ref'])
                        r_range[0] = int(r_range[0])
                        r_range[1] = int(r_range[1])
                    except ValueError:
                        raise ValueError("the ref value of {} is not an integer value"
                                         .format(p_nob['name']))
                elif p_nob['dtype'] == 'float':
                    try:
                        ref_value = float(p_nob['ref'])
                        r_range[0] = float(r_range[0])
                        r_range[1] = float(r_range[1])
                    except ValueError:
                        raise ValueError("the ref value of {} is not a float value"
                                         .format(p_nob['name']))
                else:
                    raise ValueError("the dtype of {} is not supported"
                                     .format(p_nob['name']))

                if ref_value < r_range[0] or ref_value > r_range[1]:
                    raise ValueError("the ref value of {} is out of range".format(p_nob['name']))
                self.ref.append(ref_value)
                objective_params_list.append((r_range[0], r_range[1]))
            else:
                raise ValueError("the type of {} is not supported".format(p_nob['name']))
        return objective_params_list

    def handle_discrete_data(self, p_nob):
        """handle discrete data"""
        if p_nob['dtype'] == 'int':
            items = p_nob['items']
            if items is None:
                items = []
            r_range = p_nob['range']
            step = 1
            if 'step' in p_nob.keys():
                step = 1 if p_nob['step'] < 1 else p_nob['step']
            if r_range is not None:
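                # 'range' is a flat list of (start, end) pairs; a trailing
                # unpaired value is ignored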
                length = len(r_range) if len(r_range) % 2 == 0 else len(r_range) - 1
                for i in range(0, length, 2):
                    items.extend(list(np.arange(r_range[i], r_range[i + 1] + 1, step=step)))
            items = list(set(items))
            try:
                ref_value = int(p_nob['ref'])
            except ValueError:
                raise ValueError("the ref value of {} is not an integer value"
                                 .format(p_nob['name']))
            if ref_value not in items:
                items.append(ref_value)
            self.ref.append(ref_value)
            return items
        if p_nob['dtype'] == 'float':
            items = p_nob['items']
            if items is None:
                items = []
            r_range = p_nob['range']
            step = 0.1
            if 'step' in p_nob.keys():
                step = 0.1 if p_nob['step'] <= 0 else p_nob['step']
            if r_range is not None:
                length = len(r_range) if len(r_range) % 2 == 0 else len(r_range) - 1
                for i in range(0, length, 2):
                    items.extend(list(np.arange(r_range[i], r_range[i + 1], step=step)))
            items = list(set(items))
            try:
                ref_value = float(p_nob['ref'])
            except ValueError:
                raise ValueError("the ref value of {} is not a float value"
                                 .format(p_nob['name']))
            if ref_value not in items:
                items.append(ref_value)
            self.ref.append(ref_value)
            return items
        if p_nob['dtype'] == 'string':
            items = p_nob['options']
            keys = []
            length = len(self.ref)
            for key, value in enumerate(items):
                keys.append(key)
                if p_nob['ref'] == value:
                    self.ref.append(key)
            if len(self.ref) == length:
                raise ValueError("the ref value of {} is not in the options"
                                 .format(p_nob['name']))
            return keys
        raise ValueError("the dtype of {} is not supported".format(p_nob['name']))

    @staticmethod
    def feature_importance(options, performance, labels):
        """feature importance"""
        options = StandardScaler().fit_transform(options)
        lasso = Lasso()
        lasso.fit(options, performance)
        result = zip(lasso.coef_, labels)
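        # rank knobs by the absolute value of their Lasso coefficients and
        # report each knob's share of the total as a percentage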
        total_sum = sum(map(abs, lasso.coef_))
        if total_sum == 0:
            return ", ".join("%s: 0" % label for label in labels)
        result = sorted(result, key=lambda x: -np.abs(x[0]))
        rank = ", ".join("%s: %s%%" % (label, round(coef * 100 / total_sum, 2))
                         for coef, label in result)
        return rank

    def _get_intvalue_from_knobs(self, kv):
        """get the int value from knobs if dtype is string"""
        x_each = []
        for p_nob in self.knobs:
            if p_nob['name'] not in kv.keys():
                raise ValueError("the param {} is not in the x0 ref".format(p_nob['name']))
            if p_nob['dtype'] != 'string':
                x_each.append(int(kv[p_nob['name']]))
                continue
            options = p_nob['options']
            for key, value in enumerate(options):
                if value != kv[p_nob['name']]:
                    continue
                x_each.append(key)
        return x_each

    def transfer(self):
        """transfer ref x0 to int, y0 to float"""
        list_ref_x = []
        list_ref_y = []
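        # each x0 entry is a list of "name=value" strings, one per knob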
        if self.x0 is None or self.y0 is None:
            return (list_ref_x, list_ref_y)

        for x_value in self.x0:
            kv = {}
            if len(x_value) != len(self.knobs):
                raise ValueError("each x0 entry must have the same length as knobs")

            for i, val in enumerate(xValue):
                params = val.split("=")
                if len(params) != 2:
                    raise ValueError("the param format of {} is not correct".format(params))
                kv[params[0]] = params[1]

            ref_x = self._get_intvalue_from_knobs(kv)
            if len(ref_x) != len(self.knobs):
                raise ValueError("tuning parameters must have the same length as knobs")
            list_ref_x.append(ref_x)
        list_ref_y = [float(y) for y in self.y0]
        return (list_ref_x, list_ref_y)

    def run(self):
        """start the tuning process"""
        def objective(var):
            """send the next candidate parameters and receive the benchmark result"""
            # protocol: send a dict holding the candidate parameters over
            # child_conn, then block until the caller sends back the benchmark
            # result as a comma-separated string of numbers
            iter_result = {}
            for i, knob in enumerate(self.knobs):
                if knob['dtype'] == 'string':
                    params[knob['name']] = knob['options'][var[i]]
                else:
                    params[knob['name']] = var[i]

            iter_result["param"] = params
            self.child_conn.send(iter_result)
            result = self.child_conn.recv()
            x_num = 0.0
            eval_list = result.split(',')
            for value in eval_list:
                num = float(value)
                x_num = x_num + num
            options.append(var)
            performance.append(x_num)
            return x_num

        params = {}
        options = []
        performance = []
        labels = []
        estimator = 'DIY'
        try:
            if self.engine in ('random', 'forest', 'gbrt', 'bayes'):
                params_space = self.build_space()
                ref_x, ref_y = self.transfer()
                if len(ref_x) == 0:
                    ref_x = self.ref
                    ref_y = None
                if not isinstance(ref_x[0], (list, tuple)):
                    ref_x = [ref_x]

                LOGGER.info('x0: %s', ref_x)
                LOGGER.info('y0: %s', ref_y)

                # shrink the random-start budget when warm-start points are supplied
                if ref_x is not None and isinstance(ref_x[0], (list, tuple)):
                    self._n_random_starts = 0 if len(ref_x) >= self._n_random_starts \
                            else self._n_random_starts - len(ref_x) + 1

                LOGGER.info('n_random_starts parameter is: %d', self._n_random_starts)
                LOGGER.info("Running performance evaluation.......")
                if self.engine == 'random':
                    estimator = 'dummy'
                elif self.engine == 'forest':
                    estimator = 'RF'
                elif self.engine == 'gbrt':
                    estimator = 'GBRT'
                elif self.engine == 'bayes':
                    estimator = 'GP'
                    params_space = normalize_dimensions(params_space)

                LOGGER.info("base_estimator is: %s", estimator)
                optimizer = baseOpt(
                    dimensions=params_space,
                    n_random_starts=self._n_random_starts,
                    random_state=1,
                    base_estimator=estimator
                )
                n_calls = self.max_eval

                if not isinstance(ref_x[0], (list, tuple)):
                    ref_x = [ref_x]

                # User suggested points at which to evaluate the objective first
                if ref_x and ref_y is None:
                    ref_y = list(map(objective, ref_x))
                    n_calls -= len(ref_y)

                # Pass user suggested initialisation points to the optimizer
                if ref_x:
                    if not isinstance(ref_y, (collections.abc.Iterable, numbers.Number)):
                        raise ValueError("`ref_y` should be an iterable or a scalar, got %s"
                                         % type(ref_y))
                    if len(ref_x) != len(ref_y):
                        raise ValueError("`ref_x` and `ref_y` should have the same length")
                    LOGGER.info("ref_x: %s", ref_x)
                    LOGGER.info("ref_y: %s", ref_y)
                    ret = optimizer.tell(ref_x, ref_y)

                # main ask/tell loop: ask for the next candidate, benchmark it
                # via objective(), then feed the observation back
                for i in range(n_calls):
                    next_x = optimizer.ask()
                    LOGGER.info("next_x: %s", next_x)
                    LOGGER.info("Running performance evaluation.......")
                    next_y = objective(next_x)
                    LOGGER.info("next_y: %s", next_y)
                    ret = optimizer.tell(next_x, next_y)
                    LOGGER.info("finished telling (next_x, next_y) to the optimizer")
            elif self.engine == 'abtest':
                abtuning_manager = ABtestTuningManager(self.knobs, self.child_conn,
                                                       self.split_count)
                options, performance = abtuning_manager.do_abtest_tuning_abtest()
                params = abtuning_manager.get_best_params()
                # convert string options into index values
                options = abtuning_manager.get_options_index(options)
            elif self.engine == 'lhs':
                knobsampling_manager = KnobSamplingManager(self.knobs, self.child_conn,
                                                           self.max_eval, self.split_count)
                options = knobsampling_manager.get_knob_samples()
                performance = knobsampling_manager.do_knob_sampling_test(options)
                params = knobsampling_manager.get_best_params(options, performance)
                options = knobsampling_manager.get_options_index(options)
            elif self.engine == 'tpe':
                tpe_opt = TPEOptimizer(self.knobs, self.child_conn, self.max_eval)
                best_params = tpe_opt.tpe_minimize_tuning()
                final_param = {}
                final_param["finished"] = True
                final_param["param"] = best_params
                self.child_conn.send(final_param)
                return best_params
            LOGGER.info("Minimization procedure has been completed.")
        except ValueError as value_error:
            LOGGER.error('Value Error: %s', repr(value_error))
            self.child_conn.send(value_error)
            return None
        except RuntimeError as runtime_error:
            LOGGER.error('Runtime Error: %s', repr(runtime_error))
            self.child_conn.send(runtime_error)
            return None
        except Exception as e:
            LOGGER.error('Unexpected Error: %s', repr(e))
            self.child_conn.send(Exception("Unexpected Error:", repr(e)))
            return None

        for i, knob in enumerate(self.knobs):
            if estimator != "DIY":
                if knob['dtype'] == 'string':
                    params[knob['name']] = knob['options'][ret.x[i]]
                else:
                    params[knob['name']] = ret.x[i]
            labels.append(knob['name'])

        LOGGER.info("Optimized result: %s", params)
        LOGGER.info("The optimized profile has been generated.")
        final_param = {}
        wefs = WeightedEnsembleFeatureSelector()
        rank = wefs.get_ensemble_feature_importance(options, performance, labels)

        final_param["param"] = params
        final_param["rank"] = rank
        final_param["finished"] = True
        self.child_conn.send(final_param)
        LOGGER.info("The feature importances of current evaluation are: %s", rank)
        return params

    def stop_process(self):
        """stop process"""
        self.child_conn.close()
        self.terminate()