#! /usr/bin/env python
import os
import re
import sys
import time
import random
import logging
import tempfile
import subprocess
import shutil
import argparse

# params override priority (sources further to the right win):
#   for default:
#       default_params < blackbox|whitebox_default_params < args
#   for simple:
#       simple_default_params < blackbox|whitebox_simple_default_params < args


# Baseline db_stress flags for the default (non --simple) configuration.
# Each key becomes a --key=value flag. A callable value is invoked by
# gen_cmd() every time a command line is built, so those flags are
# re-randomised for each db_stress run.
default_params = dict(
    block_size=16384,
    cache_size=1048576,
    delpercent=5,
    destroy_db_initially=0,
    disable_data_sync=0,
    disable_wal=0,
    filter_deletes=lambda: random.randint(0, 1),
    iterpercent=10,
    max_background_compactions=20,
    max_bytes_for_level_base=10485760,
    max_key=100000000,
    max_write_buffer_number=3,
    memtablerep="prefix_hash",
    mmap_read=lambda: random.randint(0, 1),
    open_files=500000,
    prefix_size=7,
    prefixpercent=5,
    progress_reports=0,
    readpercent=45,
    reopen=20,
    sync=0,
    target_file_size_base=2097152,
    target_file_size_multiplier=2,
    threads=32,
    verify_checksum=1,
    write_buffer_size=4 * 1024 * 1024,
    writepercent=35,
)


def get_dbname(test_name):
    """Return the database directory path to use for ``test_name``.

    When the TEST_TMPDIR environment variable is unset or empty, a new,
    uniquely named directory is created with tempfile.mkdtemp() (prefix
    ``rocksdb_crashtest_<test_name>``) and its path is returned.

    Otherwise the path is ``<TEST_TMPDIR>/rocksdb_crashtest_<test_name>``.
    Anything already present at that path is removed (errors are ignored)
    and the directory is not recreated here.

    Note that every call has a filesystem side effect, so callers that need
    a stable path must call this once and keep the result.
    """
    test_tmpdir = os.environ.get("TEST_TMPDIR")
    if not test_tmpdir:
        return tempfile.mkdtemp(prefix='rocksdb_crashtest_' + test_name)

    dbname = os.path.join(test_tmpdir, "rocksdb_crashtest_" + test_name)
    # Start from a clean slate; ignore_errors covers the "does not exist" case.
    shutil.rmtree(dbname, True)
    return dbname

# Blackbox-mode settings layered on top of default_params.
blackbox_default_params = dict(
    db=lambda: get_dbname('blackbox'),
    # overall time budget for the script's run/kill loop
    duration=6000,
    # how long a single db_stress instance runs before it is killed
    interval=120,
    # the process gets killed anyway, so make ops_per_thread very large
    ops_per_thread=100000000,
    set_options_one_in=10000,
    test_batches_snapshots=1,
)

# Whitebox-mode settings layered on top of default_params.
whitebox_default_params = dict(
    db=lambda: get_dbname('whitebox'),
    duration=10000,
    log2_keys_per_lock=10,
    nooverwritepercent=1,
    ops_per_thread=200000,
    # chosen afresh (0 or 1) each time the value is evaluated
    test_batches_snapshots=lambda: random.randint(0, 1),
    write_buffer_size=4 * 1024 * 1024,
)

# Baseline db_stress flags used when --simple is given. Callable values are
# evaluated by gen_cmd() each time a command line is built.
simple_default_params = dict(
    block_size=16384,
    cache_size=1048576,
    column_families=1,
    delpercent=5,
    destroy_db_initially=0,
    disable_data_sync=0,
    disable_wal=0,
    filter_deletes=lambda: random.randint(0, 1),
    iterpercent=10,
    max_background_compactions=1,
    max_bytes_for_level_base=67108864,
    max_key=100000000,
    max_write_buffer_number=3,
    memtablerep="skip_list",
    mmap_read=lambda: random.randint(0, 1),
    prefix_size=0,
    prefixpercent=0,
    progress_reports=0,
    readpercent=50,
    reopen=20,
    sync=0,
    target_file_size_base=16777216,
    target_file_size_multiplier=1,
    test_batches_snapshots=0,
    threads=32,
    verify_checksum=1,
    write_buffer_size=32 * 1024 * 1024,
    writepercent=35,
)

# Blackbox-mode settings layered on top of simple_default_params.
blackbox_simple_default_params = dict(
    db=lambda: get_dbname('blackbox'),
    duration=6000,
    interval=120,
    open_files=-1,
    ops_per_thread=100000000,
    set_options_one_in=0,
    test_batches_snapshots=0,
)

# Whitebox-mode settings layered on top of simple_default_params.
whitebox_simple_default_params = dict(
    db=lambda: get_dbname('whitebox'),
    duration=10000,
    log2_keys_per_lock=10,
    nooverwritepercent=1,
    open_files=500000,
    ops_per_thread=200000,
    write_buffer_size=32 * 1024 * 1024,
)


def gen_cmd_params(args):
    """Build the effective parameter dict for one crash-test invocation.

    Sources are layered, lowest priority first:

    1. ``simple_default_params`` if ``args.simple`` is true, otherwise
       ``default_params``;
    2. the blackbox/whitebox table matching ``args.test_type`` (nothing is
       added for any other test type);
    3. every attribute of ``args`` whose value is not None (this includes
       ``test_type`` and ``simple`` themselves).

    Values may still be callables; gen_cmd() evaluates those per command.
    The one exception is 'db': its default is a callable wrapping
    get_dbname(), which creates or wipes a directory on every call. It is
    resolved here exactly once so that all db_stress runs share a single
    database directory and callers that pass params['db'] to
    shutil.rmtree() hand it a path rather than a function object.

    Args:
        args: argparse-style namespace; read through vars(args).

    Returns:
        A new dict mapping parameter name to value (or to a callable
        producing the value).
    """
    params = {}

    if args.simple:
        params.update(simple_default_params)
        per_type = {
            'blackbox': blackbox_simple_default_params,
            'whitebox': whitebox_simple_default_params,
        }
    else:
        params.update(default_params)
        per_type = {
            'blackbox': blackbox_default_params,
            'whitebox': whitebox_default_params,
        }
    params.update(per_type.get(args.test_type, {}))

    # Explicit command-line values win over every default.
    params.update((k, v) for k, v in vars(args).items() if v is not None)

    db = params.get('db')
    if callable(db):
        params['db'] = db()
    return params


def gen_cmd(params):
    """Render ``params`` as a ``./db_stress`` shell command line.

    Every entry becomes ``--name=value`` except:

    * 'test_type', 'simple', 'duration' and 'interval', which are consumed
      by this script and never passed to db_stress;
    * entries whose value is None.

    A callable value is invoked (with no arguments) and its result is used,
    so such parameters are re-evaluated on every call.

    Args:
        params: dict mapping flag name to a value or a zero-argument
            callable.

    Returns:
        The command string; ``'./db_stress '`` when no flags remain.
    """
    # Built once per call rather than once per parameter.
    script_only = frozenset(['test_type', 'simple', 'duration', 'interval'])

    flags = []
    for name, value in params.items():
        if name in script_only or value is None:
            continue
        if callable(value):
            value = value()
        flags.append('--{0}={1}'.format(name, value))
    return './db_stress ' + ' '.join(flags)


# This script runs and kills db_stress multiple times. It checks consistency
# in case of unsafe crashes in RocksDB.
def blackbox_crash_main(args):
    """Run db_stress in a loop, killing each instance after a fixed interval.

    Until cmd_params['duration'] seconds have elapsed, a db_stress process
    is started, allowed to run for up to cmd_params['interval'] seconds and
    then killed (unless it already exited, which only produces a warning).
    Any non-blank line the process wrote to stderr is echoed and treated as
    a failure: the script then exits with status 2 and leaves the database
    directory in place. If the loop finishes without errors the database
    directory is removed.

    Args:
        args: parsed command-line namespace, forwarded to gen_cmd_params().
    """
    cmd_params = gen_cmd_params(args)

    # The default 'db' entry is a callable wrapping get_dbname(). Resolve it
    # once so every run reopens the same directory and the final rmtree()
    # receives a path instead of a function object.
    if callable(cmd_params.get('db')):
        cmd_params['db'] = cmd_params['db']()

    exit_time = time.time() + cmd_params['duration']

    print("Running blackbox-crash-test with \n"
          + "interval_between_crash=" + str(cmd_params['interval']) + "\n"
          + "total-duration=" + str(cmd_params['duration']) + "\n"
          + "threads=" + str(cmd_params['threads']) + "\n"
          + "ops_per_thread=" + str(cmd_params['ops_per_thread']) + "\n"
          + "write_buffer_size=" + str(cmd_params['write_buffer_size']) + "\n")

    while time.time() < exit_time:
        killtime = time.time() + cmd_params['interval']

        cmd = gen_cmd(cmd_params)

        # universal_newlines gives text (not bytes) on stderr, so the string
        # handling below behaves the same on Python 2 and Python 3.
        child = subprocess.Popen(cmd, stderr=subprocess.PIPE, shell=True,
                                 universal_newlines=True)
        print("Running db_stress with pid=%d: %s\n\n"
              % (child.pid, cmd))

        # Wait, polling once a second, until the interval elapses or the
        # child exits on its own.
        while time.time() < killtime and child.poll() is None:
            time.sleep(1)

        if child.poll() is not None:
            print("WARNING: db_stress ended before kill: exitcode=%d\n"
                  % child.returncode)
        else:
            child.kill()
            print("KILLED %d\n" % child.pid)
            time.sleep(1)  # time to stabilize after a kill

        # Drain stderr to EOF. Blank lines are skipped rather than treated
        # as end of output, so errors after a blank line are not missed.
        run_had_errors = False
        for raw_line in child.stderr:
            line = raw_line.strip()
            if line:
                run_had_errors = True
                print('***' + line + '^')
        child.stderr.close()
        child.wait()  # reap the child so it does not linger as a zombie

        if run_had_errors:
            sys.exit(2)

        time.sleep(1)  # time to stabilize before the next run

    # we need to clean up after ourselves -- only do this on test success
    shutil.rmtree(cmd_params['db'], True)


# This python script runs db_stress multiple times. Some runs with
# kill_random_test that causes rocksdb to crash at various points in code.
def whitebox_crash_main(args):
    """Run db_stress repeatedly, alternating kill-point and normal runs.

    Runs until cmd_params['duration'] seconds have elapsed. Each iteration
    picks extra options from the current check mode:

    * mode 0: kill test (``kill_random_test`` set), alternating between the
      plain setting and a more frequent one that blacklists the
      WritableFileWriter append/write-buffered kill points;
    * mode 1: normal run with compaction_style=1 (universal);
    * mode 2: normal run with compaction_style=2 (FIFO), a fifth of the ops;
    * mode 3: normal run.

    During the first half of the duration only mode 0 is used. Afterwards,
    the database directory is removed and the mode advances after every
    iteration.

    The script exits with status 1 when the exit code does not match the
    expectation (0 for a normal run, negative for a kill run) and with
    status 2 when the captured output mentions 'error' (other than
    'got errors 0 times') or 'fail'.

    Args:
        args: parsed command-line namespace, forwarded to gen_cmd_params().
    """
    cmd_params = gen_cmd_params(args)

    # The default 'db' entry is a callable wrapping get_dbname(). Resolve it
    # once so every run reopens the same directory and rmtree() below
    # receives a path instead of a function object.
    if callable(cmd_params.get('db')):
        cmd_params['db'] = cmd_params['db']()

    cur_time = time.time()
    exit_time = cur_time + cmd_params['duration']
    half_time = cur_time + cmd_params['duration'] // 2

    print("Running whitebox-crash-test with \n"
          + "total-duration=" + str(cmd_params['duration']) + "\n"
          + "threads=" + str(cmd_params['threads']) + "\n"
          + "ops_per_thread=" + str(cmd_params['ops_per_thread']) + "\n"
          + "write_buffer_size=" + str(cmd_params['write_buffer_size']) + "\n")

    total_check_mode = 4
    check_mode = 0
    kill_random_test = 97
    kill_mode = 0

    while time.time() < exit_time:
        if check_mode == 0:
            additional_opts = {
                # use large ops per thread since we will kill it anyway
                "ops_per_thread": 100 * cmd_params['ops_per_thread'],
            }
            # run with kill_random_test
            if kill_mode == 0:
                additional_opts.update({
                    "kill_random_test": kill_random_test,
                })
            elif kill_mode == 1:
                additional_opts.update({
                    # floor division keeps the flag value an integer
                    "kill_random_test": (kill_random_test // 3 + 1),
                    "kill_prefix_blacklist": "WritableFileWriter::Append,"
                    + "WritableFileWriter::WriteBuffered",
                })

            # Run kill mode 0 and 1 by turn.
            kill_mode = (kill_mode + 1) % 2
        elif check_mode == 1:
            # normal run with universal compaction mode
            additional_opts = {
                "kill_random_test": None,
                "ops_per_thread": cmd_params['ops_per_thread'],
                "compaction_style": 1,
            }
        elif check_mode == 2:
            # normal run with FIFO compaction mode
            # ops_per_thread is divided by 5 because FIFO compaction
            # style is quite a bit slower on reads with lot of files
            additional_opts = {
                "kill_random_test": None,
                "ops_per_thread": cmd_params['ops_per_thread'] // 5,
                "compaction_style": 2,
            }
        else:
            # normal run
            additional_opts = {
                "kill_random_test": None,
                "ops_per_thread": cmd_params['ops_per_thread'],
            }

        # Per-run options override the base parameters.
        run_params = dict(cmd_params)
        run_params.update(additional_opts)
        cmd = gen_cmd(run_params)

        print("Running:" + cmd + "\n")

        # stderr is folded into stdout; universal_newlines yields text so
        # the substring checks below work on Python 2 and Python 3 alike.
        popen = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT,
                                 shell=True, universal_newlines=True)
        stdoutdata, _ = popen.communicate()
        retncode = popen.returncode
        kill_option = additional_opts['kill_random_test']
        msg = ("check_mode={0}, kill option={1}, exitcode={2}\n".format(
               check_mode, kill_option, retncode))
        print(msg)
        print(stdoutdata)

        if kill_option is None:
            # we expect zero retncode if no kill option
            expected = (retncode == 0)
        else:
            # we expect negative retncode if kill option was given
            expected = (retncode < 0)

        if not expected:
            print("TEST FAILED. See kill option and exit code above!!!\n")
            sys.exit(1)

        stdoutdata = stdoutdata.lower()
        # 'got errors 0 times' is a success message that contains 'error'.
        errorcount = (stdoutdata.count('error') -
                      stdoutdata.count('got errors 0 times'))
        print("#times error occurred in output is " + str(errorcount) + "\n")

        if errorcount > 0:
            print("TEST FAILED. Output has 'error'!!!\n")
            sys.exit(2)
        if 'fail' in stdoutdata:
            print("TEST FAILED. Output has 'fail'!!!\n")
            sys.exit(2)

        # First half of the duration, keep doing kill test. For the next half,
        # try different modes.
        if time.time() > half_time:
            # we need to clean up after ourselves -- only do this on test
            # success
            shutil.rmtree(cmd_params['db'], True)
            check_mode = (check_mode + 1) % total_check_mode

        time.sleep(1)  # time to stabilize after a kill


def main():
    """Parse the command line and run the selected crash test.

    The positional ``test_type`` selects blackbox or whitebox mode and
    ``--simple`` selects the simple parameter tables. In addition, every key
    found in any of the default parameter tables is exposed as an optional
    ``--<key>`` argument whose type is taken from the default value, so any
    db_stress parameter can be overridden from the command line.
    """
    parser = argparse.ArgumentParser(description="This script runs and kills \
        db_stress multiple times")
    parser.add_argument("test_type", choices=["blackbox", "whitebox"])
    parser.add_argument("--simple", action="store_true")

    # Union of all known parameter names; later tables win on duplicates.
    all_params = {}
    for table in (default_params,
                  blackbox_default_params,
                  whitebox_default_params,
                  simple_default_params,
                  blackbox_simple_default_params,
                  whitebox_simple_default_params):
        all_params.update(table)

    for name, default in all_params.items():
        if name == 'db':
            # The default is a callable wrapping get_dbname(), which creates
            # or wipes a directory when called. Do not trigger that merely
            # to discover that the value is a string.
            arg_type = str
        else:
            arg_type = type(default() if callable(default) else default)
        parser.add_argument("--" + name, type=arg_type)
    args = parser.parse_args()

    if args.test_type == 'blackbox':
        blackbox_crash_main(args)
    elif args.test_type == 'whitebox':
        whitebox_crash_main(args)

# Run the crash test only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()