crash_gen.py 100.1 KB
Newer Older
S
Shuduo Sang 已提交
1
# -----!/usr/bin/python3.7
S
Steven Li 已提交
2 3 4 5 6 7 8 9 10 11 12 13
###################################################################
#           Copyright (c) 2016 by TAOS Technologies, Inc.
#                     All rights reserved.
#
#  This file is proprietary and confidential to TAOS Technologies.
#  No part of this file may be reproduced, stored, transmitted,
#  disclosed or used in any form or by any means other than as
#  expressly provided by the written permission from Jianhui Tao
#
###################################################################

# -*- coding: utf-8 -*-
S
Shuduo Sang 已提交
14 15 16 17 18 19 20 21 22 23 24 25 26 27
# For type hinting before definition, ref:
# https://stackoverflow.com/questions/33533148/how-do-i-specify-that-the-return-type-of-a-method-is-the-same-as-the-class-itsel
from __future__ import annotations
import taos
from util.sql import *
from util.cases import *
from util.dnodes import *
from util.log import *
from typing import Set
from typing import Dict
from typing import List
from requests.auth import HTTPBasicAuth
import textwrap
import time
28
import datetime
S
Shuduo Sang 已提交
29
import random
30
import logging
S
Shuduo Sang 已提交
31 32 33 34 35
import threading
import requests
import copy
import argparse
import getopt
36

S
Steven Li 已提交
37
import sys
38
import os
39
import signal
40
import traceback
41 42 43
import resource
from guppy import hpy
import gc
44

45 46
from .service_manager import ServiceManager, TdeInstance
from .misc import Logging, Status, CrashGenError, Dice, Helper, Progress
47

48 49 50 51
# Require Python 3
if sys.version_info[0] < 3:
    raise Exception("Must be using Python 3")

S
Shuduo Sang 已提交
52
# Global variables, tried to keep a small number.
53 54 55

# Command-line/Environment Configurations, will set a bit later
# ConfigNameSpace = argparse.Namespace
56 57
gConfig:    argparse.Namespace 
gSvcMgr:    ServiceManager # TODO: refactor this hack, use dep injection
58
# logger:     logging.Logger
59
gContainer: Container
S
Steven Li 已提交
60

61 62
# def runThread(wt: WorkerThread):
#     wt.run()
63

64

S
Steven Li 已提交
65
class WorkerThread:
66
    def __init__(self, pool: ThreadPool, tid, tc: ThreadCoordinator,
S
Shuduo Sang 已提交
67 68 69
                 # te: TaskExecutor,
                 ):  # note: main thread context!
        # self._curStep = -1
70
        self._pool = pool
S
Shuduo Sang 已提交
71 72
        self._tid = tid
        self._tc = tc  # type: ThreadCoordinator
S
Steven Li 已提交
73
        # self.threadIdent = threading.get_ident()
74 75
        # self._thread = threading.Thread(target=runThread, args=(self,))
        self._thread = threading.Thread(target=self.run)
76
        self._stepGate = threading.Event()
S
Steven Li 已提交
77

78
        # Let us have a DB connection of our own
S
Shuduo Sang 已提交
79
        if (gConfig.per_thread_db_connection):  # type: ignore
80
            # print("connector_type = {}".format(gConfig.connector_type))
81 82 83 84 85 86 87 88 89 90 91
            if gConfig.connector_type == 'native':
                self._dbConn = DbConn.createNative() 
            elif gConfig.connector_type == 'rest':
                self._dbConn = DbConn.createRest() 
            elif gConfig.connector_type == 'mixed':
                if Dice.throw(2) == 0: # 1/2 chance
                    self._dbConn = DbConn.createNative() 
                else:
                    self._dbConn = DbConn.createRest() 
            else:
                raise RuntimeError("Unexpected connector type: {}".format(gConfig.connector_type))
92

93
        # self._dbInUse = False  # if "use db" was executed already
94

95
    def logDebug(self, msg):
96
        Logging.debug("    TRD[{}] {}".format(self._tid, msg))
97 98

    def logInfo(self, msg):
99
        Logging.info("    TRD[{}] {}".format(self._tid, msg))
100

101 102
    # def dbInUse(self):
    #     return self._dbInUse
103

104 105 106 107
    # def useDb(self):
    #     if (not self._dbInUse):
    #         self.execSql("use db")
    #     self._dbInUse = True
108

109
    def getTaskExecutor(self):
S
Shuduo Sang 已提交
110
        return self._tc.getTaskExecutor()
111

S
Steven Li 已提交
112
    def start(self):
113
        self._thread.start()  # AFTER the thread is recorded
S
Steven Li 已提交
114

S
Shuduo Sang 已提交
115
    def run(self):
S
Steven Li 已提交
116
        # initialization after thread starts, in the thread context
117
        # self.isSleeping = False
118
        Logging.info("Starting to run thread: {}".format(self._tid))
119

S
Shuduo Sang 已提交
120
        if (gConfig.per_thread_db_connection):  # type: ignore
121
            Logging.debug("Worker thread openning database connection")
122
            self._dbConn.open()
S
Steven Li 已提交
123

S
Shuduo Sang 已提交
124 125
        self._doTaskLoop()

126
        # clean up
S
Shuduo Sang 已提交
127
        if (gConfig.per_thread_db_connection):  # type: ignore
128 129 130
            if self._dbConn.isOpen: #sometimes it is not open
                self._dbConn.close()
            else:
131
                Logging.warning("Cleaning up worker thread, dbConn already closed")
132

S
Shuduo Sang 已提交
133
    def _doTaskLoop(self):
134 135
        # while self._curStep < self._pool.maxSteps:
        # tc = ThreadCoordinator(None)
S
Shuduo Sang 已提交
136 137
        while True:
            tc = self._tc  # Thread Coordinator, the overall master
138 139 140
            try:
                tc.crossStepBarrier()  # shared barrier first, INCLUDING the last one
            except threading.BrokenBarrierError as err: # main thread timed out
141
                print("_bto", end="")
142
                Logging.debug("[TRD] Worker thread exiting due to main thread barrier time-out")
143 144
                break

145
            Logging.debug("[TRD] Worker thread [{}] exited barrier...".format(self._tid))
146
            self.crossStepGate()   # then per-thread gate, after being tapped
147
            Logging.debug("[TRD] Worker thread [{}] exited step gate...".format(self._tid))
148
            if not self._tc.isRunning():
149
                print("_wts", end="")
150
                Logging.debug("[TRD] Thread Coordinator not running any more, worker thread now stopping...")
151 152
                break

153
            # Before we fetch the task and run it, let's ensure we properly "use" the database (not needed any more)
154
            try:
155 156 157
                if (gConfig.per_thread_db_connection):  # most likely TRUE
                    if not self._dbConn.isOpen:  # might have been closed during server auto-restart
                        self._dbConn.open()
158
                # self.useDb() # might encounter exceptions. TODO: catch
159 160
            except taos.error.ProgrammingError as err:
                errno = Helper.convertErrno(err.errno)
161
                if errno in [0x383, 0x386, 0x00B, 0x014]  : # invalid database, dropping, Unable to establish connection, Database not ready
162 163 164 165 166 167
                    # ignore
                    dummy = 0
                else:
                    print("\nCaught programming error. errno=0x{:X}, msg={} ".format(errno, err.msg))
                    raise

168
            # Fetch a task from the Thread Coordinator
169
            Logging.debug( "[TRD] Worker thread [{}] about to fetch task".format(self._tid))
170
            task = tc.fetchTask()
171 172

            # Execute such a task
173
            Logging.debug("[TRD] Worker thread [{}] about to execute task: {}".format(
S
Shuduo Sang 已提交
174
                    self._tid, task.__class__.__name__))
175
            task.execute(self)
176
            tc.saveExecutedTask(task)
177
            Logging.debug("[TRD] Worker thread [{}] finished executing task".format(self._tid))
S
Shuduo Sang 已提交
178

179
            # self._dbInUse = False  # there may be changes between steps
180
        # print("_wtd", end=None) # worker thread died
181

S
Shuduo Sang 已提交
182 183
    def verifyThreadSelf(self):  # ensure we are called by this own thread
        if (threading.get_ident() != self._thread.ident):
S
Steven Li 已提交
184 185
            raise RuntimeError("Unexpectly called from other threads")

S
Shuduo Sang 已提交
186 187
    def verifyThreadMain(self):  # ensure we are called by the main thread
        if (threading.get_ident() != threading.main_thread().ident):
S
Steven Li 已提交
188 189 190
            raise RuntimeError("Unexpectly called from other threads")

    def verifyThreadAlive(self):
S
Shuduo Sang 已提交
191
        if (not self._thread.is_alive()):
S
Steven Li 已提交
192 193
            raise RuntimeError("Unexpected dead thread")

194
    # A gate is different from a barrier in that a thread needs to be "tapped"
S
Steven Li 已提交
195 196
    def crossStepGate(self):
        self.verifyThreadAlive()
S
Shuduo Sang 已提交
197 198
        self.verifyThreadSelf()  # only allowed by ourselves

199
        # Wait again at the "gate", waiting to be "tapped"
200
        Logging.debug(
S
Shuduo Sang 已提交
201 202 203
            "[TRD] Worker thread {} about to cross the step gate".format(
                self._tid))
        self._stepGate.wait()
204
        self._stepGate.clear()
S
Shuduo Sang 已提交
205

206
        # self._curStep += 1  # off to a new step...
S
Steven Li 已提交
207

S
Shuduo Sang 已提交
208
    def tapStepGate(self):  # give it a tap, release the thread waiting there
209
        # self.verifyThreadAlive()
S
Shuduo Sang 已提交
210 211
        self.verifyThreadMain()  # only allowed for main thread

212
        if self._thread.is_alive():
213
            Logging.debug("[TRD] Tapping worker thread {}".format(self._tid))
214 215 216 217
            self._stepGate.set()  # wake up!
            time.sleep(0)  # let the released thread run a bit
        else:
            print("_tad", end="") # Thread already dead
218

S
Shuduo Sang 已提交
219
    def execSql(self, sql):  # TODO: expose DbConn directly
220
        return self.getDbConn().execute(sql)
221

S
Shuduo Sang 已提交
222
    def querySql(self, sql):  # TODO: expose DbConn directly
223
        return self.getDbConn().query(sql)
224 225

    def getQueryResult(self):
226
        return self.getDbConn().getQueryResult()
227

228
    def getDbConn(self) -> DbConn :
S
Shuduo Sang 已提交
229 230
        if (gConfig.per_thread_db_connection):
            return self._dbConn
231
        else:
232
            return self._tc.getDbManager().getDbConn()
233

234 235
    # def querySql(self, sql): # not "execute", since we are out side the DB context
    #     if ( gConfig.per_thread_db_connection ):
S
Shuduo Sang 已提交
236
    #         return self._dbConn.query(sql)
237 238
    #     else:
    #         return self._tc.getDbState().getDbConn().query(sql)
239

240
# The coordinator of all worker threads, mostly running in main thread
S
Shuduo Sang 已提交
241 242


243
class ThreadCoordinator:
244
    WORKER_THREAD_TIMEOUT = 180 # one minute
245

246
    def __init__(self, pool: ThreadPool, dbManager: DbManager):
S
Shuduo Sang 已提交
247
        self._curStep = -1  # first step is 0
248
        self._pool = pool
249
        # self._wd = wd
S
Shuduo Sang 已提交
250
        self._te = None  # prepare for every new step
251
        self._dbManager = dbManager
S
Shuduo Sang 已提交
252 253
        self._executedTasks: List[Task] = []  # in a given step
        self._lock = threading.RLock()  # sync access for a few things
S
Steven Li 已提交
254

S
Shuduo Sang 已提交
255 256
        self._stepBarrier = threading.Barrier(
            self._pool.numThreads + 1)  # one barrier for all threads
257
        self._execStats = ExecutionStats()
258
        self._runStatus = Status.STATUS_RUNNING
259
        self._initDbs()
S
Steven Li 已提交
260

261 262 263
    def getTaskExecutor(self):
        return self._te

S
Shuduo Sang 已提交
264
    def getDbManager(self) -> DbManager:
265
        return self._dbManager
266

267 268
    def crossStepBarrier(self, timeout=None):
        self._stepBarrier.wait(timeout) 
269

270
    def requestToStop(self):
271
        self._runStatus = Status.STATUS_STOPPING
272 273
        self._execStats.registerFailure("User Interruption")

274
    def _runShouldEnd(self, transitionFailed, hasAbortedTask, workerTimeout):
275 276 277
        maxSteps = gConfig.max_steps  # type: ignore
        if self._curStep >= (maxSteps - 1): # maxStep==10, last curStep should be 9
            return True
278
        if self._runStatus != Status.STATUS_RUNNING:
279 280 281 282 283
            return True
        if transitionFailed:
            return True
        if hasAbortedTask:
            return True
284 285
        if workerTimeout:
            return True
286 287 288 289 290 291 292 293 294 295 296 297 298
        return False

    def _hasAbortedTask(self): # from execution of previous step
        for task in self._executedTasks:
            if task.isAborted():
                # print("Task aborted: {}".format(task))
                # hasAbortedTask = True
                return True
        return False

    def _releaseAllWorkerThreads(self, transitionFailed):
        self._curStep += 1  # we are about to get into next step. TODO: race condition here!
        # Now not all threads had time to go to sleep
299
        Logging.debug(
300 301 302 303 304 305 306
            "--\r\n\n--> Step {} starts with main thread waking up".format(self._curStep))

        # A new TE for the new step
        self._te = None # set to empty first, to signal worker thread to stop
        if not transitionFailed:  # only if not failed
            self._te = TaskExecutor(self._curStep)

307
        Logging.debug("[TRD] Main thread waking up at step {}, tapping worker threads".format(
308 309
                self._curStep))  # Now not all threads had time to go to sleep
        # Worker threads will wake up at this point, and each execute it's own task
310
        self.tapAllThreads() # release all worker thread from their "gates"
311 312 313 314 315

    def _syncAtBarrier(self):
         # Now main thread (that's us) is ready to enter a step
        # let other threads go past the pool barrier, but wait at the
        # thread gate
316
        Logging.debug("[TRD] Main thread about to cross the barrier")
317
        self.crossStepBarrier(timeout=self.WORKER_THREAD_TIMEOUT)
318
        self._stepBarrier.reset()  # Other worker threads should now be at the "gate"
319
        Logging.debug("[TRD] Main thread finished crossing the barrier")
320 321 322 323

    def _doTransition(self):
        transitionFailed = False
        try:
324 325 326
            for x in self._dbs:
                db = x # type: Database
                sm = db.getStateMachine()
327
                Logging.debug("[STT] starting transitions for DB: {}".format(db.getName()))
328 329 330
                # at end of step, transiton the DB state
                tasksForDb = db.filterTasks(self._executedTasks)
                sm.transition(tasksForDb, self.getDbManager().getDbConn())
331
                Logging.debug("[STT] transition ended for DB: {}".format(db.getName()))
332 333

            # Due to limitation (or maybe not) of the TD Python library,
334
            # we cannot share connections across threads
335 336 337 338
            # Here we are in main thread, we cannot operate the connections created in workers
            # Moving below to task loop
            # if sm.hasDatabase():
            #     for t in self._pool.threadList:
339
            #         Logging.debug("[DB] use db for all worker threads")
340
            #         t.useDb()
341 342
                    # t.execSql("use db") # main thread executing "use
                    # db" on behalf of every worker thread
343

344 345
        except taos.error.ProgrammingError as err:
            if (err.msg == 'network unavailable'):  # broken DB connection
346
                Logging.info("DB connection broken, execution failed")
347 348 349 350 351 352 353 354
                traceback.print_stack()
                transitionFailed = True
                self._te = None  # Not running any more
                self._execStats.registerFailure("Broken DB Connection")
                # continue # don't do that, need to tap all threads at
                # end, and maybe signal them to stop
            else:
                raise
S
Steven Li 已提交
355
        # return transitionFailed # Why did we have this??!!
356 357 358

        self.resetExecutedTasks()  # clear the tasks after we are done
        # Get ready for next step
359
        Logging.debug("<-- Step {} finished, trasition failed = {}".format(self._curStep, transitionFailed))
360 361
        return transitionFailed

S
Shuduo Sang 已提交
362
    def run(self):
363
        self._pool.createAndStartThreads(self)
S
Steven Li 已提交
364 365

        # Coordinate all threads step by step
S
Shuduo Sang 已提交
366
        self._curStep = -1  # not started yet
367
        
S
Shuduo Sang 已提交
368
        self._execStats.startExec()  # start the stop watch
369 370
        transitionFailed = False
        hasAbortedTask = False
371 372
        workerTimeout = False
        while not self._runShouldEnd(transitionFailed, hasAbortedTask, workerTimeout):
373 374 375
            if not gConfig.debug: # print this only if we are not in debug mode    
                Progress.emit(Progress.STEP_BOUNDARY)            
                # print(".", end="", flush=True)
376 377 378 379 380 381 382 383
            # if (self._curStep % 2) == 0: # print memory usage once every 10 steps
            #     memUsage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
            #     print("[m:{}]".format(memUsage), end="", flush=True) # print memory usage
            # if (self._curStep % 10) == 3: 
            #     h = hpy()
            #     print("\n")        
            #     print(h.heap())
            
384
                        
385 386
            try:
                self._syncAtBarrier() # For now just cross the barrier
387
                Progress.emit(Progress.END_THREAD_STEP)
388
            except threading.BrokenBarrierError as err:
389
                Logging.info("Main loop aborted, caused by worker thread time-out")
390 391 392 393 394 395
                self._execStats.registerFailure("Aborted due to worker thread timeout")
                print("\n\nWorker Thread time-out detected, important thread info:")
                ts = ThreadStacks()
                ts.print(filterInternal=True)
                workerTimeout = True
                break
396 397

            # At this point, all threads should be pass the overall "barrier" and before the per-thread "gate"
S
Shuduo Sang 已提交
398 399
            # We use this period to do house keeping work, when all worker
            # threads are QUIET.
400 401
            hasAbortedTask = self._hasAbortedTask() # from previous step
            if hasAbortedTask: 
402
                Logging.info("Aborted task encountered, exiting test program")
403
                self._execStats.registerFailure("Aborted Task Encountered")
404
                break # do transition only if tasks are error free
S
Shuduo Sang 已提交
405

406
            # Ending previous step
407 408 409 410
            try:
                transitionFailed = self._doTransition() # To start, we end step -1 first
            except taos.error.ProgrammingError as err:
                transitionFailed = True
411
                errno2 = Helper.convertErrno(err.errno)  # correct error scheme
S
Steven Li 已提交
412
                errMsg = "Transition failed: errno=0x{:X}, msg: {}".format(errno2, err)
413
                Logging.info(errMsg)
414
                traceback.print_exc()
S
Steven Li 已提交
415
                self._execStats.registerFailure(errMsg)
416

417
            # Then we move on to the next step
418
            Progress.emit(Progress.BEGIN_THREAD_STEP)
419
            self._releaseAllWorkerThreads(transitionFailed)                    
420

421
        if hasAbortedTask or transitionFailed : # abnormal ending, workers waiting at "gate"
422
            Logging.debug("Abnormal ending of main thraed")
423
        elif workerTimeout:
424
            Logging.debug("Abnormal ending of main thread, due to worker timeout")
425
        else: # regular ending, workers waiting at "barrier"
426
            Logging.debug("Regular ending, main thread waiting for all worker threads to stop...")
427
            self._syncAtBarrier()
428

429
        self._te = None  # No more executor, time to end
430
        Logging.debug("Main thread tapping all threads one last time...")
431
        self.tapAllThreads()  # Let the threads run one last time
432

433 434
        Logging.debug("\r\n\n--> Main thread ready to finish up...")
        Logging.debug("Main thread joining all threads")
S
Shuduo Sang 已提交
435
        self._pool.joinAll()  # Get all threads to finish
436
        Logging.info("\nAll worker threads finished")
437 438
        self._execStats.endExec()

439 440 441 442 443 444 445 446 447 448 449 450 451
    def cleanup(self): # free resources
        self._pool.cleanup()

        self._pool = None
        self._te = None  
        self._dbManager = None
        self._executedTasks = None
        self._lock = None
        self._stepBarrier = None
        self._execStats = None
        self._runStatus = None


452 453
    def printStats(self):
        self._execStats.printStats()
S
Steven Li 已提交
454

S
Steven Li 已提交
455 456 457 458 459 460
    def isFailed(self):
        return self._execStats.isFailed()

    def getExecStats(self):
        return self._execStats

S
Shuduo Sang 已提交
461
    def tapAllThreads(self):  # in a deterministic manner
S
Steven Li 已提交
462
        wakeSeq = []
S
Shuduo Sang 已提交
463 464
        for i in range(self._pool.numThreads):  # generate a random sequence
            if Dice.throw(2) == 1:
S
Steven Li 已提交
465 466 467
                wakeSeq.append(i)
            else:
                wakeSeq.insert(0, i)
468
        Logging.debug(
S
Shuduo Sang 已提交
469 470
            "[TRD] Main thread waking up worker threads: {}".format(
                str(wakeSeq)))
471
        # TODO: set dice seed to a deterministic value
S
Steven Li 已提交
472
        for i in wakeSeq:
S
Shuduo Sang 已提交
473 474 475
            # TODO: maybe a bit too deep?!
            self._pool.threadList[i].tapStepGate()
            time.sleep(0)  # yield
S
Steven Li 已提交
476

477
    def isRunning(self):
S
Shuduo Sang 已提交
478
        return self._te is not None
479

480 481 482 483 484 485
    def _initDbs(self):
        ''' Initialize multiple databases, invoked at __ini__() time '''
        self._dbs = [] # type: List[Database]
        dbc = self.getDbManager().getDbConn()
        if gConfig.max_dbs == 0:
            self._dbs.append(Database(0, dbc))
S
Steven Li 已提交
486 487 488
        else:            
            baseDbNumber = int(datetime.datetime.now().timestamp( # Don't use Dice/random, as they are deterministic
                )) % 888 if gConfig.dynamic_db_table_names else 0
489
            for i in range(gConfig.max_dbs):
490
                self._dbs.append(Database(baseDbNumber + i, dbc))
491 492 493 494 495 496 497 498

    def pickDatabase(self):
        idxDb = 0
        if gConfig.max_dbs != 0 :
            idxDb = Dice.throw(gConfig.max_dbs) # 0 to N-1
        db = self._dbs[idxDb] # type: Database
        return db

S
Shuduo Sang 已提交
499
    def fetchTask(self) -> Task:
500 501 502
        ''' The thread coordinator (that's us) is responsible for fetching a task
            to be executed next.
        '''
S
Shuduo Sang 已提交
503
        if (not self.isRunning()):  # no task
504
            raise RuntimeError("Cannot fetch task when not running")
505

S
Shuduo Sang 已提交
506
        # pick a task type for current state
507 508 509
        db = self.pickDatabase()
        taskType = db.getStateMachine().pickTaskType() # type: Task
        return taskType(self._execStats, db)  # create a task from it
510 511

    def resetExecutedTasks(self):
S
Shuduo Sang 已提交
512
        self._executedTasks = []  # should be under single thread
513 514 515 516

    def saveExecutedTask(self, task):
        with self._lock:
            self._executedTasks.append(task)
517

518
class ThreadPool:
519
    def __init__(self, numThreads, maxSteps):
520 521 522 523
        self.numThreads = numThreads
        self.maxSteps = maxSteps
        # Internal class variables
        self.curStep = 0
S
Shuduo Sang 已提交
524 525
        self.threadList = []  # type: List[WorkerThread]

526
    # starting to run all the threads, in locking steps
527
    def createAndStartThreads(self, tc: ThreadCoordinator):
S
Shuduo Sang 已提交
528 529
        for tid in range(0, self.numThreads):  # Create the threads
            workerThread = WorkerThread(self, tid, tc)
530
            self.threadList.append(workerThread)
S
Shuduo Sang 已提交
531
            workerThread.start()  # start, but should block immediately before step 0
532 533 534

    def joinAll(self):
        for workerThread in self.threadList:
535
            Logging.debug("Joining thread...")
536 537
            workerThread._thread.join()

538 539 540
    def cleanup(self):
        self.threadList = None # maybe clean up each?

541 542
# A queue of continguous POSITIVE integers, used by DbManager to generate continuous numbers
# for new table names
S
Shuduo Sang 已提交
543 544


S
Steven Li 已提交
545 546
class LinearQueue():
    def __init__(self):
547
        self.firstIndex = 1  # 1st ever element
S
Steven Li 已提交
548
        self.lastIndex = 0
S
Shuduo Sang 已提交
549 550
        self._lock = threading.RLock()  # our functions may call each other
        self.inUse = set()  # the indexes that are in use right now
S
Steven Li 已提交
551

552
    def toText(self):
S
Shuduo Sang 已提交
553 554
        return "[{}..{}], in use: {}".format(
            self.firstIndex, self.lastIndex, self.inUse)
555 556

    # Push (add new element, largest) to the tail, and mark it in use
S
Shuduo Sang 已提交
557
    def push(self):
558
        with self._lock:
S
Shuduo Sang 已提交
559 560
            # if ( self.isEmpty() ):
            #     self.lastIndex = self.firstIndex
561
            #     return self.firstIndex
562 563
            # Otherwise we have something
            self.lastIndex += 1
564 565
            self.allocate(self.lastIndex)
            # self.inUse.add(self.lastIndex) # mark it in use immediately
566
            return self.lastIndex
S
Steven Li 已提交
567 568

    def pop(self):
569
        with self._lock:
S
Shuduo Sang 已提交
570 571 572 573
            if (self.isEmpty()):
                # raise RuntimeError("Cannot pop an empty queue")
                return False  # TODO: None?

574
            index = self.firstIndex
S
Shuduo Sang 已提交
575
            if (index in self.inUse):
576 577
                return False

578 579 580 581 582 583 584
            self.firstIndex += 1
            return index

    def isEmpty(self):
        return self.firstIndex > self.lastIndex

    def popIfNotEmpty(self):
585
        with self._lock:
586 587 588 589
            if (self.isEmpty()):
                return 0
            return self.pop()

S
Steven Li 已提交
590
    def allocate(self, i):
591
        with self._lock:
592
            # Logging.debug("LQ allocating item {}".format(i))
S
Shuduo Sang 已提交
593 594 595
            if (i in self.inUse):
                raise RuntimeError(
                    "Cannot re-use same index in queue: {}".format(i))
596 597
            self.inUse.add(i)

S
Steven Li 已提交
598
    def release(self, i):
599
        with self._lock:
600
            # Logging.debug("LQ releasing item {}".format(i))
S
Shuduo Sang 已提交
601
            self.inUse.remove(i)  # KeyError possible, TODO: why?
602 603 604 605

    def size(self):
        return self.lastIndex + 1 - self.firstIndex

S
Steven Li 已提交
606
    def pickAndAllocate(self):
S
Shuduo Sang 已提交
607
        if (self.isEmpty()):
608 609
            return None
        with self._lock:
S
Shuduo Sang 已提交
610
            cnt = 0  # counting the interations
611 612
            while True:
                cnt += 1
S
Shuduo Sang 已提交
613
                if (cnt > self.size() * 10):  # 10x iteration already
614 615
                    # raise RuntimeError("Failed to allocate LinearQueue element")
                    return None
S
Shuduo Sang 已提交
616 617
                ret = Dice.throwRange(self.firstIndex, self.lastIndex + 1)
                if (ret not in self.inUse):
618 619 620
                    self.allocate(ret)
                    return ret

S
Shuduo Sang 已提交
621

622
class DbConn:
623
    TYPE_NATIVE = "native-c"
624
    TYPE_REST =   "rest-api"
625 626 627 628 629 630 631 632 633
    TYPE_INVALID = "invalid"

    @classmethod
    def create(cls, connType):
        if connType == cls.TYPE_NATIVE:
            return DbConnNative()
        elif connType == cls.TYPE_REST:
            return DbConnRest()
        else:
S
Shuduo Sang 已提交
634 635
            raise RuntimeError(
                "Unexpected connection type: {}".format(connType))
636 637 638 639 640 641 642 643 644

    @classmethod
    def createNative(cls):
        return cls.create(cls.TYPE_NATIVE)

    @classmethod
    def createRest(cls):
        return cls.create(cls.TYPE_REST)

645 646
    def __init__(self):
        self.isOpen = False
647
        self._type = self.TYPE_INVALID
648 649 650 651
        self._lastSql = None

    def getLastSql(self):
        return self._lastSql
652 653

    def open(self):
S
Shuduo Sang 已提交
654
        if (self.isOpen):
655 656
            raise RuntimeError("Cannot re-open an existing DB connection")

657 658
        # below implemented by child classes
        self.openByType()
659

660
        Logging.debug("[DB] data connection opened, type = {}".format(self._type))
661 662
        self.isOpen = True

663 664 665
    def close(self):
        raise RuntimeError("Unexpected execution, should be overriden")

S
Shuduo Sang 已提交
666
    def queryScalar(self, sql) -> int:
667 668
        return self._queryAny(sql)

S
Shuduo Sang 已提交
669
    def queryString(self, sql) -> str:
670 671
        return self._queryAny(sql)

S
Shuduo Sang 已提交
672 673
    def _queryAny(self, sql):  # actual query result as an int
        if (not self.isOpen):
674
            raise RuntimeError("Cannot query database until connection is open")
675
        nRows = self.query(sql)
S
Shuduo Sang 已提交
676
        if nRows != 1:
677 678 679 680
            raise taos.error.ProgrammingError(
                "Unexpected result for query: {}, rows = {}".format(sql, nRows), 
                (0x991 if nRows==0 else 0x992)
            )
681
        if self.getResultRows() != 1 or self.getResultCols() != 1:
682
            raise RuntimeError("Unexpected result set for query: {}".format(sql))
683 684
        return self.getQueryResult()[0][0]

685 686 687
    def use(self, dbName):
        self.execute("use {}".format(dbName))

688 689 690 691 692 693 694
    def existsDatabase(self, dbName: str):
        ''' Check if a certain database exists '''
        self.query("show databases")
        dbs = [v[0] for v in self.getQueryResult()] # ref: https://stackoverflow.com/questions/643823/python-list-transformation
        # ret2 = dbName in dbs
        # print("dbs = {}, str = {}, ret2={}, type2={}".format(dbs, dbName,ret2, type(dbName)))
        return dbName in dbs # TODO: super weird type mangling seen, once here
695 696 697 698

    def hasTables(self):
        return self.query("show tables") > 0

699
    def execute(self, sql):
700
        ''' Return the number of rows affected'''
701
        raise RuntimeError("Unexpected execution, should be overriden")
S
Shuduo Sang 已提交
702

703 704 705 706 707 708 709 710 711
    def safeExecute(self, sql):
        '''Safely execute any SQL query, returning True/False upon success/failure'''
        try:
            self.execute(sql)
            return True # ignore num of results, return success
        except taos.error.ProgrammingError as err:
            return False # failed, for whatever TAOS reason
        # Not possile to reach here, non-TAOS exception would have been thrown

712
    def query(self, sql) -> int: # return num rows returned
713
        ''' Return the number of rows affected'''
714 715
        raise RuntimeError("Unexpected execution, should be overriden")

716 717
    def openByType(self):
        raise RuntimeError("Unexpected execution, should be overriden")
S
Shuduo Sang 已提交
718

719 720
    def getQueryResult(self):
        raise RuntimeError("Unexpected execution, should be overriden")
S
Shuduo Sang 已提交
721

722 723
    def getResultRows(self):
        raise RuntimeError("Unexpected execution, should be overriden")
S
Shuduo Sang 已提交
724

725 726 727 728
    def getResultCols(self):
        raise RuntimeError("Unexpected execution, should be overriden")

# Sample: curl -u root:taosdata -d "show databases" localhost:6020/rest/sql
S
Shuduo Sang 已提交
729 730


731 732 733 734
class DbConnRest(DbConn):
    def __init__(self):
        super().__init__()
        self._type = self.TYPE_REST
S
Steven Li 已提交
735
        self._url = "http://localhost:6041/rest/sql"  # fixed for now
736 737
        self._result = None

S
Shuduo Sang 已提交
738 739 740
    def openByType(self):  # Open connection
        pass  # do nothing, always open

741
    def close(self):
S
Shuduo Sang 已提交
742
        if (not self.isOpen):
743
            raise RuntimeError("Cannot clean up database until connection is open")
744
        # Do nothing for REST
745
        Logging.debug("[DB] REST Database connection closed")
746 747 748
        self.isOpen = False

    def _doSql(self, sql):
749
        self._lastSql = sql # remember this, last SQL attempted
750 751 752
        try:
            r = requests.post(self._url, 
                data = sql,
753
                auth = HTTPBasicAuth('root', 'taosdata'))         
754 755 756
        except:
            print("REST API Failure (TODO: more info here)")
            raise
757 758
        rj = r.json()
        # Sanity check for the "Json Result"
S
Shuduo Sang 已提交
759
        if ('status' not in rj):
760 761
            raise RuntimeError("No status in REST response")

S
Shuduo Sang 已提交
762 763 764 765
        if rj['status'] == 'error':  # clearly reported error
            if ('code' not in rj):  # error without code
                raise RuntimeError("REST error return without code")
            errno = rj['code']  # May need to massage this in the future
766
            # print("Raising programming error with REST return: {}".format(rj))
S
Shuduo Sang 已提交
767 768
            raise taos.error.ProgrammingError(
                rj['desc'], errno)  # todo: check existance of 'desc'
769

S
Shuduo Sang 已提交
770 771 772 773
        if rj['status'] != 'succ':  # better be this
            raise RuntimeError(
                "Unexpected REST return status: {}".format(
                    rj['status']))
774 775

        nRows = rj['rows'] if ('rows' in rj) else 0
S
Shuduo Sang 已提交
776
        self._result = rj
777 778
        return nRows

S
Shuduo Sang 已提交
779 780 781 782
    def execute(self, sql):
        if (not self.isOpen):
            raise RuntimeError(
                "Cannot execute database commands until connection is open")
783
        Logging.debug("[SQL-REST] Executing SQL: {}".format(sql))
784
        nRows = self._doSql(sql)
785
        Logging.debug(
S
Shuduo Sang 已提交
786
            "[SQL-REST] Execution Result, nRows = {}, SQL = {}".format(nRows, sql))
787 788
        return nRows

S
Shuduo Sang 已提交
789
    def query(self, sql):  # return rows affected
790 791 792 793 794 795 796 797 798 799 800 801 802
        return self.execute(sql)

    def getQueryResult(self):
        return self._result['data']

    def getResultRows(self):
        print(self._result)
        raise RuntimeError("TBD")
        # return self._tdSql.queryRows

    def getResultCols(self):
        print(self._result)
        raise RuntimeError("TBD")
S
Shuduo Sang 已提交
803

804
    # Duplicate code from TDMySQL, TODO: merge all this into DbConnNative
805 806


807
class MyTDSql:
808 809 810 811 812 813 814
    # Class variables
    _clsLock = threading.Lock() # class wide locking
    longestQuery = None # type: str
    longestQueryTime = 0.0 # seconds
    lqStartTime = 0.0
    # lqEndTime = 0.0 # Not needed, as we have the two above already

815 816 817 818 819
    def __init__(self, hostAddr, cfgPath):
        # Make the DB connection
        self._conn = taos.connect(host=hostAddr, config=cfgPath) 
        self._cursor = self._conn.cursor()

820 821 822 823
        self.queryRows = 0
        self.queryCols = 0
        self.affectedRows = 0

824 825
    # def init(self, cursor, log=True):
    #     self.cursor = cursor
826 827 828 829 830
        # if (log):
        #     caller = inspect.getframeinfo(inspect.stack()[1][0])
        #     self.cursor.log(caller.filename + ".sql")

    def close(self):
831
        self._cursor.close() # can we double close?
832 833
        self._conn.close() # TODO: very important, cursor close does NOT close DB connection!
        self._cursor.close()
834

835 836 837
    def _execInternal(self, sql):
        startTime = time.time() 
        ret = self._cursor.execute(sql)
838
        # print("\nSQL success: {}".format(sql))
839 840 841 842 843 844 845 846 847 848
        queryTime =  time.time() - startTime
        # Record the query time
        cls = self.__class__
        if queryTime > (cls.longestQueryTime + 0.01) :
            with cls._clsLock:
                cls.longestQuery = sql
                cls.longestQueryTime = queryTime
                cls.lqStartTime = startTime
        return ret

849 850 851
    def query(self, sql):
        self.sql = sql
        try:
852
            self._execInternal(sql)
853
            self.queryResult = self._cursor.fetchall()
854
            self.queryRows = len(self.queryResult)
855
            self.queryCols = len(self._cursor.description)
856 857 858 859 860 861
        except Exception as e:
            # caller = inspect.getframeinfo(inspect.stack()[1][0])
            # args = (caller.filename, caller.lineno, sql, repr(e))
            # tdLog.exit("%s(%d) failed: sql:%s, %s" % args)
            raise
        return self.queryRows
862

863 864 865
    def execute(self, sql):
        self.sql = sql
        try:
866
            self.affectedRows = self._execInternal(sql)
867 868 869 870 871 872 873
        except Exception as e:
            # caller = inspect.getframeinfo(inspect.stack()[1][0])
            # args = (caller.filename, caller.lineno, sql, repr(e))
            # tdLog.exit("%s(%d) failed: sql:%s, %s" % args)
            raise
        return self.affectedRows

874 875 876 877 878 879 880 881 882 883 884 885
class DbConnNative(DbConn):
    # Class variables
    _lock = threading.Lock()
    _connInfoDisplayed = False
    totalConnections = 0 # Not private

    def __init__(self):
        super().__init__()
        self._type = self.TYPE_NATIVE
        self._conn = None
        # self._cursor = None        

S
Shuduo Sang 已提交
886
    def openByType(self):  # Open connection
887 888 889 890 891
        global gContainer
        tdeInstance = gContainer.defTdeInstance # set up in ClientManager, type: TdeInstance
        # cfgPath = self.getBuildPath() + "/test/cfg"
        cfgPath  = tdeInstance.getCfgDir()
        hostAddr = tdeInstance.getHostAddr()
892

893 894 895 896
        cls = self.__class__ # Get the class, to access class variables
        with cls._lock: # force single threading for opening DB connections. # TODO: whaaat??!!!
            if not cls._connInfoDisplayed:
                cls._connInfoDisplayed = True # updating CLASS variable
897
                Logging.info("Initiating TAOS native connection to {}, using config at {}".format(hostAddr, cfgPath))                    
898 899 900 901 902 903
            # Make the connection         
            # self._conn = taos.connect(host=hostAddr, config=cfgPath)  # TODO: make configurable
            # self._cursor = self._conn.cursor()
            # Record the count in the class
            self._tdSql = MyTDSql(hostAddr, cfgPath) # making DB connection
            cls.totalConnections += 1 
904
        
905
        self._tdSql.execute('reset query cache')
S
Shuduo Sang 已提交
906
        # self._cursor.execute('use db') # do this at the beginning of every
907 908

        # Open connection
909 910 911
        # self._tdSql = MyTDSql()
        # self._tdSql.init(self._cursor)
        
912
    def close(self):
S
Shuduo Sang 已提交
913
        if (not self.isOpen):
914
            raise RuntimeError("Cannot clean up database until connection is open")
915
        self._tdSql.close()
916 917 918 919 920
        # Decrement the class wide counter
        cls = self.__class__ # Get the class, to access class variables
        with cls._lock:
            cls.totalConnections -= 1

921
        Logging.debug("[DB] Database connection closed")
922
        self.isOpen = False
S
Steven Li 已提交
923

S
Shuduo Sang 已提交
924 925
    def execute(self, sql):
        if (not self.isOpen):
926
            raise RuntimeError("Cannot execute database commands until connection is open")
927
        Logging.debug("[SQL] Executing SQL: {}".format(sql))
928
        self._lastSql = sql
929
        nRows = self._tdSql.execute(sql)
930
        Logging.debug(
S
Shuduo Sang 已提交
931 932
            "[SQL] Execution Result, nRows = {}, SQL = {}".format(
                nRows, sql))
933
        return nRows
S
Steven Li 已提交
934

S
Shuduo Sang 已提交
935 936 937 938
    def query(self, sql):  # return rows affected
        if (not self.isOpen):
            raise RuntimeError(
                "Cannot query database until connection is open")
939
        Logging.debug("[SQL] Executing SQL: {}".format(sql))
940
        self._lastSql = sql
941
        nRows = self._tdSql.query(sql)
942
        Logging.debug(
S
Shuduo Sang 已提交
943 944
            "[SQL] Query Result, nRows = {}, SQL = {}".format(
                nRows, sql))
945
        return nRows
946
        # results are in: return self._tdSql.queryResult
947

948 949 950
    def getQueryResult(self):
        return self._tdSql.queryResult

951 952
    def getResultRows(self):
        return self._tdSql.queryRows
953

954 955
    def getResultCols(self):
        return self._tdSql.queryCols
956

S
Shuduo Sang 已提交
957

958
class AnyState:
S
Shuduo Sang 已提交
959 960 961
    STATE_INVALID = -1
    STATE_EMPTY = 0  # nothing there, no even a DB
    STATE_DB_ONLY = 1  # we have a DB, but nothing else
962
    STATE_TABLE_ONLY = 2  # we have a table, but totally empty
S
Shuduo Sang 已提交
963
    STATE_HAS_DATA = 3  # we have some data in the table
964 965 966 967
    _stateNames = ["Invalid", "Empty", "DB_Only", "Table_Only", "Has_Data"]

    STATE_VAL_IDX = 0
    CAN_CREATE_DB = 1
968 969 970
    # For below, if we can "drop the DB", but strictly speaking 
    # only "under normal circumstances", as we may override it with the -b option
    CAN_DROP_DB = 2  
971 972
    CAN_CREATE_FIXED_SUPER_TABLE = 3
    CAN_DROP_FIXED_SUPER_TABLE = 4
973 974 975 976 977 978 979
    CAN_ADD_DATA = 5
    CAN_READ_DATA = 6

    def __init__(self):
        self._info = self.getInfo()

    def __str__(self):
S
Shuduo Sang 已提交
980 981
        # -1 hack to accomodate the STATE_INVALID case
        return self._stateNames[self._info[self.STATE_VAL_IDX] + 1]
982

983 984
    # Each sub state tells us the "info", about itself, so we can determine
    # on things like canDropDB()
985 986 987
    def getInfo(self):
        raise RuntimeError("Must be overriden by child classes")

S
Steven Li 已提交
988 989 990 991 992 993
    def equals(self, other):
        if isinstance(other, int):
            return self.getValIndex() == other
        elif isinstance(other, AnyState):
            return self.getValIndex() == other.getValIndex()
        else:
S
Shuduo Sang 已提交
994 995 996
            raise RuntimeError(
                "Unexpected comparison, type = {}".format(
                    type(other)))
S
Steven Li 已提交
997

998 999 1000
    def verifyTasksToState(self, tasks, newState):
        raise RuntimeError("Must be overriden by child classes")

S
Steven Li 已提交
1001 1002 1003
    def getValIndex(self):
        return self._info[self.STATE_VAL_IDX]

1004 1005
    def getValue(self):
        return self._info[self.STATE_VAL_IDX]
S
Shuduo Sang 已提交
1006

1007 1008
    def canCreateDb(self):
        return self._info[self.CAN_CREATE_DB]
S
Shuduo Sang 已提交
1009

1010
    def canDropDb(self):
1011 1012 1013 1014
        # If user requests to run up to a number of DBs,
        # we'd then not do drop_db operations any more
        if gConfig.max_dbs > 0 : 
            return False
1015
        return self._info[self.CAN_DROP_DB]
S
Shuduo Sang 已提交
1016

1017 1018
    def canCreateFixedSuperTable(self):
        return self._info[self.CAN_CREATE_FIXED_SUPER_TABLE]
S
Shuduo Sang 已提交
1019

1020 1021
    def canDropFixedSuperTable(self):
        return self._info[self.CAN_DROP_FIXED_SUPER_TABLE]
S
Shuduo Sang 已提交
1022

1023 1024
    def canAddData(self):
        return self._info[self.CAN_ADD_DATA]
S
Shuduo Sang 已提交
1025

1026 1027 1028 1029 1030
    def canReadData(self):
        return self._info[self.CAN_READ_DATA]

    def assertAtMostOneSuccess(self, tasks, cls):
        sCnt = 0
S
Shuduo Sang 已提交
1031
        for task in tasks:
1032 1033 1034
            if not isinstance(task, cls):
                continue
            if task.isSuccess():
S
Steven Li 已提交
1035
                # task.logDebug("Task success found")
1036
                sCnt += 1
S
Shuduo Sang 已提交
1037 1038 1039
                if (sCnt >= 2):
                    raise RuntimeError(
                        "Unexpected more than 1 success with task: {}".format(cls))
1040 1041 1042 1043

    def assertIfExistThenSuccess(self, tasks, cls):
        sCnt = 0
        exists = False
S
Shuduo Sang 已提交
1044
        for task in tasks:
1045 1046
            if not isinstance(task, cls):
                continue
S
Shuduo Sang 已提交
1047
            exists = True  # we have a valid instance
1048 1049
            if task.isSuccess():
                sCnt += 1
S
Shuduo Sang 已提交
1050
        if (exists and sCnt <= 0):
S
Steven Li 已提交
1051 1052
            raise RuntimeError("Unexpected zero success for task type: {}, from tasks: {}"
                .format(cls, tasks))
1053 1054

    def assertNoTask(self, tasks, cls):
S
Shuduo Sang 已提交
1055
        for task in tasks:
1056
            if isinstance(task, cls):
S
Shuduo Sang 已提交
1057 1058
                raise CrashGenError(
                    "This task: {}, is not expected to be present, given the success/failure of others".format(cls.__name__))
1059 1060

    def assertNoSuccess(self, tasks, cls):
S
Shuduo Sang 已提交
1061
        for task in tasks:
1062 1063
            if isinstance(task, cls):
                if task.isSuccess():
S
Shuduo Sang 已提交
1064 1065
                    raise RuntimeError(
                        "Unexpected successful task: {}".format(cls))
1066 1067

    def hasSuccess(self, tasks, cls):
S
Shuduo Sang 已提交
1068
        for task in tasks:
1069 1070 1071 1072 1073 1074
            if not isinstance(task, cls):
                continue
            if task.isSuccess():
                return True
        return False

S
Steven Li 已提交
1075
    def hasTask(self, tasks, cls):
S
Shuduo Sang 已提交
1076
        for task in tasks:
S
Steven Li 已提交
1077 1078 1079 1080
            if isinstance(task, cls):
                return True
        return False

S
Shuduo Sang 已提交
1081

1082 1083 1084 1085
class StateInvalid(AnyState):
    def getInfo(self):
        return [
            self.STATE_INVALID,
S
Shuduo Sang 已提交
1086 1087 1088
            False, False,  # can create/drop Db
            False, False,  # can create/drop fixed table
            False, False,  # can insert/read data with fixed table
1089 1090 1091 1092
        ]

    # def verifyTasksToState(self, tasks, newState):

S
Shuduo Sang 已提交
1093

1094 1095 1096 1097
class StateEmpty(AnyState):
    def getInfo(self):
        return [
            self.STATE_EMPTY,
S
Shuduo Sang 已提交
1098 1099 1100
            True, False,  # can create/drop Db
            False, False,  # can create/drop fixed table
            False, False,  # can insert/read data with fixed table
1101 1102
        ]

S
Shuduo Sang 已提交
1103 1104
    def verifyTasksToState(self, tasks, newState):
        if (self.hasSuccess(tasks, TaskCreateDb)
1105
                ):  # at EMPTY, if there's succes in creating DB
S
Shuduo Sang 已提交
1106 1107 1108 1109
            if (not self.hasTask(tasks, TaskDropDb)):  # and no drop_db tasks
                # we must have at most one. TODO: compare numbers
                self.assertAtMostOneSuccess(tasks, TaskCreateDb)

1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120

class StateDbOnly(AnyState):
    def getInfo(self):
        return [
            self.STATE_DB_ONLY,
            False, True,
            True, False,
            False, False,
        ]

    def verifyTasksToState(self, tasks, newState):
S
Shuduo Sang 已提交
1121 1122 1123
        if (not self.hasTask(tasks, TaskCreateDb)):
            # only if we don't create any more
            self.assertAtMostOneSuccess(tasks, TaskDropDb)
1124 1125 1126 1127 1128

        # TODO: restore the below, the problem exists, although unlikely in real-world
        # if (gSvcMgr!=None) and gSvcMgr.isRestarting():     
        # if (gSvcMgr == None) or (not gSvcMgr.isRestarting()) : 
        #     self.assertIfExistThenSuccess(tasks, TaskDropDb)       
1129

S
Shuduo Sang 已提交
1130

1131
class StateSuperTableOnly(AnyState):
1132 1133 1134 1135 1136 1137 1138 1139 1140
    def getInfo(self):
        return [
            self.STATE_TABLE_ONLY,
            False, True,
            False, True,
            True, True,
        ]

    def verifyTasksToState(self, tasks, newState):
S
Shuduo Sang 已提交
1141
        if (self.hasSuccess(tasks, TaskDropSuperTable)
1142
                ):  # we are able to drop the table
1143
            #self.assertAtMostOneSuccess(tasks, TaskDropSuperTable)
S
Shuduo Sang 已提交
1144 1145
            # we must have had recreted it
            self.hasSuccess(tasks, TaskCreateSuperTable)
1146

1147
            # self._state = self.STATE_DB_ONLY
S
Steven Li 已提交
1148 1149
        # elif ( self.hasSuccess(tasks, AddFixedDataTask) ): # no success dropping the table, but added data
        #     self.assertNoTask(tasks, DropFixedTableTask) # not true in massively parrallel cases
1150
            # self._state = self.STATE_HAS_DATA
S
Steven Li 已提交
1151 1152 1153
        # elif ( self.hasSuccess(tasks, ReadFixedDataTask) ): # no success in prev cases, but was able to read data
            # self.assertNoTask(tasks, DropFixedTableTask)
            # self.assertNoTask(tasks, AddFixedDataTask)
1154
            # self._state = self.STATE_TABLE_ONLY # no change
S
Steven Li 已提交
1155 1156 1157
        # else: # did not drop table, did not insert data, did not read successfully, that is impossible
        #     raise RuntimeError("Unexpected no-success scenarios")
        # TODO: need to revamp!!
1158

S
Shuduo Sang 已提交
1159

1160 1161 1162 1163 1164 1165 1166 1167 1168 1169
class StateHasData(AnyState):
    def getInfo(self):
        return [
            self.STATE_HAS_DATA,
            False, True,
            False, True,
            True, True,
        ]

    def verifyTasksToState(self, tasks, newState):
S
Shuduo Sang 已提交
1170
        if (newState.equals(AnyState.STATE_EMPTY)):
1171
            self.hasSuccess(tasks, TaskDropDb)
S
Shuduo Sang 已提交
1172 1173 1174 1175
            if (not self.hasTask(tasks, TaskCreateDb)):
                self.assertAtMostOneSuccess(tasks, TaskDropDb)  # TODO: dicy
        elif (newState.equals(AnyState.STATE_DB_ONLY)):  # in DB only
            if (not self.hasTask(tasks, TaskCreateDb)
1176
                ):  # without a create_db task
S
Shuduo Sang 已提交
1177 1178
                # we must have drop_db task
                self.assertNoTask(tasks, TaskDropDb)
1179
            self.hasSuccess(tasks, TaskDropSuperTable)
1180
            # self.assertAtMostOneSuccess(tasks, DropFixedSuperTableTask) # TODO: dicy
1181 1182 1183 1184
        # elif ( newState.equals(AnyState.STATE_TABLE_ONLY) ): # data deleted
            # self.assertNoTask(tasks, TaskDropDb)
            # self.assertNoTask(tasks, TaskDropSuperTable)
            # self.assertNoTask(tasks, TaskAddData)
S
Steven Li 已提交
1185
            # self.hasSuccess(tasks, DeleteDataTasks)
S
Shuduo Sang 已提交
1186 1187
        else:  # should be STATE_HAS_DATA
            if (not self.hasTask(tasks, TaskCreateDb)
1188
                ):  # only if we didn't create one
S
Shuduo Sang 已提交
1189 1190 1191
                # we shouldn't have dropped it
                self.assertNoTask(tasks, TaskDropDb)
            if (not self.hasTask(tasks, TaskCreateSuperTable)
1192
                    ):  # if we didn't create the table
S
Shuduo Sang 已提交
1193 1194
                # we should not have a task that drops it
                self.assertNoTask(tasks, TaskDropSuperTable)
1195
            # self.assertIfExistThenSuccess(tasks, ReadFixedDataTask)
S
Steven Li 已提交
1196

S
Shuduo Sang 已提交
1197

1198
class StateMechine:
1199 1200 1201
    def __init__(self, db: Database): 
        self._db = db
        # transitition target probabilities, indexed with value of STATE_EMPTY, STATE_DB_ONLY, etc.
1202
        self._stateWeights = [1, 2, 10, 40]
S
Shuduo Sang 已提交
1203

1204 1205
    def init(self, dbc: DbConn): # late initailization, don't save the dbConn
        self._curState = self._findCurrentState(dbc)  # starting state
1206
        Logging.debug("Found Starting State: {}".format(self._curState))
1207 1208

    # TODO: seems no lnoger used, remove?
1209 1210 1211
    def getCurrentState(self):
        return self._curState

1212 1213 1214
    def hasDatabase(self):
        return self._curState.canDropDb()  # ha, can drop DB means it has one

1215
    # May be slow, use cautionsly...
S
Shuduo Sang 已提交
1216
    def getTaskTypes(self):  # those that can run (directly/indirectly) from the current state
1217 1218 1219 1220 1221 1222
        def typesToStrings(types):
            ss = []
            for t in types:
                ss.append(t.__name__)
            return ss

S
Shuduo Sang 已提交
1223
        allTaskClasses = StateTransitionTask.__subclasses__()  # all state transition tasks
1224 1225
        firstTaskTypes = []
        for tc in allTaskClasses:
S
Shuduo Sang 已提交
1226
            # t = tc(self) # create task object
1227 1228
            if tc.canBeginFrom(self._curState):
                firstTaskTypes.append(tc)
S
Shuduo Sang 已提交
1229 1230 1231 1232 1233 1234 1235 1236
        # now we have all the tasks that can begin directly from the current
        # state, let's figure out the INDIRECT ones
        taskTypes = firstTaskTypes.copy()  # have to have these
        for task1 in firstTaskTypes:  # each task type gathered so far
            endState = task1.getEndState()  # figure the end state
            if endState is None:  # does not change end state
                continue  # no use, do nothing
            for tc in allTaskClasses:  # what task can further begin from there?
1237
                if tc.canBeginFrom(endState) and (tc not in firstTaskTypes):
S
Shuduo Sang 已提交
1238
                    taskTypes.append(tc)  # gather it
1239 1240

        if len(taskTypes) <= 0:
S
Shuduo Sang 已提交
1241 1242 1243
            raise RuntimeError(
                "No suitable task types found for state: {}".format(
                    self._curState))
1244
        Logging.debug(
S
Shuduo Sang 已提交
1245 1246 1247
            "[OPS] Tasks found for state {}: {}".format(
                self._curState,
                typesToStrings(taskTypes)))
1248 1249
        return taskTypes

1250
    def _findCurrentState(self, dbc: DbConn):
S
Shuduo Sang 已提交
1251
        ts = time.time()  # we use this to debug how fast/slow it is to do the various queries to find the current DB state
1252 1253
        dbName =self._db.getName()
        if not dbc.existsDatabase(dbName): # dbc.hasDatabases():  # no database?!
1254
            Logging.debug( "[STT] empty database found, between {} and {}".format(ts, time.time()))
1255
            return StateEmpty()
S
Shuduo Sang 已提交
1256 1257
        # did not do this when openning connection, and this is NOT the worker
        # thread, which does this on their own
1258
        dbc.use(dbName)
1259
        if not dbc.hasTables():  # no tables
1260
            Logging.debug("[STT] DB_ONLY found, between {} and {}".format(ts, time.time()))
1261
            return StateDbOnly()
1262

1263 1264
        sTable = self._db.getFixedSuperTable()
        if sTable.hasRegTables(dbc, dbName):  # no regular tables
1265
            Logging.debug("[STT] SUPER_TABLE_ONLY found, between {} and {}".format(ts, time.time()))
1266
            return StateSuperTableOnly()
S
Shuduo Sang 已提交
1267
        else:  # has actual tables
1268
            Logging.debug("[STT] HAS_DATA found, between {} and {}".format(ts, time.time()))
1269 1270
            return StateHasData()

1271 1272
    # We transition the system to a new state by examining the current state itself
    def transition(self, tasks, dbc: DbConn):
S
Shuduo Sang 已提交
1273
        if (len(tasks) == 0):  # before 1st step, or otherwise empty
1274
            Logging.debug("[STT] Starting State: {}".format(self._curState))
S
Shuduo Sang 已提交
1275
            return  # do nothing
1276

S
Shuduo Sang 已提交
1277
        # this should show up in the server log, separating steps
1278
        dbc.execute("show dnodes")
1279 1280 1281 1282

        # Generic Checks, first based on the start state
        if self._curState.canCreateDb():
            self._curState.assertIfExistThenSuccess(tasks, TaskCreateDb)
S
Shuduo Sang 已提交
1283 1284
            # self.assertAtMostOneSuccess(tasks, CreateDbTask) # not really, in
            # case of multiple creation and drops
1285 1286

        if self._curState.canDropDb():
1287
            if gSvcMgr == None: # only if we are running as client-only
S
Steven Li 已提交
1288
                self._curState.assertIfExistThenSuccess(tasks, TaskDropDb)
S
Shuduo Sang 已提交
1289 1290
            # self.assertAtMostOneSuccess(tasks, DropDbTask) # not really in
            # case of drop-create-drop
1291 1292 1293

        # if self._state.canCreateFixedTable():
            # self.assertIfExistThenSuccess(tasks, CreateFixedTableTask) # Not true, DB may be dropped
S
Shuduo Sang 已提交
1294 1295
            # self.assertAtMostOneSuccess(tasks, CreateFixedTableTask) # not
            # really, in case of create-drop-create
1296 1297 1298

        # if self._state.canDropFixedTable():
            # self.assertIfExistThenSuccess(tasks, DropFixedTableTask) # Not True, the whole DB may be dropped
S
Shuduo Sang 已提交
1299 1300
            # self.assertAtMostOneSuccess(tasks, DropFixedTableTask) # not
            # really in case of drop-create-drop
1301 1302

        # if self._state.canAddData():
S
Shuduo Sang 已提交
1303 1304
        # self.assertIfExistThenSuccess(tasks, AddFixedDataTask)  # not true
        # actually
1305 1306 1307 1308

        # if self._state.canReadData():
            # Nothing for sure

1309
        newState = self._findCurrentState(dbc)
1310
        Logging.debug("[STT] New DB state determined: {}".format(newState))
S
Shuduo Sang 已提交
1311 1312
        # can old state move to new state through the tasks?
        self._curState.verifyTasksToState(tasks, newState)
1313 1314 1315
        self._curState = newState

    def pickTaskType(self):
S
Shuduo Sang 已提交
1316 1317
        # all the task types we can choose from at curent state
        taskTypes = self.getTaskTypes()
1318 1319 1320
        weights = []
        for tt in taskTypes:
            endState = tt.getEndState()
S
Shuduo Sang 已提交
1321 1322 1323
            if endState is not None:
                # TODO: change to a method
                weights.append(self._stateWeights[endState.getValIndex()])
1324
            else:
S
Shuduo Sang 已提交
1325 1326
                # read data task, default to 10: TODO: change to a constant
                weights.append(10)
1327
        i = self._weighted_choice_sub(weights)
1328
        # Logging.debug(" (weighted random:{}/{}) ".format(i, len(taskTypes)))
1329 1330
        return taskTypes[i]

S
Shuduo Sang 已提交
1331 1332 1333 1334 1335
    # ref:
    # https://eli.thegreenplace.net/2010/01/22/weighted-random-generation-in-python/
    def _weighted_choice_sub(self, weights):
        # TODO: use our dice to ensure it being determinstic?
        rnd = random.random() * sum(weights)
1336 1337 1338 1339
        for i, w in enumerate(weights):
            rnd -= w
            if rnd < 0:
                return i
1340

1341 1342 1343 1344 1345 1346
class Database:
    ''' We use this to represent an actual TDengine database inside a service instance,
        possibly in a cluster environment.

        For now we use it to manage state transitions in that database
    '''
1347 1348 1349 1350 1351
    _clsLock = threading.Lock() # class wide lock
    _lastInt = 101  # next one is initial integer
    _lastTick = 0
    _lastLaggingTick = 0 # lagging tick, for unsequenced insersions

1352 1353 1354 1355
    def __init__(self, dbNum: int, dbc: DbConn): # TODO: remove dbc
        self._dbNum = dbNum # we assign a number to databases, for our testing purpose
        self._stateMachine = StateMechine(self)
        self._stateMachine.init(dbc)
1356
          
1357
        self._lock = threading.RLock()
S
Shuduo Sang 已提交
1358

1359 1360
    def getStateMachine(self) -> StateMechine:
        return self._stateMachine
S
Shuduo Sang 已提交
1361

1362 1363
    def getDbNum(self):
        return self._dbNum
1364

1365 1366
    def getName(self):
        return "db_{}".format(self._dbNum)
1367

1368 1369 1370 1371 1372 1373
    def filterTasks(self, inTasks: List[Task]): # Pick out those belonging to us
        outTasks = []
        for task in inTasks:
            if task.getDb().isSame(self):
                outTasks.append(task)
        return outTasks
1374

1375 1376 1377 1378 1379
    def isSame(self, other):
        return self._dbNum == other._dbNum

    def exists(self, dbc: DbConn):
        return dbc.existsDatabase(self.getName())
1380

1381 1382 1383 1384 1385 1386 1387
    @classmethod
    def getFixedSuperTableName(cls):
        return "fs_table"

    @classmethod
    def getFixedSuperTable(cls) -> TdSuperTable:
        return TdSuperTable(cls.getFixedSuperTableName())
1388 1389 1390 1391 1392 1393

    # We aim to create a starting time tick, such that, whenever we run our test here once
    # We should be able to safely create 100,000 records, which will not have any repeated time stamp
    # when we re-run the test in 3 minutes (180 seconds), basically we should expand time duration
    # by a factor of 500.
    # TODO: what if it goes beyond 10 years into the future
1394
    # TODO: fix the error as result of above: "tsdb timestamp is out of range"
1395 1396
    @classmethod
    def setupLastTick(cls):
1397
        t1 = datetime.datetime(2020, 6, 1)
1398
        t2 = datetime.datetime.now()
S
Shuduo Sang 已提交
1399 1400 1401 1402
        # maybe a very large number, takes 69 years to exceed Python int range
        elSec = int(t2.timestamp() - t1.timestamp())
        elSec2 = (elSec % (8 * 12 * 30 * 24 * 60 * 60 / 500)) * \
            500  # a number representing seconds within 10 years
1403
        # print("elSec = {}".format(elSec))
S
Shuduo Sang 已提交
1404 1405 1406
        t3 = datetime.datetime(2012, 1, 1)  # default "keep" is 10 years
        t4 = datetime.datetime.fromtimestamp(
            t3.timestamp() + elSec2)  # see explanation above
1407
        Logging.info("Setting up TICKS to start from: {}".format(t4))
1408 1409
        return t4

1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421
    @classmethod
    def getNextTick(cls):        
        with cls._clsLock:  # prevent duplicate tick
            if cls._lastLaggingTick==0:
                # 10k at 1/20 chance, should be enough to avoid overlaps
                cls._lastLaggingTick = cls.setupLastTick() + datetime.timedelta(0, -10000)                 
            if cls._lastTick==0: # should be quite a bit into the future
                cls._lastTick = cls.setupLastTick()  

            if Dice.throw(20) == 0:  # 1 in 20 chance, return lagging tick
                cls._lastLaggingTick += datetime.timedelta(0, 1) # Go back in time 100 seconds
                return cls._lastLaggingTick 
S
Shuduo Sang 已提交
1422 1423
            else:  # regular
                # add one second to it
1424 1425
                cls._lastTick += datetime.timedelta(0, 1)
                return cls._lastTick
1426 1427

    def getNextInt(self):
1428 1429 1430
        with self._lock:
            self._lastInt += 1
            return self._lastInt
1431 1432

    def getNextBinary(self):
S
Shuduo Sang 已提交
1433 1434
        return "Beijing_Shanghai_Los_Angeles_New_York_San_Francisco_Chicago_Beijing_Shanghai_Los_Angeles_New_York_San_Francisco_Chicago_{}".format(
            self.getNextInt())
1435 1436

    def getNextFloat(self):
1437 1438 1439
        ret = 0.9 + self.getNextInt()
        # print("Float obtained: {}".format(ret))
        return ret
S
Shuduo Sang 已提交
1440

1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489

class DbManager():
    ''' This is a wrapper around DbConn(), to make it easier to use. 

        TODO: rename this to DbConnManager
    '''
    def __init__(self):
        self.tableNumQueue = LinearQueue() # TODO: delete?
        # self.openDbServerConnection()
        self._dbConn = DbConn.createNative() if (
            gConfig.connector_type == 'native') else DbConn.createRest()
        try:
            self._dbConn.open()  # may throw taos.error.ProgrammingError: disconnected
        except taos.error.ProgrammingError as err:
            # print("Error type: {}, msg: {}, value: {}".format(type(err), err.msg, err))
            if (err.msg == 'client disconnected'):  # cannot open DB connection
                print(
                    "Cannot establish DB connection, please re-run script without parameter, and follow the instructions.")
                sys.exit(2)
            else:
                print("Failed to connect to DB, errno = {}, msg: {}"
                    .format(Helper.convertErrno(err.errno), err.msg))
                raise
        except BaseException:
            print("[=] Unexpected exception")
            raise

        # Do this after dbConn is in proper shape
        # Moved to Database()
        # self._stateMachine = StateMechine(self._dbConn)

    def getDbConn(self):
        return self._dbConn

    # TODO: not used any more, to delete
    def pickAndAllocateTable(self):  # pick any table, and "use" it
        return self.tableNumQueue.pickAndAllocate()

    # TODO: Not used any more, to delete
    def addTable(self):
        with self._lock:
            tIndex = self.tableNumQueue.push()
        return tIndex

    # Not used any more, to delete
    def releaseTable(self, i):  # return the table back, so others can use it
        self.tableNumQueue.release(i)    

    # TODO: not used any more, delete
S
Steven Li 已提交
1490
    def getTableNameToDelete(self):
S
Shuduo Sang 已提交
1491 1492
        tblNum = self.tableNumQueue.pop()  # TODO: race condition!
        if (not tblNum):  # maybe false
1493
            return False
S
Shuduo Sang 已提交
1494

S
Steven Li 已提交
1495 1496
        return "table_{}".format(tblNum)

1497
    def cleanUp(self):
S
Shuduo Sang 已提交
1498 1499
        self._dbConn.close()

1500
class TaskExecutor():
1501
    class BoundedList:
S
Shuduo Sang 已提交
1502
        def __init__(self, size=10):
1503 1504
            self._size = size
            self._list = []
S
Steven Li 已提交
1505
            self._lock = threading.Lock()
1506

S
Shuduo Sang 已提交
1507
        def add(self, n: int):
S
Steven Li 已提交
1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533
            with self._lock:
                if not self._list:  # empty
                    self._list.append(n)
                    return
                # now we should insert
                nItems = len(self._list)
                insPos = 0
                for i in range(nItems):
                    insPos = i
                    if n <= self._list[i]:  # smaller than this item, time to insert
                        break  # found the insertion point
                    insPos += 1  # insert to the right

                if insPos == 0:  # except for the 1st item, # TODO: elimiate first item as gating item
                    return  # do nothing

                # print("Inserting at postion {}, value: {}".format(insPos, n))
                self._list.insert(insPos, n)  # insert

                newLen = len(self._list)
                if newLen <= self._size:
                    return  # do nothing
                elif newLen == (self._size + 1):
                    del self._list[0]  # remove the first item
                else:
                    raise RuntimeError("Corrupt Bounded List")
1534 1535 1536 1537 1538 1539

        def __str__(self):
            return repr(self._list)

    _boundedList = BoundedList()

1540 1541 1542
    def __init__(self, curStep):
        self._curStep = curStep

1543 1544 1545 1546
    @classmethod
    def getBoundedList(cls):
        return cls._boundedList

1547 1548 1549
    def getCurStep(self):
        return self._curStep

S
Shuduo Sang 已提交
1550
    def execute(self, task: Task, wt: WorkerThread):  # execute a task on a thread
1551
        task.execute(wt)
1552

1553 1554 1555 1556
    def recordDataMark(self, n: int):
        # print("[{}]".format(n), end="", flush=True)
        self._boundedList.add(n)

1557
    # def logInfo(self, msg):
1558
    #     Logging.info("    T[{}.x]: ".format(self._curStep) + msg)
1559

1560
    # def logDebug(self, msg):
1561
    #     Logging.debug("    T[{}.x]: ".format(self._curStep) + msg)
1562

S
Shuduo Sang 已提交
1563

S
Steven Li 已提交
1564
class Task():
1565 1566 1567 1568
    ''' A generic "Task" to be executed. For now we decide that there is no
        need to embed a DB connection here, we use whatever the Worker Thread has
        instead. But a task is always associated with a DB
    '''
1569 1570 1571 1572
    taskSn = 100

    @classmethod
    def allocTaskNum(cls):
S
Shuduo Sang 已提交
1573
        Task.taskSn += 1  # IMPORTANT: cannot use cls.taskSn, since each sub class will have a copy
1574
        # Logging.debug("Allocating taskSN: {}".format(Task.taskSn))
S
Steven Li 已提交
1575
        return Task.taskSn
1576

1577
    def __init__(self, execStats: ExecutionStats, db: Database):
S
Shuduo Sang 已提交
1578
        self._workerThread = None
1579
        self._err = None # type: Exception
1580
        self._aborted = False
1581
        self._curStep = None
S
Shuduo Sang 已提交
1582
        self._numRows = None  # Number of rows affected
1583

S
Shuduo Sang 已提交
1584
        # Assign an incremental task serial number
1585
        self._taskNum = self.allocTaskNum()
1586
        # Logging.debug("Creating new task {}...".format(self._taskNum))
1587

1588
        self._execStats = execStats
1589
        self._db = db # A task is always associated/for a specific DB
1590

1591
    def isSuccess(self):
S
Shuduo Sang 已提交
1592
        return self._err is None
1593

1594 1595 1596
    def isAborted(self):
        return self._aborted

S
Shuduo Sang 已提交
1597
    def clone(self):  # TODO: why do we need this again?
1598
        newTask = self.__class__(self._execStats, self._db)
1599 1600
        return newTask

1601 1602 1603
    def getDb(self):
        return self._db

1604
    def logDebug(self, msg):
S
Shuduo Sang 已提交
1605 1606 1607
        self._workerThread.logDebug(
            "Step[{}.{}] {}".format(
                self._curStep, self._taskNum, msg))
1608 1609

    def logInfo(self, msg):
S
Shuduo Sang 已提交
1610 1611 1612
        self._workerThread.logInfo(
            "Step[{}.{}] {}".format(
                self._curStep, self._taskNum, msg))
1613

1614
    def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
S
Shuduo Sang 已提交
1615 1616 1617
        raise RuntimeError(
            "To be implemeted by child classes, class name: {}".format(
                self.__class__.__name__))
1618

1619 1620 1621
    def _isErrAcceptable(self, errno, msg):
        if errno in [
                0x05,  # TSDB_CODE_RPC_NOT_READY
1622
                0x0B,  # Unable to establish connection, more details in TD-1648
1623
                # 0x200, # invalid SQL, TODO: re-examine with TD-934
1624 1625
                0x217, # "db not selected", client side defined error code
                0x218, # "Table does not exist" client side defined error code
1626 1627 1628 1629 1630 1631 1632 1633 1634
                0x360, 0x362, 
                0x369, # tag already exists
                0x36A, 0x36B, 0x36D,
                0x381, 
                0x380, # "db not selected"
                0x383,
                0x386,  # DB is being dropped?!
                0x503,
                0x510,  # vnode not in ready state
1635
                0x14,   # db not ready, errno changed
1636
                0x600,  # Invalid table ID, why?
1637 1638 1639
                1000  # REST catch-all error
            ]: 
            return True # These are the ALWAYS-ACCEPTABLE ones
1640 1641
        elif (errno in [ 0x0B ]) and gConfig.auto_start_service:
            return True # We may get "network unavilable" when restarting service
1642 1643 1644
        elif errno == 0x200 : # invalid SQL, we need to div in a bit more
            if msg.find("invalid column name") != -1:
                return True 
1645 1646 1647 1648
            elif msg.find("tags number not matched") != -1: # mismatched tags after modification
                return True
            elif msg.find("duplicated column names") != -1: # also alter table tag issues
                return True
1649
        elif gSvcMgr and (not gSvcMgr.isStable()): # We are managing service, and ...
1650
            Logging.info("Ignoring error when service starting/stopping: errno = {}, msg = {}".format(errno, msg))
S
Steven Li 已提交
1651
            return True
1652 1653 1654 1655
        
        return False # Not an acceptable error


1656 1657
    def execute(self, wt: WorkerThread):
        wt.verifyThreadSelf()
S
Shuduo Sang 已提交
1658
        self._workerThread = wt  # type: ignore
1659 1660

        te = wt.getTaskExecutor()
1661
        self._curStep = te.getCurStep()
S
Shuduo Sang 已提交
1662 1663
        self.logDebug(
            "[-] executing task {}...".format(self.__class__.__name__))
1664

1665
        self._err = None # TODO: type hint mess up?
1666 1667
        self._execStats.beginTaskType(self.__class__.__name__)  # mark beginning
        errno2 = None
1668 1669 1670

        # Now pick a database, and stick with it for the duration of the task execution
        dbName = self._db.getName()
1671
        try:
S
Shuduo Sang 已提交
1672
            self._executeInternal(te, wt)  # TODO: no return value?
1673
        except taos.error.ProgrammingError as err:
1674
            errno2 = Helper.convertErrno(err.errno)
1675
            if (gConfig.continue_on_exception):  # user choose to continue
1676
                self.logDebug("[=] Continue after TAOS exception: errno=0x{:X}, msg: {}, SQL: {}".format(
1677
                        errno2, err, wt.getDbConn().getLastSql()))
1678
                self._err = err
1679 1680
            elif self._isErrAcceptable(errno2, err.__str__()):
                self.logDebug("[=] Acceptable Taos library exception: errno=0x{:X}, msg: {}, SQL: {}".format(
1681
                        errno2, err, wt.getDbConn().getLastSql()))
1682
                print("_", end="", flush=True)
S
Shuduo Sang 已提交
1683
                self._err = err
1684
            else: # not an acceptable error
1685 1686 1687
                errMsg = "[=] Unexpected Taos library exception ({}): errno=0x{:X}, msg: {}, SQL: {}".format(
                    self.__class__.__name__,
                    errno2, err, wt.getDbConn().getLastSql())
1688
                self.logDebug(errMsg)
S
Shuduo Sang 已提交
1689
                if gConfig.debug:
1690 1691
                    # raise # so that we see full stack
                    traceback.print_exc()
1692 1693
                print(
                    "\n\n----------------------------\nProgram ABORTED Due to Unexpected TAOS Error: \n\n{}\n".format(errMsg) +
1694 1695 1696 1697
                    "----------------------------\n")
                # sys.exit(-1)
                self._err = err
                self._aborted = True
S
Shuduo Sang 已提交
1698
        except Exception as e:
S
Steven Li 已提交
1699
            self.logInfo("Non-TAOS exception encountered")
S
Shuduo Sang 已提交
1700
            self._err = e
S
Steven Li 已提交
1701
            self._aborted = True
1702
            traceback.print_exc()
1703
        except BaseException as e:
1704
            self.logInfo("Python base exception encountered")
1705
            self._err = e
1706
            self._aborted = True
S
Steven Li 已提交
1707
            traceback.print_exc()
1708
        except BaseException: # TODO: what is this again??!!
S
Shuduo Sang 已提交
1709 1710
            self.logDebug(
                "[=] Unexpected exception, SQL: {}".format(
1711
                    wt.getDbConn().getLastSql()))
1712
            raise
1713
        self._execStats.endTaskType(self.__class__.__name__, self.isSuccess())
S
Shuduo Sang 已提交
1714 1715 1716 1717

        self.logDebug("[X] task execution completed, {}, status: {}".format(
            self.__class__.__name__, "Success" if self.isSuccess() else "Failure"))
        # TODO: merge with above.
1718
        self._execStats.incExecCount(self.__class__.__name__, self.isSuccess(), errno2)
S
Steven Li 已提交
1719

1720
    # TODO: refactor away, just provide the dbConn
S
Shuduo Sang 已提交
1721
    def execWtSql(self, wt: WorkerThread, sql):  # execute an SQL on the worker thread
1722
        """ Haha """
1723 1724
        return wt.execSql(sql)

S
Shuduo Sang 已提交
1725
    def queryWtSql(self, wt: WorkerThread, sql):  # execute an SQL on the worker thread
1726 1727
        return wt.querySql(sql)

S
Shuduo Sang 已提交
1728
    def getQueryResult(self, wt: WorkerThread):  # execute an SQL on the worker thread
1729 1730 1731
        return wt.getQueryResult()


1732
class ExecutionStats:
1733
    def __init__(self):
S
Shuduo Sang 已提交
1734 1735
        # total/success times for a task
        self._execTimes: Dict[str, [int, int]] = {}
1736 1737 1738
        self._tasksInProgress = 0
        self._lock = threading.Lock()
        self._firstTaskStartTime = None
1739
        self._execStartTime = None
1740
        self._errors = {}
S
Shuduo Sang 已提交
1741 1742
        self._elapsedTime = 0.0  # total elapsed time
        self._accRunTime = 0.0  # accumulated run time
1743

1744 1745 1746
        self._failed = False
        self._failureReason = None

S
Steven Li 已提交
1747
    def __str__(self):
S
Shuduo Sang 已提交
1748 1749
        return "[ExecStats: _failed={}, _failureReason={}".format(
            self._failed, self._failureReason)
S
Steven Li 已提交
1750 1751

    def isFailed(self):
S
Shuduo Sang 已提交
1752
        return self._failed
S
Steven Li 已提交
1753

1754 1755 1756 1757 1758 1759
    def startExec(self):
        self._execStartTime = time.time()

    def endExec(self):
        self._elapsedTime = time.time() - self._execStartTime

1760
    def incExecCount(self, klassName, isSuccess, eno=None):  # TODO: add a lock here
1761 1762
        if klassName not in self._execTimes:
            self._execTimes[klassName] = [0, 0]
S
Shuduo Sang 已提交
1763 1764
        t = self._execTimes[klassName]  # tuple for the data
        t[0] += 1  # index 0 has the "total" execution times
1765
        if isSuccess:
S
Shuduo Sang 已提交
1766
            t[1] += 1  # index 1 has the "success" execution times
1767 1768 1769 1770 1771
        if eno != None:             
            if klassName not in self._errors:
                self._errors[klassName] = {}
            errors = self._errors[klassName]
            errors[eno] = errors[eno]+1 if eno in errors else 1
1772 1773 1774

    def beginTaskType(self, klassName):
        with self._lock:
S
Shuduo Sang 已提交
1775 1776
            if self._tasksInProgress == 0:  # starting a new round
                self._firstTaskStartTime = time.time()  # I am now the first task
1777 1778 1779 1780 1781
            self._tasksInProgress += 1

    def endTaskType(self, klassName, isSuccess):
        with self._lock:
            self._tasksInProgress -= 1
S
Shuduo Sang 已提交
1782
            if self._tasksInProgress == 0:  # all tasks have stopped
1783 1784 1785
                self._accRunTime += (time.time() - self._firstTaskStartTime)
                self._firstTaskStartTime = None

1786 1787 1788 1789
    def registerFailure(self, reason):
        self._failed = True
        self._failureReason = reason

1790
    def printStats(self):
1791
        Logging.info(
S
Shuduo Sang 已提交
1792
            "----------------------------------------------------------------------")
1793
        Logging.info(
S
Shuduo Sang 已提交
1794 1795 1796
            "| Crash_Gen test {}, with the following stats:". format(
                "FAILED (reason: {})".format(
                    self._failureReason) if self._failed else "SUCCEEDED"))
1797
        Logging.info("| Task Execution Times (success/total):")
1798
        execTimesAny = 0.0
S
Shuduo Sang 已提交
1799
        for k, n in self._execTimes.items():
1800
            execTimesAny += n[0]
1801 1802 1803 1804 1805 1806 1807
            errStr = None
            if k in self._errors:
                errors = self._errors[k]
                # print("errors = {}".format(errors))
                errStrs = ["0x{:X}:{}".format(eno, n) for (eno, n) in errors.items()]
                # print("error strings = {}".format(errStrs))
                errStr = ", ".join(errStrs) 
1808
            Logging.info("|    {0:<24}: {1}/{2} (Errors: {3})".format(k, n[1], n[0], errStr))
S
Shuduo Sang 已提交
1809

1810
        Logging.info(
S
Shuduo Sang 已提交
1811
            "| Total Tasks Executed (success or not): {} ".format(execTimesAny))
1812
        Logging.info(
S
Shuduo Sang 已提交
1813 1814
            "| Total Tasks In Progress at End: {}".format(
                self._tasksInProgress))
1815
        Logging.info(
S
Shuduo Sang 已提交
1816 1817
            "| Total Task Busy Time (elapsed time when any task is in progress): {:.3f} seconds".format(
                self._accRunTime))
1818
        Logging.info(
S
Shuduo Sang 已提交
1819
            "| Average Per-Task Execution Time: {:.3f} seconds".format(self._accRunTime / execTimesAny))
1820
        Logging.info(
S
Shuduo Sang 已提交
1821 1822
            "| Total Elapsed Time (from wall clock): {:.3f} seconds".format(
                self._elapsedTime))
1823 1824 1825
        Logging.info("| Top numbers written: {}".format(TaskExecutor.getBoundedList()))
        Logging.info("| Active DB Native Connections (now): {}".format(DbConnNative.totalConnections))
        Logging.info("| Longest native query time: {:.3f} seconds, started: {}".
1826 1827
            format(MyTDSql.longestQueryTime, 
                time.strftime("%x %X", time.localtime(MyTDSql.lqStartTime))) )
1828 1829
        Logging.info("| Longest native query: {}".format(MyTDSql.longestQuery))
        Logging.info(
S
Shuduo Sang 已提交
1830
            "----------------------------------------------------------------------")
1831 1832 1833


class StateTransitionTask(Task):
1834 1835 1836 1837 1838
    LARGE_NUMBER_OF_TABLES = 35
    SMALL_NUMBER_OF_TABLES = 3
    LARGE_NUMBER_OF_RECORDS = 50
    SMALL_NUMBER_OF_RECORDS = 3

1839 1840 1841 1842
    _baseTableNumber = None

    _endState = None

1843
    @classmethod
S
Shuduo Sang 已提交
1844
    def getInfo(cls):  # each sub class should supply their own information
1845
        raise RuntimeError("Overriding method expected")
1846
    
1847
    @classmethod
S
Shuduo Sang 已提交
1848
    def getEndState(cls):  # TODO: optimize by calling it fewer times
1849 1850
        raise RuntimeError("Overriding method expected")

1851 1852 1853
    # @classmethod
    # def getBeginStates(cls):
    #     return cls.getInfo()[0]
1854

1855 1856 1857
    # @classmethod
    # def getEndState(cls): # returning the class name
    #     return cls.getInfo()[0]
1858 1859

    @classmethod
1860 1861 1862
    def canBeginFrom(cls, state: AnyState):
        # return state.getValue() in cls.getBeginStates()
        raise RuntimeError("must be overriden")
1863

1864 1865
    @classmethod
    def getRegTableName(cls, i):
1866
        if ( StateTransitionTask._baseTableNumber is None):
S
Steven Li 已提交
1867 1868
            StateTransitionTask._baseTableNumber = Dice.throw(
                999) if gConfig.dynamic_db_table_names else 0
1869
        return "reg_table_{}".format(StateTransitionTask._baseTableNumber + i)
1870

1871 1872
    def execute(self, wt: WorkerThread):
        super().execute(wt)
S
Shuduo Sang 已提交
1873 1874


1875
class TaskCreateDb(StateTransitionTask):
1876
    @classmethod
1877
    def getEndState(cls):
S
Shuduo Sang 已提交
1878
        return StateDbOnly()
1879

1880 1881 1882 1883
    @classmethod
    def canBeginFrom(cls, state: AnyState):
        return state.canCreateDb()

1884
    # Actually creating the database(es)
1885
    def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
1886
        # was: self.execWtSql(wt, "create database db")
1887 1888 1889 1890 1891 1892
        repStr = ""
        if gConfig.max_replicas != 1:
            numReplica = Dice.throw(gConfig.max_replicas) + 1 # 1,2 ... N
            repStr = "replica {}".format(numReplica)
        self.execWtSql(wt, "create database {} {}"
            .format(self._db.getName(), repStr) )
1893

1894
class TaskDropDb(StateTransitionTask):
1895
    @classmethod
1896 1897
    def getEndState(cls):
        return StateEmpty()
1898

1899 1900 1901 1902
    @classmethod
    def canBeginFrom(cls, state: AnyState):
        return state.canDropDb()

1903
    def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
1904
        self.execWtSql(wt, "drop database {}".format(self._db.getName()))
1905
        Logging.debug("[OPS] database dropped at {}".format(time.time()))
1906

1907
class TaskCreateSuperTable(StateTransitionTask):
1908
    @classmethod
1909 1910
    def getEndState(cls):
        return StateSuperTableOnly()
1911

1912 1913
    @classmethod
    def canBeginFrom(cls, state: AnyState):
1914
        return state.canCreateFixedSuperTable()
1915

1916
    def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
1917
        if not self._db.exists(wt.getDbConn()):
1918
            Logging.debug("Skipping task, no DB yet")
1919 1920
            return

1921
        sTable = self._db.getFixedSuperTable() # type: TdSuperTable
1922
        # wt.execSql("use db")    # should always be in place
1923 1924
        sTable.create(wt.getDbConn(), self._db.getName(), 
            {'ts':'timestamp', 'speed':'int'}, {'b':'binary(200)', 'f':'float'})
1925
        # self.execWtSql(wt,"create table db.{} (ts timestamp, speed int) tags (b binary(200), f float) ".format(tblName))
S
Shuduo Sang 已提交
1926 1927
        # No need to create the regular tables, INSERT will do that
        # automatically
1928

S
Steven Li 已提交
1929

1930 1931 1932 1933
class TdSuperTable:
    def __init__(self, stName):
        self._stName = stName

1934 1935 1936
    def getName(self):
        return self._stName

1937 1938 1939 1940 1941
    # TODO: odd semantic, create() method is usually static?
    def create(self, dbc, dbName, cols: dict, tags: dict):
        '''Creating a super table'''
        sql = "CREATE TABLE {}.{} ({}) TAGS ({})".format(
            dbName,
1942 1943 1944 1945 1946 1947
            self._stName,
            ",".join(['%s %s'%(k,v) for (k,v) in cols.items()]),
            ",".join(['%s %s'%(k,v) for (k,v) in tags.items()])
            )
        dbc.execute(sql)        

1948
    def getRegTables(self, dbc: DbConn, dbName: str):
1949
        try:
1950
            dbc.query("select TBNAME from {}.{}".format(dbName, self._stName))  # TODO: analyze result set later            
1951
        except taos.error.ProgrammingError as err:                    
1952
            errno2 = Helper.convertErrno(err.errno) 
1953
            Logging.debug("[=] Failed to get tables from super table: errno=0x{:X}, msg: {}".format(errno2, err))
1954 1955 1956 1957 1958
            raise

        qr = dbc.getQueryResult()
        return [v[0] for v in qr] # list transformation, ref: https://stackoverflow.com/questions/643823/python-list-transformation

1959 1960
    def hasRegTables(self, dbc: DbConn, dbName: str):
        return dbc.query("SELECT * FROM {}.{}".format(dbName, self._stName)) > 0
1961

1962 1963
    def ensureTable(self, dbc: DbConn, dbName: str, regTableName: str):
        sql = "select tbname from {}.{} where tbname in ('{}')".format(dbName, self._stName, regTableName)
1964 1965
        if dbc.query(sql) >= 1 : # reg table exists already
            return
1966 1967
        sql = "CREATE TABLE {}.{} USING {}.{} tags ({})".format(
            dbName, regTableName, dbName, self._stName, self._getTagStrForSql(dbc, dbName)
1968 1969 1970
        )
        dbc.execute(sql)

1971 1972
    def _getTagStrForSql(self, dbc, dbName: str) :
        tags = self._getTags(dbc, dbName)
1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985
        tagStrs = []
        for tagName in tags: 
            tagType = tags[tagName]
            if tagType == 'BINARY':
                tagStrs.append("'Beijing-Shanghai-LosAngeles'")
            elif tagType == 'FLOAT':
                tagStrs.append('9.9')
            elif tagType == 'INT':
                tagStrs.append('88')
            else:
                raise RuntimeError("Unexpected tag type: {}".format(tagType))
        return ", ".join(tagStrs)

1986 1987
    def _getTags(self, dbc, dbName) -> dict:
        dbc.query("DESCRIBE {}.{}".format(dbName, self._stName))
1988 1989 1990 1991 1992 1993
        stCols = dbc.getQueryResult()
        # print(stCols)
        ret = {row[0]:row[1] for row in stCols if row[3]=='TAG'} # name:type
        # print("Tags retrieved: {}".format(ret))
        return ret

1994 1995
    def addTag(self, dbc, dbName, tagName, tagType):
        if tagName in self._getTags(dbc, dbName): # already 
1996 1997
            return
        # sTable.addTag("extraTag", "int")
1998
        sql = "alter table {}.{} add tag {} {}".format(dbName, self._stName, tagName, tagType)
1999 2000
        dbc.execute(sql)

2001 2002
    def dropTag(self, dbc, dbName, tagName):
        if not tagName in self._getTags(dbc, dbName): # don't have this tag
2003
            return
2004
        sql = "alter table {}.{} drop tag {}".format(dbName, self._stName, tagName)
2005 2006
        dbc.execute(sql)

2007 2008
    def changeTag(self, dbc, dbName, oldTag, newTag):
        tags = self._getTags(dbc, dbName)
2009 2010 2011 2012
        if not oldTag in tags: # don't have this tag
            return
        if newTag in tags: # already have this tag
            return
2013
        sql = "alter table {}.{} change tag {} {}".format(dbName, self._stName, oldTag, newTag)
2014 2015
        dbc.execute(sql)

2016
class TaskReadData(StateTransitionTask):
2017
    @classmethod
2018
    def getEndState(cls):
S
Shuduo Sang 已提交
2019
        return None  # meaning doesn't affect state
2020

2021 2022 2023 2024
    @classmethod
    def canBeginFrom(cls, state: AnyState):
        return state.canReadData()

2025
    def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
2026
        sTable = self._db.getFixedSuperTable()
2027

2028 2029
        # 1 in 5 chance, simulate a broken connection. 
        if random.randrange(5) == 0:  # TODO: break connection in all situations
2030 2031
            wt.getDbConn().close()
            wt.getDbConn().open()
2032
            print("_r", end="", flush=True)
2033
        
2034
        dbc = wt.getDbConn()
2035 2036
        dbName = self._db.getName()
        for rTbName in sTable.getRegTables(dbc, dbName):  # regular tables
2037
            aggExpr = Dice.choice([
2038 2039 2040
                '*',
                'count(*)',
                'avg(speed)',
2041
                # 'twa(speed)', # TODO: this one REQUIRES a where statement, not reasonable
2042 2043
                'sum(speed)', 
                'stddev(speed)', 
2044
                # SELECTOR functions
2045 2046 2047
                'min(speed)', 
                'max(speed)', 
                'first(speed)', 
2048
                'last(speed)',
2049 2050 2051
                'top(speed, 50)', # TODO: not supported?
                'bottom(speed, 50)', # TODO: not supported?
                'apercentile(speed, 10)', # TODO: TD-1316
2052 2053 2054 2055 2056
                'last_row(speed)',
                # Transformation Functions
                # 'diff(speed)', # TODO: no supported?!
                'spread(speed)'
                ]) # TODO: add more from 'top'
2057 2058 2059
            filterExpr = Dice.choice([ # TODO: add various kind of WHERE conditions
                None
            ])
2060
            try:
2061
                # Run the query against the regular table first
2062
                dbc.execute("select {} from {}.{}".format(aggExpr, dbName, rTbName))
2063
                # Then run it against the super table
2064
                if aggExpr not in ['stddev(speed)']: #TODO: STDDEV not valid for super tables?!
2065
                    dbc.execute("select {} from {}.{}".format(aggExpr, dbName, sTable.getName()))
2066
            except taos.error.ProgrammingError as err:                    
2067
                errno2 = Helper.convertErrno(err.errno)
2068
                Logging.debug("[=] Read Failure: errno=0x{:X}, msg: {}, SQL: {}".format(errno2, err, dbc.getLastSql()))
2069
                raise
S
Shuduo Sang 已提交
2070

2071
class TaskDropSuperTable(StateTransitionTask):
2072
    @classmethod
2073
    def getEndState(cls):
S
Shuduo Sang 已提交
2074
        return StateDbOnly()
2075

2076 2077
    @classmethod
    def canBeginFrom(cls, state: AnyState):
2078
        return state.canDropFixedSuperTable()
2079

2080
    def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
2081
        # 1/2 chance, we'll drop the regular tables one by one, in a randomized sequence
S
Shuduo Sang 已提交
2082
        if Dice.throw(2) == 0:
2083
            # print("_7_", end="", flush=True)
S
Shuduo Sang 已提交
2084 2085 2086 2087
            tblSeq = list(range(
                2 + (self.LARGE_NUMBER_OF_TABLES if gConfig.larger_data else self.SMALL_NUMBER_OF_TABLES)))
            random.shuffle(tblSeq)
            tickOutput = False  # if we have spitted out a "d" character for "drop regular table"
2088
            isSuccess = True
S
Shuduo Sang 已提交
2089
            for i in tblSeq:
2090
                regTableName = self.getRegTableName(i)  # "db.reg_table_{}".format(i)
2091
                try:
2092 2093
                    self.execWtSql(wt, "drop table {}.{}".
                        format(self._db.getName(), regTableName))  # nRows always 0, like MySQL
S
Shuduo Sang 已提交
2094
                except taos.error.ProgrammingError as err:
2095 2096
                    # correcting for strange error number scheme                    
                    errno2 = Helper.convertErrno(err.errno)
S
Shuduo Sang 已提交
2097
                    if (errno2 in [0x362]):  # mnode invalid table name
2098
                        isSuccess = False
2099
                        Logging.debug("[DB] Acceptable error when dropping a table")
S
Shuduo Sang 已提交
2100
                    continue  # try to delete next regular table
2101 2102

                if (not tickOutput):
S
Shuduo Sang 已提交
2103 2104
                    tickOutput = True  # Print only one time
                    if isSuccess:
2105 2106
                        print("d", end="", flush=True)
                    else:
S
Shuduo Sang 已提交
2107
                        print("f", end="", flush=True)
2108 2109

        # Drop the super table itself
2110 2111
        tblName = self._db.getFixedSuperTableName()
        self.execWtSql(wt, "drop table {}.{}".format(self._db.getName(), tblName))
2112

S
Shuduo Sang 已提交
2113

2114 2115 2116
class TaskAlterTags(StateTransitionTask):
    @classmethod
    def getEndState(cls):
S
Shuduo Sang 已提交
2117
        return None  # meaning doesn't affect state
2118 2119 2120

    @classmethod
    def canBeginFrom(cls, state: AnyState):
S
Shuduo Sang 已提交
2121
        return state.canDropFixedSuperTable()  # if we can drop it, we can alter tags
2122 2123

    def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
2124 2125
        # tblName = self._dbManager.getFixedSuperTableName()
        dbc = wt.getDbConn()
2126 2127
        sTable = self._db.getFixedSuperTable()
        dbName = self._db.getName()
2128
        dice = Dice.throw(4)
S
Shuduo Sang 已提交
2129
        if dice == 0:
2130
            sTable.addTag(dbc, dbName, "extraTag", "int")
2131
            # sql = "alter table db.{} add tag extraTag int".format(tblName)
S
Shuduo Sang 已提交
2132
        elif dice == 1:
2133
            sTable.dropTag(dbc, dbName, "extraTag")
2134
            # sql = "alter table db.{} drop tag extraTag".format(tblName)
S
Shuduo Sang 已提交
2135
        elif dice == 2:
2136
            sTable.dropTag(dbc, dbName, "newTag")
2137
            # sql = "alter table db.{} drop tag newTag".format(tblName)
S
Shuduo Sang 已提交
2138
        else:  # dice == 3
2139
            sTable.changeTag(dbc, dbName, "extraTag", "newTag")
2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155
            # sql = "alter table db.{} change tag extraTag newTag".format(tblName)

class TaskRestartService(StateTransitionTask):
    _isRunning = False
    _classLock = threading.Lock()

    @classmethod
    def getEndState(cls):
        return None  # meaning doesn't affect state

    @classmethod
    def canBeginFrom(cls, state: AnyState):
        if gConfig.auto_start_service:
            return state.canDropFixedSuperTable()  # Basicallly when we have the super table
        return False # don't run this otherwise

2156
    CHANCE_TO_RESTART_SERVICE = 200
2157 2158 2159 2160
    def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
        if not gConfig.auto_start_service: # only execute when we are in -a mode
            print("_a", end="", flush=True)
            return
2161

2162 2163 2164 2165 2166 2167
        with self._classLock:
            if self._isRunning:
                print("Skipping restart task, another running already")
                return
            self._isRunning = True

2168
        if Dice.throw(self.CHANCE_TO_RESTART_SERVICE) == 0: # 1 in N chance
2169 2170 2171
            dbc = wt.getDbConn()
            dbc.execute("show databases") # simple delay, align timing with other workers
            gSvcMgr.restart()
2172

2173
        self._isRunning = False
S
Shuduo Sang 已提交
2174

2175
class TaskAddData(StateTransitionTask):
S
Shuduo Sang 已提交
2176 2177
    # Track which table is being actively worked on
    activeTable: Set[int] = set()
2178

2179 2180 2181
    # We use these two files to record operations to DB, useful for power-off tests
    fAddLogReady = None # type: TextIOWrapper
    fAddLogDone  = None # type: TextIOWrapper
2182 2183 2184

    @classmethod
    def prepToRecordOps(cls):
S
Shuduo Sang 已提交
2185 2186
        if gConfig.record_ops:
            if (cls.fAddLogReady is None):
2187
                Logging.info(
S
Shuduo Sang 已提交
2188
                    "Recording in a file operations to be performed...")
2189
                cls.fAddLogReady = open("add_log_ready.txt", "w")
S
Shuduo Sang 已提交
2190
            if (cls.fAddLogDone is None):
2191
                Logging.info("Recording in a file operations completed...")
2192
                cls.fAddLogDone = open("add_log_done.txt", "w")
2193

2194
    @classmethod
2195 2196
    def getEndState(cls):
        return StateHasData()
2197 2198 2199 2200

    @classmethod
    def canBeginFrom(cls, state: AnyState):
        return state.canAddData()
S
Shuduo Sang 已提交
2201

2202
    def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
2203 2204
        # ds = self._dbManager # Quite DANGEROUS here, may result in multi-thread client access
        db = self._db
2205
        dbc = wt.getDbConn()
2206
        tblSeq = list(range(
S
Shuduo Sang 已提交
2207 2208 2209 2210
                self.LARGE_NUMBER_OF_TABLES if gConfig.larger_data else self.SMALL_NUMBER_OF_TABLES))
        random.shuffle(tblSeq)
        for i in tblSeq:
            if (i in self.activeTable):  # wow already active
2211
                print("x", end="", flush=True) # concurrent insertion
2212
            else:
S
Shuduo Sang 已提交
2213
                self.activeTable.add(i)  # marking it active
2214
            
2215
            sTable = db.getFixedSuperTable()
2216
            regTableName = self.getRegTableName(i)  # "db.reg_table_{}".format(i)
2217
            sTable.ensureTable(wt.getDbConn(), db.getName(), regTableName)  # Ensure the table exists           
2218 2219
           
            for j in range(self.LARGE_NUMBER_OF_RECORDS if gConfig.larger_data else self.SMALL_NUMBER_OF_RECORDS):  # number of records per table
2220
                nextInt = db.getNextInt()
2221
                nextTick = db.getNextTick()
2222 2223
                if gConfig.record_ops:
                    self.prepToRecordOps()
2224
                    self.fAddLogReady.write("Ready to write {} to {}\n".format(nextInt, regTableName))
2225 2226
                    self.fAddLogReady.flush()
                    os.fsync(self.fAddLogReady)
2227 2228
                sql = "insert into {}.{} values ('{}', {});".format( # removed: tags ('{}', {})
                    db.getName(),
S
Shuduo Sang 已提交
2229
                    regTableName,
2230 2231
                    # ds.getFixedSuperTableName(),
                    # ds.getNextBinary(), ds.getNextFloat(),
2232 2233
                    nextTick, nextInt)
                dbc.execute(sql)
S
Shuduo Sang 已提交
2234 2235
                # Successfully wrote the data into the DB, let's record it
                # somehow
2236
                te.recordDataMark(nextInt)
2237
                if gConfig.record_ops:
S
Shuduo Sang 已提交
2238 2239 2240
                    self.fAddLogDone.write(
                        "Wrote {} to {}\n".format(
                            nextInt, regTableName))
2241 2242
                    self.fAddLogDone.flush()
                    os.fsync(self.fAddLogDone)
2243 2244

                # Now read it back and verify, we might encounter an error if table is dropped
2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261
                if gConfig.verify_data: # only if command line asks for it
                    try:
                        readBack = dbc.queryScalar("SELECT speed from {}.{} WHERE ts= '{}'".
                            format(db.getName(), regTableName, nextTick))
                        if readBack != nextInt :
                            raise taos.error.ProgrammingError(
                                "Failed to read back same data, wrote: {}, read: {}"
                                .format(nextInt, readBack), 0x999)
                    except taos.error.ProgrammingError as err:
                        errno = Helper.convertErrno(err.errno)
                        if errno in [0x991, 0x992]  : # not a single result
                            raise taos.error.ProgrammingError(
                                "Failed to read back same data for tick: {}, wrote: {}, read: {}"
                                .format(nextTick, nextInt, "Empty Result" if errno==0x991 else "Multiple Result"),
                                errno)
                        # Re-throw no matter what
                        raise
2262 2263
                

S
Shuduo Sang 已提交
2264
            self.activeTable.discard(i)  # not raising an error, unlike remove
2265 2266


2267

2268

2269

2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297
class ThreadStacks: # stack info for all threads
    def __init__(self):
        self._allStacks = {}
        allFrames = sys._current_frames()
        for th in threading.enumerate():                        
            stack = traceback.extract_stack(allFrames[th.ident])     
            self._allStacks[th.native_id] = stack

    def print(self, filteredEndName = None, filterInternal = False):
        for thNid, stack in self._allStacks.items(): # for each thread            
            lastFrame = stack[-1]
            if filteredEndName: # we need to filter out stacks that match this name                
                if lastFrame.name == filteredEndName : # end did not match
                    continue
            if filterInternal:
                if lastFrame.name in ['wait', 'invoke_excepthook', 
                    '_wait', # The Barrier exception
                    'svcOutputReader', # the svcMgr thread
                    '__init__']: # the thread that extracted the stack
                    continue # ignore
            # Now print
            print("\n<----- Thread Info for ID: {}".format(thNid))
            for frame in stack:
                # print(frame)
                print("File {filename}, line {lineno}, in {name}".format(
                    filename=frame.filename, lineno=frame.lineno, name=frame.name))
                print("    {}".format(frame.line))
            print("-----> End of Thread Info\n")
S
Shuduo Sang 已提交
2298

2299 2300 2301
class ClientManager:
    def __init__(self):
        print("Starting service manager")
2302 2303
        # signal.signal(signal.SIGTERM, self.sigIntHandler)
        # signal.signal(signal.SIGINT, self.sigIntHandler)
2304

2305
        self._status = Status.STATUS_RUNNING
2306 2307
        self.tc = None

2308 2309
        self.inSigHandler = False

2310
    def sigIntHandler(self, signalNumber, frame):
2311
        if self._status != Status.STATUS_RUNNING:
2312 2313 2314
            print("Repeated SIGINT received, forced exit...")
            # return  # do nothing if it's already not running
            sys.exit(-1)
2315
        self._status = Status.STATUS_STOPPING  # immediately set our status
2316

2317
        print("ClientManager: Terminating program...")
2318 2319
        self.tc.requestToStop()

2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360
    def _doMenu(self):
        choice = ""
        while True:
            print("\nInterrupting Client Program, Choose an Action: ")
            print("1: Resume")
            print("2: Terminate")
            print("3: Show Threads")
            # Remember to update the if range below
            # print("Enter Choice: ", end="", flush=True)
            while choice == "":
                choice = input("Enter Choice: ")
                if choice != "":
                    break  # done with reading repeated input
            if choice in ["1", "2", "3"]:
                break  # we are done with whole method
            print("Invalid choice, please try again.")
            choice = ""  # reset
        return choice

    def sigUsrHandler(self, signalNumber, frame):
        print("Interrupting main thread execution upon SIGUSR1")
        if self.inSigHandler:  # already
            print("Ignoring repeated SIG_USR1...")
            return  # do nothing if it's already not running
        self.inSigHandler = True

        choice = self._doMenu()
        if choice == "1":
            print("Resuming execution...")
            time.sleep(1.0)
        elif choice == "2":
            print("Not implemented yet")
            time.sleep(1.0)
        elif choice == "3":
            ts = ThreadStacks()
            ts.print()
        else:
            raise RuntimeError("Invalid menu choice: {}".format(choice))

        self.inSigHandler = False

2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389
    # TODO: need to revise how we verify data durability
    # def _printLastNumbers(self):  # to verify data durability
    #     dbManager = DbManager()
    #     dbc = dbManager.getDbConn()
    #     if dbc.query("show databases") <= 1:  # no database (we have a default called "log")
    #         return
    #     dbc.execute("use db")
    #     if dbc.query("show tables") == 0:  # no tables
    #         return

    #     sTbName = dbManager.getFixedSuperTableName()

    #     # get all regular tables
    #     # TODO: analyze result set later
    #     dbc.query("select TBNAME from db.{}".format(sTbName))
    #     rTables = dbc.getQueryResult()

    #     bList = TaskExecutor.BoundedList()
    #     for rTbName in rTables:  # regular tables
    #         dbc.query("select speed from db.{}".format(rTbName[0]))
    #         numbers = dbc.getQueryResult()
    #         for row in numbers:
    #             # print("<{}>".format(n), end="", flush=True)
    #             bList.add(row[0])

    #     print("Top numbers in DB right now: {}".format(bList))
    #     print("TDengine client execution is about to start in 2 seconds...")
    #     time.sleep(2.0)
    #     dbManager = None  # release?
2390

2391
    def run(self, svcMgr):    
2392
        # self._printLastNumbers()
2393
        global gConfig
2394

2395 2396 2397 2398
        # Prepare Tde Instance
        global gContainer
        tInst = gContainer.defTdeInstance = TdeInstance() # "subdir to hold the instance"

S
Shuduo Sang 已提交
2399
        dbManager = DbManager()  # Regular function
2400
        thPool = ThreadPool(gConfig.num_threads, gConfig.max_steps)
2401
        self.tc = ThreadCoordinator(thPool, dbManager)
2402
        
2403
        print("Starting client instance to: {}".format(tInst))
2404
        self.tc.run()
S
Steven Li 已提交
2405 2406
        # print("exec stats: {}".format(self.tc.getExecStats()))
        # print("TC failed = {}".format(self.tc.isFailed()))
2407
        if svcMgr: # gConfig.auto_start_service:
2408
            svcMgr.stopTaosService()
2409
            svcMgr = None
2410 2411
        # Print exec status, etc., AFTER showing messages from the server
        self.conclude()
S
Steven Li 已提交
2412
        # print("TC failed (2) = {}".format(self.tc.isFailed()))
S
Shuduo Sang 已提交
2413
        # Linux return code: ref https://shapeshed.com/unix-exit-codes/
2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432
        ret = 1 if self.tc.isFailed() else 0
        self.tc.cleanup()

        # Release global variables
        gConfig = None
        gSvcMgr = None
        logger = None

        # Release variables here
        self.tc = None
        thPool = None
        dbManager = None

        gc.collect() # force garbage collection
        # h = hpy()
        # print("\n----- Final Python Heap -----\n")        
        # print(h.heap())

        return ret
2433 2434

    def conclude(self):
2435
        # self.tc.getDbManager().cleanUp() # clean up first, so we can show ZERO db connections
2436
        self.tc.printStats()
2437

2438
class MainExec:
2439 2440 2441
    def __init__(self):        
        self._clientMgr = None
        self._svcMgr = None
2442

2443 2444 2445
        signal.signal(signal.SIGTERM, self.sigIntHandler)
        signal.signal(signal.SIGINT,  self.sigIntHandler)
        signal.signal(signal.SIGUSR1, self.sigUsrHandler)  # different handler!
2446

2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459
    def sigUsrHandler(self, signalNumber, frame):
        if self._clientMgr:
            self._clientMgr.sigUsrHandler(signalNumber, frame)
        elif self._svcMgr: # Only if no client mgr, we are running alone
            self._svcMgr.sigUsrHandler(signalNumber, frame)
        
    def sigIntHandler(self, signalNumber, frame):
        if self._svcMgr:
            self._svcMgr.sigIntHandler(signalNumber, frame)
        if self._clientMgr:
            self._clientMgr.sigIntHandler(signalNumber, frame)

    def runClient(self):
2460
        global gSvcMgr
2461
        if gConfig.auto_start_service:
2462
            self._svcMgr = ServiceManager()
2463
            gSvcMgr = self._svcMgr # hack alert
2464 2465 2466
            self._svcMgr.startTaosService() # we start, don't run
        
        self._clientMgr = ClientManager()
2467 2468 2469 2470
        ret = None
        try: 
            ret = self._clientMgr.run(self._svcMgr) # stop TAOS service inside
        except requests.exceptions.ConnectionError as err:
2471
            Logging.warning("Failed to open REST connection to DB: {}".format(err.getMessage()))
2472 2473
            # don't raise
        return ret
2474 2475

    def runService(self):
2476
        global gSvcMgr
2477
        self._svcMgr = ServiceManager()
2478 2479
        gSvcMgr = self._svcMgr # save it in a global variable TODO: hack alert

2480
        self._svcMgr.run() # run to some end state
2481 2482
        self._svcMgr = None 
        gSvcMgr = None        
2483

2484 2485 2486 2487
    def init(self): # TODO: refactor
        global gContainer
        gContainer = Container() # micky-mouse DI

2488 2489 2490
        global gSvcMgr # TODO: refactor away
        gSvcMgr = None

2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543
        # Super cool Python argument library:
        # https://docs.python.org/3/library/argparse.html
        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawDescriptionHelpFormatter,
            description=textwrap.dedent('''\
                TDengine Auto Crash Generator (PLEASE NOTICE the Prerequisites Below)
                ---------------------------------------------------------------------
                1. You build TDengine in the top level ./build directory, as described in offical docs
                2. You run the server there before this script: ./build/bin/taosd -c test/cfg

                '''))                      

        parser.add_argument(
            '-a',
            '--auto-start-service',
            action='store_true',
            help='Automatically start/stop the TDengine service (default: false)')
        parser.add_argument(
            '-b',
            '--max-dbs',
            action='store',
            default=0,
            type=int,
            help='Maximum number of DBs to keep, set to disable dropping DB. (default: 0)')
        parser.add_argument(
            '-c',
            '--connector-type',
            action='store',
            default='native',
            type=str,
            help='Connector type to use: native, rest, or mixed (default: 10)')
        parser.add_argument(
            '-d',
            '--debug',
            action='store_true',
            help='Turn on DEBUG mode for more logging (default: false)')
        parser.add_argument(
            '-e',
            '--run-tdengine',
            action='store_true',
            help='Run TDengine service in foreground (default: false)')
        parser.add_argument(
            '-i',
            '--max-replicas',
            action='store',
            default=1,
            type=int,
            help='Maximum number of replicas to use, when testing against clusters. (default: 1)')
        parser.add_argument(
            '-l',
            '--larger-data',
            action='store_true',
            help='Write larger amount of data during write operations (default: false)')
2544 2545 2546 2547 2548 2549
        parser.add_argument(
            '-n',
            '--dynamic-db-table-names',
            action='store_true',
            help='Use non-fixed names for dbs/tables, useful for multi-instance executions (default: false)')
        
2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586
        parser.add_argument(
            '-p',
            '--per-thread-db-connection',
            action='store_true',
            help='Use a single shared db connection (default: false)')
        parser.add_argument(
            '-r',
            '--record-ops',
            action='store_true',
            help='Use a pair of always-fsynced fils to record operations performing + performed, for power-off tests (default: false)')
        parser.add_argument(
            '-s',
            '--max-steps',
            action='store',
            default=1000,
            type=int,
            help='Maximum number of steps to run (default: 100)')
        parser.add_argument(
            '-t',
            '--num-threads',
            action='store',
            default=5,
            type=int,
            help='Number of threads to run (default: 10)')
        parser.add_argument(
            '-v',
            '--verify-data',
            action='store_true',
            help='Verify data written in a number of places by reading back (default: false)')
        parser.add_argument(
            '-x',
            '--continue-on-exception',
            action='store_true',
            help='Continue execution after encountering unexpected/disallowed errors/exceptions (default: false)')

        global gConfig
        gConfig = parser.parse_args()
S
Shuduo Sang 已提交
2587

2588
        Logging.clsInit(gConfig)
2589 2590 2591 2592 2593 2594 2595 2596

        Dice.seed(0)  # initial seeding of dice

    def run(self):
        if gConfig.run_tdengine:  # run server
            self.runService()
        else:
            return self.runClient()
S
Steven Li 已提交
2597

2598

2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619
class Container():
    _propertyList = {'defTdeInstance'}

    def __init__(self):
        self._cargo = {} # No cargo at the beginning

    def _verifyValidProperty(self, name):
        if not name in self._propertyList:
            raise CrashGenError("Invalid container property: {}".format(name))

    # Called for an attribute, when other mechanisms fail (compare to  __getattribute__)
    def __getattr__(self, name):
        self._verifyValidProperty(name)
        return self._cargo[name] # just a simple lookup

    def __setattr__(self, name, value):
        if name == '_cargo' : # reserved vars
            super().__setattr__(name, value)
            return
        self._verifyValidProperty(name)
        self._cargo[name] = value
S
Shuduo Sang 已提交
2620