crash_gen.py 121.5 KB
Newer Older
S
Shuduo Sang 已提交
1
# -----!/usr/bin/python3.7
S
Steven Li 已提交
2 3 4 5 6 7 8 9 10 11 12 13
###################################################################
#           Copyright (c) 2016 by TAOS Technologies, Inc.
#                     All rights reserved.
#
#  This file is proprietary and confidential to TAOS Technologies.
#  No part of this file may be reproduced, stored, transmitted,
#  disclosed or used in any form or by any means other than as
#  expressly provided by the written permission from Jianhui Tao
#
###################################################################

# -*- coding: utf-8 -*-
S
Shuduo Sang 已提交
14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
# For type hinting before definition, ref:
# https://stackoverflow.com/questions/33533148/how-do-i-specify-that-the-return-type-of-a-method-is-the-same-as-the-class-itsel
from __future__ import annotations
import taos
from util.sql import *
from util.cases import *
from util.dnodes import *
from util.log import *
from queue import Queue, Empty
from typing import IO
from typing import Set
from typing import Dict
from typing import List
from requests.auth import HTTPBasicAuth
import textwrap
import datetime
import logging
import time
import random
import threading
import requests
import copy
import argparse
import getopt
38

S
Steven Li 已提交
39
import sys
40
import os
41 42
import io
import signal
43
import traceback
44 45 46
import resource
from guppy import hpy
import gc
47 48 49 50 51 52 53

try:
    import psutil
except:
    print("Psutil module needed, please install: sudo pip3 install psutil")
    sys.exit(-1)

54 55 56 57
# Require Python 3
if sys.version_info[0] < 3:
    raise Exception("Must be using Python 3")

S
Shuduo Sang 已提交
58
# Global variables, tried to keep a small number.
59 60 61

# Command-line/Environment Configurations, will set a bit later
# ConfigNameSpace = argparse.Namespace
S
Shuduo Sang 已提交
62
gConfig = argparse.Namespace()  # Dummy value, will be replaced later
63
gSvcMgr = None # TODO: refactor this hack, use dep injection
64
logger = None # type: Logger
S
Steven Li 已提交
65

S
Shuduo Sang 已提交
66
def runThread(wt: WorkerThread):
67
    wt.run()
68

69 70
class CrashGenError(Exception):
    def __init__(self, msg=None, errno=None):
S
Shuduo Sang 已提交
71
        self.msg = msg
72
        self.errno = errno
S
Shuduo Sang 已提交
73

74 75 76
    def __str__(self):
        return self.msg

S
Shuduo Sang 已提交
77

S
Steven Li 已提交
78
class WorkerThread:
79
    def __init__(self, pool: ThreadPool, tid, tc: ThreadCoordinator,
S
Shuduo Sang 已提交
80 81 82
                 # te: TaskExecutor,
                 ):  # note: main thread context!
        # self._curStep = -1
83
        self._pool = pool
S
Shuduo Sang 已提交
84 85
        self._tid = tid
        self._tc = tc  # type: ThreadCoordinator
S
Steven Li 已提交
86
        # self.threadIdent = threading.get_ident()
87 88
        self._thread = threading.Thread(target=runThread, args=(self,))
        self._stepGate = threading.Event()
S
Steven Li 已提交
89

90
        # Let us have a DB connection of our own
S
Shuduo Sang 已提交
91
        if (gConfig.per_thread_db_connection):  # type: ignore
92
            # print("connector_type = {}".format(gConfig.connector_type))
93 94 95 96 97 98 99 100 101 102 103
            if gConfig.connector_type == 'native':
                self._dbConn = DbConn.createNative() 
            elif gConfig.connector_type == 'rest':
                self._dbConn = DbConn.createRest() 
            elif gConfig.connector_type == 'mixed':
                if Dice.throw(2) == 0: # 1/2 chance
                    self._dbConn = DbConn.createNative() 
                else:
                    self._dbConn = DbConn.createRest() 
            else:
                raise RuntimeError("Unexpected connector type: {}".format(gConfig.connector_type))
104

105
        # self._dbInUse = False  # if "use db" was executed already
106

107
    def logDebug(self, msg):
S
Steven Li 已提交
108
        logger.debug("    TRD[{}] {}".format(self._tid, msg))
109 110

    def logInfo(self, msg):
S
Steven Li 已提交
111
        logger.info("    TRD[{}] {}".format(self._tid, msg))
112

113 114
    # def dbInUse(self):
    #     return self._dbInUse
115

116 117 118 119
    # def useDb(self):
    #     if (not self._dbInUse):
    #         self.execSql("use db")
    #     self._dbInUse = True
120

121
    def getTaskExecutor(self):
S
Shuduo Sang 已提交
122
        return self._tc.getTaskExecutor()
123

S
Steven Li 已提交
124
    def start(self):
125
        self._thread.start()  # AFTER the thread is recorded
S
Steven Li 已提交
126

S
Shuduo Sang 已提交
127
    def run(self):
S
Steven Li 已提交
128
        # initialization after thread starts, in the thread context
129
        # self.isSleeping = False
130 131
        logger.info("Starting to run thread: {}".format(self._tid))

S
Shuduo Sang 已提交
132
        if (gConfig.per_thread_db_connection):  # type: ignore
133
            logger.debug("Worker thread openning database connection")
134
            self._dbConn.open()
S
Steven Li 已提交
135

S
Shuduo Sang 已提交
136 137
        self._doTaskLoop()

138
        # clean up
S
Shuduo Sang 已提交
139
        if (gConfig.per_thread_db_connection):  # type: ignore
140 141 142 143
            if self._dbConn.isOpen: #sometimes it is not open
                self._dbConn.close()
            else:
                logger.warning("Cleaning up worker thread, dbConn already closed")
144

S
Shuduo Sang 已提交
145
    def _doTaskLoop(self):
146 147
        # while self._curStep < self._pool.maxSteps:
        # tc = ThreadCoordinator(None)
S
Shuduo Sang 已提交
148 149
        while True:
            tc = self._tc  # Thread Coordinator, the overall master
150 151 152
            try:
                tc.crossStepBarrier()  # shared barrier first, INCLUDING the last one
            except threading.BrokenBarrierError as err: # main thread timed out
153
                print("_bto", end="")
154 155 156
                logger.debug("[TRD] Worker thread exiting due to main thread barrier time-out")
                break

157
            logger.debug("[TRD] Worker thread [{}] exited barrier...".format(self._tid))
158
            self.crossStepGate()   # then per-thread gate, after being tapped
159
            logger.debug("[TRD] Worker thread [{}] exited step gate...".format(self._tid))
160
            if not self._tc.isRunning():
161
                print("_wts", end="")
162
                logger.debug("[TRD] Thread Coordinator not running any more, worker thread now stopping...")
163 164
                break

165
            # Before we fetch the task and run it, let's ensure we properly "use" the database (not needed any more)
166
            try:
167 168 169
                if (gConfig.per_thread_db_connection):  # most likely TRUE
                    if not self._dbConn.isOpen:  # might have been closed during server auto-restart
                        self._dbConn.open()
170
                # self.useDb() # might encounter exceptions. TODO: catch
171 172
            except taos.error.ProgrammingError as err:
                errno = Helper.convertErrno(err.errno)
173
                if errno in [0x383, 0x386, 0x00B, 0x014]  : # invalid database, dropping, Unable to establish connection, Database not ready
174 175 176 177 178 179
                    # ignore
                    dummy = 0
                else:
                    print("\nCaught programming error. errno=0x{:X}, msg={} ".format(errno, err.msg))
                    raise

180
            # Fetch a task from the Thread Coordinator
181
            logger.debug( "[TRD] Worker thread [{}] about to fetch task".format(self._tid))
182
            task = tc.fetchTask()
183 184

            # Execute such a task
185
            logger.debug("[TRD] Worker thread [{}] about to execute task: {}".format(
S
Shuduo Sang 已提交
186
                    self._tid, task.__class__.__name__))
187
            task.execute(self)
188
            tc.saveExecutedTask(task)
189
            logger.debug("[TRD] Worker thread [{}] finished executing task".format(self._tid))
S
Shuduo Sang 已提交
190

191
            # self._dbInUse = False  # there may be changes between steps
192
        # print("_wtd", end=None) # worker thread died
193

S
Shuduo Sang 已提交
194 195
    def verifyThreadSelf(self):  # ensure we are called by this own thread
        if (threading.get_ident() != self._thread.ident):
S
Steven Li 已提交
196 197
            raise RuntimeError("Unexpectly called from other threads")

S
Shuduo Sang 已提交
198 199
    def verifyThreadMain(self):  # ensure we are called by the main thread
        if (threading.get_ident() != threading.main_thread().ident):
S
Steven Li 已提交
200 201 202
            raise RuntimeError("Unexpectly called from other threads")

    def verifyThreadAlive(self):
S
Shuduo Sang 已提交
203
        if (not self._thread.is_alive()):
S
Steven Li 已提交
204 205
            raise RuntimeError("Unexpected dead thread")

206
    # A gate is different from a barrier in that a thread needs to be "tapped"
S
Steven Li 已提交
207 208
    def crossStepGate(self):
        self.verifyThreadAlive()
S
Shuduo Sang 已提交
209 210
        self.verifyThreadSelf()  # only allowed by ourselves

211
        # Wait again at the "gate", waiting to be "tapped"
S
Shuduo Sang 已提交
212 213 214 215
        logger.debug(
            "[TRD] Worker thread {} about to cross the step gate".format(
                self._tid))
        self._stepGate.wait()
216
        self._stepGate.clear()
S
Shuduo Sang 已提交
217

218
        # self._curStep += 1  # off to a new step...
S
Steven Li 已提交
219

S
Shuduo Sang 已提交
220
    def tapStepGate(self):  # give it a tap, release the thread waiting there
221
        # self.verifyThreadAlive()
S
Shuduo Sang 已提交
222 223
        self.verifyThreadMain()  # only allowed for main thread

224 225 226 227 228 229
        if self._thread.is_alive():
            logger.debug("[TRD] Tapping worker thread {}".format(self._tid))
            self._stepGate.set()  # wake up!
            time.sleep(0)  # let the released thread run a bit
        else:
            print("_tad", end="") # Thread already dead
230

S
Shuduo Sang 已提交
231
    def execSql(self, sql):  # TODO: expose DbConn directly
232
        return self.getDbConn().execute(sql)
233

S
Shuduo Sang 已提交
234
    def querySql(self, sql):  # TODO: expose DbConn directly
235
        return self.getDbConn().query(sql)
236 237

    def getQueryResult(self):
238
        return self.getDbConn().getQueryResult()
239

240
    def getDbConn(self) -> DbConn :
S
Shuduo Sang 已提交
241 242
        if (gConfig.per_thread_db_connection):
            return self._dbConn
243
        else:
244
            return self._tc.getDbManager().getDbConn()
245

246 247
    # def querySql(self, sql): # not "execute", since we are out side the DB context
    #     if ( gConfig.per_thread_db_connection ):
S
Shuduo Sang 已提交
248
    #         return self._dbConn.query(sql)
249 250
    #     else:
    #         return self._tc.getDbState().getDbConn().query(sql)
251

252
# The coordinator of all worker threads, mostly running in main thread
S
Shuduo Sang 已提交
253 254


255
class ThreadCoordinator:
S
Steven Li 已提交
256
    WORKER_THREAD_TIMEOUT = 60 # one minute
257

258
    def __init__(self, pool: ThreadPool, dbManager: DbManager):
S
Shuduo Sang 已提交
259
        self._curStep = -1  # first step is 0
260
        self._pool = pool
261
        # self._wd = wd
S
Shuduo Sang 已提交
262
        self._te = None  # prepare for every new step
263
        self._dbManager = dbManager
S
Shuduo Sang 已提交
264 265
        self._executedTasks: List[Task] = []  # in a given step
        self._lock = threading.RLock()  # sync access for a few things
S
Steven Li 已提交
266

S
Shuduo Sang 已提交
267 268
        self._stepBarrier = threading.Barrier(
            self._pool.numThreads + 1)  # one barrier for all threads
269
        self._execStats = ExecutionStats()
270
        self._runStatus = MainExec.STATUS_RUNNING
271
        self._initDbs()
S
Steven Li 已提交
272

273 274 275
    def getTaskExecutor(self):
        return self._te

S
Shuduo Sang 已提交
276
    def getDbManager(self) -> DbManager:
277
        return self._dbManager
278

279 280
    def crossStepBarrier(self, timeout=None):
        self._stepBarrier.wait(timeout) 
281

282 283 284 285
    def requestToStop(self):
        self._runStatus = MainExec.STATUS_STOPPING
        self._execStats.registerFailure("User Interruption")

286
    def _runShouldEnd(self, transitionFailed, hasAbortedTask, workerTimeout):
287 288 289 290 291 292 293 294 295
        maxSteps = gConfig.max_steps  # type: ignore
        if self._curStep >= (maxSteps - 1): # maxStep==10, last curStep should be 9
            return True
        if self._runStatus != MainExec.STATUS_RUNNING:
            return True
        if transitionFailed:
            return True
        if hasAbortedTask:
            return True
296 297
        if workerTimeout:
            return True
298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321
        return False

    def _hasAbortedTask(self): # from execution of previous step
        for task in self._executedTasks:
            if task.isAborted():
                # print("Task aborted: {}".format(task))
                # hasAbortedTask = True
                return True
        return False

    def _releaseAllWorkerThreads(self, transitionFailed):
        self._curStep += 1  # we are about to get into next step. TODO: race condition here!
        # Now not all threads had time to go to sleep
        logger.debug(
            "--\r\n\n--> Step {} starts with main thread waking up".format(self._curStep))

        # A new TE for the new step
        self._te = None # set to empty first, to signal worker thread to stop
        if not transitionFailed:  # only if not failed
            self._te = TaskExecutor(self._curStep)

        logger.debug("[TRD] Main thread waking up at step {}, tapping worker threads".format(
                self._curStep))  # Now not all threads had time to go to sleep
        # Worker threads will wake up at this point, and each execute it's own task
322
        self.tapAllThreads() # release all worker thread from their "gates"
323 324 325 326 327 328

    def _syncAtBarrier(self):
         # Now main thread (that's us) is ready to enter a step
        # let other threads go past the pool barrier, but wait at the
        # thread gate
        logger.debug("[TRD] Main thread about to cross the barrier")
329
        self.crossStepBarrier(timeout=self.WORKER_THREAD_TIMEOUT)
330 331 332 333 334 335
        self._stepBarrier.reset()  # Other worker threads should now be at the "gate"
        logger.debug("[TRD] Main thread finished crossing the barrier")

    def _doTransition(self):
        transitionFailed = False
        try:
336 337 338 339 340 341 342 343 344 345
            for x in self._dbs:
                db = x # type: Database
                sm = db.getStateMachine()
                logger.debug("[STT] starting transitions for DB: {}".format(db.getName()))
                # at end of step, transiton the DB state
                tasksForDb = db.filterTasks(self._executedTasks)
                sm.transition(tasksForDb, self.getDbManager().getDbConn())
                logger.debug("[STT] transition ended for DB: {}".format(db.getName()))

            # Due to limitation (or maybe not) of the TD Python library,
346
            # we cannot share connections across threads
347 348 349 350 351 352
            # Here we are in main thread, we cannot operate the connections created in workers
            # Moving below to task loop
            # if sm.hasDatabase():
            #     for t in self._pool.threadList:
            #         logger.debug("[DB] use db for all worker threads")
            #         t.useDb()
353 354
                    # t.execSql("use db") # main thread executing "use
                    # db" on behalf of every worker thread
355

356 357 358 359 360 361 362 363 364 365 366
        except taos.error.ProgrammingError as err:
            if (err.msg == 'network unavailable'):  # broken DB connection
                logger.info("DB connection broken, execution failed")
                traceback.print_stack()
                transitionFailed = True
                self._te = None  # Not running any more
                self._execStats.registerFailure("Broken DB Connection")
                # continue # don't do that, need to tap all threads at
                # end, and maybe signal them to stop
            else:
                raise
S
Steven Li 已提交
367
        # return transitionFailed # Why did we have this??!!
368 369 370 371 372 373

        self.resetExecutedTasks()  # clear the tasks after we are done
        # Get ready for next step
        logger.debug("<-- Step {} finished, trasition failed = {}".format(self._curStep, transitionFailed))
        return transitionFailed

S
Shuduo Sang 已提交
374
    def run(self):
375
        self._pool.createAndStartThreads(self)
S
Steven Li 已提交
376 377

        # Coordinate all threads step by step
S
Shuduo Sang 已提交
378
        self._curStep = -1  # not started yet
379
        
S
Shuduo Sang 已提交
380
        self._execStats.startExec()  # start the stop watch
381 382
        transitionFailed = False
        hasAbortedTask = False
383 384
        workerTimeout = False
        while not self._runShouldEnd(transitionFailed, hasAbortedTask, workerTimeout):
385
            if not gConfig.debug: # print this only if we are not in debug mode                
S
Shuduo Sang 已提交
386
                print(".", end="", flush=True)
387 388 389 390 391 392 393 394
            # if (self._curStep % 2) == 0: # print memory usage once every 10 steps
            #     memUsage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
            #     print("[m:{}]".format(memUsage), end="", flush=True) # print memory usage
            # if (self._curStep % 10) == 3: 
            #     h = hpy()
            #     print("\n")        
            #     print(h.heap())
            
395
                        
396 397 398 399 400 401 402 403 404 405
            try:
                self._syncAtBarrier() # For now just cross the barrier
            except threading.BrokenBarrierError as err:
                logger.info("Main loop aborted, caused by worker thread time-out")
                self._execStats.registerFailure("Aborted due to worker thread timeout")
                print("\n\nWorker Thread time-out detected, important thread info:")
                ts = ThreadStacks()
                ts.print(filterInternal=True)
                workerTimeout = True
                break
406 407

            # At this point, all threads should be pass the overall "barrier" and before the per-thread "gate"
S
Shuduo Sang 已提交
408 409
            # We use this period to do house keeping work, when all worker
            # threads are QUIET.
410 411 412
            hasAbortedTask = self._hasAbortedTask() # from previous step
            if hasAbortedTask: 
                logger.info("Aborted task encountered, exiting test program")
413
                self._execStats.registerFailure("Aborted Task Encountered")
414
                break # do transition only if tasks are error free
S
Shuduo Sang 已提交
415

416
            # Ending previous step
417 418 419 420
            try:
                transitionFailed = self._doTransition() # To start, we end step -1 first
            except taos.error.ProgrammingError as err:
                transitionFailed = True
421
                errno2 = Helper.convertErrno(err.errno)  # correct error scheme
S
Steven Li 已提交
422 423
                errMsg = "Transition failed: errno=0x{:X}, msg: {}".format(errno2, err)
                logger.info(errMsg)
424
                traceback.print_exc()
S
Steven Li 已提交
425
                self._execStats.registerFailure(errMsg)
426

427 428
            # Then we move on to the next step
            self._releaseAllWorkerThreads(transitionFailed)                    
429

430 431
        if hasAbortedTask or transitionFailed : # abnormal ending, workers waiting at "gate"
            logger.debug("Abnormal ending of main thraed")
432 433
        elif workerTimeout:
            logger.debug("Abnormal ending of main thread, due to worker timeout")
434 435 436
        else: # regular ending, workers waiting at "barrier"
            logger.debug("Regular ending, main thread waiting for all worker threads to stop...")
            self._syncAtBarrier()
437

438 439 440
        self._te = None  # No more executor, time to end
        logger.debug("Main thread tapping all threads one last time...")
        self.tapAllThreads()  # Let the threads run one last time
441

442
        logger.debug("\r\n\n--> Main thread ready to finish up...")
443
        logger.debug("Main thread joining all threads")
S
Shuduo Sang 已提交
444
        self._pool.joinAll()  # Get all threads to finish
445
        logger.info("\nAll worker threads finished")
446 447
        self._execStats.endExec()

448 449 450 451 452 453 454 455 456 457 458 459 460
    def cleanup(self): # free resources
        self._pool.cleanup()

        self._pool = None
        self._te = None  
        self._dbManager = None
        self._executedTasks = None
        self._lock = None
        self._stepBarrier = None
        self._execStats = None
        self._runStatus = None


461 462
    def printStats(self):
        self._execStats.printStats()
S
Steven Li 已提交
463

S
Steven Li 已提交
464 465 466 467 468 469
    def isFailed(self):
        return self._execStats.isFailed()

    def getExecStats(self):
        return self._execStats

S
Shuduo Sang 已提交
470
    def tapAllThreads(self):  # in a deterministic manner
S
Steven Li 已提交
471
        wakeSeq = []
S
Shuduo Sang 已提交
472 473
        for i in range(self._pool.numThreads):  # generate a random sequence
            if Dice.throw(2) == 1:
S
Steven Li 已提交
474 475 476
                wakeSeq.append(i)
            else:
                wakeSeq.insert(0, i)
S
Shuduo Sang 已提交
477 478 479
        logger.debug(
            "[TRD] Main thread waking up worker threads: {}".format(
                str(wakeSeq)))
480
        # TODO: set dice seed to a deterministic value
S
Steven Li 已提交
481
        for i in wakeSeq:
S
Shuduo Sang 已提交
482 483 484
            # TODO: maybe a bit too deep?!
            self._pool.threadList[i].tapStepGate()
            time.sleep(0)  # yield
S
Steven Li 已提交
485

486
    def isRunning(self):
S
Shuduo Sang 已提交
487
        return self._te is not None
488

489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505
    def _initDbs(self):
        ''' Initialize multiple databases, invoked at __ini__() time '''
        self._dbs = [] # type: List[Database]
        dbc = self.getDbManager().getDbConn()
        if gConfig.max_dbs == 0:
            self._dbs.append(Database(0, dbc))
        else:
            for i in range(gConfig.max_dbs):
                self._dbs.append(Database(i, dbc))

    def pickDatabase(self):
        idxDb = 0
        if gConfig.max_dbs != 0 :
            idxDb = Dice.throw(gConfig.max_dbs) # 0 to N-1
        db = self._dbs[idxDb] # type: Database
        return db

S
Shuduo Sang 已提交
506
    def fetchTask(self) -> Task:
507 508 509
        ''' The thread coordinator (that's us) is responsible for fetching a task
            to be executed next.
        '''
S
Shuduo Sang 已提交
510
        if (not self.isRunning()):  # no task
511
            raise RuntimeError("Cannot fetch task when not running")
512

S
Shuduo Sang 已提交
513
        # pick a task type for current state
514 515 516
        db = self.pickDatabase()
        taskType = db.getStateMachine().pickTaskType() # type: Task
        return taskType(self._execStats, db)  # create a task from it
517 518

    def resetExecutedTasks(self):
S
Shuduo Sang 已提交
519
        self._executedTasks = []  # should be under single thread
520 521 522 523

    def saveExecutedTask(self, task):
        with self._lock:
            self._executedTasks.append(task)
524 525

# We define a class to run a number of threads in locking steps.
S
Shuduo Sang 已提交
526

527 528 529 530
class Helper:
    @classmethod
    def convertErrno(cls, errno):
        return errno if (errno > 0) else 0x80000000 + errno
S
Shuduo Sang 已提交
531

532
class ThreadPool:
533
    def __init__(self, numThreads, maxSteps):
534 535 536 537
        self.numThreads = numThreads
        self.maxSteps = maxSteps
        # Internal class variables
        self.curStep = 0
S
Shuduo Sang 已提交
538 539
        self.threadList = []  # type: List[WorkerThread]

540
    # starting to run all the threads, in locking steps
541
    def createAndStartThreads(self, tc: ThreadCoordinator):
S
Shuduo Sang 已提交
542 543
        for tid in range(0, self.numThreads):  # Create the threads
            workerThread = WorkerThread(self, tid, tc)
544
            self.threadList.append(workerThread)
S
Shuduo Sang 已提交
545
            workerThread.start()  # start, but should block immediately before step 0
546 547 548 549 550 551

    def joinAll(self):
        for workerThread in self.threadList:
            logger.debug("Joining thread...")
            workerThread._thread.join()

552 553 554
    def cleanup(self):
        self.threadList = None # maybe clean up each?

555 556
# A queue of continguous POSITIVE integers, used by DbManager to generate continuous numbers
# for new table names
S
Shuduo Sang 已提交
557 558


S
Steven Li 已提交
559 560
class LinearQueue():
    def __init__(self):
561
        self.firstIndex = 1  # 1st ever element
S
Steven Li 已提交
562
        self.lastIndex = 0
S
Shuduo Sang 已提交
563 564
        self._lock = threading.RLock()  # our functions may call each other
        self.inUse = set()  # the indexes that are in use right now
S
Steven Li 已提交
565

566
    def toText(self):
S
Shuduo Sang 已提交
567 568
        return "[{}..{}], in use: {}".format(
            self.firstIndex, self.lastIndex, self.inUse)
569 570

    # Push (add new element, largest) to the tail, and mark it in use
S
Shuduo Sang 已提交
571
    def push(self):
572
        with self._lock:
S
Shuduo Sang 已提交
573 574
            # if ( self.isEmpty() ):
            #     self.lastIndex = self.firstIndex
575
            #     return self.firstIndex
576 577
            # Otherwise we have something
            self.lastIndex += 1
578 579
            self.allocate(self.lastIndex)
            # self.inUse.add(self.lastIndex) # mark it in use immediately
580
            return self.lastIndex
S
Steven Li 已提交
581 582

    def pop(self):
583
        with self._lock:
S
Shuduo Sang 已提交
584 585 586 587
            if (self.isEmpty()):
                # raise RuntimeError("Cannot pop an empty queue")
                return False  # TODO: None?

588
            index = self.firstIndex
S
Shuduo Sang 已提交
589
            if (index in self.inUse):
590 591
                return False

592 593 594 595 596 597 598
            self.firstIndex += 1
            return index

    def isEmpty(self):
        return self.firstIndex > self.lastIndex

    def popIfNotEmpty(self):
599
        with self._lock:
600 601 602 603
            if (self.isEmpty()):
                return 0
            return self.pop()

S
Steven Li 已提交
604
    def allocate(self, i):
605
        with self._lock:
606
            # logger.debug("LQ allocating item {}".format(i))
S
Shuduo Sang 已提交
607 608 609
            if (i in self.inUse):
                raise RuntimeError(
                    "Cannot re-use same index in queue: {}".format(i))
610 611
            self.inUse.add(i)

S
Steven Li 已提交
612
    def release(self, i):
613
        with self._lock:
614
            # logger.debug("LQ releasing item {}".format(i))
S
Shuduo Sang 已提交
615
            self.inUse.remove(i)  # KeyError possible, TODO: why?
616 617 618 619

    def size(self):
        return self.lastIndex + 1 - self.firstIndex

S
Steven Li 已提交
620
    def pickAndAllocate(self):
S
Shuduo Sang 已提交
621
        if (self.isEmpty()):
622 623
            return None
        with self._lock:
S
Shuduo Sang 已提交
624
            cnt = 0  # counting the interations
625 626
            while True:
                cnt += 1
S
Shuduo Sang 已提交
627
                if (cnt > self.size() * 10):  # 10x iteration already
628 629
                    # raise RuntimeError("Failed to allocate LinearQueue element")
                    return None
S
Shuduo Sang 已提交
630 631
                ret = Dice.throwRange(self.firstIndex, self.lastIndex + 1)
                if (ret not in self.inUse):
632 633 634
                    self.allocate(ret)
                    return ret

S
Shuduo Sang 已提交
635

636
class DbConn:
637
    TYPE_NATIVE = "native-c"
638
    TYPE_REST =   "rest-api"
639 640 641 642 643 644 645 646 647
    TYPE_INVALID = "invalid"

    @classmethod
    def create(cls, connType):
        if connType == cls.TYPE_NATIVE:
            return DbConnNative()
        elif connType == cls.TYPE_REST:
            return DbConnRest()
        else:
S
Shuduo Sang 已提交
648 649
            raise RuntimeError(
                "Unexpected connection type: {}".format(connType))
650 651 652 653 654 655 656 657 658

    @classmethod
    def createNative(cls):
        return cls.create(cls.TYPE_NATIVE)

    @classmethod
    def createRest(cls):
        return cls.create(cls.TYPE_REST)

659 660
    def __init__(self):
        self.isOpen = False
661
        self._type = self.TYPE_INVALID
662 663 664 665
        self._lastSql = None

    def getLastSql(self):
        return self._lastSql
666 667

    def open(self):
S
Shuduo Sang 已提交
668
        if (self.isOpen):
669 670
            raise RuntimeError("Cannot re-open an existing DB connection")

671 672
        # below implemented by child classes
        self.openByType()
673

674
        logger.debug("[DB] data connection opened, type = {}".format(self._type))
675 676
        self.isOpen = True

S
Shuduo Sang 已提交
677
    def queryScalar(self, sql) -> int:
678 679
        return self._queryAny(sql)

S
Shuduo Sang 已提交
680
    def queryString(self, sql) -> str:
681 682
        return self._queryAny(sql)

S
Shuduo Sang 已提交
683 684
    def _queryAny(self, sql):  # actual query result as an int
        if (not self.isOpen):
685
            raise RuntimeError("Cannot query database until connection is open")
686
        nRows = self.query(sql)
S
Shuduo Sang 已提交
687
        if nRows != 1:
688 689 690 691
            raise taos.error.ProgrammingError(
                "Unexpected result for query: {}, rows = {}".format(sql, nRows), 
                (0x991 if nRows==0 else 0x992)
            )
692
        if self.getResultRows() != 1 or self.getResultCols() != 1:
693
            raise RuntimeError("Unexpected result set for query: {}".format(sql))
694 695
        return self.getQueryResult()[0][0]

696 697 698
    def use(self, dbName):
        self.execute("use {}".format(dbName))

699 700 701 702 703 704 705
    def existsDatabase(self, dbName: str):
        ''' Check if a certain database exists '''
        self.query("show databases")
        dbs = [v[0] for v in self.getQueryResult()] # ref: https://stackoverflow.com/questions/643823/python-list-transformation
        # ret2 = dbName in dbs
        # print("dbs = {}, str = {}, ret2={}, type2={}".format(dbs, dbName,ret2, type(dbName)))
        return dbName in dbs # TODO: super weird type mangling seen, once here
706 707 708 709

    def hasTables(self):
        return self.query("show tables") > 0

710
    def execute(self, sql):
711
        ''' Return the number of rows affected'''
712
        raise RuntimeError("Unexpected execution, should be overriden")
S
Shuduo Sang 已提交
713

714 715 716 717 718 719 720 721 722
    def safeExecute(self, sql):
        '''Safely execute any SQL query, returning True/False upon success/failure'''
        try:
            self.execute(sql)
            return True # ignore num of results, return success
        except taos.error.ProgrammingError as err:
            return False # failed, for whatever TAOS reason
        # Not possile to reach here, non-TAOS exception would have been thrown

723
    def query(self, sql) -> int: # return num rows returned
724
        ''' Return the number of rows affected'''
725 726
        raise RuntimeError("Unexpected execution, should be overriden")

727 728
    def openByType(self):
        raise RuntimeError("Unexpected execution, should be overriden")
S
Shuduo Sang 已提交
729

730 731
    def getQueryResult(self):
        raise RuntimeError("Unexpected execution, should be overriden")
S
Shuduo Sang 已提交
732

733 734
    def getResultRows(self):
        raise RuntimeError("Unexpected execution, should be overriden")
S
Shuduo Sang 已提交
735

736 737 738 739
    def getResultCols(self):
        raise RuntimeError("Unexpected execution, should be overriden")

# Sample: curl -u root:taosdata -d "show databases" localhost:6020/rest/sql
S
Shuduo Sang 已提交
740 741


742 743 744 745
class DbConnRest(DbConn):
    def __init__(self):
        super().__init__()
        self._type = self.TYPE_REST
S
Steven Li 已提交
746
        self._url = "http://localhost:6041/rest/sql"  # fixed for now
747 748
        self._result = None

S
Shuduo Sang 已提交
749 750 751
    def openByType(self):  # Open connection
        pass  # do nothing, always open

752
    def close(self):
S
Shuduo Sang 已提交
753
        if (not self.isOpen):
754
            raise RuntimeError("Cannot clean up database until connection is open")
755 756 757 758 759
        # Do nothing for REST
        logger.debug("[DB] REST Database connection closed")
        self.isOpen = False

    def _doSql(self, sql):
760
        self._lastSql = sql # remember this, last SQL attempted
761 762 763
        try:
            r = requests.post(self._url, 
                data = sql,
764
                auth = HTTPBasicAuth('root', 'taosdata'))         
765 766 767
        except:
            print("REST API Failure (TODO: more info here)")
            raise
768 769
        rj = r.json()
        # Sanity check for the "Json Result"
S
Shuduo Sang 已提交
770
        if ('status' not in rj):
771 772
            raise RuntimeError("No status in REST response")

S
Shuduo Sang 已提交
773 774 775 776
        if rj['status'] == 'error':  # clearly reported error
            if ('code' not in rj):  # error without code
                raise RuntimeError("REST error return without code")
            errno = rj['code']  # May need to massage this in the future
777
            # print("Raising programming error with REST return: {}".format(rj))
S
Shuduo Sang 已提交
778 779
            raise taos.error.ProgrammingError(
                rj['desc'], errno)  # todo: check existance of 'desc'
780

S
Shuduo Sang 已提交
781 782 783 784
        if rj['status'] != 'succ':  # better be this
            raise RuntimeError(
                "Unexpected REST return status: {}".format(
                    rj['status']))
785 786

        nRows = rj['rows'] if ('rows' in rj) else 0
S
Shuduo Sang 已提交
787
        self._result = rj
788 789
        return nRows

S
Shuduo Sang 已提交
790 791 792 793
    def execute(self, sql):
        if (not self.isOpen):
            raise RuntimeError(
                "Cannot execute database commands until connection is open")
794 795
        logger.debug("[SQL-REST] Executing SQL: {}".format(sql))
        nRows = self._doSql(sql)
S
Shuduo Sang 已提交
796 797
        logger.debug(
            "[SQL-REST] Execution Result, nRows = {}, SQL = {}".format(nRows, sql))
798 799
        return nRows

S
Shuduo Sang 已提交
800
    def query(self, sql):  # return rows affected
801 802 803 804 805 806 807 808 809 810 811 812 813
        return self.execute(sql)

    def getQueryResult(self):
        return self._result['data']

    def getResultRows(self):
        print(self._result)
        raise RuntimeError("TBD")
        # return self._tdSql.queryRows

    def getResultCols(self):
        print(self._result)
        raise RuntimeError("TBD")
S
Shuduo Sang 已提交
814

815
    # Duplicate code from TDMySQL, TODO: merge all this into DbConnNative
816 817


818
class MyTDSql:
819 820 821 822 823 824 825
    # Class variables
    _clsLock = threading.Lock() # class wide locking
    longestQuery = None # type: str
    longestQueryTime = 0.0 # seconds
    lqStartTime = 0.0
    # lqEndTime = 0.0 # Not needed, as we have the two above already

826 827 828 829 830
    def __init__(self, hostAddr, cfgPath):
        # Make the DB connection
        self._conn = taos.connect(host=hostAddr, config=cfgPath) 
        self._cursor = self._conn.cursor()

831 832 833 834
        self.queryRows = 0
        self.queryCols = 0
        self.affectedRows = 0

835 836
    # def init(self, cursor, log=True):
    #     self.cursor = cursor
837 838 839 840 841
        # if (log):
        #     caller = inspect.getframeinfo(inspect.stack()[1][0])
        #     self.cursor.log(caller.filename + ".sql")

    def close(self):
842
        self._cursor.close() # can we double close?
843 844
        self._conn.close() # TODO: very important, cursor close does NOT close DB connection!
        self._cursor.close()
845

846 847 848
    def _execInternal(self, sql):
        startTime = time.time() 
        ret = self._cursor.execute(sql)
849
        # print("\nSQL success: {}".format(sql))
850 851 852 853 854 855 856 857 858 859
        queryTime =  time.time() - startTime
        # Record the query time
        cls = self.__class__
        if queryTime > (cls.longestQueryTime + 0.01) :
            with cls._clsLock:
                cls.longestQuery = sql
                cls.longestQueryTime = queryTime
                cls.lqStartTime = startTime
        return ret

860 861 862
    def query(self, sql):
        self.sql = sql
        try:
863
            self._execInternal(sql)
864
            self.queryResult = self._cursor.fetchall()
865
            self.queryRows = len(self.queryResult)
866
            self.queryCols = len(self._cursor.description)
867 868 869 870 871 872
        except Exception as e:
            # caller = inspect.getframeinfo(inspect.stack()[1][0])
            # args = (caller.filename, caller.lineno, sql, repr(e))
            # tdLog.exit("%s(%d) failed: sql:%s, %s" % args)
            raise
        return self.queryRows
873

874 875 876
    def execute(self, sql):
        self.sql = sql
        try:
877
            self.affectedRows = self._execInternal(sql)
878 879 880 881 882 883 884
        except Exception as e:
            # caller = inspect.getframeinfo(inspect.stack()[1][0])
            # args = (caller.filename, caller.lineno, sql, repr(e))
            # tdLog.exit("%s(%d) failed: sql:%s, %s" % args)
            raise
        return self.affectedRows

S
Shuduo Sang 已提交
885

886
class DbConnNative(DbConn):
887 888 889
    # Class variables
    _lock = threading.Lock()
    _connInfoDisplayed = False
890
    totalConnections = 0 # Not private
891

892 893
    def __init__(self):
        super().__init__()
894
        self._type = self.TYPE_NATIVE
S
Shuduo Sang 已提交
895
        self._conn = None
896
        # self._cursor = None        
S
Shuduo Sang 已提交
897

898 899 900 901 902 903 904
    def getBuildPath(self):
        selfPath = os.path.dirname(os.path.realpath(__file__))
        if ("community" in selfPath):
            projPath = selfPath[:selfPath.find("communit")]
        else:
            projPath = selfPath[:selfPath.find("tests")]

905
        buildPath = None
906 907 908 909
        for root, dirs, files in os.walk(projPath):
            if ("taosd" in files):
                rootRealPath = os.path.dirname(os.path.realpath(root))
                if ("packaging" not in rootRealPath):
S
Shuduo Sang 已提交
910
                    buildPath = root[:len(root) - len("/build/bin")]
911
                    break
912
        if buildPath == None:
913 914
            raise RuntimeError("Failed to determine buildPath, selfPath={}, projPath={}"
                .format(selfPath, projPath))
915 916
        return buildPath

917
    
S
Shuduo Sang 已提交
918
    def openByType(self):  # Open connection
919
        cfgPath = self.getBuildPath() + "/test/cfg"
920
        hostAddr = "127.0.0.1"
921

922 923 924 925 926 927 928 929 930 931 932
        cls = self.__class__ # Get the class, to access class variables
        with cls._lock: # force single threading for opening DB connections. # TODO: whaaat??!!!
            if not cls._connInfoDisplayed:
                cls._connInfoDisplayed = True # updating CLASS variable
                logger.info("Initiating TAOS native connection to {}, using config at {}".format(hostAddr, cfgPath))                    
            # Make the connection         
            # self._conn = taos.connect(host=hostAddr, config=cfgPath)  # TODO: make configurable
            # self._cursor = self._conn.cursor()
            # Record the count in the class
            self._tdSql = MyTDSql(hostAddr, cfgPath) # making DB connection
            cls.totalConnections += 1 
933
        
934
        self._tdSql.execute('reset query cache')
S
Shuduo Sang 已提交
935
        # self._cursor.execute('use db') # do this at the beginning of every
936 937

        # Open connection
938 939 940
        # self._tdSql = MyTDSql()
        # self._tdSql.init(self._cursor)
        
941
    def close(self):
S
Shuduo Sang 已提交
942
        if (not self.isOpen):
943
            raise RuntimeError("Cannot clean up database until connection is open")
944
        self._tdSql.close()
945 946 947 948 949
        # Decrement the class wide counter
        cls = self.__class__ # Get the class, to access class variables
        with cls._lock:
            cls.totalConnections -= 1

950
        logger.debug("[DB] Database connection closed")
951
        self.isOpen = False
S
Steven Li 已提交
952

S
Shuduo Sang 已提交
953 954
    def execute(self, sql):
        if (not self.isOpen):
955
            raise RuntimeError("Cannot execute database commands until connection is open")
956
        logger.debug("[SQL] Executing SQL: {}".format(sql))
957
        self._lastSql = sql
958
        nRows = self._tdSql.execute(sql)
S
Shuduo Sang 已提交
959 960 961
        logger.debug(
            "[SQL] Execution Result, nRows = {}, SQL = {}".format(
                nRows, sql))
962
        return nRows
S
Steven Li 已提交
963

S
Shuduo Sang 已提交
964 965 966 967
    def query(self, sql):  # return rows affected
        if (not self.isOpen):
            raise RuntimeError(
                "Cannot query database until connection is open")
968
        logger.debug("[SQL] Executing SQL: {}".format(sql))
969
        self._lastSql = sql
970
        nRows = self._tdSql.query(sql)
S
Shuduo Sang 已提交
971 972 973
        logger.debug(
            "[SQL] Query Result, nRows = {}, SQL = {}".format(
                nRows, sql))
974
        return nRows
975
        # results are in: return self._tdSql.queryResult
976

977 978 979
    def getQueryResult(self):
        return self._tdSql.queryResult

980 981
    def getResultRows(self):
        return self._tdSql.queryRows
982

983 984
    def getResultCols(self):
        return self._tdSql.queryCols
985

S
Shuduo Sang 已提交
986

987
class AnyState:
S
Shuduo Sang 已提交
988 989 990
    STATE_INVALID = -1
    STATE_EMPTY = 0  # nothing there, no even a DB
    STATE_DB_ONLY = 1  # we have a DB, but nothing else
991
    STATE_TABLE_ONLY = 2  # we have a table, but totally empty
S
Shuduo Sang 已提交
992
    STATE_HAS_DATA = 3  # we have some data in the table
993 994 995 996
    _stateNames = ["Invalid", "Empty", "DB_Only", "Table_Only", "Has_Data"]

    STATE_VAL_IDX = 0
    CAN_CREATE_DB = 1
997 998 999
    # For below, if we can "drop the DB", but strictly speaking 
    # only "under normal circumstances", as we may override it with the -b option
    CAN_DROP_DB = 2  
1000 1001
    CAN_CREATE_FIXED_SUPER_TABLE = 3
    CAN_DROP_FIXED_SUPER_TABLE = 4
1002 1003 1004 1005 1006 1007 1008
    CAN_ADD_DATA = 5
    CAN_READ_DATA = 6

    def __init__(self):
        self._info = self.getInfo()

    def __str__(self):
S
Shuduo Sang 已提交
1009 1010
        # -1 hack to accomodate the STATE_INVALID case
        return self._stateNames[self._info[self.STATE_VAL_IDX] + 1]
1011

1012 1013
    # Each sub state tells us the "info", about itself, so we can determine
    # on things like canDropDB()
1014 1015 1016
    def getInfo(self):
        raise RuntimeError("Must be overriden by child classes")

S
Steven Li 已提交
1017 1018 1019 1020 1021 1022
    def equals(self, other):
        if isinstance(other, int):
            return self.getValIndex() == other
        elif isinstance(other, AnyState):
            return self.getValIndex() == other.getValIndex()
        else:
S
Shuduo Sang 已提交
1023 1024 1025
            raise RuntimeError(
                "Unexpected comparison, type = {}".format(
                    type(other)))
S
Steven Li 已提交
1026

1027 1028 1029
    def verifyTasksToState(self, tasks, newState):
        raise RuntimeError("Must be overriden by child classes")

S
Steven Li 已提交
1030 1031 1032
    def getValIndex(self):
        return self._info[self.STATE_VAL_IDX]

1033 1034
    def getValue(self):
        return self._info[self.STATE_VAL_IDX]
S
Shuduo Sang 已提交
1035

1036 1037
    def canCreateDb(self):
        return self._info[self.CAN_CREATE_DB]
S
Shuduo Sang 已提交
1038

1039
    def canDropDb(self):
1040 1041 1042 1043
        # If user requests to run up to a number of DBs,
        # we'd then not do drop_db operations any more
        if gConfig.max_dbs > 0 : 
            return False
1044
        return self._info[self.CAN_DROP_DB]
S
Shuduo Sang 已提交
1045

1046 1047
    def canCreateFixedSuperTable(self):
        return self._info[self.CAN_CREATE_FIXED_SUPER_TABLE]
S
Shuduo Sang 已提交
1048

1049 1050
    def canDropFixedSuperTable(self):
        return self._info[self.CAN_DROP_FIXED_SUPER_TABLE]
S
Shuduo Sang 已提交
1051

1052 1053
    def canAddData(self):
        return self._info[self.CAN_ADD_DATA]
S
Shuduo Sang 已提交
1054

1055 1056 1057 1058 1059
    def canReadData(self):
        return self._info[self.CAN_READ_DATA]

    def assertAtMostOneSuccess(self, tasks, cls):
        sCnt = 0
S
Shuduo Sang 已提交
1060
        for task in tasks:
1061 1062 1063
            if not isinstance(task, cls):
                continue
            if task.isSuccess():
S
Steven Li 已提交
1064
                # task.logDebug("Task success found")
1065
                sCnt += 1
S
Shuduo Sang 已提交
1066 1067 1068
                if (sCnt >= 2):
                    raise RuntimeError(
                        "Unexpected more than 1 success with task: {}".format(cls))
1069 1070 1071 1072

    def assertIfExistThenSuccess(self, tasks, cls):
        sCnt = 0
        exists = False
S
Shuduo Sang 已提交
1073
        for task in tasks:
1074 1075
            if not isinstance(task, cls):
                continue
S
Shuduo Sang 已提交
1076
            exists = True  # we have a valid instance
1077 1078
            if task.isSuccess():
                sCnt += 1
S
Shuduo Sang 已提交
1079
        if (exists and sCnt <= 0):
S
Steven Li 已提交
1080 1081
            raise RuntimeError("Unexpected zero success for task type: {}, from tasks: {}"
                .format(cls, tasks))
1082 1083

    def assertNoTask(self, tasks, cls):
S
Shuduo Sang 已提交
1084
        for task in tasks:
1085
            if isinstance(task, cls):
S
Shuduo Sang 已提交
1086 1087
                raise CrashGenError(
                    "This task: {}, is not expected to be present, given the success/failure of others".format(cls.__name__))
1088 1089

    def assertNoSuccess(self, tasks, cls):
S
Shuduo Sang 已提交
1090
        for task in tasks:
1091 1092
            if isinstance(task, cls):
                if task.isSuccess():
S
Shuduo Sang 已提交
1093 1094
                    raise RuntimeError(
                        "Unexpected successful task: {}".format(cls))
1095 1096

    def hasSuccess(self, tasks, cls):
S
Shuduo Sang 已提交
1097
        for task in tasks:
1098 1099 1100 1101 1102 1103
            if not isinstance(task, cls):
                continue
            if task.isSuccess():
                return True
        return False

S
Steven Li 已提交
1104
    def hasTask(self, tasks, cls):
S
Shuduo Sang 已提交
1105
        for task in tasks:
S
Steven Li 已提交
1106 1107 1108 1109
            if isinstance(task, cls):
                return True
        return False

S
Shuduo Sang 已提交
1110

1111 1112 1113 1114
class StateInvalid(AnyState):
    def getInfo(self):
        return [
            self.STATE_INVALID,
S
Shuduo Sang 已提交
1115 1116 1117
            False, False,  # can create/drop Db
            False, False,  # can create/drop fixed table
            False, False,  # can insert/read data with fixed table
1118 1119 1120 1121
        ]

    # def verifyTasksToState(self, tasks, newState):

S
Shuduo Sang 已提交
1122

1123 1124 1125 1126
class StateEmpty(AnyState):
    def getInfo(self):
        return [
            self.STATE_EMPTY,
S
Shuduo Sang 已提交
1127 1128 1129
            True, False,  # can create/drop Db
            False, False,  # can create/drop fixed table
            False, False,  # can insert/read data with fixed table
1130 1131
        ]

S
Shuduo Sang 已提交
1132 1133
    def verifyTasksToState(self, tasks, newState):
        if (self.hasSuccess(tasks, TaskCreateDb)
1134
                ):  # at EMPTY, if there's succes in creating DB
S
Shuduo Sang 已提交
1135 1136 1137 1138
            if (not self.hasTask(tasks, TaskDropDb)):  # and no drop_db tasks
                # we must have at most one. TODO: compare numbers
                self.assertAtMostOneSuccess(tasks, TaskCreateDb)

1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149

class StateDbOnly(AnyState):
    def getInfo(self):
        return [
            self.STATE_DB_ONLY,
            False, True,
            True, False,
            False, False,
        ]

    def verifyTasksToState(self, tasks, newState):
S
Shuduo Sang 已提交
1150 1151 1152
        if (not self.hasTask(tasks, TaskCreateDb)):
            # only if we don't create any more
            self.assertAtMostOneSuccess(tasks, TaskDropDb)
1153 1154 1155 1156 1157

        # TODO: restore the below, the problem exists, although unlikely in real-world
        # if (gSvcMgr!=None) and gSvcMgr.isRestarting():     
        # if (gSvcMgr == None) or (not gSvcMgr.isRestarting()) : 
        #     self.assertIfExistThenSuccess(tasks, TaskDropDb)       
1158

S
Shuduo Sang 已提交
1159

1160
class StateSuperTableOnly(AnyState):
1161 1162 1163 1164 1165 1166 1167 1168 1169
    def getInfo(self):
        return [
            self.STATE_TABLE_ONLY,
            False, True,
            False, True,
            True, True,
        ]

    def verifyTasksToState(self, tasks, newState):
S
Shuduo Sang 已提交
1170
        if (self.hasSuccess(tasks, TaskDropSuperTable)
1171
                ):  # we are able to drop the table
1172
            #self.assertAtMostOneSuccess(tasks, TaskDropSuperTable)
S
Shuduo Sang 已提交
1173 1174
            # we must have had recreted it
            self.hasSuccess(tasks, TaskCreateSuperTable)
1175

1176
            # self._state = self.STATE_DB_ONLY
S
Steven Li 已提交
1177 1178
        # elif ( self.hasSuccess(tasks, AddFixedDataTask) ): # no success dropping the table, but added data
        #     self.assertNoTask(tasks, DropFixedTableTask) # not true in massively parrallel cases
1179
            # self._state = self.STATE_HAS_DATA
S
Steven Li 已提交
1180 1181 1182
        # elif ( self.hasSuccess(tasks, ReadFixedDataTask) ): # no success in prev cases, but was able to read data
            # self.assertNoTask(tasks, DropFixedTableTask)
            # self.assertNoTask(tasks, AddFixedDataTask)
1183
            # self._state = self.STATE_TABLE_ONLY # no change
S
Steven Li 已提交
1184 1185 1186
        # else: # did not drop table, did not insert data, did not read successfully, that is impossible
        #     raise RuntimeError("Unexpected no-success scenarios")
        # TODO: need to revamp!!
1187

S
Shuduo Sang 已提交
1188

1189 1190 1191 1192 1193 1194 1195 1196 1197 1198
class StateHasData(AnyState):
    def getInfo(self):
        return [
            self.STATE_HAS_DATA,
            False, True,
            False, True,
            True, True,
        ]

    def verifyTasksToState(self, tasks, newState):
S
Shuduo Sang 已提交
1199
        if (newState.equals(AnyState.STATE_EMPTY)):
1200
            self.hasSuccess(tasks, TaskDropDb)
S
Shuduo Sang 已提交
1201 1202 1203 1204
            if (not self.hasTask(tasks, TaskCreateDb)):
                self.assertAtMostOneSuccess(tasks, TaskDropDb)  # TODO: dicy
        elif (newState.equals(AnyState.STATE_DB_ONLY)):  # in DB only
            if (not self.hasTask(tasks, TaskCreateDb)
1205
                ):  # without a create_db task
S
Shuduo Sang 已提交
1206 1207
                # we must have drop_db task
                self.assertNoTask(tasks, TaskDropDb)
1208
            self.hasSuccess(tasks, TaskDropSuperTable)
1209
            # self.assertAtMostOneSuccess(tasks, DropFixedSuperTableTask) # TODO: dicy
1210 1211 1212 1213
        # elif ( newState.equals(AnyState.STATE_TABLE_ONLY) ): # data deleted
            # self.assertNoTask(tasks, TaskDropDb)
            # self.assertNoTask(tasks, TaskDropSuperTable)
            # self.assertNoTask(tasks, TaskAddData)
S
Steven Li 已提交
1214
            # self.hasSuccess(tasks, DeleteDataTasks)
S
Shuduo Sang 已提交
1215 1216
        else:  # should be STATE_HAS_DATA
            if (not self.hasTask(tasks, TaskCreateDb)
1217
                ):  # only if we didn't create one
S
Shuduo Sang 已提交
1218 1219 1220
                # we shouldn't have dropped it
                self.assertNoTask(tasks, TaskDropDb)
            if (not self.hasTask(tasks, TaskCreateSuperTable)
1221
                    ):  # if we didn't create the table
S
Shuduo Sang 已提交
1222 1223
                # we should not have a task that drops it
                self.assertNoTask(tasks, TaskDropSuperTable)
1224
            # self.assertIfExistThenSuccess(tasks, ReadFixedDataTask)
S
Steven Li 已提交
1225

S
Shuduo Sang 已提交
1226

1227
class StateMechine:
1228 1229 1230
    def __init__(self, db: Database): 
        self._db = db
        # transitition target probabilities, indexed with value of STATE_EMPTY, STATE_DB_ONLY, etc.
1231
        self._stateWeights = [1, 2, 10, 40]
S
Shuduo Sang 已提交
1232

1233 1234 1235 1236 1237
    def init(self, dbc: DbConn): # late initailization, don't save the dbConn
        self._curState = self._findCurrentState(dbc)  # starting state
        logger.debug("Found Starting State: {}".format(self._curState))

    # TODO: seems no lnoger used, remove?
1238 1239 1240
    def getCurrentState(self):
        return self._curState

1241 1242 1243
    def hasDatabase(self):
        return self._curState.canDropDb()  # ha, can drop DB means it has one

1244
    # May be slow, use cautionsly...
S
Shuduo Sang 已提交
1245
    def getTaskTypes(self):  # those that can run (directly/indirectly) from the current state
1246 1247 1248 1249 1250 1251
        def typesToStrings(types):
            ss = []
            for t in types:
                ss.append(t.__name__)
            return ss

S
Shuduo Sang 已提交
1252
        allTaskClasses = StateTransitionTask.__subclasses__()  # all state transition tasks
1253 1254
        firstTaskTypes = []
        for tc in allTaskClasses:
S
Shuduo Sang 已提交
1255
            # t = tc(self) # create task object
1256 1257
            if tc.canBeginFrom(self._curState):
                firstTaskTypes.append(tc)
S
Shuduo Sang 已提交
1258 1259 1260 1261 1262 1263 1264 1265
        # now we have all the tasks that can begin directly from the current
        # state, let's figure out the INDIRECT ones
        taskTypes = firstTaskTypes.copy()  # have to have these
        for task1 in firstTaskTypes:  # each task type gathered so far
            endState = task1.getEndState()  # figure the end state
            if endState is None:  # does not change end state
                continue  # no use, do nothing
            for tc in allTaskClasses:  # what task can further begin from there?
1266
                if tc.canBeginFrom(endState) and (tc not in firstTaskTypes):
S
Shuduo Sang 已提交
1267
                    taskTypes.append(tc)  # gather it
1268 1269

        if len(taskTypes) <= 0:
S
Shuduo Sang 已提交
1270 1271 1272 1273 1274 1275 1276
            raise RuntimeError(
                "No suitable task types found for state: {}".format(
                    self._curState))
        logger.debug(
            "[OPS] Tasks found for state {}: {}".format(
                self._curState,
                typesToStrings(taskTypes)))
1277 1278
        return taskTypes

1279
    def _findCurrentState(self, dbc: DbConn):
S
Shuduo Sang 已提交
1280
        ts = time.time()  # we use this to debug how fast/slow it is to do the various queries to find the current DB state
1281 1282
        dbName =self._db.getName()
        if not dbc.existsDatabase(dbName): # dbc.hasDatabases():  # no database?!
1283
            logger.debug( "[STT] empty database found, between {} and {}".format(ts, time.time()))
1284
            return StateEmpty()
S
Shuduo Sang 已提交
1285 1286
        # did not do this when openning connection, and this is NOT the worker
        # thread, which does this on their own
1287
        dbc.use(dbName)
1288 1289
        if not dbc.hasTables():  # no tables
            logger.debug("[STT] DB_ONLY found, between {} and {}".format(ts, time.time()))
1290
            return StateDbOnly()
1291

1292 1293
        sTable = self._db.getFixedSuperTable()
        if sTable.hasRegTables(dbc, dbName):  # no regular tables
1294
            logger.debug("[STT] SUPER_TABLE_ONLY found, between {} and {}".format(ts, time.time()))
1295
            return StateSuperTableOnly()
S
Shuduo Sang 已提交
1296
        else:  # has actual tables
1297
            logger.debug("[STT] HAS_DATA found, between {} and {}".format(ts, time.time()))
1298 1299
            return StateHasData()

1300 1301
    # We transition the system to a new state by examining the current state itself
    def transition(self, tasks, dbc: DbConn):
S
Shuduo Sang 已提交
1302
        if (len(tasks) == 0):  # before 1st step, or otherwise empty
1303
            logger.debug("[STT] Starting State: {}".format(self._curState))
S
Shuduo Sang 已提交
1304
            return  # do nothing
1305

S
Shuduo Sang 已提交
1306
        # this should show up in the server log, separating steps
1307
        dbc.execute("show dnodes")
1308 1309 1310 1311

        # Generic Checks, first based on the start state
        if self._curState.canCreateDb():
            self._curState.assertIfExistThenSuccess(tasks, TaskCreateDb)
S
Shuduo Sang 已提交
1312 1313
            # self.assertAtMostOneSuccess(tasks, CreateDbTask) # not really, in
            # case of multiple creation and drops
1314 1315

        if self._curState.canDropDb():
1316
            if gSvcMgr == None: # only if we are running as client-only
S
Steven Li 已提交
1317
                self._curState.assertIfExistThenSuccess(tasks, TaskDropDb)
S
Shuduo Sang 已提交
1318 1319
            # self.assertAtMostOneSuccess(tasks, DropDbTask) # not really in
            # case of drop-create-drop
1320 1321 1322

        # if self._state.canCreateFixedTable():
            # self.assertIfExistThenSuccess(tasks, CreateFixedTableTask) # Not true, DB may be dropped
S
Shuduo Sang 已提交
1323 1324
            # self.assertAtMostOneSuccess(tasks, CreateFixedTableTask) # not
            # really, in case of create-drop-create
1325 1326 1327

        # if self._state.canDropFixedTable():
            # self.assertIfExistThenSuccess(tasks, DropFixedTableTask) # Not True, the whole DB may be dropped
S
Shuduo Sang 已提交
1328 1329
            # self.assertAtMostOneSuccess(tasks, DropFixedTableTask) # not
            # really in case of drop-create-drop
1330 1331

        # if self._state.canAddData():
S
Shuduo Sang 已提交
1332 1333
        # self.assertIfExistThenSuccess(tasks, AddFixedDataTask)  # not true
        # actually
1334 1335 1336 1337

        # if self._state.canReadData():
            # Nothing for sure

1338
        newState = self._findCurrentState(dbc)
1339
        logger.debug("[STT] New DB state determined: {}".format(newState))
S
Shuduo Sang 已提交
1340 1341
        # can old state move to new state through the tasks?
        self._curState.verifyTasksToState(tasks, newState)
1342 1343 1344
        self._curState = newState

    def pickTaskType(self):
S
Shuduo Sang 已提交
1345 1346
        # all the task types we can choose from at curent state
        taskTypes = self.getTaskTypes()
1347 1348 1349
        weights = []
        for tt in taskTypes:
            endState = tt.getEndState()
S
Shuduo Sang 已提交
1350 1351 1352
            if endState is not None:
                # TODO: change to a method
                weights.append(self._stateWeights[endState.getValIndex()])
1353
            else:
S
Shuduo Sang 已提交
1354 1355
                # read data task, default to 10: TODO: change to a constant
                weights.append(10)
1356
        i = self._weighted_choice_sub(weights)
S
Shuduo Sang 已提交
1357
        # logger.debug(" (weighted random:{}/{}) ".format(i, len(taskTypes)))
1358 1359
        return taskTypes[i]

S
Shuduo Sang 已提交
1360 1361 1362 1363 1364
    # ref:
    # https://eli.thegreenplace.net/2010/01/22/weighted-random-generation-in-python/
    def _weighted_choice_sub(self, weights):
        # TODO: use our dice to ensure it being determinstic?
        rnd = random.random() * sum(weights)
1365 1366 1367 1368
        for i, w in enumerate(weights):
            rnd -= w
            if rnd < 0:
                return i
1369

1370 1371 1372 1373 1374 1375
class Database:
    ''' We use this to represent an actual TDengine database inside a service instance,
        possibly in a cluster environment.

        For now we use it to manage state transitions in that database
    '''
1376 1377 1378 1379 1380
    _clsLock = threading.Lock() # class wide lock
    _lastInt = 101  # next one is initial integer
    _lastTick = 0
    _lastLaggingTick = 0 # lagging tick, for unsequenced insersions

1381 1382 1383 1384
    def __init__(self, dbNum: int, dbc: DbConn): # TODO: remove dbc
        self._dbNum = dbNum # we assign a number to databases, for our testing purpose
        self._stateMachine = StateMechine(self)
        self._stateMachine.init(dbc)
1385
          
1386
        self._lock = threading.RLock()
S
Shuduo Sang 已提交
1387

1388 1389
    def getStateMachine(self) -> StateMechine:
        return self._stateMachine
S
Shuduo Sang 已提交
1390

1391 1392
    def getDbNum(self):
        return self._dbNum
1393

1394 1395
    def getName(self):
        return "db_{}".format(self._dbNum)
1396

1397 1398 1399 1400 1401 1402
    def filterTasks(self, inTasks: List[Task]): # Pick out those belonging to us
        outTasks = []
        for task in inTasks:
            if task.getDb().isSame(self):
                outTasks.append(task)
        return outTasks
1403

1404 1405 1406 1407 1408
    def isSame(self, other):
        return self._dbNum == other._dbNum

    def exists(self, dbc: DbConn):
        return dbc.existsDatabase(self.getName())
1409

1410 1411 1412 1413 1414 1415 1416
    @classmethod
    def getFixedSuperTableName(cls):
        return "fs_table"

    @classmethod
    def getFixedSuperTable(cls) -> TdSuperTable:
        return TdSuperTable(cls.getFixedSuperTableName())
1417 1418 1419 1420 1421 1422

    # We aim to create a starting time tick, such that, whenever we run our test here once
    # We should be able to safely create 100,000 records, which will not have any repeated time stamp
    # when we re-run the test in 3 minutes (180 seconds), basically we should expand time duration
    # by a factor of 500.
    # TODO: what if it goes beyond 10 years into the future
1423
    # TODO: fix the error as result of above: "tsdb timestamp is out of range"
1424 1425
    @classmethod
    def setupLastTick(cls):
1426
        t1 = datetime.datetime(2020, 6, 1)
1427
        t2 = datetime.datetime.now()
S
Shuduo Sang 已提交
1428 1429 1430 1431
        # maybe a very large number, takes 69 years to exceed Python int range
        elSec = int(t2.timestamp() - t1.timestamp())
        elSec2 = (elSec % (8 * 12 * 30 * 24 * 60 * 60 / 500)) * \
            500  # a number representing seconds within 10 years
1432
        # print("elSec = {}".format(elSec))
S
Shuduo Sang 已提交
1433 1434 1435
        t3 = datetime.datetime(2012, 1, 1)  # default "keep" is 10 years
        t4 = datetime.datetime.fromtimestamp(
            t3.timestamp() + elSec2)  # see explanation above
1436 1437 1438
        logger.info("Setting up TICKS to start from: {}".format(t4))
        return t4

1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450
    @classmethod
    def getNextTick(cls):        
        with cls._clsLock:  # prevent duplicate tick
            if cls._lastLaggingTick==0:
                # 10k at 1/20 chance, should be enough to avoid overlaps
                cls._lastLaggingTick = cls.setupLastTick() + datetime.timedelta(0, -10000)                 
            if cls._lastTick==0: # should be quite a bit into the future
                cls._lastTick = cls.setupLastTick()  

            if Dice.throw(20) == 0:  # 1 in 20 chance, return lagging tick
                cls._lastLaggingTick += datetime.timedelta(0, 1) # Go back in time 100 seconds
                return cls._lastLaggingTick 
S
Shuduo Sang 已提交
1451 1452
            else:  # regular
                # add one second to it
1453 1454
                cls._lastTick += datetime.timedelta(0, 1)
                return cls._lastTick
1455 1456

    def getNextInt(self):
1457 1458 1459
        with self._lock:
            self._lastInt += 1
            return self._lastInt
1460 1461

    def getNextBinary(self):
S
Shuduo Sang 已提交
1462 1463
        return "Beijing_Shanghai_Los_Angeles_New_York_San_Francisco_Chicago_Beijing_Shanghai_Los_Angeles_New_York_San_Francisco_Chicago_{}".format(
            self.getNextInt())
1464 1465

    def getNextFloat(self):
1466 1467 1468
        ret = 0.9 + self.getNextInt()
        # print("Float obtained: {}".format(ret))
        return ret
S
Shuduo Sang 已提交
1469

1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518

class DbManager():
    ''' This is a wrapper around DbConn(), to make it easier to use. 

        TODO: rename this to DbConnManager
    '''
    def __init__(self):
        self.tableNumQueue = LinearQueue() # TODO: delete?
        # self.openDbServerConnection()
        self._dbConn = DbConn.createNative() if (
            gConfig.connector_type == 'native') else DbConn.createRest()
        try:
            self._dbConn.open()  # may throw taos.error.ProgrammingError: disconnected
        except taos.error.ProgrammingError as err:
            # print("Error type: {}, msg: {}, value: {}".format(type(err), err.msg, err))
            if (err.msg == 'client disconnected'):  # cannot open DB connection
                print(
                    "Cannot establish DB connection, please re-run script without parameter, and follow the instructions.")
                sys.exit(2)
            else:
                print("Failed to connect to DB, errno = {}, msg: {}"
                    .format(Helper.convertErrno(err.errno), err.msg))
                raise
        except BaseException:
            print("[=] Unexpected exception")
            raise

        # Do this after dbConn is in proper shape
        # Moved to Database()
        # self._stateMachine = StateMechine(self._dbConn)

    def getDbConn(self):
        return self._dbConn

    # TODO: not used any more, to delete
    def pickAndAllocateTable(self):  # pick any table, and "use" it
        return self.tableNumQueue.pickAndAllocate()

    # TODO: Not used any more, to delete
    def addTable(self):
        with self._lock:
            tIndex = self.tableNumQueue.push()
        return tIndex

    # Not used any more, to delete
    def releaseTable(self, i):  # return the table back, so others can use it
        self.tableNumQueue.release(i)    

    # TODO: not used any more, delete
S
Steven Li 已提交
1519
    def getTableNameToDelete(self):
S
Shuduo Sang 已提交
1520 1521
        tblNum = self.tableNumQueue.pop()  # TODO: race condition!
        if (not tblNum):  # maybe false
1522
            return False
S
Shuduo Sang 已提交
1523

S
Steven Li 已提交
1524 1525
        return "table_{}".format(tblNum)

1526
    def cleanUp(self):
S
Shuduo Sang 已提交
1527 1528
        self._dbConn.close()

1529
class TaskExecutor():
1530
    class BoundedList:
S
Shuduo Sang 已提交
1531
        def __init__(self, size=10):
1532 1533
            self._size = size
            self._list = []
S
Steven Li 已提交
1534
            self._lock = threading.Lock()
1535

S
Shuduo Sang 已提交
1536
        def add(self, n: int):
S
Steven Li 已提交
1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562
            with self._lock:
                if not self._list:  # empty
                    self._list.append(n)
                    return
                # now we should insert
                nItems = len(self._list)
                insPos = 0
                for i in range(nItems):
                    insPos = i
                    if n <= self._list[i]:  # smaller than this item, time to insert
                        break  # found the insertion point
                    insPos += 1  # insert to the right

                if insPos == 0:  # except for the 1st item, # TODO: elimiate first item as gating item
                    return  # do nothing

                # print("Inserting at postion {}, value: {}".format(insPos, n))
                self._list.insert(insPos, n)  # insert

                newLen = len(self._list)
                if newLen <= self._size:
                    return  # do nothing
                elif newLen == (self._size + 1):
                    del self._list[0]  # remove the first item
                else:
                    raise RuntimeError("Corrupt Bounded List")
1563 1564 1565 1566 1567 1568

        def __str__(self):
            return repr(self._list)

    _boundedList = BoundedList()

1569 1570 1571
    def __init__(self, curStep):
        self._curStep = curStep

1572 1573 1574 1575
    @classmethod
    def getBoundedList(cls):
        return cls._boundedList

1576 1577 1578
    def getCurStep(self):
        return self._curStep

S
Shuduo Sang 已提交
1579
    def execute(self, task: Task, wt: WorkerThread):  # execute a task on a thread
1580
        task.execute(wt)
1581

1582 1583 1584 1585
    def recordDataMark(self, n: int):
        # print("[{}]".format(n), end="", flush=True)
        self._boundedList.add(n)

1586 1587
    # def logInfo(self, msg):
    #     logger.info("    T[{}.x]: ".format(self._curStep) + msg)
1588

1589 1590
    # def logDebug(self, msg):
    #     logger.debug("    T[{}.x]: ".format(self._curStep) + msg)
1591

S
Shuduo Sang 已提交
1592

S
Steven Li 已提交
1593
class Task():
1594 1595 1596 1597
    ''' A generic "Task" to be executed. For now we decide that there is no
        need to embed a DB connection here, we use whatever the Worker Thread has
        instead. But a task is always associated with a DB
    '''
1598 1599 1600 1601
    taskSn = 100

    @classmethod
    def allocTaskNum(cls):
S
Shuduo Sang 已提交
1602
        Task.taskSn += 1  # IMPORTANT: cannot use cls.taskSn, since each sub class will have a copy
S
Steven Li 已提交
1603 1604
        # logger.debug("Allocating taskSN: {}".format(Task.taskSn))
        return Task.taskSn
1605

1606
    def __init__(self, execStats: ExecutionStats, db: Database):
S
Shuduo Sang 已提交
1607
        self._workerThread = None
1608
        self._err = None # type: Exception
1609
        self._aborted = False
1610
        self._curStep = None
S
Shuduo Sang 已提交
1611
        self._numRows = None  # Number of rows affected
1612

S
Shuduo Sang 已提交
1613
        # Assign an incremental task serial number
1614
        self._taskNum = self.allocTaskNum()
S
Steven Li 已提交
1615
        # logger.debug("Creating new task {}...".format(self._taskNum))
1616

1617
        self._execStats = execStats
1618
        self._db = db # A task is always associated/for a specific DB
1619

1620
    def isSuccess(self):
S
Shuduo Sang 已提交
1621
        return self._err is None
1622

1623 1624 1625
    def isAborted(self):
        return self._aborted

S
Shuduo Sang 已提交
1626
    def clone(self):  # TODO: why do we need this again?
1627
        newTask = self.__class__(self._execStats, self._db)
1628 1629
        return newTask

1630 1631 1632
    def getDb(self):
        return self._db

1633
    def logDebug(self, msg):
S
Shuduo Sang 已提交
1634 1635 1636
        self._workerThread.logDebug(
            "Step[{}.{}] {}".format(
                self._curStep, self._taskNum, msg))
1637 1638

    def logInfo(self, msg):
S
Shuduo Sang 已提交
1639 1640 1641
        self._workerThread.logInfo(
            "Step[{}.{}] {}".format(
                self._curStep, self._taskNum, msg))
1642

1643
    def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
S
Shuduo Sang 已提交
1644 1645 1646
        raise RuntimeError(
            "To be implemeted by child classes, class name: {}".format(
                self.__class__.__name__))
1647

1648 1649 1650
    def _isErrAcceptable(self, errno, msg):
        if errno in [
                0x05,  # TSDB_CODE_RPC_NOT_READY
1651
                0x0B,  # Unable to establish connection, more details in TD-1648
1652
                # 0x200, # invalid SQL, TODO: re-examine with TD-934
1653 1654
                0x217, # "db not selected", client side defined error code
                0x218, # "Table does not exist" client side defined error code
1655 1656 1657 1658 1659 1660 1661 1662 1663
                0x360, 0x362, 
                0x369, # tag already exists
                0x36A, 0x36B, 0x36D,
                0x381, 
                0x380, # "db not selected"
                0x383,
                0x386,  # DB is being dropped?!
                0x503,
                0x510,  # vnode not in ready state
1664
                0x14,   # db not ready, errno changed
1665 1666 1667 1668
                0x600,
                1000  # REST catch-all error
            ]: 
            return True # These are the ALWAYS-ACCEPTABLE ones
1669 1670
        elif (errno in [ 0x0B ]) and gConfig.auto_start_service:
            return True # We may get "network unavilable" when restarting service
1671 1672 1673
        elif errno == 0x200 : # invalid SQL, we need to div in a bit more
            if msg.find("invalid column name") != -1:
                return True 
1674 1675 1676 1677
            elif msg.find("tags number not matched") != -1: # mismatched tags after modification
                return True
            elif msg.find("duplicated column names") != -1: # also alter table tag issues
                return True
S
Steven Li 已提交
1678 1679 1680
        elif (gSvcMgr!=None) and gSvcMgr.isRestarting():
            logger.info("Ignoring error when service is restarting: errno = {}, msg = {}".format(errno, msg))
            return True
1681 1682 1683 1684
        
        return False # Not an acceptable error


1685 1686
    def execute(self, wt: WorkerThread):
        wt.verifyThreadSelf()
S
Shuduo Sang 已提交
1687
        self._workerThread = wt  # type: ignore
1688 1689

        te = wt.getTaskExecutor()
1690
        self._curStep = te.getCurStep()
S
Shuduo Sang 已提交
1691 1692
        self.logDebug(
            "[-] executing task {}...".format(self.__class__.__name__))
1693

1694
        self._err = None # TODO: type hint mess up?
1695 1696
        self._execStats.beginTaskType(self.__class__.__name__)  # mark beginning
        errno2 = None
1697 1698 1699

        # Now pick a database, and stick with it for the duration of the task execution
        dbName = self._db.getName()
1700
        try:
S
Shuduo Sang 已提交
1701
            self._executeInternal(te, wt)  # TODO: no return value?
1702
        except taos.error.ProgrammingError as err:
1703
            errno2 = Helper.convertErrno(err.errno)
1704
            if (gConfig.continue_on_exception):  # user choose to continue
1705
                self.logDebug("[=] Continue after TAOS exception: errno=0x{:X}, msg: {}, SQL: {}".format(
1706
                        errno2, err, wt.getDbConn().getLastSql()))
1707
                self._err = err
1708 1709
            elif self._isErrAcceptable(errno2, err.__str__()):
                self.logDebug("[=] Acceptable Taos library exception: errno=0x{:X}, msg: {}, SQL: {}".format(
1710
                        errno2, err, wt.getDbConn().getLastSql()))
1711
                print("_", end="", flush=True)
S
Shuduo Sang 已提交
1712
                self._err = err
1713
            else: # not an acceptable error
1714 1715 1716
                errMsg = "[=] Unexpected Taos library exception ({}): errno=0x{:X}, msg: {}, SQL: {}".format(
                    self.__class__.__name__,
                    errno2, err, wt.getDbConn().getLastSql())
1717
                self.logDebug(errMsg)
S
Shuduo Sang 已提交
1718
                if gConfig.debug:
1719 1720
                    # raise # so that we see full stack
                    traceback.print_exc()
1721 1722
                print(
                    "\n\n----------------------------\nProgram ABORTED Due to Unexpected TAOS Error: \n\n{}\n".format(errMsg) +
1723 1724 1725 1726
                    "----------------------------\n")
                # sys.exit(-1)
                self._err = err
                self._aborted = True
S
Shuduo Sang 已提交
1727
        except Exception as e:
S
Steven Li 已提交
1728
            self.logInfo("Non-TAOS exception encountered")
S
Shuduo Sang 已提交
1729
            self._err = e
S
Steven Li 已提交
1730
            self._aborted = True
1731
            traceback.print_exc()
1732
        except BaseException as e:
1733
            self.logInfo("Python base exception encountered")
1734
            self._err = e
1735
            self._aborted = True
S
Steven Li 已提交
1736
            traceback.print_exc()
1737
        except BaseException: # TODO: what is this again??!!
S
Shuduo Sang 已提交
1738 1739
            self.logDebug(
                "[=] Unexpected exception, SQL: {}".format(
1740
                    wt.getDbConn().getLastSql()))
1741
            raise
1742
        self._execStats.endTaskType(self.__class__.__name__, self.isSuccess())
S
Shuduo Sang 已提交
1743 1744 1745 1746

        self.logDebug("[X] task execution completed, {}, status: {}".format(
            self.__class__.__name__, "Success" if self.isSuccess() else "Failure"))
        # TODO: merge with above.
1747
        self._execStats.incExecCount(self.__class__.__name__, self.isSuccess(), errno2)
S
Steven Li 已提交
1748

1749
    # TODO: refactor away, just provide the dbConn
S
Shuduo Sang 已提交
1750
    def execWtSql(self, wt: WorkerThread, sql):  # execute an SQL on the worker thread
1751
        """ Haha """
1752 1753
        return wt.execSql(sql)

S
Shuduo Sang 已提交
1754
    def queryWtSql(self, wt: WorkerThread, sql):  # execute an SQL on the worker thread
1755 1756
        return wt.querySql(sql)

S
Shuduo Sang 已提交
1757
    def getQueryResult(self, wt: WorkerThread):  # execute an SQL on the worker thread
1758 1759 1760
        return wt.getQueryResult()


1761
class ExecutionStats:
1762
    def __init__(self):
S
Shuduo Sang 已提交
1763 1764
        # total/success times for a task
        self._execTimes: Dict[str, [int, int]] = {}
1765 1766 1767
        self._tasksInProgress = 0
        self._lock = threading.Lock()
        self._firstTaskStartTime = None
1768
        self._execStartTime = None
1769
        self._errors = {}
S
Shuduo Sang 已提交
1770 1771
        self._elapsedTime = 0.0  # total elapsed time
        self._accRunTime = 0.0  # accumulated run time
1772

1773 1774 1775
        self._failed = False
        self._failureReason = None

S
Steven Li 已提交
1776
    def __str__(self):
S
Shuduo Sang 已提交
1777 1778
        return "[ExecStats: _failed={}, _failureReason={}".format(
            self._failed, self._failureReason)
S
Steven Li 已提交
1779 1780

    def isFailed(self):
S
Shuduo Sang 已提交
1781
        return self._failed
S
Steven Li 已提交
1782

1783 1784 1785 1786 1787 1788
    def startExec(self):
        self._execStartTime = time.time()

    def endExec(self):
        self._elapsedTime = time.time() - self._execStartTime

1789
    def incExecCount(self, klassName, isSuccess, eno=None):  # TODO: add a lock here
1790 1791
        if klassName not in self._execTimes:
            self._execTimes[klassName] = [0, 0]
S
Shuduo Sang 已提交
1792 1793
        t = self._execTimes[klassName]  # tuple for the data
        t[0] += 1  # index 0 has the "total" execution times
1794
        if isSuccess:
S
Shuduo Sang 已提交
1795
            t[1] += 1  # index 1 has the "success" execution times
1796 1797 1798 1799 1800
        if eno != None:             
            if klassName not in self._errors:
                self._errors[klassName] = {}
            errors = self._errors[klassName]
            errors[eno] = errors[eno]+1 if eno in errors else 1
1801 1802 1803

    def beginTaskType(self, klassName):
        with self._lock:
S
Shuduo Sang 已提交
1804 1805
            if self._tasksInProgress == 0:  # starting a new round
                self._firstTaskStartTime = time.time()  # I am now the first task
1806 1807 1808 1809 1810
            self._tasksInProgress += 1

    def endTaskType(self, klassName, isSuccess):
        with self._lock:
            self._tasksInProgress -= 1
S
Shuduo Sang 已提交
1811
            if self._tasksInProgress == 0:  # all tasks have stopped
1812 1813 1814
                self._accRunTime += (time.time() - self._firstTaskStartTime)
                self._firstTaskStartTime = None

1815 1816 1817 1818
    def registerFailure(self, reason):
        self._failed = True
        self._failureReason = reason

1819
    def printStats(self):
S
Shuduo Sang 已提交
1820 1821 1822 1823 1824 1825
        logger.info(
            "----------------------------------------------------------------------")
        logger.info(
            "| Crash_Gen test {}, with the following stats:". format(
                "FAILED (reason: {})".format(
                    self._failureReason) if self._failed else "SUCCEEDED"))
1826
        logger.info("| Task Execution Times (success/total):")
1827
        execTimesAny = 0
S
Shuduo Sang 已提交
1828
        for k, n in self._execTimes.items():
1829
            execTimesAny += n[0]
1830 1831 1832 1833 1834 1835 1836 1837
            errStr = None
            if k in self._errors:
                errors = self._errors[k]
                # print("errors = {}".format(errors))
                errStrs = ["0x{:X}:{}".format(eno, n) for (eno, n) in errors.items()]
                # print("error strings = {}".format(errStrs))
                errStr = ", ".join(errStrs) 
            logger.info("|    {0:<24}: {1}/{2} (Errors: {3})".format(k, n[1], n[0], errStr))
S
Shuduo Sang 已提交
1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851

        logger.info(
            "| Total Tasks Executed (success or not): {} ".format(execTimesAny))
        logger.info(
            "| Total Tasks In Progress at End: {}".format(
                self._tasksInProgress))
        logger.info(
            "| Total Task Busy Time (elapsed time when any task is in progress): {:.3f} seconds".format(
                self._accRunTime))
        logger.info(
            "| Average Per-Task Execution Time: {:.3f} seconds".format(self._accRunTime / execTimesAny))
        logger.info(
            "| Total Elapsed Time (from wall clock): {:.3f} seconds".format(
                self._elapsedTime))
1852
        logger.info("| Top numbers written: {}".format(TaskExecutor.getBoundedList()))
1853
        logger.info("| Active DB Native Connections (now): {}".format(DbConnNative.totalConnections))
1854 1855 1856 1857
        logger.info("| Longest native query time: {:.3f} seconds, started: {}".
            format(MyTDSql.longestQueryTime, 
                time.strftime("%x %X", time.localtime(MyTDSql.lqStartTime))) )
        logger.info("| Longest native query: {}".format(MyTDSql.longestQuery))
S
Shuduo Sang 已提交
1858 1859
        logger.info(
            "----------------------------------------------------------------------")
1860 1861 1862


class StateTransitionTask(Task):
1863 1864 1865 1866 1867
    LARGE_NUMBER_OF_TABLES = 35
    SMALL_NUMBER_OF_TABLES = 3
    LARGE_NUMBER_OF_RECORDS = 50
    SMALL_NUMBER_OF_RECORDS = 3

1868
    @classmethod
S
Shuduo Sang 已提交
1869
    def getInfo(cls):  # each sub class should supply their own information
1870 1871
        raise RuntimeError("Overriding method expected")

S
Shuduo Sang 已提交
1872
    _endState = None
1873
    @classmethod
S
Shuduo Sang 已提交
1874
    def getEndState(cls):  # TODO: optimize by calling it fewer times
1875 1876
        raise RuntimeError("Overriding method expected")

1877 1878 1879
    # @classmethod
    # def getBeginStates(cls):
    #     return cls.getInfo()[0]
1880

1881 1882 1883
    # @classmethod
    # def getEndState(cls): # returning the class name
    #     return cls.getInfo()[0]
1884 1885

    @classmethod
1886 1887 1888
    def canBeginFrom(cls, state: AnyState):
        # return state.getValue() in cls.getBeginStates()
        raise RuntimeError("must be overriden")
1889

1890 1891
    @classmethod
    def getRegTableName(cls, i):
1892
        return "reg_table_{}".format(i)
1893

1894 1895
    def execute(self, wt: WorkerThread):
        super().execute(wt)
S
Shuduo Sang 已提交
1896 1897


1898
class TaskCreateDb(StateTransitionTask):
1899
    @classmethod
1900
    def getEndState(cls):
S
Shuduo Sang 已提交
1901
        return StateDbOnly()
1902

1903 1904 1905 1906
    @classmethod
    def canBeginFrom(cls, state: AnyState):
        return state.canCreateDb()

1907
    # Actually creating the database(es)
1908
    def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
1909
        # was: self.execWtSql(wt, "create database db")
1910 1911 1912 1913 1914 1915
        repStr = ""
        if gConfig.max_replicas != 1:
            numReplica = Dice.throw(gConfig.max_replicas) + 1 # 1,2 ... N
            repStr = "replica {}".format(numReplica)
        self.execWtSql(wt, "create database {} {}"
            .format(self._db.getName(), repStr) )
1916

1917
class TaskDropDb(StateTransitionTask):
1918
    @classmethod
1919 1920
    def getEndState(cls):
        return StateEmpty()
1921

1922 1923 1924 1925
    @classmethod
    def canBeginFrom(cls, state: AnyState):
        return state.canDropDb()

1926
    def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
1927
        self.execWtSql(wt, "drop database {}".format(self._db.getName()))
S
Steven Li 已提交
1928
        logger.debug("[OPS] database dropped at {}".format(time.time()))
1929

1930
class TaskCreateSuperTable(StateTransitionTask):
1931
    @classmethod
1932 1933
    def getEndState(cls):
        return StateSuperTableOnly()
1934

1935 1936
    @classmethod
    def canBeginFrom(cls, state: AnyState):
1937
        return state.canCreateFixedSuperTable()
1938

1939
    def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
1940
        if not self._db.exists(wt.getDbConn()):
1941 1942 1943
            logger.debug("Skipping task, no DB yet")
            return

1944
        sTable = self._db.getFixedSuperTable() # type: TdSuperTable
1945
        # wt.execSql("use db")    # should always be in place
1946 1947
        sTable.create(wt.getDbConn(), self._db.getName(), 
            {'ts':'timestamp', 'speed':'int'}, {'b':'binary(200)', 'f':'float'})
1948
        # self.execWtSql(wt,"create table db.{} (ts timestamp, speed int) tags (b binary(200), f float) ".format(tblName))
S
Shuduo Sang 已提交
1949 1950
        # No need to create the regular tables, INSERT will do that
        # automatically
1951

S
Steven Li 已提交
1952

1953 1954 1955 1956
class TdSuperTable:
    def __init__(self, stName):
        self._stName = stName

1957 1958 1959
    def getName(self):
        return self._stName

1960 1961 1962 1963 1964
    # TODO: odd semantic, create() method is usually static?
    def create(self, dbc, dbName, cols: dict, tags: dict):
        '''Creating a super table'''
        sql = "CREATE TABLE {}.{} ({}) TAGS ({})".format(
            dbName,
1965 1966 1967 1968 1969 1970
            self._stName,
            ",".join(['%s %s'%(k,v) for (k,v) in cols.items()]),
            ",".join(['%s %s'%(k,v) for (k,v) in tags.items()])
            )
        dbc.execute(sql)        

1971
    def getRegTables(self, dbc: DbConn, dbName: str):
1972
        try:
1973
            dbc.query("select TBNAME from {}.{}".format(dbName, self._stName))  # TODO: analyze result set later            
1974
        except taos.error.ProgrammingError as err:                    
1975
            errno2 = Helper.convertErrno(err.errno) 
1976 1977 1978 1979 1980 1981
            logger.debug("[=] Failed to get tables from super table: errno=0x{:X}, msg: {}".format(errno2, err))
            raise

        qr = dbc.getQueryResult()
        return [v[0] for v in qr] # list transformation, ref: https://stackoverflow.com/questions/643823/python-list-transformation

1982 1983
    def hasRegTables(self, dbc: DbConn, dbName: str):
        return dbc.query("SELECT * FROM {}.{}".format(dbName, self._stName)) > 0
1984

1985 1986
    def ensureTable(self, dbc: DbConn, dbName: str, regTableName: str):
        sql = "select tbname from {}.{} where tbname in ('{}')".format(dbName, self._stName, regTableName)
1987 1988
        if dbc.query(sql) >= 1 : # reg table exists already
            return
1989 1990
        sql = "CREATE TABLE {}.{} USING {}.{} tags ({})".format(
            dbName, regTableName, dbName, self._stName, self._getTagStrForSql(dbc, dbName)
1991 1992 1993
        )
        dbc.execute(sql)

1994 1995
    def _getTagStrForSql(self, dbc, dbName: str) :
        tags = self._getTags(dbc, dbName)
1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008
        tagStrs = []
        for tagName in tags: 
            tagType = tags[tagName]
            if tagType == 'BINARY':
                tagStrs.append("'Beijing-Shanghai-LosAngeles'")
            elif tagType == 'FLOAT':
                tagStrs.append('9.9')
            elif tagType == 'INT':
                tagStrs.append('88')
            else:
                raise RuntimeError("Unexpected tag type: {}".format(tagType))
        return ", ".join(tagStrs)

2009 2010
    def _getTags(self, dbc, dbName) -> dict:
        dbc.query("DESCRIBE {}.{}".format(dbName, self._stName))
2011 2012 2013 2014 2015 2016
        stCols = dbc.getQueryResult()
        # print(stCols)
        ret = {row[0]:row[1] for row in stCols if row[3]=='TAG'} # name:type
        # print("Tags retrieved: {}".format(ret))
        return ret

2017 2018
    def addTag(self, dbc, dbName, tagName, tagType):
        if tagName in self._getTags(dbc, dbName): # already 
2019 2020
            return
        # sTable.addTag("extraTag", "int")
2021
        sql = "alter table {}.{} add tag {} {}".format(dbName, self._stName, tagName, tagType)
2022 2023
        dbc.execute(sql)

2024 2025
    def dropTag(self, dbc, dbName, tagName):
        if not tagName in self._getTags(dbc, dbName): # don't have this tag
2026
            return
2027
        sql = "alter table {}.{} drop tag {}".format(dbName, self._stName, tagName)
2028 2029
        dbc.execute(sql)

2030 2031
    def changeTag(self, dbc, dbName, oldTag, newTag):
        tags = self._getTags(dbc, dbName)
2032 2033 2034 2035
        if not oldTag in tags: # don't have this tag
            return
        if newTag in tags: # already have this tag
            return
2036
        sql = "alter table {}.{} change tag {} {}".format(dbName, self._stName, oldTag, newTag)
2037 2038
        dbc.execute(sql)

2039
class TaskReadData(StateTransitionTask):
2040
    @classmethod
2041
    def getEndState(cls):
S
Shuduo Sang 已提交
2042
        return None  # meaning doesn't affect state
2043

2044 2045 2046 2047
    @classmethod
    def canBeginFrom(cls, state: AnyState):
        return state.canReadData()

2048
    def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
2049
        sTable = self._db.getFixedSuperTable()
2050

2051 2052
        # 1 in 5 chance, simulate a broken connection. 
        if random.randrange(5) == 0:  # TODO: break connection in all situations
2053 2054
            wt.getDbConn().close()
            wt.getDbConn().open()
2055
            print("_r", end="", flush=True)
2056
        
2057
        dbc = wt.getDbConn()
2058 2059
        dbName = self._db.getName()
        for rTbName in sTable.getRegTables(dbc, dbName):  # regular tables
2060
            aggExpr = Dice.choice([
2061 2062 2063
                '*',
                'count(*)',
                'avg(speed)',
2064
                # 'twa(speed)', # TODO: this one REQUIRES a where statement, not reasonable
2065 2066
                'sum(speed)', 
                'stddev(speed)', 
2067
                # SELECTOR functions
2068 2069 2070
                'min(speed)', 
                'max(speed)', 
                'first(speed)', 
2071
                'last(speed)',
2072 2073 2074
                'top(speed, 50)', # TODO: not supported?
                'bottom(speed, 50)', # TODO: not supported?
                'apercentile(speed, 10)', # TODO: TD-1316
2075 2076 2077 2078 2079
                'last_row(speed)',
                # Transformation Functions
                # 'diff(speed)', # TODO: no supported?!
                'spread(speed)'
                ]) # TODO: add more from 'top'
2080 2081 2082
            filterExpr = Dice.choice([ # TODO: add various kind of WHERE conditions
                None
            ])
2083
            try:
2084
                # Run the query against the regular table first
2085
                dbc.execute("select {} from {}.{}".format(aggExpr, dbName, rTbName))
2086
                # Then run it against the super table
2087
                if aggExpr not in ['stddev(speed)']: #TODO: STDDEV not valid for super tables?!
2088
                    dbc.execute("select {} from {}.{}".format(aggExpr, dbName, sTable.getName()))
2089
            except taos.error.ProgrammingError as err:                    
2090
                errno2 = Helper.convertErrno(err.errno)
2091
                logger.debug("[=] Read Failure: errno=0x{:X}, msg: {}, SQL: {}".format(errno2, err, dbc.getLastSql()))
2092
                raise
S
Shuduo Sang 已提交
2093

2094
class TaskDropSuperTable(StateTransitionTask):
2095
    @classmethod
2096
    def getEndState(cls):
S
Shuduo Sang 已提交
2097
        return StateDbOnly()
2098

2099 2100
    @classmethod
    def canBeginFrom(cls, state: AnyState):
2101
        return state.canDropFixedSuperTable()
2102

2103
    def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
2104
        # 1/2 chance, we'll drop the regular tables one by one, in a randomized sequence
S
Shuduo Sang 已提交
2105
        if Dice.throw(2) == 0:
2106
            # print("_7_", end="", flush=True)
S
Shuduo Sang 已提交
2107 2108 2109 2110
            tblSeq = list(range(
                2 + (self.LARGE_NUMBER_OF_TABLES if gConfig.larger_data else self.SMALL_NUMBER_OF_TABLES)))
            random.shuffle(tblSeq)
            tickOutput = False  # if we have spitted out a "d" character for "drop regular table"
2111
            isSuccess = True
S
Shuduo Sang 已提交
2112
            for i in tblSeq:
2113
                regTableName = self.getRegTableName(i)  # "db.reg_table_{}".format(i)
2114
                try:
2115 2116
                    self.execWtSql(wt, "drop table {}.{}".
                        format(self._db.getName(), regTableName))  # nRows always 0, like MySQL
S
Shuduo Sang 已提交
2117
                except taos.error.ProgrammingError as err:
2118 2119
                    # correcting for strange error number scheme                    
                    errno2 = Helper.convertErrno(err.errno)
S
Shuduo Sang 已提交
2120
                    if (errno2 in [0x362]):  # mnode invalid table name
2121
                        isSuccess = False
2122
                        logger.debug("[DB] Acceptable error when dropping a table")
S
Shuduo Sang 已提交
2123
                    continue  # try to delete next regular table
2124 2125

                if (not tickOutput):
S
Shuduo Sang 已提交
2126 2127
                    tickOutput = True  # Print only one time
                    if isSuccess:
2128 2129
                        print("d", end="", flush=True)
                    else:
S
Shuduo Sang 已提交
2130
                        print("f", end="", flush=True)
2131 2132

        # Drop the super table itself
2133 2134
        tblName = self._db.getFixedSuperTableName()
        self.execWtSql(wt, "drop table {}.{}".format(self._db.getName(), tblName))
2135

S
Shuduo Sang 已提交
2136

2137 2138 2139
class TaskAlterTags(StateTransitionTask):
    @classmethod
    def getEndState(cls):
S
Shuduo Sang 已提交
2140
        return None  # meaning doesn't affect state
2141 2142 2143

    @classmethod
    def canBeginFrom(cls, state: AnyState):
S
Shuduo Sang 已提交
2144
        return state.canDropFixedSuperTable()  # if we can drop it, we can alter tags
2145 2146

    def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
2147 2148
        # tblName = self._dbManager.getFixedSuperTableName()
        dbc = wt.getDbConn()
2149 2150
        sTable = self._db.getFixedSuperTable()
        dbName = self._db.getName()
2151
        dice = Dice.throw(4)
S
Shuduo Sang 已提交
2152
        if dice == 0:
2153
            sTable.addTag(dbc, dbName, "extraTag", "int")
2154
            # sql = "alter table db.{} add tag extraTag int".format(tblName)
S
Shuduo Sang 已提交
2155
        elif dice == 1:
2156
            sTable.dropTag(dbc, dbName, "extraTag")
2157
            # sql = "alter table db.{} drop tag extraTag".format(tblName)
S
Shuduo Sang 已提交
2158
        elif dice == 2:
2159
            sTable.dropTag(dbc, dbName, "newTag")
2160
            # sql = "alter table db.{} drop tag newTag".format(tblName)
S
Shuduo Sang 已提交
2161
        else:  # dice == 3
2162
            sTable.changeTag(dbc, dbName, "extraTag", "newTag")
2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178
            # sql = "alter table db.{} change tag extraTag newTag".format(tblName)

class TaskRestartService(StateTransitionTask):
    _isRunning = False
    _classLock = threading.Lock()

    @classmethod
    def getEndState(cls):
        return None  # meaning doesn't affect state

    @classmethod
    def canBeginFrom(cls, state: AnyState):
        if gConfig.auto_start_service:
            return state.canDropFixedSuperTable()  # Basicallly when we have the super table
        return False # don't run this otherwise

2179
    CHANCE_TO_RESTART_SERVICE = 200
2180 2181 2182 2183
    def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
        if not gConfig.auto_start_service: # only execute when we are in -a mode
            print("_a", end="", flush=True)
            return
2184

2185 2186 2187 2188 2189 2190
        with self._classLock:
            if self._isRunning:
                print("Skipping restart task, another running already")
                return
            self._isRunning = True

2191
        if Dice.throw(self.CHANCE_TO_RESTART_SERVICE) == 0: # 1 in N chance
2192 2193 2194
            dbc = wt.getDbConn()
            dbc.execute("show databases") # simple delay, align timing with other workers
            gSvcMgr.restart()
2195

2196
        self._isRunning = False
S
Shuduo Sang 已提交
2197

2198
class TaskAddData(StateTransitionTask):
S
Shuduo Sang 已提交
2199 2200
    # Track which table is being actively worked on
    activeTable: Set[int] = set()
2201

S
Shuduo Sang 已提交
2202 2203
    # We use these two files to record operations to DB, useful for power-off
    # tests
2204 2205 2206 2207 2208
    fAddLogReady = None
    fAddLogDone = None

    @classmethod
    def prepToRecordOps(cls):
S
Shuduo Sang 已提交
2209 2210 2211 2212
        if gConfig.record_ops:
            if (cls.fAddLogReady is None):
                logger.info(
                    "Recording in a file operations to be performed...")
2213
                cls.fAddLogReady = open("add_log_ready.txt", "w")
S
Shuduo Sang 已提交
2214
            if (cls.fAddLogDone is None):
2215 2216
                logger.info("Recording in a file operations completed...")
                cls.fAddLogDone = open("add_log_done.txt", "w")
2217

2218
    @classmethod
2219 2220
    def getEndState(cls):
        return StateHasData()
2221 2222 2223 2224

    @classmethod
    def canBeginFrom(cls, state: AnyState):
        return state.canAddData()
S
Shuduo Sang 已提交
2225

2226
    def _executeInternal(self, te: TaskExecutor, wt: WorkerThread):
2227 2228
        # ds = self._dbManager # Quite DANGEROUS here, may result in multi-thread client access
        db = self._db
2229
        dbc = wt.getDbConn()
2230
        tblSeq = list(range(
S
Shuduo Sang 已提交
2231 2232 2233 2234
                self.LARGE_NUMBER_OF_TABLES if gConfig.larger_data else self.SMALL_NUMBER_OF_TABLES))
        random.shuffle(tblSeq)
        for i in tblSeq:
            if (i in self.activeTable):  # wow already active
2235
                print("x", end="", flush=True) # concurrent insertion
2236
            else:
S
Shuduo Sang 已提交
2237
                self.activeTable.add(i)  # marking it active
2238
            
2239
            sTable = db.getFixedSuperTable()
2240
            regTableName = self.getRegTableName(i)  # "db.reg_table_{}".format(i)
2241
            sTable.ensureTable(wt.getDbConn(), db.getName(), regTableName)  # Ensure the table exists           
2242 2243
           
            for j in range(self.LARGE_NUMBER_OF_RECORDS if gConfig.larger_data else self.SMALL_NUMBER_OF_RECORDS):  # number of records per table
2244
                nextInt = db.getNextInt()
2245
                nextTick = db.getNextTick()
2246 2247
                if gConfig.record_ops:
                    self.prepToRecordOps()
2248
                    self.fAddLogReady.write("Ready to write {} to {}\n".format(nextInt, regTableName))
2249 2250
                    self.fAddLogReady.flush()
                    os.fsync(self.fAddLogReady)
2251 2252
                sql = "insert into {}.{} values ('{}', {});".format( # removed: tags ('{}', {})
                    db.getName(),
S
Shuduo Sang 已提交
2253
                    regTableName,
2254 2255
                    # ds.getFixedSuperTableName(),
                    # ds.getNextBinary(), ds.getNextFloat(),
2256 2257
                    nextTick, nextInt)
                dbc.execute(sql)
S
Shuduo Sang 已提交
2258 2259
                # Successfully wrote the data into the DB, let's record it
                # somehow
2260
                te.recordDataMark(nextInt)
2261
                if gConfig.record_ops:
S
Shuduo Sang 已提交
2262 2263 2264
                    self.fAddLogDone.write(
                        "Wrote {} to {}\n".format(
                            nextInt, regTableName))
2265 2266
                    self.fAddLogDone.flush()
                    os.fsync(self.fAddLogDone)
2267 2268

                # Now read it back and verify, we might encounter an error if table is dropped
2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285
                if gConfig.verify_data: # only if command line asks for it
                    try:
                        readBack = dbc.queryScalar("SELECT speed from {}.{} WHERE ts= '{}'".
                            format(db.getName(), regTableName, nextTick))
                        if readBack != nextInt :
                            raise taos.error.ProgrammingError(
                                "Failed to read back same data, wrote: {}, read: {}"
                                .format(nextInt, readBack), 0x999)
                    except taos.error.ProgrammingError as err:
                        errno = Helper.convertErrno(err.errno)
                        if errno in [0x991, 0x992]  : # not a single result
                            raise taos.error.ProgrammingError(
                                "Failed to read back same data for tick: {}, wrote: {}, read: {}"
                                .format(nextTick, nextInt, "Empty Result" if errno==0x991 else "Multiple Result"),
                                errno)
                        # Re-throw no matter what
                        raise
2286 2287
                

S
Shuduo Sang 已提交
2288
            self.activeTable.discard(i)  # not raising an error, unlike remove
2289 2290


S
Steven Li 已提交
2291 2292
# Deterministic random number generator
class Dice():
S
Shuduo Sang 已提交
2293
    seeded = False  # static, uninitialized
S
Steven Li 已提交
2294 2295

    @classmethod
S
Shuduo Sang 已提交
2296
    def seed(cls, s):  # static
S
Steven Li 已提交
2297
        if (cls.seeded):
S
Shuduo Sang 已提交
2298 2299
            raise RuntimeError(
                "Cannot seed the random generator more than once")
S
Steven Li 已提交
2300 2301 2302 2303 2304
        cls.verifyRNG()
        random.seed(s)
        cls.seeded = True  # TODO: protect against multi-threading

    @classmethod
S
Shuduo Sang 已提交
2305
    def verifyRNG(cls):  # Verify that the RNG is determinstic
S
Steven Li 已提交
2306 2307 2308 2309
        random.seed(0)
        x1 = random.randrange(0, 1000)
        x2 = random.randrange(0, 1000)
        x3 = random.randrange(0, 1000)
S
Shuduo Sang 已提交
2310
        if (x1 != 864 or x2 != 394 or x3 != 776):
S
Steven Li 已提交
2311 2312 2313
            raise RuntimeError("System RNG is not deterministic")

    @classmethod
S
Shuduo Sang 已提交
2314
    def throw(cls, stop):  # get 0 to stop-1
2315
        return cls.throwRange(0, stop)
S
Steven Li 已提交
2316 2317

    @classmethod
S
Shuduo Sang 已提交
2318 2319
    def throwRange(cls, start, stop):  # up to stop-1
        if (not cls.seeded):
S
Steven Li 已提交
2320
            raise RuntimeError("Cannot throw dice before seeding it")
2321
        return random.randrange(start, stop)
S
Steven Li 已提交
2322

2323 2324 2325 2326
    @classmethod
    def choice(cls, cList):
        return random.choice(cList)

S
Steven Li 已提交
2327

S
Steven Li 已提交
2328 2329
class LoggingFilter(logging.Filter):
    def filter(self, record: logging.LogRecord):
S
Shuduo Sang 已提交
2330 2331
        if (record.levelno >= logging.INFO):
            return True  # info or above always log
S
Steven Li 已提交
2332

S
Steven Li 已提交
2333 2334 2335 2336
        # Commenting out below to adjust...

        # if msg.startswith("[TRD]"):
        #     return False
S
Steven Li 已提交
2337 2338
        return True

S
Shuduo Sang 已提交
2339 2340

class MyLoggingAdapter(logging.LoggerAdapter):
2341 2342 2343 2344
    def process(self, msg, kwargs):
        return "[{}]{}".format(threading.get_ident() % 10000, msg), kwargs
        # return '[%s] %s' % (self.extra['connid'], msg), kwargs

S
Shuduo Sang 已提交
2345 2346

class SvcManager:
2347
    def __init__(self):
2348
        print("Starting TDengine Service Manager")
2349 2350 2351
        # signal.signal(signal.SIGTERM, self.sigIntHandler) # Moved to MainExec
        # signal.signal(signal.SIGINT, self.sigIntHandler)
        # signal.signal(signal.SIGUSR1, self.sigUsrHandler)  # different handler!
2352

2353
        self.inSigHandler = False
2354 2355
        # self._status = MainExec.STATUS_RUNNING # set inside
        # _startTaosService()
2356
        self.svcMgrThread = None # type: ServiceManagerThread
2357 2358
        self._lock = threading.Lock()
        self._isRestarting = False
2359

2360 2361 2362 2363 2364 2365 2366 2367
    def _doMenu(self):
        choice = ""
        while True:
            print("\nInterrupting Service Program, Choose an Action: ")
            print("1: Resume")
            print("2: Terminate")
            print("3: Restart")
            # Remember to update the if range below
S
Shuduo Sang 已提交
2368
            # print("Enter Choice: ", end="", flush=True)
2369 2370 2371
            while choice == "":
                choice = input("Enter Choice: ")
                if choice != "":
S
Shuduo Sang 已提交
2372 2373 2374
                    break  # done with reading repeated input
            if choice in ["1", "2", "3"]:
                break  # we are done with whole method
2375
            print("Invalid choice, please try again.")
S
Shuduo Sang 已提交
2376
            choice = ""  # reset
2377 2378
        return choice

S
Shuduo Sang 已提交
2379
    def sigUsrHandler(self, signalNumber, frame):
2380
        print("Interrupting main thread execution upon SIGUSR1")
2381
        if self.inSigHandler:  # already
2382
            print("Ignoring repeated SIG...")
S
Shuduo Sang 已提交
2383
            return  # do nothing if it's already not running
2384
        self.inSigHandler = True
2385 2386

        choice = self._doMenu()
S
Shuduo Sang 已提交
2387 2388 2389 2390 2391
        if choice == "1":
            # TODO: can the sub-process be blocked due to us not reading from
            # queue?
            self.sigHandlerResume()
        elif choice == "2":
2392
            self.stopTaosService()
2393 2394
        elif choice == "3": # Restart
            self.restart()
2395 2396
        else:
            raise RuntimeError("Invalid menu choice: {}".format(choice))
2397

2398 2399
        self.inSigHandler = False

2400
    def sigIntHandler(self, signalNumber, frame):
2401
        print("SvcManager: INT Signal Handler starting...")
2402
        if self.inSigHandler:
2403 2404
            print("Ignoring repeated SIG_INT...")
            return
2405
        self.inSigHandler = True
2406

S
Shuduo Sang 已提交
2407
        self.stopTaosService()
2408
        print("SvcManager: INT Signal Handler returning...")
2409
        self.inSigHandler = False
2410

S
Shuduo Sang 已提交
2411
    def sigHandlerResume(self):
2412
        print("Resuming TDengine service manager thread (main thread)...\n\n")
2413

2414
    def _checkServiceManagerThread(self):
2415 2416 2417 2418
        if self.svcMgrThread:  # valid svc mgr thread
            if self.svcMgrThread.isStopped():  # done?
                self.svcMgrThread.procIpcBatch()  # one last time. TODO: appropriate?
                self.svcMgrThread = None  # no more
2419 2420

    def _procIpcAll(self):
2421 2422 2423 2424 2425 2426
        while self.isRunning() or self.isRestarting() :  # for as long as the svc mgr thread is still here
            if self.isRunning():
                self.svcMgrThread.procIpcBatch()  # regular processing,
                self._checkServiceManagerThread()
            elif self.isRetarting():
                print("Service restarting...")
2427 2428 2429 2430 2431
            time.sleep(0.5)  # pause, before next round
        print(
            "Service Manager Thread (with subprocess) has ended, main thread now exiting...")

    def startTaosService(self):
2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442
        with self._lock:
            if self.svcMgrThread:
                raise RuntimeError("Cannot start TAOS service when one may already be running")

            # Find if there's already a taosd service, and then kill it
            for proc in psutil.process_iter():
                if proc.name() == 'taosd':
                    print("Killing an existing TAOSD process in 2 seconds... press CTRL-C to interrupe")
                    time.sleep(2.0)
                    proc.kill()
                # print("Process: {}".format(proc.name()))
2443

2444 2445
            
            self.svcMgrThread = ServiceManagerThread()  # create the object
S
Steven Li 已提交
2446
            print("Attempting to start TAOS service started, printing out output...")
2447
            self.svcMgrThread.start()            
2448
            self.svcMgrThread.procIpcBatch(trimToTarget=10, forceOutput=True)  # for printing 10 lines             
2449
            print("TAOS service started")
2450 2451

    def stopTaosService(self, outputLines=20):
2452 2453 2454 2455
        with self._lock:
            if not self.isRunning():
                logger.warning("Cannot stop TAOS service, not running")
                return
2456

2457 2458 2459 2460 2461
            print("Terminating Service Manager Thread (SMT) execution...")
            self.svcMgrThread.stop()
            if self.svcMgrThread.isStopped():
                self.svcMgrThread.procIpcBatch(outputLines)  # one last time
                self.svcMgrThread = None
2462 2463
                print("End of TDengine Service Output")
                print("----- TDengine Service (managed by SMT) is now terminated -----\n")
2464 2465
            else:
                print("WARNING: SMT did not terminate as expected")
2466 2467 2468

    def run(self):
        self.startTaosService()
2469
        self._procIpcAll()  # pump/process all the messages, may encounter SIG + restart
2470
        if self.isRunning():  # if sig handler hasn't destroyed it by now
2471 2472
            self.stopTaosService()  # should have started already

2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486
    def restart(self):
        if self._isRestarting:
            logger.warning("Cannot restart service when it's already restarting")
            return

        self._isRestarting = True
        if self.isRunning():
            self.stopTaosService()
        else:
            logger.warning("Service not running when restart requested")

        self.startTaosService()
        self._isRestarting = False

2487 2488
    def isRunning(self):
        return self.svcMgrThread != None
2489

2490 2491 2492
    def isRestarting(self):
        return self._isRestarting

2493 2494 2495 2496
class ServiceManagerThread:
    MAX_QUEUE_SIZE = 10000

    def __init__(self):
2497
        self._tdeSubProcess = None # type: TdeSubProcess
2498
        self._thread = None
2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515
        self._status = None

    def getStatus(self):
        return self._status

    def isRunning(self):
        # return self._thread and self._thread.is_alive()
        return self._status == MainExec.STATUS_RUNNING

    def isStopping(self):
        return self._status == MainExec.STATUS_STOPPING

    def isStopped(self):
        return self._status == MainExec.STATUS_STOPPED

    # Start the thread (with sub process), and wait for the sub service
    # to become fully operational
2516 2517
    def start(self):
        if self._thread:
2518
            raise RuntimeError("Unexpected _thread")
2519
        if self._tdeSubProcess:
2520 2521 2522 2523
            raise RuntimeError("TDengine sub process already created/running")

        self._status = MainExec.STATUS_STARTING

2524
        self._tdeSubProcess = TdeSubProcess()
2525 2526 2527
        self._tdeSubProcess.start()

        self._ipcQueue = Queue()
2528
        self._thread = threading.Thread( # First thread captures server OUTPUT
2529
            target=self.svcOutputReader,
2530
            args=(self._tdeSubProcess.getStdOut(), self._ipcQueue))
2531
        self._thread.daemon = True  # thread dies with the program
2532 2533
        self._thread.start()

2534
        self._thread2 = threading.Thread( # 2nd thread captures server ERRORs
2535 2536 2537 2538 2539
            target=self.svcErrorReader,
            args=(self._tdeSubProcess.getStdErr(), self._ipcQueue))
        self._thread2.daemon = True  # thread dies with the program
        self._thread2.start()

2540
        # wait for service to start
R
root 已提交
2541
        for i in range(0, 100):
2542 2543 2544
            time.sleep(1.0)
            # self.procIpcBatch() # don't pump message during start up
            print("_zz_", end="", flush=True)
2545
            if self._status == MainExec.STATUS_RUNNING:
2546
                logger.info("[] TDengine service READY to process requests")
2547 2548
                return  # now we've started
        # TODO: handle this better?
R
root 已提交
2549
        self.procIpcBatch(100, True) # display output before cronking out, trim to last 20 msgs, force output
2550
        raise RuntimeError("TDengine service did not start successfully")
2551 2552 2553 2554 2555 2556

    def stop(self):
        # can be called from both main thread or signal handler
        print("Terminating TDengine service running as the sub process...")
        if self.isStopped():
            print("Service already stopped")
2557
            return
2558 2559 2560
        if self.isStopping():
            print("Service is already being stopped")
            return
2561 2562 2563 2564
        # Linux will send Control-C generated SIGINT to the TDengine process
        # already, ref:
        # https://unix.stackexchange.com/questions/176235/fork-and-how-signals-are-delivered-to-processes
        if not self._tdeSubProcess:
2565
            raise RuntimeError("sub process object missing")
2566

2567
        self._status = MainExec.STATUS_STOPPING
2568 2569
        retCode = self._tdeSubProcess.stop()
        print("Attempted to stop sub process, got return code: {}".format(retCode))
2570 2571
        if (retCode==-11): # SGV
            logger.error("[[--ERROR--]]: TDengine service SEGV fault (check core file!)")
2572

2573
        if self._tdeSubProcess.isRunning():  # still running
2574 2575
            print("FAILED to stop sub process, it is still running... pid = {}".format(
                    self._tdeSubProcess.getPid()))
2576
        else:
2577 2578 2579
            self._tdeSubProcess = None  # not running any more
            self.join()  # stop the thread, change the status, etc.

2580 2581 2582
    def join(self):
        # TODO: sanity check
        if not self.isStopping():
2583 2584 2585
            raise RuntimeError(
                "Unexpected status when ending svc mgr thread: {}".format(
                    self._status))
2586

2587
        if self._thread:
2588
            self._thread.join()
2589
            self._thread = None
2590
            self._status = MainExec.STATUS_STOPPED
2591 2592 2593
            # STD ERR thread
            self._thread2.join()
            self._thread2 = None
S
Shuduo Sang 已提交
2594
        else:
2595
            print("Joining empty thread, doing nothing")
2596 2597 2598

    def _trimQueue(self, targetSize):
        if targetSize <= 0:
2599
            return  # do nothing
2600
        q = self._ipcQueue
2601
        if (q.qsize() <= targetSize):  # no need to trim
2602 2603 2604 2605
            return

        logger.debug("Triming IPC queue to target size: {}".format(targetSize))
        itemsToTrim = q.qsize() - targetSize
2606
        for i in range(0, itemsToTrim):
2607 2608 2609
            try:
                q.get_nowait()
            except Empty:
2610 2611
                break  # break out of for loop, no more trimming

2612
    TD_READY_MSG = "TDengine is initialized successfully"
S
Shuduo Sang 已提交
2613

2614 2615
    def procIpcBatch(self, trimToTarget=0, forceOutput=False):
        self._trimQueue(trimToTarget)  # trim if necessary
S
Shuduo Sang 已提交
2616 2617
        # Process all the output generated by the underlying sub process,
        # managed by IO thread
2618
        print("<", end="", flush=True)
S
Shuduo Sang 已提交
2619 2620
        while True:
            try:
2621
                line = self._ipcQueue.get_nowait()  # getting output at fast speed
2622
                self._printProgress("_o")
2623 2624 2625
            except Empty:
                # time.sleep(2.3) # wait only if there's no output
                # no more output
2626
                print(".>", end="", flush=True)
S
Shuduo Sang 已提交
2627
                return  # we are done with THIS BATCH
2628
            else:  # got line, printing out
2629 2630 2631 2632 2633 2634 2635
                if forceOutput:
                    logger.info(line)
                else:
                    logger.debug(line)
        print(">", end="", flush=True)

    _ProgressBars = ["--", "//", "||", "\\\\"]
2636

2637 2638
    def _printProgress(self, msg):  # TODO: assuming 2 chars
        print(msg, end="", flush=True)
2639 2640 2641
        pBar = self._ProgressBars[Dice.throw(4)]
        print(pBar, end="", flush=True)
        print('\b\b\b\b', end="", flush=True)
2642

2643 2644 2645
    def svcOutputReader(self, out: IO, queue):
        # Important Reference: https://stackoverflow.com/questions/375427/non-blocking-read-on-a-subprocess-pipe-in-python
        # print("This is the svcOutput Reader...")
2646
        # for line in out :
2647 2648 2649
        for line in iter(out.readline, b''):
            # print("Finished reading a line: {}".format(line))
            # print("Adding item to queue...")
2650 2651 2652 2653 2654
            try:
                line = line.decode("utf-8").rstrip()
            except UnicodeError:
                print("\nNon-UTF8 server output: {}\n".format(line))

2655 2656
            # This might block, and then causing "out" buffer to block
            queue.put(line)
2657 2658
            self._printProgress("_i")

2659 2660
            if self._status == MainExec.STATUS_STARTING:  # we are starting, let's see if we have started
                if line.find(self.TD_READY_MSG) != -1:  # found
S
Steven Li 已提交
2661
                    logger.info("Waiting for the service to become FULLY READY")
2662
                    time.sleep(1.0) # wait for the server to truly start. TODO: remove this
S
Steven Li 已提交
2663 2664
                    logger.info("Service is now FULLY READY")   
                    self._status = MainExec.STATUS_RUNNING                 
2665 2666

            # Trim the queue if necessary: TODO: try this 1 out of 10 times
2667
            self._trimQueue(self.MAX_QUEUE_SIZE * 9 // 10)  # trim to 90% size
2668

2669 2670 2671
            if self.isStopping():  # TODO: use thread status instead
                # WAITING for stopping sub process to finish its outptu
                print("_w", end="", flush=True)
2672 2673

            # queue.put(line)
2674 2675
        # meaning sub process must have died
        print("\nNo more output from IO thread managing TDengine service")
2676 2677
        out.close()

2678 2679
    def svcErrorReader(self, err: IO, queue):
        for line in iter(err.readline, b''):
2680
            print("\nTDengine Service (taosd) ERROR (from stderr): {}".format(line))
2681

2682 2683

class TdeSubProcess:
2684 2685 2686 2687 2688
    def __init__(self):
        self.subProcess = None

    def getStdOut(self):
        return self.subProcess.stdout
2689

2690 2691 2692
    def getStdErr(self):
        return self.subProcess.stderr

2693
    def isRunning(self):
2694
        return self.subProcess is not None
2695

2696 2697 2698
    def getPid(self):
        return self.subProcess.pid

S
Shuduo Sang 已提交
2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712
    def getBuildPath(self):
        selfPath = os.path.dirname(os.path.realpath(__file__))
        if ("community" in selfPath):
            projPath = selfPath[:selfPath.find("communit")]
        else:
            projPath = selfPath[:selfPath.find("tests")]

        for root, dirs, files in os.walk(projPath):
            if ("taosd" in files):
                rootRealPath = os.path.dirname(os.path.realpath(root))
                if ("packaging" not in rootRealPath):
                    buildPath = root[:len(root) - len("/build/bin")]
                    break
        return buildPath
2713

2714
    def start(self):
2715
        ON_POSIX = 'posix' in sys.builtin_module_names
S
Shuduo Sang 已提交
2716

2717 2718 2719
        taosdPath = self.getBuildPath() + "/build/bin/taosd"
        cfgPath = self.getBuildPath() + "/test/cfg"

2720 2721 2722
        # Delete the log files
        logPath = self.getBuildPath() + "/test/log"
        # ref: https://stackoverflow.com/questions/1995373/deleting-all-files-in-a-directory-with-python/1995397
2723 2724 2725 2726
        # filelist = [ f for f in os.listdir(logPath) ] # if f.endswith(".bak") ]
        # for f in filelist:
        #     filePath = os.path.join(logPath, f)
        #     print("Removing log file: {}".format(filePath))
2727 2728 2729 2730 2731 2732
        #     os.remove(filePath)        
        if os.path.exists(logPath):
            logPathSaved = logPath + "_" + time.strftime('%Y-%m-%d-%H-%M-%S')
            logger.info("Saving old log files to: {}".format(logPathSaved))
            os.rename(logPath, logPathSaved)
        # os.mkdir(logPath) # recreate, no need actually, TDengine will auto-create with proper perms
2733
            
S
Shuduo Sang 已提交
2734
        svcCmd = [taosdPath, '-c', cfgPath]
2735
        # svcCmdSingle = "{} -c {}".format(taosdPath, cfgPath)
2736
        # svcCmd = ['vmstat', '1']
S
Shuduo Sang 已提交
2737
        if self.subProcess:  # already there
2738 2739
            raise RuntimeError("Corrupt process state")

S
Steven Li 已提交
2740
        # print("Starting service: {}".format(svcCmd))
2741
        self.subProcess = subprocess.Popen(
2742 2743
            svcCmd, shell=False,
            # svcCmdSingle, shell=True, # capture core dump?
S
Shuduo Sang 已提交
2744
            stdout=subprocess.PIPE,
2745
            stderr=subprocess.PIPE,
2746
            # bufsize=1, # not supported in binary mode
S
Steven Li 已提交
2747 2748
            close_fds=ON_POSIX
            )  # had text=True, which interferred with reading EOF
2749

2750 2751 2752
    def stop(self):
        if not self.subProcess:
            print("Sub process already stopped")
2753
            return -1
2754

2755
        retCode = self.subProcess.poll() # contains real sub process return code
S
Shuduo Sang 已提交
2756
        if retCode:  # valid return code, process ended
2757
            self.subProcess = None
S
Shuduo Sang 已提交
2758 2759
        else:  # process still alive, let's interrupt it
            print(
2760
                "Sub process is running, sending SIG_INT and waiting for it to terminate...")
S
Shuduo Sang 已提交
2761 2762 2763 2764
            # sub process should end, then IPC queue should end, causing IO
            # thread to end
            self.subProcess.send_signal(signal.SIGINT)
            try:
2765
                self.subProcess.wait(10)
2766
                retCode = self.subProcess.returncode
2767 2768
            except subprocess.TimeoutExpired as err:
                print("Time out waiting for TDengine service process to exit")
2769
                retCode = -3
2770
            else:
2771
                print("TDengine service process terminated successfully from SIG_INT")
2772
                retCode = -4
2773
                self.subProcess = None
2774
        return retCode
2775

2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803
class ThreadStacks: # stack info for all threads
    def __init__(self):
        self._allStacks = {}
        allFrames = sys._current_frames()
        for th in threading.enumerate():                        
            stack = traceback.extract_stack(allFrames[th.ident])     
            self._allStacks[th.native_id] = stack

    def print(self, filteredEndName = None, filterInternal = False):
        for thNid, stack in self._allStacks.items(): # for each thread            
            lastFrame = stack[-1]
            if filteredEndName: # we need to filter out stacks that match this name                
                if lastFrame.name == filteredEndName : # end did not match
                    continue
            if filterInternal:
                if lastFrame.name in ['wait', 'invoke_excepthook', 
                    '_wait', # The Barrier exception
                    'svcOutputReader', # the svcMgr thread
                    '__init__']: # the thread that extracted the stack
                    continue # ignore
            # Now print
            print("\n<----- Thread Info for ID: {}".format(thNid))
            for frame in stack:
                # print(frame)
                print("File {filename}, line {lineno}, in {name}".format(
                    filename=frame.filename, lineno=frame.lineno, name=frame.name))
                print("    {}".format(frame.line))
            print("-----> End of Thread Info\n")
S
Shuduo Sang 已提交
2804

2805 2806 2807
class ClientManager:
    def __init__(self):
        print("Starting service manager")
2808 2809
        # signal.signal(signal.SIGTERM, self.sigIntHandler)
        # signal.signal(signal.SIGINT, self.sigIntHandler)
2810

2811
        self._status = MainExec.STATUS_RUNNING
2812 2813
        self.tc = None

2814 2815
        self.inSigHandler = False

2816
    def sigIntHandler(self, signalNumber, frame):
2817
        if self._status != MainExec.STATUS_RUNNING:
2818 2819 2820
            print("Repeated SIGINT received, forced exit...")
            # return  # do nothing if it's already not running
            sys.exit(-1)
2821
        self._status = MainExec.STATUS_STOPPING  # immediately set our status
2822

2823
        print("ClientManager: Terminating program...")
2824 2825
        self.tc.requestToStop()

2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866
    def _doMenu(self):
        choice = ""
        while True:
            print("\nInterrupting Client Program, Choose an Action: ")
            print("1: Resume")
            print("2: Terminate")
            print("3: Show Threads")
            # Remember to update the if range below
            # print("Enter Choice: ", end="", flush=True)
            while choice == "":
                choice = input("Enter Choice: ")
                if choice != "":
                    break  # done with reading repeated input
            if choice in ["1", "2", "3"]:
                break  # we are done with whole method
            print("Invalid choice, please try again.")
            choice = ""  # reset
        return choice

    def sigUsrHandler(self, signalNumber, frame):
        print("Interrupting main thread execution upon SIGUSR1")
        if self.inSigHandler:  # already
            print("Ignoring repeated SIG_USR1...")
            return  # do nothing if it's already not running
        self.inSigHandler = True

        choice = self._doMenu()
        if choice == "1":
            print("Resuming execution...")
            time.sleep(1.0)
        elif choice == "2":
            print("Not implemented yet")
            time.sleep(1.0)
        elif choice == "3":
            ts = ThreadStacks()
            ts.print()
        else:
            raise RuntimeError("Invalid menu choice: {}".format(choice))

        self.inSigHandler = False

2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895
    # TODO: need to revise how we verify data durability
    # def _printLastNumbers(self):  # to verify data durability
    #     dbManager = DbManager()
    #     dbc = dbManager.getDbConn()
    #     if dbc.query("show databases") <= 1:  # no database (we have a default called "log")
    #         return
    #     dbc.execute("use db")
    #     if dbc.query("show tables") == 0:  # no tables
    #         return

    #     sTbName = dbManager.getFixedSuperTableName()

    #     # get all regular tables
    #     # TODO: analyze result set later
    #     dbc.query("select TBNAME from db.{}".format(sTbName))
    #     rTables = dbc.getQueryResult()

    #     bList = TaskExecutor.BoundedList()
    #     for rTbName in rTables:  # regular tables
    #         dbc.query("select speed from db.{}".format(rTbName[0]))
    #         numbers = dbc.getQueryResult()
    #         for row in numbers:
    #             # print("<{}>".format(n), end="", flush=True)
    #             bList.add(row[0])

    #     print("Top numbers in DB right now: {}".format(bList))
    #     print("TDengine client execution is about to start in 2 seconds...")
    #     time.sleep(2.0)
    #     dbManager = None  # release?
2896

2897
    def run(self, svcMgr):    
2898
        # self._printLastNumbers()
2899
        global gConfig
2900

S
Shuduo Sang 已提交
2901
        dbManager = DbManager()  # Regular function
2902
        thPool = ThreadPool(gConfig.num_threads, gConfig.max_steps)
2903
        self.tc = ThreadCoordinator(thPool, dbManager)
2904
        
2905
        self.tc.run()
S
Steven Li 已提交
2906 2907
        # print("exec stats: {}".format(self.tc.getExecStats()))
        # print("TC failed = {}".format(self.tc.isFailed()))
2908
        if svcMgr: # gConfig.auto_start_service:
2909
            svcMgr.stopTaosService()
2910
            svcMgr = None
2911 2912
        # Print exec status, etc., AFTER showing messages from the server
        self.conclude()
S
Steven Li 已提交
2913
        # print("TC failed (2) = {}".format(self.tc.isFailed()))
S
Shuduo Sang 已提交
2914
        # Linux return code: ref https://shapeshed.com/unix-exit-codes/
2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933
        ret = 1 if self.tc.isFailed() else 0
        self.tc.cleanup()

        # Release global variables
        gConfig = None
        gSvcMgr = None
        logger = None

        # Release variables here
        self.tc = None
        thPool = None
        dbManager = None

        gc.collect() # force garbage collection
        # h = hpy()
        # print("\n----- Final Python Heap -----\n")        
        # print(h.heap())

        return ret
2934 2935

    def conclude(self):
2936
        # self.tc.getDbManager().cleanUp() # clean up first, so we can show ZERO db connections
2937
        self.tc.printStats()
2938 2939 2940

        
        
2941 2942

class MainExec:
2943 2944 2945
    STATUS_STARTING = 1
    STATUS_RUNNING = 2
    STATUS_STOPPING = 3
2946
    STATUS_STOPPED = 4
2947

2948 2949 2950
    def __init__(self):        
        self._clientMgr = None
        self._svcMgr = None
2951

2952 2953 2954
        signal.signal(signal.SIGTERM, self.sigIntHandler)
        signal.signal(signal.SIGINT,  self.sigIntHandler)
        signal.signal(signal.SIGUSR1, self.sigUsrHandler)  # different handler!
2955

2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968
    def sigUsrHandler(self, signalNumber, frame):
        if self._clientMgr:
            self._clientMgr.sigUsrHandler(signalNumber, frame)
        elif self._svcMgr: # Only if no client mgr, we are running alone
            self._svcMgr.sigUsrHandler(signalNumber, frame)
        
    def sigIntHandler(self, signalNumber, frame):
        if self._svcMgr:
            self._svcMgr.sigIntHandler(signalNumber, frame)
        if self._clientMgr:
            self._clientMgr.sigIntHandler(signalNumber, frame)

    def runClient(self):
2969
        global gSvcMgr
2970 2971
        if gConfig.auto_start_service:
            self._svcMgr = SvcManager()
2972
            gSvcMgr = self._svcMgr # hack alert
2973 2974 2975
            self._svcMgr.startTaosService() # we start, don't run
        
        self._clientMgr = ClientManager()
2976 2977 2978 2979
        ret = None
        try: 
            ret = self._clientMgr.run(self._svcMgr) # stop TAOS service inside
        except requests.exceptions.ConnectionError as err:
S
Steven Li 已提交
2980
            logger.warning("Failed to open REST connection to DB: {}".format(err.getMessage()))
2981 2982
            # don't raise
        return ret
2983 2984

    def runService(self):
2985
        global gSvcMgr
2986
        self._svcMgr = SvcManager()
2987 2988
        gSvcMgr = self._svcMgr # save it in a global variable TODO: hack alert

2989
        self._svcMgr.run() # run to some end state
2990 2991
        self._svcMgr = None 
        gSvcMgr = None        
2992 2993

    def runTemp(self):  # for debugging purposes
2994 2995
        # # Hack to exercise reading from disk, imcreasing coverage. TODO: fix
        # dbc = dbState.getDbConn()
S
Shuduo Sang 已提交
2996
        # sTbName = dbState.getFixedSuperTableName()
2997 2998
        # dbc.execute("create database if not exists db")
        # if not dbState.getState().equals(StateEmpty()):
S
Shuduo Sang 已提交
2999
        #     dbc.execute("use db")
3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010

        # rTables = None
        # try: # the super table may not exist
        #     sql = "select TBNAME from db.{}".format(sTbName)
        #     logger.info("Finding out tables in super table: {}".format(sql))
        #     dbc.query(sql) # TODO: analyze result set later
        #     logger.info("Fetching result")
        #     rTables = dbc.getQueryResult()
        #     logger.info("Result: {}".format(rTables))
        # except taos.error.ProgrammingError as err:
        #     logger.info("Initial Super table OPS error: {}".format(err))
S
Shuduo Sang 已提交
3011

3012 3013 3014 3015 3016 3017 3018 3019
        # # sys.exit()
        # if ( not rTables == None):
        #     # print("rTables[0] = {}, type = {}".format(rTables[0], type(rTables[0])))
        #     try:
        #         for rTbName in rTables : # regular tables
        #             ds = dbState
        #             logger.info("Inserting into table: {}".format(rTbName[0]))
        #             sql = "insert into db.{} values ('{}', {});".format(
S
Shuduo Sang 已提交
3020
        #                 rTbName[0],
3021 3022
        #                 ds.getNextTick(), ds.getNextInt())
        #             dbc.execute(sql)
S
Shuduo Sang 已提交
3023
        #         for rTbName in rTables : # regular tables
3024
        #             dbc.query("select * from db.{}".format(rTbName[0])) # TODO: check success failure
S
Shuduo Sang 已提交
3025
        #         logger.info("Initial READING operation is successful")
3026
        #     except taos.error.ProgrammingError as err:
S
Shuduo Sang 已提交
3027 3028
        #         logger.info("Initial WRITE/READ error: {}".format(err))

3029 3030 3031
        # Sandbox testing code
        # dbc = dbState.getDbConn()
        # while True:
S
Shuduo Sang 已提交
3032
        #     rows = dbc.query("show databases")
3033
        #     print("Rows: {}, time={}".format(rows, time.time()))
S
Shuduo Sang 已提交
3034 3035
        return

S
Steven Li 已提交
3036

3037
def main():
S
Shuduo Sang 已提交
3038 3039
    # Super cool Python argument library:
    # https://docs.python.org/3/library/argparse.html
3040 3041 3042 3043 3044 3045 3046 3047 3048
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent('''\
            TDengine Auto Crash Generator (PLEASE NOTICE the Prerequisites Below)
            ---------------------------------------------------------------------
            1. You build TDengine in the top level ./build directory, as described in offical docs
            2. You run the server there before this script: ./build/bin/taosd -c test/cfg

            '''))
3049

3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070
    # parser.add_argument('-a', '--auto-start-service', action='store_true',                        
    #                     help='Automatically start/stop the TDengine service (default: false)')
    # parser.add_argument('-c', '--connector-type', action='store', default='native', type=str,
    #                     help='Connector type to use: native, rest, or mixed (default: 10)')
    # parser.add_argument('-d', '--debug', action='store_true',                        
    #                     help='Turn on DEBUG mode for more logging (default: false)')
    # parser.add_argument('-e', '--run-tdengine', action='store_true',                        
    #                     help='Run TDengine service in foreground (default: false)')
    # parser.add_argument('-l', '--larger-data', action='store_true',                        
    #                     help='Write larger amount of data during write operations (default: false)')
    # parser.add_argument('-p', '--per-thread-db-connection', action='store_true',                        
    #                     help='Use a single shared db connection (default: false)')
    # parser.add_argument('-r', '--record-ops', action='store_true',                        
    #                     help='Use a pair of always-fsynced fils to record operations performing + performed, for power-off tests (default: false)')                    
    # parser.add_argument('-s', '--max-steps', action='store', default=1000, type=int,
    #                     help='Maximum number of steps to run (default: 100)')
    # parser.add_argument('-t', '--num-threads', action='store', default=5, type=int,
    #                     help='Number of threads to run (default: 10)')
    # parser.add_argument('-x', '--continue-on-exception', action='store_true',                        
    #                     help='Continue execution after encountering unexpected/disallowed errors/exceptions (default: false)')                        

S
Shuduo Sang 已提交
3071 3072 3073 3074 3075
    parser.add_argument(
        '-a',
        '--auto-start-service',
        action='store_true',
        help='Automatically start/stop the TDengine service (default: false)')
3076 3077 3078 3079 3080 3081 3082
    parser.add_argument(
        '-b',
        '--max-dbs',
        action='store',
        default=0,
        type=int,
        help='Maximum number of DBs to keep, set to disable dropping DB. (default: 0)')
S
Shuduo Sang 已提交
3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099
    parser.add_argument(
        '-c',
        '--connector-type',
        action='store',
        default='native',
        type=str,
        help='Connector type to use: native, rest, or mixed (default: 10)')
    parser.add_argument(
        '-d',
        '--debug',
        action='store_true',
        help='Turn on DEBUG mode for more logging (default: false)')
    parser.add_argument(
        '-e',
        '--run-tdengine',
        action='store_true',
        help='Run TDengine service in foreground (default: false)')
3100 3101 3102 3103 3104 3105 3106
    parser.add_argument(
        '-i',
        '--max-replicas',
        action='store',
        default=1,
        type=int,
        help='Maximum number of replicas to use, when testing against clusters. (default: 1)')
S
Shuduo Sang 已提交
3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135
    parser.add_argument(
        '-l',
        '--larger-data',
        action='store_true',
        help='Write larger amount of data during write operations (default: false)')
    parser.add_argument(
        '-p',
        '--per-thread-db-connection',
        action='store_true',
        help='Use a single shared db connection (default: false)')
    parser.add_argument(
        '-r',
        '--record-ops',
        action='store_true',
        help='Use a pair of always-fsynced fils to record operations performing + performed, for power-off tests (default: false)')
    parser.add_argument(
        '-s',
        '--max-steps',
        action='store',
        default=1000,
        type=int,
        help='Maximum number of steps to run (default: 100)')
    parser.add_argument(
        '-t',
        '--num-threads',
        action='store',
        default=5,
        type=int,
        help='Number of threads to run (default: 10)')
3136
    parser.add_argument(
3137 3138 3139 3140 3141
        '-v',
        '--verify-data',
        action='store_true',
        help='Verify data written in a number of places by reading back (default: false)')
    parser.add_argument(
3142 3143 3144 3145
        '-x',
        '--continue-on-exception',
        action='store_true',
        help='Continue execution after encountering unexpected/disallowed errors/exceptions (default: false)')
3146

3147
    global gConfig
3148
    gConfig = parser.parse_args()
3149

3150
    # Logging Stuff
3151
    global logger
S
Shuduo Sang 已提交
3152 3153
    _logger = logging.getLogger('CrashGen')  # real logger
    _logger.addFilter(LoggingFilter())
3154 3155 3156
    ch = logging.StreamHandler()
    _logger.addHandler(ch)

S
Shuduo Sang 已提交
3157 3158
    # Logging adapter, to be used as a logger
    logger = MyLoggingAdapter(_logger, [])
3159

S
Shuduo Sang 已提交
3160 3161
    if (gConfig.debug):
        logger.setLevel(logging.DEBUG)  # default seems to be INFO
S
Steven Li 已提交
3162 3163
    else:
        logger.setLevel(logging.INFO)
S
Shuduo Sang 已提交
3164

3165 3166
    Dice.seed(0)  # initial seeding of dice

3167
    # Run server or client
3168
    mExec = MainExec()
S
Shuduo Sang 已提交
3169
    if gConfig.run_tdengine:  # run server
3170
        mExec.runService()
S
Shuduo Sang 已提交
3171
    else:
3172
        return mExec.runClient()
3173

S
Shuduo Sang 已提交
3174

3175
if __name__ == "__main__":
S
Steven Li 已提交
3176 3177 3178
    exitCode = main()
    # print("Exiting with code: {}".format(exitCode))
    sys.exit(exitCode)