#!/usr/bin/env python
#
# Copyright (c) Greenplum Inc 2010. All Rights Reserved.
"""gpcheck: collect host configuration data and verify it against gpcheck.cnf.

The script runs ``gpcheck_hostdump`` on every requested host, copies the
resulting pickled data files into a local temporary directory and then either
archives them (--zipout), prints them (--stdout) or runs the platform specific
checks.  Most helpers follow the convention "return True on error".
"""

import os
import sys
import re
import tempfile
import shutil
import pickle
import getpass
import subprocess
import time
import glob
import ConfigParser

try:
    from optparse import Option, OptionParser
    from gppylib.gpparseopts import OptParser, OptChecker
    from gppylib.gplog import get_default_logger, setup_tool_logging
    from gppylib.commands.unix import getLocalHostname, getUserName, SYSTEM
    from gppylib.commands.base import WorkerPool, Command, REMOTE
    from gppylib.gpcheckutil import HostType, hosttype_str
    from gppylib.operations.gpcheck import get_host_for_command, get_command, get_copy_command
except ImportError as e:
    sys.exit('Cannot import modules. Please check that you have sourced greenplum_path.sh. Detail: ' + str(e))

EXECNAME = os.path.split(__file__)[-1]


class GpCheckInfo:
    """Everything gathered during one gpcheck run; this object is what gets pickled."""

    def __init__(self):
        self.is_root = False
        self.host_type = HostType.GPCHECK_HOSTTYPE_UNDEFINED
        self.appliance_version = None
        # maps the hostname reported by the host itself -> GpCheckHost
        self.hosts = dict()


class GpCheckConfig:
    """Expected values read from the gpcheck configuration file."""

    def __init__(self):
        self.gpcheck_config_version = None
        # maps sysctl variable name -> expected value (string from the config file)
        self.expectedSysctlValues = dict()


# Module level state shared by the functions below.
gpcheck_info = None
gpcheck_config = None
config = None
gphome = None
configfilename = None
options = None
logger = None
pool = None
tmpdir = None
found_errors = False
GPCHECK_CONFIGFILE_GPHOME = None


class GpCheckHost:
    """Per-host record: role flags plus the location and content of its data file."""

    def __init__(self, name):
        self.hostname = name
        self.is_a_master = False
        self.primary_master = False
        self.datafile = None
        self.data = None

    def __str__(self):
        return "%s is_a_master(%s) primary_master(%s) datafile(%s)" % (
            self.hostname, self.is_a_master, self.primary_master, self.datafile)


def parseargs():
    """Parse the command line into the global ``options``.

    Also sets ``gphome``, ``configfilename`` (only when --config is given) and
    ``GPCHECK_CONFIGFILE_GPHOME``.

    Returns True on error (missing GPHOME or an invalid option combination),
    False otherwise.
    """
    global options, gphome, configfilename, GPCHECK_CONFIGFILE_GPHOME

    parser = OptParser(option_class=OptChecker)
    # '-h' is reused for --host below, so help moves to '-?'
    parser.remove_option('-h')
    parser.add_option('-?', '--help', action='help')
    parser.setHelp('')
    parser.add_option('--stdout', action='store_true')
    parser.add_option('--zipout', action='store_true')
    parser.add_option('--verbose', action='store_true')
    parser.add_option('--zipin', type='string')
    parser.add_option('-c', '--config', type='string')
    parser.add_option('-f', '--file', type='string')
    parser.add_option('-h', '--host', type='string')
    parser.add_option('-m', '--master', type='string')
    parser.add_option('-s', '--standbymaster', type='string')
    parser.add_option('--local', action='store_true')

    (options, args) = parser.parse_args()

    gphome = os.environ.get('GPHOME')
    if not gphome:
        logger.error("GPHOME not set")
        return True

    if options.config:
        configfilename = options.config

    GPCHECK_CONFIGFILE_GPHOME = "%s/etc/gpcheck.cnf" % gphome

    if not options.local and not options.file and not options.host and not options.zipin:
        logger.error(" --local or --file or --host or --zipin must be specified")
        return True

    if (options.local and options.file) or (options.local and options.host) or (options.file and options.host):
        logger.error(" Only 1 of --file or --host or --local can be specified")
        return True

    if options.stdout and options.zipout:
        logger.error(" --stdout and --zipout options can not be used together")
        return True

    if options.zipin and (options.master or options.standbymaster or options.file or options.host):
        logger.error(" the options: --master --standbymaster --file --host can not be used when reading data from zip file")
        return True

    return False


def printWarning(msg):
    """Log a warning with the GPCHECK_WARNING tag."""
    logger.warning("GPCHECK_WARNING: %s", msg)


def printError(host, msg):
    """Log a check failure for ``host`` and remember that at least one error was found."""
    global found_errors
    found_errors = True
    logger.error("GPCHECK_ERROR host(%s): %s", host, msg)


def createHostList():
    """Fill ``gpcheck_info.hosts`` with one GpCheckHost per distinct host.

    Candidate names come from --file, --host or --local.  ``hostname`` is run
    on each of them and the output is used as the key, so different names
    that resolve to the same machine collapse into a single entry.

    Returns True on error, False otherwise.
    """
    logger.info("dedupe hostnames")

    hostlist = []

    # read the host file if present
    if options.file:
        try:
            with open(options.file, "r") as hostfile:
                for line in hostfile:
                    hostname = line.strip()
                    if len(hostname) < 1:
                        continue
                    hostlist.append(hostname)
        except Exception as e:
            logger.error("Error while reading host file: %s" % options.file)
            logger.error(str(e))
            return True
    elif options.host:
        hostlist.append(options.host)
    elif options.local:
        hostlist.append(getLocalHostname())

    try:
        for hostname in hostlist:
            cmd_str = 'hostname'
            cmd = get_command(options.local, cmd_str, hostname)
            pool.addCommand(cmd)

        pool.join()
        items = pool.getCompletedItems()

        for i in items:
            address = get_host_for_command(options.local, i)
            if i.results.rc or i.results.halt or not i.results.completed:
                logger.error("error obtaining information from host %s" % address)
                logger.error(i.results.stderr.strip())
                return True

            actualHostname = i.results.stdout.strip()
            if actualHostname in gpcheck_info.hosts:
                # already seen under another name
                continue

            entry = GpCheckHost(actualHostname)
            if actualHostname == options.master:
                entry.is_a_master = True
                entry.primary_master = True
            elif actualHostname == options.standbymaster:
                entry.is_a_master = True
                entry.primary_master = False
            else:
                entry.is_a_master = False
                entry.primary_master = False
            gpcheck_info.hosts[actualHostname] = entry

    except Exception as e:
        logger.error("Error while deduping host file: %s" % options.file)
        logger.error(str(e))
        return True

    return False


def runCollectionOnServers():
    """Run gpcheck_hostdump on every host and record the data file path it prints.

    Returns True on error, False otherwise.
    """
    logger.info("generate data on servers")

    if gpcheck_info.host_type == HostType.GPCHECK_HOSTTYPE_GENERIC_LINUX:
        host_type_cl = "--linux"
    elif gpcheck_info.host_type == HostType.GPCHECK_HOSTTYPE_GENERIC_SOLARIS:
        host_type_cl = "--solaris"
    else:
        logger.error("Unexpected host_type in runCollectionOnServers")
        return True

    gpcheck_info.collection_start_time = time.time()

    try:
        for host in gpcheck_info.hosts:
            if options.verbose:
                logger.info("check host: %s" % host)
            cmd_str = '%s/sbin/gpcheck_hostdump %s' % (gphome, host_type_cl)
            cmd = get_command(options.local, cmd_str, host)
            pool.addCommand(cmd)

        pool.join()
        items = pool.getCompletedItems()

        for i in items:
            host = get_host_for_command(options.local, i)
            if i.results.rc or i.results.halt or not i.results.completed:
                logger.error("error obtaining information from host %s" % host)
                logger.error(i.results.stderr.strip())
                return True
            # the command's stdout is stored as the path of the host's data file
            gpcheck_info.hosts[host].datafile = i.results.stdout.strip()

    except Exception as e:
        logger.error("Exception collecting data from servers")
        logger.error(str(e))
        return True

    gpcheck_info.collection_end_time = time.time()
    return False


def copyFilesLocally():
    """Copy each host's data file into the local temporary directory.

    Returns True on error, False otherwise.
    """
    logger.info("copy data files from servers")

    try:
        for host in gpcheck_info.hosts:
            cmd = get_copy_command(options.local, host, gpcheck_info.hosts[host].datafile, tmpdir)
            pool.addCommand(cmd)

        pool.join()
        items = pool.getCompletedItems()

        for i in items:
            if i.results.rc or i.results.halt or not i.results.completed:
                logger.error("error running cmd: %s" % i.cmdStr)
                logger.error(i.results.stderr.strip())
                return True

    except Exception as e:
        logger.error("Exception collecting data from servers")
        logger.error(str(e))
        return True

    return False


def deleteRemoteFiles():
    """Remove the data file that gpcheck_hostdump left on each host.

    Returns True on error, False otherwise.
    """
    logger.info("delete remote tmp files")

    try:
        for host in gpcheck_info.hosts:
            cmd_str = "rm -f %s" % gpcheck_info.hosts[host].datafile
            cmd = get_command(options.local, cmd_str, host)
            pool.addCommand(cmd)

        pool.join()
        items = pool.getCompletedItems()

        for i in items:
            if i.results.rc or i.results.halt or not i.results.completed:
                logger.error("error deleting remote tmp files")
                logger.error(i.results.stderr.strip())
                return True

    except Exception as e:
        logger.error("Exception deleting remote tmp files")
        logger.error(str(e))
        return True

    return False


def readDataFiles():
    """Unpickle ``<tmpdir>/<host>.data`` into ``gpcheck_info.hosts[host].data``.

    Returns True on error, False otherwise.
    """
    for host in gpcheck_info.hosts:
        fname = "%s/%s.data" % (tmpdir, host)
        try:
            # NOTE: pickle.load executes whatever the file encodes; these files
            # come from the cluster hosts and are trusted as such.
            with open(fname, "rb") as fd:
                gpcheck_info.hosts[host].data = pickle.load(fd)
        except Exception as e:
            logger.error(str(e))
            logger.error("Failed extracting pickled save file: %s" % fname)
            return True

    return False


def testConnectEmc(host):
    """Report an error if ConnectEMC is not reported as running on a master host."""
    if not host.is_a_master:
        return

    expected = "Running"
    if host.data.connectemc.output != expected:
        printError(host.hostname, "Connect EMC is not running on master (try /etc/init.d/connectemc status)")


def testSolarisEtcSystem(host):
    """Check that /etc/system carries every required ``set key=value`` entry."""
    requiredValues = {
        'rlim_fd_cur': '65536',
        'zfs:zfs_arc_max': '0x600000000',
        'pcplusmp:apic_panic_on_nmi': '1',
        'nopanicdebug': '1',
    }

    satisfied = set()
    parameters = host.data.etc_system.parameters
    for key in parameters.keys():
        if key not in requiredValues:
            continue
        if parameters[key] == requiredValues[key]:
            satisfied.add(key)

    for key in requiredValues:
        if key in satisfied:
            continue
        printError(host.hostname, "/etc/system is missing expected line 'set %s=%s'" % (key, requiredValues[key]))


def testSolarisEtcProject(host):
    """Check /etc/project for the required line and flag the known bad one."""
    # maps required line -> 1 once it has been seen
    requiredValues = {
        'default:3::::project.max-sem-ids=(priv,1024,deny);process.max-file-descriptor=(priv,252144,deny)': 0
    }
    unexpectedValues = set(['default:3::::'])

    for line in host.data.etc_project.lines:
        if line in unexpectedValues:
            printError(host.hostname, "unexpected line in /etc/project: '%s'" % line)
            continue
        if line in requiredValues:
            requiredValues[line] = 1

    for line in requiredValues:
        if requiredValues[line]:
            continue
        printError(host.hostname, "/etc/project is missing expected line '%s'" % line)


def testSolarisEtcUserAttr(host):
    """Check that /etc/user_attr contains the required gpadmin privilege line."""
    # maps required line -> 1 once it has been seen
    requiredValues = {
        'gpadmin::::defaultpriv=basic,dtrace_user,dtrace_proc': 0
    }

    for line in host.data.etc_user_attr.lines:
        if line in requiredValues:
            requiredValues[line] = 1

    for line in requiredValues:
        if requiredValues[line]:
            continue
        printError(host.hostname, "/etc/user_attr is missing expected line '%s'" % line)


def testBlockdev(host):
    """Check that every block device reports the expected readahead value."""
    expectedReadAhead = "16384"

    for dev in host.data.blockdev.ra.keys():
        ra = host.data.blockdev.ra[dev]
        if ra != expectedReadAhead:
            printError(host.hostname, "on device (%s) blockdev readahead value '%s' does not match expected value '%s'" % (dev, ra, expectedReadAhead))


def testSysctl(host):
    """Compare the host's sysctl variables with the values expected by the config file."""
    expectedValues = gpcheck_config.expectedSysctlValues

    for k in expectedValues:
        if k not in host.data.sysctl.variables:
            printError(host.hostname, "variable not detected in /etc/sysctl.conf: '%s'" % k)
            continue

        value = host.data.sysctl.variables[k]
        if value != expectedValues[k]:
            printError(host.hostname, "/etc/sysctl.conf value for key '%s' has value '%s' and expects '%s'" % (k, value, expectedValues[k]))


def testIOSchedulers(host):
    """Check that every device uses the expected IO scheduler."""
    expectedScheduler = "deadline"

    for dev in host.data.ioschedulers.devices.keys():
        scheduler = host.data.ioschedulers.devices[dev]
        if scheduler != expectedScheduler:
            printError(host.hostname, "on device (%s) IO scheduler '%s' does not match expected value '%s'" % (dev, scheduler, expectedScheduler))


def testLimitsConf(host):
    """Check the nofile/nproc soft and hard limits found in limits.conf.

    A limit is an error when it is absent or lower than the expected minimum.
    An entry for domain ``gpadmin`` takes precedence over a ``*`` entry.
    """

    # utility class for this function only
    class limitsconf_data:
        def __init__(self, expected):
            self.domain = None
            self.value_found = None
            self.value_expected = int(expected)

    # maps tuple (type, item) to limitsconf_data
    table = dict()
    table[('soft', 'nofile')] = limitsconf_data(65536)
    table[('hard', 'nofile')] = limitsconf_data(65536)
    table[('soft', 'nproc')] = limitsconf_data(131072)
    table[('hard', 'nproc')] = limitsconf_data(131072)

    for entry in host.data.limitsconf.lines:
        key = (entry.type, entry.item)
        if key not in table:
            continue

        tabledata = table[key]
        # we only supersede previous data if it is specific to gpadmin or it is
        # * and there is not specific gpadmin data already
        if entry.domain == 'gpadmin' or (entry.domain == '*' and tabledata.domain != 'gpadmin'):
            # remember which domain supplied the value; without this a later
            # '*' line would silently override the gpadmin specific one
            tabledata.domain = entry.domain
            tabledata.value_found = entry.value

    for k in table:
        ld = table[k]
        if not ld.value_found:
            printError(host.hostname, "%s %s not found in /etc/security/limits.conf" % (k[0], k[1]))
            continue

        try:
            found = int(ld.value_found)
        except (TypeError, ValueError):
            # a non-numeric value is treated as 0 and therefore reported as too low
            found = 0

        if found < ld.value_expected:
            printError(host.hostname, "%s %s in /etc/security/limits.conf has value %d lower than expected value %d" % (k[0], k[1], found, ld.value_expected))


def testLinuxMounts(host):
    """Check the mount options of every XFS filesystem against the configured set."""
    expectedOptions = set(gpcheck_config.xfs_mount_options.split(","))

    # only XFS mounts are inspected; other filesystem types are ignored
    xfs_mounts = list()
    for k in host.data.mounts.entries.keys():
        entry = host.data.mounts.entries[k]
        if entry.type == "xfs":
            xfs_mounts.append(entry)

    for mnt in xfs_mounts:
        if len(mnt.options) != len(expectedOptions):
            printError(host.hostname, "XFS filesystem on device %s has %d XFS mount options and %d are expected" % (mnt.partition, len(mnt.options), len(expectedOptions)))

        for opt in expectedOptions:
            if opt not in mnt.options:
                printError(host.hostname, "XFS filesystem on device %s is missing the recommended mount option '%s'" % (mnt.partition, opt))


def testGenericLinuxHost(host):
    """Run every per-host Linux check."""
    # test IO schedulers
    testIOSchedulers(host)

    # test blockdev
    testBlockdev(host)

    # test sysctl
    testSysctl(host)

    # test limits.conf
    testLimitsConf(host)

    # test XFS mounts
    testLinuxMounts(host)


def testGenericSolarisHost(host):
    """Run every per-host Solaris check."""
    testSolarisEtcSystem(host)
    testSolarisEtcProject(host)
    testSolarisEtcUserAttr(host)


def testUnameConsistency():
    """Report every host whose uname output differs from the first host inspected."""
    firstUname = None
    firstHost = None

    for h in gpcheck_info.hosts:
        host = gpcheck_info.hosts[h]
        uname = host.data.uname.output
        if firstUname:
            if firstUname != uname:
                printError(h, "uname -r output different among hosts: %s : %s != %s : %s" % (firstHost, firstUname, host.hostname, uname))
        else:
            firstUname = uname
            firstHost = host.hostname


def getSysCtlFromConfig(section):
    """Load every ``sysctl.<name>`` option of ``section`` into the expected sysctl values.

    Malformed entries are reported and skipped.

    Returns True if any entry was malformed, False otherwise.
    """
    bad_entry = False

    for opt in config.options(section):
        if not re.match(r'sysctl\.', opt):
            continue

        fields = opt.split("sysctl.")
        if len(fields) != 2:
            # the prefix occurs more than once, so the variable name is ambiguous
            printError(None, "Bad config line entry: %s" % opt)
            bad_entry = True
            continue

        sysctl = fields[1]
        gpcheck_config.expectedSysctlValues[sysctl] = config.get(section, opt)

    return bad_entry


def populateGpCheckLinuxConfigParams():
    """Load the Linux expectations (XFS mount options, sysctl values) from the config.

    Returns True on error, False otherwise.
    """
    if not config.has_section('linux'):
        printError(None, "%s does not contain a linux section. Exiting early" % configfilename)
        return True

    requiredOptions = set(['xfs_mount_options'])
    for option in requiredOptions:
        if not config.has_option('linux', option):
            printError(None, "%s does not contain required option %s. Exiting early" % (configfilename, option))
            return True

    gpcheck_config.xfs_mount_options = config.get('linux', 'xfs_mount_options')

    if getSysCtlFromConfig('linux'):
        return True

    return False


def testGenericLinuxCluster():
    """Run the Linux checks on every host, then the cross-host uname check."""
    if populateGpCheckLinuxConfigParams():
        return

    for h in gpcheck_info.hosts:
        testGenericLinuxHost(gpcheck_info.hosts[h])

    testUnameConsistency()


def testGenericSolarisCluster():
    """Run the Solaris checks on every host, then the cross-host uname check."""
    for h in gpcheck_info.hosts:
        testGenericSolarisHost(gpcheck_info.hosts[h])

    testUnameConsistency()


def getVersionNumberFromConfigFile(file):
    """Return ``configfile_version`` from the [global] section of ``file``.

    Returns 0 when the option is absent or the file cannot be read; a read
    failure is logged as a warning.
    """
    version = 0
    try:
        parser = ConfigParser.SafeConfigParser()
        with open(file) as fd:
            parser.readfp(fd)

        if not parser.has_option('global', 'configfile_version'):
            return 0

        version = parser.getint('global', 'configfile_version')
    except Exception as e:
        logger.warning("GPCHECK_WARNING: could not read config file %s: exception: %s" % (file, str(e)))

    return version


# get the filename that has the latest version number
# could be in gphome or part of dca configs
def findAndSetConfigFileName():
    """Select the default configuration file (currently always the one under GPHOME)."""
    global configfilename
    configfilename = GPCHECK_CONFIGFILE_GPHOME


def runTests():
    """Load the configuration file and run the checks for the detected platform.

    Returns True if the configuration file could not be read, False otherwise.
    Individual check failures are reported through printError and do not
    affect the return value.
    """
    global config, gpcheck_config

    gpcheck_config = GpCheckConfig()

    try:
        if not configfilename:
            findAndSetConfigFileName()

        logger.info("Using gpcheck config file: %s", configfilename)
        config = ConfigParser.SafeConfigParser()
        with open(configfilename) as fd:
            config.readfp(fd)
    except Exception as e:
        logger.error(str(e))
        printError(None, "Exception reading config file: %s" % configfilename)
        return True

    if not gpcheck_info.is_root:
        printError(None, "utility will not check all settings when run as non-root user")

    if gpcheck_info.host_type == HostType.GPCHECK_HOSTTYPE_GENERIC_LINUX:
        testGenericLinuxCluster()
    elif gpcheck_info.host_type == HostType.GPCHECK_HOSTTYPE_GENERIC_SOLARIS:
        testGenericSolarisCluster()
    else:
        printWarning("No tests exist for this platform in gpcheck")

    return False


def createTmpDir():
    """Create the local scratch directory and store its path in ``tmpdir``.

    Returns True on error, False otherwise.
    """
    global tmpdir
    try:
        tmpdir = tempfile.mkdtemp(prefix='gpcheck')
    except Exception as e:
        logger.error("error creating tmp dir on master")
        logger.error(str(e))
        return True

    return False


def runCollections():
    """Generate, fetch and then remotely delete the per-host data files.

    Stops at the first failing step.  Returns True on error, False otherwise.
    """
    if runCollectionOnServers():
        return True

    if copyFilesLocally():
        return True

    if deleteRemoteFiles():
        return True

    return False


def checkPlatform():
    """Set ``gpcheck_info.host_type`` from the local operating system.

    Returns True (after logging a warning) when the platform has no checks,
    False otherwise.
    """
    gpcheck_info.host_type = HostType.GPCHECK_HOSTTYPE_UNDEFINED

    platform_name = SYSTEM.getName()
    if platform_name == 'linux':
        gpcheck_info.host_type = HostType.GPCHECK_HOSTTYPE_GENERIC_LINUX
        return False
    elif platform_name == 'sunos':
        gpcheck_info.host_type = HostType.GPCHECK_HOSTTYPE_GENERIC_SOLARIS
        return False

    printWarning("No tests exist for this platform in gpcheck")
    return True


def earlyExit(error=False):
    """Remove the scratch directory, shut the worker pool down and exit.

    The process exits with status 1 when ``error`` is true, 0 otherwise.
    """
    try:
        if tmpdir:
            shutil.rmtree(tmpdir)
    except Exception as e:
        logger.error("error removing tempdir during job cleanup")
        logger.error(str(e))

    if error:
        logger.fatal("early exit from gpcheck")
    else:
        logger.info("gpcheck completing...")

    if pool:
        pool.join()
        pool.haltWork()
        pool.joinWorkers()

    if error:
        sys.exit(1)
    else:
        sys.exit(0)


def readZip():
    """Restore ``gpcheck_info`` from the archive named by --zipin.

    The archive is extracted in the current directory, the extracted file is
    moved into ``tmpdir`` and then unpickled.

    Returns True on error, False otherwise.
    """
    global gpcheck_info

    suffix = ".tar.gz"
    if not options.zipin.endswith(suffix):
        logger.error("--zipin file needs to be a .tar.gz file")
        return True

    fname = options.zipin[:-len(suffix)]

    cmdStr = "tar xfz %s" % (options.zipin)
    try:
        print(cmdStr)
        cmd = Command("tarcmd", cmdStr)
        pool.addCommand(cmd)
        pool.join()
        items = pool.getCompletedItems()
        for i in items:
            if i.results.rc or i.results.halt or not i.results.completed:
                logger.error("error running command: %s" % cmdStr)
                return True
    except Exception as e:
        logger.error(str(e))
        logger.error("Failed extracting pickled save file: %s" % options.zipin)
        return True

    newfname = "%s/%s" % (tmpdir, fname)
    cmdStr = "mv %s %s" % (fname, newfname)
    try:
        cmd = Command("mvcmd", cmdStr)
        pool.addCommand(cmd)
        pool.join()
        items = pool.getCompletedItems()
        for i in items:
            if i.results.rc or i.results.halt or not i.results.completed:
                logger.error("error running command: %s" % cmdStr)
                return True
    except Exception as e:
        logger.error(str(e))
        logger.error("Exception running command: %s" % cmdStr)
        return True

    fname = newfname

    try:
        # NOTE: pickle.load can execute arbitrary code; only feed --zipin
        # archives that were produced by a trusted gpcheck --zipout run.
        with open(fname, "rb") as fd:
            gpcheck_info = pickle.load(fd)
    except Exception as e:
        logger.error(str(e))
        logger.error("Failed extracting pickled save file: %s" % fname)
        return True

    logger.info("read from gpcheck file platform = %s" % hosttype_str(gpcheck_info.host_type))
    logger.info("read from gpcheck file is_root = %s" % gpcheck_info.is_root)
    logger.info("read from gpcheck file number hosts = %s" % len(gpcheck_info.hosts))
    return False


def doZip(fname):
    """Pickle ``gpcheck_info`` to ``fname`` and pack it into ``fname``.tar.gz.

    Failures are logged; nothing is returned.
    """
    try:
        with open(fname, "wb") as fd:
            pickle.dump(gpcheck_info, fd)
    except Exception as e:
        logger.error(str(e))
        logger.error("Failed writing pickled save file: %s" % fname)
        return

    cmdStr = "tar cfz %s.tar.gz %s" % (fname, fname)
    try:
        cmd = Command("tarcmd", cmdStr)
        pool.addCommand(cmd)
        pool.join()
        items = pool.getCompletedItems()
        for i in items:
            if i.results.rc or i.results.halt or not i.results.completed:
                logger.error("error running command: %s" % cmdStr)
                return
    except Exception as e:
        logger.error(str(e))
        logger.error("Exception running command: %s" % cmdStr)
        return

    logger.info("SUCCESS creating %s.tar.gz" % fname)


def cleanZip(dirname):
    """Remove ``dirname`` (the intermediate file written by doZip) with ``rm -rf``.

    Failures are logged; nothing is returned.
    """
    cmdStr = "rm -rf %s" % (dirname)
    try:
        cmd = Command("rmcmd", cmdStr)
        pool.addCommand(cmd)
        pool.join()
        items = pool.getCompletedItems()
        for i in items:
            if i.results.rc or i.results.halt or not i.results.completed:
                logger.error("error running command: %s" % cmdStr)
                return
    except Exception as e:
        logger.error(str(e))
        logger.error("Failed cleaning up after running gpcheck")
        return


def doPrint():
    """Print the collected data of every host to stdout."""
    for h in gpcheck_info.hosts:
        print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
        print("HOST: %s" % h)
        print(str(gpcheck_info.hosts[h].data))
        print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")


def checkUser():
    """Record in ``gpcheck_info.is_root`` whether the tool is run by root."""
    user = getpass.getuser()
    gpcheck_info.is_root = (user == 'root')


logger = get_default_logger()
setup_tool_logging(EXECNAME, getLocalHostname(), getUserName())

# NOTE(review): invalid arguments exit with status 0; kept as-is because
# callers may rely on it, but a non-zero status would be more conventional.
if parseargs():
    sys.exit(0)

pool = WorkerPool()

## COLLECT INPUT SECTION ###
if createTmpDir():
    earlyExit(True)

if options.zipin:
    if readZip():
        earlyExit(True)
else:
    gpcheck_info = GpCheckInfo()

    if checkPlatform():
        # still exits with status 0, but goes through earlyExit so that the
        # scratch directory is removed and the worker pool is shut down
        earlyExit(False)

    checkUser()

    if createHostList():
        earlyExit(True)

    logger.info("Detected platform: %s" % hosttype_str(gpcheck_info.host_type))

    if runCollections():
        earlyExit(True)

    if readDataFiles():
        earlyExit(True)

## GENERATE OUTPUT SECTION ###
if options.zipout:
    filename = "./gpcheck_%s" % time.time()
    doZip(filename)
    cleanZip(filename)
    earlyExit(False)

if options.stdout:
    doPrint()
    earlyExit(False)

if runTests():
    earlyExit(True)

if not found_errors:
    logger.info("GPCHECK_NORMAL")

earlyExit(False)