From ce4586a7a4a3a1880fe7445b924ec2217b651ba2 Mon Sep 17 00:00:00 2001 From: Pengcheng Tang Date: Mon, 11 Jan 2016 23:43:57 -0800 Subject: [PATCH] Backup and restore to report errors found from segments' status file Gpcrondump and gpdbrestore redirect stderr into status file for segments, errors should be reflected into the master's report file. Warnings will be given if process completed but errors were generated in the report file. Also made some refactor of the code. This commit also reverts 08f4ada76da058e85943b86a91f7ad28f3936e6c. Authors: Pengcheng Tang, Nikhil Kak --- .../test/behave/mgmt_utils/backup.feature | 4 +- .../test/behave/mgmt_utils/netbackup.feature | 10 +-- .../behave/mgmt_utils/steps/mgmt_utils.py | 2 +- src/bin/pg_dump/cdb/cdb_dump.c | 40 ++++++----- src/bin/pg_dump/cdb/cdb_restore.c | 66 ++++++++++++++----- 5 files changed, 83 insertions(+), 39 deletions(-) diff --git a/gpMgmt/bin/gppylib/test/behave/mgmt_utils/backup.feature b/gpMgmt/bin/gppylib/test/behave/mgmt_utils/backup.feature index ef07bad20f..79c65dd17e 100644 --- a/gpMgmt/bin/gppylib/test/behave/mgmt_utils/backup.feature +++ b/gpMgmt/bin/gppylib/test/behave/mgmt_utils/backup.feature @@ -343,7 +343,7 @@ Feature: Validate command line arguments And the database "bkdb" does not exist And database "bkdb" exists And the user runs gp_restore with the the stored timestamp and subdir in "bkdb" - And gp_restore should return a return code of 0 + And gp_restore should return a return code of 2 And verify that partitioned tables "ao_part_table, co_part_table, heap_part_table" in "bkdb" have 6 partitions And verify that partitioned tables "ao_part_table_comp, co_part_table_comp" in "bkdb" have 6 partitions And verify that partitioned tables "part_external" in "bkdb" have 5 partitions in partition level "0" @@ -431,7 +431,7 @@ Feature: Validate command line arguments And the database "bkdb" does not exist And database "bkdb" exists And the user runs gp_restore with the stored timestamp and subdir in "bkdb" and backup_dir "/tmp" - And gp_restore should return a return code of 0 + And gp_restore should return a return code of 2 And verify that partitioned tables "ao_part_table, co_part_table, heap_part_table" in "bkdb" have 6 partitions And verify that partitioned tables "ao_part_table_comp, co_part_table_comp" in "bkdb" have 6 partitions And verify that partitioned tables "part_external" in "bkdb" have 5 partitions in partition level "0" diff --git a/gpMgmt/bin/gppylib/test/behave/mgmt_utils/netbackup.feature b/gpMgmt/bin/gppylib/test/behave/mgmt_utils/netbackup.feature index 663bb94d01..ec45d5d53f 100644 --- a/gpMgmt/bin/gppylib/test/behave/mgmt_utils/netbackup.feature +++ b/gpMgmt/bin/gppylib/test/behave/mgmt_utils/netbackup.feature @@ -283,7 +283,7 @@ Feature: NetBackup Integration with GPDB And the database "nbubkdb" does not exist And database "nbubkdb" exists When the user runs gp_restore with the the stored timestamp and subdir in "nbubkdb" using netbackup - Then gp_restore should return a return code of 0 + Then gp_restore should return a return code of 2 And verify that the data of "68" tables in "nbubkdb" is validated after restore @nbuall @@ -300,7 +300,7 @@ Feature: NetBackup Integration with GPDB And the database "nbubkdb" does not exist And database "nbubkdb" exists When the user runs "gp_restore -i --gp-k 20131228111527 --gp-d db_dumps --gp-i --gp-r db_dumps --gp-l=p -d nbubkdb --gp-c" using netbackup - Then gp_restore should return a return code of 0 + Then gp_restore should return a return code of 2 And verify that the data of "68" tables in "nbubkdb" is validated after restore @nbusmoke @@ -3172,7 +3172,7 @@ Feature: NetBackup Integration with GPDB And there are no backup files And the database "testdb" does not exist And database "testdb" exists - And there are "2" "heap" tables "public.heap_table" with data in "testdb" + And there are "2" "heap" tables "public.heap_table" with data in "testdb" And there is a "ao" partition table "ao_part_table" with compression "None" in "testdb" with data And there is a "ao" partition table "ao_part_table1" with compression "None" in "testdb" with data Then data for partition table "ao_part_table" with partition level "1" is distributed across all segments on "testdb" @@ -3209,7 +3209,7 @@ Feature: NetBackup Integration with GPDB And there are no backup files And database "schematestdb" is created if not exists on host "None" with port "0" with user "None" And there is schema "pepper" exists in "schematestdb" - And there are "2" "heap" tables "public.heap_table" with data in "schematestdb" + And there are "2" "heap" tables "public.heap_table" with data in "schematestdb" And there is a "ao" partition table "ao_part_table" with compression "None" in "schematestdb" with data And there is a "ao" partition table "ao_part_table1" with compression "None" in "schematestdb" with data And there is a "ao" partition table "pepper.ao_part_table1" with compression "None" in "schematestdb" with data @@ -3252,7 +3252,7 @@ Feature: NetBackup Integration with GPDB And there are no backup files And the database "testdb" does not exist And database "testdb" exists - And there are "2" "heap" tables "public.heap_table" with data in "testdb" + And there are "2" "heap" tables "public.heap_table" with data in "testdb" And there is a "ao" partition table "ao_part_table" with compression "None" in "testdb" with data And there is a "ao" partition table "ao_part_table1" with compression "None" in "testdb" with data Then data for partition table "ao_part_table" with partition level "1" is distributed across all segments on "testdb" diff --git a/gpMgmt/bin/gppylib/test/behave/mgmt_utils/steps/mgmt_utils.py b/gpMgmt/bin/gppylib/test/behave/mgmt_utils/steps/mgmt_utils.py index 5945c32e75..c0b92ee4c5 100644 --- a/gpMgmt/bin/gppylib/test/behave/mgmt_utils/steps/mgmt_utils.py +++ b/gpMgmt/bin/gppylib/test/behave/mgmt_utils/steps/mgmt_utils.py @@ -80,7 +80,7 @@ def impl(context, dbname): @then('database "{dbname}" is created if not exists on host "{HOST}" with port "{PORT}" with user "{USER}"') def impl(context, dbname, HOST, PORT, USER): host = os.environ.get(HOST) - port = int(os.environ.get(PORT)) + port = 0 if os.environ.get(PORT) == None else int(os.environ.get(PORT)) user = os.environ.get(USER) create_database_if_not_exists(context, dbname, host, port, user) diff --git a/src/bin/pg_dump/cdb/cdb_dump.c b/src/bin/pg_dump/cdb/cdb_dump.c index a2675ad63e..9d5a07d423 100644 --- a/src/bin/pg_dump/cdb/cdb_dump.c +++ b/src/bin/pg_dump/cdb/cdb_dump.c @@ -35,8 +35,6 @@ int optreset; #endif -#define DUMP_PREFIX (dump_prefix==NULL?"":dump_prefix) - /* * static helper functions. See each function body for a description. */ @@ -87,7 +85,6 @@ static char *selectSchemaName = NULL; /* name of a single schema to dump */ static char *tableFileName = NULL; /* file name with tables to dump (--table-file) */ static char *schemaFileName = NULL; /* file name with tables to dump (--schema-file) */ static char *excludeTableFileName = NULL; /* file name with tables to exclude (--exclude-table-file) */ -static char *dump_prefix = NULL; /* NetBackup related variables */ static char *netbackup_service_host = NULL; @@ -132,6 +129,7 @@ static pthread_mutex_t MyMutex2 = PTHREAD_MUTEX_INITIALIZER; static bool g_b_SendCancelMessage = false; static const char *pszAgent = "gp_dump_agent"; +PQExpBuffer dump_prefix_buf = NULL; #ifdef USE_DDBOOST #include "ddp_api.h" @@ -143,6 +141,7 @@ main(int argc, char **argv) { int rc = 0; /* return code */ int remote_version; + dump_prefix_buf = createPQExpBuffer(); InputOptions inputOpts; /* command line parameters */ SegmentDatabaseArray segDBAr; /* array of the segdbs from the master @@ -240,6 +239,8 @@ cleanup: if (master_db_conn != NULL) PQfinish(master_db_conn); + destroyPQExpBuffer(dump_prefix_buf); + exit(rc); } @@ -1209,13 +1210,13 @@ fillInputOptions(int argc, char **argv, InputOptions * pInputOpts) } break; - case 11: - no_expand_children = true; - break; - case 12: - dump_prefix = pg_strdup(optarg); - pInputOpts->pszPassThroughParms = addPassThroughLongParm("prefix", DUMP_PREFIX, pInputOpts->pszPassThroughParms); - break; + case 11: + no_expand_children = true; + break; + case 12: + appendPQExpBuffer(dump_prefix_buf, "%s", optarg); + pInputOpts->pszPassThroughParms = addPassThroughLongParm("prefix", dump_prefix_buf->data, pInputOpts->pszPassThroughParms); + break; case 13: incremental_filter = pg_strdup(optarg); pInputOpts->pszPassThroughParms = addPassThroughLongParm("incremental-filter", incremental_filter, pInputOpts->pszPassThroughParms); @@ -1693,7 +1694,7 @@ reportBackupResults(InputOptions inputopts, ThreadParmArray *pParmAr) else pszFormat = "%s%sgp_dump_%s.rpt"; - pszReportPathName = MakeString(pszFormat, pszReportDirectory, DUMP_PREFIX, pParmAr->pData[0].pOptionsData->pszKey); + pszReportPathName = MakeString(pszFormat, pszReportDirectory, dump_prefix_buf == NULL ? "" : dump_prefix_buf->data, pParmAr->pData[0].pOptionsData->pszKey); fRptFile = fopen(pszReportPathName, "w"); if (fRptFile == NULL) @@ -1945,13 +1946,19 @@ spinOffThreads(PGconn *pConn, // Create a file on the master to signal to gpcrondump that it can release its pg_class lock // Only do this if no-lock is passed, otherwise it can cause problems if gp_dump is called directly and does its own locks if (pParm->pTargetSegDBData->role == ROLE_MASTER && no_lock) - { + { char *dumpkey = pParmAr->pData[0].pOptionsData->pszKey; - char *filedir = pg_strdup(pInputOpts->pszReportDirectory); - if (filedir == NULL) + char *filedir = NULL; + if (pInputOpts->pszReportDirectory == NULL) { - filedir = pg_strdup(getenv("MASTER_DATA_DIRECTORY")); + if (getenv("MASTER_DATA_DIRECTORY") != NULL) + filedir = pg_strdup(getenv("MASTER_DATA_DIRECTORY")); + else + filedir = pg_strdup("./"); } + else + filedir = pg_strdup(pInputOpts->pszReportDirectory); + char *signalFileName = MakeString("%s/gp_lockfile_%s", filedir, &dumpkey[8]); mpp_msg(logInfo, progname, "Signal filename is %s.\n", signalFileName); bool success = false; @@ -2282,8 +2289,9 @@ threadProc(void *arg) BFT_BACKUP_STATUS, progname, pqBuffer); if(status != 0) { - pParm->pszErrorMsg = pqBuffer->data; + pParm->pszErrorMsg = MakeString("%s", pqBuffer->data); } + destroyPQExpBuffer(pqBuffer); } /* diff --git a/src/bin/pg_dump/cdb/cdb_restore.c b/src/bin/pg_dump/cdb/cdb_restore.c index c745eaa434..131c95d9db 100644 --- a/src/bin/pg_dump/cdb/cdb_restore.c +++ b/src/bin/pg_dump/cdb/cdb_restore.c @@ -69,8 +69,8 @@ static const char *logError = "ERROR"; static const char *logFatal = "FATAL"; const char *progname; static char * addPassThroughLongParm(const char *Parm, const char *pszValue, char *pszPassThroughParmString); -static char *dump_prefix = NULL; -static char *status_file_dir = NULL; +PQExpBuffer dir_buf = NULL; +PQExpBuffer dump_prefix_buf = NULL; /* NetBackup related variable */ static char *netbackup_service_host = NULL; @@ -92,6 +92,9 @@ main(int argc, char **argv) SegmentDatabase *sourceSegDB = NULL; SegmentDatabase *targetSegDB = NULL; + dir_buf = createPQExpBuffer(); + dump_prefix_buf = createPQExpBuffer(); + progname = get_progname(argv[0]); /* This struct holds the values of the command line parameters */ @@ -239,10 +242,12 @@ main(int argc, char **argv) failCount = reportRestoreResults(inputOpts.pszReportDirectory, &masterParm, &parmAr); cleanup: + FreeRestorePairArray(&restorePairAr); FreeInputOptions(&inputOpts); + if (opts != NULL) free(opts); - FreeRestorePairArray(&restorePairAr); + freeThreadParmArray(&parmAr); if (masterParm.pszErrorMsg) @@ -254,6 +259,9 @@ cleanup: if (pConn != NULL) PQfinish(pConn); + destroyPQExpBuffer(dir_buf); + destroyPQExpBuffer(dump_prefix_buf); + return failCount; } @@ -734,25 +742,29 @@ fillInputOptions(int argc, char **argv, InputOptions * pInputOpts) bAoStats = false; break; case 13: - dump_prefix = Safe_strdup(optarg); - pInputOpts->pszPassThroughParms = addPassThroughLongParm("prefix", DUMP_PREFIX, pInputOpts->pszPassThroughParms); + appendPQExpBuffer(dump_prefix_buf, "%s", optarg); + pInputOpts->pszPassThroughParms = addPassThroughLongParm("prefix", dump_prefix_buf->data, pInputOpts->pszPassThroughParms); break; case 14: - status_file_dir = Safe_strdup(optarg); - pInputOpts->pszPassThroughParms = addPassThroughLongParm("status", status_file_dir, pInputOpts->pszPassThroughParms); + appendPQExpBuffer(dir_buf, "%s", optarg); + pInputOpts->pszPassThroughParms = addPassThroughLongParm("status", dir_buf->data, pInputOpts->pszPassThroughParms); break; case 15: netbackup_service_host = Safe_strdup(optarg); pInputOpts->pszPassThroughParms = addPassThroughLongParm("netbackup-service-host", netbackup_service_host, pInputOpts->pszPassThroughParms); + if (netbackup_service_host != NULL) + free(netbackup_service_host); break; case 16: netbackup_block_size = Safe_strdup(optarg); pInputOpts->pszPassThroughParms = addPassThroughLongParm("netbackup-block-size", netbackup_block_size, pInputOpts->pszPassThroughParms); + if (netbackup_block_size != NULL) + free(netbackup_block_size); break; case 17: change_schema = Safe_strdup(optarg); pInputOpts->pszPassThroughParms = addPassThroughLongParm("change-schema", change_schema, pInputOpts->pszPassThroughParms); - if (change_schema!= NULL) + if (change_schema != NULL) free(change_schema); break; default: @@ -1138,6 +1150,11 @@ threadProc(void *arg) pszKey, sSegDB->dbid, tSegDB->dbid); mpp_err_msg(logError, progname, pParm->pszErrorMsg); PQfinish(pConn); + free(pszNotifyRelName); + free(pszNotifyRelNameStart); + free(pszNotifyRelNameSucceed); + free(pszNotifyRelNameFail); + free(pollInput); return NULL; } @@ -1148,6 +1165,11 @@ threadProc(void *arg) pParm->pszErrorMsg = MakeString("connection went down for backup key %s, source dbid %d, target dbid %d\n", pszKey, sSegDB->dbid, tSegDB->dbid); mpp_err_msg(logFatal, progname, pParm->pszErrorMsg); + free(pszNotifyRelName); + free(pszNotifyRelNameStart); + free(pszNotifyRelNameSucceed); + free(pszNotifyRelNameFail); + free(pollInput); PQfinish(pConn); return NULL; } @@ -1180,8 +1202,9 @@ threadProc(void *arg) pParm->bSuccess = false; pqBuffer = createPQExpBuffer(); /* Make call to get error message from file on server */ - ReadBackendBackupFileError(pConn, status_file_dir, pszKey, BFT_RESTORE_STATUS, progname, pqBuffer); - pParm->pszErrorMsg = pqBuffer->data; + ReadBackendBackupFileError(pConn, dir_buf->data, pszKey, BFT_RESTORE_STATUS, progname, pqBuffer); + pParm->pszErrorMsg = MakeString("%s", pqBuffer->data); + destroyPQExpBuffer(pqBuffer); mpp_err_msg(logError, progname, "restore failed for source dbid %d, target dbid %d on host %s\n", sSegDB->dbid, tSegDB->dbid, StringNotNull(sSegDB->pszHost, "localhost")); @@ -1217,11 +1240,12 @@ threadProc(void *arg) if(pParm->bSuccess || pParm->pszErrorMsg == NULL) { pqBuffer = createPQExpBuffer(); - int status = ReadBackendBackupFileError(pConn, status_file_dir, pszKey, BFT_RESTORE_STATUS, progname, pqBuffer); + int status = ReadBackendBackupFileError(pConn, dir_buf == NULL ? "" : dir_buf->data, pszKey, BFT_RESTORE_STATUS, progname, pqBuffer); if (status != 0) { - pParm->pszErrorMsg = pqBuffer->data; + pParm->pszErrorMsg = MakeString("%s", pqBuffer->data); } + destroyPQExpBuffer(pqBuffer); } if (pszNotifyRelName != NULL) @@ -1627,7 +1651,7 @@ reportRestoreResults(const char *pszReportDirectory, const ThreadParm * pMasterP * report directory not set by user - default to * $MASTER_DATA_DIRECTORY */ - pszReportDirectory = Safe_strdup(getenv("MASTER_DATA_DIRECTORY")); + pszReportDirectory = getenv("MASTER_DATA_DIRECTORY"); } else { @@ -1642,7 +1666,13 @@ reportRestoreResults(const char *pszReportDirectory, const ThreadParm * pMasterP else pszFormat = "%s%sgp_restore_%s.rpt"; - pszReportPathName = MakeString(pszFormat, pszReportDirectory, DUMP_PREFIX, pOptions->pszKey); + pszReportPathName = MakeString(pszFormat, pszReportDirectory, dump_prefix_buf == NULL ? "" : dump_prefix_buf->data, pOptions->pszKey); + + if (pszReportPathName == NULL) + { + mpp_err_msg(logError, progname, "Cannot allocate memory for report file path\n"); + exit(-1); + } fRptFile = fopen(pszReportPathName, "w"); if (fRptFile == NULL) @@ -1795,7 +1825,7 @@ reportMasterError(InputOptions inputopts, const ThreadParm * pMasterParm, const * report directory not set by user - default to * $MASTER_DATA_DIRECTORY */ - pszReportDirectory = Safe_strdup(getenv("MASTER_DATA_DIRECTORY")); + pszReportDirectory = getenv("MASTER_DATA_DIRECTORY"); } else { @@ -1812,6 +1842,12 @@ reportMasterError(InputOptions inputopts, const ThreadParm * pMasterParm, const pszReportPathName = MakeString(pszFormat, pszReportDirectory, pOptions->pszKey); + if (pszReportPathName == NULL) + { + mpp_err_msg(logError, progname, "Cannot allocate memory for report file path\n"); + exit(-1); + } + fRptFile = fopen(pszReportPathName, "w"); if (fRptFile == NULL) mpp_err_msg(logWarn, progname, "Cannot open report file %s for writing. Will use stdout instead.\n", -- GitLab