diff --git a/src/backend/utils/mmgr/runaway_cleaner.c b/src/backend/utils/mmgr/runaway_cleaner.c index 9c01603d2acc06c47711626033169df486e71b9a..387d2eb7ebfc95a611bd677be75c150f4ce3ccc8 100644 --- a/src/backend/utils/mmgr/runaway_cleaner.c +++ b/src/backend/utils/mmgr/runaway_cleaner.c @@ -19,6 +19,7 @@ #include "postgres.h" +#include "access/xact.h" #include "cdb/cdbvars.h" #include "miscadmin.h" #include "port/atomics.h" @@ -97,6 +98,63 @@ RunawayCleaner_ShouldStartRunawayCleanup() return false; } + +/* + * Determine if the runaway cleanup should be handled by aborting the current + * query or must be ignored. Since the cleanup can be attempted from multiple + * places, it is important to first validate if calling elog(ERROR) is safe and + * of value. + */ +static bool +RunawayCleaner_ShouldCancelQuery() +{ + /* VMEM tracker not being used */ + if (!vmemTrackerInited) + return false; + + /* In critical section or when holding off on handling interrupts */ + if (CritSectionCount != 0 || InterruptHoldoffCount != 0) + return false; + + /* + * Cleaning up QEs that are not executing a valid command may cause the QD to + * get stuck [MPP-24950] + */ + if (gp_command_count <= 0) + return false; + + /* + * If not currently executing a transaction, aborting it won't release any + * more resources. + */ + if (!IsTransactionState()) + return false; + + /* Ok, we are actively executing a query */ + + if (MySessionState->runawayStatus == RunawayStatus_PrimaryRunawaySession) + { + /* + * Abort the query if it is actively executing and has been flagged as + * consuming the most memory + */ + return true; + } + else + { + Assert(MySessionState->runawayStatus = RunawayStatus_SecondaryRunawaySession); + + /* + * If this process was flagged as a runaway session inspite another session + * using more memory, only abort this query if the current user is not a + * superuser. This is to ensure that critical administrative commands (such + * as database restarts), which are done as superuser, are not interrupted + * by the runaway cleaner. + */ + return !superuser(); + } +} + /* * Starts a runaway cleanup by triggering an ERROR if the VMEM tracker is active * and a commit is not already in progress. Otherwise, it marks the process as clean @@ -118,14 +176,11 @@ RunawayCleaner_StartCleanup() { Assert(beginCleanupRunawayVersion < *latestRunawayVersion); Assert(endCleanupRunawayVersion < *latestRunawayVersion); + /* We don't want to cleanup multiple times for same runaway event */ beginCleanupRunawayVersion = *latestRunawayVersion; - if (CritSectionCount == 0 && InterruptHoldoffCount == 0 && vmemTrackerInited && - gp_command_count > 0 /* Cleaning up QEs that are not executing a valid command - may cause the QD to get stuck [MPP-24950] */ && - /* Super user is terminated only when it's the primary runaway consumer (i.e., the top consumer) */ - (!superuser() || MySessionState->runawayStatus == RunawayStatus_PrimaryRunawaySession)) + if (RunawayCleaner_ShouldCancelQuery()) { SIMPLE_FAULT_INJECTOR(RunawayCleanup); diff --git a/src/backend/utils/mmgr/test/Makefile b/src/backend/utils/mmgr/test/Makefile index b0d147eeb583d54bce9251d1723da6a8a98aadda..d25c3b8bc3b184783246015ce487eb3372fa1b69 100755 --- a/src/backend/utils/mmgr/test/Makefile +++ b/src/backend/utils/mmgr/test/Makefile @@ -43,6 +43,7 @@ memaccounting.t: \ runaway_cleaner.t: \ $(MOCK_DIR)/backend/utils/error/elog_mock.o \ + $(MOCK_DIR)/backend/access/transam/xact_mock.o \ $(MOCK_DIR)/backend/utils/mmgr/mcxt_mock.o \ $(MOCK_DIR)/backend/utils/mmgr/memaccounting_mock.o \ $(MOCK_DIR)/backend/utils/misc/faultinjector_mock.o \ diff --git a/src/backend/utils/mmgr/test/runaway_cleaner_test.c b/src/backend/utils/mmgr/test/runaway_cleaner_test.c index 5eb73d1b1a2c8b1debd5074e9e3514dff533996d..0e56901c6f07cf13863cf892e0a0b8ac8e6e248f 100755 --- a/src/backend/utils/mmgr/test/runaway_cleaner_test.c +++ b/src/backend/utils/mmgr/test/runaway_cleaner_test.c @@ -86,7 +86,7 @@ test__RunawayCleaner_StartCleanup__IgnoresNonRunaway(void **state) CLEANUP_COUNTDOWN_BEFORE_RUNAWAY /* cleanupCountdown */, RunawayStatus_NotRunaway /* runawayStatus */, 2 /* pinCount */, 0 /* vmem */); - static fakeLatestRunawayVersion = 10; + EventVersion fakeLatestRunawayVersion = 10; latestRunawayVersion = &fakeLatestRunawayVersion; beginCleanupRunawayVersion = 0; @@ -107,7 +107,7 @@ test__RunawayCleaner_StartCleanup__IgnoresDuplicateCleanup(void **state) 2 /* cleanupCountdown */, RunawayStatus_PrimaryRunawaySession /* runawayStatus */, 2 /* pinCount */, 0 /* vmem */); - static fakeLatestRunawayVersion = 10; + EventVersion fakeLatestRunawayVersion = 10; latestRunawayVersion = &fakeLatestRunawayVersion; beginCleanupRunawayVersion = *latestRunawayVersion; @@ -121,18 +121,83 @@ test__RunawayCleaner_StartCleanup__IgnoresDuplicateCleanup(void **state) /* * Checks if RunawayCleaner_StartCleanup() starts the cleanup process if * all conditions are met (i.e., no commit is in progress and vmem tracker - * is initialized) + * is initialized) and runaway session is "primary" */ void -test__RunawayCleaner_StartCleanup__StartsCleanupIfPossible(void **state) +test__RunawayCleaner_StartCleanup__StartsPrimaryCleanupIfPossible(void **state) { InitFakeSessionState(2 /* activeProcessCount */, 2 /* cleanupCountdown */, RunawayStatus_PrimaryRunawaySession /* runawayStatus */, 2 /* pinCount */, 12345 /* vmem */); - static fakeLatestRunawayVersion = 10; + EventVersion fakeLatestRunawayVersion = 10; + latestRunawayVersion = &fakeLatestRunawayVersion; + + /* + * Set beginCleanupRunawayVersion to less than *latestRunawayVersion + * to trigger a cleanup + */ + beginCleanupRunawayVersion = 1; + endCleanupRunawayVersion = 1; + isProcessActive = true; + + /* Make sure the cleanup goes through */ + vmemTrackerInited = true; + CritSectionCount = 0; + InterruptHoldoffCount = 0; + /* We need a valid gp_command_count to execute cleanup */ + gp_command_count = 1; + will_return(IsTransactionState, true); + +#ifdef FAULT_INJECTOR + expect_value(FaultInjector_InjectFaultIfSet, identifier, RunawayCleanup); + expect_value(FaultInjector_InjectFaultIfSet, ddlStatement, DDLNotSpecified); + expect_value(FaultInjector_InjectFaultIfSet, databaseName, ""); + expect_value(FaultInjector_InjectFaultIfSet, tableName, ""); + will_be_called(FaultInjector_InjectFaultIfSet); +#endif + + EXPECT_EREPORT(ERROR); + + PG_TRY(); + { + RunawayCleaner_StartCleanup(); + assert_false("Cleanup didn't throw error"); + } + PG_CATCH(); + { + + } + PG_END_TRY(); + + assert_true(beginCleanupRunawayVersion == *latestRunawayVersion); + /* We should not finish the cleanup as we errored out */ + assert_true(endCleanupRunawayVersion == 1); + + /* cleanupCountdown shouldn't change as we haven't finished cleanup */ + assert_true(MySessionState->cleanupCountdown == 2); + + /* + * If we call RunawayCleaner_StartCleanup again for the same runaway event, + * it should be a noop, therefore requiring no "will_be_called" setup + */ + RunawayCleaner_StartCleanup(); +} + +/* + * Checks if RunawayCleaner_StartCleanup() starts the cleanup process if + * all conditions are met (i.e., no commit is in progress and vmem tracker + * is initialized) and runaway session is "secondary" + */ +void +test__RunawayCleaner_StartCleanup__StartsSecondaryCleanupIfPossible(void **state) +{ + InitFakeSessionState(2 /* activeProcessCount */, + 2 /* cleanupCountdown */, + RunawayStatus_SecondaryRunawaySession /* runawayStatus */, 2 /* pinCount */, 12345 /* vmem */); + + EventVersion fakeLatestRunawayVersion = 10; latestRunawayVersion = &fakeLatestRunawayVersion; - *latestRunawayVersion = 10; /* * Set beginCleanupRunawayVersion to less than *latestRunawayVersion @@ -149,6 +214,7 @@ test__RunawayCleaner_StartCleanup__StartsCleanupIfPossible(void **state) /* We need a valid gp_command_count to execute cleanup */ gp_command_count = 1; will_return(superuser, false); + will_return(IsTransactionState, true); #ifdef FAULT_INJECTOR expect_value(FaultInjector_InjectFaultIfSet, identifier, RunawayCleanup); @@ -185,6 +251,7 @@ test__RunawayCleaner_StartCleanup__StartsCleanupIfPossible(void **state) RunawayCleaner_StartCleanup(); } + /* * Checks if RunawayCleaner_StartCleanup() ignores cleanup if in critical section */ @@ -195,8 +262,9 @@ test__RunawayCleaner_StartCleanup__IgnoresCleanupInCriticalSection(void **state) 2 /* cleanupCountdown */, RunawayStatus_PrimaryRunawaySession /* runawayStatus */, 2 /* pinCount */, 12345 /* vmem */); - static EventVersion fakeLatestRunawayVersion = 10; + EventVersion fakeLatestRunawayVersion = 10; latestRunawayVersion = &fakeLatestRunawayVersion; + /* * Set beginCleanupRunawayVersion to less than *latestRunawayVersino * to trigger a cleanup @@ -237,8 +305,9 @@ test__RunawayCleaner_StartCleanup__IgnoresCleanupInHoldoffInterrupt(void **state 2 /* cleanupCountdown */, RunawayStatus_PrimaryRunawaySession /* runawayStatus */, 2 /* pinCount */, 12345 /* vmem */); - static EventVersion fakeLatestRunawayVersion = 10; + EventVersion fakeLatestRunawayVersion = 10; latestRunawayVersion = &fakeLatestRunawayVersion; + /* * Set beginCleanupRunawayVersion to less than *latestRunawayVersino * to trigger a cleanup @@ -269,6 +338,53 @@ test__RunawayCleaner_StartCleanup__IgnoresCleanupInHoldoffInterrupt(void **state InterruptHoldoffCount = 0; } +/* + * Checks if RunawayCleaner_StartCleanup() ignores cleanup if outside of any transaction + */ +void +test__RunawayCleaner_StartCleanup__IgnoresCleanupOutsideAnyTransaction(void **state) +{ + InitFakeSessionState(2 /* activeProcessCount */, + 2 /* cleanupCountdown */, + RunawayStatus_PrimaryRunawaySession /* runawayStatus */, 2 /* pinCount */, 12345 /* vmem */); + + EventVersion fakeLatestRunawayVersion = 10; + latestRunawayVersion = &fakeLatestRunawayVersion; + + /* + * Set beginCleanupRunawayVersion to less than *latestRunawayVersino + * to trigger a cleanup + */ + beginCleanupRunawayVersion = 1; + endCleanupRunawayVersion = 1; + isProcessActive = true; + + /* Make sure the cleanup goes through */ + vmemTrackerInited = true; + CritSectionCount = 0; + InterruptHoldoffCount = 0; + gp_command_count = 1; + + /* But it is called outside of a transaction */ + will_return(IsTransactionState, false); + + CHECK_FOR_RUNAWAY_CLEANUP_MEMORY_LOGGING(); + /* Should not call superuser() as that can cause a PANIC */ + RunawayCleaner_StartCleanup(); + + assert_true(beginCleanupRunawayVersion == *latestRunawayVersion); + /* Cleanup is done, without ever throwing an ERROR */ + assert_true(endCleanupRunawayVersion == beginCleanupRunawayVersion); + + /* + * cleanupCountdown is decremented by 1 as there was no error, and therefore + * the cleanup is done within the same call of RunawayCleaner_StartCleanup + */ + assert_true(MySessionState->cleanupCountdown == 1); + + InterruptHoldoffCount = 0; +} + /* * Checks if RunawayCleaner_RunawayCleanupDoneForProcess() ignores cleanupCountdown * if optional cleanup @@ -281,9 +397,8 @@ test__RunawayCleaner_RunawayCleanupDoneForProcess__IgnoresCleanupIfNotRequired(v CLEANUP_COUNTDOWN /* cleanupCountdown */, RunawayStatus_PrimaryRunawaySession /* runawayStatus */, 2 /* pinCount */, 12345 /* vmem */); - static EventVersion fakeLatestRunawayVersion = 10; + EventVersion fakeLatestRunawayVersion = 10; latestRunawayVersion = &fakeLatestRunawayVersion; - *latestRunawayVersion = 10; /* * Set beginCleanupRunawayVersion to less than *latestRunawayVersino @@ -351,7 +466,7 @@ test__RunawayCleaner_RunawayCleanupDoneForProcess__IgnoresDuplicateCalls(void ** 2 /* cleanupCountdown */, RunawayStatus_PrimaryRunawaySession /* runawayStatus */, 2 /* pinCount */, 12345 /* vmem */); - static fakeLatestRunawayVersion = 10; + EventVersion fakeLatestRunawayVersion = 10; latestRunawayVersion = &fakeLatestRunawayVersion; /* * Set beginCleanupRunawayVersion and endCleanupRunawayVersion to @@ -381,9 +496,9 @@ test__RunawayCleaner_RunawayCleanupDoneForProcess__PreventsDuplicateCleanup(void 2 /* cleanupCountdown */, RunawayStatus_PrimaryRunawaySession /* runawayStatus */, CLEANUP_COUNTDOWN /* pinCount */, 12345 /* vmem */); - static fakeLatestRunawayVersion = 10; + EventVersion fakeLatestRunawayVersion = 10; latestRunawayVersion = &fakeLatestRunawayVersion; - *latestRunawayVersion = 10; + /* * Some imaginary cleanup begin/end event version. The idea is to ensure * that once the RunawayCleaner_RunawayCleanupDoneForProcess call returns @@ -421,8 +536,9 @@ test__RunawayCleaner_RunawayCleanupDoneForProcess__UndoDeactivation(void **state 2 /* cleanupCountdown */, RunawayStatus_PrimaryRunawaySession /* runawayStatus */, 2 /* pinCount */, 12345 /* vmem */); - static fakeLatestRunawayVersion = 10; + EventVersion fakeLatestRunawayVersion = 10; latestRunawayVersion = &fakeLatestRunawayVersion; + /* * Set beginCleanupRunawayVersion to latestRunawayVersion and endCleanupRunawayVersion * to a smaller value to simulate an ongoing cleanup @@ -464,8 +580,9 @@ test__RunawayCleaner_RunawayCleanupDoneForProcess__ReactivatesRunawayDetection(v 2 /* cleanupCountdown */, RunawayStatus_PrimaryRunawaySession /* runawayStatus */, 2 /* pinCount */, 12345 /* vmem */); - static EventVersion fakeLatestRunawayVersion = 10; + EventVersion fakeLatestRunawayVersion = 10; latestRunawayVersion = &fakeLatestRunawayVersion; + /* * Set beginCleanupRunawayVersion to latestRunawayVersion and endCleanupRunawayVersion * to a smaller value to simulate an ongoing cleanup @@ -522,8 +639,9 @@ test__RunawayCleaner_RunawayCleanupDoneForSession__ResetsRunawayFlagAndReactivat 2 /* cleanupCountdown */, RunawayStatus_PrimaryRunawaySession /* runawayStatus */, 2 /* pinCount */, 12345 /* vmem */); - static EventVersion fakeLatestRunawayVersion = 10; + EventVersion fakeLatestRunawayVersion = 10; latestRunawayVersion = &fakeLatestRunawayVersion; + /* * Satisfy asserts */ @@ -548,9 +666,11 @@ main(int argc, char* argv[]) const UnitTest tests[] = { unit_test(test__RunawayCleaner_StartCleanup__IgnoresNonRunaway), unit_test(test__RunawayCleaner_StartCleanup__IgnoresDuplicateCleanup), - unit_test(test__RunawayCleaner_StartCleanup__StartsCleanupIfPossible), + unit_test(test__RunawayCleaner_StartCleanup__StartsPrimaryCleanupIfPossible), + unit_test(test__RunawayCleaner_StartCleanup__StartsSecondaryCleanupIfPossible), unit_test(test__RunawayCleaner_StartCleanup__IgnoresCleanupInCriticalSection), unit_test(test__RunawayCleaner_StartCleanup__IgnoresCleanupInHoldoffInterrupt), + unit_test(test__RunawayCleaner_StartCleanup__IgnoresCleanupOutsideAnyTransaction), unit_test(test__RunawayCleaner_RunawayCleanupDoneForProcess__IgnoresCleanupIfNotRequired), unit_test(test__RunawayCleaner_RunawayCleanupDoneForProcess__IgnoresDuplicateCalls), unit_test(test__RunawayCleaner_RunawayCleanupDoneForProcess__PreventsDuplicateCleanup),