From c046dd73217308229d39ec77d7ca467f476569cd Mon Sep 17 00:00:00 2001 From: johnc Date: Tue, 4 Aug 2009 16:00:17 -0700 Subject: [PATCH] 6819077: G1: first GC thread coming late into the GC. Summary: The first worker thread is delayed when entering the GC because it clears the card count table that is used in identifying hot cards. Replace the card count table with a dynamically sized evicting hash table that includes an epoch based counter. Reviewed-by: iveresov, tonyp --- .../g1/concurrentG1Refine.cpp | 384 +++++++++++------- .../g1/concurrentG1Refine.hpp | 117 +++++- .../gc_implementation/g1/g1CollectedHeap.cpp | 2 - .../g1/g1CollectorPolicy.cpp | 16 + .../g1/g1CollectorPolicy.hpp | 22 +- .../vm/gc_implementation/g1/g1RemSet.cpp | 153 ++++--- .../vm/gc_implementation/g1/g1RemSet.hpp | 4 + .../vm/gc_implementation/g1/g1_globals.hpp | 10 +- .../vm/gc_implementation/includeDB_gc_g1 | 3 + 9 files changed, 483 insertions(+), 228 deletions(-) diff --git a/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp b/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp index 4b6aafbd2..8ec60012c 100644 --- a/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp +++ b/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp @@ -25,11 +25,21 @@ #include "incls/_precompiled.incl" #include "incls/_concurrentG1Refine.cpp.incl" +// Possible sizes for the card counts cache: odd primes that roughly double in size. +// (See jvmtiTagMap.cpp). 
+int ConcurrentG1Refine::_cc_cache_sizes[] = { + 16381, 32771, 76831, 150001, 307261, + 614563, 1228891, 2457733, 4915219, 9830479, + 19660831, 39321619, 78643219, 157286461, -1 + }; + ConcurrentG1Refine::ConcurrentG1Refine() : - _card_counts(NULL), _cur_card_count_histo(NULL), _cum_card_count_histo(NULL), + _card_counts(NULL), _card_epochs(NULL), + _n_card_counts(0), _max_n_card_counts(0), + _cache_size_index(0), _expand_card_counts(false), _hot_cache(NULL), _def_use_cache(false), _use_cache(false), - _n_periods(0), _total_cards(0), _total_travs(0), + _n_periods(0), _threads(NULL), _n_threads(0) { if (G1ConcRefine) { @@ -57,26 +67,39 @@ size_t ConcurrentG1Refine::thread_num() { } void ConcurrentG1Refine::init() { - G1CollectedHeap* g1h = G1CollectedHeap::heap(); - if (G1ConcRSLogCacheSize > 0 || G1ConcRSCountTraversals) { - _n_card_counts = - (unsigned) (g1h->g1_reserved_obj_bytes() >> CardTableModRefBS::card_shift); - _card_counts = NEW_C_HEAP_ARRAY(unsigned char, _n_card_counts); - for (size_t i = 0; i < _n_card_counts; i++) _card_counts[i] = 0; - ModRefBarrierSet* bs = g1h->mr_bs(); - guarantee(bs->is_a(BarrierSet::CardTableModRef), "Precondition"); - CardTableModRefBS* ctbs = (CardTableModRefBS*)bs; - _ct_bot = ctbs->byte_for_const(g1h->reserved_region().start()); - if (G1ConcRSCountTraversals) { - _cur_card_count_histo = NEW_C_HEAP_ARRAY(unsigned, 256); - _cum_card_count_histo = NEW_C_HEAP_ARRAY(unsigned, 256); - for (int i = 0; i < 256; i++) { - _cur_card_count_histo[i] = 0; - _cum_card_count_histo[i] = 0; - } - } - } if (G1ConcRSLogCacheSize > 0) { + _g1h = G1CollectedHeap::heap(); + _max_n_card_counts = + (unsigned) (_g1h->g1_reserved_obj_bytes() >> CardTableModRefBS::card_shift); + + size_t max_card_num = ((size_t)1 << (sizeof(unsigned)*BitsPerByte-1)) - 1; + guarantee(_max_n_card_counts < max_card_num, "card_num representation"); + + int desired = _max_n_card_counts / InitialCacheFraction; + for (_cache_size_index = 0; + 
_cc_cache_sizes[_cache_size_index] >= 0; _cache_size_index++) { + if (_cc_cache_sizes[_cache_size_index] >= desired) break; + } + _cache_size_index = MAX2(0, (_cache_size_index - 1)); + + int initial_size = _cc_cache_sizes[_cache_size_index]; + if (initial_size < 0) initial_size = _max_n_card_counts; + + // Make sure we don't go bigger than we will ever need + _n_card_counts = MIN2((unsigned) initial_size, _max_n_card_counts); + + _card_counts = NEW_C_HEAP_ARRAY(CardCountCacheEntry, _n_card_counts); + _card_epochs = NEW_C_HEAP_ARRAY(CardEpochCacheEntry, _n_card_counts); + + Copy::fill_to_bytes(&_card_counts[0], + _n_card_counts * sizeof(CardCountCacheEntry)); + Copy::fill_to_bytes(&_card_epochs[0], _n_card_counts * sizeof(CardEpochCacheEntry)); + + ModRefBarrierSet* bs = _g1h->mr_bs(); + guarantee(bs->is_a(BarrierSet::CardTableModRef), "Precondition"); + _ct_bs = (CardTableModRefBS*)bs; + _ct_bot = _ct_bs->byte_for_const(_g1h->reserved_region().start()); + _def_use_cache = true; _use_cache = true; _hot_cache_size = (1 << G1ConcRSLogCacheSize); @@ -86,7 +109,7 @@ void ConcurrentG1Refine::init() { // For refining the cards in the hot cache in parallel int n_workers = (ParallelGCThreads > 0 ? 
- g1h->workers()->total_workers() : 1); + _g1h->workers()->total_workers() : 1); _hot_cache_par_chunk_size = MAX2(1, _hot_cache_size / n_workers); _hot_cache_par_claimed_idx = 0; } @@ -101,15 +124,11 @@ void ConcurrentG1Refine::stop() { } ConcurrentG1Refine::~ConcurrentG1Refine() { - if (G1ConcRSLogCacheSize > 0 || G1ConcRSCountTraversals) { - assert(_card_counts != NULL, "Logic"); - FREE_C_HEAP_ARRAY(unsigned char, _card_counts); - assert(_cur_card_count_histo != NULL, "Logic"); - FREE_C_HEAP_ARRAY(unsigned, _cur_card_count_histo); - assert(_cum_card_count_histo != NULL, "Logic"); - FREE_C_HEAP_ARRAY(unsigned, _cum_card_count_histo); - } if (G1ConcRSLogCacheSize > 0) { + assert(_card_counts != NULL, "Logic"); + FREE_C_HEAP_ARRAY(CardCountCacheEntry, _card_counts); + assert(_card_epochs != NULL, "Logic"); + FREE_C_HEAP_ARRAY(CardEpochCacheEntry, _card_epochs); assert(_hot_cache != NULL, "Logic"); FREE_C_HEAP_ARRAY(jbyte*, _hot_cache); } @@ -129,42 +148,165 @@ void ConcurrentG1Refine::threads_do(ThreadClosure *tc) { } } +bool ConcurrentG1Refine::is_young_card(jbyte* card_ptr) { + HeapWord* start = _ct_bs->addr_for(card_ptr); + HeapRegion* r = _g1h->heap_region_containing(start); + if (r != NULL && r->is_young()) { + return true; + } + // This card is not associated with a heap region + // so can't be young. + return false; +} + +jbyte* ConcurrentG1Refine::add_card_count(jbyte* card_ptr, int* count, bool* defer) { + unsigned new_card_num = ptr_2_card_num(card_ptr); + unsigned bucket = hash(new_card_num); + assert(0 <= bucket && bucket < _n_card_counts, "Bounds"); + + CardCountCacheEntry* count_ptr = &_card_counts[bucket]; + CardEpochCacheEntry* epoch_ptr = &_card_epochs[bucket]; + + // We have to construct a new entry if we haven't updated the counts + // during the current period, or if the count was updated for a + // different card number. 
+ unsigned int new_epoch = (unsigned int) _n_periods; + julong new_epoch_entry = make_epoch_entry(new_card_num, new_epoch); -int ConcurrentG1Refine::add_card_count(jbyte* card_ptr) { - size_t card_num = (card_ptr - _ct_bot); - guarantee(0 <= card_num && card_num < _n_card_counts, "Bounds"); - unsigned char cnt = _card_counts[card_num]; - if (cnt < 255) _card_counts[card_num]++; - return cnt; - _total_travs++; + while (true) { + // Fetch the previous epoch value + julong prev_epoch_entry = epoch_ptr->_value; + julong cas_res; + + if (extract_epoch(prev_epoch_entry) != new_epoch) { + // This entry has not yet been updated during this period. + // Note: we update the epoch value atomically to ensure + // that there is only one winner that updates the cached + // card_ptr value even though all the refine threads share + // the same epoch value. + + cas_res = (julong) Atomic::cmpxchg((jlong) new_epoch_entry, + (volatile jlong*)&epoch_ptr->_value, + (jlong) prev_epoch_entry); + + if (cas_res == prev_epoch_entry) { + // We have successfully won the race to update the + // epoch and card_num value. Make it look like the + // count and eviction count were previously cleared. + count_ptr->_count = 1; + count_ptr->_evict_count = 0; + *count = 0; + // We can defer the processing of card_ptr + *defer = true; + return card_ptr; + } + // We did not win the race to update the epoch field, so some other + // thread must have done it. The value that gets returned by CAS + // should be the new epoch value. + assert(extract_epoch(cas_res) == new_epoch, "unexpected epoch"); + // We could 'continue' here or just re-read the previous epoch value + prev_epoch_entry = epoch_ptr->_value; + } + + // The epoch entry for card_ptr has been updated during this period. + unsigned old_card_num = extract_card_num(prev_epoch_entry); + + // The card count that will be returned to caller + *count = count_ptr->_count; + + // Are we updating the count for the same card? 
+ if (new_card_num == old_card_num) { + // Same card - just update the count. We could have more than one + // thread racing to update count for the current card. It should be + // OK not to use a CAS as the only penalty should be some missed + // increments of the count which delays identifying the card as "hot". + + if (*count < max_jubyte) count_ptr->_count++; + // We can defer the processing of card_ptr + *defer = true; + return card_ptr; + } + + // Different card - evict old card info + if (count_ptr->_evict_count < max_jubyte) count_ptr->_evict_count++; + if (count_ptr->_evict_count > G1CardCountCacheExpandThreshold) { + // Trigger a resize the next time we clear + _expand_card_counts = true; + } + + cas_res = (julong) Atomic::cmpxchg((jlong) new_epoch_entry, + (volatile jlong*)&epoch_ptr->_value, + (jlong) prev_epoch_entry); + + if (cas_res == prev_epoch_entry) { + // We successfully updated the card num value in the epoch entry + count_ptr->_count = 0; // initialize counter for new card num + + // Even though the region containg the card at old_card_num was not + // in the young list when old_card_num was recorded in the epoch + // cache it could have been added to the free list and subsequently + // added to the young list in the intervening time. If the evicted + // card is in a young region just return the card_ptr and the evicted + // card will not be cleaned. See CR 6817995. + + jbyte* old_card_ptr = card_num_2_ptr(old_card_num); + if (is_young_card(old_card_ptr)) { + *count = 0; + // We can defer the processing of card_ptr + *defer = true; + return card_ptr; + } + + // We do not want to defer processing of card_ptr in this case + // (we need to refine old_card_ptr and card_ptr) + *defer = false; + return old_card_ptr; + } + // Someone else beat us - try again. + } } -jbyte* ConcurrentG1Refine::cache_insert(jbyte* card_ptr) { - int count = add_card_count(card_ptr); - // Count previously unvisited cards. 
- if (count == 0) _total_cards++; - // We'll assume a traversal unless we store it in the cache. +jbyte* ConcurrentG1Refine::cache_insert(jbyte* card_ptr, bool* defer) { + int count; + jbyte* cached_ptr = add_card_count(card_ptr, &count, defer); + assert(cached_ptr != NULL, "bad cached card ptr"); + assert(!is_young_card(cached_ptr), "shouldn't get a card in young region"); + + // The card pointer we obtained from card count cache is not hot + // so do not store it in the cache; return it for immediate + // refining. if (count < G1ConcRSHotCardLimit) { - _total_travs++; - return card_ptr; + return cached_ptr; } - // Otherwise, it's hot. + + // Otherwise, the pointer we got from the _card_counts is hot. jbyte* res = NULL; MutexLockerEx x(HotCardCache_lock, Mutex::_no_safepoint_check_flag); if (_n_hot == _hot_cache_size) { - _total_travs++; res = _hot_cache[_hot_cache_idx]; _n_hot--; } // Now _n_hot < _hot_cache_size, and we can insert at _hot_cache_idx. - _hot_cache[_hot_cache_idx] = card_ptr; + _hot_cache[_hot_cache_idx] = cached_ptr; _hot_cache_idx++; if (_hot_cache_idx == _hot_cache_size) _hot_cache_idx = 0; _n_hot++; + + if (res != NULL) { + // Even though the region containg res was not in the young list + // when it was recorded in the hot cache it could have been added + // to the free list and subsequently added to the young list in + // the intervening time. If res is in a young region, return NULL + // so that res is not cleaned. See CR 6817995. 
+ + if (is_young_card(res)) { + res = NULL; + } + } + return res; } - void ConcurrentG1Refine::clean_up_cache(int worker_i, G1RemSet* g1rs) { assert(!use_cache(), "cache should be disabled"); int start_idx; @@ -186,114 +328,52 @@ void ConcurrentG1Refine::clean_up_cache(int worker_i, G1RemSet* g1rs) { } } -void ConcurrentG1Refine::clear_and_record_card_counts() { - if (G1ConcRSLogCacheSize == 0 && !G1ConcRSCountTraversals) return; - _n_periods++; - if (G1ConcRSCountTraversals) { - for (size_t i = 0; i < _n_card_counts; i++) { - unsigned char bucket = _card_counts[i]; - _cur_card_count_histo[bucket]++; - _card_counts[i] = 0; - } - gclog_or_tty->print_cr("Card counts:"); - for (int i = 0; i < 256; i++) { - if (_cur_card_count_histo[i] > 0) { - gclog_or_tty->print_cr(" %3d: %9d", i, _cur_card_count_histo[i]); - _cum_card_count_histo[i] += _cur_card_count_histo[i]; - _cur_card_count_histo[i] = 0; - } - } - } else { - assert(G1ConcRSLogCacheSize > 0, "Logic"); - Copy::fill_to_words((HeapWord*)(&_card_counts[0]), - _n_card_counts / HeapWordSize); - } -} +void ConcurrentG1Refine::expand_card_count_cache() { + if (_n_card_counts < _max_n_card_counts) { + int new_idx = _cache_size_index+1; + int new_size = _cc_cache_sizes[new_idx]; + if (new_size < 0) new_size = _max_n_card_counts; -void -ConcurrentG1Refine:: -print_card_count_histo_range(unsigned* histo, int from, int to, - float& cum_card_pct, - float& cum_travs_pct) { - unsigned cards = 0; - unsigned travs = 0; - guarantee(to <= 256, "Precondition"); - for (int i = from; i < to-1; i++) { - cards += histo[i]; - travs += histo[i] * i; - } - if (to == 256) { - unsigned histo_card_sum = 0; - unsigned histo_trav_sum = 0; - for (int i = 1; i < 255; i++) { - histo_trav_sum += histo[i] * i; - } - cards += histo[255]; - // correct traversals for the last one. 
- unsigned travs_255 = (unsigned) (_total_travs - histo_trav_sum); - travs += travs_255; - - } else { - cards += histo[to-1]; - travs += histo[to-1] * (to-1); - } - float fperiods = (float)_n_periods; - float f_tot_cards = (float)_total_cards/fperiods; - float f_tot_travs = (float)_total_travs/fperiods; - if (cards > 0) { - float fcards = (float)cards/fperiods; - float ftravs = (float)travs/fperiods; - if (to == 256) { - gclog_or_tty->print(" %4d- %10.2f%10.2f", from, fcards, ftravs); - } else { - gclog_or_tty->print(" %4d-%4d %10.2f%10.2f", from, to-1, fcards, ftravs); + // Make sure we don't go bigger than we will ever need + new_size = MIN2((unsigned) new_size, _max_n_card_counts); + + // Expand the card count and card epoch tables + if (new_size > (int)_n_card_counts) { + // We can just free and allocate a new array as we're + // not interested in preserving the contents + assert(_card_counts != NULL, "Logic!"); + assert(_card_epochs != NULL, "Logic!"); + FREE_C_HEAP_ARRAY(CardCountCacheEntry, _card_counts); + FREE_C_HEAP_ARRAY(CardEpochCacheEntry, _card_epochs); + _n_card_counts = new_size; + _card_counts = NEW_C_HEAP_ARRAY(CardCountCacheEntry, _n_card_counts); + _card_epochs = NEW_C_HEAP_ARRAY(CardEpochCacheEntry, _n_card_counts); + _cache_size_index = new_idx; } - float pct_cards = fcards*100.0/f_tot_cards; - cum_card_pct += pct_cards; - float pct_travs = ftravs*100.0/f_tot_travs; - cum_travs_pct += pct_travs; - gclog_or_tty->print_cr("%10.2f%10.2f%10.2f%10.2f", - pct_cards, cum_card_pct, - pct_travs, cum_travs_pct); } } -void ConcurrentG1Refine::print_final_card_counts() { - if (!G1ConcRSCountTraversals) return; - - gclog_or_tty->print_cr("Did %d total traversals of %d distinct cards.", - _total_travs, _total_cards); - float fperiods = (float)_n_periods; - gclog_or_tty->print_cr(" This is an average of %8.2f traversals, %8.2f cards, " - "per collection.", (float)_total_travs/fperiods, - (float)_total_cards/fperiods); - gclog_or_tty->print_cr(" This is an 
average of %8.2f traversals/distinct " - "dirty card.\n", - _total_cards > 0 ? - (float)_total_travs/(float)_total_cards : 0.0); - - - gclog_or_tty->print_cr("Histogram:\n\n%10s %10s%10s%10s%10s%10s%10s", - "range", "# cards", "# travs", "% cards", "(cum)", - "% travs", "(cum)"); - gclog_or_tty->print_cr("------------------------------------------------------------" - "-------------"); - float cum_cards_pct = 0.0; - float cum_travs_pct = 0.0; - for (int i = 1; i < 10; i++) { - print_card_count_histo_range(_cum_card_count_histo, i, i+1, - cum_cards_pct, cum_travs_pct); - } - for (int i = 10; i < 100; i += 10) { - print_card_count_histo_range(_cum_card_count_histo, i, i+10, - cum_cards_pct, cum_travs_pct); +void ConcurrentG1Refine::clear_and_record_card_counts() { + if (G1ConcRSLogCacheSize == 0) return; + +#ifndef PRODUCT + double start = os::elapsedTime(); +#endif + + if (_expand_card_counts) { + expand_card_count_cache(); + _expand_card_counts = false; + // Only need to clear the epochs. 
+ Copy::fill_to_bytes(&_card_epochs[0], _n_card_counts * sizeof(CardEpochCacheEntry)); } - print_card_count_histo_range(_cum_card_count_histo, 100, 150, - cum_cards_pct, cum_travs_pct); - print_card_count_histo_range(_cum_card_count_histo, 150, 200, - cum_cards_pct, cum_travs_pct); - print_card_count_histo_range(_cum_card_count_histo, 150, 255, - cum_cards_pct, cum_travs_pct); - print_card_count_histo_range(_cum_card_count_histo, 255, 256, - cum_cards_pct, cum_travs_pct); + + int this_epoch = (int) _n_periods; + assert((this_epoch+1) <= max_jint, "to many periods"); + // Update epoch + _n_periods++; + +#ifndef PRODUCT + double elapsed = os::elapsedTime() - start; + _g1h->g1_policy()->record_cc_clear_time(elapsed * 1000.0); +#endif } diff --git a/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp b/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp index 279996448..a6ee5484a 100644 --- a/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp +++ b/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp @@ -29,18 +29,77 @@ class G1RemSet; class ConcurrentG1Refine: public CHeapObj { ConcurrentG1RefineThread** _threads; int _n_threads; + // The cache for card refinement. - bool _use_cache; - bool _def_use_cache; - size_t _n_periods; - size_t _total_cards; - size_t _total_travs; - - unsigned char* _card_counts; - unsigned _n_card_counts; - const jbyte* _ct_bot; - unsigned* _cur_card_count_histo; - unsigned* _cum_card_count_histo; + bool _use_cache; + bool _def_use_cache; + + size_t _n_periods; // Used as clearing epoch + + // An evicting cache of the number of times each card + // is accessed. Reduces, but does not eliminate, the amount + // of duplicated processing of dirty cards. + + enum SomePrivateConstants { + epoch_bits = 32, + card_num_shift = epoch_bits, + epoch_mask = AllBits, + card_num_mask = AllBits, + + // The initial cache size is approximately this fraction + // of a maximal cache (i.e. 
the size needed for all cards + // in the heap) + InitialCacheFraction = 512 + }; + + const static julong card_num_mask_in_place = + (julong) card_num_mask << card_num_shift; + + typedef struct { + julong _value; // | card_num | epoch | + } CardEpochCacheEntry; + + julong make_epoch_entry(unsigned int card_num, unsigned int epoch) { + assert(0 <= card_num && card_num < _max_n_card_counts, "Bounds"); + assert(0 <= epoch && epoch <= _n_periods, "must be"); + + return ((julong) card_num << card_num_shift) | epoch; + } + + unsigned int extract_epoch(julong v) { + return (v & epoch_mask); + } + + unsigned int extract_card_num(julong v) { + return (v & card_num_mask_in_place) >> card_num_shift; + } + + typedef struct { + unsigned char _count; + unsigned char _evict_count; + } CardCountCacheEntry; + + CardCountCacheEntry* _card_counts; + CardEpochCacheEntry* _card_epochs; + + // The current number of buckets in the card count cache + unsigned _n_card_counts; + + // The max number of buckets required for the number of + // cards for the entire reserved heap + unsigned _max_n_card_counts; + + // Possible sizes of the cache: odd primes that roughly double in size. + // (See jvmtiTagMap.cpp). + static int _cc_cache_sizes[]; + + // The index in _cc_cache_sizes corresponding to the size of + // _card_counts. 
+ int _cache_size_index; + + bool _expand_card_counts; + + const jbyte* _ct_bot; jbyte** _hot_cache; int _hot_cache_size; @@ -50,12 +109,37 @@ class ConcurrentG1Refine: public CHeapObj { int _hot_cache_par_chunk_size; volatile int _hot_cache_par_claimed_idx; + // Needed to workaround 6817995 + CardTableModRefBS* _ct_bs; + G1CollectedHeap* _g1h; + + // Expands the array that holds the card counts to the next size up + void expand_card_count_cache(); + + // hash a given key (index of card_ptr) with the specified size + static unsigned int hash(size_t key, int size) { + return (unsigned int) key % size; + } + + // hash a given key (index of card_ptr) + unsigned int hash(size_t key) { + return hash(key, _n_card_counts); + } + + unsigned ptr_2_card_num(jbyte* card_ptr) { + return (unsigned) (card_ptr - _ct_bot); + } + + jbyte* card_num_2_ptr(unsigned card_num) { + return (jbyte*) (_ct_bot + card_num); + } + // Returns the count of this card after incrementing it. - int add_card_count(jbyte* card_ptr); + jbyte* add_card_count(jbyte* card_ptr, int* count, bool* defer); + + // Returns true if this card is in a young region + bool is_young_card(jbyte* card_ptr); - void print_card_count_histo_range(unsigned* histo, int from, int to, - float& cum_card_pct, - float& cum_travs_pct); public: ConcurrentG1Refine(); ~ConcurrentG1Refine(); @@ -69,7 +153,7 @@ class ConcurrentG1Refine: public CHeapObj { // If this is the first entry for the slot, writes into the cache and // returns NULL. If it causes an eviction, returns the evicted pointer. // Otherwise, its a cache hit, and returns NULL. - jbyte* cache_insert(jbyte* card_ptr); + jbyte* cache_insert(jbyte* card_ptr, bool* defer); // Process the cached entries. 
void clean_up_cache(int worker_i, G1RemSet* g1rs); @@ -93,7 +177,6 @@ class ConcurrentG1Refine: public CHeapObj { } void clear_and_record_card_counts(); - void print_final_card_counts(); static size_t thread_num(); }; diff --git a/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp b/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp index 86872d6e0..8b75a1913 100644 --- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp +++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp @@ -2414,8 +2414,6 @@ void G1CollectedHeap::gc_threads_do(ThreadClosure* tc) const { } void G1CollectedHeap::print_tracing_info() const { - concurrent_g1_refine()->print_final_card_counts(); - // We'll overload this to mean "trace GC pause statistics." if (TraceGen0Time || TraceGen1Time) { // The "G1CollectorPolicy" is keeping track of these stats, so delegate diff --git a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp index bb219c59c..c0ddd2d4d 100644 --- a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp +++ b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp @@ -94,7 +94,14 @@ G1CollectorPolicy::G1CollectorPolicy() : _summary(new Summary()), _abandoned_summary(new AbandonedSummary()), +#ifndef PRODUCT _cur_clear_ct_time_ms(0.0), + _min_clear_cc_time_ms(-1.0), + _max_clear_cc_time_ms(-1.0), + _cur_clear_cc_time_ms(0.0), + _cum_clear_cc_time_ms(0.0), + _num_cc_clears(0L), +#endif _region_num_young(0), _region_num_tenured(0), @@ -1648,6 +1655,15 @@ void G1CollectorPolicy::record_collection_pause_end(bool abandoned) { print_stats(1, "Object Copying", obj_copy_time); } } +#ifndef PRODUCT + print_stats(1, "Cur Clear CC", _cur_clear_cc_time_ms); + print_stats(1, "Cum Clear CC", _cum_clear_cc_time_ms); + print_stats(1, "Min Clear CC", _min_clear_cc_time_ms); + print_stats(1, "Max Clear CC", _max_clear_cc_time_ms); + if (_num_cc_clears > 0) { + print_stats(1, "Avg Clear CC", _cum_clear_cc_time_ms / 
((double)_num_cc_clears)); + } +#endif print_stats(1, "Other", other_time_ms); for (int i = 0; i < _aux_num; ++i) { if (_cur_aux_times_set[i]) { diff --git a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp index 8fa4ef4a8..3bb7023b2 100644 --- a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp +++ b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp @@ -112,7 +112,6 @@ protected: return 8*M; } - double _cur_collection_start_sec; size_t _cur_collection_pause_used_at_start_bytes; size_t _cur_collection_pause_used_regions_at_start; @@ -122,6 +121,15 @@ protected: double _cur_clear_ct_time_ms; bool _satb_drain_time_set; +#ifndef PRODUCT + // Card Table Count Cache stats + double _min_clear_cc_time_ms; // min + double _max_clear_cc_time_ms; // max + double _cur_clear_cc_time_ms; // clearing time during current pause + double _cum_clear_cc_time_ms; // cumulative clearing time + jlong _num_cc_clears; // number of times the card count cache has been cleared +#endif + double _cur_CH_strong_roots_end_sec; double _cur_CH_strong_roots_dur_ms; double _cur_G1_strong_roots_end_sec; @@ -931,6 +939,18 @@ public: _cur_aux_times_ms[i] += ms; } +#ifndef PRODUCT + void record_cc_clear_time(double ms) { + if (_min_clear_cc_time_ms < 0.0 || ms <= _min_clear_cc_time_ms) + _min_clear_cc_time_ms = ms; + if (_max_clear_cc_time_ms < 0.0 || ms >= _max_clear_cc_time_ms) + _max_clear_cc_time_ms = ms; + _cur_clear_cc_time_ms = ms; + _cum_clear_cc_time_ms += ms; + _num_cc_clears++; + } +#endif + // Record the fact that "bytes" bytes allocated in a region. 
void record_before_bytes(size_t bytes); void record_after_bytes(size_t bytes); diff --git a/src/share/vm/gc_implementation/g1/g1RemSet.cpp b/src/share/vm/gc_implementation/g1/g1RemSet.cpp index 9c3602617..224263e54 100644 --- a/src/share/vm/gc_implementation/g1/g1RemSet.cpp +++ b/src/share/vm/gc_implementation/g1/g1RemSet.cpp @@ -676,61 +676,12 @@ void HRInto_G1RemSet::scrub_par(BitMap* region_bm, BitMap* card_bm, static IntHistogram out_of_histo(50, 50); -void HRInto_G1RemSet::concurrentRefineOneCard(jbyte* card_ptr, int worker_i) { - // If the card is no longer dirty, nothing to do. - if (*card_ptr != CardTableModRefBS::dirty_card_val()) return; - +void HRInto_G1RemSet::concurrentRefineOneCard_impl(jbyte* card_ptr, int worker_i) { // Construct the region representing the card. HeapWord* start = _ct_bs->addr_for(card_ptr); // And find the region containing it. HeapRegion* r = _g1->heap_region_containing(start); - if (r == NULL) { - guarantee(_g1->is_in_permanent(start), "Or else where?"); - return; // Not in the G1 heap (might be in perm, for example.) - } - // Why do we have to check here whether a card is on a young region, - // given that we dirty young regions and, as a result, the - // post-barrier is supposed to filter them out and never to enqueue - // them? When we allocate a new region as the "allocation region" we - // actually dirty its cards after we release the lock, since card - // dirtying while holding the lock was a performance bottleneck. So, - // as a result, it is possible for other threads to actually - // allocate objects in the region (after the acquire the lock) - // before all the cards on the region are dirtied. This is unlikely, - // and it doesn't happen often, but it can happen. So, the extra - // check below filters out those cards. 
- if (r->is_young()) { - return; - } - // While we are processing RSet buffers during the collection, we - // actually don't want to scan any cards on the collection set, - // since we don't want to update remebered sets with entries that - // point into the collection set, given that live objects from the - // collection set are about to move and such entries will be stale - // very soon. This change also deals with a reliability issue which - // involves scanning a card in the collection set and coming across - // an array that was being chunked and looking malformed. Note, - // however, that if evacuation fails, we have to scan any objects - // that were not moved and create any missing entries. - if (r->in_collection_set()) { - return; - } - - // Should we defer it? - if (_cg1r->use_cache()) { - card_ptr = _cg1r->cache_insert(card_ptr); - // If it was not an eviction, nothing to do. - if (card_ptr == NULL) return; - - // OK, we have to reset the card start, region, etc. - start = _ct_bs->addr_for(card_ptr); - r = _g1->heap_region_containing(start); - if (r == NULL) { - guarantee(_g1->is_in_permanent(start), "Or else where?"); - return; // Not in the G1 heap (might be in perm, for example.) - } - guarantee(!r->is_young(), "It was evicted in the current minor cycle."); - } + assert(r != NULL, "unexpected null"); HeapWord* end = _ct_bs->addr_for(card_ptr + 1); MemRegion dirtyRegion(start, end); @@ -774,6 +725,106 @@ void HRInto_G1RemSet::concurrentRefineOneCard(jbyte* card_ptr, int worker_i) { } } +void HRInto_G1RemSet::concurrentRefineOneCard(jbyte* card_ptr, int worker_i) { + // If the card is no longer dirty, nothing to do. + if (*card_ptr != CardTableModRefBS::dirty_card_val()) return; + + // Construct the region representing the card. + HeapWord* start = _ct_bs->addr_for(card_ptr); + // And find the region containing it. 
+ HeapRegion* r = _g1->heap_region_containing(start); + if (r == NULL) { + guarantee(_g1->is_in_permanent(start), "Or else where?"); + return; // Not in the G1 heap (might be in perm, for example.) + } + // Why do we have to check here whether a card is on a young region, + // given that we dirty young regions and, as a result, the + // post-barrier is supposed to filter them out and never to enqueue + // them? When we allocate a new region as the "allocation region" we + // actually dirty its cards after we release the lock, since card + // dirtying while holding the lock was a performance bottleneck. So, + // as a result, it is possible for other threads to actually + // allocate objects in the region (after they acquire the lock) + // before all the cards on the region are dirtied. This is unlikely, + // and it doesn't happen often, but it can happen. So, the extra + // check below filters out those cards. + if (r->is_young()) { + return; + } + // While we are processing RSet buffers during the collection, we + // actually don't want to scan any cards on the collection set, + // since we don't want to update remembered sets with entries that + // point into the collection set, given that live objects from the + // collection set are about to move and such entries will be stale + // very soon. This change also deals with a reliability issue which + // involves scanning a card in the collection set and coming across + // an array that was being chunked and looking malformed. Note, + // however, that if evacuation fails, we have to scan any objects + // that were not moved and create any missing entries. + if (r->in_collection_set()) { + return; + } + + // Should we defer processing the card? 
+ // + // Previously the result from the cache_insert call would be + // either card_ptr (implying that card_ptr was currently "cold"), + // null (meaning we had inserted the card ptr into the "hot" + // cache, which had some headroom), or a "hot" card ptr + // extracted from the "hot" cache. + // + // Now that the _card_counts cache in the ConcurrentG1Refine + // instance is an evicting hash table, the result we get back + // could be from evicting the card ptr in an already occupied + // bucket (in which case we have replaced the card ptr in the + // bucket with card_ptr and "defer" is set to false). To avoid + // having a data structure (updates to which would need a lock) + // to hold these unprocessed dirty cards, we need to immediately + // process card_ptr. The actions needed to be taken on return + // from cache_insert are summarized in the following table: + // + // res defer action + // -------------------------------------------------------------- + // null false card evicted from _card_counts & replaced with + // card_ptr; evicted ptr added to hot cache. + // No need to process res; immediately process card_ptr + // + // null true card not evicted from _card_counts; card_ptr added + // to hot cache. + // Nothing to do. + // + // non-null false card evicted from _card_counts & replaced with + // card_ptr; evicted ptr is currently "cold" or + // caused an eviction from the hot cache. + // Immediately process res; process card_ptr. + // + // non-null true card not evicted from _card_counts; card_ptr is + // currently cold, or caused an eviction from hot + // cache. + // Immediately process res; no need to process card_ptr. 
+ + jbyte* res = card_ptr; + bool defer = false; + if (_cg1r->use_cache()) { + jbyte* res = _cg1r->cache_insert(card_ptr, &defer); + if (res != NULL && (res != card_ptr || defer)) { + start = _ct_bs->addr_for(res); + r = _g1->heap_region_containing(start); + if (r == NULL) { + assert(_g1->is_in_permanent(start), "Or else where?"); + } else { + guarantee(!r->is_young(), "It was evicted in the current minor cycle."); + // Process card pointer we get back from the hot card cache + concurrentRefineOneCard_impl(res, worker_i); + } + } + } + + if (!defer) { + concurrentRefineOneCard_impl(card_ptr, worker_i); + } +} + class HRRSStatsIter: public HeapRegionClosure { size_t _occupied; size_t _total_mem_sz; diff --git a/src/share/vm/gc_implementation/g1/g1RemSet.hpp b/src/share/vm/gc_implementation/g1/g1RemSet.hpp index 60ba420d8..c193157fc 100644 --- a/src/share/vm/gc_implementation/g1/g1RemSet.hpp +++ b/src/share/vm/gc_implementation/g1/g1RemSet.hpp @@ -157,6 +157,10 @@ protected: } } + // The routine that performs the actual work of refining a dirty + // card. 
+ void concurrentRefineOneCard_impl(jbyte* card_ptr, int worker_i); + protected: template void write_ref_nv(HeapRegion* from, T* p); template void par_write_ref_nv(HeapRegion* from, T* p, int tid); diff --git a/src/share/vm/gc_implementation/g1/g1_globals.hpp b/src/share/vm/gc_implementation/g1/g1_globals.hpp index c7748f0d9..e3d154274 100644 --- a/src/share/vm/gc_implementation/g1/g1_globals.hpp +++ b/src/share/vm/gc_implementation/g1/g1_globals.hpp @@ -187,10 +187,6 @@ develop(intx, G1ConcRSLogCacheSize, 10, \ "Log base 2 of the length of conc RS hot-card cache.") \ \ - develop(bool, G1ConcRSCountTraversals, false, \ - "If true, gather data about the number of times CR traverses " \ - "cards ") \ - \ develop(intx, G1ConcRSHotCardLimit, 4, \ "The threshold that defines (>=) a hot card.") \ \ @@ -264,6 +260,10 @@ \ product(uintx, G1ParallelRSetThreads, 0, \ "If non-0 is the number of parallel rem set update threads, " \ - "otherwise the value is determined ergonomically.") + "otherwise the value is determined ergonomically.") \ + \ + develop(intx, G1CardCountCacheExpandThreshold, 16, \ + "Expand the card count cache if the number of collisions for " \ + "a particular entry exceeds this value.") G1_FLAGS(DECLARE_DEVELOPER_FLAG, DECLARE_PD_DEVELOPER_FLAG, DECLARE_PRODUCT_FLAG, DECLARE_PD_PRODUCT_FLAG, DECLARE_DIAGNOSTIC_FLAG, DECLARE_EXPERIMENTAL_FLAG, DECLARE_NOTPRODUCT_FLAG, DECLARE_MANAGEABLE_FLAG, DECLARE_PRODUCT_RW_FLAG) diff --git a/src/share/vm/gc_implementation/includeDB_gc_g1 b/src/share/vm/gc_implementation/includeDB_gc_g1 index 7e2df3fc8..d9d151dc5 100644 --- a/src/share/vm/gc_implementation/includeDB_gc_g1 +++ b/src/share/vm/gc_implementation/includeDB_gc_g1 @@ -45,11 +45,14 @@ concurrentG1Refine.cpp concurrentG1Refine.hpp concurrentG1Refine.cpp concurrentG1RefineThread.hpp concurrentG1Refine.cpp copy.hpp concurrentG1Refine.cpp g1CollectedHeap.inline.hpp +concurrentG1Refine.cpp g1CollectorPolicy.hpp concurrentG1Refine.cpp g1RemSet.hpp 
concurrentG1Refine.cpp space.inline.hpp +concurrentG1Refine.cpp heapRegionSeq.inline.hpp concurrentG1Refine.hpp globalDefinitions.hpp concurrentG1Refine.hpp allocation.hpp +concurrentG1Refine.hpp cardTableModRefBS.hpp concurrentG1Refine.hpp thread.hpp concurrentG1RefineThread.cpp concurrentG1Refine.hpp -- GitLab