Commit c046dd73 authored by johnc

6819077: G1: first GC thread coming late into the GC.

Summary: The first worker thread is delayed when entering the GC because it clears the card count table used to identify hot cards. Replace the card count table with a dynamically sized, evicting hash table that includes an epoch-based counter.
Reviewed-by: iveresov, tonyp
Parent: 8763012d
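For orientation before the diff: the heart of the change is that per-card counts no longer live in a heap-sized array that must be zeroed at the start of every GC. Each bucket of a small hash table carries an epoch stamp, and a bucket whose stamp is stale behaves as if it were empty, so "clearing" the whole cache costs a single epoch increment. Below is a minimal, self-contained sketch of that idea; it uses `std::atomic` rather than HotSpot's `Atomic::cmpxchg`, and all names are illustrative, not from the patch.

```cpp
#include <atomic>
#include <cstdint>
#include <cstddef>

struct CountEntry { uint8_t count; uint8_t evict_count; };

class CardCountCache {
  static const size_t N = 16381;               // odd prime, as in the patch
  CountEntry counts[N] = {};
  std::atomic<uint64_t> epochs[N] = {};        // | card_num (32) | epoch (32) |
  uint32_t cur_epoch = 1;

  static uint64_t pack(uint32_t card, uint32_t epoch) {
    return ((uint64_t)card << 32) | epoch;
  }

public:
  // Returns the count seen for 'card' before this access. A bucket whose
  // epoch stamp is stale behaves as if it had just been zeroed; the counts
  // themselves are updated racily, exactly as the patch tolerates.
  int add(uint32_t card) {
    size_t b = card % N;
    uint64_t want = pack(card, cur_epoch);
    uint64_t seen = epochs[b].load(std::memory_order_relaxed);
    if (seen != want) {
      // Stale epoch or a different card: CAS so that only one thread
      // re-initializes the bucket for this (card, epoch) pair.
      if (epochs[b].compare_exchange_strong(seen, want)) {
        counts[b].count = 1;
        counts[b].evict_count = 0;
        return 0;
      }
      return counts[b].count;                  // another thread claimed it
    }
    int old = counts[b].count;
    if (old < 255) counts[b].count++;
    return old;
  }

  // Invalidating every bucket between collections is O(1).
  void clear() { ++cur_epoch; }
};
```

This is why the first worker thread no longer arrives late: the O(cards-in-heap) memset is gone from the GC prologue.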
@@ -25,11 +25,21 @@
 #include "incls/_precompiled.incl"
 #include "incls/_concurrentG1Refine.cpp.incl"
 
+// Possible sizes for the card counts cache: odd primes that roughly double in size.
+// (See jvmtiTagMap.cpp).
+int ConcurrentG1Refine::_cc_cache_sizes[] = {
+    16381,    32771,    76831,    150001,   307261,
+    614563,   1228891,  2457733,  4915219,  9830479,
+    19660831, 39321619, 78643219, 157286461, -1
+  };
+
 ConcurrentG1Refine::ConcurrentG1Refine() :
-  _card_counts(NULL), _cur_card_count_histo(NULL), _cum_card_count_histo(NULL),
+  _card_counts(NULL), _card_epochs(NULL),
+  _n_card_counts(0), _max_n_card_counts(0),
+  _cache_size_index(0), _expand_card_counts(false),
   _hot_cache(NULL),
   _def_use_cache(false), _use_cache(false),
-  _n_periods(0), _total_cards(0), _total_travs(0),
+  _n_periods(0),
   _threads(NULL), _n_threads(0)
 {
   if (G1ConcRefine) {
@@ -57,26 +67,39 @@ size_t ConcurrentG1Refine::thread_num() {
 }
 
 void ConcurrentG1Refine::init() {
-  G1CollectedHeap* g1h = G1CollectedHeap::heap();
-  if (G1ConcRSLogCacheSize > 0 || G1ConcRSCountTraversals) {
-    _n_card_counts =
-      (unsigned) (g1h->g1_reserved_obj_bytes() >> CardTableModRefBS::card_shift);
-    _card_counts = NEW_C_HEAP_ARRAY(unsigned char, _n_card_counts);
-    for (size_t i = 0; i < _n_card_counts; i++) _card_counts[i] = 0;
-    ModRefBarrierSet* bs = g1h->mr_bs();
-    guarantee(bs->is_a(BarrierSet::CardTableModRef), "Precondition");
-    CardTableModRefBS* ctbs = (CardTableModRefBS*)bs;
-    _ct_bot = ctbs->byte_for_const(g1h->reserved_region().start());
-    if (G1ConcRSCountTraversals) {
-      _cur_card_count_histo = NEW_C_HEAP_ARRAY(unsigned, 256);
-      _cum_card_count_histo = NEW_C_HEAP_ARRAY(unsigned, 256);
-      for (int i = 0; i < 256; i++) {
-        _cur_card_count_histo[i] = 0;
-        _cum_card_count_histo[i] = 0;
-      }
-    }
-  }
   if (G1ConcRSLogCacheSize > 0) {
+    _g1h = G1CollectedHeap::heap();
+    _max_n_card_counts =
+      (unsigned) (_g1h->g1_reserved_obj_bytes() >> CardTableModRefBS::card_shift);
+
+    size_t max_card_num = ((size_t)1 << (sizeof(unsigned)*BitsPerByte-1)) - 1;
+    guarantee(_max_n_card_counts < max_card_num, "card_num representation");
+
+    int desired = _max_n_card_counts / InitialCacheFraction;
+    for (_cache_size_index = 0;
+        _cc_cache_sizes[_cache_size_index] >= 0; _cache_size_index++) {
+      if (_cc_cache_sizes[_cache_size_index] >= desired) break;
+    }
+    _cache_size_index = MAX2(0, (_cache_size_index - 1));
+
+    int initial_size = _cc_cache_sizes[_cache_size_index];
+    if (initial_size < 0) initial_size = _max_n_card_counts;
+
+    // Make sure we don't go bigger than we will ever need
+    _n_card_counts = MIN2((unsigned) initial_size, _max_n_card_counts);
+
+    _card_counts = NEW_C_HEAP_ARRAY(CardCountCacheEntry, _n_card_counts);
+    _card_epochs = NEW_C_HEAP_ARRAY(CardEpochCacheEntry, _n_card_counts);
+
+    Copy::fill_to_bytes(&_card_counts[0],
+                        _n_card_counts * sizeof(CardCountCacheEntry));
+    Copy::fill_to_bytes(&_card_epochs[0],
+                        _n_card_counts * sizeof(CardEpochCacheEntry));
+
+    ModRefBarrierSet* bs = _g1h->mr_bs();
+    guarantee(bs->is_a(BarrierSet::CardTableModRef), "Precondition");
+    _ct_bs = (CardTableModRefBS*)bs;
+    _ct_bot = _ct_bs->byte_for_const(_g1h->reserved_region().start());
+
     _def_use_cache = true;
     _use_cache = true;
     _hot_cache_size = (1 << G1ConcRSLogCacheSize);
@@ -86,7 +109,7 @@ void ConcurrentG1Refine::init() {
 
     // For refining the cards in the hot cache in parallel
     int n_workers = (ParallelGCThreads > 0 ?
-                        g1h->workers()->total_workers() : 1);
+                        _g1h->workers()->total_workers() : 1);
     _hot_cache_par_chunk_size = MAX2(1, _hot_cache_size / n_workers);
     _hot_cache_par_claimed_idx = 0;
   }
@@ -101,15 +124,11 @@ void ConcurrentG1Refine::stop() {
 }
 
 ConcurrentG1Refine::~ConcurrentG1Refine() {
-  if (G1ConcRSLogCacheSize > 0 || G1ConcRSCountTraversals) {
-    assert(_card_counts != NULL, "Logic");
-    FREE_C_HEAP_ARRAY(unsigned char, _card_counts);
-    assert(_cur_card_count_histo != NULL, "Logic");
-    FREE_C_HEAP_ARRAY(unsigned, _cur_card_count_histo);
-    assert(_cum_card_count_histo != NULL, "Logic");
-    FREE_C_HEAP_ARRAY(unsigned, _cum_card_count_histo);
-  }
   if (G1ConcRSLogCacheSize > 0) {
+    assert(_card_counts != NULL, "Logic");
+    FREE_C_HEAP_ARRAY(CardCountCacheEntry, _card_counts);
+    assert(_card_epochs != NULL, "Logic");
+    FREE_C_HEAP_ARRAY(CardEpochCacheEntry, _card_epochs);
     assert(_hot_cache != NULL, "Logic");
     FREE_C_HEAP_ARRAY(jbyte*, _hot_cache);
   }
@@ -129,42 +148,165 @@ void ConcurrentG1Refine::threads_do(ThreadClosure *tc) {
   }
 }
 
+bool ConcurrentG1Refine::is_young_card(jbyte* card_ptr) {
+  HeapWord* start = _ct_bs->addr_for(card_ptr);
+  HeapRegion* r = _g1h->heap_region_containing(start);
+  if (r != NULL && r->is_young()) {
+    return true;
+  }
+  // This card is not associated with a heap region
+  // so can't be young.
+  return false;
+}
+
-int ConcurrentG1Refine::add_card_count(jbyte* card_ptr) {
-  size_t card_num = (card_ptr - _ct_bot);
-  guarantee(0 <= card_num && card_num < _n_card_counts, "Bounds");
-  unsigned char cnt = _card_counts[card_num];
-  if (cnt < 255) _card_counts[card_num]++;
-  return cnt;
-}
+jbyte* ConcurrentG1Refine::add_card_count(jbyte* card_ptr, int* count, bool* defer) {
+  unsigned new_card_num = ptr_2_card_num(card_ptr);
+  unsigned bucket = hash(new_card_num);
+  assert(0 <= bucket && bucket < _n_card_counts, "Bounds");
+
+  CardCountCacheEntry* count_ptr = &_card_counts[bucket];
+  CardEpochCacheEntry* epoch_ptr = &_card_epochs[bucket];
+
+  // We have to construct a new entry if we haven't updated the counts
+  // during the current period, or if the count was updated for a
+  // different card number.
+  unsigned int new_epoch = (unsigned int) _n_periods;
+  julong new_epoch_entry = make_epoch_entry(new_card_num, new_epoch);
+
+  while (true) {
+    // Fetch the previous epoch value
+    julong prev_epoch_entry = epoch_ptr->_value;
+    julong cas_res;
+
+    if (extract_epoch(prev_epoch_entry) != new_epoch) {
+      // This entry has not yet been updated during this period.
+      // Note: we update the epoch value atomically to ensure
+      // that there is only one winner that updates the cached
+      // card_ptr value even though all the refine threads share
+      // the same epoch value.
+
+      cas_res = (julong) Atomic::cmpxchg((jlong) new_epoch_entry,
+                                         (volatile jlong*)&epoch_ptr->_value,
+                                         (jlong) prev_epoch_entry);
+
+      if (cas_res == prev_epoch_entry) {
+        // We have successfully won the race to update the
+        // epoch and card_num value. Make it look like the
+        // count and eviction count were previously cleared.
+        count_ptr->_count = 1;
+        count_ptr->_evict_count = 0;
+        *count = 0;
+        // We can defer the processing of card_ptr
+        *defer = true;
+        return card_ptr;
+      }
+
+      // We did not win the race to update the epoch field, so some other
+      // thread must have done it. The value that gets returned by CAS
+      // should be the new epoch value.
+      assert(extract_epoch(cas_res) == new_epoch, "unexpected epoch");
+      // We could 'continue' here or just re-read the previous epoch value
+      prev_epoch_entry = epoch_ptr->_value;
+    }
+
+    // The epoch entry for card_ptr has been updated during this period.
+    unsigned old_card_num = extract_card_num(prev_epoch_entry);
+
+    // The card count that will be returned to caller
+    *count = count_ptr->_count;
+
+    // Are we updating the count for the same card?
+    if (new_card_num == old_card_num) {
+      // Same card - just update the count. We could have more than one
+      // thread racing to update count for the current card. It should be
+      // OK not to use a CAS as the only penalty should be some missed
+      // increments of the count which delays identifying the card as "hot".
+      if (*count < max_jubyte) count_ptr->_count++;
+      // We can defer the processing of card_ptr
+      *defer = true;
+      return card_ptr;
+    }
+
+    // Different card - evict old card info
+    if (count_ptr->_evict_count < max_jubyte) count_ptr->_evict_count++;
+    if (count_ptr->_evict_count > G1CardCountCacheExpandThreshold) {
+      // Trigger a resize the next time we clear
+      _expand_card_counts = true;
+    }
+
+    cas_res = (julong) Atomic::cmpxchg((jlong) new_epoch_entry,
+                                       (volatile jlong*)&epoch_ptr->_value,
+                                       (jlong) prev_epoch_entry);
+
+    if (cas_res == prev_epoch_entry) {
+      // We successfully updated the card num value in the epoch entry
+      count_ptr->_count = 0; // initialize counter for new card num
+
+      // Even though the region containing the card at old_card_num was not
+      // in the young list when old_card_num was recorded in the epoch
+      // cache it could have been added to the free list and subsequently
+      // added to the young list in the intervening time. If the evicted
+      // card is in a young region just return the card_ptr and the evicted
+      // card will not be cleaned. See CR 6817995.
+
+      jbyte* old_card_ptr = card_num_2_ptr(old_card_num);
+      if (is_young_card(old_card_ptr)) {
+        *count = 0;
+        // We can defer the processing of card_ptr
+        *defer = true;
+        return card_ptr;
+      }
+
+      // We do not want to defer processing of card_ptr in this case
+      // (we need to refine old_card_ptr and card_ptr)
+      *defer = false;
+      return old_card_ptr;
+    }
+    // Someone else beat us - try again.
+  }
+}
+
-jbyte* ConcurrentG1Refine::cache_insert(jbyte* card_ptr) {
-  int count = add_card_count(card_ptr);
-  // Count previously unvisited cards.
-  if (count == 0) _total_cards++;
-  // We'll assume a traversal unless we store it in the cache.
-  _total_travs++;
+jbyte* ConcurrentG1Refine::cache_insert(jbyte* card_ptr, bool* defer) {
+  int count;
+  jbyte* cached_ptr = add_card_count(card_ptr, &count, defer);
+  assert(cached_ptr != NULL, "bad cached card ptr");
+  assert(!is_young_card(cached_ptr), "shouldn't get a card in young region");
+
+  // The card pointer we obtained from card count cache is not hot
+  // so do not store it in the cache; return it for immediate
+  // refining.
   if (count < G1ConcRSHotCardLimit) {
-    return card_ptr;
+    return cached_ptr;
   }
-  // Otherwise, it's hot.
+
+  // Otherwise, the pointer we got from the _card_counts is hot.
   jbyte* res = NULL;
   MutexLockerEx x(HotCardCache_lock, Mutex::_no_safepoint_check_flag);
   if (_n_hot == _hot_cache_size) {
-    _total_travs++;
     res = _hot_cache[_hot_cache_idx];
     _n_hot--;
   }
   // Now _n_hot < _hot_cache_size, and we can insert at _hot_cache_idx.
-  _hot_cache[_hot_cache_idx] = card_ptr;
+  _hot_cache[_hot_cache_idx] = cached_ptr;
   _hot_cache_idx++;
   if (_hot_cache_idx == _hot_cache_size) _hot_cache_idx = 0;
   _n_hot++;
+
+  if (res != NULL) {
+    // Even though the region containing res was not in the young list
+    // when it was recorded in the hot cache it could have been added
+    // to the free list and subsequently added to the young list in
+    // the intervening time. If res is in a young region, return NULL
+    // so that res is not cleaned. See CR 6817995.
+    if (is_young_card(res)) {
+      res = NULL;
+    }
+  }
+
   return res;
 }
 
 void ConcurrentG1Refine::clean_up_cache(int worker_i, G1RemSet* g1rs) {
   assert(!use_cache(), "cache should be disabled");
   int start_idx;
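The hot cache reached through `cache_insert` above is structurally unchanged by this patch: a fixed-size ring buffer where, once full, every insertion evicts the oldest entry for immediate refinement. A small generic sketch of that buffer follows; `EvictingRing` is a hypothetical name, not a type from the patch.

```cpp
#include <cstddef>

// T is the card pointer type; N the hot cache capacity.
template <typename T, size_t N>
class EvictingRing {
  T      slots[N] = {};
  size_t idx      = 0;   // next insertion point
  size_t n_hot    = 0;   // current occupancy

public:
  // Returns the evicted element, or T() if there was still room.
  T insert(T v) {
    T evicted = T();
    if (n_hot == N) {          // full: the slot at idx holds the oldest entry
      evicted = slots[idx];
      n_hot--;
    }
    slots[idx] = v;
    if (++idx == N) idx = 0;   // wrap around
    n_hot++;
    return evicted;
  }
};

// With the default G1ConcRSLogCacheSize of 10, the real cache would
// correspond to EvictingRing<jbyte*, 1 << 10>.
```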
@@ -186,114 +328,52 @@ void ConcurrentG1Refine::clean_up_cache(int worker_i, G1RemSet* g1rs) {
   }
 }
 
-void ConcurrentG1Refine::clear_and_record_card_counts() {
-  if (G1ConcRSLogCacheSize == 0 && !G1ConcRSCountTraversals) return;
-  _n_periods++;
-  if (G1ConcRSCountTraversals) {
-    for (size_t i = 0; i < _n_card_counts; i++) {
-      unsigned char bucket = _card_counts[i];
-      _cur_card_count_histo[bucket]++;
-      _card_counts[i] = 0;
-    }
-    gclog_or_tty->print_cr("Card counts:");
-    for (int i = 0; i < 256; i++) {
-      if (_cur_card_count_histo[i] > 0) {
-        gclog_or_tty->print_cr("  %3d: %9d", i, _cur_card_count_histo[i]);
-        _cum_card_count_histo[i] += _cur_card_count_histo[i];
-        _cur_card_count_histo[i] = 0;
-      }
-    }
-  } else {
-    assert(G1ConcRSLogCacheSize > 0, "Logic");
-    Copy::fill_to_words((HeapWord*)(&_card_counts[0]),
-                        _n_card_counts / HeapWordSize);
-  }
-}
-
-void
-ConcurrentG1Refine::
-print_card_count_histo_range(unsigned* histo, int from, int to,
-                             float& cum_card_pct,
-                             float& cum_travs_pct) {
-  unsigned cards = 0;
-  unsigned travs = 0;
-  guarantee(to <= 256, "Precondition");
-  for (int i = from; i < to-1; i++) {
-    cards += histo[i];
-    travs += histo[i] * i;
-  }
-  if (to == 256) {
-    unsigned histo_card_sum = 0;
-    unsigned histo_trav_sum = 0;
-    for (int i = 1; i < 255; i++) {
-      histo_trav_sum += histo[i] * i;
-    }
-    cards += histo[255];
-    // correct traversals for the last one.
-    unsigned travs_255 = (unsigned) (_total_travs - histo_trav_sum);
-    travs += travs_255;
-  } else {
-    cards += histo[to-1];
-    travs += histo[to-1] * (to-1);
-  }
-  float fperiods = (float)_n_periods;
-  float f_tot_cards = (float)_total_cards/fperiods;
-  float f_tot_travs = (float)_total_travs/fperiods;
-  if (cards > 0) {
-    float fcards = (float)cards/fperiods;
-    float ftravs = (float)travs/fperiods;
-    if (to == 256) {
-      gclog_or_tty->print("  %4d-     %10.2f%10.2f", from, fcards, ftravs);
-    } else {
-      gclog_or_tty->print("  %4d-%4d %10.2f%10.2f", from, to-1, fcards, ftravs);
-    }
-    float pct_cards = fcards*100.0/f_tot_cards;
-    cum_card_pct += pct_cards;
-    float pct_travs = ftravs*100.0/f_tot_travs;
-    cum_travs_pct += pct_travs;
-    gclog_or_tty->print_cr("%10.2f%10.2f%10.2f%10.2f",
-                           pct_cards, cum_card_pct,
-                           pct_travs, cum_travs_pct);
-  }
-}
-
-void ConcurrentG1Refine::print_final_card_counts() {
-  if (!G1ConcRSCountTraversals) return;
-
-  gclog_or_tty->print_cr("Did %d total traversals of %d distinct cards.",
-                         _total_travs, _total_cards);
-  float fperiods = (float)_n_periods;
-  gclog_or_tty->print_cr("  This is an average of %8.2f traversals, %8.2f cards, "
-                         "per collection.", (float)_total_travs/fperiods,
-                         (float)_total_cards/fperiods);
-  gclog_or_tty->print_cr("  This is an average of %8.2f traversals/distinct "
-                         "dirty card.\n",
-                         _total_cards > 0 ?
-                         (float)_total_travs/(float)_total_cards : 0.0);
-
-  gclog_or_tty->print_cr("Histogram:\n\n%10s %10s%10s%10s%10s%10s%10s",
-                         "range", "# cards", "# travs", "% cards", "(cum)",
-                         "% travs", "(cum)");
-  gclog_or_tty->print_cr("------------------------------------------------------------"
-                         "-------------");
-  float cum_cards_pct = 0.0;
-  float cum_travs_pct = 0.0;
-  for (int i = 1; i < 10; i++) {
-    print_card_count_histo_range(_cum_card_count_histo, i, i+1,
-                                 cum_cards_pct, cum_travs_pct);
-  }
-  for (int i = 10; i < 100; i += 10) {
-    print_card_count_histo_range(_cum_card_count_histo, i, i+10,
-                                 cum_cards_pct, cum_travs_pct);
-  }
-  print_card_count_histo_range(_cum_card_count_histo, 100, 150,
-                               cum_cards_pct, cum_travs_pct);
-  print_card_count_histo_range(_cum_card_count_histo, 150, 200,
-                               cum_cards_pct, cum_travs_pct);
-  print_card_count_histo_range(_cum_card_count_histo, 150, 255,
-                               cum_cards_pct, cum_travs_pct);
-  print_card_count_histo_range(_cum_card_count_histo, 255, 256,
-                               cum_cards_pct, cum_travs_pct);
-}
+void ConcurrentG1Refine::expand_card_count_cache() {
+  if (_n_card_counts < _max_n_card_counts) {
+    int new_idx = _cache_size_index+1;
+    int new_size = _cc_cache_sizes[new_idx];
+    if (new_size < 0) new_size = _max_n_card_counts;
+
+    // Make sure we don't go bigger than we will ever need
+    new_size = MIN2((unsigned) new_size, _max_n_card_counts);
+
+    // Expand the card count and card epoch tables
+    if (new_size > (int)_n_card_counts) {
+      // We can just free and allocate a new array as we're
+      // not interested in preserving the contents
+      assert(_card_counts != NULL, "Logic!");
+      assert(_card_epochs != NULL, "Logic!");
+      FREE_C_HEAP_ARRAY(CardCountCacheEntry, _card_counts);
+      FREE_C_HEAP_ARRAY(CardEpochCacheEntry, _card_epochs);
+      _n_card_counts = new_size;
+      _card_counts = NEW_C_HEAP_ARRAY(CardCountCacheEntry, _n_card_counts);
+      _card_epochs = NEW_C_HEAP_ARRAY(CardEpochCacheEntry, _n_card_counts);
+      _cache_size_index = new_idx;
+    }
+  }
+}
+
+void ConcurrentG1Refine::clear_and_record_card_counts() {
+  if (G1ConcRSLogCacheSize == 0) return;
+
+#ifndef PRODUCT
+  double start = os::elapsedTime();
+#endif
+
+  if (_expand_card_counts) {
+    expand_card_count_cache();
+    _expand_card_counts = false;
+    // Only need to clear the epochs.
+    Copy::fill_to_bytes(&_card_epochs[0],
+                        _n_card_counts * sizeof(CardEpochCacheEntry));
+  }
+
+  int this_epoch = (int) _n_periods;
+  assert((this_epoch+1) <= max_jint, "too many periods");
+  // Update epoch
+  _n_periods++;
+
+#ifndef PRODUCT
+  double elapsed = os::elapsedTime() - start;
+  _g1h->g1_policy()->record_cc_clear_time(elapsed * 1000.0);
+#endif
+}
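To see how `init()` and `expand_card_count_cache()` walk the `_cc_cache_sizes` table, here is the selection logic in isolation with a worked example. The 512-byte card size is an assumption about this era of HotSpot (`CardTableModRefBS::card_shift` of 9); the helper name is illustrative.

```cpp
#include <cstdio>

// The same table the patch adds to concurrentG1Refine.cpp.
static const int cc_cache_sizes[] = {
    16381,    32771,    76831,    150001,   307261,
    614563,   1228891,  2457733,  4915219,  9830479,
    19660831, 39321619, 78643219, 157286461, -1 };

// Mirrors init(): find the first prime >= desired, then step back one
// slot so the cache starts small and grows on demand.
static int pick_initial_size(unsigned max_cards) {
  const unsigned InitialCacheFraction = 512;
  int desired = max_cards / InitialCacheFraction;
  int idx = 0;
  while (cc_cache_sizes[idx] >= 0 && cc_cache_sizes[idx] < desired) idx++;
  if (idx > 0) idx--;
  int size = cc_cache_sizes[idx];
  if (size < 0) size = (int)max_cards;            // ran off the table
  return size < (int)max_cards ? size : (int)max_cards;
}

int main() {
  // A 64 GB reserved heap / 512-byte cards = 2^27 cards;
  // desired = 2^27 / 512 = 262144; the first entry >= that is 307261
  // (index 4), and stepping back one slot gives 150001 buckets.
  printf("%d\n", pick_initial_size(1u << 27));    // prints 150001
  return 0;
}
```

Expansion then simply moves `_cache_size_index` one entry up the table whenever a bucket sees more than `G1CardCountCacheExpandThreshold` evictions in a period.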
@@ -29,18 +29,77 @@ class G1RemSet;
 
 class ConcurrentG1Refine: public CHeapObj {
   ConcurrentG1RefineThread** _threads;
   int _n_threads;
 
   // The cache for card refinement.
   bool     _use_cache;
   bool     _def_use_cache;
-  size_t _n_periods;
-  size_t _total_cards;
-  size_t _total_travs;
 
-  unsigned char* _card_counts;
-  unsigned _n_card_counts;
-  const jbyte* _ct_bot;
-  unsigned* _cur_card_count_histo;
-  unsigned* _cum_card_count_histo;
+  size_t _n_periods;    // Used as clearing epoch
+
+  // An evicting cache of the number of times each card
+  // is accessed. Reduces, but does not eliminate, the amount
+  // of duplicated processing of dirty cards.
+
+  enum SomePrivateConstants {
+    epoch_bits           = 32,
+    card_num_shift       = epoch_bits,
+    epoch_mask           = AllBits,
+    card_num_mask        = AllBits,
+
+    // The initial cache size is approximately this fraction
+    // of a maximal cache (i.e. the size needed for all cards
+    // in the heap)
+    InitialCacheFraction = 512
+  };
+
+  const static julong card_num_mask_in_place =
+                        (julong) card_num_mask << card_num_shift;
+
+  typedef struct {
+    julong _value;      // |  card_num  |  epoch  |
+  } CardEpochCacheEntry;
+
+  julong make_epoch_entry(unsigned int card_num, unsigned int epoch) {
+    assert(0 <= card_num && card_num < _max_n_card_counts, "Bounds");
+    assert(0 <= epoch && epoch <= _n_periods, "must be");
+
+    return ((julong) card_num << card_num_shift) | epoch;
+  }
+
+  unsigned int extract_epoch(julong v) {
+    return (v & epoch_mask);
+  }
+
+  unsigned int extract_card_num(julong v) {
+    return (v & card_num_mask_in_place) >> card_num_shift;
+  }
+
+  typedef struct {
+    unsigned char _count;
+    unsigned char _evict_count;
+  } CardCountCacheEntry;
+
+  CardCountCacheEntry* _card_counts;
+  CardEpochCacheEntry* _card_epochs;
+
+  // The current number of buckets in the card count cache
+  unsigned _n_card_counts;
+
+  // The max number of buckets required for the number of
+  // cards for the entire reserved heap
+  unsigned _max_n_card_counts;
+
+  // Possible sizes of the cache: odd primes that roughly double in size.
+  // (See jvmtiTagMap.cpp).
+  static int _cc_cache_sizes[];
+
+  // The index in _cc_cache_sizes corresponding to the size of
+  // _card_counts.
+  int _cache_size_index;
+
+  bool _expand_card_counts;
+
+  const jbyte* _ct_bot;
 
   jbyte** _hot_cache;
   int      _hot_cache_size;
@@ -50,12 +109,37 @@ class ConcurrentG1Refine: public CHeapObj {
   int      _hot_cache_par_chunk_size;
   volatile int _hot_cache_par_claimed_idx;
 
+  // Needed to workaround 6817995
+  CardTableModRefBS* _ct_bs;
+  G1CollectedHeap*   _g1h;
+
+  // Expands the array that holds the card counts to the next size up
+  void expand_card_count_cache();
+
+  // hash a given key (index of card_ptr) with the specified size
+  static unsigned int hash(size_t key, int size) {
+    return (unsigned int) key % size;
+  }
+
+  // hash a given key (index of card_ptr)
+  unsigned int hash(size_t key) {
+    return hash(key, _n_card_counts);
+  }
+
+  unsigned ptr_2_card_num(jbyte* card_ptr) {
+    return (unsigned) (card_ptr - _ct_bot);
+  }
+
+  jbyte* card_num_2_ptr(unsigned card_num) {
+    return (jbyte*) (_ct_bot + card_num);
+  }
+
   // Returns the count of this card after incrementing it.
-  int add_card_count(jbyte* card_ptr);
+  jbyte* add_card_count(jbyte* card_ptr, int* count, bool* defer);
+
+  // Returns true if this card is in a young region
+  bool is_young_card(jbyte* card_ptr);
 
-  void print_card_count_histo_range(unsigned* histo, int from, int to,
-                                    float& cum_card_pct,
-                                    float& cum_travs_pct);
  public:
   ConcurrentG1Refine();
   ~ConcurrentG1Refine();
 
@@ -69,7 +153,7 @@ class ConcurrentG1Refine: public CHeapObj {
   // If this is the first entry for the slot, writes into the cache and
   // returns NULL. If it causes an eviction, returns the evicted pointer.
   // Otherwise, it's a cache hit, and returns NULL.
-  jbyte* cache_insert(jbyte* card_ptr);
+  jbyte* cache_insert(jbyte* card_ptr, bool* defer);
 
   // Process the cached entries.
   void clean_up_cache(int worker_i, G1RemSet* g1rs);
 
@@ -93,7 +177,6 @@ class ConcurrentG1Refine: public CHeapObj {
   }
 
   void clear_and_record_card_counts();
-  void print_final_card_counts();
 
   static size_t thread_num();
 };
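A quick standalone check of the `CardEpochCacheEntry` layout declared above: card number in the high 32 bits, epoch in the low 32. The constants are restated locally for the sketch rather than taken from the header.

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const int      card_num_shift = 32;            // epoch_bits
  const uint64_t epoch_mask     = 0xFFFFFFFFull; // AllBits

  uint32_t card_num = 0xABCDEF;                  // some card index
  uint32_t epoch    = 42;                        // current _n_periods

  // make_epoch_entry
  uint64_t v = ((uint64_t)card_num << card_num_shift) | epoch;

  // extract_epoch / extract_card_num round-trip
  assert((uint32_t)(v & epoch_mask) == epoch);
  assert((uint32_t)(v >> card_num_shift) == card_num);

  // Staleness is detected by comparing the whole 64-bit value against
  // make_epoch_entry(card, current_epoch), so a single CAS atomically
  // claims both halves of the entry.
  return 0;
}
```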
@@ -2414,8 +2414,6 @@ void G1CollectedHeap::gc_threads_do(ThreadClosure* tc) const {
 }
 
 void G1CollectedHeap::print_tracing_info() const {
-  concurrent_g1_refine()->print_final_card_counts();
-
   // We'll overload this to mean "trace GC pause statistics."
   if (TraceGen0Time || TraceGen1Time) {
     // The "G1CollectorPolicy" is keeping track of these stats, so delegate
...
@@ -94,7 +94,14 @@ G1CollectorPolicy::G1CollectorPolicy() :
   _summary(new Summary()),
   _abandoned_summary(new AbandonedSummary()),
 
+#ifndef PRODUCT
   _cur_clear_ct_time_ms(0.0),
+  _min_clear_cc_time_ms(-1.0),
+  _max_clear_cc_time_ms(-1.0),
+  _cur_clear_cc_time_ms(0.0),
+  _cum_clear_cc_time_ms(0.0),
+  _num_cc_clears(0L),
+#endif
 
   _region_num_young(0),
   _region_num_tenured(0),
 
@@ -1648,6 +1655,15 @@ void G1CollectorPolicy::record_collection_pause_end(bool abandoned) {
       print_stats(1, "Object Copying", obj_copy_time);
     }
   }
+#ifndef PRODUCT
+  print_stats(1, "Cur Clear CC", _cur_clear_cc_time_ms);
+  print_stats(1, "Cum Clear CC", _cum_clear_cc_time_ms);
+  print_stats(1, "Min Clear CC", _min_clear_cc_time_ms);
+  print_stats(1, "Max Clear CC", _max_clear_cc_time_ms);
+  if (_num_cc_clears > 0) {
+    print_stats(1, "Avg Clear CC", _cum_clear_cc_time_ms / ((double)_num_cc_clears));
+  }
+#endif
   print_stats(1, "Other", other_time_ms);
   for (int i = 0; i < _aux_num; ++i) {
     if (_cur_aux_times_set[i]) {
...
@@ -112,7 +112,6 @@ protected:
     return 8*M;
   }
 
-
   double _cur_collection_start_sec;
   size_t _cur_collection_pause_used_at_start_bytes;
   size_t _cur_collection_pause_used_regions_at_start;
 
@@ -122,6 +121,15 @@ protected:
   double _cur_clear_ct_time_ms;
   bool   _satb_drain_time_set;
 
+#ifndef PRODUCT
+  // Card Table Count Cache stats
+  double _min_clear_cc_time_ms;         // min
+  double _max_clear_cc_time_ms;         // max
+  double _cur_clear_cc_time_ms;         // clearing time during current pause
+  double _cum_clear_cc_time_ms;         // cumulative clearing time
+  jlong  _num_cc_clears;                // number of times the card count cache has been cleared
+#endif
+
   double _cur_CH_strong_roots_end_sec;
   double _cur_CH_strong_roots_dur_ms;
   double _cur_G1_strong_roots_end_sec;
 
@@ -931,6 +939,18 @@ public:
     _cur_aux_times_ms[i] += ms;
   }
 
+#ifndef PRODUCT
+  void record_cc_clear_time(double ms) {
+    if (_min_clear_cc_time_ms < 0.0 || ms <= _min_clear_cc_time_ms)
+      _min_clear_cc_time_ms = ms;
+    if (_max_clear_cc_time_ms < 0.0 || ms >= _max_clear_cc_time_ms)
+      _max_clear_cc_time_ms = ms;
+    _cur_clear_cc_time_ms = ms;
+    _cum_clear_cc_time_ms += ms;
+    _num_cc_clears++;
+  }
+#endif
+
   // Record the fact that "bytes" bytes allocated in a region.
   void record_before_bytes(size_t bytes);
   void record_after_bytes(size_t bytes);
...
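For reference, the `record_cc_clear_time()` bookkeeping above reduces to a running min/max/sum. A tiny standalone run with made-up sample times shows how the "Avg Clear CC" line in the pause-end printout is derived.

```cpp
#include <cstdio>

int main() {
  double min_ms = -1.0, max_ms = -1.0, cum_ms = 0.0;
  long   n_clears = 0;
  const double samples[] = { 0.12, 0.08, 0.15 };   // hypothetical clear times (ms)

  for (double ms : samples) {                      // same logic as record_cc_clear_time()
    if (min_ms < 0.0 || ms <= min_ms) min_ms = ms;
    if (max_ms < 0.0 || ms >= max_ms) max_ms = ms;
    cum_ms += ms;
    n_clears++;
  }
  // Mirrors the printout: Min 0.08, Max 0.15, Cum 0.35, Avg 0.117
  printf("Min %.2f  Max %.2f  Cum %.2f  Avg %.3f\n",
         min_ms, max_ms, cum_ms, cum_ms / (double)n_clears);
  return 0;
}
```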
@@ -676,61 +676,12 @@ void HRInto_G1RemSet::scrub_par(BitMap* region_bm, BitMap* card_bm,
 
 static IntHistogram out_of_histo(50, 50);
 
-void HRInto_G1RemSet::concurrentRefineOneCard(jbyte* card_ptr, int worker_i) {
-  // If the card is no longer dirty, nothing to do.
-  if (*card_ptr != CardTableModRefBS::dirty_card_val()) return;
-
+void HRInto_G1RemSet::concurrentRefineOneCard_impl(jbyte* card_ptr, int worker_i) {
   // Construct the region representing the card.
   HeapWord* start = _ct_bs->addr_for(card_ptr);
   // And find the region containing it.
   HeapRegion* r = _g1->heap_region_containing(start);
-  if (r == NULL) {
-    guarantee(_g1->is_in_permanent(start), "Or else where?");
-    return;  // Not in the G1 heap (might be in perm, for example.)
-  }
-  // Why do we have to check here whether a card is on a young region,
-  // given that we dirty young regions and, as a result, the
-  // post-barrier is supposed to filter them out and never to enqueue
-  // them? When we allocate a new region as the "allocation region" we
-  // actually dirty its cards after we release the lock, since card
-  // dirtying while holding the lock was a performance bottleneck. So,
-  // as a result, it is possible for other threads to actually
-  // allocate objects in the region (after they acquire the lock)
-  // before all the cards on the region are dirtied. This is unlikely,
-  // and it doesn't happen often, but it can happen. So, the extra
-  // check below filters out those cards.
-  if (r->is_young()) {
-    return;
-  }
-  // While we are processing RSet buffers during the collection, we
-  // actually don't want to scan any cards on the collection set,
-  // since we don't want to update remembered sets with entries that
-  // point into the collection set, given that live objects from the
-  // collection set are about to move and such entries will be stale
-  // very soon. This change also deals with a reliability issue which
-  // involves scanning a card in the collection set and coming across
-  // an array that was being chunked and looking malformed. Note,
-  // however, that if evacuation fails, we have to scan any objects
-  // that were not moved and create any missing entries.
-  if (r->in_collection_set()) {
-    return;
-  }
-
-  // Should we defer it?
-  if (_cg1r->use_cache()) {
-    card_ptr = _cg1r->cache_insert(card_ptr);
-    // If it was not an eviction, nothing to do.
-    if (card_ptr == NULL) return;
-
-    // OK, we have to reset the card start, region, etc.
-    start = _ct_bs->addr_for(card_ptr);
-    r = _g1->heap_region_containing(start);
-    if (r == NULL) {
-      guarantee(_g1->is_in_permanent(start), "Or else where?");
-      return;  // Not in the G1 heap (might be in perm, for example.)
-    }
-    guarantee(!r->is_young(), "It was evicted in the current minor cycle.");
-  }
+  assert(r != NULL, "unexpected null");
 
   HeapWord* end   = _ct_bs->addr_for(card_ptr + 1);
   MemRegion dirtyRegion(start, end);
@@ -774,6 +725,106 @@ void HRInto_G1RemSet::concurrentRefineOneCard(jbyte* card_ptr, int worker_i) {
   }
 }
 
+void HRInto_G1RemSet::concurrentRefineOneCard(jbyte* card_ptr, int worker_i) {
+  // If the card is no longer dirty, nothing to do.
+  if (*card_ptr != CardTableModRefBS::dirty_card_val()) return;
+
+  // Construct the region representing the card.
+  HeapWord* start = _ct_bs->addr_for(card_ptr);
+  // And find the region containing it.
+  HeapRegion* r = _g1->heap_region_containing(start);
+  if (r == NULL) {
+    guarantee(_g1->is_in_permanent(start), "Or else where?");
+    return;  // Not in the G1 heap (might be in perm, for example.)
+  }
+
+  // Why do we have to check here whether a card is on a young region,
+  // given that we dirty young regions and, as a result, the
+  // post-barrier is supposed to filter them out and never to enqueue
+  // them? When we allocate a new region as the "allocation region" we
+  // actually dirty its cards after we release the lock, since card
+  // dirtying while holding the lock was a performance bottleneck. So,
+  // as a result, it is possible for other threads to actually
+  // allocate objects in the region (after they acquire the lock)
+  // before all the cards on the region are dirtied. This is unlikely,
+  // and it doesn't happen often, but it can happen. So, the extra
+  // check below filters out those cards.
+  if (r->is_young()) {
+    return;
+  }
+
+  // While we are processing RSet buffers during the collection, we
+  // actually don't want to scan any cards on the collection set,
+  // since we don't want to update remembered sets with entries that
+  // point into the collection set, given that live objects from the
+  // collection set are about to move and such entries will be stale
+  // very soon. This change also deals with a reliability issue which
+  // involves scanning a card in the collection set and coming across
+  // an array that was being chunked and looking malformed. Note,
+  // however, that if evacuation fails, we have to scan any objects
+  // that were not moved and create any missing entries.
+  if (r->in_collection_set()) {
+    return;
+  }
+
+  // Should we defer processing the card?
+  //
+  // Previously the result from the cache_insert call would be
+  // either card_ptr (implying that card_ptr was currently "cold"),
+  // null (meaning we had inserted the card ptr into the "hot"
+  // cache, which had some headroom), or a "hot" card ptr
+  // extracted from the "hot" cache.
+  //
+  // Now that the _card_counts cache in the ConcurrentG1Refine
+  // instance is an evicting hash table, the result we get back
+  // could be from evicting the card ptr in an already occupied
+  // bucket (in which case we have replaced the card ptr in the
+  // bucket with card_ptr and "defer" is set to false). To avoid
+  // having a data structure (updates to which would need a lock)
+  // to hold these unprocessed dirty cards, we need to immediately
+  // process card_ptr. The actions needed to be taken on return
+  // from cache_insert are summarized in the following table:
+  //
+  // res      defer   action
+  // --------------------------------------------------------------
+  // null     false   card evicted from _card_counts & replaced with
+  //                  card_ptr; evicted ptr added to hot cache.
+  //                  No need to process res; immediately process card_ptr
+  //
+  // null     true    card not evicted from _card_counts; card_ptr added
+  //                  to hot cache.
+  //                  Nothing to do.
+  //
+  // non-null false   card evicted from _card_counts & replaced with
+  //                  card_ptr; evicted ptr is currently "cold" or
+  //                  caused an eviction from the hot cache.
+  //                  Immediately process res; process card_ptr.
+  //
+  // non-null true    card not evicted from _card_counts; card_ptr is
+  //                  currently cold, or caused an eviction from hot
+  //                  cache.
+  //                  Immediately process res; no need to process card_ptr.
+
+  jbyte* res = card_ptr;
+  bool defer = false;
+  if (_cg1r->use_cache()) {
+    jbyte* res = _cg1r->cache_insert(card_ptr, &defer);
+    if (res != NULL && (res != card_ptr || defer)) {
+      start = _ct_bs->addr_for(res);
+      r = _g1->heap_region_containing(start);
+      if (r == NULL) {
+        assert(_g1->is_in_permanent(start), "Or else where?");
+      } else {
+        guarantee(!r->is_young(), "It was evicted in the current minor cycle.");
+        // Process card pointer we get back from the hot card cache
+        concurrentRefineOneCard_impl(res, worker_i);
+      }
+    }
+  }
+
+  if (!defer) {
+    concurrentRefineOneCard_impl(card_ptr, worker_i);
+  }
+}
+
 class HRRSStatsIter: public HeapRegionClosure {
   size_t _occupied;
   size_t _total_mem_sz;
...
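The four rows of the table in the comment above collapse to the two tests used by the code that follows it. Here is a compact, hedged restatement of that caller-side logic, with `refine` standing in for `concurrentRefineOneCard_impl` and the function name purely illustrative.

```cpp
#include <cstddef>

typedef signed char jbyte;                    // stand-in for HotSpot's jbyte

void handle_insert_result(jbyte* card_ptr, jbyte* res, bool defer,
                          void (*refine)(jbyte*)) {
  // Rows 3 and 4: a returned pointer that is not simply our own card
  // handed straight back must be refined now.
  if (res != NULL && (res != card_ptr || defer)) {
    refine(res);
  }
  // Rows 1 and 3: defer == false means card_ptr was not parked in the
  // counts/hot caches, so it must also be refined immediately.
  if (!defer) {
    refine(card_ptr);
  }
}
```

Processing the non-deferred card inline, rather than queuing it, is what lets the patch avoid a new locked data structure for evicted-but-unprocessed dirty cards.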
@@ -157,6 +157,10 @@ protected:
     }
   }
 
+  // The routine that performs the actual work of refining a dirty
+  // card.
+  void concurrentRefineOneCard_impl(jbyte* card_ptr, int worker_i);
+
 protected:
   template <class T> void write_ref_nv(HeapRegion* from, T* p);
   template <class T> void par_write_ref_nv(HeapRegion* from, T* p, int tid);
...
@@ -187,10 +187,6 @@
   develop(intx, G1ConcRSLogCacheSize, 10,                                   \
           "Log base 2 of the length of conc RS hot-card cache.")            \
                                                                             \
-  develop(bool, G1ConcRSCountTraversals, false,                             \
-          "If true, gather data about the number of times CR traverses "    \
-          "cards ")                                                         \
-                                                                            \
   develop(intx, G1ConcRSHotCardLimit, 4,                                    \
           "The threshold that defines (>=) a hot card.")                    \
                                                                             \
 
@@ -264,6 +260,10 @@
                                                                             \
   product(uintx, G1ParallelRSetThreads, 0,                                  \
           "If non-0 is the number of parallel rem set update threads, "     \
-          "otherwise the value is determined ergonomically.")
+          "otherwise the value is determined ergonomically.")               \
+                                                                            \
+  develop(intx, G1CardCountCacheExpandThreshold, 16,                        \
+          "Expand the card count cache if the number of collisions for "    \
+          "a particular entry exceeds this value.")
 
 G1_FLAGS(DECLARE_DEVELOPER_FLAG, DECLARE_PD_DEVELOPER_FLAG, DECLARE_PRODUCT_FLAG, DECLARE_PD_PRODUCT_FLAG, DECLARE_DIAGNOSTIC_FLAG, DECLARE_EXPERIMENTAL_FLAG, DECLARE_NOTPRODUCT_FLAG, DECLARE_MANAGEABLE_FLAG, DECLARE_PRODUCT_RW_FLAG)
@@ -45,11 +45,14 @@
 concurrentG1Refine.cpp          concurrentG1Refine.hpp
 concurrentG1Refine.cpp          concurrentG1RefineThread.hpp
 concurrentG1Refine.cpp          copy.hpp
 concurrentG1Refine.cpp          g1CollectedHeap.inline.hpp
+concurrentG1Refine.cpp          g1CollectorPolicy.hpp
 concurrentG1Refine.cpp          g1RemSet.hpp
 concurrentG1Refine.cpp          space.inline.hpp
+concurrentG1Refine.cpp          heapRegionSeq.inline.hpp
 
 concurrentG1Refine.hpp          globalDefinitions.hpp
 concurrentG1Refine.hpp          allocation.hpp
+concurrentG1Refine.hpp          cardTableModRefBS.hpp
 concurrentG1Refine.hpp          thread.hpp
 
 concurrentG1RefineThread.cpp    concurrentG1Refine.hpp
...