Commit c046dd73 authored by: J johnc

6819077: G1: first GC thread coming late into the GC.

Summary: The first worker thread is delayed when entering the GC because it clears the card count table that is used in identifying hot cards. Replace the card count table with a dynamically sized evicting hash table that includes an epoch based counter.
Reviewed-by: iveresov, tonyp
Parent: 8763012d
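As background for the diff below, here is a minimal, single-threaded sketch of the structure the summary describes: an evicting hash table of per-card counters stamped with a clearing epoch, so that "clearing" the table between GCs is a constant-time epoch bump rather than a walk over the whole table. All names here are illustrative, not HotSpot identifiers; the actual patch packs the card number and epoch into one 64-bit word and claims buckets with Atomic::cmpxchg so that concurrent refinement threads can race safely.

```cpp
// Sketch only -- not HotSpot code. An epoch-stamped, evicting counter cache:
// each bucket remembers (key, epoch, count); a bucket stamped with an old
// epoch or a different key is simply reused, so clear() is O(1).
#include <cstddef>
#include <cstdint>
#include <vector>

class EpochCounterCache {
  struct Entry {
    uint64_t stamp;   // | key (high 32 bits) | epoch (low 32 bits) |
    uint8_t  count;
  };
  std::vector<Entry> _table;
  uint32_t _epoch;

  static uint64_t pack(uint32_t key, uint32_t epoch) {
    return (uint64_t(key) << 32) | epoch;
  }

public:
  explicit EpochCounterCache(size_t buckets) : _table(buckets), _epoch(0) {}

  // Increment the counter for 'key' and return its previous value.
  // Evicts whatever the bucket held if the stamp does not match.
  int add(uint32_t key) {
    Entry& e = _table[key % _table.size()];
    const uint64_t stamp = pack(key, _epoch);
    if (e.stamp != stamp) {   // stale epoch, or a different key: evict
      e.stamp = stamp;
      e.count = 1;
      return 0;
    }
    const int prev = e.count;
    if (e.count < 255) e.count++;
    return prev;
  }

  // The old code zeroed the whole table here; the epoch trick makes the
  // per-GC clear constant time, which is what fixes 6819077.
  void clear() { _epoch++; }
};
```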
--- a/concurrentG1Refine.cpp
+++ b/concurrentG1Refine.cpp
@@ -25,11 +25,21 @@
 #include "incls/_precompiled.incl"
 #include "incls/_concurrentG1Refine.cpp.incl"
 
+// Possible sizes for the card counts cache: odd primes that roughly double in size.
+// (See jvmtiTagMap.cpp).
+int ConcurrentG1Refine::_cc_cache_sizes[] = {
+      16381,    32771,    76831,   150001,   307261,
+     614563,  1228891,  2457733,  4915219,  9830479,
+   19660831, 39321619, 78643219, 157286461, -1
+  };
+
 ConcurrentG1Refine::ConcurrentG1Refine() :
-  _card_counts(NULL), _cur_card_count_histo(NULL), _cum_card_count_histo(NULL),
+  _card_counts(NULL), _card_epochs(NULL),
+  _n_card_counts(0), _max_n_card_counts(0),
+  _cache_size_index(0), _expand_card_counts(false),
   _hot_cache(NULL),
   _def_use_cache(false), _use_cache(false),
-  _n_periods(0), _total_cards(0), _total_travs(0),
+  _n_periods(0),
   _threads(NULL), _n_threads(0)
 {
   if (G1ConcRefine) {
@@ -57,26 +67,39 @@ size_t ConcurrentG1Refine::thread_num() {
 }
 
 void ConcurrentG1Refine::init() {
-  G1CollectedHeap* g1h = G1CollectedHeap::heap();
-  if (G1ConcRSLogCacheSize > 0 || G1ConcRSCountTraversals) {
-    _n_card_counts =
-      (unsigned) (g1h->g1_reserved_obj_bytes() >> CardTableModRefBS::card_shift);
-    _card_counts = NEW_C_HEAP_ARRAY(unsigned char, _n_card_counts);
-    for (size_t i = 0; i < _n_card_counts; i++) _card_counts[i] = 0;
-    ModRefBarrierSet* bs = g1h->mr_bs();
-    guarantee(bs->is_a(BarrierSet::CardTableModRef), "Precondition");
-    CardTableModRefBS* ctbs = (CardTableModRefBS*)bs;
-    _ct_bot = ctbs->byte_for_const(g1h->reserved_region().start());
-    if (G1ConcRSCountTraversals) {
-      _cur_card_count_histo = NEW_C_HEAP_ARRAY(unsigned, 256);
-      _cum_card_count_histo = NEW_C_HEAP_ARRAY(unsigned, 256);
-      for (int i = 0; i < 256; i++) {
-        _cur_card_count_histo[i] = 0;
-        _cum_card_count_histo[i] = 0;
-      }
-    }
-  }
   if (G1ConcRSLogCacheSize > 0) {
+    _g1h = G1CollectedHeap::heap();
+    _max_n_card_counts =
+      (unsigned) (_g1h->g1_reserved_obj_bytes() >> CardTableModRefBS::card_shift);
+
+    size_t max_card_num = ((size_t)1 << (sizeof(unsigned)*BitsPerByte-1)) - 1;
+    guarantee(_max_n_card_counts < max_card_num, "card_num representation");
+
+    int desired = _max_n_card_counts / InitialCacheFraction;
+    for (_cache_size_index = 0;
+        _cc_cache_sizes[_cache_size_index] >= 0; _cache_size_index++) {
+      if (_cc_cache_sizes[_cache_size_index] >= desired) break;
+    }
+    _cache_size_index = MAX2(0, (_cache_size_index - 1));
+
+    int initial_size = _cc_cache_sizes[_cache_size_index];
+    if (initial_size < 0) initial_size = _max_n_card_counts;
+
+    // Make sure we don't go bigger than we will ever need
+    _n_card_counts = MIN2((unsigned) initial_size, _max_n_card_counts);
+
+    _card_counts = NEW_C_HEAP_ARRAY(CardCountCacheEntry, _n_card_counts);
+    _card_epochs = NEW_C_HEAP_ARRAY(CardEpochCacheEntry, _n_card_counts);
+
+    Copy::fill_to_bytes(&_card_counts[0],
+                        _n_card_counts * sizeof(CardCountCacheEntry));
+    Copy::fill_to_bytes(&_card_epochs[0], _n_card_counts * sizeof(CardEpochCacheEntry));
+
+    ModRefBarrierSet* bs = _g1h->mr_bs();
+    guarantee(bs->is_a(BarrierSet::CardTableModRef), "Precondition");
+    _ct_bs = (CardTableModRefBS*)bs;
+    _ct_bot = _ct_bs->byte_for_const(_g1h->reserved_region().start());
+
     _def_use_cache = true;
     _use_cache = true;
     _hot_cache_size = (1 << G1ConcRSLogCacheSize);
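A quick illustration of the initial-size selection in init() above (illustrative code, not part of the patch): the loop finds the first prime in the table that is at least the desired size (1/InitialCacheFraction of the maximum), then steps back one index, so the cache starts at the largest tabulated size below the target.

```cpp
// Sketch of the size-selection logic in ConcurrentG1Refine::init() above.
#include <algorithm>

static const int cc_cache_sizes[] = { 16381, 32771, 76831, 150001, 307261, -1 };

int pick_initial_index(int desired) {
  int idx = 0;
  // Stop at the first tabulated size >= desired, or at the -1 sentinel.
  while (cc_cache_sizes[idx] >= 0 && cc_cache_sizes[idx] < desired) {
    idx++;
  }
  // Step back one so the initial size sits just below the target.
  return std::max(0, idx - 1);
}
// e.g. pick_initial_index(40000) == 1, selecting 32771 buckets.
```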
@@ -86,7 +109,7 @@ void ConcurrentG1Refine::init() {
 
     // For refining the cards in the hot cache in parallel
     int n_workers = (ParallelGCThreads > 0 ?
-                        g1h->workers()->total_workers() : 1);
+                        _g1h->workers()->total_workers() : 1);
     _hot_cache_par_chunk_size = MAX2(1, _hot_cache_size / n_workers);
     _hot_cache_par_claimed_idx = 0;
   }
@@ -101,15 +124,11 @@ void ConcurrentG1Refine::stop() {
 }
 
 ConcurrentG1Refine::~ConcurrentG1Refine() {
-  if (G1ConcRSLogCacheSize > 0 || G1ConcRSCountTraversals) {
-    assert(_card_counts != NULL, "Logic");
-    FREE_C_HEAP_ARRAY(unsigned char, _card_counts);
-    assert(_cur_card_count_histo != NULL, "Logic");
-    FREE_C_HEAP_ARRAY(unsigned, _cur_card_count_histo);
-    assert(_cum_card_count_histo != NULL, "Logic");
-    FREE_C_HEAP_ARRAY(unsigned, _cum_card_count_histo);
-  }
   if (G1ConcRSLogCacheSize > 0) {
+    assert(_card_counts != NULL, "Logic");
+    FREE_C_HEAP_ARRAY(CardCountCacheEntry, _card_counts);
+    assert(_card_epochs != NULL, "Logic");
+    FREE_C_HEAP_ARRAY(CardEpochCacheEntry, _card_epochs);
     assert(_hot_cache != NULL, "Logic");
     FREE_C_HEAP_ARRAY(jbyte*, _hot_cache);
   }
@@ -129,42 +148,165 @@ void ConcurrentG1Refine::threads_do(ThreadClosure *tc) {
   }
 }
 
+bool ConcurrentG1Refine::is_young_card(jbyte* card_ptr) {
+  HeapWord* start = _ct_bs->addr_for(card_ptr);
+  HeapRegion* r = _g1h->heap_region_containing(start);
+  if (r != NULL && r->is_young()) {
+    return true;
+  }
+  // This card is not associated with a heap region
+  // so can't be young.
+  return false;
+}
+
-int ConcurrentG1Refine::add_card_count(jbyte* card_ptr) {
-  size_t card_num = (card_ptr - _ct_bot);
-  guarantee(0 <= card_num && card_num < _n_card_counts, "Bounds");
-  unsigned char cnt = _card_counts[card_num];
-  if (cnt < 255) _card_counts[card_num]++;
-  return cnt;
-}
-
+jbyte* ConcurrentG1Refine::add_card_count(jbyte* card_ptr, int* count, bool* defer) {
+  unsigned new_card_num = ptr_2_card_num(card_ptr);
+  unsigned bucket = hash(new_card_num);
+  assert(0 <= bucket && bucket < _n_card_counts, "Bounds");
+
+  CardCountCacheEntry* count_ptr = &_card_counts[bucket];
+  CardEpochCacheEntry* epoch_ptr = &_card_epochs[bucket];
+
+  // We have to construct a new entry if we haven't updated the counts
+  // during the current period, or if the count was updated for a
+  // different card number.
+  unsigned int new_epoch = (unsigned int) _n_periods;
+  julong new_epoch_entry = make_epoch_entry(new_card_num, new_epoch);
+
+  while (true) {
+    // Fetch the previous epoch value
+    julong prev_epoch_entry = epoch_ptr->_value;
+    julong cas_res;
+
+    if (extract_epoch(prev_epoch_entry) != new_epoch) {
+      // This entry has not yet been updated during this period.
+      // Note: we update the epoch value atomically to ensure
+      // that there is only one winner that updates the cached
+      // card_ptr value even though all the refine threads share
+      // the same epoch value.
+
+      cas_res = (julong) Atomic::cmpxchg((jlong) new_epoch_entry,
+                                         (volatile jlong*)&epoch_ptr->_value,
+                                         (jlong) prev_epoch_entry);
+
+      if (cas_res == prev_epoch_entry) {
+        // We have successfully won the race to update the
+        // epoch and card_num value. Make it look like the
+        // count and eviction count were previously cleared.
+        count_ptr->_count = 1;
+        count_ptr->_evict_count = 0;
+        *count = 0;
+        // We can defer the processing of card_ptr
+        *defer = true;
+        return card_ptr;
+      }
+      // We did not win the race to update the epoch field, so some other
+      // thread must have done it. The value that gets returned by CAS
+      // should be the new epoch value.
+      assert(extract_epoch(cas_res) == new_epoch, "unexpected epoch");
+      // We could 'continue' here or just re-read the previous epoch value
+      prev_epoch_entry = epoch_ptr->_value;
+    }
+
+    // The epoch entry for card_ptr has been updated during this period.
+    unsigned old_card_num = extract_card_num(prev_epoch_entry);
+
+    // The card count that will be returned to caller
+    *count = count_ptr->_count;
+
+    // Are we updating the count for the same card?
+    if (new_card_num == old_card_num) {
+      // Same card - just update the count. We could have more than one
+      // thread racing to update count for the current card. It should be
+      // OK not to use a CAS as the only penalty should be some missed
+      // increments of the count which delays identifying the card as "hot".
+      if (*count < max_jubyte) count_ptr->_count++;
+      // We can defer the processing of card_ptr
+      *defer = true;
+      return card_ptr;
+    }
+
+    // Different card - evict old card info
+    if (count_ptr->_evict_count < max_jubyte) count_ptr->_evict_count++;
+    if (count_ptr->_evict_count > G1CardCountCacheExpandThreshold) {
+      // Trigger a resize the next time we clear
+      _expand_card_counts = true;
+    }
+
+    cas_res = (julong) Atomic::cmpxchg((jlong) new_epoch_entry,
+                                       (volatile jlong*)&epoch_ptr->_value,
+                                       (jlong) prev_epoch_entry);
+
+    if (cas_res == prev_epoch_entry) {
+      // We successfully updated the card num value in the epoch entry
+      count_ptr->_count = 0; // initialize counter for new card num
+
+      // Even though the region containing the card at old_card_num was not
+      // in the young list when old_card_num was recorded in the epoch
+      // cache it could have been added to the free list and subsequently
+      // added to the young list in the intervening time. If the evicted
+      // card is in a young region just return the card_ptr and the evicted
+      // card will not be cleaned. See CR 6817995.
+
+      jbyte* old_card_ptr = card_num_2_ptr(old_card_num);
+      if (is_young_card(old_card_ptr)) {
+        *count = 0;
+        // We can defer the processing of card_ptr
+        *defer = true;
+        return card_ptr;
+      }
+
+      // We do not want to defer processing of card_ptr in this case
+      // (we need to refine old_card_ptr and card_ptr)
+      *defer = false;
+      return old_card_ptr;
+    }
+    // Someone else beat us - try again.
+  }
+}
-jbyte* ConcurrentG1Refine::cache_insert(jbyte* card_ptr) {
-  int count = add_card_count(card_ptr);
-  // Count previously unvisited cards.
-  if (count == 0) _total_cards++;
-  // We'll assume a traversal unless we store it in the cache.
-  _total_travs++;
+jbyte* ConcurrentG1Refine::cache_insert(jbyte* card_ptr, bool* defer) {
+  int count;
+  jbyte* cached_ptr = add_card_count(card_ptr, &count, defer);
+  assert(cached_ptr != NULL, "bad cached card ptr");
+  assert(!is_young_card(cached_ptr), "shouldn't get a card in young region");
+
+  // The card pointer we obtained from card count cache is not hot
+  // so do not store it in the cache; return it for immediate
+  // refining.
   if (count < G1ConcRSHotCardLimit) {
-    return card_ptr;
+    return cached_ptr;
   }
-  // Otherwise, it's hot.
+
+  // Otherwise, the pointer we got from the _card_counts is hot.
   jbyte* res = NULL;
   MutexLockerEx x(HotCardCache_lock, Mutex::_no_safepoint_check_flag);
   if (_n_hot == _hot_cache_size) {
-    _total_travs++;
     res = _hot_cache[_hot_cache_idx];
     _n_hot--;
   }
   // Now _n_hot < _hot_cache_size, and we can insert at _hot_cache_idx.
-  _hot_cache[_hot_cache_idx] = card_ptr;
+  _hot_cache[_hot_cache_idx] = cached_ptr;
   _hot_cache_idx++;
   if (_hot_cache_idx == _hot_cache_size) _hot_cache_idx = 0;
   _n_hot++;
+
+  if (res != NULL) {
+    // Even though the region containing res was not in the young list
+    // when it was recorded in the hot cache it could have been added
+    // to the free list and subsequently added to the young list in
+    // the intervening time. If res is in a young region, return NULL
+    // so that res is not cleaned. See CR 6817995.
+    if (is_young_card(res)) {
+      res = NULL;
+    }
+  }
+
   return res;
 }
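As an aside, the hot cache manipulated above is a fixed-size ring buffer that, when full, evicts its oldest entry and hands it back to the caller for immediate processing. A freestanding sketch of that discipline (illustrative names, not HotSpot code):

```cpp
// Sketch of the _hot_cache ring-buffer behavior in cache_insert() above.
#include <cstddef>

template <typename T, size_t N>
class EvictingRing {
  T      _items[N];
  size_t _idx;   // next write position
  size_t _n;     // occupancy
public:
  EvictingRing() : _idx(0), _n(0) {}

  // Insert 'x'; if the ring is full, return the evicted (oldest) element,
  // otherwise return 'none'. Mirrors the _n_hot/_hot_cache_idx logic.
  T insert(T x, T none) {
    T evicted = none;
    if (_n == N) {        // full: the slot at _idx holds the oldest entry
      evicted = _items[_idx];
      _n--;
    }
    _items[_idx] = x;
    _idx = (_idx + 1) % N;
    _n++;
    return evicted;
  }
};
```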
 void ConcurrentG1Refine::clean_up_cache(int worker_i, G1RemSet* g1rs) {
   assert(!use_cache(), "cache should be disabled");
   int start_idx;
@@ -186,114 +328,52 @@ void ConcurrentG1Refine::clean_up_cache(int worker_i, G1RemSet* g1rs) {
   }
 }
-void ConcurrentG1Refine::clear_and_record_card_counts() {
-  if (G1ConcRSLogCacheSize == 0 && !G1ConcRSCountTraversals) return;
-  _n_periods++;
-  if (G1ConcRSCountTraversals) {
-    for (size_t i = 0; i < _n_card_counts; i++) {
-      unsigned char bucket = _card_counts[i];
-      _cur_card_count_histo[bucket]++;
-      _card_counts[i] = 0;
-    }
-    gclog_or_tty->print_cr("Card counts:");
-    for (int i = 0; i < 256; i++) {
-      if (_cur_card_count_histo[i] > 0) {
-        gclog_or_tty->print_cr("  %3d: %9d", i, _cur_card_count_histo[i]);
-        _cum_card_count_histo[i] += _cur_card_count_histo[i];
-        _cur_card_count_histo[i] = 0;
-      }
-    }
-  } else {
-    assert(G1ConcRSLogCacheSize > 0, "Logic");
-    Copy::fill_to_words((HeapWord*)(&_card_counts[0]),
-                        _n_card_counts / HeapWordSize);
-  }
-}
-
-void
-ConcurrentG1Refine::
-print_card_count_histo_range(unsigned* histo, int from, int to,
-                             float& cum_card_pct,
-                             float& cum_travs_pct) {
-  unsigned cards = 0;
-  unsigned travs = 0;
-  guarantee(to <= 256, "Precondition");
-  for (int i = from; i < to-1; i++) {
-    cards += histo[i];
-    travs += histo[i] * i;
-  }
-  if (to == 256) {
-    unsigned histo_card_sum = 0;
-    unsigned histo_trav_sum = 0;
-    for (int i = 1; i < 255; i++) {
-      histo_trav_sum += histo[i] * i;
-    }
-    cards += histo[255];
-    // correct traversals for the last one.
-    unsigned travs_255 = (unsigned) (_total_travs - histo_trav_sum);
-    travs += travs_255;
-  } else {
-    cards += histo[to-1];
-    travs += histo[to-1] * (to-1);
-  }
-  float fperiods = (float)_n_periods;
-  float f_tot_cards = (float)_total_cards/fperiods;
-  float f_tot_travs = (float)_total_travs/fperiods;
-  if (cards > 0) {
-    float fcards = (float)cards/fperiods;
-    float ftravs = (float)travs/fperiods;
-    if (to == 256) {
-      gclog_or_tty->print("  %4d-     %10.2f%10.2f", from, fcards, ftravs);
-    } else {
-      gclog_or_tty->print("  %4d-%4d %10.2f%10.2f", from, to-1, fcards, ftravs);
-    }
-    float pct_cards = fcards*100.0/f_tot_cards;
-    cum_card_pct += pct_cards;
-    float pct_travs = ftravs*100.0/f_tot_travs;
-    cum_travs_pct += pct_travs;
-    gclog_or_tty->print_cr("%10.2f%10.2f%10.2f%10.2f",
-                           pct_cards, cum_card_pct,
-                           pct_travs, cum_travs_pct);
-  }
-}
-
-void ConcurrentG1Refine::print_final_card_counts() {
-  if (!G1ConcRSCountTraversals) return;
-
-  gclog_or_tty->print_cr("Did %d total traversals of %d distinct cards.",
-                         _total_travs, _total_cards);
-  float fperiods = (float)_n_periods;
-  gclog_or_tty->print_cr("  This is an average of %8.2f traversals, %8.2f cards, "
-                         "per collection.", (float)_total_travs/fperiods,
-                         (float)_total_cards/fperiods);
-  gclog_or_tty->print_cr("  This is an average of %8.2f traversals/distinct "
-                         "dirty card.\n",
-                         _total_cards > 0 ?
-                         (float)_total_travs/(float)_total_cards : 0.0);
-
-  gclog_or_tty->print_cr("Histogram:\n\n%10s   %10s%10s%10s%10s%10s%10s",
-                         "range", "# cards", "# travs", "% cards", "(cum)",
-                         "% travs", "(cum)");
-  gclog_or_tty->print_cr("------------------------------------------------------------"
-                         "-------------");
-  float cum_cards_pct = 0.0;
-  float cum_travs_pct = 0.0;
-  for (int i = 1; i < 10; i++) {
-    print_card_count_histo_range(_cum_card_count_histo, i, i+1,
-                                 cum_cards_pct, cum_travs_pct);
-  }
-  for (int i = 10; i < 100; i += 10) {
-    print_card_count_histo_range(_cum_card_count_histo, i, i+10,
-                                 cum_cards_pct, cum_travs_pct);
-  }
-  print_card_count_histo_range(_cum_card_count_histo, 100, 150,
-                               cum_cards_pct, cum_travs_pct);
-  print_card_count_histo_range(_cum_card_count_histo, 150, 200,
-                               cum_cards_pct, cum_travs_pct);
-  print_card_count_histo_range(_cum_card_count_histo, 150, 255,
-                               cum_cards_pct, cum_travs_pct);
-  print_card_count_histo_range(_cum_card_count_histo, 255, 256,
-                               cum_cards_pct, cum_travs_pct);
-}
+void ConcurrentG1Refine::expand_card_count_cache() {
+  if (_n_card_counts < _max_n_card_counts) {
+    int new_idx = _cache_size_index+1;
+    int new_size = _cc_cache_sizes[new_idx];
+    if (new_size < 0) new_size = _max_n_card_counts;
+
+    // Make sure we don't go bigger than we will ever need
+    new_size = MIN2((unsigned) new_size, _max_n_card_counts);
+
+    // Expand the card count and card epoch tables
+    if (new_size > (int)_n_card_counts) {
+      // We can just free and allocate a new array as we're
+      // not interested in preserving the contents
+      assert(_card_counts != NULL, "Logic!");
+      assert(_card_epochs != NULL, "Logic!");
+      FREE_C_HEAP_ARRAY(CardCountCacheEntry, _card_counts);
+      FREE_C_HEAP_ARRAY(CardEpochCacheEntry, _card_epochs);
+      _n_card_counts = new_size;
+      _card_counts = NEW_C_HEAP_ARRAY(CardCountCacheEntry, _n_card_counts);
+      _card_epochs = NEW_C_HEAP_ARRAY(CardEpochCacheEntry, _n_card_counts);
+      _cache_size_index = new_idx;
+    }
+  }
+}
+
+void ConcurrentG1Refine::clear_and_record_card_counts() {
+  if (G1ConcRSLogCacheSize == 0) return;
+
+#ifndef PRODUCT
+  double start = os::elapsedTime();
+#endif
+
+  if (_expand_card_counts) {
+    expand_card_count_cache();
+    _expand_card_counts = false;
+    // Only need to clear the epochs.
+    Copy::fill_to_bytes(&_card_epochs[0], _n_card_counts * sizeof(CardEpochCacheEntry));
+  }
+
+  int this_epoch = (int) _n_periods;
+  assert((this_epoch+1) <= max_jint, "too many periods");
+  // Update epoch
+  _n_periods++;
+
+#ifndef PRODUCT
+  double elapsed = os::elapsedTime() - start;
+  _g1h->g1_policy()->record_cc_clear_time(elapsed * 1000.0);
+#endif
+}
--- a/concurrentG1Refine.hpp
+++ b/concurrentG1Refine.hpp
@@ -29,18 +29,77 @@ class G1RemSet;
 
 class ConcurrentG1Refine: public CHeapObj {
   ConcurrentG1RefineThread** _threads;
   int _n_threads;
+
   // The cache for card refinement.
   bool     _use_cache;
   bool     _def_use_cache;
-  size_t _n_periods;
-  size_t _total_cards;
-  size_t _total_travs;
 
-  unsigned char* _card_counts;
+  size_t _n_periods;    // Used as clearing epoch
+
+  // An evicting cache of the number of times each card
+  // is accessed. Reduces, but does not eliminate, the amount
+  // of duplicated processing of dirty cards.
+
+  enum SomePrivateConstants {
+    epoch_bits           = 32,
+    card_num_shift       = epoch_bits,
+    epoch_mask           = AllBits,
+    card_num_mask        = AllBits,
+
+    // The initial cache size is approximately this fraction
+    // of a maximal cache (i.e. the size needed for all cards
+    // in the heap)
+    InitialCacheFraction = 512
+  };
+
+  const static julong card_num_mask_in_place =
+                        (julong) card_num_mask << card_num_shift;
+
+  typedef struct {
+    julong _value;      // |  card_num   |  epoch   |
+  } CardEpochCacheEntry;
+
+  julong make_epoch_entry(unsigned int card_num, unsigned int epoch) {
+    assert(0 <= card_num && card_num < _max_n_card_counts, "Bounds");
+    assert(0 <= epoch && epoch <= _n_periods, "must be");
+
+    return ((julong) card_num << card_num_shift) | epoch;
+  }
+
+  unsigned int extract_epoch(julong v) {
+    return (v & epoch_mask);
+  }
+
+  unsigned int extract_card_num(julong v) {
+    return (v & card_num_mask_in_place) >> card_num_shift;
+  }
+
+  typedef struct {
+    unsigned char _count;
+    unsigned char _evict_count;
+  } CardCountCacheEntry;
+
+  CardCountCacheEntry* _card_counts;
+  CardEpochCacheEntry* _card_epochs;
+
+  // The current number of buckets in the card count cache
   unsigned _n_card_counts;
+
+  // The max number of buckets required for the number of
+  // cards for the entire reserved heap
+  unsigned _max_n_card_counts;
+
+  // Possible sizes of the cache: odd primes that roughly double in size.
+  // (See jvmtiTagMap.cpp).
+  static int _cc_cache_sizes[];
+
+  // The index in _cc_cache_sizes corresponding to the size of
+  // _card_counts.
+  int _cache_size_index;
+
+  bool _expand_card_counts;
+
   const jbyte* _ct_bot;
-  unsigned* _cur_card_count_histo;
-  unsigned* _cum_card_count_histo;
+
   jbyte**      _hot_cache;
   int          _hot_cache_size;
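A worked example of the CardEpochCacheEntry packing declared above (an illustrative standard-C++ rendering of the same arithmetic, not part of the patch): the card number occupies the high 32 bits of the 64-bit word and the clearing epoch the low 32 bits.

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const unsigned card_num_shift = 32;   // same as epoch_bits above
  const uint32_t card_num = 0x1234;
  const uint32_t epoch    = 7;

  // make_epoch_entry: | card_num | epoch |
  const uint64_t entry = (uint64_t(card_num) << card_num_shift) | epoch;
  assert(entry == 0x0000123400000007ULL);

  // extract_epoch / extract_card_num recover both fields.
  assert((entry & 0xFFFFFFFFu) == epoch);
  assert((entry >> card_num_shift) == card_num);
  return 0;
}
```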
@@ -50,12 +109,37 @@ class ConcurrentG1Refine: public CHeapObj {
   int          _hot_cache_par_chunk_size;
   volatile int _hot_cache_par_claimed_idx;
 
+  // Needed to workaround 6817995
+  CardTableModRefBS* _ct_bs;
+  G1CollectedHeap*   _g1h;
+
+  // Expands the array that holds the card counts to the next size up
+  void expand_card_count_cache();
+
+  // hash a given key (index of card_ptr) with the specified size
+  static unsigned int hash(size_t key, int size) {
+    return (unsigned int) key % size;
+  }
+
+  // hash a given key (index of card_ptr)
+  unsigned int hash(size_t key) {
+    return hash(key, _n_card_counts);
+  }
+
+  unsigned ptr_2_card_num(jbyte* card_ptr) {
+    return (unsigned) (card_ptr - _ct_bot);
+  }
+
+  jbyte* card_num_2_ptr(unsigned card_num) {
+    return (jbyte*) (_ct_bot + card_num);
+  }
+
   // Returns the count of this card after incrementing it.
-  int add_card_count(jbyte* card_ptr);
+  jbyte* add_card_count(jbyte* card_ptr, int* count, bool* defer);
+
+  // Returns true if this card is in a young region
+  bool is_young_card(jbyte* card_ptr);
 
-  void print_card_count_histo_range(unsigned* histo, int from, int to,
-                                    float& cum_card_pct,
-                                    float& cum_travs_pct);
  public:
   ConcurrentG1Refine();
   ~ConcurrentG1Refine();
@@ -69,7 +153,7 @@ class ConcurrentG1Refine: public CHeapObj {
   // If this is the first entry for the slot, writes into the cache and
   // returns NULL. If it causes an eviction, returns the evicted pointer.
   // Otherwise, it's a cache hit, and returns NULL.
-  jbyte* cache_insert(jbyte* card_ptr);
+  jbyte* cache_insert(jbyte* card_ptr, bool* defer);
 
   // Process the cached entries.
   void clean_up_cache(int worker_i, G1RemSet* g1rs);
@@ -93,7 +177,6 @@ class ConcurrentG1Refine: public CHeapObj {
   }
 
   void clear_and_record_card_counts();
-  void print_final_card_counts();
 
   static size_t thread_num();
 };
--- a/g1CollectedHeap.cpp
+++ b/g1CollectedHeap.cpp
@@ -2414,8 +2414,6 @@ void G1CollectedHeap::gc_threads_do(ThreadClosure* tc) const {
 }
 
 void G1CollectedHeap::print_tracing_info() const {
-  concurrent_g1_refine()->print_final_card_counts();
-
   // We'll overload this to mean "trace GC pause statistics."
   if (TraceGen0Time || TraceGen1Time) {
     // The "G1CollectorPolicy" is keeping track of these stats, so delegate
...
--- a/g1CollectorPolicy.cpp
+++ b/g1CollectorPolicy.cpp
@@ -94,7 +94,14 @@ G1CollectorPolicy::G1CollectorPolicy() :
   _summary(new Summary()),
   _abandoned_summary(new AbandonedSummary()),
 
+#ifndef PRODUCT
   _cur_clear_ct_time_ms(0.0),
+  _min_clear_cc_time_ms(-1.0),
+  _max_clear_cc_time_ms(-1.0),
+  _cur_clear_cc_time_ms(0.0),
+  _cum_clear_cc_time_ms(0.0),
+  _num_cc_clears(0L),
+#endif
 
   _region_num_young(0),
   _region_num_tenured(0),
@@ -1648,6 +1655,15 @@ void G1CollectorPolicy::record_collection_pause_end(bool abandoned) {
       print_stats(1, "Object Copying", obj_copy_time);
     }
   }
+#ifndef PRODUCT
+  print_stats(1, "Cur Clear CC", _cur_clear_cc_time_ms);
+  print_stats(1, "Cum Clear CC", _cum_clear_cc_time_ms);
+  print_stats(1, "Min Clear CC", _min_clear_cc_time_ms);
+  print_stats(1, "Max Clear CC", _max_clear_cc_time_ms);
+  if (_num_cc_clears > 0) {
+    print_stats(1, "Avg Clear CC", _cum_clear_cc_time_ms / ((double)_num_cc_clears));
+  }
+#endif
   print_stats(1, "Other", other_time_ms);
   for (int i = 0; i < _aux_num; ++i) {
     if (_cur_aux_times_set[i]) {
...
--- a/g1CollectorPolicy.hpp
+++ b/g1CollectorPolicy.hpp
@@ -112,7 +112,6 @@ protected:
     return 8*M;
   }
 
   double _cur_collection_start_sec;
   size_t _cur_collection_pause_used_at_start_bytes;
   size_t _cur_collection_pause_used_regions_at_start;
@@ -122,6 +121,15 @@ protected:
   double _cur_clear_ct_time_ms;
   bool   _satb_drain_time_set;
 
+#ifndef PRODUCT
+  // Card Table Count Cache stats
+  double _min_clear_cc_time_ms;         // min
+  double _max_clear_cc_time_ms;         // max
+  double _cur_clear_cc_time_ms;         // clearing time during current pause
+  double _cum_clear_cc_time_ms;         // cumulative clearing time
+  jlong  _num_cc_clears;                // number of times the card count cache has been cleared
+#endif
+
   double _cur_CH_strong_roots_end_sec;
   double _cur_CH_strong_roots_dur_ms;
   double _cur_G1_strong_roots_end_sec;
@@ -931,6 +939,18 @@ public:
     _cur_aux_times_ms[i] += ms;
   }
 
+#ifndef PRODUCT
+  void record_cc_clear_time(double ms) {
+    if (_min_clear_cc_time_ms < 0.0 || ms <= _min_clear_cc_time_ms)
+      _min_clear_cc_time_ms = ms;
+    if (_max_clear_cc_time_ms < 0.0 || ms >= _max_clear_cc_time_ms)
+      _max_clear_cc_time_ms = ms;
+    _cur_clear_cc_time_ms = ms;
+    _cum_clear_cc_time_ms += ms;
+    _num_cc_clears++;
+  }
+#endif
+
   // Record the fact that "bytes" bytes allocated in a region.
   void record_before_bytes(size_t bytes);
   void record_after_bytes(size_t bytes);
...
--- a/g1RemSet.cpp
+++ b/g1RemSet.cpp
@@ -676,6 +676,55 @@ void HRInto_G1RemSet::scrub_par(BitMap* region_bm, BitMap* card_bm,
 
 static IntHistogram out_of_histo(50, 50);
 
+void HRInto_G1RemSet::concurrentRefineOneCard_impl(jbyte* card_ptr, int worker_i) {
+  // Construct the region representing the card.
+  HeapWord* start = _ct_bs->addr_for(card_ptr);
+  // And find the region containing it.
+  HeapRegion* r = _g1->heap_region_containing(start);
+  assert(r != NULL, "unexpected null");
+
+  HeapWord* end   = _ct_bs->addr_for(card_ptr + 1);
+  MemRegion dirtyRegion(start, end);
+
+#if CARD_REPEAT_HISTO
+  init_ct_freq_table(_g1->g1_reserved_obj_bytes());
+  ct_freq_note_card(_ct_bs->index_for(start));
+#endif
+
+  UpdateRSOopClosure update_rs_oop_cl(this, worker_i);
+  update_rs_oop_cl.set_from(r);
+  FilterOutOfRegionClosure filter_then_update_rs_oop_cl(r, &update_rs_oop_cl);
+
+  // Undirty the card.
+  *card_ptr = CardTableModRefBS::clean_card_val();
+  // We must complete this write before we do any of the reads below.
+  OrderAccess::storeload();
+  // And process it, being careful of unallocated portions of TLAB's.
+  HeapWord* stop_point =
+    r->oops_on_card_seq_iterate_careful(dirtyRegion,
+                                        &filter_then_update_rs_oop_cl);
+
+  // If stop_point is non-null, then we encountered an unallocated region
+  // (perhaps the unfilled portion of a TLAB.) For now, we'll dirty the
+  // card and re-enqueue: if we put off the card until a GC pause, then the
+  // unallocated portion will be filled in. Alternatively, we might try
+  // the full complexity of the technique used in "regular" precleaning.
+  if (stop_point != NULL) {
+    // The card might have gotten re-dirtied and re-enqueued while we
+    // worked. (In fact, it's pretty likely.)
+    if (*card_ptr != CardTableModRefBS::dirty_card_val()) {
+      *card_ptr = CardTableModRefBS::dirty_card_val();
+      MutexLockerEx x(Shared_DirtyCardQ_lock,
+                      Mutex::_no_safepoint_check_flag);
+      DirtyCardQueue* sdcq =
+        JavaThread::dirty_card_queue_set().shared_dirty_card_queue();
+      sdcq->enqueue(card_ptr);
+    }
+  } else {
+    out_of_histo.add_entry(filter_then_update_rs_oop_cl.out_of_region());
+    _conc_refine_cards++;
+  }
+}
+
 void HRInto_G1RemSet::concurrentRefineOneCard(jbyte* card_ptr, int worker_i) {
   // If the card is no longer dirty, nothing to do.
   if (*card_ptr != CardTableModRefBS::dirty_card_val()) return;
 
@@ -716,61 +765,63 @@ void HRInto_G1RemSet::concurrentRefineOneCard(jbyte* card_ptr, int worker_i) {
     return;
   }
 
-  // Should we defer it?
+  // Should we defer processing the card?
+  //
+  // Previously the result from the cache_insert call would be
+  // either card_ptr (implying that card_ptr was currently "cold"),
+  // null (meaning we had inserted the card ptr into the "hot"
+  // cache, which had some headroom), or a "hot" card ptr
+  // extracted from the "hot" cache.
+  //
+  // Now that the _card_counts cache in the ConcurrentG1Refine
+  // instance is an evicting hash table, the result we get back
+  // could be from evicting the card ptr in an already occupied
+  // bucket (in which case we have replaced the card ptr in the
+  // bucket with card_ptr and "defer" is set to false). To avoid
+  // having a data structure (updates to which would need a lock)
+  // to hold these unprocessed dirty cards, we need to immediately
+  // process card_ptr. The actions needed to be taken on return
+  // from cache_insert are summarized in the following table:
+  //
+  // res      defer   action
+  // --------------------------------------------------------------
+  // null     false   card evicted from _card_counts & replaced with
+  //                  card_ptr; evicted ptr added to hot cache.
+  //                  No need to process res; immediately process card_ptr
+  //
+  // null     true    card not evicted from _card_counts; card_ptr added
+  //                  to hot cache.
+  //                  Nothing to do.
+  //
+  // non-null false   card evicted from _card_counts & replaced with
+  //                  card_ptr; evicted ptr is currently "cold" or
+  //                  caused an eviction from the hot cache.
+  //                  Immediately process res; process card_ptr.
+  //
+  // non-null true    card not evicted from _card_counts; card_ptr is
+  //                  currently cold, or caused an eviction from hot
+  //                  cache.
+  //                  Immediately process res; no need to process card_ptr.
+
+  jbyte* res = card_ptr;
+  bool defer = false;
   if (_cg1r->use_cache()) {
-    card_ptr = _cg1r->cache_insert(card_ptr);
-    // If it was not an eviction, nothing to do.
-    if (card_ptr == NULL) return;
-
-    // OK, we have to reset the card start, region, etc.
-    start = _ct_bs->addr_for(card_ptr);
-    r = _g1->heap_region_containing(start);
-    if (r == NULL) {
-      guarantee(_g1->is_in_permanent(start), "Or else where?");
-      return;  // Not in the G1 heap (might be in perm, for example.)
+    jbyte* res = _cg1r->cache_insert(card_ptr, &defer);
+    if (res != NULL && (res != card_ptr || defer)) {
+      start = _ct_bs->addr_for(res);
+      r = _g1->heap_region_containing(start);
+      if (r == NULL) {
+        assert(_g1->is_in_permanent(start), "Or else where?");
+      } else {
+        guarantee(!r->is_young(), "It was evicted in the current minor cycle.");
+        // Process card pointer we get back from the hot card cache
+        concurrentRefineOneCard_impl(res, worker_i);
+      }
     }
-    guarantee(!r->is_young(), "It was evicted in the current minor cycle.");
   }
-
-  HeapWord* end   = _ct_bs->addr_for(card_ptr + 1);
-  MemRegion dirtyRegion(start, end);
-
-#if CARD_REPEAT_HISTO
-  init_ct_freq_table(_g1->g1_reserved_obj_bytes());
-  ct_freq_note_card(_ct_bs->index_for(start));
-#endif
-
-  UpdateRSOopClosure update_rs_oop_cl(this, worker_i);
-  update_rs_oop_cl.set_from(r);
-  FilterOutOfRegionClosure filter_then_update_rs_oop_cl(r, &update_rs_oop_cl);
-
-  // Undirty the card.
-  *card_ptr = CardTableModRefBS::clean_card_val();
-  // We must complete this write before we do any of the reads below.
-  OrderAccess::storeload();
-  // And process it, being careful of unallocated portions of TLAB's.
-  HeapWord* stop_point =
-    r->oops_on_card_seq_iterate_careful(dirtyRegion,
-                                        &filter_then_update_rs_oop_cl);
-  // If stop_point is non-null, then we encountered an unallocated region
-  // (perhaps the unfilled portion of a TLAB.) For now, we'll dirty the
-  // card and re-enqueue: if we put off the card until a GC pause, then the
-  // unallocated portion will be filled in. Alternatively, we might try
-  // the full complexity of the technique used in "regular" precleaning.
-  if (stop_point != NULL) {
-    // The card might have gotten re-dirtied and re-enqueued while we
-    // worked. (In fact, it's pretty likely.)
-    if (*card_ptr != CardTableModRefBS::dirty_card_val()) {
-      *card_ptr = CardTableModRefBS::dirty_card_val();
-      MutexLockerEx x(Shared_DirtyCardQ_lock,
-                      Mutex::_no_safepoint_check_flag);
-      DirtyCardQueue* sdcq =
-        JavaThread::dirty_card_queue_set().shared_dirty_card_queue();
-      sdcq->enqueue(card_ptr);
-    }
-  } else {
-    out_of_histo.add_entry(filter_then_update_rs_oop_cl.out_of_region());
-    _conc_refine_cards++;
+
+  if (!defer) {
+    concurrentRefineOneCard_impl(card_ptr, worker_i);
   }
 }
...
--- a/g1RemSet.hpp
+++ b/g1RemSet.hpp
@@ -157,6 +157,10 @@ protected:
     }
   }
 
+  // The routine that performs the actual work of refining a dirty
+  // card.
+  void concurrentRefineOneCard_impl(jbyte* card_ptr, int worker_i);
+
 protected:
   template <class T> void write_ref_nv(HeapRegion* from, T* p);
   template <class T> void par_write_ref_nv(HeapRegion* from, T* p, int tid);
...
--- a/g1_globals.hpp
+++ b/g1_globals.hpp
@@ -187,10 +187,6 @@
   develop(intx, G1ConcRSLogCacheSize, 10,                                  \
           "Log base 2 of the length of conc RS hot-card cache.")          \
                                                                           \
-  develop(bool, G1ConcRSCountTraversals, false,                           \
-          "If true, gather data about the number of times CR traverses "  \
-          "cards ")                                                       \
-                                                                          \
   develop(intx, G1ConcRSHotCardLimit, 4,                                  \
           "The threshold that defines (>=) a hot card.")                  \
                                                                           \
@@ -264,6 +260,10 @@
                                                                           \
   product(uintx, G1ParallelRSetThreads, 0,                                \
           "If non-0 is the number of parallel rem set update threads, "   \
-          "otherwise the value is determined ergonomically.")
+          "otherwise the value is determined ergonomically.")             \
+                                                                          \
+  develop(intx, G1CardCountCacheExpandThreshold, 16,                      \
+          "Expand the card count cache if the number of collisions for "  \
+          "a particular entry exceeds this value.")
 
 G1_FLAGS(DECLARE_DEVELOPER_FLAG, DECLARE_PD_DEVELOPER_FLAG, DECLARE_PRODUCT_FLAG, DECLARE_PD_PRODUCT_FLAG, DECLARE_DIAGNOSTIC_FLAG, DECLARE_EXPERIMENTAL_FLAG, DECLARE_NOTPRODUCT_FLAG, DECLARE_MANAGEABLE_FLAG, DECLARE_PRODUCT_RW_FLAG)
--- a/includeDB_gc_g1
+++ b/includeDB_gc_g1
@@ -45,11 +45,14 @@ concurrentG1Refine.cpp concurrentG1Refine.hpp
 concurrentG1Refine.cpp concurrentG1RefineThread.hpp
 concurrentG1Refine.cpp copy.hpp
 concurrentG1Refine.cpp g1CollectedHeap.inline.hpp
+concurrentG1Refine.cpp g1CollectorPolicy.hpp
 concurrentG1Refine.cpp g1RemSet.hpp
 concurrentG1Refine.cpp space.inline.hpp
+concurrentG1Refine.cpp heapRegionSeq.inline.hpp
 
 concurrentG1Refine.hpp globalDefinitions.hpp
 concurrentG1Refine.hpp allocation.hpp
+concurrentG1Refine.hpp cardTableModRefBS.hpp
 concurrentG1Refine.hpp thread.hpp
 
 concurrentG1RefineThread.cpp concurrentG1Refine.hpp
...