提交 6195261b 编写于 作者: T tschatzl

8019342: G1: High "Other" time most likely due to card redirtying

Summary: Parallelize card redirtying to decrease the time it takes.
Reviewed-by: brutisso
上级 50a69aab
......@@ -29,7 +29,7 @@
#include "gc_implementation/g1/g1HotCardCache.hpp"
#include "runtime/java.hpp"
ConcurrentG1Refine::ConcurrentG1Refine(G1CollectedHeap* g1h) :
ConcurrentG1Refine::ConcurrentG1Refine(G1CollectedHeap* g1h, CardTableEntryClosure* refine_closure) :
_threads(NULL), _n_threads(0),
_hot_card_cache(g1h)
{
......@@ -61,7 +61,7 @@ ConcurrentG1Refine::ConcurrentG1Refine(G1CollectedHeap* g1h) :
ConcurrentG1RefineThread *next = NULL;
for (uint i = _n_threads - 1; i != UINT_MAX; i--) {
ConcurrentG1RefineThread* t = new ConcurrentG1RefineThread(this, next, worker_id_offset, i);
ConcurrentG1RefineThread* t = new ConcurrentG1RefineThread(this, next, refine_closure, worker_id_offset, i);
assert(t != NULL, "Conc refine should have been created");
if (t->osthread() == NULL) {
vm_shutdown_during_initialization("Could not create ConcurrentG1RefineThread");
......
......@@ -71,7 +71,7 @@ class ConcurrentG1Refine: public CHeapObj<mtGC> {
void reset_threshold_step();
public:
ConcurrentG1Refine(G1CollectedHeap* g1h);
ConcurrentG1Refine(G1CollectedHeap* g1h, CardTableEntryClosure* refine_closure);
~ConcurrentG1Refine();
void init(); // Accomplish some initialization that has to wait.
......
......@@ -33,8 +33,10 @@
ConcurrentG1RefineThread::
ConcurrentG1RefineThread(ConcurrentG1Refine* cg1r, ConcurrentG1RefineThread *next,
CardTableEntryClosure* refine_closure,
uint worker_id_offset, uint worker_id) :
ConcurrentGCThread(),
_refine_closure(refine_closure),
_worker_id_offset(worker_id_offset),
_worker_id(worker_id),
_active(false),
......@@ -204,7 +206,7 @@ void ConcurrentG1RefineThread::run() {
if (_next != NULL && !_next->is_active() && curr_buffer_num > _next->_threshold) {
_next->activate();
}
} while (dcqs.apply_closure_to_completed_buffer(_worker_id + _worker_id_offset, cg1r()->green_zone()));
} while (dcqs.apply_closure_to_completed_buffer(_refine_closure, _worker_id + _worker_id_offset, cg1r()->green_zone()));
// We can exit the loop above while being active if there was a yield request.
if (is_active()) {
......
......@@ -28,6 +28,7 @@
#include "gc_implementation/shared/concurrentGCThread.hpp"
// Forward Decl.
class CardTableEntryClosure;
class ConcurrentG1Refine;
// The G1 Concurrent Refinement Thread (could be several in the future).
......@@ -49,6 +50,9 @@ class ConcurrentG1RefineThread: public ConcurrentGCThread {
Monitor* _monitor;
ConcurrentG1Refine* _cg1r;
// The closure applied to completed log buffers.
CardTableEntryClosure* _refine_closure;
int _thread_threshold_step;
// This thread activation threshold
int _threshold;
......@@ -68,6 +72,7 @@ public:
virtual void run();
// Constructor
ConcurrentG1RefineThread(ConcurrentG1Refine* cg1r, ConcurrentG1RefineThread* next,
CardTableEntryClosure* refine_closure,
uint worker_id_offset, uint worker_id);
void initialize();
......
......@@ -70,7 +70,7 @@ bool DirtyCardQueue::apply_closure_to_buffer(CardTableEntryClosure* cl,
DirtyCardQueueSet::DirtyCardQueueSet(bool notify_when_complete) :
PtrQueueSet(notify_when_complete),
_closure(NULL),
_mut_process_closure(NULL),
_shared_dirty_card_queue(this, true /*perm*/),
_free_ids(NULL),
_processed_buffers_mut(0), _processed_buffers_rs_thread(0)
......@@ -83,10 +83,11 @@ uint DirtyCardQueueSet::num_par_ids() {
return (uint)os::processor_count();
}
void DirtyCardQueueSet::initialize(Monitor* cbl_mon, Mutex* fl_lock,
void DirtyCardQueueSet::initialize(CardTableEntryClosure* cl, Monitor* cbl_mon, Mutex* fl_lock,
int process_completed_threshold,
int max_completed_queue,
Mutex* lock, PtrQueueSet* fl_owner) {
_mut_process_closure = cl;
PtrQueueSet::initialize(cbl_mon, fl_lock, process_completed_threshold,
max_completed_queue, fl_owner);
set_buffer_size(G1UpdateBufferSize);
......@@ -98,18 +99,15 @@ void DirtyCardQueueSet::handle_zero_index_for_thread(JavaThread* t) {
t->dirty_card_queue().handle_zero_index();
}
void DirtyCardQueueSet::set_closure(CardTableEntryClosure* closure) {
_closure = closure;
}
void DirtyCardQueueSet::iterate_closure_all_threads(bool consume,
void DirtyCardQueueSet::iterate_closure_all_threads(CardTableEntryClosure* cl,
bool consume,
uint worker_i) {
assert(SafepointSynchronize::is_at_safepoint(), "Must be at safepoint.");
for(JavaThread* t = Threads::first(); t; t = t->next()) {
bool b = t->dirty_card_queue().apply_closure(_closure, consume);
bool b = t->dirty_card_queue().apply_closure(cl, consume);
guarantee(b, "Should not be interrupted.");
}
bool b = shared_dirty_card_queue()->apply_closure(_closure,
bool b = shared_dirty_card_queue()->apply_closure(cl,
consume,
worker_i);
guarantee(b, "Should not be interrupted.");
......@@ -143,7 +141,7 @@ bool DirtyCardQueueSet::mut_process_buffer(void** buf) {
bool b = false;
if (worker_i != UINT_MAX) {
b = DirtyCardQueue::apply_closure_to_buffer(_closure, buf, 0,
b = DirtyCardQueue::apply_closure_to_buffer(_mut_process_closure, buf, 0,
_sz, true, worker_i);
if (b) Atomic::inc(&_processed_buffers_mut);
......@@ -218,18 +216,11 @@ bool DirtyCardQueueSet::apply_closure_to_completed_buffer(CardTableEntryClosure*
return res;
}
bool DirtyCardQueueSet::apply_closure_to_completed_buffer(uint worker_i,
int stop_at,
bool during_pause) {
return apply_closure_to_completed_buffer(_closure, worker_i,
stop_at, during_pause);
}
void DirtyCardQueueSet::apply_closure_to_all_completed_buffers() {
void DirtyCardQueueSet::apply_closure_to_all_completed_buffers(CardTableEntryClosure* cl) {
BufferNode* nd = _completed_buffers_head;
while (nd != NULL) {
bool b =
DirtyCardQueue::apply_closure_to_buffer(_closure,
DirtyCardQueue::apply_closure_to_buffer(cl,
BufferNode::make_buffer_from_node(nd),
0, _sz, false);
guarantee(b, "Should not stop early.");
......@@ -237,6 +228,24 @@ void DirtyCardQueueSet::apply_closure_to_all_completed_buffers() {
}
}
void DirtyCardQueueSet::par_apply_closure_to_all_completed_buffers(CardTableEntryClosure* cl) {
BufferNode* nd = _cur_par_buffer_node;
while (nd != NULL) {
BufferNode* next = (BufferNode*)nd->next();
BufferNode* actual = (BufferNode*)Atomic::cmpxchg_ptr((void*)next, (volatile void*)&_cur_par_buffer_node, (void*)nd);
if (actual == nd) {
bool b =
DirtyCardQueue::apply_closure_to_buffer(cl,
BufferNode::make_buffer_from_node(actual),
0, _sz, false);
guarantee(b, "Should not stop early.");
nd = next;
} else {
nd = actual;
}
}
}
// Deallocates any completed log buffers
void DirtyCardQueueSet::clear() {
BufferNode* buffers_to_delete = NULL;
......
......@@ -73,7 +73,8 @@ public:
class DirtyCardQueueSet: public PtrQueueSet {
CardTableEntryClosure* _closure;
// The closure used in mut_process_buffer().
CardTableEntryClosure* _mut_process_closure;
DirtyCardQueue _shared_dirty_card_queue;
......@@ -88,10 +89,12 @@ class DirtyCardQueueSet: public PtrQueueSet {
jint _processed_buffers_mut;
jint _processed_buffers_rs_thread;
// Current buffer node used for parallel iteration.
BufferNode* volatile _cur_par_buffer_node;
public:
DirtyCardQueueSet(bool notify_when_complete = true);
void initialize(Monitor* cbl_mon, Mutex* fl_lock,
void initialize(CardTableEntryClosure* cl, Monitor* cbl_mon, Mutex* fl_lock,
int process_completed_threshold,
int max_completed_queue,
Mutex* lock, PtrQueueSet* fl_owner = NULL);
......@@ -102,32 +105,14 @@ public:
static void handle_zero_index_for_thread(JavaThread* t);
// Register "blk" as "the closure" for all queues. Only one such closure
// is allowed. The "apply_closure_to_completed_buffer" method will apply
// this closure to a completed buffer, and "iterate_closure_all_threads"
// applies it to partially-filled buffers (the latter should only be done
// with the world stopped).
void set_closure(CardTableEntryClosure* closure);
// If there is a registered closure for buffers, apply it to all entries
// in all currently-active buffers. This should only be applied at a
// safepoint. (Currently must not be called in parallel; this should
// change in the future.) If "consume" is true, processed entries are
// discarded.
void iterate_closure_all_threads(bool consume = true,
// Apply the given closure to all entries in all currently-active buffers.
// This should only be applied at a safepoint. (Currently must not be called
// in parallel; this should change in the future.) If "consume" is true,
// processed entries are discarded.
void iterate_closure_all_threads(CardTableEntryClosure* cl,
bool consume = true,
uint worker_i = 0);
// If there exists some completed buffer, pop it, then apply the
// registered closure to all its elements, nulling out those elements
// processed. If all elements are processed, returns "true". If no
// completed buffers exist, returns false. If a completed buffer exists,
// but is only partially completed before a "yield" happens, the
// partially completed buffer (with its processed elements set to NULL)
// is returned to the completed buffer set, and this call returns false.
bool apply_closure_to_completed_buffer(uint worker_i = 0,
int stop_at = 0,
bool during_pause = false);
// If there exists some completed buffer, pop it, then apply the
// specified closure to all its elements, nulling out those elements
// processed. If all elements are processed, returns "true". If no
......@@ -149,7 +134,12 @@ public:
// Applies the current closure to all completed buffers,
// non-consumptively.
void apply_closure_to_all_completed_buffers();
void apply_closure_to_all_completed_buffers(CardTableEntryClosure* cl);
void reset_for_par_iteration() { _cur_par_buffer_node = _completed_buffers_head; }
// Applies the current closure to all completed buffers, non-consumptively.
// Parallel version.
void par_apply_closure_to_all_completed_buffers(CardTableEntryClosure* cl);
DirtyCardQueue* shared_dirty_card_queue() {
return &_shared_dirty_card_queue;
......
......@@ -98,16 +98,12 @@ size_t G1CollectedHeap::_humongous_object_threshold_in_words = 0;
// Local to this file.
class RefineCardTableEntryClosure: public CardTableEntryClosure {
G1RemSet* _g1rs;
ConcurrentG1Refine* _cg1r;
bool _concurrent;
public:
RefineCardTableEntryClosure(G1RemSet* g1rs,
ConcurrentG1Refine* cg1r) :
_g1rs(g1rs), _cg1r(cg1r), _concurrent(true)
{}
RefineCardTableEntryClosure() : _concurrent(true) { }
bool do_card_ptr(jbyte* card_ptr, uint worker_i) {
bool oops_into_cset = _g1rs->refine_card(card_ptr, worker_i, false);
bool oops_into_cset = G1CollectedHeap::heap()->g1_rem_set()->refine_card(card_ptr, worker_i, false);
// This path is executed by the concurrent refine or mutator threads,
// concurrently, and so we do not care if card_ptr contains references
// that point into the collection set.
......@@ -120,6 +116,7 @@ public:
// Otherwise, we finished successfully; return true.
return true;
}
void set_concurrent(bool b) { _concurrent = b; }
};
......@@ -475,9 +472,8 @@ void G1CollectedHeap::check_ct_logs_at_safepoint() {
// First clear the logged cards.
ClearLoggedCardTableEntryClosure clear;
dcqs.set_closure(&clear);
dcqs.apply_closure_to_all_completed_buffers();
dcqs.iterate_closure_all_threads(false);
dcqs.apply_closure_to_all_completed_buffers(&clear);
dcqs.iterate_closure_all_threads(&clear, false);
clear.print_histo();
// Now ensure that there's no dirty cards.
......@@ -490,9 +486,8 @@ void G1CollectedHeap::check_ct_logs_at_safepoint() {
guarantee(count2.n() == 0, "Card table should be clean.");
RedirtyLoggedCardTableEntryClosure redirty;
JavaThread::dirty_card_queue_set().set_closure(&redirty);
dcqs.apply_closure_to_all_completed_buffers();
dcqs.iterate_closure_all_threads(false);
dcqs.apply_closure_to_all_completed_buffers(&redirty);
dcqs.iterate_closure_all_threads(&redirty, false);
gclog_or_tty->print_cr("Log entries = %d, dirty cards = %d.",
clear.calls(), orig_count);
guarantee(redirty.calls() == clear.calls(),
......@@ -505,8 +500,6 @@ void G1CollectedHeap::check_ct_logs_at_safepoint() {
orig_count, count3.n());
guarantee(count3.n() >= orig_count, "Should have restored them all.");
}
JavaThread::dirty_card_queue_set().set_closure(_refine_cte_cl);
}
// Private class members.
......@@ -2000,7 +1993,9 @@ jint G1CollectedHeap::initialize() {
Universe::check_alignment(max_byte_size, HeapRegion::GrainBytes, "g1 heap");
Universe::check_alignment(max_byte_size, heap_alignment, "g1 heap");
_cg1r = new ConcurrentG1Refine(this);
_refine_cte_cl = new RefineCardTableEntryClosure();
_cg1r = new ConcurrentG1Refine(this, _refine_cte_cl);
// Reserve the maximum.
......@@ -2095,24 +2090,21 @@ jint G1CollectedHeap::initialize() {
// Perform any initialization actions delegated to the policy.
g1_policy()->init();
_refine_cte_cl =
new RefineCardTableEntryClosure(g1_rem_set(),
concurrent_g1_refine());
JavaThread::dirty_card_queue_set().set_closure(_refine_cte_cl);
JavaThread::satb_mark_queue_set().initialize(SATB_Q_CBL_mon,
SATB_Q_FL_lock,
G1SATBProcessCompletedThreshold,
Shared_SATB_Q_lock);
JavaThread::dirty_card_queue_set().initialize(DirtyCardQ_CBL_mon,
JavaThread::dirty_card_queue_set().initialize(_refine_cte_cl,
DirtyCardQ_CBL_mon,
DirtyCardQ_FL_lock,
concurrent_g1_refine()->yellow_zone(),
concurrent_g1_refine()->red_zone(),
Shared_DirtyCardQ_lock);
if (G1DeferredRSUpdate) {
dirty_card_queue_set().initialize(DirtyCardQ_CBL_mon,
dirty_card_queue_set().initialize(NULL, // Should never be called by the Java code
DirtyCardQ_CBL_mon,
DirtyCardQ_FL_lock,
-1, // never trigger processing
-1, // no limit on length
......@@ -2122,7 +2114,8 @@ jint G1CollectedHeap::initialize() {
// Initialize the card queue set used to hold cards containing
// references into the collection set.
_into_cset_dirty_card_queue_set.initialize(DirtyCardQ_CBL_mon,
_into_cset_dirty_card_queue_set.initialize(NULL, // Should never be called by the Java code
DirtyCardQ_CBL_mon,
DirtyCardQ_FL_lock,
-1, // never trigger processing
-1, // no limit on length
......@@ -5284,20 +5277,59 @@ void G1CollectedHeap::unlink_string_and_symbol_table(BoolObjectClosure* is_alive
}
class RedirtyLoggedCardTableEntryFastClosure : public CardTableEntryClosure {
public:
private:
size_t _num_processed;
public:
RedirtyLoggedCardTableEntryFastClosure() : CardTableEntryClosure(), _num_processed(0) { }
bool do_card_ptr(jbyte* card_ptr, uint worker_i) {
*card_ptr = CardTableModRefBS::dirty_card_val();
_num_processed++;
return true;
}
size_t num_processed() const { return _num_processed; }
};
class G1RedirtyLoggedCardsTask : public AbstractGangTask {
private:
DirtyCardQueueSet* _queue;
public:
G1RedirtyLoggedCardsTask(DirtyCardQueueSet* queue) : AbstractGangTask("Redirty Cards"), _queue(queue) { }
virtual void work(uint worker_id) {
double start_time = os::elapsedTime();
RedirtyLoggedCardTableEntryFastClosure cl;
if (G1CollectedHeap::heap()->use_parallel_gc_threads()) {
_queue->par_apply_closure_to_all_completed_buffers(&cl);
} else {
_queue->apply_closure_to_all_completed_buffers(&cl);
}
G1GCPhaseTimes* timer = G1CollectedHeap::heap()->g1_policy()->phase_times();
timer->record_redirty_logged_cards_time_ms(worker_id, (os::elapsedTime() - start_time) * 1000.0);
timer->record_redirty_logged_cards_processed_cards(worker_id, cl.num_processed());
}
};
void G1CollectedHeap::redirty_logged_cards() {
guarantee(G1DeferredRSUpdate, "Must only be called when using deferred RS updates.");
double redirty_logged_cards_start = os::elapsedTime();
RedirtyLoggedCardTableEntryFastClosure redirty;
dirty_card_queue_set().set_closure(&redirty);
dirty_card_queue_set().apply_closure_to_all_completed_buffers();
uint n_workers = (G1CollectedHeap::use_parallel_gc_threads() ?
_g1h->workers()->active_workers() : 1);
G1RedirtyLoggedCardsTask redirty_task(&dirty_card_queue_set());
dirty_card_queue_set().reset_for_par_iteration();
if (use_parallel_gc_threads()) {
set_par_threads(n_workers);
workers()->run_task(&redirty_task);
set_par_threads(0);
} else {
redirty_task.work(0);
}
DirtyCardQueueSet& dcq = JavaThread::dirty_card_queue_set();
dcq.merge_bufferlists(&dirty_card_queue_set());
......
......@@ -173,6 +173,8 @@ G1GCPhaseTimes::G1GCPhaseTimes(uint max_gc_threads) :
_last_gc_worker_end_times_ms(_max_gc_threads, "%.1lf", false),
_last_gc_worker_times_ms(_max_gc_threads, "%.1lf"),
_last_gc_worker_other_times_ms(_max_gc_threads, "%.1lf"),
_last_redirty_logged_cards_time_ms(_max_gc_threads, "%.1lf"),
_last_redirty_logged_cards_processed_cards(_max_gc_threads, SIZE_FORMAT),
_cur_string_dedup_queue_fixup_worker_times_ms(_max_gc_threads, "%.1lf"),
_cur_string_dedup_table_fixup_worker_times_ms(_max_gc_threads, "%.1lf")
{
......@@ -198,6 +200,10 @@ void G1GCPhaseTimes::note_gc_start(uint active_gc_threads) {
_last_gc_worker_end_times_ms.reset();
_last_gc_worker_times_ms.reset();
_last_gc_worker_other_times_ms.reset();
_last_redirty_logged_cards_time_ms.reset();
_last_redirty_logged_cards_processed_cards.reset();
}
void G1GCPhaseTimes::note_gc_end() {
......@@ -233,6 +239,9 @@ void G1GCPhaseTimes::note_gc_end() {
_last_gc_worker_times_ms.verify();
_last_gc_worker_other_times_ms.verify();
_last_redirty_logged_cards_time_ms.verify();
_last_redirty_logged_cards_processed_cards.verify();
}
void G1GCPhaseTimes::note_string_dedup_fixup_start() {
......@@ -352,6 +361,10 @@ void G1GCPhaseTimes::print(double pause_time_sec) {
print_stats(2, "Ref Enq", _cur_ref_enq_time_ms);
if (G1DeferredRSUpdate) {
print_stats(2, "Redirty Cards", _recorded_redirty_logged_cards_time_ms);
if (G1Log::finest()) {
_last_redirty_logged_cards_time_ms.print(3, "Parallel Redirty");
_last_redirty_logged_cards_processed_cards.print(3, "Redirtied Cards");
}
}
print_stats(2, "Free CSet",
(_recorded_young_free_cset_time_ms +
......
......@@ -151,6 +151,8 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
double _recorded_young_cset_choice_time_ms;
double _recorded_non_young_cset_choice_time_ms;
WorkerDataArray<double> _last_redirty_logged_cards_time_ms;
WorkerDataArray<size_t> _last_redirty_logged_cards_processed_cards;
double _recorded_redirty_logged_cards_time_ms;
double _recorded_young_free_cset_time_ms;
......@@ -293,6 +295,14 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
_recorded_non_young_cset_choice_time_ms = time_ms;
}
void record_redirty_logged_cards_time_ms(uint worker_i, double time_ms) {
_last_redirty_logged_cards_time_ms.set(worker_i, time_ms);
}
void record_redirty_logged_cards_processed_cards(uint worker_i, size_t processed_buffers) {
_last_redirty_logged_cards_processed_cards.set(worker_i, processed_buffers);
}
void record_redirty_logged_cards_time_ms(double time_ms) {
_recorded_redirty_logged_cards_time_ms = time_ms;
}
......
......@@ -23,7 +23,7 @@
/*
* @test TestPrintGCDetails
* @bug 8035406 8027295 8035398
* @bug 8035406 8027295 8035398 8019342
* @summary Ensure that the PrintGCDetails output for a minor GC with G1
* includes the expected necessary messages.
* @key gc
......@@ -48,6 +48,8 @@ public class TestGCLogMessages {
OutputAnalyzer output = new OutputAnalyzer(pb.start());
output.shouldNotContain("[Redirty Cards");
output.shouldNotContain("[Parallel Redirty");
output.shouldNotContain("[Redirtied Cards");
output.shouldNotContain("[Code Root Purge");
output.shouldNotContain("[String Dedup Fixup");
output.shouldNotContain("[Young Free CSet");
......@@ -63,6 +65,8 @@ public class TestGCLogMessages {
output = new OutputAnalyzer(pb.start());
output.shouldContain("[Redirty Cards");
output.shouldNotContain("[Parallel Redirty");
output.shouldNotContain("[Redirtied Cards");
output.shouldContain("[Code Root Purge");
output.shouldContain("[String Dedup Fixup");
output.shouldNotContain("[Young Free CSet");
......@@ -80,6 +84,8 @@ public class TestGCLogMessages {
output = new OutputAnalyzer(pb.start());
output.shouldContain("[Redirty Cards");
output.shouldContain("[Parallel Redirty");
output.shouldContain("[Redirtied Cards");
output.shouldContain("[Code Root Purge");
output.shouldContain("[String Dedup Fixup");
output.shouldContain("[Young Free CSet");
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册