From 477822c8a2cd1d7bd50ffc7fbdc8cb51932d6bf6 Mon Sep 17 00:00:00 2001 From: johnc Date: Tue, 25 Jan 2011 10:56:22 -0800 Subject: [PATCH] 6608385: G1: need to support parallel reference processing Summary: Implement support for ParallelRefProcEnabled in the reference processing that takes place at the end of G1 concurrent marking. Reviewed-by: tonyp, ysr --- .../gc_implementation/g1/concurrentMark.cpp | 300 ++++++++++++++++-- .../gc_implementation/g1/concurrentMark.hpp | 11 +- .../vm/gc_implementation/g1/g1_globals.hpp | 8 + src/share/vm/runtime/arguments.cpp | 6 + 4 files changed, 296 insertions(+), 29 deletions(-) diff --git a/src/share/vm/gc_implementation/g1/concurrentMark.cpp b/src/share/vm/gc_implementation/g1/concurrentMark.cpp index e082c1b21..175810b45 100644 --- a/src/share/vm/gc_implementation/g1/concurrentMark.cpp +++ b/src/share/vm/gc_implementation/g1/concurrentMark.cpp @@ -1055,7 +1055,12 @@ public: do { double start_vtime_sec = os::elapsedVTime(); double start_time_sec = os::elapsedTime(); - the_task->do_marking_step(10.0); + double mark_step_duration_ms = G1ConcMarkStepDurationMillis; + + the_task->do_marking_step(mark_step_duration_ms, + true /* do_stealing */, + true /* do_termination */); + double end_time_sec = os::elapsedTime(); double end_vtime_sec = os::elapsedVTime(); double elapsed_vtime_sec = end_vtime_sec - start_vtime_sec; @@ -1111,7 +1116,8 @@ void ConcurrentMark::markFromRoots() { _restart_for_overflow = false; - set_phase(MAX2((size_t) 1, parallel_marking_threads()), true); + size_t active_workers = MAX2((size_t) 1, parallel_marking_threads()); + set_phase(active_workers, true /* concurrent */); CMConcurrentMarkingTask markingTask(this, cmThread()); if (parallel_marking_threads() > 0) @@ -1176,6 +1182,12 @@ void ConcurrentMark::checkpointRootsFinal(bool clear_all_soft_refs) { /* silent */ false, /* use_prev_marking */ false); } + assert(!restart_for_overflow(), "sanity"); + } + + // Reset the marking state if marking completed + if (!restart_for_overflow()) { + set_non_marking_state(); } #if VERIFY_OBJS_PROCESSED @@ -1853,6 +1865,8 @@ void ConcurrentMark::completeCleanup() { assert(local_free_list.is_empty(), "post-condition"); } +// Support closures for reference procssing in G1 + bool G1CMIsAliveClosure::do_object_b(oop obj) { HeapWord* addr = (HeapWord*)obj; return addr != NULL && @@ -1873,11 +1887,17 @@ class G1CMKeepAliveClosure: public OopClosure { virtual void do_oop( oop* p) { do_oop_work(p); } template void do_oop_work(T* p) { - oop thisOop = oopDesc::load_decode_heap_oop(p); - HeapWord* addr = (HeapWord*)thisOop; - if (_g1->is_in_g1_reserved(addr) && _g1->is_obj_ill(thisOop)) { + oop obj = oopDesc::load_decode_heap_oop(p); + HeapWord* addr = (HeapWord*)obj; + + if (_cm->verbose_high()) + gclog_or_tty->print_cr("\t[0] we're looking at location " + "*"PTR_FORMAT" = "PTR_FORMAT, + p, (void*) obj); + + if (_g1->is_in_g1_reserved(addr) && _g1->is_obj_ill(obj)) { _bitMap->mark(addr); - _cm->mark_stack_push(thisOop); + _cm->mark_stack_push(obj); } } }; @@ -1899,6 +1919,199 @@ class G1CMDrainMarkingStackClosure: public VoidClosure { } }; +// 'Keep Alive' closure used by parallel reference processing. +// An instance of this closure is used in the parallel reference processing +// code rather than an instance of G1CMKeepAliveClosure. We could have used +// the G1CMKeepAliveClosure as it is MT-safe. Also reference objects are +// placed on to discovered ref lists once so we can mark and push with no +// need to check whether the object has already been marked. Using the +// G1CMKeepAliveClosure would mean, however, having all the worker threads +// operating on the global mark stack. This means that an individual +// worker would be doing lock-free pushes while it processes its own +// discovered ref list followed by drain call. If the discovered ref lists +// are unbalanced then this could cause interference with the other +// workers. Using a CMTask (and its embedded local data structures) +// avoids that potential interference. +class G1CMParKeepAliveAndDrainClosure: public OopClosure { + ConcurrentMark* _cm; + CMTask* _task; + CMBitMap* _bitMap; + int _ref_counter_limit; + int _ref_counter; + public: + G1CMParKeepAliveAndDrainClosure(ConcurrentMark* cm, + CMTask* task, + CMBitMap* bitMap) : + _cm(cm), _task(task), _bitMap(bitMap), + _ref_counter_limit(G1RefProcDrainInterval) + { + assert(_ref_counter_limit > 0, "sanity"); + _ref_counter = _ref_counter_limit; + } + + virtual void do_oop(narrowOop* p) { do_oop_work(p); } + virtual void do_oop( oop* p) { do_oop_work(p); } + + template void do_oop_work(T* p) { + if (!_cm->has_overflown()) { + oop obj = oopDesc::load_decode_heap_oop(p); + if (_cm->verbose_high()) + gclog_or_tty->print_cr("\t[%d] we're looking at location " + "*"PTR_FORMAT" = "PTR_FORMAT, + _task->task_id(), p, (void*) obj); + + _task->deal_with_reference(obj); + _ref_counter--; + + if (_ref_counter == 0) { + // We have dealt with _ref_counter_limit references, pushing them and objects + // reachable from them on to the local stack (and possibly the global stack). + // Call do_marking_step() to process these entries. We call the routine in a + // loop, which we'll exit if there's nothing more to do (i.e. we're done + // with the entries that we've pushed as a result of the deal_with_reference + // calls above) or we overflow. + // Note: CMTask::do_marking_step() can set the CMTask::has_aborted() flag + // while there may still be some work to do. (See the comment at the + // beginning of CMTask::do_marking_step() for those conditions - one of which + // is reaching the specified time target.) It is only when + // CMTask::do_marking_step() returns without setting the has_aborted() flag + // that the marking has completed. + do { + double mark_step_duration_ms = G1ConcMarkStepDurationMillis; + _task->do_marking_step(mark_step_duration_ms, + false /* do_stealing */, + false /* do_termination */); + } while (_task->has_aborted() && !_cm->has_overflown()); + _ref_counter = _ref_counter_limit; + } + } else { + if (_cm->verbose_high()) + gclog_or_tty->print_cr("\t[%d] CM Overflow", _task->task_id()); + } + } +}; + +class G1CMParDrainMarkingStackClosure: public VoidClosure { + ConcurrentMark* _cm; + CMTask* _task; + public: + G1CMParDrainMarkingStackClosure(ConcurrentMark* cm, CMTask* task) : + _cm(cm), _task(task) + {} + + void do_void() { + do { + if (_cm->verbose_high()) + gclog_or_tty->print_cr("\t[%d] Drain: Calling do marking_step", _task->task_id()); + + // We call CMTask::do_marking_step() to completely drain the local and + // global marking stacks. The routine is called in a loop, which we'll + // exit if there's nothing more to do (i.e. we'completely drained the + // entries that were pushed as a result of applying the + // G1CMParKeepAliveAndDrainClosure to the entries on the discovered ref + // lists above) or we overflow the global marking stack. + // Note: CMTask::do_marking_step() can set the CMTask::has_aborted() flag + // while there may still be some work to do. (See the comment at the + // beginning of CMTask::do_marking_step() for those conditions - one of which + // is reaching the specified time target.) It is only when + // CMTask::do_marking_step() returns without setting the has_aborted() flag + // that the marking has completed. + + _task->do_marking_step(1000000000.0 /* something very large */, + true /* do_stealing */, + true /* do_termination */); + } while (_task->has_aborted() && !_cm->has_overflown()); + } +}; + +// Implementation of AbstractRefProcTaskExecutor for G1 +class G1RefProcTaskExecutor: public AbstractRefProcTaskExecutor { +private: + G1CollectedHeap* _g1h; + ConcurrentMark* _cm; + CMBitMap* _bitmap; + WorkGang* _workers; + int _active_workers; + +public: + G1RefProcTaskExecutor(G1CollectedHeap* g1h, + ConcurrentMark* cm, + CMBitMap* bitmap, + WorkGang* workers, + int n_workers) : + _g1h(g1h), _cm(cm), _bitmap(bitmap), + _workers(workers), _active_workers(n_workers) + { } + + // Executes the given task using concurrent marking worker threads. + virtual void execute(ProcessTask& task); + virtual void execute(EnqueueTask& task); +}; + +class G1RefProcTaskProxy: public AbstractGangTask { + typedef AbstractRefProcTaskExecutor::ProcessTask ProcessTask; + ProcessTask& _proc_task; + G1CollectedHeap* _g1h; + ConcurrentMark* _cm; + CMBitMap* _bitmap; + +public: + G1RefProcTaskProxy(ProcessTask& proc_task, + G1CollectedHeap* g1h, + ConcurrentMark* cm, + CMBitMap* bitmap) : + AbstractGangTask("Process reference objects in parallel"), + _proc_task(proc_task), _g1h(g1h), _cm(cm), _bitmap(bitmap) + {} + + virtual void work(int i) { + CMTask* marking_task = _cm->task(i); + G1CMIsAliveClosure g1_is_alive(_g1h); + G1CMParKeepAliveAndDrainClosure g1_par_keep_alive(_cm, marking_task, _bitmap); + G1CMParDrainMarkingStackClosure g1_par_drain(_cm, marking_task); + + _proc_task.work(i, g1_is_alive, g1_par_keep_alive, g1_par_drain); + } +}; + +void G1RefProcTaskExecutor::execute(ProcessTask& proc_task) { + assert(_workers != NULL, "Need parallel worker threads."); + + G1RefProcTaskProxy proc_task_proxy(proc_task, _g1h, _cm, _bitmap); + + // We need to reset the phase for each task execution so that + // the termination protocol of CMTask::do_marking_step works. + _cm->set_phase(_active_workers, false /* concurrent */); + _g1h->set_par_threads(_active_workers); + _workers->run_task(&proc_task_proxy); + _g1h->set_par_threads(0); +} + +class G1RefEnqueueTaskProxy: public AbstractGangTask { + typedef AbstractRefProcTaskExecutor::EnqueueTask EnqueueTask; + EnqueueTask& _enq_task; + +public: + G1RefEnqueueTaskProxy(EnqueueTask& enq_task) : + AbstractGangTask("Enqueue reference objects in parallel"), + _enq_task(enq_task) + { } + + virtual void work(int i) { + _enq_task.work(i); + } +}; + +void G1RefProcTaskExecutor::execute(EnqueueTask& enq_task) { + assert(_workers != NULL, "Need parallel worker threads."); + + G1RefEnqueueTaskProxy enq_task_proxy(enq_task); + + _g1h->set_par_threads(_active_workers); + _workers->run_task(&enq_task_proxy); + _g1h->set_par_threads(0); +} + void ConcurrentMark::weakRefsWork(bool clear_all_soft_refs) { ResourceMark rm; HandleMark hm; @@ -1917,18 +2130,52 @@ void ConcurrentMark::weakRefsWork(bool clear_all_soft_refs) { G1CMDrainMarkingStackClosure g1_drain_mark_stack(nextMarkBitMap(), &_markStack, &g1_keep_alive); - // XXXYYY Also: copy the parallel ref processing code from CMS. - rp->process_discovered_references(&g1_is_alive, - &g1_keep_alive, - &g1_drain_mark_stack, - NULL); + // We use the work gang from the G1CollectedHeap and we utilize all + // the worker threads. + int active_workers = MAX2(MIN2(g1h->workers()->total_workers(), (int)_max_task_num), 1); + + G1RefProcTaskExecutor par_task_executor(g1h, this, nextMarkBitMap(), + g1h->workers(), active_workers); + + if (rp->processing_is_mt()) { + // Set the degree of MT here. If the discovery is done MT, there + // may have been a different number of threads doing the discovery + // and a different number of discovered lists may have Ref objects. + // That is OK as long as the Reference lists are balanced (see + // balance_all_queues() and balance_queues()). + rp->set_mt_degree(active_workers); + + rp->process_discovered_references(&g1_is_alive, + &g1_keep_alive, + &g1_drain_mark_stack, + &par_task_executor); + + // The work routines of the parallel keep_alive and drain_marking_stack + // will set the has_overflown flag if we overflow the global marking + // stack. + } else { + rp->process_discovered_references(&g1_is_alive, + &g1_keep_alive, + &g1_drain_mark_stack, + NULL); + + } + assert(_markStack.overflow() || _markStack.isEmpty(), - "mark stack should be empty (unless it overflowed)"); + "mark stack should be empty (unless it overflowed)"); if (_markStack.overflow()) { + // Should have been done already when we tried to push an + // entry on to the global mark stack. But let's do it again. set_has_overflown(); } - rp->enqueue_discovered_references(); + if (rp->processing_is_mt()) { + assert(rp->num_q() == active_workers, "why not"); + rp->enqueue_discovered_references(&par_task_executor); + } else { + rp->enqueue_discovered_references(); + } + rp->verify_no_references_recorded(); assert(!rp->discovery_enabled(), "should have been disabled"); @@ -1955,7 +2202,9 @@ public: CMTask* task = _cm->task(worker_i); task->record_start_time(); do { - task->do_marking_step(1000000000.0 /* something very large */); + task->do_marking_step(1000000000.0 /* something very large */, + true /* do_stealing */, + true /* do_termination */); } while (task->has_aborted() && !_cm->has_overflown()); // If we overflow, then we do not want to restart. We instead // want to abort remark and do concurrent marking again. @@ -1978,7 +2227,7 @@ void ConcurrentMark::checkpointRootsFinalWork() { G1CollectedHeap::StrongRootsScope srs(g1h); // this is remark, so we'll use up all available threads int active_workers = ParallelGCThreads; - set_phase(active_workers, false); + set_phase(active_workers, false /* concurrent */); CMRemarkTask remarkTask(this); // We will start all available threads, even if we decide that the @@ -1992,7 +2241,7 @@ void ConcurrentMark::checkpointRootsFinalWork() { G1CollectedHeap::StrongRootsScope srs(g1h); // this is remark, so we'll use up all available threads int active_workers = 1; - set_phase(active_workers, false); + set_phase(active_workers, false /* concurrent */); CMRemarkTask remarkTask(this); // We will start all available threads, even if we decide that the @@ -2005,9 +2254,6 @@ void ConcurrentMark::checkpointRootsFinalWork() { print_stats(); - if (!restart_for_overflow()) - set_non_marking_state(); - #if VERIFY_OBJS_PROCESSED if (_scan_obj_cl.objs_processed != ThreadLocalObjQueue::objs_enqueued) { gclog_or_tty->print_cr("Processed = %d, enqueued = %d.", @@ -3124,7 +3370,7 @@ void CMTask::deal_with_reference(oop obj) { // do nothing } #else // _CHECK_BOTH_FINGERS_ - // we will only check the global finger + // we will only check the global finger if (objAddr < global_finger) { // see long comment above @@ -3249,7 +3495,7 @@ void CMTask::regular_clock_call() { double elapsed_time_ms = curr_time_ms - _start_time_ms; if (elapsed_time_ms > _time_target_ms) { set_has_aborted(); - _has_aborted_timed_out = true; + _has_timed_out = true; statsOnly( ++_aborted_timed_out ); return; } @@ -3754,7 +4000,9 @@ void CMTask::print_stats() { *****************************************************************************/ -void CMTask::do_marking_step(double time_target_ms) { +void CMTask::do_marking_step(double time_target_ms, + bool do_stealing, + bool do_termination) { assert(time_target_ms >= 1.0, "minimum granularity is 1ms"); assert(concurrent() == _cm->concurrent(), "they should be the same"); @@ -3794,7 +4042,7 @@ void CMTask::do_marking_step(double time_target_ms) { // clear all flags clear_has_aborted(); - _has_aborted_timed_out = false; + _has_timed_out = false; _draining_satb_buffers = false; ++_calls; @@ -3970,7 +4218,7 @@ void CMTask::do_marking_step(double time_target_ms) { drain_global_stack(false); // Attempt at work stealing from other task's queues. - if (!has_aborted()) { + if (do_stealing && !has_aborted()) { // We have not aborted. This means that we have finished all that // we could. Let's try to do some stealing... @@ -4011,7 +4259,7 @@ void CMTask::do_marking_step(double time_target_ms) { // We still haven't aborted. Now, let's try to get into the // termination protocol. - if (!has_aborted()) { + if (do_termination && !has_aborted()) { // We cannot check whether the global stack is empty, since other // tasks might be concurrently pushing objects on it. We also cannot // check if the region stack is empty because if a thread is aborting @@ -4087,7 +4335,7 @@ void CMTask::do_marking_step(double time_target_ms) { statsOnly( ++_aborted ); - if (_has_aborted_timed_out) { + if (_has_timed_out) { double diff_ms = elapsed_time_ms - _time_target_ms; // Keep statistics of how well we did with respect to hitting // our target only if we actually timed out (if we aborted for diff --git a/src/share/vm/gc_implementation/g1/concurrentMark.hpp b/src/share/vm/gc_implementation/g1/concurrentMark.hpp index 9aa961bae..e256d4e1f 100644 --- a/src/share/vm/gc_implementation/g1/concurrentMark.hpp +++ b/src/share/vm/gc_implementation/g1/concurrentMark.hpp @@ -353,6 +353,10 @@ class ConcurrentMark: public CHeapObj { friend class CMConcurrentMarkingTask; friend class G1ParNoteEndTask; friend class CalcLiveObjectsClosure; + friend class G1RefProcTaskProxy; + friend class G1RefProcTaskExecutor; + friend class G1CMParKeepAliveAndDrainClosure; + friend class G1CMParDrainMarkingStackClosure; protected: ConcurrentMarkThread* _cmThread; // the thread doing the work @@ -936,7 +940,7 @@ private: // if this is true, then the task has aborted for some reason bool _has_aborted; // set when the task aborts because it has met its time quota - bool _has_aborted_timed_out; + bool _has_timed_out; // true when we're draining SATB buffers; this avoids the task // aborting due to SATB buffers being available (as we're already // dealing with them) @@ -1041,7 +1045,7 @@ public: // trying not to exceed the given duration. However, it might exit // prematurely, according to some conditions (i.e. SATB buffers are // available for processing). - void do_marking_step(double target_ms); + void do_marking_step(double target_ms, bool do_stealing, bool do_termination); // These two calls start and stop the timer void record_start_time() { @@ -1063,7 +1067,8 @@ public: bool has_aborted() { return _has_aborted; } void set_has_aborted() { _has_aborted = true; } void clear_has_aborted() { _has_aborted = false; } - bool claimed() { return _claimed; } + bool has_timed_out() { return _has_timed_out; } + bool claimed() { return _claimed; } // Support routines for the partially scanned region that may be // recorded as a result of aborting while draining the CMRegionStack diff --git a/src/share/vm/gc_implementation/g1/g1_globals.hpp b/src/share/vm/gc_implementation/g1/g1_globals.hpp index b6fcf614f..0e5d23189 100644 --- a/src/share/vm/gc_implementation/g1/g1_globals.hpp +++ b/src/share/vm/gc_implementation/g1/g1_globals.hpp @@ -81,6 +81,14 @@ product(intx, G1MarkRegionStackSize, 1024 * 1024, \ "Size of the region stack for concurrent marking.") \ \ + product(double, G1ConcMarkStepDurationMillis, 10.0, \ + "Target duration of individual concurrent marking steps " \ + "in milliseconds.") \ + \ + product(intx, G1RefProcDrainInterval, 10, \ + "The number of discovered reference objects to process before " \ + "draining concurrent marking work queues.") \ + \ develop(bool, G1SATBBarrierPrintNullPreVals, false, \ "If true, count frac of ptr writes with null pre-vals.") \ \ diff --git a/src/share/vm/runtime/arguments.cpp b/src/share/vm/runtime/arguments.cpp index b106d08fc..ff1abc55b 100644 --- a/src/share/vm/runtime/arguments.cpp +++ b/src/share/vm/runtime/arguments.cpp @@ -1941,10 +1941,16 @@ bool Arguments::check_vm_args_consistency() { status = false; } +#ifndef SERIALGC if (UseG1GC) { status = status && verify_percentage(InitiatingHeapOccupancyPercent, "InitiatingHeapOccupancyPercent"); + status = status && verify_min_value(G1RefProcDrainInterval, 1, + "G1RefProcDrainInterval"); + status = status && verify_min_value((intx)G1ConcMarkStepDurationMillis, 1, + "G1ConcMarkStepDurationMillis"); } +#endif status = status && verify_interval(RefDiscoveryPolicy, ReferenceProcessor::DiscoveryPolicyMin, -- GitLab