[Misc] print the detail time when processing the gc roots

Summary: - port D64010 to dragonwell - print the detail time when doing root processing in generational heap collection Test Plan: hotspot/test/gc/TestPrintGCRootsTraceTime.java hotspot/test/gc/ Reviewers: 传胜 Differential Revision: https://aone.alibaba-inc.com/code/D851700

[Misc] print the detail time when processing the gc roots
Summary: - port D64010 to dragonwell - print the detail time when doing root processing in generational heap collection Test Plan: hotspot/test/gc/TestPrintGCRootsTraceTime.java hotspot/test/gc/ Reviewers: 传胜 Differential Revision: https://aone.alibaba-inc.com/code/D851700
87e611ef · tongbao.ztb · 云矅 · 0dfd4203 · 87e611ef · 87e611ef
17 changed file
--- a/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp
+++ b/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp
@@ -3058,6 +3058,8 @@ void CMSCollector::verify_after_remark_work_1() {
  HandleMark  hm;
  GenCollectedHeap* gch = GenCollectedHeap::heap();

+  GenGCPhaseTimes* phase_times = gch->gen_policy()->phase_times();
+
  // Get a clear set of claim bits for the roots processing to work with.
  ClassLoaderDataGraph::clear_claimed_marks();

@@ -3065,6 +3067,8 @@ void CMSCollector::verify_after_remark_work_1() {
  MarkRefsIntoClosure notOlder(_span, verification_mark_bm());
  gch->rem_set()->prepare_for_younger_refs_iterate(false); // Not parallel.

+  phase_times->note_gc_start(1);
+  phase_times->record_time_secs(GenGCPhaseTimes::GCWorkerStart, 0, os::elapsedTime());
  gch->gen_process_roots(_cmsGen->level(),
                         true,   // younger gens are roots
                         true,   // activate StrongRootsScope
@@ -3072,7 +3076,10 @@ void CMSCollector::verify_after_remark_work_1() {
                         should_unload_classes(),
                         &notOlder,
                         NULL,
-                         NULL);  // SSS: Provide correct closure
+                         NULL,   // SSS: Provide correct closure
+                         phase_times,
+                         0);
+  phase_times->record_time_secs(GenGCPhaseTimes::GCWorkerEnd, 0, os::elapsedTime());

  // Now mark from the roots
  MarkFromRootsClosure markFromRootsClosure(this, _span,
@@ -3123,6 +3130,7 @@ void CMSCollector::verify_after_remark_work_2() {
  HandleMark  hm;
  GenCollectedHeap* gch = GenCollectedHeap::heap();

+  GenGCPhaseTimes* phase_times = gch->gen_policy()->phase_times();
  // Get a clear set of claim bits for the roots processing to work with.
  ClassLoaderDataGraph::clear_claimed_marks();

@@ -3133,6 +3141,8 @@ void CMSCollector::verify_after_remark_work_2() {

  gch->rem_set()->prepare_for_younger_refs_iterate(false); // Not parallel.

+  phase_times->note_gc_start(1);
+  phase_times->record_time_secs(GenGCPhaseTimes::GCWorkerStart, 0, os::elapsedTime());
  gch->gen_process_roots(_cmsGen->level(),
                         true,   // younger gens are roots
                         true,   // activate StrongRootsScope
@@ -3140,8 +3150,10 @@ void CMSCollector::verify_after_remark_work_2() {
                         should_unload_classes(),
                         &notOlder,
                         NULL,
-                         &cld_closure);
-
+                         &cld_closure,
+                         phase_times,
+                         0);
+  phase_times->record_time_secs(GenGCPhaseTimes::GCWorkerEnd, 0, os::elapsedTime());
  // Now mark from the roots
  MarkFromRootsVerifyClosure markFromRootsClosure(this, _span,
    verification_mark_bm(), markBitMap(), verification_mark_stack());
@@ -3715,6 +3727,7 @@ void CMSCollector::checkpointRootsInitialWork(bool asynch) {
  // Update the saved marks which may affect the root scans.
  gch->save_marks();

+  GenGCPhaseTimes* phase_times = gch->gen_policy()->phase_times();
  // weak reference processing has not started yet.
  ref_processor()->set_enqueuing_is_done(false);

@@ -3737,6 +3750,7 @@ void CMSCollector::checkpointRootsInitialWork(bool asynch) {
      FlexibleWorkGang* workers = gch->workers();
      assert(workers != NULL, "Need parallel worker threads.");
      int n_workers = workers->active_workers();
+      phase_times->note_gc_start(n_workers);
      CMSParInitialMarkTask tsk(this, n_workers);
      gch->set_par_threads(n_workers);
      initialize_sequential_subtasks_for_young_gen_rescan(n_workers);
@@ -3752,6 +3766,8 @@ void CMSCollector::checkpointRootsInitialWork(bool asynch) {
      // The serial version.
      CLDToOopClosure cld_closure(&notOlder, true);
      gch->rem_set()->prepare_for_younger_refs_iterate(false); // Not parallel.
+      phase_times->note_gc_start(1);
+      phase_times->record_time_secs(GenGCPhaseTimes::GCWorkerStart, 0, os::elapsedTime());
      gch->gen_process_roots(_cmsGen->level(),
                             true,   // younger gens are roots
                             true,   // activate StrongRootsScope
@@ -3759,7 +3775,14 @@ void CMSCollector::checkpointRootsInitialWork(bool asynch) {
                             should_unload_classes(),
                             &notOlder,
                             NULL,
-                             &cld_closure);
+                             &cld_closure,
+                             phase_times,
+                             0);
+      phase_times->record_time_secs(GenGCPhaseTimes::GCWorkerEnd, 0, os::elapsedTime());
+    }
+    phase_times->note_gc_end();
+    if (PrintGCRootsTraceTime && PrintGCDetails) {
+      phase_times->log_gc_details();
    }
  }

@@ -5234,6 +5257,8 @@ void CMSParInitialMarkTask::work(uint worker_id) {
  // ---------- scan from roots --------------
  _timer.start();
  GenCollectedHeap* gch = GenCollectedHeap::heap();
+  GenGCPhaseTimes* phase_times = gch->gen_policy()->phase_times();
+  phase_times->record_time_secs(GenGCPhaseTimes::GCWorkerStart, worker_id, os::elapsedTime());
  Par_MarkRefsIntoClosure par_mri_cl(_collector->_span, &(_collector->_markBitMap));

  // ---------- young gen roots --------------
@@ -5260,7 +5285,9 @@ void CMSParInitialMarkTask::work(uint worker_id) {
                         _collector->should_unload_classes(),
                         &par_mri_cl,
                         NULL,
-                         &cld_closure);
+                         &cld_closure,
+                         phase_times,
+                         worker_id);
  assert(_collector->should_unload_classes()
         || (_collector->CMSCollector::roots_scanning_options() & GenCollectedHeap::SO_AllCodeCache),
         "if we didn't scan the code cache, we have to be ready to drop nmethods with expired weak oops");
@@ -5270,6 +5297,7 @@ void CMSParInitialMarkTask::work(uint worker_id) {
      "Finished remaining root initial mark scan work in %dth thread: %3.3f sec",
      worker_id, _timer.seconds());
  }
+  phase_times->record_time_secs(GenGCPhaseTimes::GCWorkerEnd, worker_id, os::elapsedTime());
 }

 // Parallel remark task
@@ -5366,6 +5394,8 @@ void CMSParRemarkTask::work(uint worker_id) {
  // ---------- rescan from roots --------------
  _timer.start();
  GenCollectedHeap* gch = GenCollectedHeap::heap();
+  GenGCPhaseTimes* phase_times = gch->gen_policy()->phase_times();
+  phase_times->record_time_secs(GenGCPhaseTimes::GCWorkerStart, worker_id, os::elapsedTime());
  Par_MarkRefsIntoAndScanClosure par_mrias_cl(_collector,
    _collector->_span, _collector->ref_processor(),
    &(_collector->_markBitMap),
@@ -5396,7 +5426,9 @@ void CMSParRemarkTask::work(uint worker_id) {
                         _collector->should_unload_classes(),
                         &par_mrias_cl,
                         NULL,
-                         NULL);     // The dirty klasses will be handled below
+                         NULL,      // The dirty klasses will be handled below
+                         phase_times,
+                         worker_id);

  assert(_collector->should_unload_classes()
         || (_collector->CMSCollector::roots_scanning_options() & GenCollectedHeap::SO_AllCodeCache),
@@ -5481,6 +5513,7 @@ void CMSParRemarkTask::work(uint worker_id) {
      "Finished work stealing in %dth thread: %3.3f sec",
      worker_id, _timer.seconds());
  }
+  phase_times->record_time_secs(GenGCPhaseTimes::GCWorkerEnd, worker_id, os::elapsedTime());
 }

 // Note that parameter "i" is not used.
@@ -5898,6 +5931,8 @@ void CMSCollector::do_remark_parallel() {
  // claimed by the parallel threads.
  cms_space->initialize_sequential_subtasks_for_rescan(n_workers);

+  GenGCPhaseTimes* phase_times = gch->gen_policy()->phase_times();
+  phase_times->note_gc_start(n_workers);
  // It turns out that even when we're using 1 thread, doing the work in a
  // separate thread causes wide variance in run times.  We can't help this
  // in the multi-threaded case, but we special-case n=1 here to get
@@ -5915,6 +5950,11 @@ void CMSCollector::do_remark_parallel() {
    tsk.work(0);
  }

+  phase_times->note_gc_end();
+  if (PrintGCRootsTraceTime && PrintGCDetails) {
+    phase_times->log_gc_details();
+  }
+
  gch->set_par_threads(0);  // 0 ==> non-parallel.
  // restore, single-threaded for now, any preserved marks
  // as a result of work_q overflow
@@ -5927,7 +5967,7 @@ void CMSCollector::do_remark_non_parallel() {
  HandleMark   hm;
  GenCollectedHeap* gch = GenCollectedHeap::heap();
  ReferenceProcessorMTDiscoveryMutator mt(ref_processor(), false);
-
+  GenGCPhaseTimes* phase_times = gch->gen_policy()->phase_times();
  MarkRefsIntoAndScanClosure
    mrias_cl(_span, ref_processor(), &_markBitMap, NULL /* not precleaning */,
             &_markStack, this,
@@ -5981,6 +6021,8 @@ void CMSCollector::do_remark_non_parallel() {
    gch->rem_set()->prepare_for_younger_refs_iterate(false); // Not parallel.
    GenCollectedHeap::StrongRootsScope srs(gch);

+    phase_times->note_gc_start(1);
+    phase_times->record_time_secs(GenGCPhaseTimes::GCWorkerStart, 0, os::elapsedTime());
    gch->gen_process_roots(_cmsGen->level(),
                           true,  // younger gens as roots
                           false, // use the local StrongRootsScope
@@ -5988,7 +6030,10 @@ void CMSCollector::do_remark_non_parallel() {
                           should_unload_classes(),
                           &mrias_cl,
                           NULL,
-                           NULL); // The dirty klasses will be handled below
+                           NULL,  // The dirty klasses will be handled below
+                           phase_times,
+                           0);
+    phase_times->record_time_secs(GenGCPhaseTimes::GCWorkerEnd, 0, os::elapsedTime());

    assert(should_unload_classes()
           || (roots_scanning_options() & GenCollectedHeap::SO_AllCodeCache),

--- a/src/share/vm/gc_implementation/g1/g1GCPhaseTimes.cpp
+++ b/src/share/vm/gc_implementation/g1/g1GCPhaseTimes.cpp
 /*
- * Copyright (c) 2013, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -27,223 +27,10 @@
 #include "gc_implementation/g1/g1GCPhaseTimes.hpp"
 #include "gc_implementation/g1/g1Log.hpp"
 #include "gc_implementation/g1/g1StringDedup.hpp"
+#include "gc_implementation/shared/workerDataArray.inline.hpp"
 #include "memory/allocation.hpp"
 #include "runtime/os.hpp"

-// Helper class for avoiding interleaved logging
-class LineBuffer: public StackObj {
-
-private:
-  static const int BUFFER_LEN = 1024;
-  static const int INDENT_CHARS = 3;
-  char _buffer[BUFFER_LEN];
-  int _indent_level;
-  int _cur;
-
-  void vappend(const char* format, va_list ap)  ATTRIBUTE_PRINTF(2, 0) {
-    int res = os::vsnprintf(&_buffer[_cur], BUFFER_LEN - _cur, format, ap);
-    if (res > BUFFER_LEN) {
-      DEBUG_ONLY(warning("buffer too small in LineBuffer");)
-      _buffer[BUFFER_LEN -1] = 0;
-      _cur = BUFFER_LEN; // vsnprintf above should not add to _buffer if we are called again
-    } else if (res != -1) {
-      _cur += res;
-    }
-  }
-
-public:
-  explicit LineBuffer(int indent_level): _indent_level(indent_level), _cur(0) {
-    for (; (_cur < BUFFER_LEN && _cur < (_indent_level * INDENT_CHARS)); _cur++) {
-      _buffer[_cur] = ' ';
-    }
-  }
-
-#ifndef PRODUCT
-  ~LineBuffer() {
-    assert(_cur == _indent_level * INDENT_CHARS, "pending data in buffer - append_and_print_cr() not called?");
-  }
-#endif
-
-  void append(const char* format, ...)  ATTRIBUTE_PRINTF(2, 3) {
-    va_list ap;
-    va_start(ap, format);
-    vappend(format, ap);
-    va_end(ap);
-  }
-
-  void print_cr() {
-    gclog_or_tty->print_cr("%s", _buffer);
-    _cur = _indent_level * INDENT_CHARS;
-  }
-
-  void append_and_print_cr(const char* format, ...)  ATTRIBUTE_PRINTF(2, 3) {
-    va_list ap;
-    va_start(ap, format);
-    vappend(format, ap);
-    va_end(ap);
-    print_cr();
-  }
-};
-
-template <class T>
-class WorkerDataArray  : public CHeapObj<mtGC> {
-  friend class G1GCParPhasePrinter;
-  T*          _data;
-  uint        _length;
-  const char* _title;
-  bool        _print_sum;
-  int         _log_level;
-  uint        _indent_level;
-  bool        _enabled;
-
-  WorkerDataArray<size_t>* _thread_work_items;
-
-  NOT_PRODUCT(T uninitialized();)
-
-  // We are caching the sum and average to only have to calculate them once.
-  // This is not done in an MT-safe way. It is intended to allow single
-  // threaded code to call sum() and average() multiple times in any order
-  // without having to worry about the cost.
-  bool   _has_new_data;
-  T      _sum;
-  T      _min;
-  T      _max;
-  double _average;
-
- public:
-  WorkerDataArray(uint length, const char* title, bool print_sum, int log_level, uint indent_level) :
-    _title(title), _length(0), _print_sum(print_sum), _log_level(log_level), _indent_level(indent_level),
-    _has_new_data(true), _thread_work_items(NULL), _enabled(true) {
-    assert(length > 0, "Must have some workers to store data for");
-    _length = length;
-    _data = NEW_C_HEAP_ARRAY(T, _length, mtGC);
-  }
-
-  ~WorkerDataArray() {
-    FREE_C_HEAP_ARRAY(T, _data, mtGC);
-  }
-
-  void link_thread_work_items(WorkerDataArray<size_t>* thread_work_items) {
-    _thread_work_items = thread_work_items;
-  }
-
-  WorkerDataArray<size_t>* thread_work_items() { return _thread_work_items; }
-
-  void set(uint worker_i, T value) {
-    assert(worker_i < _length, err_msg("Worker %d is greater than max: %d", worker_i, _length));
-    assert(_data[worker_i] == WorkerDataArray<T>::uninitialized(), err_msg("Overwriting data for worker %d in %s", worker_i, _title));
-    _data[worker_i] = value;
-    _has_new_data = true;
-  }
-
-  void set_thread_work_item(uint worker_i, size_t value) {
-    assert(_thread_work_items != NULL, "No sub count");
-    _thread_work_items->set(worker_i, value);
-  }
-
-  T get(uint worker_i) {
-    assert(worker_i < _length, err_msg("Worker %d is greater than max: %d", worker_i, _length));
-    assert(_data[worker_i] != WorkerDataArray<T>::uninitialized(), err_msg("No data added for worker %d", worker_i));
-    return _data[worker_i];
-  }
-
-  void add(uint worker_i, T value) {
-    assert(worker_i < _length, err_msg("Worker %d is greater than max: %d", worker_i, _length));
-    assert(_data[worker_i] != WorkerDataArray<T>::uninitialized(), err_msg("No data to add to for worker %d", worker_i));
-    _data[worker_i] += value;
-    _has_new_data = true;
-  }
-
-  double average(uint active_threads){
-    calculate_totals(active_threads);
-    return _average;
-  }
-
-  T sum(uint active_threads) {
-    calculate_totals(active_threads);
-    return _sum;
-  }
-
-  T minimum(uint active_threads) {
-    calculate_totals(active_threads);
-    return _min;
-  }
-
-  T maximum(uint active_threads) {
-    calculate_totals(active_threads);
-    return _max;
-  }
-
-  void reset() PRODUCT_RETURN;
-  void verify(uint active_threads) PRODUCT_RETURN;
-
-  void set_enabled(bool enabled) { _enabled = enabled; }
-
-  int log_level() { return _log_level;  }
-
- private:
-
-  void calculate_totals(uint active_threads){
-    if (!_has_new_data) {
-      return;
-    }
-
-    _sum = (T)0;
-    _min = _data[0];
-    _max = _min;
-    assert(active_threads <= _length, "Wrong number of active threads");
-    for (uint i = 0; i < active_threads; ++i) {
-      T val = _data[i];
-      _sum += val;
-      _min = MIN2(_min, val);
-      _max = MAX2(_max, val);
-    }
-    _average = (double)_sum / (double)active_threads;
-    _has_new_data = false;
-  }
-};
-
-
-#ifndef PRODUCT
-
-template <>
-size_t WorkerDataArray<size_t>::uninitialized() {
-  return (size_t)-1;
-}
-
-template <>
-double WorkerDataArray<double>::uninitialized() {
-  return -1.0;
-}
-
-template <class T>
-void WorkerDataArray<T>::reset() {
-  for (uint i = 0; i < _length; i++) {
-    _data[i] = WorkerDataArray<T>::uninitialized();
-  }
-  if (_thread_work_items != NULL) {
-    _thread_work_items->reset();
-  }
-}
-
-template <class T>
-void WorkerDataArray<T>::verify(uint active_threads) {
-  if (!_enabled) {
-    return;
-  }
-
-  assert(active_threads <= _length, "Wrong number of active threads");
-  for (uint i = 0; i < active_threads; i++) {
-    assert(_data[i] != WorkerDataArray<T>::uninitialized(),
-        err_msg("Invalid data for worker %u in '%s'", i, _title));
-  }
-  if (_thread_work_items != NULL) {
-    _thread_work_items->verify(active_threads);
-  }
-}
-
-#endif
-
 G1GCPhaseTimes::G1GCPhaseTimes(uint max_gc_threads) :
  _max_gc_threads(max_gc_threads)
 {

--- a/src/share/vm/gc_implementation/g1/g1GCPhaseTimes.hpp
+++ b/src/share/vm/gc_implementation/g1/g1GCPhaseTimes.hpp
 /*
- * Copyright (c) 2013, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -26,6 +26,7 @@
 #define SHARE_VM_GC_IMPLEMENTATION_G1_G1GCPHASETIMESLOG_HPP

 #include "memory/allocation.hpp"
+#include "gc_implementation/shared/workerDataArray.hpp"

 class LineBuffer;


--- a/src/share/vm/gc_implementation/parNew/parNewGeneration.cpp
+++ b/src/share/vm/gc_implementation/parNew/parNewGeneration.cpp
@@ -598,6 +598,9 @@ void ParNewGenTask::set_for_termination(int active_workers) {

 void ParNewGenTask::work(uint worker_id) {
  GenCollectedHeap* gch = GenCollectedHeap::heap();
+  GenGCPhaseTimes* phase_times = gch->gen_policy()->phase_times();
+  phase_times->record_time_secs(GenGCPhaseTimes::GCWorkerStart, worker_id, os::elapsedTime());
+
  // Since this is being done in a separate thread, need new resource
  // and handle marks.
  ResourceMark rm;
@@ -627,12 +630,15 @@ void ParNewGenTask::work(uint worker_id) {
                         GenCollectedHeap::StrongAndWeakRoots,
                         &par_scan_state.to_space_root_closure(),
                         &par_scan_state.older_gen_closure(),
-                         &cld_scan_closure);
+                         &cld_scan_closure,
+                         phase_times,
+                         worker_id);

  par_scan_state.end_strong_roots();

  // "evacuate followers".
  par_scan_state.evacuate_followers_closure().do_void();
+  phase_times->record_time_secs(GenGCPhaseTimes::GCWorkerEnd, worker_id, os::elapsedTime());
 }

 #ifdef _MSC_VER
@@ -919,6 +925,7 @@ void ParNewGeneration::collect(bool   full,

  GenCollectedHeap* gch = GenCollectedHeap::heap();

+  GenGCPhaseTimes* phase_times = gch->gen_policy()->phase_times();
  _gc_timer->register_gc_start();

  assert(gch->kind() == CollectedHeap::GenCollectedHeap,
@@ -931,6 +938,7 @@ void ParNewGeneration::collect(bool   full,
                                   workers->active_workers(),
                                   Threads::number_of_non_daemon_threads());
  workers->set_active_workers(active_workers);
+  phase_times->note_gc_start(active_workers);
  assert(gch->n_gens() == 2,
         "Par collection currently only works with single older gen.");
  _next_gen = gch->next_gen(this);
@@ -999,6 +1007,10 @@ void ParNewGeneration::collect(bool   full,
  thread_state_set.reset(0 /* Bad value in debug if not reset */,
                         promotion_failed());

+  phase_times->note_gc_end();
+  if (PrintGCRootsTraceTime && PrintGCDetails) {
+    phase_times->log_gc_details();
+  }
  // Process (weak) reference objects found during scavenge.
  ReferenceProcessor* rp = ref_processor();
  IsAliveClosure is_alive(this);

--- a/src/share/vm/gc_implementation/shared/gcTraceTime.cpp
+++ b/src/share/vm/gc_implementation/shared/gcTraceTime.cpp
@@ -26,6 +26,7 @@
 #include "gc_implementation/shared/gcTimer.hpp"
 #include "gc_implementation/shared/gcTrace.hpp"
 #include "gc_implementation/shared/gcTraceTime.hpp"
+#include "gc_implementation/shared/workerDataArray.inline.hpp"
 #include "runtime/globals.hpp"
 #include "runtime/os.hpp"
 #include "runtime/safepoint.hpp"
@@ -34,7 +35,6 @@
 #include "utilities/ostream.hpp"
 #include "utilities/ticks.inline.hpp"

-
 GCTraceTime::GCTraceTime(const char* title, bool doit, bool print_cr, GCTimer* timer, GCId gc_id) :
    _title(title), _doit(doit), _print_cr(print_cr), _timer(timer), _start_counter() {
  if (_doit || _timer != NULL) {
@@ -82,3 +82,205 @@ GCTraceTime::~GCTraceTime() {
    gclog_or_tty->flush();
  }
 }
+
+//========================================================================
+// GenGC Log level class
+class GenGCLog : public AllStatic {
+ public:
+  typedef enum {
+    LevelNone,
+    LevelFine,
+    LevelFinest,
+  } LogLevel;
+
+ private:
+  static LogLevel _level;
+ public:
+  inline static bool fine() {
+    return _level >= LevelFine;
+  }
+
+  inline static bool finest() {
+    return _level == LevelFinest;
+  }
+
+  static LogLevel level() {
+    return _level;
+  }
+};
+
+GenGCLog::LogLevel GenGCLog::_level = GenGCLog::LevelNone;
+
+GenGCPhaseTimes::GenGCPhaseTimes(uint max_gc_threads) :
+  _max_gc_threads(max_gc_threads)
+{
+  assert(max_gc_threads > 0, "Must have some GC threads");
+
+  _gc_par_phases[GCWorkerStart] = new WorkerDataArray<double>(max_gc_threads, "GC Worker Start (ms)", false, GenGCLog::LevelFine, 2);
+  _gc_par_phases[RootProcess] = new WorkerDataArray<double>(max_gc_threads, "Root Processing (ms)", true, GenGCLog::LevelFine, 2);
+
+  // Root scanning phases
+  _gc_par_phases[ThreadRoots] = new WorkerDataArray<double>(max_gc_threads, "Thread Roots (ms)", true, GenGCLog::LevelFinest, 3);
+  _gc_par_phases[StringTableRoots] = new WorkerDataArray<double>(max_gc_threads, "StringTable Roots (ms)", true, GenGCLog::LevelFinest, 3);
+  _gc_par_phases[UniverseRoots] = new WorkerDataArray<double>(max_gc_threads, "Universe Roots (ms)", true, GenGCLog::LevelFinest, 3);
+  _gc_par_phases[JNIRoots] = new WorkerDataArray<double>(max_gc_threads, "JNI Handles Roots (ms)", true, GenGCLog::LevelFinest, 3);
+  _gc_par_phases[ObjectSynchronizerRoots] = new WorkerDataArray<double>(max_gc_threads, "ObjectSynchronizer Roots (ms)", true, GenGCLog::LevelFinest, 3);
+  _gc_par_phases[FlatProfilerRoots] = new WorkerDataArray<double>(max_gc_threads, "FlatProfiler Roots (ms)", true, GenGCLog::LevelFinest, 3);
+  _gc_par_phases[ManagementRoots] = new WorkerDataArray<double>(max_gc_threads, "Management Roots (ms)", true, GenGCLog::LevelFinest, 3);
+  _gc_par_phases[SystemDictionaryRoots] = new WorkerDataArray<double>(max_gc_threads, "SystemDictionary Roots (ms)", true, GenGCLog::LevelFinest, 3);
+  _gc_par_phases[CLDGRoots] = new WorkerDataArray<double>(max_gc_threads, "CLDG Roots (ms)", true, GenGCLog::LevelFinest, 3);
+  _gc_par_phases[JVMTIRoots] = new WorkerDataArray<double>(max_gc_threads, "JVMTI Roots (ms)", true, GenGCLog::LevelFinest, 3);
+  _gc_par_phases[CodeCacheRoots] = new WorkerDataArray<double>(max_gc_threads, "CodeCache Roots (ms)", true, GenGCLog::LevelFinest, 3);
+
+  _gc_par_phases[OldGenScan] = new WorkerDataArray<double>(max_gc_threads, "older-gen scanning (ms)", true, GenGCLog::LevelFine, 3);
+
+  _gc_par_phases[Other] = new WorkerDataArray<double>(max_gc_threads, "GC Worker Other (ms)", true, GenGCLog::LevelFine,     2);
+  _gc_par_phases[GCWorkerTotal] = new WorkerDataArray<double>(max_gc_threads, "GC Worker Total (ms)", true, GenGCLog::LevelFine, 2);
+  _gc_par_phases[GCWorkerEnd] = new WorkerDataArray<double>(max_gc_threads, "GC Worker End (ms)", false, GenGCLog::LevelFine, 2);
+}
+
+void GenGCPhaseTimes::note_gc_start(uint active_gc_threads) {
+  assert(active_gc_threads > 0, "The number of threads must be > 0");
+  assert(active_gc_threads <= _max_gc_threads, "The number of active threads must be <= the max number of threads");
+  _active_gc_threads = active_gc_threads;
+
+  for (int i = 0; i < GCParPhasesSentinel; i++) {
+    _gc_par_phases[i]->reset();
+  }
+}
+
+void GenGCPhaseTimes::note_gc_end() {
+  for (uint i = 0; i < _active_gc_threads; i++) {
+    double worker_time = _gc_par_phases[GCWorkerEnd]->get(i) - _gc_par_phases[GCWorkerStart]->get(i);
+    record_time_secs(GCWorkerTotal, i , worker_time);
+
+    double worker_known_time = _gc_par_phases[RootProcess]->get(i);
+    record_time_secs(Other, i, worker_time - worker_known_time);
+  }
+
+  for (int i = 0; i < GCParPhasesSentinel; i++) {
+    _gc_par_phases[i]->verify(_active_gc_threads);
+  }
+}
+
+void GenGCPhaseTimes::print_stats(int level, const char* str, double value) {
+  LineBuffer(level).append_and_print_cr("[%s: %.1lf ms]", str, value);
+}
+
+void GenGCPhaseTimes::print_stats(int level, const char* str, double value, uint workers) {
+  LineBuffer(level).append_and_print_cr("[%s: %.1lf ms, GC Workers: %u]", str, value, workers);
+}
+
+// record the time a phase took in seconds
+void GenGCPhaseTimes::record_time_secs(GCParPhases phase, uint worker_i, double secs) {
+  _gc_par_phases[phase]->set(worker_i, secs);
+}
+
+// add a number of seconds to a phase
+void GenGCPhaseTimes::add_time_secs(GCParPhases phase, uint worker_i, double secs) {
+  _gc_par_phases[phase]->add(worker_i, secs);
+}
+
+// return the average time for a phase in milliseconds
+double GenGCPhaseTimes::average_time_ms(GCParPhases phase) {
+  return _gc_par_phases[phase]->average(_active_gc_threads) * 1000.0;
+}
+
+double GenGCPhaseTimes::get_time_ms(GCParPhases phase, uint worker_i) {
+  return _gc_par_phases[phase]->get(worker_i) * 1000.0;
+}
+
+double GenGCPhaseTimes::sum_time_ms(GCParPhases phase) {
+  return _gc_par_phases[phase]->sum(_active_gc_threads) * 1000.0;
+}
+
+double GenGCPhaseTimes::min_time_ms(GCParPhases phase) {
+  return _gc_par_phases[phase]->minimum(_active_gc_threads) * 1000.0;
+}
+
+double GenGCPhaseTimes::max_time_ms(GCParPhases phase) {
+  return _gc_par_phases[phase]->maximum(_active_gc_threads) * 1000.0;
+}
+
+double GenGCPhaseTimes::average_thread_work_items(GCParPhases phase) {
+  assert(_gc_par_phases[phase]->thread_work_items() != NULL, "No sub count");
+  return _gc_par_phases[phase]->thread_work_items()->average(_active_gc_threads);
+}
+
+class GenGCParPhasePrinter : public StackObj {
+  GenGCPhaseTimes* _phase_times;
+ public:
+  GenGCParPhasePrinter(GenGCPhaseTimes* phase_times) : _phase_times(phase_times) {}
+
+  void print(GenGCPhaseTimes::GCParPhases phase_id) {
+    WorkerDataArray<double>* phase = _phase_times->_gc_par_phases[phase_id];
+
+    if (phase->_length == 1) {
+      print_single_length(phase_id, phase);
+    } else {
+      print_multi_length(phase_id, phase);
+    }
+  }
+
+ private:
+
+  void print_single_length(GenGCPhaseTimes::GCParPhases phase_id, WorkerDataArray<double>* phase) {
+    // No need for min, max, average and sum for only one worker
+    LineBuffer buf(phase->_indent_level);
+    buf.append_and_print_cr("[%s:  %.1lf]", phase->_title, _phase_times->get_time_ms(phase_id, 0));
+  }
+
+  void print_time_values(LineBuffer& buf, GenGCPhaseTimes::GCParPhases phase_id, WorkerDataArray<double>* phase) {
+    uint active_length = _phase_times->_active_gc_threads;
+    for (uint i = 0; i < active_length; ++i) {
+      buf.append("  %.1lf", _phase_times->get_time_ms(phase_id, i));
+    }
+    buf.print_cr();
+  }
+
+  void print_multi_length(GenGCPhaseTimes::GCParPhases phase_id, WorkerDataArray<double>* phase) {
+    LineBuffer buf(phase->_indent_level);
+    buf.append("[%s:", phase->_title);
+
+    print_time_values(buf, phase_id, phase);
+
+    buf.append(" Min: %.1lf, Avg: %.1lf, Max: %.1lf, Diff: %.1lf",
+        _phase_times->min_time_ms(phase_id), _phase_times->average_time_ms(phase_id), _phase_times->max_time_ms(phase_id),
+        _phase_times->max_time_ms(phase_id) - _phase_times->min_time_ms(phase_id));
+
+    if (phase->_print_sum) {
+      // for things like the start and end times the sum is not
+      // that relevant
+      buf.append(", Sum: %.1lf", _phase_times->sum_time_ms(phase_id));
+    }
+
+    buf.append_and_print_cr("]");
+  }
+};
+
+void GenGCPhaseTimes::print() {
+  GenGCParPhasePrinter par_phase_printer(this);
+
+  for (int i = 0; i <= GCMainParPhasesLast; i++) {
+    par_phase_printer.print((GCParPhases) i);
+  }
+}
+
+void GenGCPhaseTimes::log_gc_details() {
+  this->print();
+  gclog_or_tty->flush();
+}
+
+GenGCParPhaseTimesTracker::GenGCParPhaseTimesTracker(GenGCPhaseTimes* phase_times, GenGCPhaseTimes::GCParPhases phase, uint worker_id) :
+    _phase_times(phase_times), _phase(phase), _worker_id(worker_id) {
+  if (_phase_times != NULL) {
+    _start_time = os::elapsedTime();
+  }
+}
+
+GenGCParPhaseTimesTracker::~GenGCParPhaseTimesTracker() {
+  if (_phase_times != NULL) {
+    _phase_times->record_time_secs(_phase, _worker_id, os::elapsedTime() - _start_time);
+  }
+}
+
--- a/src/share/vm/gc_implementation/shared/gcTraceTime.hpp
+++ b/src/share/vm/gc_implementation/shared/gcTraceTime.hpp
 /*
- * Copyright (c) 2012, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -28,8 +28,11 @@
 #include "gc_implementation/shared/gcTrace.hpp"
 #include "prims/jni_md.h"
 #include "utilities/ticks.hpp"
+#include "gc_implementation/shared/workerDataArray.hpp"

 class GCTimer;
+class LineBuffer;
+template <class T> class WorkerDataArray;

 class GCTraceTime {
  const char* _title;
@@ -43,4 +46,83 @@ class GCTraceTime {
  ~GCTraceTime();
 };

+class GenGCPhaseTimes : public CHeapObj<mtGC> {
+  friend class GenGCParPhasePrinter;
+
+  uint _active_gc_threads;
+  uint _max_gc_threads;
+
+ public:
+  enum GCParPhases {
+    GCWorkerStart,
+    RootProcess,
+    ThreadRoots,
+    StringTableRoots,
+    UniverseRoots,
+    JNIRoots,
+    ObjectSynchronizerRoots,
+    FlatProfilerRoots,
+    ManagementRoots,
+    SystemDictionaryRoots,
+    CLDGRoots,
+    CodeCacheRoots,
+    JVMTIRoots,
+    OldGenScan,
+    Other,
+    GCWorkerTotal,
+    GCWorkerEnd,
+    GCParPhasesSentinel
+  };
+private:
+  // Markers for grouping the phases in the GCPhases enum above
+  static const int GCMainParPhasesLast = GCWorkerEnd;
+
+  WorkerDataArray<double>* _gc_par_phases[GCParPhasesSentinel];
+
+  double _cur_collection_par_time_ms;
+
+  // Helper methods for detailed logging
+  void print_stats(int level, const char* str, double value);
+  void print_stats(int level, const char* str, double value, uint workers);
+
+ public:
+  GenGCPhaseTimes(uint max_gc_threads);
+  void note_gc_start(uint active_gc_threads);
+  void note_gc_end();
+  void print();
+  void log_gc_details();
+
+  // record the time a phase took in seconds
+  void record_time_secs(GCParPhases phase, uint worker_i, double secs);
+
+  // add a number of seconds to a phase
+  void add_time_secs(GCParPhases phase, uint worker_i, double secs);
+
+  // return the average time for a phase in milliseconds
+  double average_time_ms(GCParPhases phase);
+
+ private:
+  double get_time_ms(GCParPhases phase, uint worker_i);
+  double sum_time_ms(GCParPhases phase);
+  double min_time_ms(GCParPhases phase);
+  double max_time_ms(GCParPhases phase);
+  double average_thread_work_items(GCParPhases phase);
+
+ public:
+
+  void record_par_time(double ms) {
+    _cur_collection_par_time_ms = ms;
+  }
+};
+
+class GenGCParPhaseTimesTracker : public StackObj {
+  double _start_time;
+  GenGCPhaseTimes::GCParPhases _phase;
+  GenGCPhaseTimes* _phase_times;
+  uint _worker_id;
+public:
+  GenGCParPhaseTimesTracker(GenGCPhaseTimes* phase_times, GenGCPhaseTimes::GCParPhases phase, uint worker_id);
+  ~GenGCParPhaseTimesTracker();
+};
+
 #endif // SHARE_VM_GC_IMPLEMENTATION_SHARED_GCTRACETIME_HPP
--- a/src/share/vm/gc_implementation/shared/workerDataArray.cpp
+++ b/src/share/vm/gc_implementation/shared/workerDataArray.cpp
+/*
+ * Copyright (c) 2016, 2019, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "gc_implementation/shared/workerDataArray.inline.hpp"
+#include "utilities/ostream.hpp"
+
+#ifndef PRODUCT
+
+template <>
+size_t WorkerDataArray<size_t>::uninitialized() {
+  return (size_t)-1;
+}
+
+template <>
+double WorkerDataArray<double>::uninitialized() {
+  return -1.0;
+}
+
+#endif
--- a/src/share/vm/gc_implementation/shared/workerDataArray.hpp
+++ b/src/share/vm/gc_implementation/shared/workerDataArray.hpp
+/*
+ * Copyright (c) 2016, 2019, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef SHARE_VM_GC_IMPLEMENTATION_SHARED_WORKERDATAARRAY_HPP
+#define SHARE_VM_GC_IMPLEMENTATION_SHARED_WORKERDATAARRAY_HPP
+
+#include "memory/allocation.hpp"
+#include "utilities/debug.hpp"
+
+// These classes (WorkerDataArray and LineBuffer) are moved here from the original gc_implementation/g1/g1GCPhaseTimes.cpp
+// And can cache the data for the parallel gc workers' performance data in G1GC and ParNew + CMS GC
+
+template <class T>
+class WorkerDataArray  : public CHeapObj<mtGC> {
+  friend class G1GCParPhasePrinter;
+  friend class GenGCParPhasePrinter;
+  T*          _data;
+  uint        _length;
+  const char* _title;
+  bool        _print_sum;
+  int         _log_level;
+  uint        _indent_level;
+  bool        _enabled;
+
+  WorkerDataArray<size_t>* _thread_work_items;
+
+  NOT_PRODUCT(T uninitialized();)
+
+  // We are caching the sum and average to only have to calculate them once.
+  // This is not done in an MT-safe way. It is intended to allow single
+  // threaded code to call sum() and average() multiple times in any order
+  // without having to worry about the cost.
+  bool   _has_new_data;
+  T      _sum;
+  T      _min;
+  T      _max;
+  double _average;
+
+ public:
+  WorkerDataArray(uint length, const char* title, bool print_sum, int log_level, uint indent_level);
+
+  ~WorkerDataArray();
+
+  void link_thread_work_items(WorkerDataArray<size_t>* thread_work_items);
+
+  WorkerDataArray<size_t>* thread_work_items() { return _thread_work_items; }
+
+  void set(uint worker_i, T value);
+
+  void set_thread_work_item(uint worker_i, size_t value);
+
+  T get(uint worker_i);
+
+  void add(uint worker_i, T value);
+
+  double average(uint active_threads);
+
+  T sum(uint active_threads);
+
+  T minimum(uint active_threads);
+
+  T maximum(uint active_threads);
+
+  void reset() PRODUCT_RETURN;
+  void verify(uint active_threads) PRODUCT_RETURN;
+
+  void set_enabled(bool enabled) { _enabled = enabled; }
+
+  int log_level() { return _log_level;  }
+
+ private:
+
+  void calculate_totals(uint active_threads);
+};
+
+//=========================================================================
+// Helper class for avoiding interleaved logging
+class LineBuffer: public StackObj {
+
+private:
+  static const int BUFFER_LEN = 1024;
+  static const int INDENT_CHARS = 3;
+  char _buffer[BUFFER_LEN];
+  int _indent_level;
+  int _cur;
+
+  void vappend(const char* format, va_list ap)  ATTRIBUTE_PRINTF(2, 0) {
+    int res = vsnprintf(&_buffer[_cur], BUFFER_LEN - _cur, format, ap);
+    if (res != -1) {
+      _cur += res;
+    } else {
+      DEBUG_ONLY(warning("buffer too small in LineBuffer");)
+      _buffer[BUFFER_LEN -1] = 0;
+      _cur = BUFFER_LEN; // vsnprintf above should not add to _buffer if we are called again
+    }
+  }
+
+public:
+  explicit LineBuffer(int indent_level): _indent_level(indent_level), _cur(0) {
+    for (; (_cur < BUFFER_LEN && _cur < (_indent_level * INDENT_CHARS)); _cur++) {
+      _buffer[_cur] = ' ';
+    }
+  }
+ 
+#ifndef PRODUCT
+  ~LineBuffer() {
+    assert(_cur == _indent_level * INDENT_CHARS, "pending data in buffer - append_and_print_cr() not called?");
+  }
+#endif
+
+  void append(const char* format, ...)  ATTRIBUTE_PRINTF(2, 3) {
+    va_list ap;
+    va_start(ap, format);
+    vappend(format, ap);
+    va_end(ap);
+  }
+
+  void print_cr() {
+    gclog_or_tty->print_cr("%s", _buffer);
+    _cur = _indent_level * INDENT_CHARS;
+  }
+
+  void append_and_print_cr(const char* format, ...)  ATTRIBUTE_PRINTF(2, 3) {
+    va_list ap;
+    va_start(ap, format);
+    vappend(format, ap);
+    va_end(ap);
+    print_cr();
+  }
+};
+
+#endif // SHARE_VM_GC_IMPLEMENTATION_SHARED_WORKERDATAARRAY_HPP
--- a/src/share/vm/gc_implementation/shared/workerDataArray.inline.hpp
+++ b/src/share/vm/gc_implementation/shared/workerDataArray.inline.hpp
+/*
+ * Copyright (c) 2016, 2019, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef SHARE_VM_GC_IMPLEMENTATION_SHARED_WORKERDATAARRAY_INLINE_HPP
+#define SHARE_VM_GC_IMPLEMENTATION_SHARED_WORKERDATAARRAY_INLINE_HPP
+
+#include "gc_implementation/shared/workerDataArray.hpp"
+#include "memory/allocation.hpp"
+#include "utilities/debug.hpp"
+#include "runtime/os.hpp"
+#include "utilities/ostream.hpp"
+
+template <class T>
+WorkerDataArray<T>::WorkerDataArray(uint length, const char* title, bool print_sum, int log_level, uint indent_level):
+  _title(title), _length(0), _print_sum(print_sum), _log_level(log_level), _indent_level(indent_level),
+  _has_new_data(true), _thread_work_items(NULL), _enabled(true) {
+  assert(length > 0, "Must have some workers to store data for");
+  _length = length;
+  _data = NEW_C_HEAP_ARRAY(T, _length, mtGC);
+}
+
+template <class T>
+WorkerDataArray<T>::~WorkerDataArray() {
+  FREE_C_HEAP_ARRAY(T, _data, mtGC);
+}
+
+template <class T>
+void WorkerDataArray<T>::link_thread_work_items(WorkerDataArray<size_t>* thread_work_items) {
+  _thread_work_items = thread_work_items;
+}
+
+template <class T>
+void WorkerDataArray<T>::set(uint worker_i, T value) {
+  assert(worker_i < _length, err_msg("Worker %d is greater than max: %d", worker_i, _length));
+  assert(_data[worker_i] == WorkerDataArray<T>::uninitialized(), err_msg("Overwriting data for worker %d in %s", worker_i, _title));
+  _data[worker_i] = value;
+  _has_new_data = true;
+}
+
+template <class T>
+void WorkerDataArray<T>::set_thread_work_item(uint worker_i, size_t value) {
+  assert(_thread_work_items != NULL, "No sub count");
+  _thread_work_items->set(worker_i, value);
+}
+
+template <class T>
+T WorkerDataArray<T>::get(uint worker_i) {
+  assert(worker_i < _length, err_msg("Worker %d is greater than max: %d", worker_i, _length));
+  assert(_data[worker_i] != WorkerDataArray<T>::uninitialized(), err_msg("No data added for worker %d", worker_i));
+  return _data[worker_i];
+}
+
+template <class T>
+void WorkerDataArray<T>::add(uint worker_i, T value) {
+  assert(worker_i < _length, err_msg("Worker %d is greater than max: %d", worker_i, _length));
+  assert(_data[worker_i] != WorkerDataArray<T>::uninitialized(), err_msg("No data to add to for worker %d", worker_i));
+  _data[worker_i] += value;
+  _has_new_data = true;
+}
+
+template <class T>
+double WorkerDataArray<T>::average(uint active_threads){
+  calculate_totals(active_threads);
+  return _average;
+}
+
+template <class T>
+T WorkerDataArray<T>::sum(uint active_threads) {
+  calculate_totals(active_threads);
+  return _sum;
+}
+
+template <class T>
+T WorkerDataArray<T>::minimum(uint active_threads) {
+  calculate_totals(active_threads);
+  return _min;
+}
+
+template <class T>
+T WorkerDataArray<T>::maximum(uint active_threads) {
+  calculate_totals(active_threads);
+  return _max;
+}
+
+template <class T>
+void WorkerDataArray<T>::calculate_totals(uint active_threads){
+  if (!_has_new_data) {
+    return;
+  }
+
+  _sum = (T)0;
+  _min = _data[0];
+  _max = _min;
+  assert(active_threads <= _length, "Wrong number of active threads");
+  for (uint i = 0; i < active_threads; ++i) {
+    T val = _data[i];
+    _sum += val;
+    _min = MIN2(_min, val);
+    _max = MAX2(_max, val);
+  }
+  _average = (double)_sum / (double)active_threads;
+  _has_new_data = false;
+}
+
+#ifndef PRODUCT
+
+template <class T>
+void WorkerDataArray<T>::reset() {
+  for (uint i = 0; i < _length; i++) {
+    _data[i] = WorkerDataArray<T>::uninitialized();
+  }
+  if (_thread_work_items != NULL) {
+    _thread_work_items->reset();
+  }
+}
+
+template <class T>
+void WorkerDataArray<T>::verify(uint active_threads) {
+  if (!_enabled) {
+    return;
+  }
+
+  assert(active_threads <= _length, "Wrong number of active threads");
+  for (uint i = 0; i < active_threads; i++) {
+    assert(_data[i] != WorkerDataArray<T>::uninitialized(),
+        err_msg("Invalid data for worker %u in '%s'", i, _title));
+  }
+  if (_thread_work_items != NULL) {
+    _thread_work_items->verify(active_threads);
+  }
+}
+#endif // PRODUCT
+#endif // SHARE_VM_GC_IMPLEMENTATION_SHARED_WORKERDATAARRAY_INLINE_HPP
--- a/src/share/vm/memory/collectorPolicy.cpp
+++ b/src/share/vm/memory/collectorPolicy.cpp
 /*
- * Copyright (c) 2001, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -200,7 +200,10 @@ GenCollectorPolicy::GenCollectorPolicy() :
    _max_gen0_size(0),
    _gen_alignment(0),
    _generations(NULL)
-{}
+{
+  int parallel_gc_threads = CollectedHeap::use_parallel_gc_threads() ? ParallelGCThreads : 1;
+  _phase_times  = new GenGCPhaseTimes(parallel_gc_threads);
+}

 size_t GenCollectorPolicy::scale_by_NewRatio_aligned(size_t base_size) {
  return align_size_down_bounded(base_size / (NewRatio + 1), _gen_alignment);

--- a/src/share/vm/memory/collectorPolicy.hpp
+++ b/src/share/vm/memory/collectorPolicy.hpp
 /*
- * Copyright (c) 2001, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -30,6 +30,7 @@
 #include "memory/generationSpec.hpp"
 #include "memory/genRemSet.hpp"
 #include "utilities/macros.hpp"
+#include "gc_implementation/shared/gcTraceTime.hpp"

 // This class (or more correctly, subtypes of this class)
 // are used to define global garbage collector attributes.
@@ -232,6 +233,8 @@ friend class TestGenCollectorPolicy;

  GenerationSpec **_generations;

+  GenGCPhaseTimes* _phase_times;
+
  // Return true if an allocation should be attempted in the older
  // generation if it fails in the younger generation.  Return
  // false, otherwise.
@@ -260,6 +263,8 @@ friend class TestGenCollectorPolicy;
 public:
  GenCollectorPolicy();

+  GenGCPhaseTimes* phase_times()                  const { return _phase_times;                  }
+
  // Accessors
  size_t min_gen0_size()     { return _min_gen0_size; }
  size_t initial_gen0_size() { return _initial_gen0_size; }

--- a/src/share/vm/memory/defNewGeneration.cpp
+++ b/src/share/vm/memory/defNewGeneration.cpp
 /*
- * Copyright (c) 2001, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -625,6 +625,8 @@ void DefNewGeneration::collect(bool   full,
  assert(gch->no_allocs_since_save_marks(0),
         "save marks have not been newly set.");

+  GenGCPhaseTimes* phase_times = gch->gen_policy()->phase_times();
+  phase_times->note_gc_start(1);
  gch->gen_process_roots(_level,
                         true,  // Process younger gens, if any,
                                // as strong roots.
@@ -633,7 +635,9 @@ void DefNewGeneration::collect(bool   full,
                         GenCollectedHeap::StrongAndWeakRoots,
                         &fsc_with_no_gc_barrier,
                         &fsc_with_gc_barrier,
-                         &cld_scan_closure);
+                         &cld_scan_closure,
+                         phase_times,
+                         0);

  // "evacuate followers".
  evacuate_followers.do_void();

--- a/src/share/vm/memory/genCollectedHeap.cpp
+++ b/src/share/vm/memory/genCollectedHeap.cpp
 /*
- * Copyright (c) 2000, 2016, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -619,7 +619,9 @@ void GenCollectedHeap::process_roots(bool activate_scope,
                                     OopClosure* weak_roots,
                                     CLDClosure* strong_cld_closure,
                                     CLDClosure* weak_cld_closure,
-                                     CodeBlobToOopClosure* code_roots) {
+                                     CodeBlobToOopClosure* code_roots,
+                                     GenGCPhaseTimes* phase_times,
+                                     uint worker_id) {
  StrongRootsScope srs(this, activate_scope);

  // General roots.
@@ -630,45 +632,86 @@ void GenCollectedHeap::process_roots(bool activate_scope,
  // could be trying to change the termination condition while the task
  // is executing in another GC worker.

+  //CLDGRoots
+  {
+    GenGCParPhaseTimesTracker x(phase_times, GenGCPhaseTimes::CLDGRoots, worker_id);
    if (!_process_strong_tasks->is_task_claimed(GCH_PS_ClassLoaderDataGraph_oops_do)) {
      ClassLoaderDataGraph::roots_cld_do(strong_cld_closure, weak_cld_closure);
    }
-
+  }
  // Some CLDs contained in the thread frames should be considered strong.
  // Don't process them if they will be processed during the ClassLoaderDataGraph phase.
  CLDClosure* roots_from_clds_p = (strong_cld_closure != weak_cld_closure) ? strong_cld_closure : NULL;
  // Only process code roots from thread stacks if we aren't visiting the entire CodeCache anyway
  CodeBlobToOopClosure* roots_from_code_p = (so & SO_AllCodeCache) ? NULL : code_roots;
  
+  // ThreadRoots
+  {
+    GenGCParPhaseTimesTracker x(phase_times, GenGCPhaseTimes::ThreadRoots, worker_id);
    Threads::possibly_parallel_oops_do(strong_roots, roots_from_clds_p, roots_from_code_p);
+  }

+  //UniverseRoots
+  {
+    GenGCParPhaseTimesTracker x(phase_times, GenGCPhaseTimes::UniverseRoots, worker_id);
    if (!_process_strong_tasks->is_task_claimed(GCH_PS_Universe_oops_do)) {
      Universe::oops_do(strong_roots);
    }
+  }
+
  // Global (strong) JNI handles
+  {
+    GenGCParPhaseTimesTracker x(phase_times, GenGCPhaseTimes::JNIRoots, worker_id);
    if (!_process_strong_tasks->is_task_claimed(GCH_PS_JNIHandles_oops_do)) {
      JNIHandles::oops_do(strong_roots);
    }
+  }

+  // ObjectSynchronizerRoots
+  {
+    GenGCParPhaseTimesTracker x(phase_times, GenGCPhaseTimes::ObjectSynchronizerRoots, worker_id);
    if (!_process_strong_tasks->is_task_claimed(GCH_PS_ObjectSynchronizer_oops_do)) {
      ObjectSynchronizer::oops_do(strong_roots);
    }
+  }
+
+  // FlatProfilerRoots
+  {
+    GenGCParPhaseTimesTracker x(phase_times, GenGCPhaseTimes::FlatProfilerRoots, worker_id);
    if (!_process_strong_tasks->is_task_claimed(GCH_PS_FlatProfiler_oops_do)) {
      FlatProfiler::oops_do(strong_roots);
    }
+  }
+
+  // ManagementRoots
+  {
+    GenGCParPhaseTimesTracker x(phase_times, GenGCPhaseTimes::ManagementRoots, worker_id);
    if (!_process_strong_tasks->is_task_claimed(GCH_PS_Management_oops_do)) {
      Management::oops_do(strong_roots);
    }
+  }
+
+  // JVMTIRoots
+  {
+    GenGCParPhaseTimesTracker x(phase_times, GenGCPhaseTimes::JVMTIRoots, worker_id);
    if (!_process_strong_tasks->is_task_claimed(GCH_PS_jvmti_oops_do)) {
      JvmtiExport::oops_do(strong_roots);
    }
+  }

+  // SystemDictionaryRoots
+  {
+    GenGCParPhaseTimesTracker x(phase_times, GenGCPhaseTimes::SystemDictionaryRoots, worker_id);
    if (!_process_strong_tasks->is_task_claimed(GCH_PS_SystemDictionary_oops_do)) {
      SystemDictionary::roots_oops_do(strong_roots, weak_roots);
    }
+  }

  // All threads execute the following. A specific chunk of buckets
  // from the StringTable are the individual tasks.
+  // StringTableRoots
+  {
+    GenGCParPhaseTimesTracker x(phase_times, GenGCPhaseTimes::StringTableRoots, worker_id);
    if (weak_roots != NULL) {
      if (CollectedHeap::use_parallel_gc_threads()) {
        StringTable::possibly_parallel_oops_do(weak_roots);
@@ -676,7 +719,11 @@ void GenCollectedHeap::process_roots(bool activate_scope,
        StringTable::oops_do(weak_roots);
      }
    }
+  }

+  // CodeCacheRoots
+  {
+    GenGCParPhaseTimesTracker x(phase_times, GenGCPhaseTimes::CodeCacheRoots, worker_id);
    if (!_process_strong_tasks->is_task_claimed(GCH_PS_CodeCache_oops_do)) {
      if (so & SO_ScavengeCodeCache) {
        assert(code_roots != NULL, "must supply closure for code cache");
@@ -696,6 +743,7 @@ void GenCollectedHeap::process_roots(bool activate_scope,
      DEBUG_ONLY(CodeBlobToOopClosure assert_code_is_non_scavengable(&assert_is_non_scavengable_closure, !CodeBlobToOopClosure::FixRelocations));
      DEBUG_ONLY(CodeCache::asserted_non_scavengable_nmethods_do(&assert_code_is_non_scavengable));
    }
+  }

 }

@@ -706,9 +754,12 @@ void GenCollectedHeap::gen_process_roots(int level,
                                         bool only_strong_roots,
                                         OopsInGenClosure* not_older_gens,
                                         OopsInGenClosure* older_gens,
-                                         CLDClosure* cld_closure) {
+                                         CLDClosure* cld_closure,
+                                         GenGCPhaseTimes* phase_times,
+                                         uint worker_id) {
  const bool is_adjust_phase = !only_strong_roots && !younger_gens_as_roots;

+  GenGCParPhaseTimesTracker x(phase_times, GenGCPhaseTimes::RootProcess, worker_id);
  bool is_moving_collection = false;
  if (level == 0 || is_adjust_phase) {
    // young collections are always moving
@@ -722,7 +773,7 @@ void GenCollectedHeap::gen_process_roots(int level,
  process_roots(activate_scope, so,
                not_older_gens, weak_roots,
                cld_closure, weak_cld_closure,
-                &mark_code_closure);
+                &mark_code_closure, phase_times, worker_id);

  if (younger_gens_as_roots) {
    if (!_process_strong_tasks->is_task_claimed(GCH_PS_younger_gens)) {
@@ -733,6 +784,9 @@ void GenCollectedHeap::gen_process_roots(int level,
      not_older_gens->reset_generation();
    }
  }
+
+  {
+    GenGCParPhaseTimesTracker x(phase_times, GenGCPhaseTimes::OldGenScan, worker_id);
    // When collection is parallel, all threads get to cooperate to do
    // older-gen scanning.
    for (int i = level+1; i < _n_gens; i++) { 
@@ -740,7 +794,7 @@ void GenCollectedHeap::gen_process_roots(int level,
      rem_set()->younger_refs_iterate(_gens[i], older_gens);
      older_gens->reset_generation();
    }
-
+  }
  _process_strong_tasks->all_tasks_completed();
 }


--- a/src/share/vm/memory/genCollectedHeap.hpp
+++ b/src/share/vm/memory/genCollectedHeap.hpp
 /*
- * Copyright (c) 2000, 2016, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -29,6 +29,7 @@
 #include "memory/collectorPolicy.hpp"
 #include "memory/generation.hpp"
 #include "memory/sharedHeap.hpp"
+#include "gc_implementation/shared/gcTraceTime.hpp"

 class SubTasksDone;

@@ -416,7 +417,9 @@ public:
                     OopClosure* weak_roots,
                     CLDClosure* strong_cld_closure,
                     CLDClosure* weak_cld_closure,
-                     CodeBlobToOopClosure* code_roots);
+                     CodeBlobToOopClosure* code_roots,
+                     GenGCPhaseTimes* phase_times,
+                     uint worker_id);

  void gen_process_roots(int level,
                         bool younger_gens_as_roots,
@@ -427,7 +430,9 @@ public:
                         OopsInGenClosure* older_gens,
                         CLDClosure* cld_closure,
                         CLDClosure* weak_cld_closure,
-                         CodeBlobClosure* code_closure);
+                         CodeBlobClosure* code_closure,
+                         GenGCPhaseTimes* phase_times,
+                         uint worker_id);

 public:
  static const bool StrongAndWeakRoots = false;
@@ -440,7 +445,9 @@ public:
                         bool only_strong_roots,
                         OopsInGenClosure* not_older_gens,
                         OopsInGenClosure* older_gens,
-                         CLDClosure* cld_closure);
+                         CLDClosure* cld_closure,
+                         GenGCPhaseTimes* phase_times,
+                         uint worker_id);

  // Apply "root_closure" to all the weak roots of the system.
  // These include JNI weak roots, string table,

--- a/src/share/vm/memory/genMarkSweep.cpp
+++ b/src/share/vm/memory/genMarkSweep.cpp
@@ -197,7 +197,7 @@ void GenMarkSweep::mark_sweep_phase1(int level,
  trace(" 1");

  GenCollectedHeap* gch = GenCollectedHeap::heap();
-
+  GenGCPhaseTimes* phase_times = gch->gen_policy()->phase_times();
  // Because follow_root_closure is created statically, cannot
  // use OopsInGenClosure constructor which takes a generation,
  // as the Universe has not been created when the static constructors
@@ -206,7 +206,8 @@ void GenMarkSweep::mark_sweep_phase1(int level,

  // Need new claim bits before marking starts.
  ClassLoaderDataGraph::clear_claimed_marks();
-
+  phase_times->note_gc_start(1);
+  phase_times->record_time_secs(GenGCPhaseTimes::GCWorkerStart, 0, os::elapsedTime());
  gch->gen_process_roots(level,
                         false, // Younger gens are not roots.
                         true,  // activate StrongRootsScope
@@ -214,8 +215,10 @@ void GenMarkSweep::mark_sweep_phase1(int level,
                         ClassUnloading,
                         &follow_root_closure,
                         &follow_root_closure,
-                         &follow_cld_closure);
-
+                         &follow_cld_closure,
+                         phase_times,
+                         0);
+  phase_times->record_time_secs(GenGCPhaseTimes::GCWorkerEnd, 0, os::elapsedTime());
  // Process reference objects found during marking
  {
    ref_processor()->setup_policy(clear_all_softrefs);
@@ -225,6 +228,11 @@ void GenMarkSweep::mark_sweep_phase1(int level,
    gc_tracer()->report_gc_reference_stats(stats);
  }

+  phase_times->note_gc_end();
+  if (PrintGCRootsTraceTime && PrintGCDetails) {
+    phase_times->log_gc_details();
+  }
+
  // This is the point where the entire marking should have completed.
  assert(_marking_stack.is_empty(), "Marking should have completed");

@@ -246,7 +254,6 @@ void GenMarkSweep::mark_sweep_phase1(int level,
  gc_tracer()->report_object_count_after_gc(&is_alive);
 }

-
 void GenMarkSweep::mark_sweep_phase2() {
  // Now all live objects are marked, compute the new object addresses.

@@ -279,6 +286,8 @@ public:
 void GenMarkSweep::mark_sweep_phase3(int level) {
  GenCollectedHeap* gch = GenCollectedHeap::heap();

+  GenGCPhaseTimes* phase_times = gch->gen_policy()->phase_times();
+  
  // Adjust the pointers to reflect the new locations
  GCTraceTime tm("phase 3", PrintGC && Verbose, true, _gc_timer, _gc_tracer->gc_id());
  trace("3");
@@ -292,6 +301,9 @@ void GenMarkSweep::mark_sweep_phase3(int level) {
  // are run.
  adjust_pointer_closure.set_orig_generation(gch->get_gen(level));

+  phase_times->note_gc_start(1);
+  phase_times->record_time_secs(GenGCPhaseTimes::GCWorkerStart, 0, os::elapsedTime());
+
  gch->gen_process_roots(level,
                         false, // Younger gens are not roots.
                         true,  // activate StrongRootsScope
@@ -299,7 +311,15 @@ void GenMarkSweep::mark_sweep_phase3(int level) {
                         GenCollectedHeap::StrongAndWeakRoots,
                         &adjust_pointer_closure,
                         &adjust_pointer_closure,
-                         &adjust_cld_closure);
+                         &adjust_cld_closure,
+                         phase_times,
+                         0);
+
+  phase_times->record_time_secs(GenGCPhaseTimes::GCWorkerEnd, 0, os::elapsedTime());
+  phase_times->note_gc_end();
+  if (PrintGCRootsTraceTime && PrintGCDetails) {
+    phase_times->log_gc_details();
+  }

  gch->gen_process_weak_roots(&adjust_pointer_closure);


--- a/src/share/vm/runtime/globals_ext.hpp
+++ b/src/share/vm/runtime/globals_ext.hpp
@@ -30,6 +30,9 @@
                                                                                \
  manageable(bool, PrintYoungGenHistoAfterParNewGC, false,                      \
            "print the young generation class histogram after parNew GC")       \
+                                                                                \
+  manageable(bool, PrintGCRootsTraceTime, false,                                \
+             "Print GC Trace Time")                                             \

  //add new AJVM specific flags here


--- a/test/gc/TestPrintGCRootsTraceTime.java
+++ b/test/gc/TestPrintGCRootsTraceTime.java
+/*
+ * Copyright (c) 2019 Alibaba Group Holding Limited. All Rights Reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Alibaba designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+/*
+ * @test TestPrintGCDetailTraceTime
+ * @summary test print gc detail time
+ * @library /testlibrary
+ * @build TestPrintGCRootsTraceTime
+ * @run main/othervm TestPrintGCRootsTraceTime
+ */
+
+import java.util.ArrayList;
+import com.oracle.java.testlibrary.*;
+
+public class TestPrintGCRootsTraceTime {
+    public static void main(String[] args) throws Exception {
+        ProcessBuilder pb = null;
+        OutputAnalyzer output = null;
+        // test with the flag on
+        pb = ProcessTools.createJavaProcessBuilder( "-Xmx500m",
+                "-Xmx200m",
+                "-XX:+UseParNewGC",
+                "-XX:+UseConcMarkSweepGC",
+                "-XX:+PrintGCDetails",
+                "-verbose:gc",
+                "-XX:+PrintGCRootsTraceTime",
+                Foo.class.getName()
+                );
+        output = new OutputAnalyzer(pb.start());
+        output.shouldContain("Root Processing");
+        String [] strOut = output.getStdout().split("\n");
+        for (int i = 0; i < 11; i++) {
+            System.out.println(strOut[i]);
+        }
+        output.shouldHaveExitValue(0);
+
+        // test with the flag off
+        pb = ProcessTools.createJavaProcessBuilder( "-Xmx500m",
+                "-Xmn200m",
+                "-XX:+UseParNewGC",
+                "-XX:+UseConcMarkSweepGC",
+                "-XX:+PrintGCDetails",
+                "-verbose:gc",
+                "-XX:-PrintGCRootsTraceTime",
+                Foo.class.getName()
+                );
+        output = new OutputAnalyzer(pb.start());
+        output.shouldNotContain("Root Processing");
+        output.shouldHaveExitValue(0);
+
+        // test serial gc
+        pb = ProcessTools.createJavaProcessBuilder( "-Xmx500m",
+                "-Xmn200m",
+                "-XX:+UseSerialGC",
+                "-XX:+PrintGCDetails",
+                "-XX:+PrintGCRootsTraceTime",
+                "-verbose:gc",
+                Foo.class.getName());
+        output = new OutputAnalyzer(pb.start());
+        output.shouldNotContain("Root Processing");
+        output.shouldHaveExitValue(0);
+
+        // test g1 gc
+        pb = ProcessTools.createJavaProcessBuilder( "-Xmx500m",
+                "-Xmn200m",
+                "-XX:+UseG1GC",
+                "-XX:+PrintGCDetails",
+                "-XX:+PrintGCRootsTraceTime",
+                "-verbose:gc",
+                Foo.class.getName());
+        output = new OutputAnalyzer(pb.start());
+        output.shouldNotContain("Root Processing");
+        output.shouldHaveExitValue(0);
+
+        // test ps gc
+        pb = ProcessTools.createJavaProcessBuilder( "-Xmx500m",
+                "-Xmn200m",
+                "-XX:+UseParallelGC",
+                "-XX:+PrintGCDetails",
+                "-XX:+PrintGCRootsTraceTime",
+                "-verbose:gc",
+                Foo.class.getName());
+        output = new OutputAnalyzer(pb.start());
+        output.shouldNotContain("Root Processing");
+        output.shouldHaveExitValue(0);
+
+        // test ps young gc
+        pb = ProcessTools.createJavaProcessBuilder( "-Xmx500m",
+                "-Xmn200m",
+                "-XX:+UseParallelGC",
+                "-XX:-UseParallelOldGC",
+                "-XX:+PrintGCDetails",
+                "-XX:+PrintGCRootsTraceTime",
+                "-verbose:gc",
+                Foo.class.getName());
+        output = new OutputAnalyzer(pb.start());
+        output.shouldNotContain("Root Processing"); 
+        output.shouldHaveExitValue(0);
+
+        // test cms without ParNew
+         pb = ProcessTools.createJavaProcessBuilder( "-Xmx500m",
+                "-Xmn200m",
+                "-XX:+UseConcMarkSweepGC",
+                "-XX:-UseParNewGC",
+                "-XX:+PrintGCDetails",
+                "-XX:+PrintGCRootsTraceTime",
+                "-verbose:gc",
+                Foo.class.getName());
+        output = new OutputAnalyzer(pb.start());
+        output.shouldNotContain("Root Processing"); 
+        output.shouldHaveExitValue(0);
+
+        // test cms
+         pb = ProcessTools.createJavaProcessBuilder( "-Xmx500m",
+                "-Xmn200m",
+                "-XX:+UseConcMarkSweepGC",
+                "-XX:+PrintGCDetails",
+                "-XX:+PrintGCRootsTraceTime",
+                "-verbose:gc",
+                Foo.class.getName());
+        output = new OutputAnalyzer(pb.start());
+        output.shouldContain("Root Processing");
+        output.shouldHaveExitValue(0);
+
+        // test cms with parallel gc threads = 1
+        pb = ProcessTools.createJavaProcessBuilder( "-Xmx500m",
+                "-Xmn200m",
+                "-XX:+UseConcMarkSweepGC",
+                "-XX:+PrintGCDetails",
+                "-XX:ParallelGCThreads=1",
+                "-XX:+PrintGCRootsTraceTime",
+                "-verbose:gc",
+                Foo.class.getName());
+        output = new OutputAnalyzer(pb.start());
+        output.shouldContain("Root Processing");
+        output.shouldHaveExitValue(0);
+
+        // test system gc
+         pb = ProcessTools.createJavaProcessBuilder( "-Xmx500m",
+                "-Xmn200m",
+                "-XX:+PrintGCDetails",
+                "-XX:+PrintGCRootsTraceTime",
+                "-verbose:gc",
+                TestSystemGC.class.getName());
+        output = new OutputAnalyzer(pb.start());
+        output.shouldHaveExitValue(0);
+    }
+
+    // class for testing gc
+    public static class Foo {
+        public static void main(String []args) {
+            int count = 0;
+            ArrayList list = new ArrayList();
+            for (int i = 0; ; i++) {
+                byte[] data = new byte[1024];
+                list.add(data);
+                if (i % 1024 == 0) {
+                    count++;
+                    list.clear();
+                    if (count > 1024) {
+                        break;
+                    }
+                }
+            }
+        }
+    }
+    // class for testing system gc
+    public static class TestSystemGC {
+        public static void main(String []args) throws Exception {
+            System.gc();
+        }
+    }
+}