aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorjohnc <none@none>2009-08-04 16:00:17 -0700
committerjohnc <none@none>2009-08-04 16:00:17 -0700
commitdab69f3892592cc049d5aac776374e3d81d383d8 (patch)
treeacc1deff15db58b38eaabf403ba0f43a806a6ad0
parent4608cef8f3df46f58bf6726104f1a0091ea9f5e6 (diff)
6819077: G1: first GC thread coming late into the GC.
Summary: The first worker thread is delayed when entering the GC because it clears the card count table that is used in identifying hot cards. Replace the card count table with a dynamically sized evicting hash table that includes an epoch based counter. Reviewed-by: iveresov, tonyp
-rw-r--r--src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp384
-rw-r--r--src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp117
-rw-r--r--src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp2
-rw-r--r--src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp16
-rw-r--r--src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp22
-rw-r--r--src/share/vm/gc_implementation/g1/g1RemSet.cpp153
-rw-r--r--src/share/vm/gc_implementation/g1/g1RemSet.hpp4
-rw-r--r--src/share/vm/gc_implementation/g1/g1_globals.hpp10
-rw-r--r--src/share/vm/gc_implementation/includeDB_gc_g13
9 files changed, 483 insertions, 228 deletions
diff --git a/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp b/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp
index 4b6aafbd2..8ec60012c 100644
--- a/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp
+++ b/src/share/vm/gc_implementation/g1/concurrentG1Refine.cpp
@@ -25,11 +25,21 @@
#include "incls/_precompiled.incl"
#include "incls/_concurrentG1Refine.cpp.incl"
+// Possible sizes for the card counts cache: odd primes that roughly double in size.
+// (See jvmtiTagMap.cpp).
+int ConcurrentG1Refine::_cc_cache_sizes[] = {
+ 16381, 32771, 76831, 150001, 307261,
+ 614563, 1228891, 2457733, 4915219, 9830479,
+ 19660831, 39321619, 78643219, 157286461, -1
+ };
+
ConcurrentG1Refine::ConcurrentG1Refine() :
- _card_counts(NULL), _cur_card_count_histo(NULL), _cum_card_count_histo(NULL),
+ _card_counts(NULL), _card_epochs(NULL),
+ _n_card_counts(0), _max_n_card_counts(0),
+ _cache_size_index(0), _expand_card_counts(false),
_hot_cache(NULL),
_def_use_cache(false), _use_cache(false),
- _n_periods(0), _total_cards(0), _total_travs(0),
+ _n_periods(0),
_threads(NULL), _n_threads(0)
{
if (G1ConcRefine) {
@@ -57,26 +67,39 @@ size_t ConcurrentG1Refine::thread_num() {
}
void ConcurrentG1Refine::init() {
- G1CollectedHeap* g1h = G1CollectedHeap::heap();
- if (G1ConcRSLogCacheSize > 0 || G1ConcRSCountTraversals) {
- _n_card_counts =
- (unsigned) (g1h->g1_reserved_obj_bytes() >> CardTableModRefBS::card_shift);
- _card_counts = NEW_C_HEAP_ARRAY(unsigned char, _n_card_counts);
- for (size_t i = 0; i < _n_card_counts; i++) _card_counts[i] = 0;
- ModRefBarrierSet* bs = g1h->mr_bs();
- guarantee(bs->is_a(BarrierSet::CardTableModRef), "Precondition");
- CardTableModRefBS* ctbs = (CardTableModRefBS*)bs;
- _ct_bot = ctbs->byte_for_const(g1h->reserved_region().start());
- if (G1ConcRSCountTraversals) {
- _cur_card_count_histo = NEW_C_HEAP_ARRAY(unsigned, 256);
- _cum_card_count_histo = NEW_C_HEAP_ARRAY(unsigned, 256);
- for (int i = 0; i < 256; i++) {
- _cur_card_count_histo[i] = 0;
- _cum_card_count_histo[i] = 0;
- }
- }
- }
if (G1ConcRSLogCacheSize > 0) {
+ _g1h = G1CollectedHeap::heap();
+ _max_n_card_counts =
+ (unsigned) (_g1h->g1_reserved_obj_bytes() >> CardTableModRefBS::card_shift);
+
+ size_t max_card_num = ((size_t)1 << (sizeof(unsigned)*BitsPerByte-1)) - 1;
+ guarantee(_max_n_card_counts < max_card_num, "card_num representation");
+
+ int desired = _max_n_card_counts / InitialCacheFraction;
+ for (_cache_size_index = 0;
+ _cc_cache_sizes[_cache_size_index] >= 0; _cache_size_index++) {
+ if (_cc_cache_sizes[_cache_size_index] >= desired) break;
+ }
+ _cache_size_index = MAX2(0, (_cache_size_index - 1));
+
+ int initial_size = _cc_cache_sizes[_cache_size_index];
+ if (initial_size < 0) initial_size = _max_n_card_counts;
+
+ // Make sure we don't go bigger than we will ever need
+ _n_card_counts = MIN2((unsigned) initial_size, _max_n_card_counts);
+
+ _card_counts = NEW_C_HEAP_ARRAY(CardCountCacheEntry, _n_card_counts);
+ _card_epochs = NEW_C_HEAP_ARRAY(CardEpochCacheEntry, _n_card_counts);
+
+ Copy::fill_to_bytes(&_card_counts[0],
+ _n_card_counts * sizeof(CardCountCacheEntry));
+ Copy::fill_to_bytes(&_card_epochs[0], _n_card_counts * sizeof(CardEpochCacheEntry));
+
+ ModRefBarrierSet* bs = _g1h->mr_bs();
+ guarantee(bs->is_a(BarrierSet::CardTableModRef), "Precondition");
+ _ct_bs = (CardTableModRefBS*)bs;
+ _ct_bot = _ct_bs->byte_for_const(_g1h->reserved_region().start());
+
_def_use_cache = true;
_use_cache = true;
_hot_cache_size = (1 << G1ConcRSLogCacheSize);
@@ -86,7 +109,7 @@ void ConcurrentG1Refine::init() {
// For refining the cards in the hot cache in parallel
int n_workers = (ParallelGCThreads > 0 ?
- g1h->workers()->total_workers() : 1);
+ _g1h->workers()->total_workers() : 1);
_hot_cache_par_chunk_size = MAX2(1, _hot_cache_size / n_workers);
_hot_cache_par_claimed_idx = 0;
}
@@ -101,15 +124,11 @@ void ConcurrentG1Refine::stop() {
}
ConcurrentG1Refine::~ConcurrentG1Refine() {
- if (G1ConcRSLogCacheSize > 0 || G1ConcRSCountTraversals) {
- assert(_card_counts != NULL, "Logic");
- FREE_C_HEAP_ARRAY(unsigned char, _card_counts);
- assert(_cur_card_count_histo != NULL, "Logic");
- FREE_C_HEAP_ARRAY(unsigned, _cur_card_count_histo);
- assert(_cum_card_count_histo != NULL, "Logic");
- FREE_C_HEAP_ARRAY(unsigned, _cum_card_count_histo);
- }
if (G1ConcRSLogCacheSize > 0) {
+ assert(_card_counts != NULL, "Logic");
+ FREE_C_HEAP_ARRAY(CardCountCacheEntry, _card_counts);
+ assert(_card_epochs != NULL, "Logic");
+ FREE_C_HEAP_ARRAY(CardEpochCacheEntry, _card_epochs);
assert(_hot_cache != NULL, "Logic");
FREE_C_HEAP_ARRAY(jbyte*, _hot_cache);
}
@@ -129,42 +148,165 @@ void ConcurrentG1Refine::threads_do(ThreadClosure *tc) {
}
}
+bool ConcurrentG1Refine::is_young_card(jbyte* card_ptr) {
+ HeapWord* start = _ct_bs->addr_for(card_ptr);
+ HeapRegion* r = _g1h->heap_region_containing(start);
+ if (r != NULL && r->is_young()) {
+ return true;
+ }
+ // Either this card is not associated with a heap region, or its
+ // region is not young; in both cases the card is not young.
+ return false;
+}
+
+jbyte* ConcurrentG1Refine::add_card_count(jbyte* card_ptr, int* count, bool* defer) {
+ unsigned new_card_num = ptr_2_card_num(card_ptr);
+ unsigned bucket = hash(new_card_num);
+ assert(0 <= bucket && bucket < _n_card_counts, "Bounds");
+
+ CardCountCacheEntry* count_ptr = &_card_counts[bucket];
+ CardEpochCacheEntry* epoch_ptr = &_card_epochs[bucket];
+
+ // We have to construct a new entry if we haven't updated the counts
+ // during the current period, or if the count was updated for a
+ // different card number.
+ unsigned int new_epoch = (unsigned int) _n_periods;
+ julong new_epoch_entry = make_epoch_entry(new_card_num, new_epoch);
-int ConcurrentG1Refine::add_card_count(jbyte* card_ptr) {
- size_t card_num = (card_ptr - _ct_bot);
- guarantee(0 <= card_num && card_num < _n_card_counts, "Bounds");
- unsigned char cnt = _card_counts[card_num];
- if (cnt < 255) _card_counts[card_num]++;
- return cnt;
- _total_travs++;
+ while (true) {
+ // Fetch the previous epoch value
+ julong prev_epoch_entry = epoch_ptr->_value;
+ julong cas_res;
+
+ if (extract_epoch(prev_epoch_entry) != new_epoch) {
+ // This entry has not yet been updated during this period.
+ // Note: we update the epoch value atomically to ensure
+ // that there is only one winner that updates the cached
+ // card_ptr value even though all the refine threads share
+ // the same epoch value.
+
+ cas_res = (julong) Atomic::cmpxchg((jlong) new_epoch_entry,
+ (volatile jlong*)&epoch_ptr->_value,
+ (jlong) prev_epoch_entry);
+
+ if (cas_res == prev_epoch_entry) {
+ // We have successfully won the race to update the
+ // epoch and card_num value. Make it look like the
+ // count and eviction count were previously cleared.
+ count_ptr->_count = 1;
+ count_ptr->_evict_count = 0;
+ *count = 0;
+ // We can defer the processing of card_ptr
+ *defer = true;
+ return card_ptr;
+ }
+ // We did not win the race to update the epoch field, so some other
+ // thread must have done it. The value that gets returned by CAS
+ // should be the new epoch value.
+ assert(extract_epoch(cas_res) == new_epoch, "unexpected epoch");
+ // We could 'continue' here or just re-read the previous epoch value
+ prev_epoch_entry = epoch_ptr->_value;
+ }
+
+ // The epoch entry for card_ptr has been updated during this period.
+ unsigned old_card_num = extract_card_num(prev_epoch_entry);
+
+ // The card count that will be returned to caller
+ *count = count_ptr->_count;
+
+ // Are we updating the count for the same card?
+ if (new_card_num == old_card_num) {
+ // Same card - just update the count. We could have more than one
+ // thread racing to update count for the current card. It should be
+ // OK not to use a CAS as the only penalty should be some missed
+ // increments of the count which delays identifying the card as "hot".
+
+ if (*count < max_jubyte) count_ptr->_count++;
+ // We can defer the processing of card_ptr
+ *defer = true;
+ return card_ptr;
+ }
+
+ // Different card - evict old card info
+ if (count_ptr->_evict_count < max_jubyte) count_ptr->_evict_count++;
+ if (count_ptr->_evict_count > G1CardCountCacheExpandThreshold) {
+ // Trigger a resize the next time we clear
+ _expand_card_counts = true;
+ }
+
+ cas_res = (julong) Atomic::cmpxchg((jlong) new_epoch_entry,
+ (volatile jlong*)&epoch_ptr->_value,
+ (jlong) prev_epoch_entry);
+
+ if (cas_res == prev_epoch_entry) {
+ // We successfully updated the card num value in the epoch entry
+ count_ptr->_count = 0; // initialize counter for new card num
+
+ // Even though the region containing the card at old_card_num was not
+ // in the young list when old_card_num was recorded in the epoch
+ // cache it could have been added to the free list and subsequently
+ // added to the young list in the intervening time. If the evicted
+ // card is in a young region just return the card_ptr and the evicted
+ // card will not be cleaned. See CR 6817995.
+
+ jbyte* old_card_ptr = card_num_2_ptr(old_card_num);
+ if (is_young_card(old_card_ptr)) {
+ *count = 0;
+ // We can defer the processing of card_ptr
+ *defer = true;
+ return card_ptr;
+ }
+
+ // We do not want to defer processing of card_ptr in this case
+ // (we need to refine old_card_ptr and card_ptr)
+ *defer = false;
+ return old_card_ptr;
+ }
+ // Someone else beat us - try again.
+ }
}
-jbyte* ConcurrentG1Refine::cache_insert(jbyte* card_ptr) {
- int count = add_card_count(card_ptr);
- // Count previously unvisited cards.
- if (count == 0) _total_cards++;
- // We'll assume a traversal unless we store it in the cache.
+jbyte* ConcurrentG1Refine::cache_insert(jbyte* card_ptr, bool* defer) {
+ int count;
+ jbyte* cached_ptr = add_card_count(card_ptr, &count, defer);
+ assert(cached_ptr != NULL, "bad cached card ptr");
+ assert(!is_young_card(cached_ptr), "shouldn't get a card in young region");
+
+ // The card pointer we obtained from card count cache is not hot
+ // so do not store it in the cache; return it for immediate
+ // refining.
if (count < G1ConcRSHotCardLimit) {
- _total_travs++;
- return card_ptr;
+ return cached_ptr;
}
- // Otherwise, it's hot.
+
+ // Otherwise, the pointer we got from the _card_counts is hot.
jbyte* res = NULL;
MutexLockerEx x(HotCardCache_lock, Mutex::_no_safepoint_check_flag);
if (_n_hot == _hot_cache_size) {
- _total_travs++;
res = _hot_cache[_hot_cache_idx];
_n_hot--;
}
// Now _n_hot < _hot_cache_size, and we can insert at _hot_cache_idx.
- _hot_cache[_hot_cache_idx] = card_ptr;
+ _hot_cache[_hot_cache_idx] = cached_ptr;
_hot_cache_idx++;
if (_hot_cache_idx == _hot_cache_size) _hot_cache_idx = 0;
_n_hot++;
+
+ if (res != NULL) {
+ // Even though the region containing res was not in the young list
+ // when it was recorded in the hot cache it could have been added
+ // to the free list and subsequently added to the young list in
+ // the intervening time. If res is in a young region, return NULL
+ // so that res is not cleaned. See CR 6817995.
+
+ if (is_young_card(res)) {
+ res = NULL;
+ }
+ }
+
return res;
}
-
void ConcurrentG1Refine::clean_up_cache(int worker_i, G1RemSet* g1rs) {
assert(!use_cache(), "cache should be disabled");
int start_idx;
@@ -186,114 +328,52 @@ void ConcurrentG1Refine::clean_up_cache(int worker_i, G1RemSet* g1rs) {
}
}
-void ConcurrentG1Refine::clear_and_record_card_counts() {
- if (G1ConcRSLogCacheSize == 0 && !G1ConcRSCountTraversals) return;
- _n_periods++;
- if (G1ConcRSCountTraversals) {
- for (size_t i = 0; i < _n_card_counts; i++) {
- unsigned char bucket = _card_counts[i];
- _cur_card_count_histo[bucket]++;
- _card_counts[i] = 0;
- }
- gclog_or_tty->print_cr("Card counts:");
- for (int i = 0; i < 256; i++) {
- if (_cur_card_count_histo[i] > 0) {
- gclog_or_tty->print_cr(" %3d: %9d", i, _cur_card_count_histo[i]);
- _cum_card_count_histo[i] += _cur_card_count_histo[i];
- _cur_card_count_histo[i] = 0;
- }
- }
- } else {
- assert(G1ConcRSLogCacheSize > 0, "Logic");
- Copy::fill_to_words((HeapWord*)(&_card_counts[0]),
- _n_card_counts / HeapWordSize);
- }
-}
+void ConcurrentG1Refine::expand_card_count_cache() {
+ if (_n_card_counts < _max_n_card_counts) {
+ int new_idx = _cache_size_index+1;
+ int new_size = _cc_cache_sizes[new_idx];
+ if (new_size < 0) new_size = _max_n_card_counts;
-void
-ConcurrentG1Refine::
-print_card_count_histo_range(unsigned* histo, int from, int to,
- float& cum_card_pct,
- float& cum_travs_pct) {
- unsigned cards = 0;
- unsigned travs = 0;
- guarantee(to <= 256, "Precondition");
- for (int i = from; i < to-1; i++) {
- cards += histo[i];
- travs += histo[i] * i;
- }
- if (to == 256) {
- unsigned histo_card_sum = 0;
- unsigned histo_trav_sum = 0;
- for (int i = 1; i < 255; i++) {
- histo_trav_sum += histo[i] * i;
- }
- cards += histo[255];
- // correct traversals for the last one.
- unsigned travs_255 = (unsigned) (_total_travs - histo_trav_sum);
- travs += travs_255;
-
- } else {
- cards += histo[to-1];
- travs += histo[to-1] * (to-1);
- }
- float fperiods = (float)_n_periods;
- float f_tot_cards = (float)_total_cards/fperiods;
- float f_tot_travs = (float)_total_travs/fperiods;
- if (cards > 0) {
- float fcards = (float)cards/fperiods;
- float ftravs = (float)travs/fperiods;
- if (to == 256) {
- gclog_or_tty->print(" %4d- %10.2f%10.2f", from, fcards, ftravs);
- } else {
- gclog_or_tty->print(" %4d-%4d %10.2f%10.2f", from, to-1, fcards, ftravs);
+ // Make sure we don't go bigger than we will ever need
+ new_size = MIN2((unsigned) new_size, _max_n_card_counts);
+
+ // Expand the card count and card epoch tables
+ if (new_size > (int)_n_card_counts) {
+ // We can just free and allocate a new array as we're
+ // not interested in preserving the contents
+ assert(_card_counts != NULL, "Logic!");
+ assert(_card_epochs != NULL, "Logic!");
+ FREE_C_HEAP_ARRAY(CardCountCacheEntry, _card_counts);
+ FREE_C_HEAP_ARRAY(CardEpochCacheEntry, _card_epochs);
+ _n_card_counts = new_size;
+ _card_counts = NEW_C_HEAP_ARRAY(CardCountCacheEntry, _n_card_counts);
+ _card_epochs = NEW_C_HEAP_ARRAY(CardEpochCacheEntry, _n_card_counts);
+ _cache_size_index = new_idx;
}
- float pct_cards = fcards*100.0/f_tot_cards;
- cum_card_pct += pct_cards;
- float pct_travs = ftravs*100.0/f_tot_travs;
- cum_travs_pct += pct_travs;
- gclog_or_tty->print_cr("%10.2f%10.2f%10.2f%10.2f",
- pct_cards, cum_card_pct,
- pct_travs, cum_travs_pct);
}
}
-void ConcurrentG1Refine::print_final_card_counts() {
- if (!G1ConcRSCountTraversals) return;
-
- gclog_or_tty->print_cr("Did %d total traversals of %d distinct cards.",
- _total_travs, _total_cards);
- float fperiods = (float)_n_periods;
- gclog_or_tty->print_cr(" This is an average of %8.2f traversals, %8.2f cards, "
- "per collection.", (float)_total_travs/fperiods,
- (float)_total_cards/fperiods);
- gclog_or_tty->print_cr(" This is an average of %8.2f traversals/distinct "
- "dirty card.\n",
- _total_cards > 0 ?
- (float)_total_travs/(float)_total_cards : 0.0);
-
-
- gclog_or_tty->print_cr("Histogram:\n\n%10s %10s%10s%10s%10s%10s%10s",
- "range", "# cards", "# travs", "% cards", "(cum)",
- "% travs", "(cum)");
- gclog_or_tty->print_cr("------------------------------------------------------------"
- "-------------");
- float cum_cards_pct = 0.0;
- float cum_travs_pct = 0.0;
- for (int i = 1; i < 10; i++) {
- print_card_count_histo_range(_cum_card_count_histo, i, i+1,
- cum_cards_pct, cum_travs_pct);
- }
- for (int i = 10; i < 100; i += 10) {
- print_card_count_histo_range(_cum_card_count_histo, i, i+10,
- cum_cards_pct, cum_travs_pct);
+void ConcurrentG1Refine::clear_and_record_card_counts() {
+ if (G1ConcRSLogCacheSize == 0) return;
+
+#ifndef PRODUCT
+ double start = os::elapsedTime();
+#endif
+
+ if (_expand_card_counts) {
+ expand_card_count_cache();
+ _expand_card_counts = false;
+ // Only need to clear the epochs.
+ Copy::fill_to_bytes(&_card_epochs[0], _n_card_counts * sizeof(CardEpochCacheEntry));
}
- print_card_count_histo_range(_cum_card_count_histo, 100, 150,
- cum_cards_pct, cum_travs_pct);
- print_card_count_histo_range(_cum_card_count_histo, 150, 200,
- cum_cards_pct, cum_travs_pct);
- print_card_count_histo_range(_cum_card_count_histo, 150, 255,
- cum_cards_pct, cum_travs_pct);
- print_card_count_histo_range(_cum_card_count_histo, 255, 256,
- cum_cards_pct, cum_travs_pct);
+
+ int this_epoch = (int) _n_periods;
+ assert((this_epoch+1) <= max_jint, "to many periods");
+ // Update epoch
+ _n_periods++;
+
+#ifndef PRODUCT
+ double elapsed = os::elapsedTime() - start;
+ _g1h->g1_policy()->record_cc_clear_time(elapsed * 1000.0);
+#endif
}
diff --git a/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp b/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp
index 279996448..a6ee5484a 100644
--- a/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp
+++ b/src/share/vm/gc_implementation/g1/concurrentG1Refine.hpp
@@ -29,18 +29,77 @@ class G1RemSet;
class ConcurrentG1Refine: public CHeapObj {
ConcurrentG1RefineThread** _threads;
int _n_threads;
+
// The cache for card refinement.
- bool _use_cache;
- bool _def_use_cache;
- size_t _n_periods;
- size_t _total_cards;
- size_t _total_travs;
-
- unsigned char* _card_counts;
- unsigned _n_card_counts;
- const jbyte* _ct_bot;
- unsigned* _cur_card_count_histo;
- unsigned* _cum_card_count_histo;
+ bool _use_cache;
+ bool _def_use_cache;
+
+ size_t _n_periods; // Used as clearing epoch
+
+ // An evicting cache of the number of times each card
+ // is accessed. Reduces, but does not eliminate, the amount
+ // of duplicated processing of dirty cards.
+
+ enum SomePrivateConstants {
+ epoch_bits = 32,
+ card_num_shift = epoch_bits,
+ epoch_mask = AllBits,
+ card_num_mask = AllBits,
+
+ // The initial cache size is approximately this fraction
+ // of a maximal cache (i.e. the size needed for all cards
+ // in the heap)
+ InitialCacheFraction = 512
+ };
+
+ const static julong card_num_mask_in_place =
+ (julong) card_num_mask << card_num_shift;
+
+ typedef struct {
+ julong _value; // | card_num | epoch |
+ } CardEpochCacheEntry;
+
+ julong make_epoch_entry(unsigned int card_num, unsigned int epoch) {
+ assert(0 <= card_num && card_num < _max_n_card_counts, "Bounds");
+ assert(0 <= epoch && epoch <= _n_periods, "must be");
+
+ return ((julong) card_num << card_num_shift) | epoch;
+ }
+
+ unsigned int extract_epoch(julong v) {
+ return (v & epoch_mask);
+ }
+
+ unsigned int extract_card_num(julong v) {
+ return (v & card_num_mask_in_place) >> card_num_shift;
+ }
+
+ typedef struct {
+ unsigned char _count;
+ unsigned char _evict_count;
+ } CardCountCacheEntry;
+
+ CardCountCacheEntry* _card_counts;
+ CardEpochCacheEntry* _card_epochs;
+
+ // The current number of buckets in the card count cache
+ unsigned _n_card_counts;
+
+ // The max number of buckets required for the number of
+ // cards for the entire reserved heap
+ unsigned _max_n_card_counts;
+
+ // Possible sizes of the cache: odd primes that roughly double in size.
+ // (See jvmtiTagMap.cpp).
+ static int _cc_cache_sizes[];
+
+ // The index in _cc_cache_sizes corresponding to the size of
+ // _card_counts.
+ int _cache_size_index;
+
+ bool _expand_card_counts;
+
+ const jbyte* _ct_bot;
jbyte** _hot_cache;
int _hot_cache_size;
@@ -50,12 +109,37 @@ class ConcurrentG1Refine: public CHeapObj {
int _hot_cache_par_chunk_size;
volatile int _hot_cache_par_claimed_idx;
+ // Needed to work around 6817995
+ CardTableModRefBS* _ct_bs;
+ G1CollectedHeap* _g1h;
+
+ // Expands the array that holds the card counts to the next size up
+ void expand_card_count_cache();
+
+ // hash a given key (index of card_ptr) with the specified size
+ static unsigned int hash(size_t key, int size) {
+ return (unsigned int) key % size;
+ }
+
+ // hash a given key (index of card_ptr)
+ unsigned int hash(size_t key) {
+ return hash(key, _n_card_counts);
+ }
+
+ unsigned ptr_2_card_num(jbyte* card_ptr) {
+ return (unsigned) (card_ptr - _ct_bot);
+ }
+
+ jbyte* card_num_2_ptr(unsigned card_num) {
+ return (jbyte*) (_ct_bot + card_num);
+ }
+
// Returns the count of this card after incrementing it.
- int add_card_count(jbyte* card_ptr);
+ jbyte* add_card_count(jbyte* card_ptr, int* count, bool* defer);
+
+ // Returns true if this card is in a young region
+ bool is_young_card(jbyte* card_ptr);
- void print_card_count_histo_range(unsigned* histo, int from, int to,
- float& cum_card_pct,
- float& cum_travs_pct);
public:
ConcurrentG1Refine();
~ConcurrentG1Refine();
@@ -69,7 +153,7 @@ class ConcurrentG1Refine: public CHeapObj {
// If this is the first entry for the slot, writes into the cache and
// returns NULL. If it causes an eviction, returns the evicted pointer.
// Otherwise, its a cache hit, and returns NULL.
- jbyte* cache_insert(jbyte* card_ptr);
+ jbyte* cache_insert(jbyte* card_ptr, bool* defer);
// Process the cached entries.
void clean_up_cache(int worker_i, G1RemSet* g1rs);
@@ -93,7 +177,6 @@ class ConcurrentG1Refine: public CHeapObj {
}
void clear_and_record_card_counts();
- void print_final_card_counts();
static size_t thread_num();
};
diff --git a/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp b/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
index 86872d6e0..8b75a1913 100644
--- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
+++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
@@ -2414,8 +2414,6 @@ void G1CollectedHeap::gc_threads_do(ThreadClosure* tc) const {
}
void G1CollectedHeap::print_tracing_info() const {
- concurrent_g1_refine()->print_final_card_counts();
-
// We'll overload this to mean "trace GC pause statistics."
if (TraceGen0Time || TraceGen1Time) {
// The "G1CollectorPolicy" is keeping track of these stats, so delegate
diff --git a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp
index bb219c59c..c0ddd2d4d 100644
--- a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp
+++ b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp
@@ -94,7 +94,14 @@ G1CollectorPolicy::G1CollectorPolicy() :
_summary(new Summary()),
_abandoned_summary(new AbandonedSummary()),
+#ifndef PRODUCT
_cur_clear_ct_time_ms(0.0),
+ _min_clear_cc_time_ms(-1.0),
+ _max_clear_cc_time_ms(-1.0),
+ _cur_clear_cc_time_ms(0.0),
+ _cum_clear_cc_time_ms(0.0),
+ _num_cc_clears(0L),
+#endif
_region_num_young(0),
_region_num_tenured(0),
@@ -1648,6 +1655,15 @@ void G1CollectorPolicy::record_collection_pause_end(bool abandoned) {
print_stats(1, "Object Copying", obj_copy_time);
}
}
+#ifndef PRODUCT
+ print_stats(1, "Cur Clear CC", _cur_clear_cc_time_ms);
+ print_stats(1, "Cum Clear CC", _cum_clear_cc_time_ms);
+ print_stats(1, "Min Clear CC", _min_clear_cc_time_ms);
+ print_stats(1, "Max Clear CC", _max_clear_cc_time_ms);
+ if (_num_cc_clears > 0) {
+ print_stats(1, "Avg Clear CC", _cum_clear_cc_time_ms / ((double)_num_cc_clears));
+ }
+#endif
print_stats(1, "Other", other_time_ms);
for (int i = 0; i < _aux_num; ++i) {
if (_cur_aux_times_set[i]) {
diff --git a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp
index 8fa4ef4a8..3bb7023b2 100644
--- a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp
+++ b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp
@@ -112,7 +112,6 @@ protected:
return 8*M;
}
-
double _cur_collection_start_sec;
size_t _cur_collection_pause_used_at_start_bytes;
size_t _cur_collection_pause_used_regions_at_start;
@@ -122,6 +121,15 @@ protected:
double _cur_clear_ct_time_ms;
bool _satb_drain_time_set;
+#ifndef PRODUCT
+ // Card Table Count Cache stats
+ double _min_clear_cc_time_ms; // min
+ double _max_clear_cc_time_ms; // max
+ double _cur_clear_cc_time_ms; // clearing time during current pause
+ double _cum_clear_cc_time_ms; // cumulative clearing time
+ jlong _num_cc_clears; // number of times the card count cache has been cleared
+#endif
+
double _cur_CH_strong_roots_end_sec;
double _cur_CH_strong_roots_dur_ms;
double _cur_G1_strong_roots_end_sec;
@@ -931,6 +939,18 @@ public:
_cur_aux_times_ms[i] += ms;
}
+#ifndef PRODUCT
+ void record_cc_clear_time(double ms) {
+ if (_min_clear_cc_time_ms < 0.0 || ms <= _min_clear_cc_time_ms)
+ _min_clear_cc_time_ms = ms;
+ if (_max_clear_cc_time_ms < 0.0 || ms >= _max_clear_cc_time_ms)
+ _max_clear_cc_time_ms = ms;
+ _cur_clear_cc_time_ms = ms;
+ _cum_clear_cc_time_ms += ms;
+ _num_cc_clears++;
+ }
+#endif
+
// Record the fact that "bytes" bytes allocated in a region.
void record_before_bytes(size_t bytes);
void record_after_bytes(size_t bytes);
diff --git a/src/share/vm/gc_implementation/g1/g1RemSet.cpp b/src/share/vm/gc_implementation/g1/g1RemSet.cpp
index 9c3602617..224263e54 100644
--- a/src/share/vm/gc_implementation/g1/g1RemSet.cpp
+++ b/src/share/vm/gc_implementation/g1/g1RemSet.cpp
@@ -676,61 +676,12 @@ void HRInto_G1RemSet::scrub_par(BitMap* region_bm, BitMap* card_bm,
static IntHistogram out_of_histo(50, 50);
-void HRInto_G1RemSet::concurrentRefineOneCard(jbyte* card_ptr, int worker_i) {
- // If the card is no longer dirty, nothing to do.
- if (*card_ptr != CardTableModRefBS::dirty_card_val()) return;
-
+void HRInto_G1RemSet::concurrentRefineOneCard_impl(jbyte* card_ptr, int worker_i) {
// Construct the region representing the card.
HeapWord* start = _ct_bs->addr_for(card_ptr);
// And find the region containing it.
HeapRegion* r = _g1->heap_region_containing(start);
- if (r == NULL) {
- guarantee(_g1->is_in_permanent(start), "Or else where?");
- return; // Not in the G1 heap (might be in perm, for example.)
- }
- // Why do we have to check here whether a card is on a young region,
- // given that we dirty young regions and, as a result, the
- // post-barrier is supposed to filter them out and never to enqueue
- // them? When we allocate a new region as the "allocation region" we
- // actually dirty its cards after we release the lock, since card
- // dirtying while holding the lock was a performance bottleneck. So,
- // as a result, it is possible for other threads to actually
- // allocate objects in the region (after the acquire the lock)
- // before all the cards on the region are dirtied. This is unlikely,
- // and it doesn't happen often, but it can happen. So, the extra
- // check below filters out those cards.
- if (r->is_young()) {
- return;
- }
- // While we are processing RSet buffers during the collection, we
- // actually don't want to scan any cards on the collection set,
- // since we don't want to update remebered sets with entries that
- // point into the collection set, given that live objects from the
- // collection set are about to move and such entries will be stale
- // very soon. This change also deals with a reliability issue which
- // involves scanning a card in the collection set and coming across
- // an array that was being chunked and looking malformed. Note,
- // however, that if evacuation fails, we have to scan any objects
- // that were not moved and create any missing entries.
- if (r->in_collection_set()) {
- return;
- }
-
- // Should we defer it?
- if (_cg1r->use_cache()) {
- card_ptr = _cg1r->cache_insert(card_ptr);
- // If it was not an eviction, nothing to do.
- if (card_ptr == NULL) return;
-
- // OK, we have to reset the card start, region, etc.
- start = _ct_bs->addr_for(card_ptr);
- r = _g1->heap_region_containing(start);
- if (r == NULL) {
- guarantee(_g1->is_in_permanent(start), "Or else where?");
- return; // Not in the G1 heap (might be in perm, for example.)
- }
- guarantee(!r->is_young(), "It was evicted in the current minor cycle.");
- }
+ assert(r != NULL, "unexpected null");
HeapWord* end = _ct_bs->addr_for(card_ptr + 1);
MemRegion dirtyRegion(start, end);
@@ -774,6 +725,106 @@ void HRInto_G1RemSet::concurrentRefineOneCard(jbyte* card_ptr, int worker_i) {
}
}
+void HRInto_G1RemSet::concurrentRefineOneCard(jbyte* card_ptr, int worker_i) {
+ // If the card is no longer dirty, nothing to do.
+ if (*card_ptr != CardTableModRefBS::dirty_card_val()) return;
+
+ // Construct the region representing the card.
+ HeapWord* start = _ct_bs->addr_for(card_ptr);
+ // And find the region containing it.
+ HeapRegion* r = _g1->heap_region_containing(start);
+ if (r == NULL) {
+ guarantee(_g1->is_in_permanent(start), "Or else where?");
+ return; // Not in the G1 heap (might be in perm, for example.)
+ }
+ // Why do we have to check here whether a card is on a young region,
+ // given that we dirty young regions and, as a result, the
+ // post-barrier is supposed to filter them out and never to enqueue
+ // them? When we allocate a new region as the "allocation region" we
+ // actually dirty its cards after we release the lock, since card
+ // dirtying while holding the lock was a performance bottleneck. So,
+ // as a result, it is possible for other threads to actually
+ // allocate objects in the region (after they acquire the lock)
+ // before all the cards on the region are dirtied. This is unlikely,
+ // and it doesn't happen often, but it can happen. So, the extra
+ // check below filters out those cards.
+ if (r->is_young()) {
+ return;
+ }
+ // While we are processing RSet buffers during the collection, we
+ // actually don't want to scan any cards on the collection set,
+ // since we don't want to update remembered sets with entries that
+ // point into the collection set, given that live objects from the
+ // collection set are about to move and such entries will be stale
+ // very soon. This change also deals with a reliability issue which
+ // involves scanning a card in the collection set and coming across
+ // an array that was being chunked and looking malformed. Note,
+ // however, that if evacuation fails, we have to scan any objects
+ // that were not moved and create any missing entries.
+ if (r->in_collection_set()) {
+ return;
+ }
+
+ // Should we defer processing the card?
+ //
+ // Previously the result from the cache_insert call would be
+ // either card_ptr (implying that card_ptr was currently "cold"),
+ // null (meaning we had inserted the card ptr into the "hot"
+ // cache, which had some headroom), or a "hot" card ptr
+ // extracted from the "hot" cache.
+ //
+ // Now that the _card_counts cache in the ConcurrentG1Refine
+ // instance is an evicting hash table, the result we get back
+ // could be from evicting the card ptr in an already occupied
+ // bucket (in which case we have replaced the card ptr in the
+ // bucket with card_ptr and "defer" is set to false). To avoid
+ // having a data structure (updates to which would need a lock)
+ // to hold these unprocessed dirty cards, we need to immediately
+ // process card_ptr. The actions needed to be taken on return
+ // from cache_insert are summarized in the following table:
+ //
+ // res defer action
+ // --------------------------------------------------------------
+ // null false card evicted from _card_counts & replaced with
+ // card_ptr; evicted ptr added to hot cache.
+ // No need to process res; immediately process card_ptr
+ //
+ // null true card not evicted from _card_counts; card_ptr added
+ // to hot cache.
+ // Nothing to do.
+ //
+ // non-null false card evicted from _card_counts & replaced with
+ // card_ptr; evicted ptr is currently "cold" or
+ // caused an eviction from the hot cache.
+ // Immediately process res; process card_ptr.
+ //
+ // non-null true card not evicted from _card_counts; card_ptr is
+ // currently cold, or caused an eviction from hot
+ // cache.
+ // Immediately process res; no need to process card_ptr.
+
+ jbyte* res = card_ptr;
+ bool defer = false;
+ if (_cg1r->use_cache()) {
+ jbyte* res = _cg1r->cache_insert(card_ptr, &defer);
+ if (res != NULL && (res != card_ptr || defer)) {
+ start = _ct_bs->addr_for(res);
+ r = _g1->heap_region_containing(start);
+ if (r == NULL) {
+ assert(_g1->is_in_permanent(start), "Or else where?");
+ } else {
+ guarantee(!r->is_young(), "It was evicted in the current minor cycle.");
+ // Process card pointer we get back from the hot card cache
+ concurrentRefineOneCard_impl(res, worker_i);
+ }
+ }
+ }
+
+ if (!defer) {
+ concurrentRefineOneCard_impl(card_ptr, worker_i);
+ }
+}
+
class HRRSStatsIter: public HeapRegionClosure {
size_t _occupied;
size_t _total_mem_sz;
diff --git a/src/share/vm/gc_implementation/g1/g1RemSet.hpp b/src/share/vm/gc_implementation/g1/g1RemSet.hpp
index 60ba420d8..c193157fc 100644
--- a/src/share/vm/gc_implementation/g1/g1RemSet.hpp
+++ b/src/share/vm/gc_implementation/g1/g1RemSet.hpp
@@ -157,6 +157,10 @@ protected:
}
}
+ // The routine that performs the actual work of refining a dirty
+ // card.
+ void concurrentRefineOneCard_impl(jbyte* card_ptr, int worker_i);
+
protected:
template <class T> void write_ref_nv(HeapRegion* from, T* p);
template <class T> void par_write_ref_nv(HeapRegion* from, T* p, int tid);
diff --git a/src/share/vm/gc_implementation/g1/g1_globals.hpp b/src/share/vm/gc_implementation/g1/g1_globals.hpp
index c7748f0d9..e3d154274 100644
--- a/src/share/vm/gc_implementation/g1/g1_globals.hpp
+++ b/src/share/vm/gc_implementation/g1/g1_globals.hpp
@@ -187,10 +187,6 @@
develop(intx, G1ConcRSLogCacheSize, 10, \
"Log base 2 of the length of conc RS hot-card cache.") \
\
- develop(bool, G1ConcRSCountTraversals, false, \
- "If true, gather data about the number of times CR traverses " \
- "cards ") \
- \
develop(intx, G1ConcRSHotCardLimit, 4, \
"The threshold that defines (>=) a hot card.") \
\
@@ -264,6 +260,10 @@
\
product(uintx, G1ParallelRSetThreads, 0, \
"If non-0 is the number of parallel rem set update threads, " \
- "otherwise the value is determined ergonomically.")
+ "otherwise the value is determined ergonomically.") \
+ \
+ develop(intx, G1CardCountCacheExpandThreshold, 16, \
+ "Expand the card count cache if the number of collisions for " \
+ "a particular entry exceeds this value.")
G1_FLAGS(DECLARE_DEVELOPER_FLAG, DECLARE_PD_DEVELOPER_FLAG, DECLARE_PRODUCT_FLAG, DECLARE_PD_PRODUCT_FLAG, DECLARE_DIAGNOSTIC_FLAG, DECLARE_EXPERIMENTAL_FLAG, DECLARE_NOTPRODUCT_FLAG, DECLARE_MANAGEABLE_FLAG, DECLARE_PRODUCT_RW_FLAG)
diff --git a/src/share/vm/gc_implementation/includeDB_gc_g1 b/src/share/vm/gc_implementation/includeDB_gc_g1
index 7e2df3fc8..d9d151dc5 100644
--- a/src/share/vm/gc_implementation/includeDB_gc_g1
+++ b/src/share/vm/gc_implementation/includeDB_gc_g1
@@ -45,11 +45,14 @@ concurrentG1Refine.cpp concurrentG1Refine.hpp
concurrentG1Refine.cpp concurrentG1RefineThread.hpp
concurrentG1Refine.cpp copy.hpp
concurrentG1Refine.cpp g1CollectedHeap.inline.hpp
+concurrentG1Refine.cpp g1CollectorPolicy.hpp
concurrentG1Refine.cpp g1RemSet.hpp
concurrentG1Refine.cpp space.inline.hpp
+concurrentG1Refine.cpp heapRegionSeq.inline.hpp
concurrentG1Refine.hpp globalDefinitions.hpp
concurrentG1Refine.hpp allocation.hpp
+concurrentG1Refine.hpp cardTableModRefBS.hpp
concurrentG1Refine.hpp thread.hpp
concurrentG1RefineThread.cpp concurrentG1Refine.hpp