提交 e9562c66 编写于 作者: I iveresov

6684395: Port NUMA-aware allocator to linux

Summary: NUMA-aware allocator port to Linux
Reviewed-by: jmasa, apetrusenko
上级 7e0cbe2a
......@@ -272,7 +272,9 @@ SUNWprivate_1.1 {
jio_snprintf;
jio_vfprintf;
jio_vsnprintf;
fork1;
fork1;
numa_warn;
numa_error;
# Needed because there is no JVM interface for this.
sysThreadAvailableStackWithSlack;
......
......@@ -267,7 +267,9 @@ SUNWprivate_1.1 {
jio_snprintf;
jio_vfprintf;
jio_vsnprintf;
fork1;
fork1;
numa_warn;
numa_error;
# Needed because there is no JVM interface for this.
sysThreadAvailableStackWithSlack;
......
......@@ -2228,20 +2228,42 @@ bool os::commit_memory(char* addr, size_t size, size_t alignment_hint) {
}
void os::realign_memory(char *addr, size_t bytes, size_t alignment_hint) { }
void os::free_memory(char *addr, size_t bytes) { }
void os::free_memory(char *addr, size_t bytes) {
uncommit_memory(addr, bytes);
}
void os::numa_make_global(char *addr, size_t bytes) { }
void os::numa_make_local(char *addr, size_t bytes) { }
bool os::numa_topology_changed() { return false; }
size_t os::numa_get_groups_num() { return 1; }
int os::numa_get_group_id() { return 0; }
size_t os::numa_get_leaf_groups(int *ids, size_t size) {
if (size > 0) {
ids[0] = 0;
return 1;
void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint) {
Linux::numa_tonode_memory(addr, bytes, lgrp_hint);
}
bool os::numa_topology_changed() { return false; }
size_t os::numa_get_groups_num() {
int max_node = Linux::numa_max_node();
return max_node > 0 ? max_node + 1 : 1;
}
int os::numa_get_group_id() {
int cpu_id = Linux::sched_getcpu();
if (cpu_id != -1) {
int lgrp_id = Linux::get_node_by_cpu(cpu_id);
if (lgrp_id != -1) {
return lgrp_id;
}
}
return 0;
}
size_t os::numa_get_leaf_groups(int *ids, size_t size) {
for (size_t i = 0; i < size; i++) {
ids[i] = i;
}
return size;
}
bool os::get_page_info(char *start, page_info* info) {
return false;
}
......@@ -2250,6 +2272,74 @@ char *os::scan_pages(char *start, char* end, page_info* page_expected, page_info
return end;
}
extern "C" void numa_warn(int number, char *where, ...) { }
extern "C" void numa_error(char *where) { }
void os::Linux::libnuma_init() {
// sched_getcpu() should be in libc.
set_sched_getcpu(CAST_TO_FN_PTR(sched_getcpu_func_t,
dlsym(RTLD_DEFAULT, "sched_getcpu")));
if (sched_getcpu() != -1) { // Does it work?
void *handle = dlopen("libnuma.so", RTLD_LAZY);
if (handle != NULL) {
set_numa_node_to_cpus(CAST_TO_FN_PTR(numa_node_to_cpus_func_t,
dlsym(handle, "numa_node_to_cpus")));
set_numa_max_node(CAST_TO_FN_PTR(numa_max_node_func_t,
dlsym(handle, "numa_max_node")));
set_numa_available(CAST_TO_FN_PTR(numa_available_func_t,
dlsym(handle, "numa_available")));
set_numa_tonode_memory(CAST_TO_FN_PTR(numa_tonode_memory_func_t,
dlsym(handle, "numa_tonode_memory")));
if (numa_available() != -1) {
// Create a cpu -> node mapping
_cpu_to_node = new (ResourceObj::C_HEAP) GrowableArray<int>(0, true);
rebuild_cpu_to_node_map();
}
}
}
}
// rebuild_cpu_to_node_map() constructs a table mapping cpud id to node id.
// The table is later used in get_node_by_cpu().
void os::Linux::rebuild_cpu_to_node_map() {
int cpu_num = os::active_processor_count();
cpu_to_node()->clear();
cpu_to_node()->at_grow(cpu_num - 1);
int node_num = numa_get_groups_num();
int cpu_map_size = (cpu_num + BitsPerLong - 1) / BitsPerLong;
unsigned long *cpu_map = NEW_C_HEAP_ARRAY(unsigned long, cpu_map_size);
for (int i = 0; i < node_num; i++) {
if (numa_node_to_cpus(i, cpu_map, cpu_map_size * sizeof(unsigned long)) != -1) {
for (int j = 0; j < cpu_map_size; j++) {
if (cpu_map[j] != 0) {
for (int k = 0; k < BitsPerLong; k++) {
if (cpu_map[j] & (1UL << k)) {
cpu_to_node()->at_put(j * BitsPerLong + k, i);
}
}
}
}
}
}
FREE_C_HEAP_ARRAY(unsigned long, cpu_map);
}
int os::Linux::get_node_by_cpu(int cpu_id) {
if (cpu_to_node() != NULL && cpu_id >= 0 && cpu_id < cpu_to_node()->length()) {
return cpu_to_node()->at(cpu_id);
}
return -1;
}
GrowableArray<int>* os::Linux::_cpu_to_node;
os::Linux::sched_getcpu_func_t os::Linux::_sched_getcpu;
os::Linux::numa_node_to_cpus_func_t os::Linux::_numa_node_to_cpus;
os::Linux::numa_max_node_func_t os::Linux::_numa_max_node;
os::Linux::numa_available_func_t os::Linux::_numa_available;
os::Linux::numa_tonode_memory_func_t os::Linux::_numa_tonode_memory;
bool os::uncommit_memory(char* addr, size_t size) {
return ::mmap(addr, size,
PROT_READ|PROT_WRITE|PROT_EXEC,
......@@ -3552,6 +3642,10 @@ jint os::init_2(void)
Linux::is_floating_stack() ? "floating stack" : "fixed stack");
}
if (UseNUMA) {
Linux::libnuma_init();
}
if (MaxFDLimit) {
// set the number of file descriptors to max. print out error
// if getrlimit/setrlimit fails but continue regardless.
......
......@@ -59,6 +59,8 @@ class Linux {
static bool _is_NPTL;
static bool _supports_fast_thread_cpu_time;
static GrowableArray<int>* _cpu_to_node;
protected:
static julong _physical_memory;
......@@ -79,8 +81,9 @@ class Linux {
static void set_is_LinuxThreads() { _is_NPTL = false; }
static void set_is_floating_stack() { _is_floating_stack = true; }
static void rebuild_cpu_to_node_map();
static GrowableArray<int>* cpu_to_node() { return _cpu_to_node; }
public:
static void init_thread_fpu_state();
static int get_fpu_control_word();
static void set_fpu_control_word(int fpu_control);
......@@ -143,6 +146,7 @@ class Linux {
static bool is_floating_stack() { return _is_floating_stack; }
static void libpthread_init();
static void libnuma_init();
// Minimum stack size a thread can be created with (allowing
// the VM to completely create the thread and enter user code)
......@@ -229,6 +233,38 @@ class Linux {
#undef SR_SUSPENDED
};
private:
typedef int (*sched_getcpu_func_t)(void);
typedef int (*numa_node_to_cpus_func_t)(int node, unsigned long *buffer, int bufferlen);
typedef int (*numa_max_node_func_t)(void);
typedef int (*numa_available_func_t)(void);
typedef int (*numa_tonode_memory_func_t)(void *start, size_t size, int node);
static sched_getcpu_func_t _sched_getcpu;
static numa_node_to_cpus_func_t _numa_node_to_cpus;
static numa_max_node_func_t _numa_max_node;
static numa_available_func_t _numa_available;
static numa_tonode_memory_func_t _numa_tonode_memory;
static void set_sched_getcpu(sched_getcpu_func_t func) { _sched_getcpu = func; }
static void set_numa_node_to_cpus(numa_node_to_cpus_func_t func) { _numa_node_to_cpus = func; }
static void set_numa_max_node(numa_max_node_func_t func) { _numa_max_node = func; }
static void set_numa_available(numa_available_func_t func) { _numa_available = func; }
static void set_numa_tonode_memory(numa_tonode_memory_func_t func) { _numa_tonode_memory = func; }
public:
static int sched_getcpu() { return _sched_getcpu != NULL ? _sched_getcpu() : -1; }
static int numa_node_to_cpus(int node, unsigned long *buffer, int bufferlen) {
return _numa_node_to_cpus != NULL ? _numa_node_to_cpus(node, buffer, bufferlen) : -1;
}
static int numa_max_node() { return _numa_max_node != NULL ? _numa_max_node() : -1; }
static int numa_available() { return _numa_available != NULL ? _numa_available() : -1; }
static int numa_tonode_memory(void *start, size_t size, int node) {
return _numa_tonode_memory != NULL ? _numa_tonode_memory(start, size, node) : -1;
}
static int get_node_by_cpu(int cpu_id);
};
......
......@@ -120,3 +120,6 @@ inline int os::closedir(DIR *dirp)
RESTARTABLE(_cmd, _result); \
return _result; \
} while(false)
inline bool os::numa_has_static_binding() { return true; }
inline bool os::numa_has_group_homing() { return false; }
......@@ -2602,7 +2602,7 @@ void os::realign_memory(char *addr, size_t bytes, size_t alignment_hint) {
}
// Tell the OS to make the range local to the first-touching LWP
void os::numa_make_local(char *addr, size_t bytes) {
void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint) {
assert((intptr_t)addr % os::vm_page_size() == 0, "Address should be page-aligned.");
if (madvise(addr, bytes, MADV_ACCESS_LWP) < 0) {
debug_only(warning("MADV_ACCESS_LWP failed."));
......
......@@ -204,3 +204,6 @@ do { \
RESTARTABLE(_cmd, _result); \
return _result; \
} while(false)
inline bool os::numa_has_static_binding() { return false; }
inline bool os::numa_has_group_homing() { return true; }
......@@ -2581,7 +2581,7 @@ bool os::unguard_memory(char* addr, size_t bytes) {
void os::realign_memory(char *addr, size_t bytes, size_t alignment_hint) { }
void os::free_memory(char *addr, size_t bytes) { }
void os::numa_make_global(char *addr, size_t bytes) { }
void os::numa_make_local(char *addr, size_t bytes) { }
void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint) { }
bool os::numa_topology_changed() { return false; }
size_t os::numa_get_groups_num() { return 1; }
int os::numa_get_group_id() { return 0; }
......
......@@ -69,3 +69,6 @@ inline void os::bang_stack_shadow_pages() {
*((int *)(sp - (pages * vm_page_size()))) = 0;
}
}
inline bool os::numa_has_static_binding() { return true; }
inline bool os::numa_has_group_homing() { return false; }
......@@ -169,8 +169,9 @@ class ParallelScavengeHeap : public CollectedHeap {
size_t large_typearray_limit() { return FastAllocateSizeLimit; }
bool supports_inline_contig_alloc() const { return !UseNUMA; }
HeapWord** top_addr() const { return !UseNUMA ? young_gen()->top_addr() : NULL; }
HeapWord** end_addr() const { return !UseNUMA ? young_gen()->end_addr() : NULL; }
HeapWord** top_addr() const { return !UseNUMA ? young_gen()->top_addr() : (HeapWord**)-1; }
HeapWord** end_addr() const { return !UseNUMA ? young_gen()->end_addr() : (HeapWord**)-1; }
void ensure_parsability(bool retire_tlabs);
void accumulate_statistics_all_tlabs();
......
......@@ -46,9 +46,11 @@ void MutableNUMASpace::mangle_unused_area() {
for (int i = 0; i < lgrp_spaces()->length(); i++) {
LGRPSpace *ls = lgrp_spaces()->at(i);
MutableSpace *s = ls->space();
HeapWord *top = MAX2((HeapWord*)round_down((intptr_t)s->top(), page_size()), s->bottom());
if (top < s->end()) {
ls->add_invalid_region(MemRegion(top, s->end()));
if (!os::numa_has_static_binding()) {
HeapWord *top = MAX2((HeapWord*)round_down((intptr_t)s->top(), page_size()), s->bottom());
if (top < s->end()) {
ls->add_invalid_region(MemRegion(top, s->end()));
}
}
s->mangle_unused_area();
}
......@@ -70,32 +72,36 @@ void MutableNUMASpace::ensure_parsability() {
area_touched_words);
}
#endif
MemRegion invalid;
HeapWord *crossing_start = (HeapWord*)round_to((intptr_t)s->top(), os::vm_page_size());
HeapWord *crossing_end = (HeapWord*)round_to((intptr_t)(s->top() + area_touched_words),
os::vm_page_size());
if (crossing_start != crossing_end) {
// If object header crossed a small page boundary we mark the area
// as invalid rounding it to a page_size().
HeapWord *start = MAX2((HeapWord*)round_down((intptr_t)s->top(), page_size()), s->bottom());
HeapWord *end = MIN2((HeapWord*)round_to((intptr_t)(s->top() + area_touched_words), page_size()),
s->end());
invalid = MemRegion(start, end);
}
if (!os::numa_has_static_binding()) {
MemRegion invalid;
HeapWord *crossing_start = (HeapWord*)round_to((intptr_t)s->top(), os::vm_page_size());
HeapWord *crossing_end = (HeapWord*)round_to((intptr_t)(s->top() + area_touched_words),
os::vm_page_size());
if (crossing_start != crossing_end) {
// If object header crossed a small page boundary we mark the area
// as invalid rounding it to a page_size().
HeapWord *start = MAX2((HeapWord*)round_down((intptr_t)s->top(), page_size()), s->bottom());
HeapWord *end = MIN2((HeapWord*)round_to((intptr_t)(s->top() + area_touched_words), page_size()),
s->end());
invalid = MemRegion(start, end);
}
ls->add_invalid_region(invalid);
ls->add_invalid_region(invalid);
}
s->set_top(s->end());
}
} else {
if (!os::numa_has_static_binding()) {
#ifdef ASSERT
MemRegion invalid(s->top(), s->end());
ls->add_invalid_region(invalid);
#else
if (ZapUnusedHeapArea) {
MemRegion invalid(s->top(), s->end());
ls->add_invalid_region(invalid);
} else break;
#else
if (ZapUnusedHeapArea) {
MemRegion invalid(s->top(), s->end());
ls->add_invalid_region(invalid);
} else break;
#endif
}
}
}
}
......@@ -194,7 +200,7 @@ bool MutableNUMASpace::update_layout(bool force) {
}
// Bias region towards the first-touching lgrp. Set the right page sizes.
void MutableNUMASpace::bias_region(MemRegion mr) {
void MutableNUMASpace::bias_region(MemRegion mr, int lgrp_id) {
HeapWord *start = (HeapWord*)round_to((intptr_t)mr.start(), page_size());
HeapWord *end = (HeapWord*)round_down((intptr_t)mr.end(), page_size());
if (end > start) {
......@@ -202,9 +208,13 @@ void MutableNUMASpace::bias_region(MemRegion mr) {
assert((intptr_t)aligned_region.start() % page_size() == 0 &&
(intptr_t)aligned_region.byte_size() % page_size() == 0, "Bad alignment");
assert(region().contains(aligned_region), "Sanity");
os::free_memory((char*)aligned_region.start(), aligned_region.byte_size());
// First we tell the OS which page size we want in the given range. The underlying
// large page can be broken down if we require small pages.
os::realign_memory((char*)aligned_region.start(), aligned_region.byte_size(), page_size());
os::numa_make_local((char*)aligned_region.start(), aligned_region.byte_size());
// Then we uncommit the pages in the range.
os::free_memory((char*)aligned_region.start(), aligned_region.byte_size());
// And make them local/first-touch biased.
os::numa_make_local((char*)aligned_region.start(), aligned_region.byte_size(), lgrp_id);
}
}
......@@ -233,10 +243,12 @@ void MutableNUMASpace::update() {
initialize(region(), true);
} else {
bool should_initialize = false;
for (int i = 0; i < lgrp_spaces()->length(); i++) {
if (!lgrp_spaces()->at(i)->invalid_region().is_empty()) {
should_initialize = true;
break;
if (!os::numa_has_static_binding()) {
for (int i = 0; i < lgrp_spaces()->length(); i++) {
if (!lgrp_spaces()->at(i)->invalid_region().is_empty()) {
should_initialize = true;
break;
}
}
}
......@@ -472,8 +484,8 @@ void MutableNUMASpace::initialize(MemRegion mr, bool clear_space) {
intersection = MemRegion(new_region.start(), new_region.start());
}
select_tails(new_region, intersection, &bottom_region, &top_region);
bias_region(bottom_region);
bias_region(top_region);
bias_region(bottom_region, lgrp_spaces()->at(0)->lgrp_id());
bias_region(top_region, lgrp_spaces()->at(lgrp_spaces()->length() - 1)->lgrp_id());
}
// Check if the space layout has changed significantly?
......@@ -545,22 +557,37 @@ void MutableNUMASpace::initialize(MemRegion mr, bool clear_space) {
intersection = MemRegion(new_region.start(), new_region.start());
}
MemRegion invalid_region = ls->invalid_region().intersection(new_region);
if (!invalid_region.is_empty()) {
merge_regions(new_region, &intersection, &invalid_region);
free_region(invalid_region);
if (!os::numa_has_static_binding()) {
MemRegion invalid_region = ls->invalid_region().intersection(new_region);
// Invalid region is a range of memory that could've possibly
// been allocated on the other node. That's relevant only on Solaris where
// there is no static memory binding.
if (!invalid_region.is_empty()) {
merge_regions(new_region, &intersection, &invalid_region);
free_region(invalid_region);
ls->set_invalid_region(MemRegion());
}
}
select_tails(new_region, intersection, &bottom_region, &top_region);
free_region(bottom_region);
free_region(top_region);
if (!os::numa_has_static_binding()) {
// If that's a system with the first-touch policy then it's enough
// to free the pages.
free_region(bottom_region);
free_region(top_region);
} else {
// In a system with static binding we have to change the bias whenever
// we reshape the heap.
bias_region(bottom_region, ls->lgrp_id());
bias_region(top_region, ls->lgrp_id());
}
// If we clear the region, we would mangle it in debug. That would cause page
// allocation in a different place. Hence setting the top directly.
s->initialize(new_region, false);
s->set_top(s->bottom());
ls->set_invalid_region(MemRegion());
set_adaptation_cycles(samples_count());
}
}
......@@ -575,7 +602,7 @@ void MutableNUMASpace::set_top(HeapWord* value) {
HeapWord *top = MAX2((HeapWord*)round_down((intptr_t)s->top(), page_size()), s->bottom());
if (s->contains(value)) {
if (top < value && top < s->end()) {
if (!os::numa_has_static_binding() && top < value && top < s->end()) {
ls->add_invalid_region(MemRegion(top, value));
}
s->set_top(value);
......@@ -584,10 +611,10 @@ void MutableNUMASpace::set_top(HeapWord* value) {
if (found_top) {
s->set_top(s->bottom());
} else {
if (top < s->end()) {
ls->add_invalid_region(MemRegion(top, s->end()));
}
s->set_top(s->end());
if (!os::numa_has_static_binding() && top < s->end()) {
ls->add_invalid_region(MemRegion(top, s->end()));
}
s->set_top(s->end());
}
}
}
......@@ -601,11 +628,23 @@ void MutableNUMASpace::clear() {
}
}
/*
Linux supports static memory binding, therefore the most part of the
logic dealing with the possible invalid page allocation is effectively
disabled. Besides there is no notion of the home node in Linux. A
thread is allowed to migrate freely. Although the scheduler is rather
reluctant to move threads between the nodes. We check for the current
node every allocation. And with a high probability a thread stays on
the same node for some time allowing local access to recently allocated
objects.
*/
HeapWord* MutableNUMASpace::allocate(size_t size) {
int lgrp_id = Thread::current()->lgrp_id();
if (lgrp_id == -1) {
Thread* thr = Thread::current();
int lgrp_id = thr->lgrp_id();
if (lgrp_id == -1 || !os::numa_has_group_homing()) {
lgrp_id = os::numa_get_group_id();
Thread::current()->set_lgrp_id(lgrp_id);
thr->set_lgrp_id(lgrp_id);
}
int i = lgrp_spaces()->find(&lgrp_id, LGRPSpace::equals);
......@@ -628,22 +667,22 @@ HeapWord* MutableNUMASpace::allocate(size_t size) {
MutableSpace::set_top(s->top());
}
}
// Make the page allocation happen here.
if (p != NULL) {
// Make the page allocation happen here if there is no static binding..
if (p != NULL && !os::numa_has_static_binding()) {
for (HeapWord *i = p; i < p + size; i += os::vm_page_size() >> LogHeapWordSize) {
*(int*)i = 0;
}
}
return p;
}
// This version is lock-free.
HeapWord* MutableNUMASpace::cas_allocate(size_t size) {
int lgrp_id = Thread::current()->lgrp_id();
if (lgrp_id == -1) {
Thread* thr = Thread::current();
int lgrp_id = thr->lgrp_id();
if (lgrp_id == -1 || !os::numa_has_group_homing()) {
lgrp_id = os::numa_get_group_id();
Thread::current()->set_lgrp_id(lgrp_id);
thr->set_lgrp_id(lgrp_id);
}
int i = lgrp_spaces()->find(&lgrp_id, LGRPSpace::equals);
......@@ -670,8 +709,8 @@ HeapWord* MutableNUMASpace::cas_allocate(size_t size) {
}
}
// Make the page allocation happen here.
if (p != NULL) {
// Make the page allocation happen here if there is no static binding.
if (p != NULL && !os::numa_has_static_binding() ) {
for (HeapWord *i = p; i < p + size; i += os::vm_page_size() >> LogHeapWordSize) {
*(int*)i = 0;
}
......
......@@ -139,8 +139,8 @@ class MutableNUMASpace : public MutableSpace {
// Check if the NUMA topology has changed. Add and remove spaces if needed.
// The update can be forced by setting the force parameter equal to true.
bool update_layout(bool force);
// Bias region towards the first-touching lgrp.
void bias_region(MemRegion mr);
// Bias region towards the lgrp.
void bias_region(MemRegion mr, int lgrp_id);
// Free pages in a given region.
void free_region(MemRegion mr);
// Get current chunk size.
......
......@@ -3181,6 +3181,7 @@ os_<os_family>.cpp events.hpp
os_<os_family>.cpp extendedPC.hpp
os_<os_family>.cpp filemap.hpp
os_<os_family>.cpp globals.hpp
os_<os_family>.cpp growableArray.hpp
os_<os_family>.cpp hpi.hpp
os_<os_family>.cpp icBuffer.hpp
os_<os_family>.cpp interfaceSupport.hpp
......
......@@ -33,6 +33,7 @@ class JavaThread;
class Event;
class DLL;
class FileHandle;
template<class E> class GrowableArray;
// %%%%% Moved ThreadState, START_FN, OSThread to new osThread.hpp. -- Rose
......@@ -206,7 +207,9 @@ class os: AllStatic {
static void realign_memory(char *addr, size_t bytes, size_t alignment_hint);
// NUMA-specific interface
static void numa_make_local(char *addr, size_t bytes);
static bool numa_has_static_binding();
static bool numa_has_group_homing();
static void numa_make_local(char *addr, size_t bytes, int lgrp_hint);
static void numa_make_global(char *addr, size_t bytes);
static size_t numa_get_groups_num();
static size_t numa_get_leaf_groups(int *ids, size_t size);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册