diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index f62d1d56f41d0c5f3e3ef2c12b7dea3eb6578a10..e6ef4401a13804b6a6604bf22671e0c9dc4d39b2 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -13,7 +13,7 @@
 #define LOCAL_FREE_TARGET		(128)
 #define LOCAL_NR_SCANS			LOCAL_FREE_TARGET
 
-#define PERCPU_FREE_TARGET		(16)
+#define PERCPU_FREE_TARGET		(4)
 #define PERCPU_NR_SCANS			PERCPU_FREE_TARGET
 
 /* Helpers to get the local list index */
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index dcdce1270d38617af84087f532ba16a402d46124..0d449d8032d1ace67ad780f6d2ae86ce7fc6010e 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -21,6 +21,7 @@
 #include <sys/mman.h>
 #include <poll.h>
 #include <ctype.h>
+#include <assert.h>
 #include "libbpf.h"
 #include "bpf_load.h"
 #include "perf-sys.h"
@@ -37,15 +38,6 @@ int event_fd[MAX_PROGS];
 int prog_cnt;
 int prog_array_fd = -1;
 
-struct bpf_map_def {
-	unsigned int type;
-	unsigned int key_size;
-	unsigned int value_size;
-	unsigned int max_entries;
-	unsigned int map_flags;
-	unsigned int inner_map_idx;
-};
-
 static int populate_prog_array(const char *event, int prog_fd)
 {
 	int ind = atoi(event), err;
@@ -193,11 +185,14 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 	return 0;
 }
 
-static int load_maps(struct bpf_map_def *maps, int len)
+static int load_maps(struct bpf_map_def *maps, int len,
+		     const char **map_names, fixup_map_cb fixup_map)
 {
 	int i;
 
 	for (i = 0; i < len / sizeof(struct bpf_map_def); i++) {
+		if (fixup_map)
+			fixup_map(&maps[i], map_names[i], i);
 
 		if (maps[i].type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
 		    maps[i].type == BPF_MAP_TYPE_HASH_OF_MAPS) {
@@ -280,14 +275,64 @@ static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols,
 	return 0;
 }
 
-int load_bpf_file(char *path)
+static int cmp_symbols(const void *l, const void *r)
+{
+	const GElf_Sym *lsym = (const GElf_Sym *)l;
+	const GElf_Sym *rsym = (const GElf_Sym *)r;
+
+	if (lsym->st_value < rsym->st_value)
+		return -1;
+	else if (lsym->st_value > rsym->st_value)
+		return 1;
+	else
+		return 0;
+}
+
+static int get_sorted_map_names(Elf *elf, Elf_Data *symbols, int maps_shndx,
+				int strtabidx, char **map_names)
+{
+	GElf_Sym map_symbols[MAX_MAPS];
+	int i, nr_maps = 0;
+
+	for (i = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) {
+		assert(nr_maps < MAX_MAPS);
+		if (!gelf_getsym(symbols, i, &map_symbols[nr_maps]))
+			continue;
+		if (map_symbols[nr_maps].st_shndx != maps_shndx)
+			continue;
+		nr_maps++;
+	}
+
+	qsort(map_symbols, nr_maps, sizeof(GElf_Sym), cmp_symbols);
+
+	for (i = 0; i < nr_maps; i++) {
+		char *map_name;
+
+		map_name = elf_strptr(elf, strtabidx, map_symbols[i].st_name);
+		if (!map_name) {
+			printf("cannot get map symbol\n");
+			return 1;
+		}
+
+		map_names[i] = strdup(map_name);
+		if (!map_names[i]) {
+			printf("strdup(%s): %s(%d)\n", map_name,
+			       strerror(errno), errno);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
 {
-	int fd, i;
+	int fd, i, ret, maps_shndx = -1, strtabidx = -1;
 	Elf *elf;
 	GElf_Ehdr ehdr;
 	GElf_Shdr shdr, shdr_prog;
-	Elf_Data *data, *data_prog, *symbols = NULL;
-	char *shname, *shname_prog;
+	Elf_Data *data, *data_prog, *data_maps = NULL, *symbols = NULL;
+	char *shname, *shname_prog, *map_names[MAX_MAPS] = { NULL };
 
 	/* reset global variables */
 	kern_version = 0;
@@ -335,14 +380,33 @@ int load_bpf_file(char *path)
 			}
 			memcpy(&kern_version, data->d_buf, sizeof(int));
 		} else if (strcmp(shname, "maps") == 0) {
-			processed_sec[i] = true;
-			if (load_maps(data->d_buf, data->d_size))
-				return 1;
+			maps_shndx = i;
+			data_maps = data;
 		} else if (shdr.sh_type == SHT_SYMTAB) {
+			strtabidx = shdr.sh_link;
 			symbols = data;
 		}
 	}
 
+	ret = 1;
+
+	if (!symbols) {
+		printf("missing SHT_SYMTAB section\n");
+		goto done;
+	}
+
+	if (data_maps) {
+		if (get_sorted_map_names(elf, symbols, maps_shndx, strtabidx,
+					 map_names))
+			goto done;
+
+		if (load_maps(data_maps->d_buf, data_maps->d_size,
+			      (const char **)map_names, fixup_map))
+			goto done;
+
+		processed_sec[maps_shndx] = true;
+	}
+
 	/* load programs that need map fixup (relocations) */
 	for (i = 1; i < ehdr.e_shnum; i++) {
 		if (processed_sec[i])
@@ -399,8 +463,22 @@ int load_bpf_file(char *path)
 			load_and_attach(shname, data->d_buf, data->d_size);
 	}
 
+	ret = 0;
+done:
+	for (i = 0; i < MAX_MAPS; i++)
+		free(map_names[i]);
 	close(fd);
-	return 0;
+	return ret;
+}
+
+int load_bpf_file(char *path)
+{
+	return do_load_bpf_file(path, NULL);
+}
+
+int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map)
+{
+	return do_load_bpf_file(path, fixup_map);
 }
 
 void read_trace_pipe(void)
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index c827827299b3b77831650e762376b061b6c93021..68f6b2d22507d706dc5d9f64d8fc121b5129ea1f 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -6,6 +6,18 @@
 #define MAX_MAPS 32
 #define MAX_PROGS 32
 
+struct bpf_map_def {
+	unsigned int type;
+	unsigned int key_size;
+	unsigned int value_size;
+	unsigned int max_entries;
+	unsigned int map_flags;
+	unsigned int inner_map_idx;
+};
+
+typedef void (*fixup_map_cb)(struct bpf_map_def *map, const char *map_name,
+			     int idx);
+
 extern int map_fd[MAX_MAPS];
 extern int prog_fd[MAX_PROGS];
 extern int event_fd[MAX_PROGS];
@@ -25,6 +37,7 @@ extern int prog_cnt;
  * returns zero on success
  */
 int load_bpf_file(char *path);
+int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map);
 
 void read_trace_pipe(void);
 struct ksym {
diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c
index 9da2a3441b0a2e88eb63c3b60b2e8b76949b72fe..245165817fbe63bd53ea9cf7f3a3ab6cf5e7a2fa 100644
--- a/samples/bpf/map_perf_test_kern.c
+++ b/samples/bpf/map_perf_test_kern.c
@@ -11,6 +11,7 @@
 #include "bpf_helpers.h"
 
 #define MAX_ENTRIES 1000
+#define MAX_NR_CPUS 1024
 
 struct bpf_map_def SEC("maps") hash_map = {
 	.type = BPF_MAP_TYPE_HASH,
@@ -26,7 +27,7 @@ struct bpf_map_def SEC("maps") lru_hash_map = {
 	.max_entries = 10000,
 };
 
-struct bpf_map_def SEC("maps") percpu_lru_hash_map = {
+struct bpf_map_def SEC("maps") nocommon_lru_hash_map = {
 	.type = BPF_MAP_TYPE_LRU_HASH,
 	.key_size = sizeof(u32),
 	.value_size = sizeof(long),
@@ -34,6 +35,19 @@ struct bpf_map_def SEC("maps") percpu_lru_hash_map = {
 	.map_flags = BPF_F_NO_COMMON_LRU,
 };
 
+struct bpf_map_def SEC("maps") inner_lru_hash_map = {
+	.type = BPF_MAP_TYPE_LRU_HASH,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(long),
+	.max_entries = MAX_ENTRIES,
+};
+
+struct bpf_map_def SEC("maps") array_of_lru_hashs = {
+	.type = BPF_MAP_TYPE_ARRAY_OF_MAPS,
+	.key_size = sizeof(u32),
+	.max_entries = MAX_NR_CPUS,
+};
+
 struct bpf_map_def SEC("maps") percpu_hash_map = {
 	.type = BPF_MAP_TYPE_PERCPU_HASH,
 	.key_size = sizeof(u32),
@@ -100,6 +114,7 @@ int stress_percpu_hmap(struct pt_regs *ctx)
 		bpf_map_delete_elem(&percpu_hash_map, &key);
 	return 0;
 }
+
 SEC("kprobe/sys_getgid")
 int stress_hmap_alloc(struct pt_regs *ctx)
 {
@@ -128,24 +143,56 @@ int stress_percpu_hmap_alloc(struct pt_regs *ctx)
 	return 0;
 }
 
-SEC("kprobe/sys_getpid")
+SEC("kprobe/sys_connect")
 int stress_lru_hmap_alloc(struct pt_regs *ctx)
 {
-	u32 key = bpf_get_prandom_u32();
+	struct sockaddr_in6 *in6;
+	u16 test_case, dst6[8];
+	int addrlen, ret;
+	char fmt[] = "Failed at stress_lru_hmap_alloc. ret:%d\n";
 	long val = 1;
-
-	bpf_map_update_elem(&lru_hash_map, &key, &val, BPF_ANY);
-
-	return 0;
-}
-
-SEC("kprobe/sys_getppid")
-int stress_percpu_lru_hmap_alloc(struct pt_regs *ctx)
-{
 	u32 key = bpf_get_prandom_u32();
-	long val = 1;
 
-	bpf_map_update_elem(&percpu_lru_hash_map, &key, &val, BPF_ANY);
+	in6 = (struct sockaddr_in6 *)PT_REGS_PARM2(ctx);
+	addrlen = (int)PT_REGS_PARM3(ctx);
+
+	if (addrlen != sizeof(*in6))
+		return 0;
+
+	ret = bpf_probe_read(dst6, sizeof(dst6), &in6->sin6_addr);
+	if (ret)
+		goto done;
+
+	if (dst6[0] != 0xdead || dst6[1] != 0xbeef)
+		return 0;
+
+	test_case = dst6[7];
+
+	if (test_case == 0) {
+		ret = bpf_map_update_elem(&lru_hash_map, &key, &val, BPF_ANY);
+	} else if (test_case == 1) {
+		ret = bpf_map_update_elem(&nocommon_lru_hash_map, &key, &val,
+					  BPF_ANY);
+	} else if (test_case == 2) {
+		void *nolocal_lru_map;
+		int cpu = bpf_get_smp_processor_id();
+
+		nolocal_lru_map = bpf_map_lookup_elem(&array_of_lru_hashs,
+						      &cpu);
+		if (!nolocal_lru_map) {
+			ret = -ENOENT;
+			goto done;
+		}
+
+		ret = bpf_map_update_elem(nolocal_lru_map, &key, &val,
+					  BPF_ANY);
+	} else {
+		ret = -EINVAL;
+	}
+
+done:
+	if (ret)
+		bpf_trace_printk(fmt, sizeof(fmt), ret);
 
 	return 0;
 }
diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c
index e29ff318a79365ae5f84e833b45f9d2a46895421..6ac778153315edcbde80780bd68fbf5e288b677f 100644
--- a/samples/bpf/map_perf_test_user.c
+++ b/samples/bpf/map_perf_test_user.c
@@ -18,10 +18,14 @@
 #include <string.h>
 #include <time.h>
 #include <sys/resource.h>
+#include <arpa/inet.h>
+#include <errno.h>
+
 #include "libbpf.h"
 #include "bpf_load.h"
 
-#define MAX_CNT 1000000
+#define TEST_BIT(t) (1U << (t))
+#define MAX_NR_CPUS 1024
 
 static __u64 time_get_ns(void)
 {
@@ -31,17 +35,44 @@ static __u64 time_get_ns(void)
 	return ts.tv_sec * 1000000000ull + ts.tv_nsec;
 }
 
-#define HASH_PREALLOC		(1 << 0)
-#define PERCPU_HASH_PREALLOC	(1 << 1)
-#define HASH_KMALLOC		(1 << 2)
-#define PERCPU_HASH_KMALLOC	(1 << 3)
-#define LRU_HASH_PREALLOC	(1 << 4)
-#define PERCPU_LRU_HASH_PREALLOC	(1 << 5)
-#define LPM_KMALLOC		(1 << 6)
-#define HASH_LOOKUP		(1 << 7)
-#define ARRAY_LOOKUP		(1 << 8)
+enum test_type {
+	HASH_PREALLOC,
+	PERCPU_HASH_PREALLOC,
+	HASH_KMALLOC,
+	PERCPU_HASH_KMALLOC,
+	LRU_HASH_PREALLOC,
+	NOCOMMON_LRU_HASH_PREALLOC,
+	LPM_KMALLOC,
+	HASH_LOOKUP,
+	ARRAY_LOOKUP,
+	INNER_LRU_HASH_PREALLOC,
+	NR_TESTS,
+};
+
+const char *test_map_names[NR_TESTS] = {
+	[HASH_PREALLOC] = "hash_map",
+	[PERCPU_HASH_PREALLOC] = "percpu_hash_map",
+	[HASH_KMALLOC] = "hash_map_alloc",
+	[PERCPU_HASH_KMALLOC] = "percpu_hash_map_alloc",
+	[LRU_HASH_PREALLOC] = "lru_hash_map",
+	[NOCOMMON_LRU_HASH_PREALLOC] = "nocommon_lru_hash_map",
+	[LPM_KMALLOC] = "lpm_trie_map_alloc",
+	[HASH_LOOKUP] = "hash_map",
+	[ARRAY_LOOKUP] = "array_map",
+	[INNER_LRU_HASH_PREALLOC] = "inner_lru_hash_map",
+};
 
 static int test_flags = ~0;
+static uint32_t num_map_entries;
+static uint32_t inner_lru_hash_size;
+static int inner_lru_hash_idx = -1;
+static int array_of_lru_hashs_idx = -1;
+static uint32_t max_cnt = 1000000;
+
+static int check_test_flags(enum test_type t)
+{
+	return test_flags & TEST_BIT(t);
+}
 
 static void test_hash_prealloc(int cpu)
 {
@@ -49,34 +80,89 @@ static void test_hash_prealloc(int cpu)
 	int i;
 
 	start_time = time_get_ns();
-	for (i = 0; i < MAX_CNT; i++)
+	for (i = 0; i < max_cnt; i++)
 		syscall(__NR_getuid);
 	printf("%d:hash_map_perf pre-alloc %lld events per sec\n",
-	       cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
+	       cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time));
 }
 
-static void test_lru_hash_prealloc(int cpu)
+static void do_test_lru(enum test_type test, int cpu)
 {
+	static int inner_lru_map_fds[MAX_NR_CPUS];
+
+	struct sockaddr_in6 in6 = { .sin6_family = AF_INET6 };
+	const char *test_name;
 	__u64 start_time;
-	int i;
+	int i, ret;
+
+	if (test == INNER_LRU_HASH_PREALLOC) {
+		int outer_fd = map_fd[array_of_lru_hashs_idx];
+
+		assert(cpu < MAX_NR_CPUS);
+
+		if (cpu) {
+			inner_lru_map_fds[cpu] =
+				bpf_create_map(BPF_MAP_TYPE_LRU_HASH,
+					       sizeof(uint32_t), sizeof(long),
+					       inner_lru_hash_size, 0);
+			if (inner_lru_map_fds[cpu] == -1) {
+				printf("cannot create BPF_MAP_TYPE_LRU_HASH %s(%d)\n",
+				       strerror(errno), errno);
+				exit(1);
+			}
+		} else {
+			inner_lru_map_fds[cpu] = map_fd[inner_lru_hash_idx];
+		}
+
+		ret = bpf_map_update_elem(outer_fd, &cpu,
+					  &inner_lru_map_fds[cpu],
+					  BPF_ANY);
+		if (ret) {
+			printf("cannot update ARRAY_OF_LRU_HASHS with key:%u. %s(%d)\n",
+			       cpu, strerror(errno), errno);
+			exit(1);
+		}
+	}
+
+	in6.sin6_addr.s6_addr16[0] = 0xdead;
+	in6.sin6_addr.s6_addr16[1] = 0xbeef;
+
+	if (test == LRU_HASH_PREALLOC) {
+		test_name = "lru_hash_map_perf";
+		in6.sin6_addr.s6_addr16[7] = 0;
+	} else if (test == NOCOMMON_LRU_HASH_PREALLOC) {
+		test_name = "nocommon_lru_hash_map_perf";
+		in6.sin6_addr.s6_addr16[7] = 1;
+	} else if (test == INNER_LRU_HASH_PREALLOC) {
+		test_name = "inner_lru_hash_map_perf";
+		in6.sin6_addr.s6_addr16[7] = 2;
+	} else {
+		assert(0);
+	}
 
 	start_time = time_get_ns();
-	for (i = 0; i < MAX_CNT; i++)
-		syscall(__NR_getpid);
-	printf("%d:lru_hash_map_perf pre-alloc %lld events per sec\n",
-	       cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
+	for (i = 0; i < max_cnt; i++) {
+		ret = connect(-1, (const struct sockaddr *)&in6, sizeof(in6));
+		assert(ret == -1 && errno == EBADF);
+	}
+	printf("%d:%s pre-alloc %lld events per sec\n",
+	       cpu, test_name,
+	       max_cnt * 1000000000ll / (time_get_ns() - start_time));
 }
 
-static void test_percpu_lru_hash_prealloc(int cpu)
+static void test_lru_hash_prealloc(int cpu)
 {
-	__u64 start_time;
-	int i;
+	do_test_lru(LRU_HASH_PREALLOC, cpu);
+}
 
-	start_time = time_get_ns();
-	for (i = 0; i < MAX_CNT; i++)
-		syscall(__NR_getppid);
-	printf("%d:lru_hash_map_perf pre-alloc %lld events per sec\n",
-	       cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
+static void test_nocommon_lru_hash_prealloc(int cpu)
+{
+	do_test_lru(NOCOMMON_LRU_HASH_PREALLOC, cpu);
+}
+
+static void test_inner_lru_hash_prealloc(int cpu)
+{
+	do_test_lru(INNER_LRU_HASH_PREALLOC, cpu);
 }
 
 static void test_percpu_hash_prealloc(int cpu)
@@ -85,10 +171,10 @@ static void test_percpu_hash_prealloc(int cpu)
 	int i;
 
 	start_time = time_get_ns();
-	for (i = 0; i < MAX_CNT; i++)
+	for (i = 0; i < max_cnt; i++)
 		syscall(__NR_geteuid);
 	printf("%d:percpu_hash_map_perf pre-alloc %lld events per sec\n",
-	       cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
+	       cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time));
 }
 
 static void test_hash_kmalloc(int cpu)
@@ -97,10 +183,10 @@ static void test_hash_kmalloc(int cpu)
 	int i;
 
 	start_time = time_get_ns();
-	for (i = 0; i < MAX_CNT; i++)
+	for (i = 0; i < max_cnt; i++)
 		syscall(__NR_getgid);
 	printf("%d:hash_map_perf kmalloc %lld events per sec\n",
-	       cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
+	       cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time));
 }
 
 static void test_percpu_hash_kmalloc(int cpu)
@@ -109,10 +195,10 @@ static void test_percpu_hash_kmalloc(int cpu)
 	int i;
 
 	start_time = time_get_ns();
-	for (i = 0; i < MAX_CNT; i++)
+	for (i = 0; i < max_cnt; i++)
 		syscall(__NR_getegid);
 	printf("%d:percpu_hash_map_perf kmalloc %lld events per sec\n",
-	       cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
+	       cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time));
 }
 
 static void test_lpm_kmalloc(int cpu)
@@ -121,10 +207,10 @@ static void test_lpm_kmalloc(int cpu)
 	int i;
 
 	start_time = time_get_ns();
-	for (i = 0; i < MAX_CNT; i++)
+	for (i = 0; i < max_cnt; i++)
 		syscall(__NR_gettid);
 	printf("%d:lpm_perf kmalloc %lld events per sec\n",
-	       cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
+	       cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time));
 }
 
 static void test_hash_lookup(int cpu)
@@ -133,10 +219,10 @@ static void test_hash_lookup(int cpu)
 	int i;
 
 	start_time = time_get_ns();
-	for (i = 0; i < MAX_CNT; i++)
+	for (i = 0; i < max_cnt; i++)
 		syscall(__NR_getpgid, 0);
 	printf("%d:hash_lookup %lld lookups per sec\n",
-	       cpu, MAX_CNT * 1000000000ll * 64 / (time_get_ns() - start_time));
+	       cpu, max_cnt * 1000000000ll * 64 / (time_get_ns() - start_time));
 }
 
 static void test_array_lookup(int cpu)
@@ -145,46 +231,39 @@ static void test_array_lookup(int cpu)
 	int i;
 
 	start_time = time_get_ns();
-	for (i = 0; i < MAX_CNT; i++)
+	for (i = 0; i < max_cnt; i++)
 		syscall(__NR_getpgrp, 0);
 	printf("%d:array_lookup %lld lookups per sec\n",
-	       cpu, MAX_CNT * 1000000000ll * 64 / (time_get_ns() - start_time));
+	       cpu, max_cnt * 1000000000ll * 64 / (time_get_ns() - start_time));
 }
 
+typedef void (*test_func)(int cpu);
+const test_func test_funcs[] = {
+	[HASH_PREALLOC] = test_hash_prealloc,
+	[PERCPU_HASH_PREALLOC] = test_percpu_hash_prealloc,
+	[HASH_KMALLOC] = test_hash_kmalloc,
+	[PERCPU_HASH_KMALLOC] = test_percpu_hash_kmalloc,
+	[LRU_HASH_PREALLOC] = test_lru_hash_prealloc,
+	[NOCOMMON_LRU_HASH_PREALLOC] = test_nocommon_lru_hash_prealloc,
+	[LPM_KMALLOC] = test_lpm_kmalloc,
+	[HASH_LOOKUP] = test_hash_lookup,
+	[ARRAY_LOOKUP] = test_array_lookup,
+	[INNER_LRU_HASH_PREALLOC] = test_inner_lru_hash_prealloc,
+};
+
 static void loop(int cpu)
 {
 	cpu_set_t cpuset;
+	int i;
 
 	CPU_ZERO(&cpuset);
 	CPU_SET(cpu, &cpuset);
 	sched_setaffinity(0, sizeof(cpuset), &cpuset);
 
-	if (test_flags & HASH_PREALLOC)
-		test_hash_prealloc(cpu);
-
-	if (test_flags & PERCPU_HASH_PREALLOC)
-		test_percpu_hash_prealloc(cpu);
-
-	if (test_flags & HASH_KMALLOC)
-		test_hash_kmalloc(cpu);
-
-	if (test_flags & PERCPU_HASH_KMALLOC)
-		test_percpu_hash_kmalloc(cpu);
-
-	if (test_flags & LRU_HASH_PREALLOC)
-		test_lru_hash_prealloc(cpu);
-
-	if (test_flags & PERCPU_LRU_HASH_PREALLOC)
-		test_percpu_lru_hash_prealloc(cpu);
-
-	if (test_flags & LPM_KMALLOC)
-		test_lpm_kmalloc(cpu);
-
-	if (test_flags & HASH_LOOKUP)
-		test_hash_lookup(cpu);
-
-	if (test_flags & ARRAY_LOOKUP)
-		test_array_lookup(cpu);
+	for (i = 0; i < NR_TESTS; i++) {
+		if (check_test_flags(i))
+			test_funcs[i](cpu);
+	}
 }
 
 static void run_perf_test(int tasks)
@@ -241,6 +320,38 @@ static void fill_lpm_trie(void)
 	assert(!r);
 }
 
+static void fixup_map(struct bpf_map_def *map, const char *name, int idx)
+{
+	int i;
+
+	if (!strcmp("inner_lru_hash_map", name)) {
+		inner_lru_hash_idx = idx;
+		inner_lru_hash_size = map->max_entries;
+	}
+
+	if (!strcmp("array_of_lru_hashs", name)) {
+		if (inner_lru_hash_idx == -1) {
+			printf("inner_lru_hash_map must be defined before array_of_lru_hashs\n");
+			exit(1);
+		}
+		map->inner_map_idx = inner_lru_hash_idx;
+		array_of_lru_hashs_idx = idx;
+	}
+
+	if (num_map_entries <= 0)
+		return;
+
+	inner_lru_hash_size = num_map_entries;
+
+	/* Only change the max_entries for the enabled test(s) */
+	for (i = 0; i < NR_TESTS; i++) {
+		if (!strcmp(test_map_names[i], name) &&
+		    (check_test_flags(i))) {
+			map->max_entries = num_map_entries;
+		}
+	}
+}
+
 int main(int argc, char **argv)
 {
 	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
@@ -256,7 +367,13 @@ int main(int argc, char **argv)
 	if (argc > 2)
 		num_cpu = atoi(argv[2]) ? : num_cpu;
 
-	if (load_bpf_file(filename)) {
+	if (argc > 3)
+		num_map_entries = atoi(argv[3]);
+
+	if (argc > 4)
+		max_cnt = atoi(argv[4]);
+
+	if (load_bpf_file_fixup_map(filename, fixup_map)) {
 		printf("%s", bpf_log_buf);
 		return 1;
 	}
diff --git a/tools/testing/selftests/bpf/test_lru_map.c b/tools/testing/selftests/bpf/test_lru_map.c
index 00b0aff56e2e7256de150815e0da2eb0fb20d7b6..8c10c9180c1a6535f18caafcdf7195f69d04ecd2 100644
--- a/tools/testing/selftests/bpf/test_lru_map.c
+++ b/tools/testing/selftests/bpf/test_lru_map.c
@@ -22,7 +22,7 @@
 #include "bpf_util.h"
 
 #define LOCAL_FREE_TARGET	(128)
-#define PERCPU_FREE_TARGET	(16)
+#define PERCPU_FREE_TARGET	(4)
 
 static int nr_cpus;
 
@@ -191,12 +191,7 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free)
 	int next_cpu = 0;
 
 	if (map_flags & BPF_F_NO_COMMON_LRU)
-		/* Ther percpu lru list (i.e each cpu has its own LRU
-		 * list) does not have a local free list.  Hence,
-		 * it will only free old nodes till there is no free
-		 * from the LRU list.  Hence, this test does not apply
-		 * to BPF_F_NO_COMMON_LRU
-		 */
+		/* This test is only applicable to common LRU list */
 		return;
 
 	printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
@@ -227,7 +222,7 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free)
 	for (key = 1; key < end_key; key++) {
 		assert(!bpf_map_lookup_elem(lru_map_fd, &key, value));
 		assert(!bpf_map_update_elem(expected_map_fd, &key, value,
-				            BPF_NOEXIST));
+					    BPF_NOEXIST));
 	}
 
 	/* Insert 1+tgt_free to 2*tgt_free
@@ -273,12 +268,7 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
 	int next_cpu = 0;
 
 	if (map_flags & BPF_F_NO_COMMON_LRU)
-		/* Ther percpu lru list (i.e each cpu has its own LRU
-		 * list) does not have a local free list.  Hence,
-		 * it will only free old nodes till there is no free
-		 * from the LRU list.  Hence, this test does not apply
-		 * to BPF_F_NO_COMMON_LRU
-		 */
+		/* This test is only applicable to common LRU list */
 		return;
 
 	printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
@@ -290,11 +280,7 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
 	assert(batch_size * 2 == tgt_free);
 
 	map_size = tgt_free + batch_size;
-	if (map_flags & BPF_F_NO_COMMON_LRU)
-		lru_map_fd = create_map(map_type, map_flags,
-					map_size * nr_cpus);
-	else
-		lru_map_fd = create_map(map_type, map_flags, map_size);
+	lru_map_fd = create_map(map_type, map_flags, map_size);
 	assert(lru_map_fd != -1);
 
 	expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0, map_size);
@@ -341,7 +327,7 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
 		assert(!bpf_map_lookup_elem(lru_map_fd, &key, value));
 		assert(value[0] == 4321);
 		assert(!bpf_map_update_elem(expected_map_fd, &key, value,
-				            BPF_NOEXIST));
+					    BPF_NOEXIST));
 	}
 
 	value[0] = 1234;
@@ -361,7 +347,7 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
 		assert(!bpf_map_update_elem(lru_map_fd, &key, value,
 					    BPF_NOEXIST));
 		assert(!bpf_map_update_elem(expected_map_fd, &key, value,
-				            BPF_NOEXIST));
+					    BPF_NOEXIST));
 	}
 
 	assert(map_equal(lru_map_fd, expected_map_fd));
@@ -387,6 +373,10 @@ static void test_lru_sanity3(int map_type, int map_flags, unsigned int tgt_free)
 	unsigned int map_size;
 	int next_cpu = 0;
 
+	if (map_flags & BPF_F_NO_COMMON_LRU)
+		/* This test is only applicable to common LRU list */
+		return;
+
 	printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
 	       map_flags);
 
@@ -396,11 +386,7 @@ static void test_lru_sanity3(int map_type, int map_flags, unsigned int tgt_free)
 	assert(batch_size * 2 == tgt_free);
 
 	map_size = tgt_free * 2;
-	if (map_flags & BPF_F_NO_COMMON_LRU)
-		lru_map_fd = create_map(map_type, map_flags,
-					map_size * nr_cpus);
-	else
-		lru_map_fd = create_map(map_type, map_flags, map_size);
+	lru_map_fd = create_map(map_type, map_flags, map_size);
 	assert(lru_map_fd != -1);
 
 	expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0, map_size);
@@ -419,7 +405,7 @@ static void test_lru_sanity3(int map_type, int map_flags, unsigned int tgt_free)
 	for (key = 1; key < end_key; key++) {
 		assert(!bpf_map_lookup_elem(lru_map_fd, &key, value));
 		assert(!bpf_map_update_elem(expected_map_fd, &key, value,
-				            BPF_NOEXIST));
+					    BPF_NOEXIST));
 	}
 
 	/* Add 1+2*tgt_free to tgt_free*5/2
@@ -431,7 +417,7 @@ static void test_lru_sanity3(int map_type, int map_flags, unsigned int tgt_free)
 		assert(!bpf_map_update_elem(lru_map_fd, &key, value,
 					    BPF_NOEXIST));
 		assert(!bpf_map_update_elem(expected_map_fd, &key, value,
-				            BPF_NOEXIST));
+					    BPF_NOEXIST));
 	}
 
 	assert(map_equal(lru_map_fd, expected_map_fd));
@@ -491,7 +477,7 @@ static void test_lru_sanity4(int map_type, int map_flags, unsigned int tgt_free)
 		assert(!bpf_map_update_elem(lru_map_fd, &key, value,
 					    BPF_NOEXIST));
 		assert(!bpf_map_update_elem(expected_map_fd, &key, value,
-				            BPF_NOEXIST));
+					    BPF_NOEXIST));
 	}
 
 	assert(map_equal(lru_map_fd, expected_map_fd));
@@ -566,6 +552,65 @@ static void test_lru_sanity5(int map_type, int map_flags)
 	printf("Pass\n");
 }
 
+/* Test list rotation for BPF_F_NO_COMMON_LRU map */
+static void test_lru_sanity6(int map_type, int map_flags, int tgt_free)
+{
+	int lru_map_fd, expected_map_fd;
+	unsigned long long key, value[nr_cpus];
+	unsigned int map_size = tgt_free * 2;
+	int next_cpu = 0;
+
+	if (!(map_flags & BPF_F_NO_COMMON_LRU))
+		return;
+
+	printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+	       map_flags);
+
+	assert(sched_next_online(0, &next_cpu) != -1);
+
+	expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0, map_size);
+	assert(expected_map_fd != -1);
+
+	lru_map_fd = create_map(map_type, map_flags, map_size * nr_cpus);
+	assert(lru_map_fd != -1);
+
+	value[0] = 1234;
+
+	for (key = 1; key <= tgt_free; key++) {
+		assert(!bpf_map_update_elem(lru_map_fd, &key, value,
+					    BPF_NOEXIST));
+		assert(!bpf_map_update_elem(expected_map_fd, &key, value,
+					    BPF_NOEXIST));
+	}
+
+	for (; key <= tgt_free * 2; key++) {
+		unsigned long long stable_key;
+
+		/* Make ref bit sticky for key: [1, tgt_free] */
+		for (stable_key = 1; stable_key <= tgt_free; stable_key++) {
+			/* Mark the ref bit */
+			assert(!bpf_map_lookup_elem(lru_map_fd, &stable_key,
+						    value));
+		}
+		assert(!bpf_map_update_elem(lru_map_fd, &key, value,
+					    BPF_NOEXIST));
+	}
+
+	for (; key <= tgt_free * 3; key++) {
+		assert(!bpf_map_update_elem(lru_map_fd, &key, value,
+					    BPF_NOEXIST));
+		assert(!bpf_map_update_elem(expected_map_fd, &key, value,
+					    BPF_NOEXIST));
+	}
+
+	assert(map_equal(lru_map_fd, expected_map_fd));
+
+	close(expected_map_fd);
+	close(lru_map_fd);
+
+	printf("Pass\n");
+}
+
 int main(int argc, char **argv)
 {
 	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
@@ -593,6 +638,7 @@ int main(int argc, char **argv)
 			test_lru_sanity3(map_types[t], map_flags[f], tgt_free);
 			test_lru_sanity4(map_types[t], map_flags[f], tgt_free);
 			test_lru_sanity5(map_types[t], map_flags[f]);
+			test_lru_sanity6(map_types[t], map_flags[f], tgt_free);
 
 			printf("\n");
 		}