提交 af38553c 编写于 作者: L Linus Torvalds

Merge branch 'akpm' (patches from Andrew)

Merge misc fixes from Andrew Morton:
 "14 fixes and one selftest to verify the ipc fixes herein"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm: limit boost_watermark on small zones
  ubsan: disable UBSAN_ALIGNMENT under COMPILE_TEST
  mm/vmscan: remove unnecessary argument description of isolate_lru_pages()
  epoll: atomically remove wait entry on wake up
  kselftests: introduce new epoll60 testcase for catching lost wakeups
  percpu: make pcpu_alloc() aware of current gfp context
  mm/slub: fix incorrect interpretation of s->offset
  scripts/gdb: repair rb_first() and rb_last()
  eventpoll: fix missing wakeup for ovflist in ep_poll_callback
  arch/x86/kvm/svm/sev.c: change flag passed to GUP fast in sev_pin_memory()
  scripts/decodecode: fix trapping instruction formatting
  kernel/kcov.c: fix typos in kcov_remote_start documentation
  mm/page_alloc: fix watchdog soft lockups during set_zone_contiguous()
  mm, memcg: fix error return value of mem_cgroup_css_alloc()
  ipc/mqueue.c: change __do_notify() to bypass check_kill_permission()
......@@ -345,7 +345,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
return NULL;
/* Pin the user virtual address. */
npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages);
npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages);
if (npinned != npages) {
pr_err("SEV: Failure locking %lu pages.\n", npages);
goto err;
......
......@@ -1171,6 +1171,10 @@ static inline bool chain_epi_lockless(struct epitem *epi)
{
struct eventpoll *ep = epi->ep;
/* Fast preliminary check */
if (epi->next != EP_UNACTIVE_PTR)
return false;
/* Check that the same epi has not been just chained from another CPU */
if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
return false;
......@@ -1237,16 +1241,12 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
* chained in ep->ovflist and requeued later on.
*/
if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
if (epi->next == EP_UNACTIVE_PTR &&
chain_epi_lockless(epi))
if (chain_epi_lockless(epi))
ep_pm_stay_awake_rcu(epi);
} else if (!ep_is_linked(epi)) {
/* In the usual case, add event to ready list. */
if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
ep_pm_stay_awake_rcu(epi);
goto out_unlock;
}
/* If this file is already in the ready list we exit soon */
if (!ep_is_linked(epi) &&
list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) {
ep_pm_stay_awake_rcu(epi);
}
/*
......@@ -1822,7 +1822,6 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
{
int res = 0, eavail, timed_out = 0;
u64 slack = 0;
bool waiter = false;
wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
......@@ -1867,21 +1866,23 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
*/
ep_reset_busy_poll_napi_id(ep);
/*
* We don't have any available event to return to the caller. We need
* to sleep here, and we will be woken by ep_poll_callback() when events
* become available.
*/
if (!waiter) {
waiter = true;
init_waitqueue_entry(&wait, current);
do {
/*
* Internally init_wait() uses autoremove_wake_function(),
* thus the wait entry is removed from the wait queue on each
* wakeup. Why is this important? With several waiters, each
* new wakeup will hit the next waiter, giving it the chance
* to harvest the new event. Otherwise a wakeup can be lost.
* This is also good performance-wise, because on the normal
* wakeup path there is no need to call __remove_wait_queue()
* explicitly, thus ep->lock is not taken, which would halt
* event delivery.
*/
init_wait(&wait);
write_lock_irq(&ep->lock);
__add_wait_queue_exclusive(&ep->wq, &wait);
write_unlock_irq(&ep->lock);
}
for (;;) {
/*
* We don't want to sleep if the ep_poll_callback() sends us
* a wakeup in between. That's why we set the task state
......@@ -1911,10 +1912,20 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
timed_out = 1;
break;
}
}
/* We were woken up, thus go and try to harvest some events */
eavail = 1;
} while (0);
__set_current_state(TASK_RUNNING);
if (!list_empty_careful(&wait.entry)) {
write_lock_irq(&ep->lock);
__remove_wait_queue(&ep->wq, &wait);
write_unlock_irq(&ep->lock);
}
send_events:
/*
* Try to transfer events to user space. In case we get 0 events and
......@@ -1925,12 +1936,6 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
!(res = ep_send_events(ep, events, maxevents)) && !timed_out)
goto fetch_events;
if (waiter) {
write_lock_irq(&ep->lock);
__remove_wait_queue(&ep->wq, &wait);
write_unlock_irq(&ep->lock);
}
return res;
}
......
......@@ -142,6 +142,7 @@ struct mqueue_inode_info {
struct sigevent notify;
struct pid *notify_owner;
u32 notify_self_exec_id;
struct user_namespace *notify_user_ns;
struct user_struct *user; /* user who created, for accounting */
struct sock *notify_sock;
......@@ -773,28 +774,44 @@ static void __do_notify(struct mqueue_inode_info *info)
* synchronously. */
if (info->notify_owner &&
info->attr.mq_curmsgs == 1) {
struct kernel_siginfo sig_i;
switch (info->notify.sigev_notify) {
case SIGEV_NONE:
break;
case SIGEV_SIGNAL:
/* sends signal */
case SIGEV_SIGNAL: {
struct kernel_siginfo sig_i;
struct task_struct *task;
/* do_mq_notify() accepts sigev_signo == 0, why?? */
if (!info->notify.sigev_signo)
break;
clear_siginfo(&sig_i);
sig_i.si_signo = info->notify.sigev_signo;
sig_i.si_errno = 0;
sig_i.si_code = SI_MESGQ;
sig_i.si_value = info->notify.sigev_value;
/* map current pid/uid into info->owner's namespaces */
rcu_read_lock();
/* map current pid/uid into info->owner's namespaces */
sig_i.si_pid = task_tgid_nr_ns(current,
ns_of_pid(info->notify_owner));
sig_i.si_uid = from_kuid_munged(info->notify_user_ns, current_uid());
sig_i.si_uid = from_kuid_munged(info->notify_user_ns,
current_uid());
/*
* We can't use kill_pid_info(), this signal should
* bypass check_kill_permission(). It is from kernel
* but si_fromuser() can't know this.
* We do check the self_exec_id, to avoid sending
* signals to programs that don't expect them.
*/
task = pid_task(info->notify_owner, PIDTYPE_TGID);
if (task && task->self_exec_id ==
info->notify_self_exec_id) {
do_send_sig_info(info->notify.sigev_signo,
&sig_i, task, PIDTYPE_TGID);
}
rcu_read_unlock();
kill_pid_info(info->notify.sigev_signo,
&sig_i, info->notify_owner);
break;
}
case SIGEV_THREAD:
set_cookie(info->notify_cookie, NOTIFY_WOKENUP);
netlink_sendskb(info->notify_sock, info->notify_cookie);
......@@ -1383,6 +1400,7 @@ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification)
info->notify.sigev_signo = notification->sigev_signo;
info->notify.sigev_value = notification->sigev_value;
info->notify.sigev_notify = SIGEV_SIGNAL;
info->notify_self_exec_id = current->self_exec_id;
break;
}
......
......@@ -740,8 +740,8 @@ static const struct file_operations kcov_fops = {
* kcov_remote_handle() with KCOV_SUBSYSTEM_COMMON as the subsystem id and an
* arbitrary 4-byte non-zero number as the instance id). This common handle
* then gets saved into the task_struct of the process that issued the
* KCOV_REMOTE_ENABLE ioctl. When this proccess issues system calls that spawn
* kernel threads, the common handle must be retrived via kcov_common_handle()
* KCOV_REMOTE_ENABLE ioctl. When this process issues system calls that spawn
* kernel threads, the common handle must be retrieved via kcov_common_handle()
* and passed to the spawned threads via custom annotations. Those kernel
* threads must in turn be annotated with kcov_remote_start(common_handle) and
* kcov_remote_stop(). All of the threads that are spawned by the same process
......
......@@ -60,18 +60,15 @@ config UBSAN_SANITIZE_ALL
Enabling this option will get kernel image size increased
significantly.
config UBSAN_NO_ALIGNMENT
bool "Disable checking of pointers alignment"
default y if HAVE_EFFICIENT_UNALIGNED_ACCESS
config UBSAN_ALIGNMENT
bool "Enable checks for pointers alignment"
default !HAVE_EFFICIENT_UNALIGNED_ACCESS
depends on !X86 || !COMPILE_TEST
help
This option disables the check of unaligned memory accesses.
This option should be used when building allmodconfig.
Disabling this option on architectures that support unaligned
This option enables the check of unaligned memory accesses.
Enabling this option on architectures that support unaligned
accesses may produce a lot of false positives.
config UBSAN_ALIGNMENT
def_bool !UBSAN_NO_ALIGNMENT
config TEST_UBSAN
tristate "Module for testing for undefined behavior detection"
depends on m
......
......@@ -4990,19 +4990,22 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
unsigned int size;
int node;
int __maybe_unused i;
long error = -ENOMEM;
size = sizeof(struct mem_cgroup);
size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
memcg = kzalloc(size, GFP_KERNEL);
if (!memcg)
return NULL;
return ERR_PTR(error);
memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
1, MEM_CGROUP_ID_MAX,
GFP_KERNEL);
if (memcg->id.id < 0)
if (memcg->id.id < 0) {
error = memcg->id.id;
goto fail;
}
memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu);
if (!memcg->vmstats_local)
......@@ -5046,7 +5049,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
fail:
mem_cgroup_id_remove(memcg);
__mem_cgroup_free(memcg);
return NULL;
return ERR_PTR(error);
}
static struct cgroup_subsys_state * __ref
......@@ -5057,8 +5060,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
long error = -ENOMEM;
memcg = mem_cgroup_alloc();
if (!memcg)
return ERR_PTR(error);
if (IS_ERR(memcg))
return ERR_CAST(memcg);
WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX);
memcg->soft_limit = PAGE_COUNTER_MAX;
......@@ -5108,7 +5111,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
fail:
mem_cgroup_id_remove(memcg);
mem_cgroup_free(memcg);
return ERR_PTR(-ENOMEM);
return ERR_PTR(error);
}
static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
......
......@@ -1607,6 +1607,7 @@ void set_zone_contiguous(struct zone *zone)
if (!__pageblock_pfn_to_page(block_start_pfn,
block_end_pfn, zone))
return;
cond_resched();
}
/* We confirm that there is no hole */
......@@ -2400,6 +2401,14 @@ static inline void boost_watermark(struct zone *zone)
if (!watermark_boost_factor)
return;
/*
* Don't bother in zones that are unlikely to produce results.
* On small machines, including kdump capture kernels running
* in a small area, boosting the watermark can cause an out of
* memory situation immediately.
*/
if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
return;
max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
watermark_boost_factor, 10000);
......
......@@ -80,6 +80,7 @@
#include <linux/workqueue.h>
#include <linux/kmemleak.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <asm/cacheflush.h>
#include <asm/sections.h>
......@@ -1557,10 +1558,9 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
gfp_t gfp)
{
/* whitelisted flags that can be passed to the backing allocators */
gfp_t pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
bool do_warn = !(gfp & __GFP_NOWARN);
gfp_t pcpu_gfp;
bool is_atomic;
bool do_warn;
static int warn_limit = 10;
struct pcpu_chunk *chunk, *next;
const char *err;
......@@ -1569,6 +1569,12 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
void __percpu *ptr;
size_t bits, bit_align;
gfp = current_gfp_context(gfp);
/* whitelisted flags that can be passed to the backing allocators */
pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
do_warn = !(gfp & __GFP_NOWARN);
/*
* There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
* therefore alignment must be a minimum of that many bytes.
......
......@@ -551,15 +551,32 @@ static void print_section(char *level, char *text, u8 *addr,
metadata_access_disable();
}
/*
 * Report whether the free pointer is stored outside of the object, which
 * is taken to be the case when s->offset is at or beyond s->inuse.
 * See comment in calculate_sizes().
 */
static inline bool freeptr_outside_object(struct kmem_cache *s)
{
	return s->inuse <= s->offset;
}
/*
 * Offset of the end of the info block: s->inuse, plus room for the free
 * pointer when that pointer does not overlap the object.
 */
static inline unsigned int get_info_end(struct kmem_cache *s)
{
	unsigned int end = s->inuse;

	/* A free pointer kept outside the object takes one extra word. */
	if (freeptr_outside_object(s))
		end += sizeof(void *);

	return end;
}
static struct track *get_track(struct kmem_cache *s, void *object,
enum track_item alloc)
{
struct track *p;
if (s->offset)
p = object + s->offset + sizeof(void *);
else
p = object + s->inuse;
p = object + get_info_end(s);
return p + alloc;
}
......@@ -686,10 +703,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
print_section(KERN_ERR, "Redzone ", p + s->object_size,
s->inuse - s->object_size);
if (s->offset)
off = s->offset + sizeof(void *);
else
off = s->inuse;
off = get_info_end(s);
if (s->flags & SLAB_STORE_USER)
off += 2 * sizeof(struct track);
......@@ -782,7 +796,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
* object address
* Bytes of the object to be managed.
* If the freepointer may overlay the object then the free
* pointer is the first word of the object.
* pointer is at the middle of the object.
*
* Poisoning uses 0x6b (POISON_FREE) and the last byte is
* 0xa5 (POISON_END)
......@@ -816,11 +830,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
{
unsigned long off = s->inuse; /* The end of info */
if (s->offset)
/* Freepointer is placed after the object. */
off += sizeof(void *);
unsigned long off = get_info_end(s); /* The end of info */
if (s->flags & SLAB_STORE_USER)
/* We also have user information there */
......@@ -907,7 +917,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
check_pad_bytes(s, page, p);
}
if (!s->offset && val == SLUB_RED_ACTIVE)
if (!freeptr_outside_object(s) && val == SLUB_RED_ACTIVE)
/*
* Object and freepointer overlap. Cannot check
* freepointer while object is allocated.
......@@ -3587,6 +3597,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
*
* This is the case if we do RCU, have a constructor or
* destructor or are poisoning the objects.
*
* The assumption that s->offset >= s->inuse means free
* pointer is outside of the object is used in the
* freeptr_outside_object() function. If that is no
* longer true, the function needs to be modified.
*/
s->offset = size;
size += sizeof(void *);
......
......@@ -1625,7 +1625,6 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
* @dst: The temp list to put pages on to.
* @nr_scanned: The number of pages that were scanned.
* @sc: The scan_control struct for this reclaim session
* @mode: One of the LRU isolation modes
* @lru: LRU list id for isolating
*
* returns how many pages were moved onto *@dst.
......
......@@ -126,7 +126,7 @@ faultlinenum=$(( $(wc -l $T.oo | cut -d" " -f1) - \
faultline=`cat $T.dis | head -1 | cut -d":" -f2-`
faultline=`echo "$faultline" | sed -e 's/\[/\\\[/g; s/\]/\\\]/g'`
cat $T.oo | sed -e "${faultlinenum}s/^\(.*:\)\(.*\)/\1\*\2\t\t<-- trapping instruction/"
cat $T.oo | sed -e "${faultlinenum}s/^\([^:]*:\)\(.*\)/\1\*\2\t\t<-- trapping instruction/"
echo
cat $T.aa
cleanup
......@@ -12,7 +12,7 @@ rb_node_type = utils.CachedType("struct rb_node")
def rb_first(root):
if root.type == rb_root_type.get_type():
node = node.address.cast(rb_root_type.get_type().pointer())
node = root.address.cast(rb_root_type.get_type().pointer())
elif root.type != rb_root_type.get_type().pointer():
raise gdb.GdbError("Must be struct rb_root not {}".format(root.type))
......@@ -28,7 +28,7 @@ def rb_first(root):
def rb_last(root):
if root.type == rb_root_type.get_type():
node = node.address.cast(rb_root_type.get_type().pointer())
node = root.address.cast(rb_root_type.get_type().pointer())
elif root.type != rb_root_type.get_type().pointer():
raise gdb.GdbError("Must be struct rb_root not {}".format(root.type))
......
......@@ -3,6 +3,7 @@
#define _GNU_SOURCE
#include <poll.h>
#include <unistd.h>
#include <assert.h>
#include <signal.h>
#include <pthread.h>
#include <sys/epoll.h>
......@@ -3136,4 +3137,149 @@ TEST(epoll59)
close(ctx.sfd[0]);
}
/*
 * Number of eventfds used by the epoll60 test. The waiter-thread array in
 * the test is sized from the evfd[] array, so it is also the thread count.
 */
enum {
EPOLL60_EVENTS_NR = 10,
};
/* State shared between the epoll60 test body and its waiter threads. */
struct epoll60_ctx {
/* Set to 1 by the test body to make the waiter threads leave their loops */
volatile int stopped;
/*
 * Start barrier: every waiter thread atomically increments it, the test
 * body subtracts the thread count once all of them have checked in.
 */
int ready;
/*
 * Atomically incremented by a thread right before it calls epoll_pwait()
 * and decremented after it has read its event.
 */
int waiters;
/* epoll instance all waiter threads wait on */
int epfd;
/* eventfds registered in epfd */
int evfd[EPOLL60_EVENTS_NR];
};
/*
 * Waiter thread of the epoll60 test.
 *
 * @ctx_: pointer to the shared struct epoll60_ctx.
 *
 * Each round the thread checks in at the start barrier (ctx->ready), waits
 * to be released, accounts itself in ctx->waiters and then waits for a
 * single event on ctx->epfd. After a successful wakeup it reads the
 * signalled fd and removes itself from ctx->waiters. The thread leaves the
 * loop once ctx->stopped is observed, either at the top of the loop or
 * after epoll_pwait() was interrupted.
 *
 * Always returns NULL.
 */
static void *epoll60_wait_thread(void *ctx_)
{
	struct epoll60_ctx *ctx = ctx_;
	struct epoll_event e;
	sigset_t sigmask;
	uint64_t v;
	int ret;

	/*
	 * Block SIGUSR1 in this thread. pthread_sigmask() is used rather than
	 * sigprocmask(), whose behaviour POSIX leaves unspecified in a
	 * multi-threaded process.
	 */
	sigemptyset(&sigmask);
	sigaddset(&sigmask, SIGUSR1);
	pthread_sigmask(SIG_SETMASK, &sigmask, NULL);

	/* Prepare empty mask for epoll_pwait() */
	sigemptyset(&sigmask);

	while (!ctx->stopped) {
		/* Mark we are ready */
		__atomic_fetch_add(&ctx->ready, 1, __ATOMIC_ACQUIRE);

		/* Start when all are ready */
		while (__atomic_load_n(&ctx->ready, __ATOMIC_ACQUIRE) &&
		       !ctx->stopped)
			;

		/* Account this waiter */
		__atomic_fetch_add(&ctx->waiters, 1, __ATOMIC_ACQUIRE);

		ret = epoll_pwait(ctx->epfd, &e, 1, 2000, &sigmask);
		if (ret != 1) {
			/* We expect only signal delivery on stop */
			assert(ret < 0 && errno == EINTR && "Lost wakeup!\n");
			assert(ctx->stopped);
			break;
		}

		ret = read(e.data.fd, &v, sizeof(v));
		/* The fds are in ET mode, so each thread gets its own fd. */
		assert(ret == sizeof(v));

		__atomic_fetch_sub(&ctx->waiters, 1, __ATOMIC_RELEASE);
	}

	return NULL;
}
/*
 * Return a millisecond timestamp suitable for measuring elapsed time.
 *
 * The value is read from CLOCK_MONOTONIC so that the difference between two
 * calls is not affected by wall-clock adjustments; it is not a calendar
 * time.
 */
static inline unsigned long long msecs(void)
{
	struct timespec now = { 0 };
	unsigned long long ms;

	clock_gettime(CLOCK_MONOTONIC, &now);
	ms = now.tv_sec * 1000ull;
	ms += now.tv_nsec / 1000000ull;

	return ms;
}
/* Atomically read how many threads are currently accounted in ctx->waiters. */
static inline int count_waiters(struct epoll60_ctx *ctx)
{
	int nr_waiting = __atomic_load_n(&ctx->waiters, __ATOMIC_ACQUIRE);

	return nr_waiting;
}
/*
 * epoll60: check that no wakeup is lost when several threads wait on the
 * same epoll fd and all registered eventfds are signalled at once.
 *
 * One eventfd per waiter thread is added to a single epoll instance with
 * EPOLLIN | EPOLLET. For 300 rounds the test releases all threads, waits
 * until each has accounted itself as a waiter, writes to every eventfd and
 * then requires the waiter count to drop back to 0 within about one second.
 */
TEST(epoll60)
{
struct epoll60_ctx ctx = { 0 };
pthread_t waiters[ARRAY_SIZE(ctx.evfd)];
struct epoll_event e;
int i, n, ret;
/* SIGUSR1 is sent to the waiter threads at the end to stop them */
signal(SIGUSR1, signal_handler);
ctx.epfd = epoll_create1(0);
ASSERT_GE(ctx.epfd, 0);
/* Create event fds and register each one edge-triggered in epfd */
for (i = 0; i < ARRAY_SIZE(ctx.evfd); i++) {
ctx.evfd[i] = eventfd(0, EFD_NONBLOCK);
ASSERT_GE(ctx.evfd[i], 0);
e.events = EPOLLIN | EPOLLET;
e.data.fd = ctx.evfd[i];
ASSERT_EQ(epoll_ctl(ctx.epfd, EPOLL_CTL_ADD, ctx.evfd[i], &e), 0);
}
/* Create waiter threads */
for (i = 0; i < ARRAY_SIZE(waiters); i++)
ASSERT_EQ(pthread_create(&waiters[i], NULL,
epoll60_wait_thread, &ctx), 0);
for (i = 0; i < 300; i++) {
uint64_t v = 1, ms;
/* Wait for all to be ready */
while (__atomic_load_n(&ctx.ready, __ATOMIC_ACQUIRE) !=
ARRAY_SIZE(ctx.evfd))
;
/* Steady, go: dropping ready back to 0 releases the spinning threads */
__atomic_fetch_sub(&ctx.ready, ARRAY_SIZE(ctx.evfd),
__ATOMIC_ACQUIRE);
/* Wait all have gone to kernel */
while (count_waiters(&ctx) != ARRAY_SIZE(ctx.evfd))
;
/* 1ms should be enough to schedule away */
usleep(1000);
/* Quickly signal all handles at once */
for (n = 0; n < ARRAY_SIZE(ctx.evfd); n++) {
ret = write(ctx.evfd[n], &v, sizeof(v));
ASSERT_EQ(ret, sizeof(v));
}
/* Busy loop for 1s and wait for all waiters to wake up */
ms = msecs();
while (count_waiters(&ctx) && msecs() < ms + 1000)
;
/* Any waiter still accounted here did not consume its event in time */
ASSERT_EQ(count_waiters(&ctx), 0);
}
ctx.stopped = 1;
/* Stop waiters */
for (i = 0; i < ARRAY_SIZE(waiters); i++)
ret = pthread_kill(waiters[i], SIGUSR1);
for (i = 0; i < ARRAY_SIZE(waiters); i++)
pthread_join(waiters[i], NULL);
/* waiters[] and ctx.evfd[] have the same number of elements */
for (i = 0; i < ARRAY_SIZE(waiters); i++)
close(ctx.evfd[i]);
close(ctx.epfd);
}
TEST_HARNESS_MAIN
......@@ -25,7 +25,6 @@ CONFIG_KASAN=y
CONFIG_KASAN_INLINE=y
CONFIG_UBSAN=y
CONFIG_UBSAN_SANITIZE_ALL=y
CONFIG_UBSAN_NO_ALIGNMENT=y
CONFIG_UBSAN_NULL=y
CONFIG_DEBUG_KMEMLEAK=y
CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE=8192
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册