new attempt at making set*id() safe and robust

changing credentials in a multi-threaded program is extremely difficult on linux because it requires synchronizing the change between all threads, which have their own thread-local credentials on the kernel side. this is further complicated by the fact that changing the real uid can fail due to exceeding RLIMIT_NPROC, making it possible that the syscall will succeed in some threads but fail in others. the old __rsyscall approach being replaced was robust in that it would report failure if any one thread failed, but in this case, the program would be left in an inconsistent state where individual threads might have different uid. (this was not as bad as glibc, which would sometimes even fail to report the failure entirely!) the new approach being committed refuses to change real user id when it cannot temporarily set the rlimit to infinity. this is completely POSIX conformant since POSIX does not require an implementation to allow real-user-id changes for non-privileged processes whatsoever. still, setting the real uid can fail due to memory allocation in the kernel, but this can only happen if there is not already a cached object for the target user. thus, we forcibly serialize the syscalls attempts, and fail the entire operation on the first failure. this *should* lead to an all-or-nothing success/failure result, but it's still fragile and highly dependent on kernel developers not breaking things worse than they're already broken. ideally linux will eventually add a CLONE_USERCRED flag that would give POSIX conformant credential changes without any hacks from userspace, and all of this code would become redundant and could be removed ~10 years down the line when everyone has abandoned the old broken kernels. i'm not holding my breath...

new attempt at making set*id() safe and robust
changing credentials in a multi-threaded program is extremely difficult on linux because it requires synchronizing the change between all threads, which have their own thread-local credentials on the kernel side. this is further complicated by the fact that changing the real uid can fail due to exceeding RLIMIT_NPROC, making it possible that the syscall will succeed in some threads but fail in others. the old __rsyscall approach being replaced was robust in that it would report failure if any one thread failed, but in this case, the program would be left in an inconsistent state where individual threads might have different uid. (this was not as bad as glibc, which would sometimes even fail to report the failure entirely!) the new approach being committed refuses to change real user id when it cannot temporarily set the rlimit to infinity. this is completely POSIX conformant since POSIX does not require an implementation to allow real-user-id changes for non-privileged processes whatsoever. still, setting the real uid can fail due to memory allocation in the kernel, but this can only happen if there is not already a cached object for the target user. thus, we forcibly serialize the syscalls attempts, and fail the entire operation on the first failure. this *should* lead to an all-or-nothing success/failure result, but it's still fragile and highly dependent on kernel developers not breaking things worse than they're already broken. ideally linux will eventually add a CLONE_USERCRED flag that would give POSIX conformant credential changes without any hacks from userspace, and all of this code would become redundant and could be removed ~10 years down the line when everyone has abandoned the old broken kernels. i'm not holding my breath...
acb04806 · Rich Felker · aed707f6 · acb04806 · acb04806 · aed707f6
13 changed file
--- a/src/internal/libc.h
+++ b/src/internal/libc.h
@@ -42,7 +42,8 @@ void __lockfile(FILE *);
 #define LOCK(x) (libc.threads_minus_1 ? (__lock(x),1) : ((void)(x),1))
 #define UNLOCK(x) (*(volatile int *)(x)=0)

-int __rsyscall(int, long, long, long, long, long, long);
+void __synccall(void (*)(void *), void *);
+int __setxid(int, int, int, int);

 extern char **__environ;
 #define environ __environ

--- a/src/internal/pthread_impl.h
+++ b/src/internal/pthread_impl.h
@@ -80,7 +80,7 @@ struct __timer {

 #define SIGTIMER 32
 #define SIGCANCEL 33
-#define SIGSYSCALL 34
+#define SIGSYNCCALL 34

 #define SIGPT_SET ((sigset_t *)(unsigned long [1+(sizeof(long)==4)]){ \
 	[sizeof(long)==4] = 3UL<<(32*(sizeof(long)>4)) })
@@ -98,8 +98,8 @@ int __timedwait_cp(volatile int *, int, clockid_t, const struct timespec *, int)
 void __wait(volatile int *, volatile int *, int, int);
 void __wake(volatile int *, int, int);

-void __rsyscall_lock();
-void __rsyscall_unlock();
+void __synccall_lock();
+void __synccall_unlock();

 #define DEFAULT_STACK_SIZE (16384-PAGE_SIZE)
 #define DEFAULT_GUARD_SIZE PAGE_SIZE

--- a/src/thread/__rsyscall.c
+++ b/src/thread/__rsyscall.c
-#include "pthread_impl.h"
-
-/* "rsyscall" is a mechanism by which a thread can synchronously force all
- * other threads to perform an arbitrary syscall. It is necessary to work
- * around the non-conformant implementation of setuid() et al on Linux,
- * which affect only the calling thread and not the whole process. This
- * implementation performs some tricks with signal delivery to work around
- * the fact that it does not keep any list of threads in userspace. */
-
-static struct {
-	volatile int lock, hold, blocks, cnt;
-	unsigned long arg[6];
-	int nr;
-	int err;
-	int init;
-} rs;
-
-static void rsyscall_handler(int sig, siginfo_t *si, void *ctx)
-{
-	struct pthread *self = __pthread_self();
-	long r;
-
-	if (!rs.hold || rs.cnt == libc.threads_minus_1) return;
-
-	/* Threads which have already decremented themselves from the
-	 * thread count must not increment rs.cnt or otherwise act. */
-	if (self->dead) {
-		sigfillset(&((ucontext_t *)ctx)->uc_sigmask);
-		return;
-	}
-
-	r = __syscall(rs.nr, rs.arg[0], rs.arg[1],
-		rs.arg[2], rs.arg[3], rs.arg[4], rs.arg[5]);
-	if (r < 0) rs.err=-r;
-
-	a_inc(&rs.cnt);
-	__wake(&rs.cnt, 1, 1);
-	while(rs.hold)
-		__wait(&rs.hold, 0, 1, 1);
-	a_dec(&rs.cnt);
-	if (!rs.cnt) __wake(&rs.cnt, 1, 1);
-}
-
-int __rsyscall(int nr, long a, long b, long c, long d, long e, long f)
-{
-	int i, ret;
-	sigset_t set = { 0 };
-	struct pthread *self;
-
-	if (!libc.threads_minus_1)
-		return syscall(nr, a, b, c, d, e, f);
-
-	self = __pthread_self();
-
-	LOCK(&rs.lock);
-	while ((i=rs.blocks))
-		__wait(&rs.blocks, 0, i, 1);
-
-	__syscall(SYS_rt_sigprocmask, SIG_BLOCK, (uint64_t[]){-1}, &set, 8);
-
-	if (!rs.init) {
-		struct sigaction sa = {
-			.sa_flags = SA_SIGINFO | SA_RESTART,
-			.sa_sigaction = rsyscall_handler,
-			.sa_mask = set
-		};
-		sigfillset(&sa.sa_mask);
-		sa.sa_sigaction = rsyscall_handler;
-		__libc_sigaction(SIGSYSCALL, &sa, 0);
-		rs.init = 1;
-	}
-
-	rs.nr = nr;
-	rs.arg[0] = a; rs.arg[1] = b;
-	rs.arg[2] = c; rs.arg[3] = d;
-	rs.arg[4] = d; rs.arg[5] = f;
-	rs.err = 0;
-	rs.cnt = 0;
-	rs.hold = 1;
-
-	/* Dispatch signals until all threads respond */
-	for (i=libc.threads_minus_1; i; i--)
-		sigqueue(self->pid, SIGSYSCALL, (union sigval){0});
-	while ((i=rs.cnt) < libc.threads_minus_1) {
-		sigqueue(self->pid, SIGSYSCALL, (union sigval){0});
-		__wait(&rs.cnt, 0, i, 1);
-	}
-
-	/* Handle any lingering signals with no-op */
-	__syscall(SYS_rt_sigprocmask, SIG_SETMASK, &set, &set, 8);
-
-	/* Resume other threads' signal handlers and wait for them */
-	rs.hold = 0;
-	__wake(&rs.hold, -1, 0);
-	while((i=rs.cnt)) __wait(&rs.cnt, 0, i, 1);
-
-	if (rs.err) errno = rs.err, ret = -1;
-	else ret = syscall(nr, a, b, c, d, e, f);
-
-	UNLOCK(&rs.lock);
-	return ret;
-}
-
-void __rsyscall_lock()
-{
-	a_inc(&rs.blocks);
-	while (rs.lock) __wait(&rs.lock, 0, 1, 1);
-}
-
-void __rsyscall_unlock()
-{
-	a_dec(&rs.blocks);
-	if (rs.lock) __wake(&rs.blocks, 1, 1);
-}
--- a/src/thread/pthread_create.c
+++ b/src/thread/pthread_create.c
@@ -3,8 +3,8 @@
 static void dummy_0()
 {
 }
-weak_alias(dummy_0, __rsyscall_lock);
-weak_alias(dummy_0, __rsyscall_unlock);
+weak_alias(dummy_0, __synccall_lock);
+weak_alias(dummy_0, __synccall_unlock);
 weak_alias(dummy_0, __pthread_tsd_run_dtors);

 #ifdef __pthread_unwind_next
@@ -99,12 +99,12 @@ int pthread_create(pthread_t *res, const pthread_attr_t *attr, void *(*entry)(vo
 	new->tlsdesc[1] = (uintptr_t)new;
 	stack = (void *)((uintptr_t)new-1 & ~(uintptr_t)15);

-	__rsyscall_lock();
+	__synccall_lock();

 	a_inc(&libc.threads_minus_1);
 	ret = __uniclone(stack, start, new);

-	__rsyscall_unlock();
+	__synccall_unlock();

 	if (ret < 0) {
 		a_dec(&libc.threads_minus_1);

--- a/src/thread/synccall.c
+++ b/src/thread/synccall.c
+#include "pthread_impl.h"
+#include <semaphore.h>
+
+static struct chain {
+	struct chain *next;
+	sem_t sem, sem2;
+} *head;
+
+static void (*callback)(void *), *context;
+static int chainlen;
+static sem_t chainlock, chaindone;
+static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
+
+static void handler(int sig, siginfo_t *si, void *ctx)
+{
+	struct chain ch;
+	pthread_t self = __pthread_self();
+	int old_errno = errno;
+
+	if (chainlen == libc.threads_minus_1) return;
+
+	sigqueue(self->pid, SIGSYNCCALL, (union sigval){0});
+
+	/* Threads which have already decremented themselves from the
+	 * thread count must not act. Block further receipt of signals
+	 * and return. */
+	if (self->dead) {
+		memset(&((ucontext_t *)ctx)->uc_sigmask, -1, 8);
+		errno = old_errno;
+		return;
+	}
+
+	sem_init(&ch.sem, 0, 0);
+	sem_init(&ch.sem2, 0, 0);
+
+	while (sem_wait(&chainlock));
+	ch.next = head;
+	head = &ch;
+	if (++chainlen == libc.threads_minus_1) sem_post(&chaindone);
+	sem_post(&chainlock);
+
+	while (sem_wait(&ch.sem));
+	callback(context);
+	sem_post(&ch.sem2);
+	while (sem_wait(&ch.sem));
+
+	errno = old_errno;
+}
+
+void __synccall(void (*func)(void *), void *ctx)
+{
+	pthread_t self;
+	struct sigaction sa;
+	struct chain *cur, *next;
+	uint64_t oldmask;
+
+	pthread_rwlock_wrlock(&lock);
+
+	__syscall(SYS_rt_sigprocmask, SIG_BLOCK, (uint64_t[]){-1}, &oldmask, 8);
+
+	if (!libc.threads_minus_1) {
+		func(ctx);
+		return;
+	}
+
+	sem_init(&chaindone, 0, 0);
+	sem_init(&chainlock, 0, 1);
+	chainlen = 0;
+	callback = func;
+	context = ctx;
+
+	sa.sa_flags = SA_SIGINFO | SA_RESTART;
+	sa.sa_sigaction = handler;
+	sigfillset(&sa.sa_mask);
+	__libc_sigaction(SIGSYNCCALL, &sa, 0);
+
+	self = __pthread_self();
+	sigqueue(self->pid, SIGSYNCCALL, (union sigval){0});
+	while (sem_wait(&chaindone));
+
+	for (cur=head; cur; cur=cur->next) {
+		sem_post(&cur->sem);
+		while (sem_wait(&cur->sem2));
+	}
+	func(ctx);
+
+	for (cur=head; cur; cur=next) {
+		next = cur->next;
+		sem_post(&cur->sem);
+	}
+
+	sa.sa_flags = 0;
+	sa.sa_handler = SIG_IGN;
+	__libc_sigaction(SIGSYNCCALL, &sa, 0);
+
+	__syscall(SYS_rt_sigprocmask, SIG_SETMASK, &oldmask, 0, 8);
+
+	pthread_rwlock_unlock(&lock);
+}
+
+void __synccall_lock()
+{
+	pthread_rwlock_rdlock(&lock);
+}
+
+void __synccall_unlock()
+{
+	pthread_rwlock_unlock(&lock);
+}
--- a/src/unistd/setegid.c
+++ b/src/unistd/setegid.c
 #include <unistd.h>
+#include "libc.h"
+#include "syscall.h"

 int setegid(gid_t egid)
 {
-	return setregid(-1, egid);
+	return __setxid(SYS_setresgid, -1, egid, -1);
 }
--- a/src/unistd/seteuid.c
+++ b/src/unistd/seteuid.c
 #include <unistd.h>
+#include "syscall.h"
+#include "libc.h"

 int seteuid(uid_t euid)
 {
-	return setreuid(-1, euid);
+	return __setxid(SYS_setresuid, -1, euid, -1);
 }
--- a/src/unistd/setgid.c
+++ b/src/unistd/setgid.c
@@ -4,5 +4,5 @@

 int setgid(gid_t gid)
 {
-	return __rsyscall(SYS_setgid, gid, 0, 0, 0, 0, 0);
+	return __setxid(SYS_setgid, gid, 0, 0);
 }
--- a/src/unistd/setregid.c
+++ b/src/unistd/setregid.c
@@ -4,5 +4,5 @@

 int setregid(gid_t rgid, gid_t egid)
 {
-	return __rsyscall(SYS_setregid, rgid, egid, 0, 0, 0, 0);
+	return __setxid(SYS_setregid, rgid, egid, 0);
 }
--- a/src/unistd/setresgid.c
+++ b/src/unistd/setresgid.c
@@ -5,5 +5,5 @@

 int setresgid(gid_t rgid, gid_t egid, gid_t sgid)
 {
-	return __rsyscall(SYS_setresgid, rgid, egid, sgid, 0, 0, 0);
+	return __setxid(SYS_setresgid, rgid, egid, sgid);
 }
--- a/src/unistd/setresuid.c
+++ b/src/unistd/setresuid.c
@@ -5,5 +5,5 @@

 int setresuid(uid_t ruid, uid_t euid, uid_t suid)
 {
-	return __rsyscall(SYS_setresuid, ruid, euid, suid, 0, 0, 0);
+	return __setxid(SYS_setresuid, ruid, euid, suid);
 }
--- a/src/unistd/setreuid.c
+++ b/src/unistd/setreuid.c
@@ -4,5 +4,5 @@

 int setreuid(uid_t ruid, uid_t euid)
 {
-	return __rsyscall(SYS_setreuid, ruid, euid, 0, 0, 0, 0);
+	return __setxid(SYS_setreuid, ruid, euid, 0);
 }
--- a/src/unistd/setuid.c
+++ b/src/unistd/setuid.c
@@ -4,5 +4,5 @@

 int setuid(uid_t uid)
 {
-	return __rsyscall(SYS_setuid, uid, 0, 0, 0, 0, 0);
+	return __setxid(SYS_setuid, uid, 0, 0);
 }