diff --git a/configure b/configure index c685ca3f918c9a5bdecf1ad7e83933073015824c..97d2f68956af39edf017726be968674ad01a34e9 100755 --- a/configure +++ b/configure @@ -8384,6 +8384,180 @@ if test "$ac_res" != no; then fi +{ $as_echo "$as_me:$LINENO: checking for library containing shm_open" >&5 +$as_echo_n "checking for library containing shm_open... " >&6; } +if test "${ac_cv_search_shm_open+set}" = set; then + $as_echo_n "(cached) " >&6 +else + ac_func_search_save_LIBS=$LIBS +cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char shm_open (); +int +main () +{ +return shm_open (); + ; + return 0; +} +_ACEOF +for ac_lib in '' rt; do + if test -z "$ac_lib"; then + ac_res="none required" + else + ac_res=-l$ac_lib + LIBS="-l$ac_lib $ac_func_search_save_LIBS" + fi + rm -f conftest.$ac_objext conftest$ac_exeext +if { (ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_link") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! 
-s conftest.err + } && test -s conftest$ac_exeext && { + test "$cross_compiling" = yes || + $as_test_x conftest$ac_exeext + }; then + ac_cv_search_shm_open=$ac_res +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + +fi + +rm -rf conftest.dSYM +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ + conftest$ac_exeext + if test "${ac_cv_search_shm_open+set}" = set; then + break +fi +done +if test "${ac_cv_search_shm_open+set}" = set; then + : +else + ac_cv_search_shm_open=no +fi +rm conftest.$ac_ext +LIBS=$ac_func_search_save_LIBS +fi +{ $as_echo "$as_me:$LINENO: result: $ac_cv_search_shm_open" >&5 +$as_echo "$ac_cv_search_shm_open" >&6; } +ac_res=$ac_cv_search_shm_open +if test "$ac_res" != no; then + test "$ac_res" = "none required" || LIBS="$ac_res $LIBS" + +fi + +{ $as_echo "$as_me:$LINENO: checking for library containing shm_unlink" >&5 +$as_echo_n "checking for library containing shm_unlink... " >&6; } +if test "${ac_cv_search_shm_unlink+set}" = set; then + $as_echo_n "(cached) " >&6 +else + ac_func_search_save_LIBS=$LIBS +cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. 
*/ +#ifdef __cplusplus +extern "C" +#endif +char shm_unlink (); +int +main () +{ +return shm_unlink (); + ; + return 0; +} +_ACEOF +for ac_lib in '' rt; do + if test -z "$ac_lib"; then + ac_res="none required" + else + ac_res=-l$ac_lib + LIBS="-l$ac_lib $ac_func_search_save_LIBS" + fi + rm -f conftest.$ac_objext conftest$ac_exeext +if { (ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_link") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest$ac_exeext && { + test "$cross_compiling" = yes || + $as_test_x conftest$ac_exeext + }; then + ac_cv_search_shm_unlink=$ac_res +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + +fi + +rm -rf conftest.dSYM +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ + conftest$ac_exeext + if test "${ac_cv_search_shm_unlink+set}" = set; then + break +fi +done +if test "${ac_cv_search_shm_unlink+set}" = set; then + : +else + ac_cv_search_shm_unlink=no +fi +rm conftest.$ac_ext +LIBS=$ac_func_search_save_LIBS +fi +{ $as_echo "$as_me:$LINENO: result: $ac_cv_search_shm_unlink" >&5 +$as_echo "$ac_cv_search_shm_unlink" >&6; } +ac_res=$ac_cv_search_shm_unlink +if test "$ac_res" != no; then + test "$ac_res" = "none required" || LIBS="$ac_res $LIBS" + +fi + # Solaris: { $as_echo "$as_me:$LINENO: checking for library containing fdatasync" >&5 $as_echo_n "checking for library containing fdatasync... 
" >&6; } @@ -19763,7 +19937,8 @@ LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` -for ac_func in cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll pstat readlink setproctitle setsid sigprocmask symlink sync_file_range towlower utime utimes wcstombs wcstombs_l + +for ac_func in cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll pstat readlink setproctitle setsid shm_open sigprocmask symlink sync_file_range towlower utime utimes wcstombs wcstombs_l do as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` { $as_echo "$as_me:$LINENO: checking for $ac_func" >&5 diff --git a/configure.in b/configure.in index 82771bddb12ab584c9ba4f42e8f39570372fe100..ead0908fd9a1ba51ff53b81548cbef6e14ded12d 100644 --- a/configure.in +++ b/configure.in @@ -883,6 +883,8 @@ case $host_os in esac AC_SEARCH_LIBS(getopt_long, [getopt gnugetopt]) AC_SEARCH_LIBS(crypt, crypt) +AC_SEARCH_LIBS(shm_open, rt) +AC_SEARCH_LIBS(shm_unlink, rt) # Solaris: AC_SEARCH_LIBS(fdatasync, [rt posix4]) # Required for thread_test.c on Solaris 2.5: @@ -1230,7 +1232,7 @@ PGAC_FUNC_GETTIMEOFDAY_1ARG LIBS_including_readline="$LIBS" LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` -AC_CHECK_FUNCS([cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll pstat readlink setproctitle setsid sigprocmask symlink sync_file_range towlower utime utimes wcstombs wcstombs_l]) +AC_CHECK_FUNCS([cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll pstat readlink setproctitle setsid shm_open sigprocmask symlink sync_file_range towlower utime utimes wcstombs wcstombs_l]) AC_REPLACE_FUNCS(fseeko) case $host_os in diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index e8e8e6f8fcde7987c8606dce23c201346cd9a778..77a9303933dd6cf47861ef2a538682bcc6c51216 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1194,6 +1194,32 @@ include 'filename' + + 
dynamic_shared_memory_type (enum) + + dynamic_shared_memory_type configuration parameter + + + + Specifies the dynamic shared memory implementation that the server + should use. Possible values are posix (for POSIX shared + memory allocated using shm_open), sysv + (for System V shared memory allocated via shmget), + windows (for Windows shared memory), mmap + (to simulate shared memory using memory-mapped files stored in the + data directory), and none (to disable this feature). + Not all values are supported on all platforms; the first supported + option is the default for that platform. The use of the + mmap option, which is not the default on any platform, + is generally discouraged because the operating system may write + modified pages back to disk repeatedly, increasing system I/O load; + however, it may be useful for debugging, when the + pg_dynshmem directory is stored on a RAM disk, or when + other shared memory facilities are not available. + + + + diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index 20e3c321abd2cd8a81dc25dff59fabc57106523f..b604407999c14538c97ddd07f39439fedac8f990 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -29,6 +29,7 @@ #endif #include "miscadmin.h" +#include "portability/mem.h" #include "storage/ipc.h" #include "storage/pg_shmem.h" @@ -36,31 +37,6 @@ typedef key_t IpcMemoryKey; /* shared memory key passed to shmget(2) */ typedef int IpcMemoryId; /* shared memory ID returned by shmget(2) */ -#define IPCProtection (0600) /* access/modify by user only */ - -#ifdef SHM_SHARE_MMU /* use intimate shared memory on Solaris */ -#define PG_SHMAT_FLAGS SHM_SHARE_MMU -#else -#define PG_SHMAT_FLAGS 0 -#endif - -/* Linux prefers MAP_ANONYMOUS, but the flag is called MAP_ANON on other systems. */ -#ifndef MAP_ANONYMOUS -#define MAP_ANONYMOUS MAP_ANON -#endif - -/* BSD-derived systems have MAP_HASSEMAPHORE, but it's not present (or needed) on Linux. 
*/ -#ifndef MAP_HASSEMAPHORE -#define MAP_HASSEMAPHORE 0 -#endif - -#define PG_MMAP_FLAGS (MAP_SHARED|MAP_ANONYMOUS|MAP_HASSEMAPHORE) - -/* Some really old systems don't define MAP_FAILED. */ -#ifndef MAP_FAILED -#define MAP_FAILED ((void *) -1) -#endif - unsigned long UsedShmemSegID = 0; void *UsedShmemSegAddr = NULL; diff --git a/src/backend/storage/ipc/Makefile b/src/backend/storage/ipc/Makefile index 743f30e1c7389a6290200ad5b85c8460e72afbb2..873dd60dbf4630387f75cbdfc3f1f00d326a438b 100644 --- a/src/backend/storage/ipc/Makefile +++ b/src/backend/storage/ipc/Makefile @@ -15,7 +15,7 @@ override CFLAGS+= -fno-inline endif endif -OBJS = ipc.o ipci.o pmsignal.o procarray.o procsignal.o shmem.o shmqueue.o \ - sinval.o sinvaladt.o standby.o +OBJS = dsm_impl.o dsm.o ipc.o ipci.o pmsignal.o procarray.o procsignal.o \ + shmem.o shmqueue.o sinval.o sinvaladt.o standby.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/ipc/dsm.c b/src/backend/storage/ipc/dsm.c new file mode 100644 index 0000000000000000000000000000000000000000..e516197bd48126e72ed87057c90ca677f379dd1f --- /dev/null +++ b/src/backend/storage/ipc/dsm.c @@ -0,0 +1,972 @@ +/*------------------------------------------------------------------------- + * + * dsm.c + * manage dynamic shared memory segments + * + * This file provides a set of services to make programming with dynamic + * shared memory segments more convenient. Unlike the low-level + * facilities provided by dsm_impl.h and dsm_impl.c, mappings and segments + * created using this module will be cleaned up automatically. Mappings + * will be removed when the resource owner under which they were created + * is cleaned up, unless dsm_keep_mapping() is used, in which case they + * have session lifespan. Segments will be removed when there are no + * remaining mappings, or at postmaster shutdown in any case. 
After a + * hard postmaster crash, remaining segments will be removed, if they + * still exist, at the next postmaster startup. + * + * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/dsm.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include +#include +#include +#ifndef WIN32 +#include +#endif +#include + +#include "lib/ilist.h" +#include "miscadmin.h" +#include "storage/dsm.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/resowner_private.h" + +#define PG_DYNSHMEM_STATE_FILE PG_DYNSHMEM_DIR "/state" +#define PG_DYNSHMEM_NEW_STATE_FILE PG_DYNSHMEM_DIR "/state.new" +#define PG_DYNSHMEM_STATE_BUFSIZ 512 +#define PG_DYNSHMEM_CONTROL_MAGIC 0x9a503d32 + +/* + * There's no point in getting too cheap here, because the minimum allocation + * is one OS page, which is probably at least 4KB and could easily be as high + * as 64KB. Each slot takes sizeof(dsm_control_item) bytes, currently 8. + */ +#define PG_DYNSHMEM_FIXED_SLOTS 64 +#define PG_DYNSHMEM_SLOTS_PER_BACKEND 2 + +#define INVALID_CONTROL_SLOT ((uint32) -1) + +/* Backend-local state for a dynamic shared memory segment. */ +struct dsm_segment +{ + dlist_node node; /* List link in dsm_segment_list. */ + ResourceOwner resowner; /* Resource owner. */ + dsm_handle handle; /* Segment name. */ + uint32 control_slot; /* Slot in control segment. */ + void *impl_private; /* Implementation-specific private data. */ + void *mapped_address; /* Mapping address, or NULL if unmapped. */ + uint64 mapped_size; /* Size of our mapping. */ +}; + +/* Shared-memory state for a dynamic shared memory segment. 
*/ +typedef struct dsm_control_item +{ + dsm_handle handle; + uint32 refcnt; /* 2+ = active, 1 = moribund, 0 = gone */ +} dsm_control_item; + +/* Layout of the dynamic shared memory control segment. */ +typedef struct dsm_control_header +{ + uint32 magic; + uint32 nitems; + uint32 maxitems; + dsm_control_item item[FLEXIBLE_ARRAY_MEMBER]; +} dsm_control_header; + +static void dsm_cleanup_using_control_segment(void); +static void dsm_cleanup_for_mmap(void); +static bool dsm_read_state_file(dsm_handle *h); +static void dsm_write_state_file(dsm_handle h); +static void dsm_postmaster_shutdown(int code, Datum arg); +static void dsm_backend_shutdown(int code, Datum arg); +static dsm_segment *dsm_create_descriptor(void); +static bool dsm_control_segment_sane(dsm_control_header *control, + uint64 mapped_size); +static uint64 dsm_control_bytes_needed(uint32 nitems); + +/* Has this backend initialized the dynamic shared memory system yet? */ +static bool dsm_init_done = false; + +/* + * List of dynamic shared memory segments used by this backend. + * + * At process exit time, we must decrement the reference count of each + * segment we have attached; this list makes it possible to find all such + * segments. + * + * This list should always be empty in the postmaster. We could probably + * allow the postmaster to map dynamic shared memory segments before it + * begins to start child processes, provided that each process adjusted + * the reference counts for those segments in the control segment at + * startup time, but there's no obvious need for such a facility, which + * would also be complex to handle in the EXEC_BACKEND case. Once the + * postmaster has begun spawning children, there's an additional problem: + * each new mapping would require an update to the control segment, + * which requires locking, in which the postmaster must not be involved. + */ +static dlist_head dsm_segment_list = DLIST_STATIC_INIT(dsm_segment_list); + +/* + * Control segment information. 
+ * + * Unlike ordinary shared memory segments, the control segment is not + * reference counted; instead, it lasts for the postmaster's entire + * life cycle. For simplicity, it doesn't have a dsm_segment object either. + */ +static dsm_handle dsm_control_handle; +static dsm_control_header *dsm_control; +static uint64 dsm_control_mapped_size = 0; +static void *dsm_control_impl_private = NULL; + +/* + * Start up the dynamic shared memory system. + * + * This is called just once during each cluster lifetime, at postmaster + * startup time. + */ +void +dsm_postmaster_startup(void) +{ + void *dsm_control_address = NULL; + uint32 maxitems; + uint64 segsize; + + Assert(!IsUnderPostmaster); + + /* If dynamic shared memory is disabled, there's nothing to do. */ + if (dynamic_shared_memory_type == DSM_IMPL_NONE) + return; + + /* + * Check for, and remove, shared memory segments left behind by a dead + * postmaster. This isn't necessary on Windows, which always removes them + * when the last reference is gone. + */ + switch (dynamic_shared_memory_type) + { + case DSM_IMPL_POSIX: + case DSM_IMPL_SYSV: + dsm_cleanup_using_control_segment(); + break; + case DSM_IMPL_MMAP: + dsm_cleanup_for_mmap(); + break; + case DSM_IMPL_WINDOWS: + /* Nothing to do. */ + break; + default: + elog(ERROR, "unknown dynamic shared memory type: %d", + dynamic_shared_memory_type); + } + + /* Determine size for new control segment. */ + maxitems = PG_DYNSHMEM_FIXED_SLOTS + + PG_DYNSHMEM_SLOTS_PER_BACKEND * MaxBackends; + elog(DEBUG2, "dynamic shared memory system will support %u segments", + maxitems); + segsize = dsm_control_bytes_needed(maxitems); + + /* Loop until we find an unused identifier for the new control segment. 
*/ + for (;;) + { + Assert(dsm_control_address == NULL); + Assert(dsm_control_mapped_size == 0); + dsm_control_handle = random(); + if (dsm_impl_op(DSM_OP_CREATE, dsm_control_handle, segsize, + &dsm_control_impl_private, &dsm_control_address, + &dsm_control_mapped_size, ERROR)) + break; + } + dsm_control = dsm_control_address; + on_shmem_exit(dsm_postmaster_shutdown, 0); + elog(DEBUG2, "created dynamic shared memory control segment %u (" + UINT64_FORMAT " bytes)", dsm_control_handle, segsize); + dsm_write_state_file(dsm_control_handle); + + /* Initialize control segment. */ + dsm_control->magic = PG_DYNSHMEM_CONTROL_MAGIC; + dsm_control->nitems = 0; + dsm_control->maxitems = maxitems; +} + +/* + * Determine whether the control segment from the previous postmaster + * invocation still exists. If so, remove the dynamic shared memory + * segments to which it refers, and then the control segment itself. + */ +static void +dsm_cleanup_using_control_segment(void) +{ + void *mapped_address = NULL; + void *junk_mapped_address = NULL; + void *impl_private = NULL; + void *junk_impl_private = NULL; + uint64 mapped_size = 0; + uint64 junk_mapped_size = 0; + uint32 nitems; + uint32 i; + dsm_handle old_control_handle; + dsm_control_header *old_control; + + /* + * Read the state file. If it doesn't exist or is empty, there's nothing + * more to do. + */ + if (!dsm_read_state_file(&old_control_handle)) + return; + + /* + * Try to attach the segment. If this fails, it probably just means that + * the operating system has been rebooted and the segment no longer exists, + * or an unrelated process has used the same shm ID. So just fall out + * quietly. + */ + if (!dsm_impl_op(DSM_OP_ATTACH, old_control_handle, 0, &impl_private, + &mapped_address, &mapped_size, DEBUG1)) + return; + + /* + * We've managed to reattach it, but the contents might not be sane. + * If they aren't, we disregard the segment after all. 
+ */ + old_control = (dsm_control_header *) mapped_address; + if (!dsm_control_segment_sane(old_control, mapped_size)) + { + dsm_impl_op(DSM_OP_DETACH, old_control_handle, 0, &impl_private, + &mapped_address, &mapped_size, LOG); + return; + } + + /* + * OK, the control segment looks basically valid, so we can use + * it to get a list of segments that need to be removed. + */ + nitems = old_control->nitems; + for (i = 0; i < nitems; ++i) + { + dsm_handle handle; + uint32 refcnt; + + /* If the reference count is 0, the slot is actually unused. */ + refcnt = old_control->item[i].refcnt; + if (refcnt == 0) + continue; + + /* Log debugging information. */ + handle = old_control->item[i].handle; + elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u (reference count %u)", + handle, refcnt); + + /* Destroy the referenced segment. */ + dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private, + &junk_mapped_address, &junk_mapped_size, LOG); + } + + /* Destroy the old control segment, too. */ + elog(DEBUG2, + "cleaning up dynamic shared memory control segment with ID %u", + old_control_handle); + dsm_impl_op(DSM_OP_DESTROY, old_control_handle, 0, &impl_private, + &mapped_address, &mapped_size, LOG); +} + +/* + * When we're using the mmap shared memory implementation, "shared memory" + * segments might even manage to survive an operating system reboot. + * But there's no guarantee as to exactly what will survive: some segments + * may survive, and others may not, and the contents of some may be out + * of date. In particular, the control segment may be out of date, so we + * can't rely on it to figure out what to remove. However, since we know + * what directory contains the files we used as shared memory, we can simply + * scan the directory and blow everything away that shouldn't be there. + */ +static void +dsm_cleanup_for_mmap(void) +{ + DIR *dir; + struct dirent *dent; + + /* Open the directory; can't use AllocateDir in postmaster. 
*/ + if ((dir = opendir(PG_DYNSHMEM_DIR)) == NULL) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open directory \"%s\": %m", + PG_DYNSHMEM_DIR))); + + /* Scan for something with a name of the correct format. */ + while ((dent = readdir(dir)) != NULL) + { + if (strncmp(dent->d_name, PG_DYNSHMEM_MMAP_FILE_PREFIX, + strlen(PG_DYNSHMEM_MMAP_FILE_PREFIX)) == 0) + { + char buf[MAXPGPATH]; + snprintf(buf, MAXPGPATH, PG_DYNSHMEM_DIR "/%s", dent->d_name); + + elog(DEBUG2, "removing file \"%s\"", buf); + + /* We found a matching file; so remove it. */ + if (unlink(buf) != 0) + { + int save_errno; + + save_errno = errno; + closedir(dir); + errno = save_errno; + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", buf))); + } + } + } + + /* Cleanup complete. */ + closedir(dir); +} + +/* + * Read and parse the state file. + * + * If the state file is empty or the contents are garbled, it probably means + * that the operating system rebooted before the data written by the previous + * postmaster made it to disk. In that case, we can just ignore it; any shared + * memory from before the reboot should be gone anyway. + */ +static bool +dsm_read_state_file(dsm_handle *h) +{ + int statefd; + char statebuf[PG_DYNSHMEM_STATE_BUFSIZ]; + int nbytes = 0; + char *endptr, + *s; + dsm_handle handle; + + /* Read the state file to get the ID of the old control segment. 
*/ + statefd = open(PG_DYNSHMEM_STATE_FILE, O_RDONLY | PG_BINARY, 0); + if (statefd < 0) + { + if (errno == ENOENT) + return false; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + PG_DYNSHMEM_STATE_FILE))); + } + nbytes = read(statefd, statebuf, PG_DYNSHMEM_STATE_BUFSIZ - 1); + if (nbytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + PG_DYNSHMEM_STATE_FILE))); + /* make sure buffer is NUL terminated */ + statebuf[nbytes] = '\0'; + close(statefd); + + /* + * We expect to find the handle of the old control segment here, + * on a line by itself. + */ + handle = strtoul(statebuf, &endptr, 10); + for (s = endptr; *s == ' ' || *s == '\t'; ++s) + ; + if (*s != '\n' && *s != '\0') + return false; + + /* Looks good. */ + *h = handle; + return true; +} + +/* + * Write our control segment handle to the state file, so that if the + * postmaster is killed without running its on_shmem_exit hooks, the + * next postmaster can clean things up after restart. + */ +static void +dsm_write_state_file(dsm_handle h) +{ + int statefd; + char statebuf[PG_DYNSHMEM_STATE_BUFSIZ]; + int nbytes; + + /* Create or truncate the file. */ + statefd = open(PG_DYNSHMEM_NEW_STATE_FILE, + O_RDWR | O_CREAT | O_TRUNC | PG_BINARY, 0600); + if (statefd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", + PG_DYNSHMEM_NEW_STATE_FILE))); + + /* Write contents. */ + snprintf(statebuf, PG_DYNSHMEM_STATE_BUFSIZ, "%u\n", dsm_control_handle); + nbytes = strlen(statebuf); + if (write(statefd, statebuf, nbytes) != nbytes) + { + if (errno == 0) + errno = ENOSPC; /* if no error signalled, assume no space */ + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", + PG_DYNSHMEM_NEW_STATE_FILE))); + } + + /* Close file. */ + close(statefd); + + /* + * Atomically rename file into place, so that no one ever sees a partially + * written state file. 
+ */ + if (rename(PG_DYNSHMEM_NEW_STATE_FILE, PG_DYNSHMEM_STATE_FILE) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not rename file \"%s\": %m", + PG_DYNSHMEM_NEW_STATE_FILE))); +} + +/* + * At shutdown time, we iterate over the control segment and remove all + * remaining dynamic shared memory segments. We avoid throwing errors here; + * the postmaster is shutting down either way, and this is just non-critical + * resource cleanup. + */ +static void +dsm_postmaster_shutdown(int code, Datum arg) +{ + uint32 nitems; + uint32 i; + void *dsm_control_address; + void *junk_mapped_address = NULL; + void *junk_impl_private = NULL; + uint64 junk_mapped_size = 0; + + /* + * If some other backend exited uncleanly, it might have corrupted the + * control segment while it was dying. In that case, we warn and ignore + * the contents of the control segment. This may end up leaving behind + * stray shared memory segments, but there's not much we can do about + * that if the metadata is gone. + */ + nitems = dsm_control->nitems; + if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size)) + { + ereport(LOG, + (errmsg("dynamic shared memory control segment is corrupt"))); + return; + } + + /* Remove any remaining segments. */ + for (i = 0; i < nitems; ++i) + { + dsm_handle handle; + + /* If the reference count is 0, the slot is actually unused. */ + if (dsm_control->item[i].refcnt == 0) + continue; + + /* Log debugging information. */ + handle = dsm_control->item[i].handle; + elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u", + handle); + + /* Destroy the segment. */ + dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private, + &junk_mapped_address, &junk_mapped_size, LOG); + } + + /* Remove the control segment itself. 
*/ + elog(DEBUG2, + "cleaning up dynamic shared memory control segment with ID %u", + dsm_control_handle); + dsm_control_address = dsm_control; + dsm_impl_op(DSM_OP_DESTROY, dsm_control_handle, 0, + &dsm_control_impl_private, &dsm_control_address, + &dsm_control_mapped_size, LOG); + dsm_control = dsm_control_address; + + /* And, finally, remove the state file. */ + if (unlink(PG_DYNSHMEM_STATE_FILE) < 0) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not unlink file \"%s\": %m", + PG_DYNSHMEM_STATE_FILE))); +} + +/* + * Prepare this backend for dynamic shared memory usage. Under EXEC_BACKEND, + * we must reread the state file and map the control segment; in other cases, + * we'll have inherited the postmaster's mapping and global variables. + */ +static void +dsm_backend_startup(void) +{ + /* If dynamic shared memory is disabled, reject this. */ + if (dynamic_shared_memory_type == DSM_IMPL_NONE) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("dynamic shared memory is disabled"), + errhint("Set dynamic_shared_memory_type to a value other than \"none\"."))); + +#ifdef EXEC_BACKEND + { + dsm_handle control_handle; + void *control_address = NULL; + + /* Read the control segment information from the state file. */ + if (!dsm_read_state_file(&control_handle)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("could not parse dynamic shared memory state file"))); + + /* Attach control segment. */ + dsm_impl_op(DSM_OP_ATTACH, control_handle, 0, + &dsm_control_impl_private, &control_address, + &dsm_control_mapped_size, ERROR); + dsm_control_handle = control_handle; + dsm_control = control_address; + /* If control segment doesn't look sane, something is badly wrong. 
*/ + if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size)) + { + dsm_impl_op(DSM_OP_DETACH, control_handle, 0, + &dsm_control_impl_private, &control_address, + &dsm_control_mapped_size, WARNING); + ereport(FATAL, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("dynamic shared memory control segment is not valid"))); + } + } +#endif + + /* Arrange to detach segments on exit. */ + on_shmem_exit(dsm_backend_shutdown, 0); + + dsm_init_done = true; +} + +/* + * Create a new dynamic shared memory segment. + */ +dsm_segment * +dsm_create(uint64 size) +{ + dsm_segment *seg = dsm_create_descriptor(); + uint32 i; + uint32 nitems; + + /* Unsafe in postmaster (and pointless in a stand-alone backend). */ + Assert(IsUnderPostmaster); + + if (!dsm_init_done) + dsm_backend_startup(); + + /* Loop until we find an unused segment identifier. */ + for (;;) + { + Assert(seg->mapped_address == NULL && seg->mapped_size == 0); + seg->handle = random(); + if (dsm_impl_op(DSM_OP_CREATE, seg->handle, size, &seg->impl_private, + &seg->mapped_address, &seg->mapped_size, ERROR)) + break; + } + + /* Lock the control segment so we can register the new segment. */ + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + + /* Search the control segment for an unused slot. */ + nitems = dsm_control->nitems; + for (i = 0; i < nitems; ++i) + { + if (dsm_control->item[i].refcnt == 0) + { + dsm_control->item[i].handle = seg->handle; + /* refcnt of 1 triggers destruction, so start at 2 */ + dsm_control->item[i].refcnt = 2; + seg->control_slot = i; + LWLockRelease(DynamicSharedMemoryControlLock); + return seg; + } + } + + /* Verify that we can support an additional mapping. */ + if (nitems >= dsm_control->maxitems) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("too many dynamic shared memory segments"))); + + /* Enter the handle into a new array slot. 
*/ + dsm_control->item[nitems].handle = seg->handle; + /* refcnt of 1 triggers destruction, so start at 2 */ + dsm_control->item[nitems].refcnt = 2; + seg->control_slot = nitems; + dsm_control->nitems++; + LWLockRelease(DynamicSharedMemoryControlLock); + + return seg; +} + +/* + * Attach a dynamic shared memory segment. + * + * See comments for dsm_segment_handle() for an explanation of how this + * is intended to be used. + * + * This function will return NULL if the segment isn't known to the system. + * This can happen if we're asked to attach the segment, but then everyone + * else detaches it (causing it to be destroyed) before we get around to + * attaching it. + */ +dsm_segment * +dsm_attach(dsm_handle h) +{ + dsm_segment *seg; + dlist_iter iter; + uint32 i; + uint32 nitems; + + /* Unsafe in postmaster (and pointless in a stand-alone backend). */ + Assert(IsUnderPostmaster); + + if (!dsm_init_done) + dsm_backend_startup(); + + /* + * Since this is just a debugging cross-check, we could leave it out + * altogether, or include it only in assert-enabled builds. But since + * the list of attached segments should normally be very short, let's + * include it always for right now. + * + * If you're hitting this error, you probably want to attempt to + * find an existing mapping via dsm_find_mapping() before calling + * dsm_attach() to create a new one. + */ + dlist_foreach(iter, &dsm_segment_list) + { + seg = dlist_container(dsm_segment, node, iter.cur); + if (seg->handle == h) + elog(ERROR, "can't attach the same segment more than once"); + } + + /* Create a new segment descriptor. */ + seg = dsm_create_descriptor(); + seg->handle = h; + + /* Bump reference count for this segment in shared memory. */ + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + nitems = dsm_control->nitems; + for (i = 0; i < nitems; ++i) + { + /* If the reference count is 0, the slot is actually unused. 
*/ + if (dsm_control->item[i].refcnt == 0) + continue; + + /* + * If the reference count is 1, the slot is still in use, but the + * segment is in the process of going away. Treat that as if we + * didn't find a match. + */ + if (dsm_control->item[i].refcnt == 1) + break; + + /* Otherwise, if the descriptor matches, we've found a match. */ + if (dsm_control->item[i].handle == seg->handle) + { + dsm_control->item[i].refcnt++; + seg->control_slot = i; + break; + } + } + LWLockRelease(DynamicSharedMemoryControlLock); + + /* + * If we didn't find the handle we're looking for in the control + * segment, it probably means that everyone else who had it mapped, + * including the original creator, died before we got to this point. + * It's up to the caller to decide what to do about that. + */ + if (seg->control_slot == INVALID_CONTROL_SLOT) + { + dsm_detach(seg); + return NULL; + } + + /* Here's where we actually try to map the segment. */ + dsm_impl_op(DSM_OP_ATTACH, seg->handle, 0, &seg->impl_private, + &seg->mapped_address, &seg->mapped_size, ERROR); + + return seg; +} + +/* + * At backend shutdown time, detach any segments that are still attached. + */ +static void +dsm_backend_shutdown(int code, Datum arg) +{ + while (!dlist_is_empty(&dsm_segment_list)) + { + dsm_segment *seg; + + seg = dlist_head_element(dsm_segment, node, &dsm_segment_list); + dsm_detach(seg); + } +} + +/* + * Resize an existing shared memory segment. + * + * This may cause the shared memory segment to be remapped at a different + * address. For the caller's convenience, we return the mapped address. + */ +void * +dsm_resize(dsm_segment *seg, uint64 size) +{ + Assert(seg->control_slot != INVALID_CONTROL_SLOT); + dsm_impl_op(DSM_OP_RESIZE, seg->handle, size, &seg->impl_private, + &seg->mapped_address, &seg->mapped_size, ERROR); + return seg->mapped_address; +} + +/* + * Remap an existing shared memory segment. 
+ * + * This is intended to be used when some other process has extended the + * mapping using dsm_resize(), but we've still only got the initial + * portion mapped. Since this might change the address at which the + * segment is mapped, we return the new mapped address. + */ +void * +dsm_remap(dsm_segment *seg) +{ + dsm_impl_op(DSM_OP_ATTACH, seg->handle, 0, &seg->impl_private, + &seg->mapped_address, &seg->mapped_size, ERROR); + + return seg->mapped_address; +} + +/* + * Detach from a shared memory segment, destroying the segment if we + * remove the last reference. + * + * This function should never fail. It will often be invoked when aborting + * a transaction, and a further error won't serve any purpose. It's not a + * complete disaster if we fail to unmap or destroy the segment; it means a + * resource leak, but that doesn't necessarily preclude further operations. + */ +void +dsm_detach(dsm_segment *seg) +{ + /* + * Try to remove the mapping, if one exists. Normally, there will be, + * but maybe not, if we failed partway through a create or attach + * operation. We remove the mapping before decrementing the reference + * count so that the process that sees a zero reference count can be + * certain that no remaining mappings exist. Even if this fails, we + * pretend that it works, because retrying is likely to fail in the + * same way. + */ + if (seg->mapped_address != NULL) + { + dsm_impl_op(DSM_OP_DETACH, seg->handle, 0, &seg->impl_private, + &seg->mapped_address, &seg->mapped_size, WARNING); + seg->impl_private = NULL; + seg->mapped_address = NULL; + seg->mapped_size = 0; + } + + /* Reduce reference count, if we previously increased it. 
*/ + if (seg->control_slot != INVALID_CONTROL_SLOT) + { + uint32 refcnt; + uint32 control_slot = seg->control_slot; + + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + Assert(dsm_control->item[control_slot].handle == seg->handle); + Assert(dsm_control->item[control_slot].refcnt > 1); + refcnt = --dsm_control->item[control_slot].refcnt; + seg->control_slot = INVALID_CONTROL_SLOT; + LWLockRelease(DynamicSharedMemoryControlLock); + + /* If new reference count is 1, try to destroy the segment. */ + if (refcnt == 1) + { + /* + * If we fail to destroy the segment here, or are killed before + * we finish doing so, the reference count will remain at 1, which + * will mean that nobody else can attach to the segment. At + * postmaster shutdown time, or when a new postmaster is started + * after a hard kill, another attempt will be made to remove the + * segment. + * + * The main case we're worried about here is being killed by + * a signal before we can finish removing the segment. In that + * case, it's important to be sure that the segment still gets + * removed. If we actually fail to remove the segment for some + * other reason, the postmaster may not have any better luck than + * we did. There's not much we can do about that, though. + */ + if (dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private, + &seg->mapped_address, &seg->mapped_size, WARNING)) + { + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + Assert(dsm_control->item[control_slot].handle == seg->handle); + Assert(dsm_control->item[control_slot].refcnt == 1); + dsm_control->item[control_slot].refcnt = 0; + LWLockRelease(DynamicSharedMemoryControlLock); + } + } + } + + /* Clean up our remaining backend-private data structures. */ + if (seg->resowner != NULL) + ResourceOwnerForgetDSM(seg->resowner, seg); + dlist_delete(&seg->node); + pfree(seg); +} + +/* + * Keep a dynamic shared memory mapping until end of session. 
+ * + * By default, mappings are owned by the current resource owner, which + * typically means they stick around for the duration of the current query + * only. + */ +void +dsm_keep_mapping(dsm_segment *seg) +{ + if (seg->resowner != NULL) + { + ResourceOwnerForgetDSM(seg->resowner, seg); + seg->resowner = NULL; + } +} + +/* + * Find an existing mapping for a shared memory segment, if there is one. + */ +dsm_segment * +dsm_find_mapping(dsm_handle h) +{ + dlist_iter iter; + dsm_segment *seg; + + dlist_foreach(iter, &dsm_segment_list) + { + seg = dlist_container(dsm_segment, node, iter.cur); + if (seg->handle == h) + return seg; + } + + return NULL; +} + +/* + * Get the address at which a dynamic shared memory segment is mapped. + */ +void * +dsm_segment_address(dsm_segment *seg) +{ + Assert(seg->mapped_address != NULL); + return seg->mapped_address; +} + +/* + * Get the size of a mapping. + */ +uint64 +dsm_segment_map_length(dsm_segment *seg) +{ + Assert(seg->mapped_address != NULL); + return seg->mapped_size; +} + +/* + * Get a handle for a mapping. + * + * To establish communication via dynamic shared memory between two backends, + * one of them should first call dsm_create() to establish a new shared + * memory mapping. That process should then call dsm_segment_handle() to + * obtain a handle for the mapping, and pass that handle to the + * coordinating backend via some means (e.g. bgw_main_arg, or via the + * main shared memory segment). The recipient, once in possession of the + * handle, should call dsm_attach(). + */ +dsm_handle +dsm_segment_handle(dsm_segment *seg) +{ + return seg->handle; +} + +/* + * Create a segment descriptor. 
+ */ +static dsm_segment * +dsm_create_descriptor(void) +{ + dsm_segment *seg; + + ResourceOwnerEnlargeDSMs(CurrentResourceOwner); + + seg = MemoryContextAlloc(TopMemoryContext, sizeof(dsm_segment)); + dlist_push_head(&dsm_segment_list, &seg->node); + + /* seg->handle must be initialized by the caller */ + seg->control_slot = INVALID_CONTROL_SLOT; + seg->impl_private = NULL; + seg->mapped_address = NULL; + seg->mapped_size = 0; + + seg->resowner = CurrentResourceOwner; + ResourceOwnerRememberDSM(CurrentResourceOwner, seg); + + return seg; +} + +/* + * Sanity check a control segment. + * + * The goal here isn't to detect everything that could possibly be wrong with + * the control segment; there's not enough information for that. Rather, the + * goal is to make sure that someone can iterate over the items in the segment + * without overrunning the end of the mapping and crashing. We also check + * the magic number since, if that's messed up, this may not even be one of + * our segments at all. + */ +static bool +dsm_control_segment_sane(dsm_control_header *control, uint64 mapped_size) +{ + if (mapped_size < offsetof(dsm_control_header, item)) + return false; /* Mapped size too short to read header. */ + if (control->magic != PG_DYNSHMEM_CONTROL_MAGIC) + return false; /* Magic number doesn't match. */ + if (dsm_control_bytes_needed(control->maxitems) > mapped_size) + return false; /* Max item count won't fit in map. */ + if (control->nitems > control->maxitems) + return false; /* Overfull. */ + return true; +} + +/* + * Compute the number of control-segment bytes needed to store a given + * number of items. 
+ */ +static uint64 +dsm_control_bytes_needed(uint32 nitems) +{ + return offsetof(dsm_control_header, item) + + sizeof(dsm_control_item) * (uint64) nitems; +} diff --git a/src/backend/storage/ipc/dsm_impl.c b/src/backend/storage/ipc/dsm_impl.c new file mode 100644 index 0000000000000000000000000000000000000000..f929f02743a927773411d7eeb5f9e75c3b825928 --- /dev/null +++ b/src/backend/storage/ipc/dsm_impl.c @@ -0,0 +1,990 @@ +/*------------------------------------------------------------------------- + * + * dsm_impl.c + * manage dynamic shared memory segments + * + * This file provides low-level APIs for creating and destroying shared + * memory segments using several different possible techniques. We refer + * to these segments as dynamic because they can be created, altered, and + * destroyed at any point during the server life cycle. This is unlike + * the main shared memory segment, of which there is always exactly one + * and which is always mapped at a fixed address in every PostgreSQL + * background process. + * + * Because not all systems provide the same primitives in this area, nor + * do all primitives behave the same way on all systems, we provide + * several implementations of this facility. Many systems implement + * POSIX shared memory (shm_open etc.), which is well-suited to our needs + * in this area, with the exception that shared memory identifiers live + * in a flat system-wide namespace, raising the uncomfortable prospect of + * name collisions with other processes (including other copies of + * PostgreSQL) running on the same system. Some systems only support + * the older System V shared memory interface (shmget etc.) which is + * also usable; however, the default allocation limits are often quite + * small, and the namespace is even more restricted. + * + * We also provide an mmap-based shared memory implementation. 
This may + * be useful on systems that provide shared memory via a special-purpose + * filesystem; by opting for this implementation, the user can even + * control precisely where their shared memory segments are placed. It + * can also be used as a fallback for systems where shm_open and shmget + * are not available or can't be used for some reason. Of course, + * mapping a file residing on an actual spinning disk is a fairly poor + * approximation for shared memory because writeback may hurt performance + * substantially, but there should be few systems where we must make do + * with such poor tools. + * + * As ever, Windows requires its own implementation. + * + * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/dsm_impl.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <fcntl.h> +#include <string.h> +#include <unistd.h> +#ifndef WIN32 +#include <sys/mman.h> +#endif +#include <sys/stat.h> +#ifdef HAVE_SYS_IPC_H +#include <sys/ipc.h> +#endif +#ifdef HAVE_SYS_SHM_H +#include <sys/shm.h> +#endif + +#include "portability/mem.h" +#include "storage/dsm_impl.h" +#include "storage/fd.h" +#include "utils/guc.h" +#include "utils/memutils.h" + +#ifdef USE_DSM_POSIX +static bool dsm_impl_posix(dsm_op op, dsm_handle handle, uint64 request_size, + void **impl_private, void **mapped_address, + uint64 *mapped_size, int elevel); +#endif +#ifdef USE_DSM_SYSV +static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, uint64 request_size, + void **impl_private, void **mapped_address, + uint64 *mapped_size, int elevel); +#endif +#ifdef USE_DSM_WINDOWS +static bool dsm_impl_windows(dsm_op op, dsm_handle handle, uint64 request_size, + void **impl_private, void **mapped_address, + uint64 *mapped_size, int elevel); +#endif +#ifdef USE_DSM_MMAP +static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, uint64 request_size, + void **impl_private, void 
**mapped_address, + uint64 *mapped_size, int elevel); +#endif +static int errcode_for_dynamic_shared_memory(void); + +const struct config_enum_entry dynamic_shared_memory_options[] = { +#ifdef USE_DSM_POSIX + { "posix", DSM_IMPL_POSIX, false}, +#endif +#ifdef USE_DSM_SYSV + { "sysv", DSM_IMPL_SYSV, false}, +#endif +#ifdef USE_DSM_WINDOWS + { "windows", DSM_IMPL_WINDOWS, false}, +#endif +#ifdef USE_DSM_MMAP + { "mmap", DSM_IMPL_MMAP, false}, +#endif + { "none", DSM_IMPL_NONE, false}, + {NULL, 0, false} +}; + +/* Implementation selector. */ +int dynamic_shared_memory_type; + +/* Size of buffer to be used for zero-filling. */ +#define ZBUFFER_SIZE 8192 + +/*------ + * Perform a low-level shared memory operation in a platform-specific way, + * as dictated by the selected implementation. Each implementation is + * required to implement the following primitives. + * + * DSM_OP_CREATE. Create a segment whose size is the request_size and + * map it. + * + * DSM_OP_ATTACH. Map the segment at its current size; request_size must + * be 0, and the size actually mapped is reported via mapped_size. + * The segment may already be mapped; any existing mapping should be removed + * before creating a new one. + * + * DSM_OP_DETACH. Unmap the segment. + * + * DSM_OP_RESIZE. Resize the segment to the given request_size and + * remap the segment at that new size. + * + * DSM_OP_DESTROY. Unmap the segment, if it is mapped. Destroy the + * segment. + * + * Arguments: + * op: The operation to be performed. + * handle: The handle of an existing object, or for DSM_OP_CREATE, a + * new handle the caller wants created. + * request_size: For DSM_OP_CREATE, the requested size. For DSM_OP_RESIZE, + * the new size. Otherwise, 0. + * impl_private: Private, implementation-specific data. Will be a pointer + * to NULL for the first operation on a shared memory segment within this + * backend; thereafter, it will point to the value to which it was set + * on the previous call. + * mapped_address: Pointer to start of current mapping; pointer to NULL + * if none. 
Updated with new mapping address. + * mapped_size: Pointer to size of current mapping; pointer to 0 if none. + * Updated with new mapped size. + * elevel: Level at which to log errors. + * + * Return value: true on success, false on failure. When false is returned, + * a message should first be logged at the specified elevel, except in the + * case where DSM_OP_CREATE experiences a name collision, which should + * silently return false. + *----- + */ +bool +dsm_impl_op(dsm_op op, dsm_handle handle, uint64 request_size, + void **impl_private, void **mapped_address, uint64 *mapped_size, + int elevel) +{ + Assert(op == DSM_OP_CREATE || op == DSM_OP_RESIZE || request_size == 0); + Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) || + (*mapped_address == NULL && *mapped_size == 0)); + + if (request_size > (size_t) -1) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("requested shared memory size overflows size_t"))); + + switch (dynamic_shared_memory_type) + { +#ifdef USE_DSM_POSIX + case DSM_IMPL_POSIX: + return dsm_impl_posix(op, handle, request_size, impl_private, + mapped_address, mapped_size, elevel); +#endif +#ifdef USE_DSM_SYSV + case DSM_IMPL_SYSV: + return dsm_impl_sysv(op, handle, request_size, impl_private, + mapped_address, mapped_size, elevel); +#endif +#ifdef USE_DSM_WINDOWS + case DSM_IMPL_WINDOWS: + return dsm_impl_windows(op, handle, request_size, impl_private, + mapped_address, mapped_size, elevel); +#endif +#ifdef USE_DSM_MMAP + case DSM_IMPL_MMAP: + return dsm_impl_mmap(op, handle, request_size, impl_private, + mapped_address, mapped_size, elevel); +#endif + } + elog(ERROR, "unexpected dynamic shared memory type: %d", + dynamic_shared_memory_type); +} + +/* + * Does the current dynamic shared memory implementation support resizing + * segments? 
(The answer here could be platform-dependent in the future, + * since AIX allows shmctl(shmid, SHM_RESIZE, &buffer), though you apparently + * can't resize segments to anything larger than 256MB that way. For now, + * we keep it simple.) + */ +bool +dsm_impl_can_resize(void) +{ + switch (dynamic_shared_memory_type) + { + case DSM_IMPL_NONE: + return false; + case DSM_IMPL_POSIX: + return true; + case DSM_IMPL_SYSV: + return false; + case DSM_IMPL_WINDOWS: + return false; + case DSM_IMPL_MMAP: + return false; + default: + return false; /* should not happen */ + } +} + +#ifdef USE_DSM_POSIX +/* + * Operating system primitives to support POSIX shared memory. + * + * POSIX shared memory segments are created and attached using shm_open() + * and shm_unlink(); other operations, such as sizing or mapping the + * segment, are performed as if the shared memory segments were files. + * + * Indeed, on some platforms, they may be implemented that way. While + * POSIX shared memory segments seem intended to exist in a flat namespace, + * some operating systems may implement them as files, even going so far + * to treat a request for /xyz as a request to create a file by that name + * in the root directory. Users of such broken platforms should select + * a different shared memory implementation. + */ +static bool +dsm_impl_posix(dsm_op op, dsm_handle handle, uint64 request_size, + void **impl_private, void **mapped_address, uint64 *mapped_size, + int elevel) +{ + char name[64]; + int flags; + int fd; + char *address; + + snprintf(name, 64, "/PostgreSQL.%u", handle); + + /* Handle teardown cases. 
*/ + if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY) + { + if (*mapped_address != NULL + && munmap(*mapped_address, *mapped_size) != 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not unmap shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = NULL; + *mapped_size = 0; + if (op == DSM_OP_DESTROY && shm_unlink(name) != 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not remove shared memory segment \"%s\": %m", + name))); + return false; + } + return true; + } + + /* + * Create new segment or open an existing one for attach or resize. + * + * Even though we're not going through fd.c, we should be safe against + * running out of file descriptors, because of NUM_RESERVED_FDS. We're + * only opening one extra descriptor here, and we'll close it before + * returning. + */ + flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0); + if ((fd = shm_open(name, flags, 0600)) == -1) + { + if (errno != EEXIST) + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not open shared memory segment \"%s\": %m", + name))); + return false; + } + + /* + * If we're attaching the segment, determine the current size; if we are + * creating or resizing the segment, set the size to the requested value. + */ + if (op == DSM_OP_ATTACH) + { + struct stat st; + + if (fstat(fd, &st) != 0) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + close(fd); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not stat shared memory segment \"%s\": %m", + name))); + return false; + } + request_size = st.st_size; + } + else if (*mapped_size != request_size && ftruncate(fd, request_size)) + { + int save_errno; + + /* Back out what's already been done. 
*/ + save_errno = errno; + close(fd); + if (op == DSM_OP_CREATE) + shm_unlink(name); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not resize shared memory segment %s to " UINT64_FORMAT " bytes: %m", + name, request_size))); + return false; + } + + /* + * If we're reattaching or resizing, we must remove any existing mapping, + * unless we've already got the right thing mapped. + */ + if (*mapped_address != NULL) + { + if (*mapped_size == request_size) + { + /* Nothing to remap, but don't leak the descriptor we opened. */ + close(fd); + return true; + } + if (munmap(*mapped_address, *mapped_size) != 0) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + close(fd); + if (op == DSM_OP_CREATE) + shm_unlink(name); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not unmap shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = NULL; + *mapped_size = 0; + } + + /* Map it. */ + address = mmap(NULL, request_size, PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_HASSEMAPHORE, fd, 0); + if (address == MAP_FAILED) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + close(fd); + if (op == DSM_OP_CREATE) + shm_unlink(name); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not map shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = address; + *mapped_size = request_size; + close(fd); + + return true; +} +#endif + +#ifdef USE_DSM_SYSV +/* + * Operating system primitives to support System V shared memory. + * + * System V shared memory segments are manipulated using shmget(), shmat(), + * shmdt(), and shmctl(). There's no portable way to resize such + * segments. As the default allocation limits for System V shared memory + * are usually quite low, the POSIX facilities may be preferable; but + * those are not supported everywhere. 
+ */ +static bool +dsm_impl_sysv(dsm_op op, dsm_handle handle, uint64 request_size, + void **impl_private, void **mapped_address, uint64 *mapped_size, + int elevel) +{ + key_t key; + int ident; + char *address; + char name[64]; + int *ident_cache; + + /* Resize is not supported for System V shared memory. */ + if (op == DSM_OP_RESIZE) + { + elog(elevel, "System V shared memory segments cannot be resized"); + return false; + } + + /* Since resize isn't supported, reattach is a no-op. */ + if (op == DSM_OP_ATTACH && *mapped_address != NULL) + return true; + + /* + * POSIX shared memory and mmap-based shared memory identify segments + * with names. To avoid needless error message variation, we use the + * handle as the name. + */ + snprintf(name, 64, "%u", handle); + + /* + * The System V shared memory namespace is very restricted; names are + * of type key_t, which is expected to be some sort of integer data type, + * but not necessarily the same one as dsm_handle. Since we use + * dsm_handle to identify shared memory segments across processes, this + * might seem like a problem, but it's really not. If dsm_handle is + * bigger than key_t, the cast below might truncate away some bits from + * the handle the user-provided, but it'll truncate exactly the same bits + * away in exactly the same fashion every time we use that handle, which + * is all that really matters. Conversely, if dsm_handle is smaller than + * key_t, we won't use the full range of available key space, but that's + * no big deal either. + * + * We do make sure that the key isn't negative, because that might not + * be portable. + */ + key = (key_t) handle; + if (key < 1) /* avoid compiler warning if type is unsigned */ + key = -key; + + /* + * There's one special key, IPC_PRIVATE, which can't be used. If we end + * up with that value by chance during a create operation, just pretend + * it already exists, so that caller will retry. 
If we run into it + * anywhere else, the caller has passed a handle that doesn't correspond + * to anything we ever created, which should not happen. + */ + if (key == IPC_PRIVATE) + { + if (op != DSM_OP_CREATE) + elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE"); + errno = EEXIST; + return false; + } + + /* + * Before we can do anything with a shared memory segment, we have to + * map the shared memory key to a shared memory identifier using shmget(). + * To avoid repeated lookups, we store the identifier using impl_private. + */ + if (*impl_private != NULL) + { + ident_cache = *impl_private; + ident = *ident_cache; + } + else + { + int flags = IPCProtection; + size_t segsize; + + /* + * Allocate the memory BEFORE acquiring the resource, so that we don't + * leak the resource if memory allocation fails. + */ + ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int)); + + /* + * When using shmget to find an existing segment, we must pass the + * size as 0. Passing a non-zero size which is greater than the + * actual size will result in EINVAL. + */ + segsize = 0; + + if (op == DSM_OP_CREATE) + { + flags |= IPC_CREAT | IPC_EXCL; + segsize = request_size; + } + + if ((ident = shmget(key, segsize, flags)) == -1) + { + int save_errno = errno; + + /* + * Release the cache entry on every failure path, including the + * silent EEXIST collision; it lives in TopMemoryContext, so it + * would otherwise leak each time the caller retries. + */ + pfree(ident_cache); + errno = save_errno; + if (save_errno != EEXIST) + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not get shared memory segment: %m"))); + return false; + } + + *ident_cache = ident; + *impl_private = ident_cache; + } + + /* Handle teardown cases. 
*/ + if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY) + { + pfree(ident_cache); + *impl_private = NULL; + if (*mapped_address != NULL && shmdt(*mapped_address) != 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not unmap shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = NULL; + *mapped_size = 0; + if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not remove shared memory segment \"%s\": %m", + name))); + return false; + } + return true; + } + + /* If we're attaching it, we must use IPC_STAT to determine the size. */ + if (op == DSM_OP_ATTACH) + { + struct shmid_ds shm; + + if (shmctl(ident, IPC_STAT, &shm) != 0) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + if (op == DSM_OP_CREATE) + shmctl(ident, IPC_RMID, NULL); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not stat shared memory segment \"%s\": %m", + name))); + return false; + } + request_size = shm.shm_segsz; + } + + /* Map it. */ + address = shmat(ident, NULL, PG_SHMAT_FLAGS); + if (address == (void *) -1) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + if (op == DSM_OP_CREATE) + shmctl(ident, IPC_RMID, NULL); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not map shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = address; + *mapped_size = request_size; + + return true; +} +#endif + +#ifdef USE_DSM_WINDOWS +/* + * Operating system primitives to support Windows shared memory. + * + * Windows shared memory implementation is done using file mapping + * which can be backed by either physical file or system paging file. 
+ * Current implementation uses system paging file as other effects + * like performance are not clear for physical file and it is used in similar + * way for main shared memory in windows. + * + * A memory mapping object is a kernel object - they always get deleted when + * the last reference to them goes away, either explicitly via a CloseHandle or + * when the process containing the reference exits. + */ +static bool +dsm_impl_windows(dsm_op op, dsm_handle handle, uint64 request_size, + void **impl_private, void **mapped_address, + uint64 *mapped_size, int elevel) +{ + char *address; + HANDLE hmap; + char name[64]; + MEMORY_BASIC_INFORMATION info; + + /* Resize is not supported for Windows shared memory. */ + if (op == DSM_OP_RESIZE) + { + elog(elevel, "Windows shared memory segments cannot be resized"); + return false; + } + + /* Since resize isn't supported, reattach is a no-op. */ + if (op == DSM_OP_ATTACH && *mapped_address != NULL) + return true; + + /* + * Storing the shared memory segment in the Global\ namespace, can + * allow any process running in any session to access that file + * mapping object provided that the caller has the required access rights. + * But to avoid issues faced in main shared memory, we are using the naming + * convention similar to main shared memory. We can change here once + * issue mentioned in GetSharedMemName is resolved. + */ + snprintf(name, 64, "Global/PostgreSQL.%u", handle); + + /* + * Handle teardown cases. Since Windows automatically destroys the object + * when no references remain, we can treat it the same as detach. 
+ */ + if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY) + { + if (*mapped_address != NULL + && UnmapViewOfFile(*mapped_address) == 0) + { + _dosmaperr(GetLastError()); + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not unmap shared memory segment \"%s\": %m", + name))); + return false; + } + if (*impl_private != NULL + && CloseHandle(*impl_private) == 0) + { + _dosmaperr(GetLastError()); + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not remove shared memory segment \"%s\": %m", + name))); + return false; + } + + *impl_private = NULL; + *mapped_address = NULL; + *mapped_size = 0; + return true; + } + + /* Create new segment or open an existing one for attach. */ + if (op == DSM_OP_CREATE) + { + DWORD size_high = (DWORD) (request_size >> 32); + DWORD size_low = (DWORD) request_size; + hmap = CreateFileMapping(INVALID_HANDLE_VALUE, /* Use the pagefile */ + NULL, /* Default security attrs */ + PAGE_READWRITE, /* Memory is read/write */ + size_high, /* Upper 32 bits of size */ + size_low, /* Lower 32 bits of size */ + name); + _dosmaperr(GetLastError()); + if (errno == EEXIST) + { + /* + * On Windows, when the segment already exists, a handle for the + * existing segment is returned. We must close it before + * returning. We don't do _dosmaperr here, so errno won't be + * modified. + */ + CloseHandle(hmap); + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not open shared memory segment \"%s\": %m", + name))); + return false; + } + } + else + { + hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ, + FALSE, /* do not inherit the name */ + name); /* name of mapping object */ + _dosmaperr(GetLastError()); + } + + if (!hmap) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not open shared memory segment \"%s\": %m", + name))); + return false; + } + + /* Map it. 
*/ + address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ, + 0, 0, 0); + if (!address) + { + int save_errno; + + _dosmaperr(GetLastError()); + /* Back out what's already been done. */ + save_errno = errno; + CloseHandle(hmap); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not map shared memory segment \"%s\": %m", + name))); + return false; + } + + /* + * VirtualQuery gives size in page_size units, which is 4K for Windows. + * We need size only when we are attaching, but it's better to get the + * size when creating new segment to keep size consistent both for + * DSM_OP_CREATE and DSM_OP_ATTACH. + */ + if (VirtualQuery(address, &info, sizeof(info)) == 0) + { + int save_errno; + + _dosmaperr(GetLastError()); + /* Back out what's already been done. */ + save_errno = errno; + UnmapViewOfFile(address); + CloseHandle(hmap); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not stat shared memory segment \"%s\": %m", + name))); + return false; + } + + *mapped_address = address; + *mapped_size = info.RegionSize; + *impl_private = hmap; + + return true; +} +#endif + +#ifdef USE_DSM_MMAP +/* + * Operating system primitives to support mmap-based shared memory. + * + * Calling this "shared memory" is somewhat of a misnomer, because what + * we're really doing is creating a bunch of files and mapping them into + * our address space. The operating system may feel obliged to + * synchronize the contents to disk even if nothing is being paged out, + * which will not serve us well. The user can relocate the pg_dynshmem + * directory to a ramdisk to avoid this problem, if available. 
+ *
+ * 'op' selects the action.  DSM_OP_DETACH and DSM_OP_DESTROY unmap any
+ * existing mapping and reset *mapped_address and *mapped_size; DSM_OP_DESTROY
+ * additionally unlinks the backing file.  DSM_OP_CREATE opens the file with
+ * O_CREAT | O_EXCL.  DSM_OP_ATTACH maps the file at the size reported by
+ * fstat(); DSM_OP_CREATE and DSM_OP_RESIZE first bring the file to
+ * request_size (ftruncate to shrink, zero-fill to grow).
+ *
+ * Returns true on success, with *mapped_address and *mapped_size describing
+ * the mapping.  Otherwise returns false after reporting the problem at
+ * 'elevel'; the one exception is an open failure with errno == EEXIST, which
+ * is not reported.  impl_private is not used by this implementation.
+ */
+static bool
+dsm_impl_mmap(dsm_op op, dsm_handle handle, uint64 request_size,
+			  void **impl_private, void **mapped_address, uint64 *mapped_size,
+			  int elevel)
+{
+	char		name[64];
+	int			flags;
+	int			fd;
+	char	   *address;
+
+	snprintf(name, sizeof(name),
+			 PG_DYNSHMEM_DIR "/" PG_DYNSHMEM_MMAP_FILE_PREFIX "%u",
+			 handle);
+
+	/* Handle teardown cases. */
+	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
+	{
+		if (*mapped_address != NULL
+			&& munmap(*mapped_address, *mapped_size) != 0)
+		{
+			ereport(elevel,
+					(errcode_for_dynamic_shared_memory(),
+					 errmsg("could not unmap shared memory segment \"%s\": %m",
+							name)));
+			return false;
+		}
+		*mapped_address = NULL;
+		*mapped_size = 0;
+		if (op == DSM_OP_DESTROY && unlink(name) != 0)
+		{
+			ereport(elevel,
+					(errcode_for_dynamic_shared_memory(),
+					 errmsg("could not remove shared memory segment \"%s\": %m",
+							name)));
+			return false;
+		}
+		return true;
+	}
+
+	/* Create new segment or open an existing one for attach or resize. */
+	flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
+	if ((fd = OpenTransientFile(name, flags, 0600)) == -1)
+	{
+		/* EEXIST is left for the caller to handle; everything else is reported. */
+		if (errno != EEXIST)
+			ereport(elevel,
+					(errcode_for_dynamic_shared_memory(),
+					 errmsg("could not open shared memory segment \"%s\": %m",
+							name)));
+		return false;
+	}
+
+	/*
+	 * If we're attaching the segment, determine the current size; if we are
+	 * creating or resizing the segment, set the size to the requested value.
+	 */
+	if (op == DSM_OP_ATTACH)
+	{
+		struct stat st;
+
+		if (fstat(fd, &st) != 0)
+		{
+			int			save_errno;
+
+			/* Back out what's already been done. */
+			save_errno = errno;
+			CloseTransientFile(fd);
+			errno = save_errno;
+
+			ereport(elevel,
+					(errcode_for_dynamic_shared_memory(),
+					 errmsg("could not stat shared memory segment \"%s\": %m",
+							name)));
+			return false;
+		}
+		request_size = st.st_size;
+	}
+	else if (*mapped_size > request_size && ftruncate(fd, request_size))
+	{
+		int			save_errno;
+
+		/*
+		 * Back out what's already been done.  The descriptor came from
+		 * OpenTransientFile() and the segment is an ordinary file, so it
+		 * must be released with CloseTransientFile() and removed with
+		 * unlink(), exactly as the other error paths in this function do.
+		 */
+		save_errno = errno;
+		CloseTransientFile(fd);
+		if (op == DSM_OP_CREATE)
+			unlink(name);
+		errno = save_errno;
+
+		ereport(elevel,
+				(errcode_for_dynamic_shared_memory(),
+				 errmsg("could not resize shared memory segment %s to " UINT64_FORMAT " bytes: %m",
+						name, request_size)));
+		return false;
+	}
+	else if (*mapped_size < request_size)
+	{
+		/*
+		 * Allocate a buffer full of zeros.
+		 *
+		 * Note: palloc zbuffer, instead of just using a local char array,
+		 * to ensure it is reasonably well-aligned; this may save a few
+		 * cycles transferring data to the kernel.
+		 */
+		char	   *zbuffer = (char *) palloc0(ZBUFFER_SIZE);
+		/* Must be as wide as request_size, or large requests get truncated. */
+		uint64		remaining = request_size;
+		bool		success = true;
+
+		/*
+		 * Zero-fill the file. We have to do this the hard way to ensure
+		 * that all the file space has really been allocated, so that we
+		 * don't later seg fault when accessing the memory mapping.  This
+		 * is pretty pessimal.
+		 *
+		 * NOTE(review): the fill starts at the descriptor's initial offset
+		 * and covers request_size bytes, so growing an existing segment
+		 * appears to overwrite its current contents with zeros -- confirm
+		 * whether DSM_OP_RESIZE is expected to preserve data.
+		 */
+		while (success && remaining > 0)
+		{
+			uint64		goal = remaining;
+
+			if (goal > ZBUFFER_SIZE)
+				goal = ZBUFFER_SIZE;
+			if (write(fd, zbuffer, goal) == goal)
+				remaining -= goal;
+			else
+				success = false;
+		}
+
+		if (!success)
+		{
+			int			save_errno;
+
+			/* Back out what's already been done. */
+			save_errno = errno;
+			pfree(zbuffer);
+			CloseTransientFile(fd);
+			if (op == DSM_OP_CREATE)
+				unlink(name);
+			/* A short write may leave errno at zero; treat that as ENOSPC. */
+			errno = save_errno ? save_errno : ENOSPC;
+
+			ereport(elevel,
+					(errcode_for_dynamic_shared_memory(),
+					 errmsg("could not resize shared memory segment %s to " UINT64_FORMAT " bytes: %m",
+							name, request_size)));
+			return false;
+		}
+
+		/* The zero buffer is only needed for the fill loop. */
+		pfree(zbuffer);
+	}
+
+	/*
+	 * If we're reattaching or resizing, we must remove any existing mapping,
+	 * unless we've already got the right thing mapped.
+	 */
+	if (*mapped_address != NULL)
+	{
+		if (*mapped_size == request_size)
+		{
+			/* Nothing to remap; don't leak the descriptor opened above. */
+			CloseTransientFile(fd);
+			return true;
+		}
+		if (munmap(*mapped_address, *mapped_size) != 0)
+		{
+			int			save_errno;
+
+			/* Back out what's already been done. */
+			save_errno = errno;
+			CloseTransientFile(fd);
+			if (op == DSM_OP_CREATE)
+				unlink(name);
+			errno = save_errno;
+
+			ereport(elevel,
+					(errcode_for_dynamic_shared_memory(),
+					 errmsg("could not unmap shared memory segment \"%s\": %m",
+							name)));
+			return false;
+		}
+		*mapped_address = NULL;
+		*mapped_size = 0;
+	}
+
+	/* Map it. */
+	address = mmap(NULL, request_size, PROT_READ|PROT_WRITE,
+				   MAP_SHARED|MAP_HASSEMAPHORE, fd, 0);
+	if (address == MAP_FAILED)
+	{
+		int			save_errno;
+
+		/* Back out what's already been done. */
+		save_errno = errno;
+		CloseTransientFile(fd);
+		if (op == DSM_OP_CREATE)
+			unlink(name);
+		errno = save_errno;
+
+		ereport(elevel,
+				(errcode_for_dynamic_shared_memory(),
+				 errmsg("could not map shared memory segment \"%s\": %m",
+						name)));
+		return false;
+	}
+	*mapped_address = address;
+	*mapped_size = request_size;
+	/* The mapping stays valid after the descriptor is closed. */
+	CloseTransientFile(fd);
+
+	return true;
+}
+#endif
+
+/*
+ * Pick the error code for a dynamic shared memory failure from errno:
+ * EFBIG and ENOMEM yield ERRCODE_OUT_OF_MEMORY, anything else is
+ * classified by errcode_for_file_access().
+ */
+static int
+errcode_for_dynamic_shared_memory(void)
+{
+	if (errno == EFBIG || errno == ENOMEM)
+		return errcode(ERRCODE_OUT_OF_MEMORY);
+	else
+		return errcode_for_file_access();
+}
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index a0b741b444a2935361dbe9b68dd4bec58c2bd469..040c7aa1044dabb6d4fe9a3e835772932b661b7d 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -30,6 +30,7 @@ #include "replication/walreceiver.h" #include "replication/walsender.h" #include "storage/bufmgr.h" +#include "storage/dsm.h" #include "storage/ipc.h" #include "storage/pg_shmem.h" #include "storage/pmsignal.h" @@ -249,6 +250,10 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port) ShmemBackendArrayAllocation(); #endif + /* Initialize dynamic shared memory facilities. 
*/ + if (!IsUnderPostmaster) + dsm_postmaster_startup(); + /* * Now give loadable modules a chance to set up their shmem allocations */ diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index ddbeb34ce723fdb551736e3b2b09edc22f169424..1756b48c4fe3566970cdc670a362e5ff726abb5b 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -61,6 +61,7 @@ #include "replication/walreceiver.h" #include "replication/walsender.h" #include "storage/bufmgr.h" +#include "storage/dsm_impl.h" #include "storage/standby.h" #include "storage/fd.h" #include "storage/proc.h" @@ -385,6 +386,7 @@ static const struct config_enum_entry synchronous_commit_options[] = { */ extern const struct config_enum_entry wal_level_options[]; extern const struct config_enum_entry sync_method_options[]; +extern const struct config_enum_entry dynamic_shared_memory_options[]; /* * GUC option variables that are exported from this module @@ -3335,6 +3337,16 @@ static struct config_enum ConfigureNamesEnum[] = NULL, NULL, NULL }, + { + {"dynamic_shared_memory_type", PGC_POSTMASTER, RESOURCES_MEM, + gettext_noop("Selects the dynamic shared memory implementation used."), + NULL + }, + &dynamic_shared_memory_type, + DEFAULT_DYNAMIC_SHARED_MEMORY_TYPE, dynamic_shared_memory_options, + NULL, NULL, NULL + }, + { {"wal_sync_method", PGC_SIGHUP, WAL_SETTINGS, gettext_noop("Selects the method used for forcing WAL updates to disk."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 70221f42918acc77051d38b128d4d217d21cbdb0..707edf1d91d4bd651a10933fc3ad5685ca8dba42 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -123,6 +123,13 @@ #work_mem = 1MB # min 64kB #maintenance_work_mem = 16MB # min 1MB #max_stack_depth = 2MB # min 100kB +#dynamic_shared_memory_type = posix # the default is the first option + # supported by the operating system: + # posix + # 
sysv + # windows + # mmap + # use none to disable dynamic shared memory # - Disk - diff --git a/src/backend/utils/resowner/resowner.c b/src/backend/utils/resowner/resowner.c index e7ec3931f127910ffca0c5889deebf10a7427aa1..ba1770157996fb8b302652f4af2b0fc981996208 100644 --- a/src/backend/utils/resowner/resowner.c +++ b/src/backend/utils/resowner/resowner.c @@ -98,6 +98,11 @@ typedef struct ResourceOwnerData int nfiles; /* number of owned temporary files */ File *files; /* dynamically allocated array */ int maxfiles; /* currently allocated array size */ + + /* We have built-in support for remembering dynamic shmem segments */ + int ndsms; /* number of owned shmem segments */ + dsm_segment **dsms; /* dynamically allocated array */ + int maxdsms; /* currently allocated array size */ } ResourceOwnerData; @@ -132,6 +137,7 @@ static void PrintPlanCacheLeakWarning(CachedPlan *plan); static void PrintTupleDescLeakWarning(TupleDesc tupdesc); static void PrintSnapshotLeakWarning(Snapshot snapshot); static void PrintFileLeakWarning(File file); +static void PrintDSMLeakWarning(dsm_segment *seg); /***************************************************************************** @@ -271,6 +277,21 @@ ResourceOwnerReleaseInternal(ResourceOwner owner, PrintRelCacheLeakWarning(owner->relrefs[owner->nrelrefs - 1]); RelationClose(owner->relrefs[owner->nrelrefs - 1]); } + + /* + * Release dynamic shared memory segments. Note that dsm_detach() + * will remove the segment from my list, so I just have to iterate + * until there are none. + * + * As in the preceding cases, warn if there are leftover at commit + * time. 
+ */ + while (owner->ndsms > 0) + { + if (isCommit) + PrintDSMLeakWarning(owner->dsms[owner->ndsms - 1]); + dsm_detach(owner->dsms[owner->ndsms - 1]); + } } else if (phase == RESOURCE_RELEASE_LOCKS) { @@ -402,6 +423,7 @@ ResourceOwnerDelete(ResourceOwner owner) Assert(owner->ncatrefs == 0); Assert(owner->ncatlistrefs == 0); Assert(owner->nrelrefs == 0); + Assert(owner->ndsms == 0); Assert(owner->nplanrefs == 0); Assert(owner->ntupdescs == 0); Assert(owner->nsnapshots == 0); @@ -438,6 +460,8 @@ ResourceOwnerDelete(ResourceOwner owner) pfree(owner->snapshots); if (owner->files) pfree(owner->files); + if (owner->dsms) + pfree(owner->dsms); pfree(owner); } @@ -1230,3 +1254,88 @@ PrintFileLeakWarning(File file) "temporary file leak: File %d still referenced", file); } + +/* + * Make sure there is room for at least one more entry in a ResourceOwner's + * dynamic shmem segment reference array. + * + * This is separate from actually inserting an entry because if we run out + * of memory, it's critical to do so *before* acquiring the resource. 
+ */ +void +ResourceOwnerEnlargeDSMs(ResourceOwner owner) +{ + int newmax; + + if (owner->ndsms < owner->maxdsms) + return; /* nothing to do */ + + if (owner->dsms == NULL) + { + newmax = 16; + owner->dsms = (dsm_segment **) + MemoryContextAlloc(TopMemoryContext, + newmax * sizeof(dsm_segment *)); + owner->maxdsms = newmax; + } + else + { + newmax = owner->maxdsms * 2; + owner->dsms = (dsm_segment **) + repalloc(owner->dsms, newmax * sizeof(dsm_segment *)); + owner->maxdsms = newmax; + } +} + +/* + * Remember that a dynamic shmem segment is owned by a ResourceOwner + * + * Caller must have previously done ResourceOwnerEnlargeDSMs() + */ +void +ResourceOwnerRememberDSM(ResourceOwner owner, dsm_segment *seg) +{ + Assert(owner->ndsms < owner->maxdsms); + owner->dsms[owner->ndsms] = seg; + owner->ndsms++; +} + +/* + * Forget that a dynamic shmem segment is owned by a ResourceOwner + * + * Searches from the most recently remembered entry backwards, closes the gap left by the removed entry, and reports ERROR if the segment is not in the owner's array. + */ +void +ResourceOwnerForgetDSM(ResourceOwner owner, dsm_segment *seg) +{ + dsm_segment **dsms = owner->dsms; + int ns1 = owner->ndsms - 1; + int i; + + for (i = ns1; i >= 0; i--) + { + if (dsms[i] == seg) + { + while (i < ns1) + { + dsms[i] = dsms[i + 1]; + i++; + } + owner->ndsms = ns1; + return; + } + } + elog(ERROR, + "dynamic shared memory segment %u is not owned by resource owner %s", + dsm_segment_handle(seg), owner->name); +} + + +/* + * Debugging subroutine: emit a WARNING naming a segment that is still referenced + */ +static void +PrintDSMLeakWarning(dsm_segment *seg) +{ + elog(WARNING, + "dynamic shared memory leak: segment %u still referenced", + dsm_segment_handle(seg)); +} diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index f66f5302883a7d58e8ff313aee19f74e22d60ddf..a6eb0d806162b0665acd1b58ca440bfa6d35c0c0 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -182,6 +182,7 @@ const char *subdirs[] = { "pg_xlog", "pg_xlog/archive_status", "pg_clog", + "pg_dynshmem", "pg_notify", "pg_serial", "pg_snapshots", diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 
8aabf3c87a4706a2181f5d59eb93d8587ce65044..5eac52d93a9c24f78f7324c45d87ffee0da9cb7c 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -424,6 +424,9 @@ /* Define to 1 if you have the `setsid' function. */ #undef HAVE_SETSID +/* Define to 1 if you have the `shm_open' function. */ +#undef HAVE_SHM_OPEN + /* Define to 1 if you have the `sigprocmask' function. */ #undef HAVE_SIGPROCMASK diff --git a/src/include/portability/mem.h b/src/include/portability/mem.h new file mode 100644 index 0000000000000000000000000000000000000000..2a07c10f1ee294c9b7aa10d10d975f49d66eb646 --- /dev/null +++ b/src/include/portability/mem.h @@ -0,0 +1,40 @@ +/*------------------------------------------------------------------------- + * + * mem.h + * portability definitions for various memory operations + * + * Copyright (c) 2001-2013, PostgreSQL Global Development Group + * + * src/include/portability/mem.h + * + *------------------------------------------------------------------------- + */ +#ifndef MEM_H +#define MEM_H + +#define IPCProtection (0600) /* access/modify by user only */ + +#ifdef SHM_SHARE_MMU /* use intimate shared memory on Solaris */ +#define PG_SHMAT_FLAGS SHM_SHARE_MMU +#else +#define PG_SHMAT_FLAGS 0 +#endif + +/* Linux prefers MAP_ANONYMOUS, but the flag is called MAP_ANON on other systems. */ +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif + +/* BSD-derived systems have MAP_HASSEMAPHORE, but it's not present (or needed) on Linux. */ +#ifndef MAP_HASSEMAPHORE +#define MAP_HASSEMAPHORE 0 +#endif + +#define PG_MMAP_FLAGS (MAP_SHARED|MAP_ANONYMOUS|MAP_HASSEMAPHORE) + +/* Some really old systems don't define MAP_FAILED. 
*/ +#ifndef MAP_FAILED +#define MAP_FAILED ((void *) -1) +#endif + +#endif /* MEM_H */ diff --git a/src/include/storage/dsm.h b/src/include/storage/dsm.h new file mode 100644 index 0000000000000000000000000000000000000000..2b5e7227a0e1ac2ebe8c928299e8d3f6162b69f1 --- /dev/null +++ b/src/include/storage/dsm.h @@ -0,0 +1,39 @@ +/*------------------------------------------------------------------------- + * + * dsm.h + * manage dynamic shared memory segments + * + * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/dsm.h + * + *------------------------------------------------------------------------- + */ +#ifndef DSM_H +#define DSM_H + +#include "storage/dsm_impl.h" + +typedef struct dsm_segment dsm_segment; + +/* Initialization function. */ +extern void dsm_postmaster_startup(void); + +/* Functions that create, update, or remove mappings. */ +extern dsm_segment *dsm_create(uint64 size); +extern dsm_segment *dsm_attach(dsm_handle h); +extern void *dsm_resize(dsm_segment *seg, uint64 size); +extern void *dsm_remap(dsm_segment *seg); +extern void dsm_detach(dsm_segment *seg); + +/* Resource management functions. */ +extern void dsm_keep_mapping(dsm_segment *seg); +extern dsm_segment *dsm_find_mapping(dsm_handle h); + +/* Informational functions. 
*/ +extern void *dsm_segment_address(dsm_segment *seg); +extern uint64 dsm_segment_map_length(dsm_segment *seg); +extern dsm_handle dsm_segment_handle(dsm_segment *seg); + +#endif /* DSM_H */ diff --git a/src/include/storage/dsm_impl.h b/src/include/storage/dsm_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..13f1f48b237a834b919b80da7fcdd057400bbc0b --- /dev/null +++ b/src/include/storage/dsm_impl.h @@ -0,0 +1,75 @@ +/*------------------------------------------------------------------------- + * + * dsm_impl.h + * low-level dynamic shared memory primitives + * + * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/dsm_impl.h + * + *------------------------------------------------------------------------- + */ +#ifndef DSM_IMPL_H +#define DSM_IMPL_H + +/* Dynamic shared memory implementations. */ +#define DSM_IMPL_NONE 0 +#define DSM_IMPL_POSIX 1 +#define DSM_IMPL_SYSV 2 +#define DSM_IMPL_WINDOWS 3 +#define DSM_IMPL_MMAP 4 + +/* + * Determine which dynamic shared memory implementations will be supported + * on this platform, and which one will be the default. + */ +#ifdef WIN32 +#define USE_DSM_WINDOWS +#define DEFAULT_DYNAMIC_SHARED_MEMORY_TYPE DSM_IMPL_WINDOWS +#else +#ifdef HAVE_SHM_OPEN +#define USE_DSM_POSIX +#define DEFAULT_DYNAMIC_SHARED_MEMORY_TYPE DSM_IMPL_POSIX +#endif +#define USE_DSM_SYSV +#ifndef DEFAULT_DYNAMIC_SHARED_MEMORY_TYPE +#define DEFAULT_DYNAMIC_SHARED_MEMORY_TYPE DSM_IMPL_SYSV +#endif +#define USE_DSM_MMAP +#endif + +/* GUC. */ +extern int dynamic_shared_memory_type; + +/* + * Directory for on-disk state. + * + * This is used by all implementations for crash recovery and by the mmap + * implementation for storage. + */ +#define PG_DYNSHMEM_DIR "pg_dynshmem" +#define PG_DYNSHMEM_MMAP_FILE_PREFIX "mmap." + +/* A "name" for a dynamic shared memory segment. 
*/ +typedef uint32 dsm_handle; + +/* All the shared-memory operations we know about. */ +typedef enum +{ + DSM_OP_CREATE, + DSM_OP_ATTACH, + DSM_OP_DETACH, + DSM_OP_RESIZE, + DSM_OP_DESTROY +} dsm_op; + +/* Create, attach to, detach from, resize, or destroy a segment. */ +extern bool dsm_impl_op(dsm_op op, dsm_handle handle, uint64 request_size, + void **impl_private, void **mapped_address, uint64 *mapped_size, + int elevel); + +/* Some implementations cannot resize segments. Can this one? */ +extern bool dsm_impl_can_resize(void); + +#endif /* DSM_IMPL_H */ diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 39415a398a643a84edd7a4a1fbc3c2e7a9803075..730c47ba68691f9162da094ba0cad13ebb08bd2e 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -80,6 +80,7 @@ typedef enum LWLockId OldSerXidLock, SyncRepLock, BackgroundWorkerLock, + DynamicSharedMemoryControlLock, /* Individual lock IDs end here */ FirstBufMappingLock, FirstLockMgrLock = FirstBufMappingLock + NUM_BUFFER_PARTITIONS, diff --git a/src/include/utils/resowner_private.h b/src/include/utils/resowner_private.h index a5d8707be2f5cc89734e7d2b7a21b2ebffc0ae0d..6693483368b608f6778c3d5ad8487412273b6561 100644 --- a/src/include/utils/resowner_private.h +++ b/src/include/utils/resowner_private.h @@ -16,6 +16,7 @@ #ifndef RESOWNER_PRIVATE_H #define RESOWNER_PRIVATE_H +#include "storage/dsm.h" #include "storage/fd.h" #include "storage/lock.h" #include "utils/catcache.h" @@ -80,4 +81,11 @@ extern void ResourceOwnerRememberFile(ResourceOwner owner, extern void ResourceOwnerForgetFile(ResourceOwner owner, File file); +/* support for dynamic shared memory management */ +extern void ResourceOwnerEnlargeDSMs(ResourceOwner owner); +extern void ResourceOwnerRememberDSM(ResourceOwner owner, + dsm_segment *); +extern void ResourceOwnerForgetDSM(ResourceOwner owner, + dsm_segment *); + #endif /* RESOWNER_PRIVATE_H */