From 6cc4451b5c47eac02e09c3342281da469374432d Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 15 Nov 2007 20:36:40 +0000 Subject: [PATCH] Prevent re-use of a deleted relation's relfilenode until after the next checkpoint. This guards against an unlikely data-loss scenario in which we re-use the relfilenode, then crash, then replay the deletion and recreation of the file. Even then we'd be OK if all insertions into the new relation had been WAL-logged ... but that's not guaranteed given all the no-WAL-logging optimizations that have recently been added. Patch by Heikki Linnakangas, per a discussion last month. --- src/backend/access/transam/xlog.c | 16 ++- src/backend/commands/tablespace.c | 29 ++++- src/backend/storage/smgr/md.c | 204 ++++++++++++++++++++++++++++-- src/backend/storage/smgr/smgr.c | 44 ++++++- src/include/storage/smgr.h | 6 +- 5 files changed, 274 insertions(+), 25 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 6d4dc6940a..36adc20848 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.286 2007/10/12 19:39:59 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.287 2007/11/15 20:36:40 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -45,6 +45,7 @@ #include "storage/fd.h" #include "storage/pmsignal.h" #include "storage/procarray.h" +#include "storage/smgr.h" #include "storage/spin.h" #include "utils/builtins.h" #include "utils/pg_locale.h" @@ -5663,6 +5664,14 @@ CreateCheckPoint(int flags) UpdateControlFile(); } + /* + * Let smgr prepare for checkpoint; this has to happen before we + * determine the REDO pointer. Note that smgr must not do anything + * that'd have to be undone if we decide no checkpoint is needed. + */ + smgrpreckpt(); + + /* Begin filling in the checkpoint WAL record */ MemSet(&checkPoint, 0, sizeof(checkPoint)); checkPoint.ThisTimeLineID = ThisTimeLineID; checkPoint.time = time(NULL); @@ -5886,6 +5895,11 @@ CreateCheckPoint(int flags) */ END_CRIT_SECTION(); + /* + * Let smgr do post-checkpoint cleanup (eg, deleting old files). + */ + smgrpostckpt(); + /* * Delete old log files (those no longer needed even for previous * checkpoint). diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c index f19e237315..305da59da0 100644 --- a/src/backend/commands/tablespace.c +++ b/src/backend/commands/tablespace.c @@ -37,7 +37,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/tablespace.c,v 1.49 2007/08/01 22:45:08 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/tablespace.c,v 1.50 2007/11/15 20:36:40 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -57,6 +57,7 @@ #include "commands/comment.h" #include "commands/tablespace.h" #include "miscadmin.h" +#include "postmaster/bgwriter.h" #include "storage/fd.h" #include "utils/acl.h" #include "utils/builtins.h" @@ -460,13 +461,29 @@ DropTableSpace(DropTableSpaceStmt *stmt) LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE); /* - * Try to remove the physical infrastructure + * Try to remove the physical infrastructure. */ if (!remove_tablespace_directories(tablespaceoid, false)) - ereport(ERROR, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("tablespace \"%s\" is not empty", - tablespacename))); + { + /* + * Not all files deleted? However, there can be lingering empty files + * in the directories, left behind by for example DROP TABLE, that + * have been scheduled for deletion at next checkpoint (see comments + * in mdunlink() for details). We could just delete them immediately, + * but we can't tell them apart from important data files that we + * mustn't delete. So instead, we force a checkpoint which will clean + * out any lingering files, and try again. + */ + RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT); + if (!remove_tablespace_directories(tablespaceoid, false)) + { + /* Still not empty, the files must be important then */ + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("tablespace \"%s\" is not empty", + tablespacename))); + } + } /* Record the filesystem change in XLOG */ { diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 251811421e..59d39117f3 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.129 2007/07/03 14:51:24 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.130 2007/11/15 20:36:40 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -34,6 +34,7 @@ /* special values for the segno arg to RememberFsyncRequest */ #define FORGET_RELATION_FSYNC (InvalidBlockNumber) #define FORGET_DATABASE_FSYNC (InvalidBlockNumber-1) +#define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2) /* * On Windows, we have to interpret EACCES as possibly meaning the same as @@ -113,6 +114,10 @@ static MemoryContext MdCxt; /* context for all md.c allocations */ * table remembers the pending operations. We use a hash table mostly as * a convenient way of eliminating duplicate requests. * + * We use a similar mechanism to remember no-longer-needed files that can + * be deleted after the next checkpoint, but we use a linked list instead of + * a hash table, because we don't expect there to be any duplicate requests. + * * (Regular backends do not track pending operations locally, but forward * them to the bgwriter.) */ @@ -131,9 +136,17 @@ typedef struct CycleCtr cycle_ctr; /* mdsync_cycle_ctr when request was made */ } PendingOperationEntry; +typedef struct +{ + RelFileNode rnode; /* the dead relation to delete */ + CycleCtr cycle_ctr; /* mdckpt_cycle_ctr when request was made */ +} PendingUnlinkEntry; + static HTAB *pendingOpsTable = NULL; +static List *pendingUnlinks = NIL; static CycleCtr mdsync_cycle_ctr = 0; +static CycleCtr mdckpt_cycle_ctr = 0; typedef enum /* behavior for mdopen & _mdfd_getseg */ @@ -146,6 +159,7 @@ typedef enum /* behavior for mdopen & _mdfd_getseg */ /* local routines */ static MdfdVec *mdopen(SMgrRelation reln, ExtensionBehavior behavior); static void register_dirty_segment(SMgrRelation reln, MdfdVec *seg); +static void register_unlink(RelFileNode rnode); static MdfdVec *_fdvec_alloc(void); #ifndef LET_OS_MANAGE_FILESIZE @@ -188,6 +202,7 @@ mdinit(void) 100L, &hash_ctl, HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); + pendingUnlinks = NIL; } } @@ -254,14 +269,37 @@ mdcreate(SMgrRelation reln, bool isRedo) * Note that we're passed a RelFileNode --- by the time this is called, * there won't be an SMgrRelation hashtable entry anymore. * + * Actually, we don't unlink the first segment file of the relation, but + * just truncate it to zero length, and record a request to unlink it after + * the next checkpoint. Additional segments can be unlinked immediately, + * however. Leaving the empty file in place prevents that relfilenode + * number from being reused. The scenario this protects us from is: + * 1. We delete a relation (and commit, and actually remove its file). + * 2. We create a new relation, which by chance gets the same relfilenode as + * the just-deleted one (OIDs must've wrapped around for that to happen). + * 3. We crash before another checkpoint occurs. + * During replay, we would delete the file and then recreate it, which is fine + * if the contents of the file were repopulated by subsequent WAL entries. + * But if we didn't WAL-log insertions, but instead relied on fsyncing the + * file after populating it (as for instance CLUSTER and CREATE INDEX do), + * the contents of the file would be lost forever. By leaving the empty file + * until after the next checkpoint, we prevent reassignment of the relfilenode + * number until it's safe, because relfilenode assignment skips over any + * existing file. + * * If isRedo is true, it's okay for the relation to be already gone. - * Also, any failure should be reported as WARNING not ERROR, because + * Also, we should remove the file immediately instead of queuing a request + * for later, since during redo there's no possibility of creating a + * conflicting relation. + * + * Note: any failure should be reported as WARNING not ERROR, because * we are usually not in a transaction anymore when this is called. */ void mdunlink(RelFileNode rnode, bool isRedo) { char *path; + int ret; /* * We have to clean out any pending fsync requests for the doomed relation, @@ -271,8 +309,15 @@ mdunlink(RelFileNode rnode, bool isRedo) path = relpath(rnode); - /* Delete the first segment, or only segment if not doing segmenting */ - if (unlink(path) < 0) + /* + * Delete or truncate the first segment, or only segment if not doing + * segmenting + */ + if (isRedo) + ret = unlink(path); + else + ret = truncate(path, 0); + if (ret < 0) { if (!isRedo || errno != ENOENT) ereport(WARNING, @@ -316,6 +361,10 @@ mdunlink(RelFileNode rnode, bool isRedo) #endif pfree(path); + + /* Register request to unlink first segment later */ + if (!isRedo) + register_unlink(rnode); } /* @@ -1063,6 +1112,91 @@ mdsync(void) mdsync_in_progress = false; } +/* + * mdpreckpt() -- Do pre-checkpoint work + * + * To distinguish unlink requests that arrived before this checkpoint + * started from those that arrived during the checkpoint, we use a cycle + * counter similar to the one we use for fsync requests. That cycle + * counter is incremented here. + * + * This must be called *before* the checkpoint REDO point is determined. + * That ensures that we won't delete files too soon. + * + * Note that we can't do anything here that depends on the assumption + * that the checkpoint will be completed. + */ +void +mdpreckpt(void) +{ + ListCell *cell; + + /* + * In case the prior checkpoint wasn't completed, stamp all entries in + * the list with the current cycle counter. Anything that's in the + * list at the start of checkpoint can surely be deleted after the + * checkpoint is finished, regardless of when the request was made. + */ + foreach(cell, pendingUnlinks) + { + PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell); + + entry->cycle_ctr = mdckpt_cycle_ctr; + } + + /* + * Any unlink requests arriving after this point will be assigned the + * next cycle counter, and won't be unlinked until next checkpoint. + */ + mdckpt_cycle_ctr++; +} + +/* + * mdpostckpt() -- Do post-checkpoint work + * + * Remove any lingering files that can now be safely removed. + */ +void +mdpostckpt(void) +{ + while (pendingUnlinks != NIL) + { + PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks); + char *path; + + /* + * New entries are appended to the end, so if the entry is new + * we've reached the end of old entries. + */ + if (entry->cycle_ctr == mdsync_cycle_ctr) + break; + + /* Else assert we haven't missed it */ + Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr); + + /* Unlink the file */ + path = relpath(entry->rnode); + if (unlink(path) < 0) + { + /* + * ENOENT shouldn't happen either, but it doesn't really matter + * because we would've deleted it now anyway. + */ + if (errno != ENOENT) + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not remove relation %u/%u/%u: %m", + entry->rnode.spcNode, + entry->rnode.dbNode, + entry->rnode.relNode))); + } + pfree(path); + + pendingUnlinks = list_delete_first(pendingUnlinks); + pfree(entry); + } +} + /* * register_dirty_segment() -- Mark a relation segment as needing fsync * @@ -1096,19 +1230,53 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg) } } +/* + * register_unlink() -- Schedule a file to be deleted after next checkpoint + * + * As with register_dirty_segment, this could involve either a local or + * a remote pending-ops table. + */ +static void +register_unlink(RelFileNode rnode) +{ + if (pendingOpsTable) + { + /* push it into local pending-ops table */ + RememberFsyncRequest(rnode, UNLINK_RELATION_REQUEST); + } + else + { + /* + * Notify the bgwriter about it. If we fail to queue the request + * message, we have to sleep and try again, because we can't simply + * delete the file now. Ugly, but hopefully won't happen often. + * + * XXX should we just leave the file orphaned instead? + */ + Assert(IsUnderPostmaster); + while (!ForwardFsyncRequest(rnode, UNLINK_RELATION_REQUEST)) + pg_usleep(10000L); /* 10 msec seems a good number */ + } +} + /* * RememberFsyncRequest() -- callback from bgwriter side of fsync request * - * We stuff the fsync request into the local hash table for execution - * during the bgwriter's next checkpoint. + * We stuff most fsync requests into the local hash table for execution + * during the bgwriter's next checkpoint. UNLINK requests go into a + * separate linked list, however, because they get processed separately. * * The range of possible segment numbers is way less than the range of * BlockNumber, so we can reserve high values of segno for special purposes. - * We define two: FORGET_RELATION_FSYNC means to cancel pending fsyncs for - * a relation, and FORGET_DATABASE_FSYNC means to cancel pending fsyncs for - * a whole database. (These are a tad slow because the hash table has to be - * searched linearly, but it doesn't seem worth rethinking the table structure - * for them.) + * We define three: + * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation + * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database + * - UNLINK_RELATION_REQUEST is a request to delete the file after the next + * checkpoint. + * + * (Handling the FORGET_* requests is a tad slow because the hash table has + * to be searched linearly, but it doesn't seem worth rethinking the table + * structure for them.) */ void RememberFsyncRequest(RelFileNode rnode, BlockNumber segno) @@ -1147,6 +1315,20 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno) } } } + else if (segno == UNLINK_RELATION_REQUEST) + { + /* Unlink request: put it in the linked list */ + MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt); + PendingUnlinkEntry *entry; + + entry = palloc(sizeof(PendingUnlinkEntry)); + entry->rnode = rnode; + entry->cycle_ctr = mdckpt_cycle_ctr; + + pendingUnlinks = lappend(pendingUnlinks, entry); + + MemoryContextSwitchTo(oldcxt); + } else { /* Normal case: enter a request to fsync this segment */ diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 22ac13146c..6b13483a6d 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -11,7 +11,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.106 2007/09/05 18:10:48 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.107 2007/11/15 20:36:40 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -55,9 +55,11 @@ typedef struct f_smgr void (*smgr_truncate) (SMgrRelation reln, BlockNumber nblocks, bool isTemp); void (*smgr_immedsync) (SMgrRelation reln); - void (*smgr_commit) (void); /* may be NULL */ - void (*smgr_abort) (void); /* may be NULL */ - void (*smgr_sync) (void); /* may be NULL */ + void (*smgr_commit) (void); /* may be NULL */ + void (*smgr_abort) (void); /* may be NULL */ + void (*smgr_pre_ckpt) (void); /* may be NULL */ + void (*smgr_sync) (void); /* may be NULL */ + void (*smgr_post_ckpt) (void); /* may be NULL */ } f_smgr; @@ -65,7 +67,7 @@ static const f_smgr smgrsw[] = { /* magnetic disk */ {mdinit, NULL, mdclose, mdcreate, mdunlink, mdextend, mdread, mdwrite, mdnblocks, mdtruncate, mdimmedsync, - NULL, NULL, mdsync + NULL, NULL, mdpreckpt, mdsync, mdpostckpt } }; @@ -778,7 +780,22 @@ smgrabort(void) } /* - * smgrsync() -- Sync files to disk at checkpoint time. + * smgrpreckpt() -- Prepare for checkpoint. + */ +void +smgrpreckpt(void) +{ + int i; + + for (i = 0; i < NSmgr; i++) + { + if (smgrsw[i].smgr_pre_ckpt) + (*(smgrsw[i].smgr_pre_ckpt)) (); + } +} + +/* + * smgrsync() -- Sync files to disk during checkpoint. */ void smgrsync(void) @@ -792,6 +809,21 @@ smgrsync(void) } } +/* + * smgrpostckpt() -- Post-checkpoint cleanup. + */ +void +smgrpostckpt(void) +{ + int i; + + for (i = 0; i < NSmgr; i++) + { + if (smgrsw[i].smgr_post_ckpt) + (*(smgrsw[i].smgr_post_ckpt)) (); + } +} + void smgr_redo(XLogRecPtr lsn, XLogRecord *record) diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index bc071e7ef0..87b0171a1b 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.59 2007/09/05 18:10:48 tgl Exp $ + * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.60 2007/11/15 20:36:40 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -83,7 +83,9 @@ extern void AtSubAbort_smgr(void); extern void PostPrepare_smgr(void); extern void smgrcommit(void); extern void smgrabort(void); +extern void smgrpreckpt(void); extern void smgrsync(void); +extern void smgrpostckpt(void); extern void smgr_redo(XLogRecPtr lsn, XLogRecord *record); extern void smgr_desc(StringInfo buf, uint8 xl_info, char *rec); @@ -104,7 +106,9 @@ extern void mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, extern BlockNumber mdnblocks(SMgrRelation reln); extern void mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp); extern void mdimmedsync(SMgrRelation reln); +extern void mdpreckpt(void); extern void mdsync(void); +extern void mdpostckpt(void); extern void RememberFsyncRequest(RelFileNode rnode, BlockNumber segno); extern void ForgetRelationFsyncRequests(RelFileNode rnode); -- GitLab