xlog.c 263.6 KB
Newer Older
1
/*-------------------------------------------------------------------------
 *
 * xlog.c
 *		PostgreSQL transaction log manager
 *
 *
 * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.369 2010/02/08 09:08:51 heikki Exp $
 *
 *-------------------------------------------------------------------------
 */
14

15 16
#include "postgres.h"

17
#include <ctype.h>
T
Tom Lane 已提交
18
#include <signal.h>
19
#include <time.h>
20
#include <fcntl.h>
21
#include <sys/stat.h>
22
#include <sys/time.h>
23 24
#include <sys/wait.h>
#include <unistd.h>
25

26
#include "access/clog.h"
27
#include "access/multixact.h"
28
#include "access/subtrans.h"
29
#include "access/transam.h"
30
#include "access/tuptoaster.h"
31
#include "access/twophase.h"
32
#include "access/xact.h"
33
#include "access/xlog_internal.h"
34
#include "access/xlogutils.h"
35
#include "catalog/catversion.h"
T
Tom Lane 已提交
36
#include "catalog/pg_control.h"
37
#include "catalog/pg_database.h"
38 39
#include "catalog/pg_type.h"
#include "funcapi.h"
40
#include "libpq/pqsignal.h"
41
#include "miscadmin.h"
42
#include "pgstat.h"
43
#include "postmaster/bgwriter.h"
44 45
#include "replication/walreceiver.h"
#include "replication/walsender.h"
46
#include "storage/bufmgr.h"
47
#include "storage/fd.h"
48
#include "storage/ipc.h"
49
#include "storage/pmsignal.h"
50
#include "storage/procarray.h"
51
#include "storage/smgr.h"
52
#include "storage/spin.h"
53
#include "utils/builtins.h"
54
#include "utils/guc.h"
55
#include "utils/ps_status.h"
56
#include "utils/relmapper.h"
57
#include "pg_trace.h"
58

59

60 61
/* File path names (all relative to $PGDATA) */
#define BACKUP_LABEL_FILE		"backup_label"
62
#define BACKUP_LABEL_OLD		"backup_label.old"
63 64 65 66
#define RECOVERY_COMMAND_FILE	"recovery.conf"
#define RECOVERY_COMMAND_DONE	"recovery.done"


T
Tom Lane 已提交
67 68
/* User-settable parameters */
int			CheckPointSegments = 3;
V
Vadim B. Mikheev 已提交
69
int			XLOGbuffers = 8;
70
int			XLogArchiveTimeout = 0;
71
bool		XLogArchiveMode = false;
72
char	   *XLogArchiveCommand = NULL;
73 74
bool 		XLogRequestRecoveryConnections = true;
int			MaxStandbyDelay = 30;
75
bool		fullPageWrites = true;
76
bool		log_checkpoints = false;
77
int			sync_method = DEFAULT_SYNC_METHOD;
T
Tom Lane 已提交
78

79 80 81 82
#ifdef WAL_DEBUG
bool		XLOG_DEBUG = false;
#endif

83
/*
 * XLOGfileslop is the maximum number of preallocated future XLOG segments.
 * When we are done with an old XLOG segment file, we will recycle it as a
 * future XLOG segment as long as there aren't already XLOGfileslop future
 * segments; else we'll delete it.  This could be made a separate GUC
 * variable, but at present I think it's sufficient to hardwire it as
 * 2*CheckPointSegments+1.  Under normal conditions, a checkpoint will free
 * no more than 2*CheckPointSegments log segments, and we want to recycle all
 * of them; the +1 allows boundary cases to happen without wasting a
 * delete/create-segment cycle.
 */
#define XLOGfileslop	(2*CheckPointSegments + 1)

96 97 98 99
/*
 * GUC support
 */
const struct config_enum_entry sync_method_options[] = {
100
	{"fsync", SYNC_METHOD_FSYNC, false},
101
#ifdef HAVE_FSYNC_WRITETHROUGH
102
	{"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
103 104
#endif
#ifdef HAVE_FDATASYNC
105
	{"fdatasync", SYNC_METHOD_FDATASYNC, false},
106 107
#endif
#ifdef OPEN_SYNC_FLAG
108
	{"open_sync", SYNC_METHOD_OPEN, false},
109 110
#endif
#ifdef OPEN_DATASYNC_FLAG
111
	{"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
112
#endif
113
	{NULL, 0, false}
114
};
T
Tom Lane 已提交
115

116 117 118 119 120 121 122
/*
 * Statistics for current checkpoint are collected in this global struct.
 * Because only the background writer or a stand-alone backend can perform
 * checkpoints, this will be unused in normal backends.
 */
CheckpointStatsData CheckpointStats;

T
Tom Lane 已提交
123
/*
124 125
 * ThisTimeLineID will be same in all backends --- it identifies current
 * WAL timeline for the database system.
T
Tom Lane 已提交
126
 */
127
TimeLineID	ThisTimeLineID = 0;
V
WAL  
Vadim B. Mikheev 已提交
128

129
/*
130
 * Are we doing recovery from XLOG?
131
 *
132 133 134 135 136
 * This is only ever true in the startup process; it should be read as meaning
 * "this process is replaying WAL records", rather than "the system is in
 * recovery mode".  It should be examined primarily by functions that need
 * to act differently when called from a WAL redo function (e.g., to skip WAL
 * logging).  To check whether the system is in recovery regardless of which
137 138
 * process you're running in, use RecoveryInProgress() but only after shared
 * memory startup and lock initialization.
139
 */
T
Tom Lane 已提交
140
bool		InRecovery = false;
B
Bruce Momjian 已提交
141

142 143 144 145 146
/* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
HotStandbyState		standbyState = STANDBY_DISABLED;

static 	XLogRecPtr	LastRec;

147 148
/*
 * Local copy of SharedRecoveryInProgress variable. True actually means "not
149
 * known, need to check the shared state".
150 151 152
 */
static bool LocalRecoveryInProgress = true;

153 154 155 156 157 158 159 160 161 162 163 164 165 166 167
/*
 * Local state for XLogInsertAllowed():
 *		1: unconditionally allowed to insert XLOG
 *		0: unconditionally not allowed to insert XLOG
 *		-1: must check RecoveryInProgress(); disallow until it is false
 * Most processes start with -1 and transition to 1 after seeing that recovery
 * is not in progress.  But we can also force the value for special cases.
 * The coding in XLogInsertAllowed() depends on the first two of these states
 * being numerically the same as bool true and false.
 */
static int	LocalXLogInsertAllowed = -1;

/* Are we recovering using offline XLOG archives? */
static bool InArchiveRecovery = false;

168
/* Was the last xlog file restored from archive, or local? */
B
Bruce Momjian 已提交
169
static bool restoredFromArchive = false;
170

171
/* options taken from recovery.conf for archive recovery */
172
static char *recoveryRestoreCommand = NULL;
173
static char *recoveryEndCommand = NULL;
174 175 176
static bool recoveryTarget = false;
static bool recoveryTargetExact = false;
static bool recoveryTargetInclusive = true;
B
Bruce Momjian 已提交
177
static TransactionId recoveryTargetXid;
178
static TimestampTz recoveryTargetTime;
179
static TimestampTz recoveryLastXTime = 0;
180

181 182 183 184 185
/* options taken from recovery.conf for XLOG streaming */
static bool StandbyMode = false;
static char *PrimaryConnInfo = NULL;
char *TriggerFile = NULL;

186
/* if recoveryStopsHere returns true, it saves actual stop xid/time here */
B
Bruce Momjian 已提交
187
static TransactionId recoveryStopXid;
188
static TimestampTz recoveryStopTime;
B
Bruce Momjian 已提交
189
static bool recoveryStopAfter;
190 191 192 193 194 195 196 197 198 199 200 201 202

/*
 * During normal operation, the only timeline we care about is ThisTimeLineID.
 * During recovery, however, things are more complicated.  To simplify life
 * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
 * scan through the WAL history (that is, it is the line that was active when
 * the currently-scanned WAL record was generated).  We also need these
 * timeline values:
 *
 * recoveryTargetTLI: the desired timeline that we want to end in.
 *
 * expectedTLIs: an integer list of recoveryTargetTLI and the TLIs of
 * its known parents, newest first (so recoveryTargetTLI is always the
B
Bruce Momjian 已提交
203
 * first list member).	Only these TLIs are expected to be seen in the WAL
204 205 206 207 208 209 210 211 212
 * segments we read, and indeed only these TLIs will be considered as
 * candidate WAL files to open at all.
 *
 * curFileTLI: the TLI appearing in the name of the current input WAL file.
 * (This is not necessarily the same as ThisTimeLineID, because we could
 * be scanning data that was copied from an ancestor timeline when the current
 * file was created.)  During a sequential scan we do not allow this value
 * to decrease.
 */
B
Bruce Momjian 已提交
213 214 215
static TimeLineID recoveryTargetTLI;
static List *expectedTLIs;
static TimeLineID curFileTLI;
216

T
Tom Lane 已提交
217 218
/*
 * ProcLastRecPtr points to the start of the last XLOG record inserted by the
219 220 221 222
 * current backend.  It is updated for all inserts.  XactLastRecEnd points to
 * end+1 of the last record, and is reset when we end a top-level transaction,
 * or start a new one; so it can be used to tell if the current transaction has
 * created any XLOG records.
T
Tom Lane 已提交
223 224
 */
static XLogRecPtr ProcLastRecPtr = {0, 0};
225

226
XLogRecPtr	XactLastRecEnd = {0, 0};
227

T
Tom Lane 已提交
228 229 230
/*
 * RedoRecPtr is this backend's local copy of the REDO record pointer
 * (which is almost but not quite the same as a pointer to the most recent
B
Bruce Momjian 已提交
231
 * CHECKPOINT record).	We update this from the shared-memory copy,
T
Tom Lane 已提交
232
 * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
B
Bruce Momjian 已提交
233
 * hold the Insert lock).  See XLogInsert for details.	We are also allowed
234
 * to update from XLogCtl->Insert.RedoRecPtr if we hold the info_lck;
235 236
 * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
 * InitXLOGAccess.
T
Tom Lane 已提交
237
 */
238
static XLogRecPtr RedoRecPtr;
239

240 241 242 243 244 245 246 247 248 249 250 251
/*
 * RedoStartLSN points to the checkpoint's REDO location which is specified
 * in a backup label file, backup history file or control file. In standby
 * mode, XLOG streaming usually starts from the position where an invalid
 * record was found. But if we fail to read even the initial checkpoint
 * record, we use the REDO location instead of the checkpoint location as
 * the start position of XLOG streaming. Otherwise we would have to jump
 * backwards to the REDO location after reading the checkpoint record,
 * because the REDO record can precede the checkpoint record.
 */
static XLogRecPtr RedoStartLSN = {0, 0};

T
Tom Lane 已提交
252 253 254 255 256 257 258 259 260
/*----------
 * Shared-memory data structures for XLOG control
 *
 * LogwrtRqst indicates a byte position that we need to write and/or fsync
 * the log up to (all records before that point must be written or fsynced).
 * LogwrtResult indicates the byte positions we have already written/fsynced.
 * These structs are identical but are declared separately to indicate their
 * slightly different functions.
 *
 * We do a lot of pushups to minimize the amount of access to lockable
 * shared memory values.  There are actually three shared-memory copies of
 * LogwrtResult, plus one unshared copy in each backend.  Here's how it works:
 *		XLogCtl->LogwrtResult is protected by info_lck
 *		XLogCtl->Write.LogwrtResult is protected by WALWriteLock
 *		XLogCtl->Insert.LogwrtResult is protected by WALInsertLock
 * One must hold the associated lock to read or write any of these, but
 * of course no lock is needed to read/write the unshared LogwrtResult.
 *
 * XLogCtl->LogwrtResult and XLogCtl->Write.LogwrtResult are both "always
 * right", since both are updated by a write or flush operation before
 * it releases WALWriteLock.  The point of keeping XLogCtl->Write.LogwrtResult
 * is that it can be examined/modified by code that already holds WALWriteLock
 * without needing to grab info_lck as well.
 *
 * XLogCtl->Insert.LogwrtResult may lag behind the reality of the other two,
 * but is updated when convenient.  Again, it exists for the convenience of
 * code that is already holding WALInsertLock but not the other locks.
 *
 * The unshared LogwrtResult may lag behind any or all of these, and again
 * is updated when convenient.
 *
 * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 * (protected by info_lck), but we don't need to cache any copies of it.
 *
 * Note that this all works because the request and result positions can only
 * advance forward, never back up, and so we can easily determine which of two
 * values is "more up to date".
 *
 * info_lck is only held long enough to read/update the protected variables,
 * so it's a plain spinlock.  The other locks are held longer (potentially
 * over I/O operations), so we use LWLocks for them.  These locks are:
 *
 * WALInsertLock: must be held to insert a record into the WAL buffers.
 *
 * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 * XLogFlush).
 *
 * ControlFileLock: must be held to read/update control file or create
 * new log file.
 *
 * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
 * only one checkpointer at a time; currently, with all checkpoints done by
 * the bgwriter, this is just pro forma).
 *
 *----------
 */
308

T
Tom Lane 已提交
309
typedef struct XLogwrtRqst
310
{
T
Tom Lane 已提交
311 312
	XLogRecPtr	Write;			/* last byte + 1 to write out */
	XLogRecPtr	Flush;			/* last byte + 1 to flush */
313
} XLogwrtRqst;
314

315 316 317 318 319 320
/*
 * Positions up to which the log has already been written out / flushed.
 * Same layout as XLogwrtRqst; declared separately to indicate the
 * slightly different role (result rather than request).
 */
typedef struct XLogwrtResult
{
	XLogRecPtr	Write;			/* last byte + 1 written out */
	XLogRecPtr	Flush;			/* last byte + 1 flushed */
} XLogwrtResult;

T
Tom Lane 已提交
321 322 323
/*
 * Shared state data for XLogInsert.
 */
324 325
typedef struct XLogCtlInsert
{
B
Bruce Momjian 已提交
326 327
	XLogwrtResult LogwrtResult; /* a recent value of LogwrtResult */
	XLogRecPtr	PrevRecord;		/* start of previously-inserted record */
328
	int			curridx;		/* current block index in cache */
B
Bruce Momjian 已提交
329 330 331
	XLogPageHeader currpage;	/* points to header of block in cache */
	char	   *currpos;		/* current insertion point in cache */
	XLogRecPtr	RedoRecPtr;		/* current redo point for insertions */
332
	bool		forcePageWrites;	/* forcing full-page writes for PITR? */
333 334
} XLogCtlInsert;

T
Tom Lane 已提交
335 336 337
/*
 * Shared state data for XLogWrite/XLogFlush.
 */
338 339
typedef struct XLogCtlWrite
{
B
Bruce Momjian 已提交
340 341
	XLogwrtResult LogwrtResult; /* current value of LogwrtResult */
	int			curridx;		/* cache index of next block to write */
342
	pg_time_t	lastSegSwitchTime;		/* time of last xlog segment switch */
343 344
} XLogCtlWrite;

T
Tom Lane 已提交
345 346 347
/*
 * Total shared-memory state for XLOG.
 */
348 349
typedef struct XLogCtlData
{
350
	/* Protected by WALInsertLock: */
B
Bruce Momjian 已提交
351
	XLogCtlInsert Insert;
352

T
Tom Lane 已提交
353
	/* Protected by info_lck: */
B
Bruce Momjian 已提交
354 355
	XLogwrtRqst LogwrtRqst;
	XLogwrtResult LogwrtResult;
356 357
	uint32		ckptXidEpoch;	/* nextXID & epoch of latest checkpoint */
	TransactionId ckptXid;
B
Bruce Momjian 已提交
358
	XLogRecPtr	asyncCommitLSN; /* LSN of newest async commit */
359

360
	/* Protected by WALWriteLock: */
B
Bruce Momjian 已提交
361 362
	XLogCtlWrite Write;

T
Tom Lane 已提交
363
	/*
B
Bruce Momjian 已提交
364 365 366
	 * These values do not change after startup, although the pointed-to pages
	 * and xlblocks values certainly do.  Permission to read/write the pages
	 * and xlblocks values depends on WALInsertLock and WALWriteLock.
T
Tom Lane 已提交
367
	 */
B
Bruce Momjian 已提交
368
	char	   *pages;			/* buffers for unwritten XLOG pages */
369
	XLogRecPtr *xlblocks;		/* 1st byte ptr-s + XLOG_BLCKSZ */
370
	int			XLogCacheBlck;	/* highest allocated xlog buffer index */
371
	TimeLineID	ThisTimeLineID;
372
	TimeLineID	RecoveryTargetTLI;
T
Tom Lane 已提交
373

374 375
	/*
	 * SharedRecoveryInProgress indicates if we're still in crash or archive
376
	 * recovery.  Protected by info_lck.
377 378 379 380
	 */
	bool		SharedRecoveryInProgress;

	/*
381 382
	 * During recovery, we keep a copy of the latest checkpoint record here.
	 * Used by the background writer when it wants to create a restartpoint.
383 384 385 386 387 388 389 390
	 *
	 * Protected by info_lck.
	 */
	XLogRecPtr	lastCheckPointRecPtr;
	CheckPoint	lastCheckPoint;

	/* end+1 of the last record replayed (or being replayed) */
	XLogRecPtr	replayEndRecPtr;
391 392
	/* timestamp of last record replayed (or being replayed) */
	TimestampTz	recoveryLastXTime;
393 394
	/* end+1 of the last record replayed */
	XLogRecPtr	recoveryLastRecPtr;
395

396
	slock_t		info_lck;		/* locks shared variables shown above */
397 398
} XLogCtlData;

399
static XLogCtlData *XLogCtl = NULL;
400

401
/*
T
Tom Lane 已提交
402
 * We maintain an image of pg_control in shared memory.
403
 */
404
static ControlFileData *ControlFile = NULL;
405

T
Tom Lane 已提交
406 407 408 409 410
/*
 * Macros for managing XLogInsert state.  In most cases, the calling routine
 * has local copies of XLogCtl->Insert and/or XLogCtl->Insert->curridx,
 * so these are passed as parameters instead of being fetched via XLogCtl.
 */
411

T
Tom Lane 已提交
412 413
/* Free space remaining in the current xlog page buffer */
/*
 * Insert is a pointer to an object with currpos and currpage members; the
 * result is XLOG_BLCKSZ minus the byte distance from currpage to currpos.
 * Note that the argument is evaluated twice.
 */
#define INSERT_FREESPACE(Insert)  \
	(XLOG_BLCKSZ - ((Insert)->currpos - (char *) (Insert)->currpage))
T
Tom Lane 已提交
415 416 417 418 419 420

/* Construct XLogRecPtr value for current insertion point */
/*
 * Stores into recptr the xlogid of XLogCtl->xlblocks[curridx], and that
 * entry's xrecoff less the free space left in the current page (see
 * INSERT_FREESPACE).  recptr must be an lvalue; curridx is evaluated twice.
 */
#define INSERT_RECPTR(recptr,Insert,curridx)  \
	( \
	  (recptr).xlogid = XLogCtl->xlblocks[curridx].xlogid, \
	  (recptr).xrecoff = \
		XLogCtl->xlblocks[curridx].xrecoff - INSERT_FREESPACE(Insert) \
	)

/*
 * Index of the buffer preceding idx, wrapping from 0 back around to
 * XLogCtl->XLogCacheBlck.  The argument is evaluated twice.
 */
#define PrevBufIdx(idx)		\
		(((idx) != 0) ? ((idx) - 1) : XLogCtl->XLogCacheBlck)

/*
 * Index of the buffer following idx, wrapping from
 * XLogCtl->XLogCacheBlck back around to 0.  The argument is evaluated twice.
 */
#define NextBufIdx(idx)		\
		(((idx) != XLogCtl->XLogCacheBlck) ? ((idx) + 1) : 0)
429

T
Tom Lane 已提交
430 431 432 433
/*
 * Private, possibly out-of-date copy of shared LogwrtResult.
 * See discussion above.
 */
434
static XLogwrtResult LogwrtResult = {{0, 0}, {0, 0}};
435

T
Tom Lane 已提交
436 437 438 439 440 441 442 443 444 445
/*
 * State of the log segment currently open for writing.
 *
 * openLogFile is -1 or a kernel FD for an open log file segment.
 * When it's open, openLogOff is the current seek offset in the file.
 * openLogId/openLogSeg identify the segment.  These variables are only
 * used to write the XLOG, and so will normally refer to the active segment.
 */
static int	openLogFile = -1;	/* -1 means no segment is open */
static uint32 openLogId = 0;	/* with openLogSeg, identifies the segment */
static uint32 openLogSeg = 0;
static uint32 openLogOff = 0;	/* current seek offset while open */
446

T
Tom Lane 已提交
447 448 449 450
/*
 * These variables are used similarly to the ones above, but for reading
 * the XLOG.  Note, however, that readOff generally represents the offset
 * of the page just read, not the seek position of the FD itself, which
 * will be just past that page. readLen indicates how much of the current
 * page has been read into readBuf.
 */
static int	readFile = -1;
static uint32 readId = 0;
static uint32 readSeg = 0;
static uint32 readOff = 0;
static uint32 readLen = 0;
/* Is the currently open segment being streamed from primary? */
static bool readStreamed = false;
B
Bruce Momjian 已提交
461

462
/* Buffer for currently read page (XLOG_BLCKSZ bytes) */
T
Tom Lane 已提交
463
static char *readBuf = NULL;
B
Bruce Momjian 已提交
464

465 466 467 468
/* Buffer for current ReadRecord result (expandable) */
static char *readRecordBuf = NULL;
static uint32 readRecordBufSize = 0;

T
Tom Lane 已提交
469
/* State information for XLOG reading */
B
Bruce Momjian 已提交
470 471
static XLogRecPtr ReadRecPtr;	/* start of last record read */
static XLogRecPtr EndRecPtr;	/* end+1 of last record read */
472
static TimeLineID lastPageTLI = 0;
473

474 475 476
static XLogRecPtr minRecoveryPoint;		/* local copy of
										 * ControlFile->minRecoveryPoint */
static bool updateMinRecoveryPoint = true;
477

V
WAL  
Vadim B. Mikheev 已提交
478 479
static bool InRedo = false;

480
/*
481
 * Flags set by interrupt handlers for later service in the redo loop.
482
 */
483
static volatile sig_atomic_t got_SIGHUP = false;
484
static volatile sig_atomic_t shutdown_requested = false;
485

486 487
/*
 * Flag set when executing a restore command, to tell SIGTERM signal handler
488
 * that it's safe to just proc_exit.
489 490 491
 */
static volatile sig_atomic_t in_restore_command = false;

492

493 494
static void XLogArchiveNotify(const char *xlog);
static void XLogArchiveNotifySeg(uint32 log, uint32 seg);
495 496
static bool XLogArchiveCheckDone(const char *xlog);
static bool XLogArchiveIsBusy(const char *xlog);
497 498
static void XLogArchiveCleanup(const char *xlog);
static void readRecoveryCommandFile(void);
499
static void exitArchiveRecovery(TimeLineID endTLI,
B
Bruce Momjian 已提交
500
					uint32 endLogId, uint32 endLogSeg);
501
static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
502
static void CheckRequiredParameterValues(CheckPoint checkPoint);
503
static void LocalSetXLogInsertAllowed(void);
504
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
T
Tom Lane 已提交
505

506
static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
B
Bruce Momjian 已提交
507
				XLogRecPtr *lsn, BkpBlock *bkpb);
508 509
static bool AdvanceXLInsertBuffer(bool new_segment);
static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch);
510 511
static bool InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
					   bool find_free, int *max_advance,
512
					   bool use_lock);
513 514 515 516 517 518
static int	XLogFileRead(uint32 log, uint32 seg, int emode, TimeLineID tli,
			 bool fromArchive, bool notexistOk);
static int	XLogFileReadAnyTLI(uint32 log, uint32 seg, int emode,
				   bool fromArchive);
static bool XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
			 bool randAccess);
B
Bruce Momjian 已提交
519
static void XLogFileClose(void);
520
static bool RestoreArchivedFile(char *path, const char *xlogfname,
B
Bruce Momjian 已提交
521
					const char *recovername, off_t expectedSize);
522
static void ExecuteRecoveryEndCommand(void);
523 524
static void PreallocXlogFiles(XLogRecPtr endptr);
static void RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr);
525
static void ValidateXLOGDirectoryStructure(void);
526
static void CleanupBackupHistory(void);
527
static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
528
static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt);
529
static bool ValidXLOGHeader(XLogPageHeader hdr, int emode);
530
static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
531 532 533 534
static List *readTimeLineHistory(TimeLineID targetTLI);
static bool existsTimeLineHistory(TimeLineID probeTLI);
static TimeLineID findNewestTimeLine(TimeLineID startTLI);
static void writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
B
Bruce Momjian 已提交
535 536
					 TimeLineID endTLI,
					 uint32 endLogId, uint32 endLogSeg);
T
Tom Lane 已提交
537 538
static void WriteControlFile(void);
static void ReadControlFile(void);
539
static char *str_time(pg_time_t tnow);
540
static bool CheckForStandbyTrigger(void);
541

542
#ifdef WAL_DEBUG
543
static void xlog_outrec(StringInfo buf, XLogRecord *record);
544
#endif
545
static void pg_start_backup_callback(int code, Datum arg);
546
static bool read_backup_label(XLogRecPtr *checkPointLoc);
547
static void rm_redo_error_callback(void *arg);
548
static int	get_sync_bit(int method);
T
Tom Lane 已提交
549 550 551 552 553


/*
 * Insert an XLOG record having the specified RMID and info bytes,
 * with the body of the record being the data chunk(s) described by
554
 * the rdata chain (see xlog.h for notes about rdata).
T
Tom Lane 已提交
555 556 557 558 559 560 561 562 563 564 565
 *
 * Returns XLOG pointer to end of record (beginning of next record).
 * This can be used as LSN for data pages affected by the logged action.
 * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 * before the data page can be written out.  This implements the basic
 * WAL rule "write the log before the data".)
 *
 * NB: this routine feels free to scribble on the XLogRecData structs,
 * though not on the data they reference.  This is OK since the XLogRecData
 * structs are always just temporaries in the calling code.
 */
566
XLogRecPtr
567
XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
568
{
B
Bruce Momjian 已提交
569 570
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogRecord *record;
T
Tom Lane 已提交
571
	XLogContRecord *contrecord;
B
Bruce Momjian 已提交
572 573 574
	XLogRecPtr	RecPtr;
	XLogRecPtr	WriteRqst;
	uint32		freespace;
575
	int			curridx;
B
Bruce Momjian 已提交
576 577 578 579 580
	XLogRecData *rdt;
	Buffer		dtbuf[XLR_MAX_BKP_BLOCKS];
	bool		dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
	BkpBlock	dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
	XLogRecPtr	dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
581 582 583 584
	XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS];
	XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS];
	XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS];
	pg_crc32	rdata_crc;
B
Bruce Momjian 已提交
585 586 587 588
	uint32		len,
				write_len;
	unsigned	i;
	bool		updrqst;
589
	bool		doPageWrites;
590
	bool		isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
V
Vadim B. Mikheev 已提交
591

592
	/* cross-check on whether we should be here or not */
593 594
	if (!XLogInsertAllowed())
		elog(ERROR, "cannot make new WAL entries during recovery");
595

596
	/* info's high bits are reserved for use by me */
V
Vadim B. Mikheev 已提交
597
	if (info & XLR_INFO_MASK)
598
		elog(PANIC, "invalid xlog info mask %02X", info);
V
Vadim B. Mikheev 已提交
599

600 601
	TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);

T
Tom Lane 已提交
602
	/*
B
Bruce Momjian 已提交
603 604
	 * In bootstrap mode, we don't actually log anything but XLOG resources;
	 * return a phony record pointer.
T
Tom Lane 已提交
605
	 */
V
Vadim B. Mikheev 已提交
606
	if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
V
WAL  
Vadim B. Mikheev 已提交
607 608
	{
		RecPtr.xlogid = 0;
B
Bruce Momjian 已提交
609
		RecPtr.xrecoff = SizeOfXLogLongPHD;		/* start of 1st chkpt record */
610
		return RecPtr;
V
WAL  
Vadim B. Mikheev 已提交
611 612
	}

T
Tom Lane 已提交
613
	/*
614
	 * Here we scan the rdata chain, determine which buffers must be backed
T
Tom Lane 已提交
615
	 * up, and compute the CRC values for the data.  Note that the record
B
Bruce Momjian 已提交
616 617 618 619
	 * header isn't added into the CRC initially since we don't know the final
	 * length or info bits quite yet.  Thus, the CRC will represent the CRC of
	 * the whole record in the order "rdata, then backup blocks, then record
	 * header".
T
Tom Lane 已提交
620
	 *
621 622 623 624 625
	 * We may have to loop back to here if a race condition is detected below.
	 * We could prevent the race by doing all this work while holding the
	 * insert lock, but it seems better to avoid doing CRC calculations while
	 * holding the lock.  This means we have to be careful about modifying the
	 * rdata chain until we know we aren't going to loop back again.  The only
B
Bruce Momjian 已提交
626 627 628 629 630
	 * change we allow ourselves to make earlier is to set rdt->data = NULL in
	 * chain items we have decided we will have to back up the whole buffer
	 * for.  This is OK because we will certainly decide the same thing again
	 * for those items if we do it over; doing it here saves an extra pass
	 * over the chain later.
T
Tom Lane 已提交
631
	 */
632
begin:;
T
Tom Lane 已提交
633 634 635 636 637 638
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		dtbuf[i] = InvalidBuffer;
		dtbuf_bkp[i] = false;
	}

639 640 641 642 643 644 645 646
	/*
	 * Decide if we need to do full-page writes in this XLOG record: true if
	 * full_page_writes is on or we have a PITR request for it.  Since we
	 * don't yet have the insert lock, forcePageWrites could change under us,
	 * but we'll recheck it once we have the lock.
	 */
	doPageWrites = fullPageWrites || Insert->forcePageWrites;

647
	INIT_CRC32(rdata_crc);
T
Tom Lane 已提交
648
	len = 0;
B
Bruce Momjian 已提交
649
	for (rdt = rdata;;)
650 651 652
	{
		if (rdt->buffer == InvalidBuffer)
		{
T
Tom Lane 已提交
653
			/* Simple data, just include it */
654
			len += rdt->len;
655
			COMP_CRC32(rdata_crc, rdt->data, rdt->len);
656
		}
T
Tom Lane 已提交
657
		else
658
		{
T
Tom Lane 已提交
659 660
			/* Find info for buffer */
			for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
661
			{
T
Tom Lane 已提交
662
				if (rdt->buffer == dtbuf[i])
663
				{
664
					/* Buffer already referenced by earlier chain item */
T
Tom Lane 已提交
665 666 667 668 669
					if (dtbuf_bkp[i])
						rdt->data = NULL;
					else if (rdt->data)
					{
						len += rdt->len;
670
						COMP_CRC32(rdata_crc, rdt->data, rdt->len);
T
Tom Lane 已提交
671 672
					}
					break;
673
				}
T
Tom Lane 已提交
674
				if (dtbuf[i] == InvalidBuffer)
675
				{
T
Tom Lane 已提交
676 677
					/* OK, put it in this slot */
					dtbuf[i] = rdt->buffer;
678 679
					if (XLogCheckBuffer(rdt, doPageWrites,
										&(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
T
Tom Lane 已提交
680 681 682 683 684 685 686
					{
						dtbuf_bkp[i] = true;
						rdt->data = NULL;
					}
					else if (rdt->data)
					{
						len += rdt->len;
687
						COMP_CRC32(rdata_crc, rdt->data, rdt->len);
T
Tom Lane 已提交
688 689
					}
					break;
690 691
				}
			}
T
Tom Lane 已提交
692
			if (i >= XLR_MAX_BKP_BLOCKS)
693
				elog(PANIC, "can backup at most %d blocks per xlog record",
T
Tom Lane 已提交
694
					 XLR_MAX_BKP_BLOCKS);
695
		}
696
		/* Break out of loop when rdt points to last chain item */
697 698 699 700 701
		if (rdt->next == NULL)
			break;
		rdt = rdt->next;
	}

702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734
	/*
	 * Now add the backup block headers and data into the CRC
	 */
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		if (dtbuf_bkp[i])
		{
			BkpBlock   *bkpb = &(dtbuf_xlg[i]);
			char	   *page;

			COMP_CRC32(rdata_crc,
					   (char *) bkpb,
					   sizeof(BkpBlock));
			page = (char *) BufferGetBlock(dtbuf[i]);
			if (bkpb->hole_length == 0)
			{
				COMP_CRC32(rdata_crc,
						   page,
						   BLCKSZ);
			}
			else
			{
				/* must skip the hole */
				COMP_CRC32(rdata_crc,
						   page,
						   bkpb->hole_offset);
				COMP_CRC32(rdata_crc,
						   page + (bkpb->hole_offset + bkpb->hole_length),
						   BLCKSZ - (bkpb->hole_offset + bkpb->hole_length));
			}
		}
	}

T
Tom Lane 已提交
735
	/*
736 737
	 * NOTE: We disallow len == 0 because it provides a useful bit of extra
	 * error checking in ReadRecord.  This means that all callers of
B
Bruce Momjian 已提交
738 739 740
	 * XLogInsert must supply at least some not-in-a-buffer data.  However, we
	 * make an exception for XLOG SWITCH records because we don't want them to
	 * ever cross a segment boundary.
T
Tom Lane 已提交
741
	 */
742
	if (len == 0 && !isLogSwitch)
743
		elog(PANIC, "invalid xlog record length %u", len);
744

745
	START_CRIT_SECTION();
746

747 748 749
	/* Now wait to get insert lock */
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);

T
Tom Lane 已提交
750
	/*
B
Bruce Momjian 已提交
751 752 753
	 * Check to see if my RedoRecPtr is out of date.  If so, may have to go
	 * back and recompute everything.  This can only happen just after a
	 * checkpoint, so it's better to be slow in this case and fast otherwise.
754 755
	 *
	 * If we aren't doing full-page writes then RedoRecPtr doesn't actually
B
Bruce Momjian 已提交
756 757
	 * affect the contents of the XLOG record, so we'll update our local copy
	 * but not force a recomputation.
T
Tom Lane 已提交
758 759
	 */
	if (!XLByteEQ(RedoRecPtr, Insert->RedoRecPtr))
760
	{
T
Tom Lane 已提交
761 762 763
		Assert(XLByteLT(RedoRecPtr, Insert->RedoRecPtr));
		RedoRecPtr = Insert->RedoRecPtr;

764
		if (doPageWrites)
765
		{
766
			for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
T
Tom Lane 已提交
767
			{
768 769 770 771 772 773 774 775 776 777 778 779 780
				if (dtbuf[i] == InvalidBuffer)
					continue;
				if (dtbuf_bkp[i] == false &&
					XLByteLE(dtbuf_lsn[i], RedoRecPtr))
				{
					/*
					 * Oops, this buffer now needs to be backed up, but we
					 * didn't think so above.  Start over.
					 */
					LWLockRelease(WALInsertLock);
					END_CRIT_SECTION();
					goto begin;
				}
T
Tom Lane 已提交
781
			}
782 783 784
		}
	}

785
	/*
B
Bruce Momjian 已提交
786 787 788 789
	 * Also check to see if forcePageWrites was just turned on; if we weren't
	 * already doing full-page writes then go back and recompute. (If it was
	 * just turned off, we could recompute the record without full pages, but
	 * we choose not to bother.)
790 791 792 793 794 795 796 797 798
	 */
	if (Insert->forcePageWrites && !doPageWrites)
	{
		/* Oops, must redo it with full-page data */
		LWLockRelease(WALInsertLock);
		END_CRIT_SECTION();
		goto begin;
	}

T
Tom Lane 已提交
799
	/*
B
Bruce Momjian 已提交
800 801 802 803
	 * Make additional rdata chain entries for the backup blocks, so that we
	 * don't need to special-case them in the write loop.  Note that we have
	 * now irrevocably changed the input rdata chain.  At the exit of this
	 * loop, write_len includes the backup block data.
T
Tom Lane 已提交
804
	 *
805 806 807
	 * Also set the appropriate info bits to show which buffers were backed
	 * up. The i'th XLR_SET_BKP_BLOCK bit corresponds to the i'th distinct
	 * buffer value (ignoring InvalidBuffer) appearing in the rdata chain.
T
Tom Lane 已提交
808 809 810
	 */
	write_len = len;
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
811
	{
812 813 814
		BkpBlock   *bkpb;
		char	   *page;

815
		if (!dtbuf_bkp[i])
816 817
			continue;

T
Tom Lane 已提交
818
		info |= XLR_SET_BKP_BLOCK(i);
819

820 821 822 823 824
		bkpb = &(dtbuf_xlg[i]);
		page = (char *) BufferGetBlock(dtbuf[i]);

		rdt->next = &(dtbuf_rdt1[i]);
		rdt = rdt->next;
825

826 827
		rdt->data = (char *) bkpb;
		rdt->len = sizeof(BkpBlock);
T
Tom Lane 已提交
828
		write_len += sizeof(BkpBlock);
829

830 831
		rdt->next = &(dtbuf_rdt2[i]);
		rdt = rdt->next;
832

833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854
		if (bkpb->hole_length == 0)
		{
			rdt->data = page;
			rdt->len = BLCKSZ;
			write_len += BLCKSZ;
			rdt->next = NULL;
		}
		else
		{
			/* must skip the hole */
			rdt->data = page;
			rdt->len = bkpb->hole_offset;
			write_len += bkpb->hole_offset;

			rdt->next = &(dtbuf_rdt3[i]);
			rdt = rdt->next;

			rdt->data = page + (bkpb->hole_offset + bkpb->hole_length);
			rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length);
			write_len += rdt->len;
			rdt->next = NULL;
		}
855 856
	}

857 858 859 860 861 862 863
	/*
	 * If we backed up any full blocks and online backup is not in progress,
	 * mark the backup blocks as removable.  This allows the WAL archiver to
	 * know whether it is safe to compress archived WAL data by transforming
	 * full-block records into the non-full-block format.
	 *
	 * Note: we could just set the flag whenever !forcePageWrites, but
B
Bruce Momjian 已提交
864 865
	 * defining it like this leaves the info bit free for some potential other
	 * use in records without any backup blocks.
866 867 868 869
	 */
	if ((info & XLR_BKP_BLOCK_MASK) && !Insert->forcePageWrites)
		info |= XLR_BKP_REMOVABLE;

870
	/*
871
	 * If there isn't enough space on the current XLOG page for a record
B
Bruce Momjian 已提交
872
	 * header, advance to the next page (leaving the unused space as zeroes).
873
	 */
T
Tom Lane 已提交
874 875
	updrqst = false;
	freespace = INSERT_FREESPACE(Insert);
876 877
	if (freespace < SizeOfXLogRecord)
	{
878
		updrqst = AdvanceXLInsertBuffer(false);
879 880 881
		freespace = INSERT_FREESPACE(Insert);
	}

882
	/* Compute record's XLOG location */
T
Tom Lane 已提交
883
	curridx = Insert->curridx;
884 885 886
	INSERT_RECPTR(RecPtr, Insert, curridx);

	/*
B
Bruce Momjian 已提交
887 888 889 890 891
	 * If the record is an XLOG_SWITCH, and we are exactly at the start of a
	 * segment, we need not insert it (and don't want to because we'd like
	 * consecutive switch requests to be no-ops).  Instead, make sure
	 * everything is written and flushed through the end of the prior segment,
	 * and return the prior segment's end address.
892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922
	 */
	if (isLogSwitch &&
		(RecPtr.xrecoff % XLogSegSize) == SizeOfXLogLongPHD)
	{
		/* We can release insert lock immediately */
		LWLockRelease(WALInsertLock);

		RecPtr.xrecoff -= SizeOfXLogLongPHD;
		if (RecPtr.xrecoff == 0)
		{
			/* crossing a logid boundary */
			RecPtr.xlogid -= 1;
			RecPtr.xrecoff = XLogFileSize;
		}

		LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
		LogwrtResult = XLogCtl->Write.LogwrtResult;
		if (!XLByteLE(RecPtr, LogwrtResult.Flush))
		{
			XLogwrtRqst FlushRqst;

			FlushRqst.Write = RecPtr;
			FlushRqst.Flush = RecPtr;
			XLogWrite(FlushRqst, false, false);
		}
		LWLockRelease(WALWriteLock);

		END_CRIT_SECTION();

		return RecPtr;
	}
T
Tom Lane 已提交
923

924 925
	/* Insert record header */

926
	record = (XLogRecord *) Insert->currpos;
927
	record->xl_prev = Insert->PrevRecord;
928
	record->xl_xid = GetCurrentTransactionIdIfAny();
929
	record->xl_tot_len = SizeOfXLogRecord + write_len;
T
Tom Lane 已提交
930
	record->xl_len = len;		/* doesn't include backup blocks */
931
	record->xl_info = info;
932
	record->xl_rmid = rmid;
933

934 935 936 937
	/* Now we can finish computing the record's CRC */
	COMP_CRC32(rdata_crc, (char *) record + sizeof(pg_crc32),
			   SizeOfXLogRecord - sizeof(pg_crc32));
	FIN_CRC32(rdata_crc);
938 939
	record->xl_crc = rdata_crc;

940
#ifdef WAL_DEBUG
V
WAL  
Vadim B. Mikheev 已提交
941 942
	if (XLOG_DEBUG)
	{
B
Bruce Momjian 已提交
943
		StringInfoData buf;
V
WAL  
Vadim B. Mikheev 已提交
944

945
		initStringInfo(&buf);
946 947
		appendStringInfo(&buf, "INSERT @ %X/%X: ",
						 RecPtr.xlogid, RecPtr.xrecoff);
948
		xlog_outrec(&buf, record);
949
		if (rdata->data != NULL)
V
WAL  
Vadim B. Mikheev 已提交
950
		{
951 952
			appendStringInfo(&buf, " - ");
			RmgrTable[record->xl_rmid].rm_desc(&buf, record->xl_info, rdata->data);
V
WAL  
Vadim B. Mikheev 已提交
953
		}
954 955
		elog(LOG, "%s", buf.data);
		pfree(buf.data);
V
WAL  
Vadim B. Mikheev 已提交
956
	}
957
#endif
V
WAL  
Vadim B. Mikheev 已提交
958

T
Tom Lane 已提交
959 960 961 962
	/* Record begin of record in appropriate places */
	ProcLastRecPtr = RecPtr;
	Insert->PrevRecord = RecPtr;

963
	Insert->currpos += SizeOfXLogRecord;
T
Tom Lane 已提交
964
	freespace -= SizeOfXLogRecord;
965

T
Tom Lane 已提交
966 967 968 969
	/*
	 * Append the data, including backup blocks if any
	 */
	while (write_len)
970
	{
971 972 973 974
		while (rdata->data == NULL)
			rdata = rdata->next;

		if (freespace > 0)
975
		{
976 977 978 979 980
			if (rdata->len > freespace)
			{
				memcpy(Insert->currpos, rdata->data, freespace);
				rdata->data += freespace;
				rdata->len -= freespace;
T
Tom Lane 已提交
981
				write_len -= freespace;
982 983 984 985 986
			}
			else
			{
				memcpy(Insert->currpos, rdata->data, rdata->len);
				freespace -= rdata->len;
T
Tom Lane 已提交
987
				write_len -= rdata->len;
988 989 990 991
				Insert->currpos += rdata->len;
				rdata = rdata->next;
				continue;
			}
992 993
		}

994
		/* Use next buffer */
995
		updrqst = AdvanceXLInsertBuffer(false);
T
Tom Lane 已提交
996 997 998 999 1000 1001
		curridx = Insert->curridx;
		/* Insert cont-record header */
		Insert->currpage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
		contrecord = (XLogContRecord *) Insert->currpos;
		contrecord->xl_rem_len = write_len;
		Insert->currpos += SizeOfXLogContRecord;
1002
		freespace = INSERT_FREESPACE(Insert);
1003
	}
1004

T
Tom Lane 已提交
1005 1006
	/* Ensure next record will be properly aligned */
	Insert->currpos = (char *) Insert->currpage +
B
Bruce Momjian 已提交
1007
		MAXALIGN(Insert->currpos - (char *) Insert->currpage);
T
Tom Lane 已提交
1008
	freespace = INSERT_FREESPACE(Insert);
1009

V
Vadim B. Mikheev 已提交
1010
	/*
B
Bruce Momjian 已提交
1011 1012
	 * The recptr I return is the beginning of the *next* record. This will be
	 * stored as LSN for changed data pages...
V
Vadim B. Mikheev 已提交
1013
	 */
T
Tom Lane 已提交
1014
	INSERT_RECPTR(RecPtr, Insert, curridx);
V
Vadim B. Mikheev 已提交
1015

1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029
	/*
	 * If the record is an XLOG_SWITCH, we must now write and flush all the
	 * existing data, and then forcibly advance to the start of the next
	 * segment.  It's not good to do this I/O while holding the insert lock,
	 * but there seems too much risk of confusion if we try to release the
	 * lock sooner.  Fortunately xlog switch needn't be a high-performance
	 * operation anyway...
	 */
	if (isLogSwitch)
	{
		XLogCtlWrite *Write = &XLogCtl->Write;
		XLogwrtRqst FlushRqst;
		XLogRecPtr	OldSegEnd;

1030 1031
		TRACE_POSTGRESQL_XLOG_SWITCH();

1032 1033 1034
		LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);

		/*
B
Bruce Momjian 已提交
1035 1036
		 * Flush through the end of the page containing XLOG_SWITCH, and
		 * perform end-of-segment actions (eg, notifying archiver).
1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086
		 */
		WriteRqst = XLogCtl->xlblocks[curridx];
		FlushRqst.Write = WriteRqst;
		FlushRqst.Flush = WriteRqst;
		XLogWrite(FlushRqst, false, true);

		/* Set up the next buffer as first page of next segment */
		/* Note: AdvanceXLInsertBuffer cannot need to do I/O here */
		(void) AdvanceXLInsertBuffer(true);

		/* There should be no unwritten data */
		curridx = Insert->curridx;
		Assert(curridx == Write->curridx);

		/* Compute end address of old segment */
		OldSegEnd = XLogCtl->xlblocks[curridx];
		OldSegEnd.xrecoff -= XLOG_BLCKSZ;
		if (OldSegEnd.xrecoff == 0)
		{
			/* crossing a logid boundary */
			OldSegEnd.xlogid -= 1;
			OldSegEnd.xrecoff = XLogFileSize;
		}

		/* Make it look like we've written and synced all of old segment */
		LogwrtResult.Write = OldSegEnd;
		LogwrtResult.Flush = OldSegEnd;

		/*
		 * Update shared-memory status --- this code should match XLogWrite
		 */
		{
			/* use volatile pointer to prevent code rearrangement */
			volatile XLogCtlData *xlogctl = XLogCtl;

			SpinLockAcquire(&xlogctl->info_lck);
			xlogctl->LogwrtResult = LogwrtResult;
			if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
				xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
			if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
				xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
			SpinLockRelease(&xlogctl->info_lck);
		}

		Write->LogwrtResult = LogwrtResult;

		LWLockRelease(WALWriteLock);

		updrqst = false;		/* done already */
	}
1087
	else
1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103
	{
		/* normal case, ie not xlog switch */

		/* Need to update shared LogwrtRqst if some block was filled up */
		if (freespace < SizeOfXLogRecord)
		{
			/* curridx is filled and available for writing out */
			updrqst = true;
		}
		else
		{
			/* if updrqst already set, write through end of previous buf */
			curridx = PrevBufIdx(curridx);
		}
		WriteRqst = XLogCtl->xlblocks[curridx];
	}
1104

1105
	LWLockRelease(WALInsertLock);
1106 1107 1108

	if (updrqst)
	{
1109 1110 1111
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

1112
		SpinLockAcquire(&xlogctl->info_lck);
T
Tom Lane 已提交
1113
		/* advance global request to include new block(s) */
1114 1115
		if (XLByteLT(xlogctl->LogwrtRqst.Write, WriteRqst))
			xlogctl->LogwrtRqst.Write = WriteRqst;
T
Tom Lane 已提交
1116
		/* update local result copy while I have the chance */
1117
		LogwrtResult = xlogctl->LogwrtResult;
1118
		SpinLockRelease(&xlogctl->info_lck);
1119 1120
	}

1121
	XactLastRecEnd = RecPtr;
1122

1123
	END_CRIT_SECTION();
1124

1125
	return RecPtr;
1126
}
1127

1128
/*
1129 1130 1131
 * Determine whether the buffer referenced by an XLogRecData item has to
 * be backed up, and if so fill a BkpBlock struct for it.  In any case
 * save the buffer's LSN at *lsn.
1132
 */
1133
static bool
1134
XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
1135
				XLogRecPtr *lsn, BkpBlock *bkpb)
1136
{
1137
	Page		page;
1138

1139
	page = BufferGetPage(rdata->buffer);
1140 1141

	/*
B
Bruce Momjian 已提交
1142 1143 1144
	 * XXX We assume page LSN is first data on *every* page that can be passed
	 * to XLogInsert, whether it otherwise has the standard page layout or
	 * not.
1145
	 */
1146
	*lsn = PageGetLSN(page);
1147

1148
	if (doPageWrites &&
1149
		XLByteLE(PageGetLSN(page), RedoRecPtr))
1150
	{
1151 1152 1153
		/*
		 * The page needs to be backed up, so set up *bkpb
		 */
1154
		BufferGetTag(rdata->buffer, &bkpb->node, &bkpb->fork, &bkpb->block);
1155

1156 1157 1158
		if (rdata->buffer_std)
		{
			/* Assume we can omit data between pd_lower and pd_upper */
1159 1160
			uint16		lower = ((PageHeader) page)->pd_lower;
			uint16		upper = ((PageHeader) page)->pd_upper;
1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181

			if (lower >= SizeOfPageHeaderData &&
				upper > lower &&
				upper <= BLCKSZ)
			{
				bkpb->hole_offset = lower;
				bkpb->hole_length = upper - lower;
			}
			else
			{
				/* No "hole" to compress out */
				bkpb->hole_offset = 0;
				bkpb->hole_length = 0;
			}
		}
		else
		{
			/* Not a standard page header, don't try to eliminate "hole" */
			bkpb->hole_offset = 0;
			bkpb->hole_length = 0;
		}
1182

1183
		return true;			/* buffer requires backup */
1184
	}
1185 1186

	return false;				/* buffer does not need to be backed up */
1187 1188
}

1189 1190 1191 1192 1193 1194
/*
 * XLogArchiveNotify
 *
 * Create an archive notification file
 *
 * The name of the notification file is the message that will be picked up
1195
 * by the archiver, e.g. we write 0000000100000001000000C6.ready
1196
 * and the archiver then knows to archive XLOGDIR/0000000100000001000000C6,
1197
 * then when complete, rename it to 0000000100000001000000C6.done
1198 1199 1200 1201 1202
 */
static void
XLogArchiveNotify(const char *xlog)
{
	char		archiveStatusPath[MAXPGPATH];
B
Bruce Momjian 已提交
1203
	FILE	   *fd;
1204 1205 1206 1207

	/* insert an otherwise empty file called <XLOG>.ready */
	StatusFilePath(archiveStatusPath, xlog, ".ready");
	fd = AllocateFile(archiveStatusPath, "w");
B
Bruce Momjian 已提交
1208 1209
	if (fd == NULL)
	{
1210 1211 1212 1213 1214 1215
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not create archive status file \"%s\": %m",
						archiveStatusPath)));
		return;
	}
B
Bruce Momjian 已提交
1216 1217
	if (FreeFile(fd))
	{
1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not write archive status file \"%s\": %m",
						archiveStatusPath)));
		return;
	}

	/* Notify archiver that it's got something to do */
	if (IsUnderPostmaster)
		SendPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER);
}

/*
 * Convenience routine to notify using log/seg representation of filename
 *
 * The file name is built from ThisTimeLineID together with the given
 * log and seg numbers, then passed to XLogArchiveNotify.
 */
static void
XLogArchiveNotifySeg(uint32 log, uint32 seg)
{
	char		xlog[MAXFNAMELEN];

	XLogFileName(xlog, ThisTimeLineID, log, seg);
	XLogArchiveNotify(xlog);
}

/*
1243
 * XLogArchiveCheckDone
1244
 *
1245 1246 1247 1248
 * This is called when we are ready to delete or recycle an old XLOG segment
 * file or backup history file.  If it is okay to delete it then return true.
 * If it is not time to delete it, make sure a .ready file exists, and return
 * false.
1249 1250
 *
 * If <XLOG>.done exists, then return true; else if <XLOG>.ready exists,
1251 1252 1253 1254
 * then return false; else create <XLOG>.ready and return false.
 *
 * The reason we do things this way is so that if the original attempt to
 * create <XLOG>.ready fails, we'll retry during subsequent checkpoints.
1255 1256
 */
static bool
1257
XLogArchiveCheckDone(const char *xlog)
1258 1259 1260 1261
{
	char		archiveStatusPath[MAXPGPATH];
	struct stat stat_buf;

1262 1263 1264 1265 1266
	/* Always deletable if archiving is off */
	if (!XLogArchivingActive())
		return true;

	/* First check for .done --- this means archiver is done with it */
1267 1268 1269 1270 1271 1272 1273
	StatusFilePath(archiveStatusPath, xlog, ".done");
	if (stat(archiveStatusPath, &stat_buf) == 0)
		return true;

	/* check for .ready --- this means archiver is still busy with it */
	StatusFilePath(archiveStatusPath, xlog, ".ready");
	if (stat(archiveStatusPath, &stat_buf) == 0)
B
Bruce Momjian 已提交
1274
		return false;
1275 1276 1277 1278 1279 1280 1281

	/* Race condition --- maybe archiver just finished, so recheck */
	StatusFilePath(archiveStatusPath, xlog, ".done");
	if (stat(archiveStatusPath, &stat_buf) == 0)
		return true;

	/* Retry creation of the .ready file */
1282
	XLogArchiveNotify(xlog);
1283 1284 1285
	return false;
}

1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317
/*
 * XLogArchiveIsBusy
 *
 * Check to see if an XLOG segment file is still unarchived.
 * This is almost but not quite the inverse of XLogArchiveCheckDone: in
 * the first place we aren't chartered to recreate the .ready file, and
 * in the second place we should consider that if the file is already gone
 * then it's not busy.  (This check is needed to handle the race condition
 * that a checkpoint already deleted the no-longer-needed file.)
 *
 * Returns false when a .done file exists or the segment itself is missing;
 * returns true otherwise.
 */
static bool
XLogArchiveIsBusy(const char *xlog)
{
	char		archiveStatusPath[MAXPGPATH];
	struct stat stat_buf;

	/* First check for .done --- this means archiver is done with it */
	StatusFilePath(archiveStatusPath, xlog, ".done");
	if (stat(archiveStatusPath, &stat_buf) == 0)
		return false;

	/* check for .ready --- this means archiver is still busy with it */
	StatusFilePath(archiveStatusPath, xlog, ".ready");
	if (stat(archiveStatusPath, &stat_buf) == 0)
		return true;

	/* Race condition --- maybe archiver just finished, so recheck */
	StatusFilePath(archiveStatusPath, xlog, ".done");
	if (stat(archiveStatusPath, &stat_buf) == 0)
		return false;

	/*
	 * Check to see if the WAL file has been removed by checkpoint, which
	 * implies it has already been archived, and explains why we can't see a
	 * status file for it.  Only ENOENT counts as "removed"; any other stat
	 * failure leaves the answer at "busy".
	 */
	snprintf(archiveStatusPath, MAXPGPATH, XLOGDIR "/%s", xlog);
	if (stat(archiveStatusPath, &stat_buf) != 0 &&
		errno == ENOENT)
		return false;

	return true;
}

1330 1331 1332
/*
 * XLogArchiveCleanup
 *
1333
 * Cleanup archive notification file(s) for a particular xlog segment
1334 1335 1336 1337
 */
static void
XLogArchiveCleanup(const char *xlog)
{
B
Bruce Momjian 已提交
1338
	char		archiveStatusPath[MAXPGPATH];
1339

1340
	/* Remove the .done file */
1341 1342 1343
	StatusFilePath(archiveStatusPath, xlog, ".done");
	unlink(archiveStatusPath);
	/* should we complain about failure? */
1344 1345 1346 1347 1348

	/* Remove the .ready file if present --- normally it shouldn't be */
	StatusFilePath(archiveStatusPath, xlog, ".ready");
	unlink(archiveStatusPath);
	/* should we complain about failure? */
1349 1350
}

T
Tom Lane 已提交
1351 1352 1353 1354
/*
 * Advance the Insert state to the next buffer page, writing out the next
 * buffer if it still contains unwritten data.
 *
1355 1356 1357 1358
 * If new_segment is TRUE then we set up the next buffer page as the first
 * page of the next xlog segment file, possibly but not usually the next
 * consecutive file page.
 *
T
Tom Lane 已提交
1359
 * The global LogwrtRqst.Write pointer needs to be advanced to include the
1360
 * just-filled page.  If we can do this for free (without an extra lock),
T
Tom Lane 已提交
1361 1362 1363
 * we do so here.  Otherwise the caller must do it.  We return TRUE if the
 * request update still needs to be done, FALSE if we did it internally.
 *
1364
 * Must be called with WALInsertLock held.
T
Tom Lane 已提交
1365 1366
 */
static bool
1367
AdvanceXLInsertBuffer(bool new_segment)
1368
{
T
Tom Lane 已提交
1369 1370
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogCtlWrite *Write = &XLogCtl->Write;
1371
	int			nextidx = NextBufIdx(Insert->curridx);
T
Tom Lane 已提交
1372 1373 1374
	bool		update_needed = true;
	XLogRecPtr	OldPageRqstPtr;
	XLogwrtRqst WriteRqst;
1375 1376
	XLogRecPtr	NewPageEndPtr;
	XLogPageHeader NewPage;
1377

T
Tom Lane 已提交
1378 1379 1380
	/* Use Insert->LogwrtResult copy if it's more fresh */
	if (XLByteLT(LogwrtResult.Write, Insert->LogwrtResult.Write))
		LogwrtResult = Insert->LogwrtResult;
V
WAL  
Vadim B. Mikheev 已提交
1381

T
Tom Lane 已提交
1382
	/*
B
Bruce Momjian 已提交
1383 1384 1385
	 * Get ending-offset of the buffer page we need to replace (this may be
	 * zero if the buffer hasn't been used yet).  Fall through if it's already
	 * written out.
T
Tom Lane 已提交
1386 1387 1388 1389 1390 1391
	 */
	OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
	if (!XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
	{
		/* nope, got work to do... */
		XLogRecPtr	FinishedPageRqstPtr;
1392

T
Tom Lane 已提交
1393
		FinishedPageRqstPtr = XLogCtl->xlblocks[Insert->curridx];
1394

1395
		/* Before waiting, get info_lck and update LogwrtResult */
1396 1397 1398 1399
		{
			/* use volatile pointer to prevent code rearrangement */
			volatile XLogCtlData *xlogctl = XLogCtl;

1400
			SpinLockAcquire(&xlogctl->info_lck);
1401 1402 1403
			if (XLByteLT(xlogctl->LogwrtRqst.Write, FinishedPageRqstPtr))
				xlogctl->LogwrtRqst.Write = FinishedPageRqstPtr;
			LogwrtResult = xlogctl->LogwrtResult;
1404
			SpinLockRelease(&xlogctl->info_lck);
1405
		}
1406 1407 1408 1409 1410 1411 1412 1413 1414

		update_needed = false;	/* Did the shared-request update */

		if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
		{
			/* OK, someone wrote it already */
			Insert->LogwrtResult = LogwrtResult;
		}
		else
1415
		{
1416 1417 1418 1419
			/* Must acquire write lock */
			LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
			LogwrtResult = Write->LogwrtResult;
			if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
1420
			{
1421 1422 1423
				/* OK, someone wrote it already */
				LWLockRelease(WALWriteLock);
				Insert->LogwrtResult = LogwrtResult;
T
Tom Lane 已提交
1424
			}
1425
			else
T
Tom Lane 已提交
1426 1427
			{
				/*
B
Bruce Momjian 已提交
1428 1429
				 * Have to write buffers while holding insert lock. This is
				 * not good, so only write as much as we absolutely must.
T
Tom Lane 已提交
1430
				 */
1431
				TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
T
Tom Lane 已提交
1432 1433 1434
				WriteRqst.Write = OldPageRqstPtr;
				WriteRqst.Flush.xlogid = 0;
				WriteRqst.Flush.xrecoff = 0;
1435
				XLogWrite(WriteRqst, false, false);
1436
				LWLockRelease(WALWriteLock);
T
Tom Lane 已提交
1437
				Insert->LogwrtResult = LogwrtResult;
1438
				TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
1439 1440 1441 1442
			}
		}
	}

T
Tom Lane 已提交
1443
	/*
B
Bruce Momjian 已提交
1444 1445
	 * Now the next buffer slot is free and we can set it up to be the next
	 * output page.
T
Tom Lane 已提交
1446
	 */
1447
	NewPageEndPtr = XLogCtl->xlblocks[Insert->curridx];
1448 1449 1450 1451 1452 1453 1454 1455

	if (new_segment)
	{
		/* force it to a segment start point */
		NewPageEndPtr.xrecoff += XLogSegSize - 1;
		NewPageEndPtr.xrecoff -= NewPageEndPtr.xrecoff % XLogSegSize;
	}

1456
	if (NewPageEndPtr.xrecoff >= XLogFileSize)
1457
	{
T
Tom Lane 已提交
1458
		/* crossing a logid boundary */
1459
		NewPageEndPtr.xlogid += 1;
1460
		NewPageEndPtr.xrecoff = XLOG_BLCKSZ;
1461
	}
T
Tom Lane 已提交
1462
	else
1463
		NewPageEndPtr.xrecoff += XLOG_BLCKSZ;
1464
	XLogCtl->xlblocks[nextidx] = NewPageEndPtr;
1465
	NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
B
Bruce Momjian 已提交
1466

T
Tom Lane 已提交
1467
	Insert->curridx = nextidx;
1468
	Insert->currpage = NewPage;
B
Bruce Momjian 已提交
1469 1470

	Insert->currpos = ((char *) NewPage) +SizeOfXLogShortPHD;
B
Bruce Momjian 已提交
1471

T
Tom Lane 已提交
1472
	/*
B
Bruce Momjian 已提交
1473 1474
	 * Be sure to re-zero the buffer so that bytes beyond what we've written
	 * will look like zeroes and not valid XLOG records...
T
Tom Lane 已提交
1475
	 */
1476
	MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
1477

1478 1479 1480
	/*
	 * Fill the new page's header
	 */
B
Bruce Momjian 已提交
1481 1482
	NewPage   ->xlp_magic = XLOG_PAGE_MAGIC;

1483
	/* NewPage->xlp_info = 0; */	/* done by memset */
B
Bruce Momjian 已提交
1484 1485
	NewPage   ->xlp_tli = ThisTimeLineID;
	NewPage   ->xlp_pageaddr.xlogid = NewPageEndPtr.xlogid;
1486
	NewPage   ->xlp_pageaddr.xrecoff = NewPageEndPtr.xrecoff - XLOG_BLCKSZ;
T
Tom Lane 已提交
1487

1488
	/*
1489
	 * If first page of an XLOG segment file, make it a long header.
1490 1491 1492
	 */
	if ((NewPage->xlp_pageaddr.xrecoff % XLogSegSize) == 0)
	{
1493
		XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
1494

1495 1496
		NewLongPage->xlp_sysid = ControlFile->system_identifier;
		NewLongPage->xlp_seg_size = XLogSegSize;
1497
		NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
B
Bruce Momjian 已提交
1498 1499 1500
		NewPage   ->xlp_info |= XLP_LONG_HEADER;

		Insert->currpos = ((char *) NewPage) +SizeOfXLogLongPHD;
1501 1502
	}

T
Tom Lane 已提交
1503
	return update_needed;
1504 1505
}

1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518
/*
 * Check whether we've consumed enough xlog space that a checkpoint is needed.
 *
 * Caller must have just finished filling the open log file (so that
 * openLogId/openLogSeg are valid).  We measure the distance from RedoRecPtr
 * to the open log file and see if that exceeds CheckPointSegments.
 *
 * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
 */
static bool
XLogCheckpointNeeded(void)
{
	/*
1519 1520
	 * A straight computation of segment number could overflow 32 bits. Rather
	 * than assuming we have working 64-bit arithmetic, we compare the
B
Bruce Momjian 已提交
1521 1522
	 * highest-order bits separately, and force a checkpoint immediately when
	 * they change.
1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534
	 */
	uint32		old_segno,
				new_segno;
	uint32		old_highbits,
				new_highbits;

	old_segno = (RedoRecPtr.xlogid % XLogSegSize) * XLogSegsPerFile +
		(RedoRecPtr.xrecoff / XLogSegSize);
	old_highbits = RedoRecPtr.xlogid / XLogSegSize;
	new_segno = (openLogId % XLogSegSize) * XLogSegsPerFile + openLogSeg;
	new_highbits = openLogId / XLogSegSize;
	if (new_highbits != old_highbits ||
B
Bruce Momjian 已提交
1535
		new_segno >= old_segno + (uint32) (CheckPointSegments - 1))
1536 1537 1538 1539
		return true;
	return false;
}

T
Tom Lane 已提交
1540 1541 1542
/*
 * Write and/or fsync the log at least as far as WriteRqst indicates.
 *
1543 1544 1545 1546 1547
 * If flexible == TRUE, we don't have to write as far as WriteRqst, but
 * may stop at any convenient boundary (such as a cache or logfile boundary).
 * This option allows us to avoid uselessly issuing multiple writes when a
 * single one would do.
 *
1548 1549 1550 1551 1552 1553
 * If xlog_switch == TRUE, we are intending an xlog segment switch, so
 * perform end-of-segment actions after writing the last page, even if
 * it's not physically the end of its segment.  (NB: this will work properly
 * only if caller specifies WriteRqst == page-end and flexible == false,
 * and there is some data to write.)
 *
1554
 * Must be called with WALWriteLock held.
T
Tom Lane 已提交
1555
 */
1556
static void
1557
XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch)
1558
{
1559
	XLogCtlWrite *Write = &XLogCtl->Write;
T
Tom Lane 已提交
1560
	bool		ispartialpage;
1561
	bool		last_iteration;
1562
	bool		finishing_seg;
1563
	bool		use_existent;
1564 1565 1566 1567
	int			curridx;
	int			npages;
	int			startidx;
	uint32		startoffset;
1568

1569 1570 1571
	/* We should always be inside a critical section here */
	Assert(CritSectionCount > 0);

B
Bruce Momjian 已提交
1572
	/*
B
Bruce Momjian 已提交
1573
	 * Update local LogwrtResult (caller probably did this already, but...)
B
Bruce Momjian 已提交
1574
	 */
T
Tom Lane 已提交
1575 1576
	LogwrtResult = Write->LogwrtResult;

1577 1578 1579
	/*
	 * Since successive pages in the xlog cache are consecutively allocated,
	 * we can usually gather multiple pages together and issue just one
B
Bruce Momjian 已提交
1580 1581 1582 1583 1584
	 * write() call.  npages is the number of pages we have determined can be
	 * written together; startidx is the cache block index of the first one,
	 * and startoffset is the file offset at which it should go. The latter
	 * two variables are only valid when npages > 0, but we must initialize
	 * all of them to keep the compiler quiet.
1585 1586 1587 1588 1589 1590 1591 1592 1593
	 */
	npages = 0;
	startidx = 0;
	startoffset = 0;

	/*
	 * Within the loop, curridx is the cache block index of the page to
	 * consider writing.  We advance Write->curridx only after successfully
	 * writing pages.  (Right now, this refinement is useless since we are
B
Bruce Momjian 已提交
1594 1595
	 * going to PANIC if any error occurs anyway; but someday it may come in
	 * useful.)
1596 1597
	 */
	curridx = Write->curridx;
B
 
Bruce Momjian 已提交
1598

T
Tom Lane 已提交
1599
	while (XLByteLT(LogwrtResult.Write, WriteRqst.Write))
1600
	{
1601
		/*
B
Bruce Momjian 已提交
1602 1603 1604
		 * Make sure we're not ahead of the insert process.  This could happen
		 * if we're passed a bogus WriteRqst.Write that is past the end of the
		 * last page that's been initialized by AdvanceXLInsertBuffer.
1605
		 */
1606
		if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[curridx]))
1607
			elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
1608
				 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
1609 1610
				 XLogCtl->xlblocks[curridx].xlogid,
				 XLogCtl->xlblocks[curridx].xrecoff);
1611

T
Tom Lane 已提交
1612
		/* Advance LogwrtResult.Write to end of current buffer page */
1613
		LogwrtResult.Write = XLogCtl->xlblocks[curridx];
T
Tom Lane 已提交
1614 1615 1616
		ispartialpage = XLByteLT(WriteRqst.Write, LogwrtResult.Write);

		if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1617
		{
T
Tom Lane 已提交
1618
			/*
1619 1620
			 * Switch to new logfile segment.  We cannot have any pending
			 * pages here (since we dump what we have at segment end).
T
Tom Lane 已提交
1621
			 */
1622
			Assert(npages == 0);
T
Tom Lane 已提交
1623
			if (openLogFile >= 0)
1624
				XLogFileClose();
T
Tom Lane 已提交
1625 1626
			XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);

1627 1628 1629 1630
			/* create/use new log file */
			use_existent = true;
			openLogFile = XLogFileInit(openLogId, openLogSeg,
									   &use_existent, true);
T
Tom Lane 已提交
1631
			openLogOff = 0;
1632 1633
		}

1634
		/* Make sure we have the current logfile open */
T
Tom Lane 已提交
1635
		if (openLogFile < 0)
1636
		{
T
Tom Lane 已提交
1637
			XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1638
			openLogFile = XLogFileOpen(openLogId, openLogSeg);
T
Tom Lane 已提交
1639
			openLogOff = 0;
1640 1641
		}

1642 1643 1644 1645 1646
		/* Add current page to the set of pending pages-to-dump */
		if (npages == 0)
		{
			/* first of group */
			startidx = curridx;
1647
			startoffset = (LogwrtResult.Write.xrecoff - XLOG_BLCKSZ) % XLogSegSize;
1648 1649
		}
		npages++;
1650

T
Tom Lane 已提交
1651
		/*
B
Bruce Momjian 已提交
1652 1653 1654 1655
		 * Dump the set if this will be the last loop iteration, or if we are
		 * at the last page of the cache area (since the next page won't be
		 * contiguous in memory), or if we are at the end of the logfile
		 * segment.
T
Tom Lane 已提交
1656
		 */
1657 1658
		last_iteration = !XLByteLT(LogwrtResult.Write, WriteRqst.Write);

1659
		finishing_seg = !ispartialpage &&
1660
			(startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
1661

1662
		if (last_iteration ||
1663 1664
			curridx == XLogCtl->XLogCacheBlck ||
			finishing_seg)
T
Tom Lane 已提交
1665
		{
1666 1667
			char	   *from;
			Size		nbytes;
1668

1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681
			/* Need to seek in the file? */
			if (openLogOff != startoffset)
			{
				if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
					ereport(PANIC,
							(errcode_for_file_access(),
							 errmsg("could not seek in log file %u, "
									"segment %u to offset %u: %m",
									openLogId, openLogSeg, startoffset)));
				openLogOff = startoffset;
			}

			/* OK to write the page(s) */
1682 1683
			from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
			nbytes = npages * (Size) XLOG_BLCKSZ;
1684 1685 1686 1687 1688 1689 1690 1691 1692
			errno = 0;
			if (write(openLogFile, from, nbytes) != nbytes)
			{
				/* if write didn't set errno, assume no disk space */
				if (errno == 0)
					errno = ENOSPC;
				ereport(PANIC,
						(errcode_for_file_access(),
						 errmsg("could not write to log file %u, segment %u "
P
Peter Eisentraut 已提交
1693
								"at offset %u, length %lu: %m",
1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709
								openLogId, openLogSeg,
								openLogOff, (unsigned long) nbytes)));
			}

			/* Update state for write */
			openLogOff += nbytes;
			Write->curridx = ispartialpage ? curridx : NextBufIdx(curridx);
			npages = 0;

			/*
			 * If we just wrote the whole last page of a logfile segment,
			 * fsync the segment immediately.  This avoids having to go back
			 * and re-open prior segments when an fsync request comes along
			 * later. Doing it here ensures that one and only one backend will
			 * perform this fsync.
			 *
1710 1711 1712
			 * We also do this if this is the last page written for an xlog
			 * switch.
			 *
1713
			 * This is also the right place to notify the Archiver that the
B
Bruce Momjian 已提交
1714
			 * segment is ready to copy to archival storage, and to update the
1715 1716 1717
			 * timer for archive_timeout, and to signal for a checkpoint if
			 * too many logfile segments have been used since the last
			 * checkpoint.
1718
			 */
1719
			if (finishing_seg || (xlog_switch && last_iteration))
1720
			{
1721
				issue_xlog_fsync(openLogFile, openLogId, openLogSeg);
B
Bruce Momjian 已提交
1722
				LogwrtResult.Flush = LogwrtResult.Write;		/* end of page */
1723 1724 1725

				if (XLogArchivingActive())
					XLogArchiveNotifySeg(openLogId, openLogSeg);
1726

1727
				Write->lastSegSwitchTime = (pg_time_t) time(NULL);
1728 1729

				/*
1730
				 * Signal bgwriter to start a checkpoint if we've consumed too
1731
				 * much xlog since the last one.  For speed, we first check
B
Bruce Momjian 已提交
1732 1733 1734
				 * using the local copy of RedoRecPtr, which might be out of
				 * date; if it looks like a checkpoint is needed, forcibly
				 * update RedoRecPtr and recheck.
1735
				 */
1736 1737
				if (IsUnderPostmaster &&
					XLogCheckpointNeeded())
1738
				{
1739 1740
					(void) GetRedoRecPtr();
					if (XLogCheckpointNeeded())
1741
						RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
1742
				}
1743
			}
T
Tom Lane 已提交
1744
		}
1745

T
Tom Lane 已提交
1746 1747 1748 1749 1750 1751
		if (ispartialpage)
		{
			/* Only asked to write a partial page */
			LogwrtResult.Write = WriteRqst.Write;
			break;
		}
1752 1753 1754 1755 1756
		curridx = NextBufIdx(curridx);

		/* If flexible, break out of loop as soon as we wrote something */
		if (flexible && npages == 0)
			break;
1757
	}
1758 1759 1760

	Assert(npages == 0);
	Assert(curridx == Write->curridx);
1761

T
Tom Lane 已提交
1762 1763 1764 1765 1766
	/*
	 * If asked to flush, do so
	 */
	if (XLByteLT(LogwrtResult.Flush, WriteRqst.Flush) &&
		XLByteLT(LogwrtResult.Flush, LogwrtResult.Write))
1767
	{
T
Tom Lane 已提交
1768
		/*
B
Bruce Momjian 已提交
1769 1770 1771
		 * Could get here without iterating above loop, in which case we might
		 * have no open file or the wrong one.	However, we do not need to
		 * fsync more than one file.
T
Tom Lane 已提交
1772
		 */
1773 1774
		if (sync_method != SYNC_METHOD_OPEN &&
			sync_method != SYNC_METHOD_OPEN_DSYNC)
T
Tom Lane 已提交
1775
		{
1776
			if (openLogFile >= 0 &&
B
Bruce Momjian 已提交
1777
				!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1778
				XLogFileClose();
1779 1780 1781
			if (openLogFile < 0)
			{
				XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1782
				openLogFile = XLogFileOpen(openLogId, openLogSeg);
1783 1784
				openLogOff = 0;
			}
1785
			issue_xlog_fsync(openLogFile, openLogId, openLogSeg);
T
Tom Lane 已提交
1786 1787
		}
		LogwrtResult.Flush = LogwrtResult.Write;
1788 1789
	}

T
Tom Lane 已提交
1790 1791 1792
	/*
	 * Update shared-memory status
	 *
B
Bruce Momjian 已提交
1793
	 * We make sure that the shared 'request' values do not fall behind the
B
Bruce Momjian 已提交
1794 1795
	 * 'result' values.  This is not absolutely essential, but it saves some
	 * code in a couple of places.
T
Tom Lane 已提交
1796
	 */
1797 1798 1799 1800
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

1801
		SpinLockAcquire(&xlogctl->info_lck);
1802 1803 1804 1805 1806
		xlogctl->LogwrtResult = LogwrtResult;
		if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
			xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
		if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
			xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
1807
		SpinLockRelease(&xlogctl->info_lck);
1808
	}
1809

T
Tom Lane 已提交
1810 1811 1812
	Write->LogwrtResult = LogwrtResult;
}

1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828
/*
 * Record the LSN for an asynchronous transaction commit.
 * (This should not be called for aborts, nor for synchronous commits.)
 */
void
XLogSetAsyncCommitLSN(XLogRecPtr asyncCommitLSN)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

	SpinLockAcquire(&xlogctl->info_lck);
	if (XLByteLT(xlogctl->asyncCommitLSN, asyncCommitLSN))
		xlogctl->asyncCommitLSN = asyncCommitLSN;
	SpinLockRelease(&xlogctl->info_lck);
}

1829 1830 1831 1832
/*
 * Advance minRecoveryPoint in control file.
 *
 * If we crash during recovery, we must reach this point again before the
1833 1834
 * database is consistent.
 *
1835
 * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
1836
 * is only updated if it's not already greater than or equal to 'lsn'.
1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851
 */
static void
UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
{
	/* Quick check using our local copy of the variable */
	if (!updateMinRecoveryPoint || (!force && XLByteLE(lsn, minRecoveryPoint)))
		return;

	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

	/* update local copy */
	minRecoveryPoint = ControlFile->minRecoveryPoint;

	/*
	 * An invalid minRecoveryPoint means that we need to recover all the WAL,
1852 1853
	 * i.e., we're doing crash recovery.  We never modify the control file's
	 * value in that case, so we can short-circuit future checks here too.
1854 1855 1856 1857 1858 1859 1860
	 */
	if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
		updateMinRecoveryPoint = false;
	else if (force || XLByteLT(minRecoveryPoint, lsn))
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;
1861
		XLogRecPtr	newMinRecoveryPoint;
1862 1863 1864 1865

		/*
		 * To avoid having to update the control file too often, we update it
		 * all the way to the last record being replayed, even though 'lsn'
1866 1867 1868 1869 1870 1871 1872 1873 1874
		 * would suffice for correctness.  This also allows the 'force' case
		 * to not need a valid 'lsn' value.
		 *
		 * Another important reason for doing it this way is that the passed
		 * 'lsn' value could be bogus, i.e., past the end of available WAL,
		 * if the caller got it from a corrupted heap page.  Accepting such
		 * a value as the min recovery point would prevent us from coming up
		 * at all.  Instead, we just log a warning and continue with recovery.
		 * (See also the comments about corrupt LSNs in XLogFlush.)
1875 1876 1877 1878 1879
		 */
		SpinLockAcquire(&xlogctl->info_lck);
		newMinRecoveryPoint = xlogctl->replayEndRecPtr;
		SpinLockRelease(&xlogctl->info_lck);

1880 1881 1882 1883 1884 1885
		if (!force && XLByteLT(newMinRecoveryPoint, lsn))
			elog(WARNING,
				 "xlog min recovery request %X/%X is past current point %X/%X",
				 lsn.xlogid, lsn.xrecoff,
				 newMinRecoveryPoint.xlogid, newMinRecoveryPoint.xrecoff);

1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900
		/* update control file */
		if (XLByteLT(ControlFile->minRecoveryPoint, newMinRecoveryPoint))
		{
			ControlFile->minRecoveryPoint = newMinRecoveryPoint;
			UpdateControlFile();
			minRecoveryPoint = newMinRecoveryPoint;

			ereport(DEBUG2,
					(errmsg("updated min recovery point to %X/%X",
						minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff)));
		}
	}
	LWLockRelease(ControlFileLock);
}

T
Tom Lane 已提交
1901 1902 1903
/*
 * Ensure that all XLOG data through the given position is flushed to disk.
 *
1904
 * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
T
Tom Lane 已提交
1905 1906 1907 1908 1909 1910 1911 1912
 * already held, and we try to avoid acquiring it if possible.
 */
void
XLogFlush(XLogRecPtr record)
{
	XLogRecPtr	WriteRqstPtr;
	XLogwrtRqst WriteRqst;

1913
	/*
1914 1915 1916 1917 1918
	 * During REDO, we are reading not writing WAL.  Therefore, instead of
	 * trying to flush the WAL, we should update minRecoveryPoint instead.
	 * We test XLogInsertAllowed(), not InRecovery, because we need the
	 * bgwriter to act this way too, and because when the bgwriter tries
	 * to write the end-of-recovery checkpoint, it should indeed flush.
1919
	 */
1920
	if (!XLogInsertAllowed())
1921 1922
	{
		UpdateMinRecoveryPoint(record, false);
T
Tom Lane 已提交
1923
		return;
1924
	}
T
Tom Lane 已提交
1925 1926 1927 1928 1929

	/* Quick exit if already known flushed */
	if (XLByteLE(record, LogwrtResult.Flush))
		return;

1930
#ifdef WAL_DEBUG
1931
	if (XLOG_DEBUG)
1932
		elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
1933 1934 1935
			 record.xlogid, record.xrecoff,
			 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
1936
#endif
1937

T
Tom Lane 已提交
1938 1939 1940 1941
	START_CRIT_SECTION();

	/*
	 * Since fsync is usually a horribly expensive operation, we try to
B
Bruce Momjian 已提交
1942 1943 1944 1945
	 * piggyback as much data as we can on each fsync: if we see any more data
	 * entered into the xlog buffer, we'll write and fsync that too, so that
	 * the final value of LogwrtResult.Flush is as large as possible. This
	 * gives us some chance of avoiding another fsync immediately after.
T
Tom Lane 已提交
1946 1947 1948 1949 1950
	 */

	/* initialize to given target; may increase below */
	WriteRqstPtr = record;

1951
	/* read LogwrtResult and update local state */
1952 1953 1954 1955
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

1956
		SpinLockAcquire(&xlogctl->info_lck);
1957 1958 1959
		if (XLByteLT(WriteRqstPtr, xlogctl->LogwrtRqst.Write))
			WriteRqstPtr = xlogctl->LogwrtRqst.Write;
		LogwrtResult = xlogctl->LogwrtResult;
1960
		SpinLockRelease(&xlogctl->info_lck);
1961
	}
1962 1963 1964

	/* done already? */
	if (!XLByteLE(record, LogwrtResult.Flush))
T
Tom Lane 已提交
1965
	{
1966 1967 1968 1969
		/* now wait for the write lock */
		LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
		LogwrtResult = XLogCtl->Write.LogwrtResult;
		if (!XLByteLE(record, LogwrtResult.Flush))
T
Tom Lane 已提交
1970
		{
1971 1972 1973 1974 1975 1976
			/* try to write/flush later additions to XLOG as well */
			if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
			{
				XLogCtlInsert *Insert = &XLogCtl->Insert;
				uint32		freespace = INSERT_FREESPACE(Insert);

B
Bruce Momjian 已提交
1977
				if (freespace < SizeOfXLogRecord)		/* buffer is full */
1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992
					WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
				else
				{
					WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
					WriteRqstPtr.xrecoff -= freespace;
				}
				LWLockRelease(WALInsertLock);
				WriteRqst.Write = WriteRqstPtr;
				WriteRqst.Flush = WriteRqstPtr;
			}
			else
			{
				WriteRqst.Write = WriteRqstPtr;
				WriteRqst.Flush = record;
			}
1993
			XLogWrite(WriteRqst, false, false);
T
Tom Lane 已提交
1994
		}
1995
		LWLockRelease(WALWriteLock);
T
Tom Lane 已提交
1996 1997 1998
	}

	END_CRIT_SECTION();
1999 2000 2001

	/*
	 * If we still haven't flushed to the request point then we have a
B
Bruce Momjian 已提交
2002 2003
	 * problem; most likely, the requested flush point is past end of XLOG.
	 * This has been seen to occur when a disk page has a corrupted LSN.
2004
	 *
2005 2006 2007 2008
	 * Formerly we treated this as a PANIC condition, but that hurts the
	 * system's robustness rather than helping it: we do not want to take down
	 * the whole system due to corruption on one data page.  In particular, if
	 * the bad page is encountered again during recovery then we would be
2009 2010 2011 2012 2013 2014
	 * unable to restart the database at all!  (This scenario actually
	 * happened in the field several times with 7.1 releases.)  As of 8.4,
	 * bad LSNs encountered during recovery are UpdateMinRecoveryPoint's
	 * problem; the only time we can reach here during recovery is while
	 * flushing the end-of-recovery checkpoint record, and we don't expect
	 * that to have a bad LSN.
2015
	 *
2016
	 * Note that for calls from xact.c, the ERROR will
2017
	 * be promoted to PANIC since xact.c calls this routine inside a critical
B
Bruce Momjian 已提交
2018 2019
	 * section.  However, calls from bufmgr.c are not within critical sections
	 * and so we will not force a restart for a bad LSN on a data page.
2020 2021
	 */
	if (XLByteLT(LogwrtResult.Flush, record))
2022
		elog(ERROR,
B
Bruce Momjian 已提交
2023
		"xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
2024 2025
			 record.xlogid, record.xrecoff,
			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
2026 2027
}

2028 2029 2030 2031 2032 2033
/*
 * Flush xlog, but without specifying exactly where to flush to.
 *
 * We normally flush only completed blocks; but if there is nothing to do on
 * that basis, we check for unflushed async commits in the current incomplete
 * block, and flush through the latest one of those.  Thus, if async commits
B
Bruce Momjian 已提交
2034
 * are not being used, we will flush complete blocks only.	We can guarantee
2035
 * that async commits reach disk after at most three cycles; normally only
B
Bruce Momjian 已提交
2036
 * one or two.	(We allow XLogWrite to write "flexibly", meaning it can stop
2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048
 * at the end of the buffer ring; this makes a difference only with very high
 * load or long wal_writer_delay, but imposes one extra cycle for the worst
 * case for async commits.)
 *
 * This routine is invoked periodically by the background walwriter process.
 */
void
XLogBackgroundFlush(void)
{
	XLogRecPtr	WriteRqstPtr;
	bool		flexible = true;

2049 2050 2051 2052
	/* XLOG doesn't need flushing during recovery */
	if (RecoveryInProgress())
		return;

2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072
	/* read LogwrtResult and update local state */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		LogwrtResult = xlogctl->LogwrtResult;
		WriteRqstPtr = xlogctl->LogwrtRqst.Write;
		SpinLockRelease(&xlogctl->info_lck);
	}

	/* back off to last completed page boundary */
	WriteRqstPtr.xrecoff -= WriteRqstPtr.xrecoff % XLOG_BLCKSZ;

	/* if we have already flushed that far, consider async commit records */
	if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

2073
		SpinLockAcquire(&xlogctl->info_lck);
2074
		WriteRqstPtr = xlogctl->asyncCommitLSN;
2075
		SpinLockRelease(&xlogctl->info_lck);
2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108
		flexible = false;		/* ensure it all gets written */
	}

	/* Done if already known flushed */
	if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
		return;

#ifdef WAL_DEBUG
	if (XLOG_DEBUG)
		elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
			 WriteRqstPtr.xlogid, WriteRqstPtr.xrecoff,
			 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
#endif

	START_CRIT_SECTION();

	/* now wait for the write lock */
	LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
	LogwrtResult = XLogCtl->Write.LogwrtResult;
	if (!XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
	{
		XLogwrtRqst WriteRqst;

		WriteRqst.Write = WriteRqstPtr;
		WriteRqst.Flush = WriteRqstPtr;
		XLogWrite(WriteRqst, flexible, false);
	}
	LWLockRelease(WALWriteLock);

	END_CRIT_SECTION();
}

2109 2110 2111 2112 2113 2114 2115 2116 2117
/*
 * Test whether XLOG data has been flushed up to (at least) the given position.
 *
 * Returns true if a flush is still needed.  (It may be that someone else
 * is already in process of flushing that far, however.)
 */
bool
XLogNeedsFlush(XLogRecPtr record)
{
2118 2119 2120 2121 2122
	/*
	 * During recovery, we don't flush WAL but update minRecoveryPoint
	 * instead. So "needs flush" is taken to mean whether minRecoveryPoint
	 * would need to be updated.
	 */
2123
	if (RecoveryInProgress())
2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151
	{
		/* Quick exit if already known updated */
		if (XLByteLE(record, minRecoveryPoint) || !updateMinRecoveryPoint)
			return false;

		/*
		 * Update local copy of minRecoveryPoint. But if the lock is busy,
		 * just return a conservative guess.
		 */
		if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
			return true;
		minRecoveryPoint = ControlFile->minRecoveryPoint;
		LWLockRelease(ControlFileLock);

		/*
		 * An invalid minRecoveryPoint means that we need to recover all the WAL,
		 * i.e., we're doing crash recovery.  We never modify the control file's
		 * value in that case, so we can short-circuit future checks here too.
		 */
		if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
			updateMinRecoveryPoint = false;

		/* check again */
		if (XLByteLE(record, minRecoveryPoint) || !updateMinRecoveryPoint)
			return false;
		else
			return true;
	}
2152

2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173
	/* Quick exit if already known flushed */
	if (XLByteLE(record, LogwrtResult.Flush))
		return false;

	/* read LogwrtResult and update local state */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		LogwrtResult = xlogctl->LogwrtResult;
		SpinLockRelease(&xlogctl->info_lck);
	}

	/* check again */
	if (XLByteLE(record, LogwrtResult.Flush))
		return false;

	return true;
}

T
Tom Lane 已提交
2174 2175 2176
/*
 * Create a new XLOG file segment, or open a pre-existing one.
 *
2177 2178 2179
 * log, seg: identify segment to be created/opened.
 *
 * *use_existent: if TRUE, OK to use a pre-existing file (else, any
B
Bruce Momjian 已提交
2180
 * pre-existing file will be deleted).	On return, TRUE if a pre-existing
2181 2182
 * file was used.
 *
2183
 * use_lock: if TRUE, acquire ControlFileLock while moving file into
2184
 * place.  This should be TRUE except during bootstrap log creation.  The
2185
 * caller must *not* hold the lock at call.
2186
 *
T
Tom Lane 已提交
2187
 * Returns FD of opened file.
2188 2189 2190 2191 2192
 *
 * Note: errors here are ERROR not PANIC because we might or might not be
 * inside a critical section (eg, during checkpoint there is no reason to
 * take down the system on failure).  They will promote to PANIC if we are
 * in a critical section.
T
Tom Lane 已提交
2193
 */
2194
int
2195 2196
XLogFileInit(uint32 log, uint32 seg,
			 bool *use_existent, bool use_lock)
2197
{
2198
	char		path[MAXPGPATH];
2199
	char		tmppath[MAXPGPATH];
2200
	char	   *zbuffer;
2201 2202 2203
	uint32		installed_log;
	uint32		installed_seg;
	int			max_advance;
2204
	int			fd;
2205
	int			nbytes;
2206

2207
	XLogFilePath(path, ThisTimeLineID, log, seg);
V
Vadim B. Mikheev 已提交
2208 2209

	/*
B
Bruce Momjian 已提交
2210
	 * Try to use existent file (checkpoint maker may have created it already)
V
Vadim B. Mikheev 已提交
2211
	 */
2212
	if (*use_existent)
V
Vadim B. Mikheev 已提交
2213
	{
2214
		fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2215
						   S_IRUSR | S_IWUSR);
V
Vadim B. Mikheev 已提交
2216 2217 2218
		if (fd < 0)
		{
			if (errno != ENOENT)
2219
				ereport(ERROR,
2220
						(errcode_for_file_access(),
2221
						 errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2222
								path, log, seg)));
V
Vadim B. Mikheev 已提交
2223 2224
		}
		else
2225
			return fd;
V
Vadim B. Mikheev 已提交
2226 2227
	}

2228
	/*
B
Bruce Momjian 已提交
2229 2230 2231 2232
	 * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
	 * another process is doing the same thing.  If so, we will end up
	 * pre-creating an extra log segment.  That seems OK, and better than
	 * holding the lock throughout this lengthy process.
2233
	 */
2234 2235
	elog(DEBUG2, "creating and filling new WAL file");

2236
	snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2237 2238

	unlink(tmppath);
2239

2240
	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
2241
	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
T
Tom Lane 已提交
2242
					   S_IRUSR | S_IWUSR);
2243
	if (fd < 0)
2244
		ereport(ERROR,
2245
				(errcode_for_file_access(),
2246
				 errmsg("could not create file \"%s\": %m", tmppath)));
2247

2248
	/*
B
Bruce Momjian 已提交
2249 2250 2251 2252 2253 2254 2255
	 * Zero-fill the file.	We have to do this the hard way to ensure that all
	 * the file space has really been allocated --- on platforms that allow
	 * "holes" in files, just seeking to the end doesn't allocate intermediate
	 * space.  This way, we know that we have all the space and (after the
	 * fsync below) that all the indirect blocks are down on disk.	Therefore,
	 * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
	 * log file.
2256 2257 2258 2259
	 *
	 * Note: palloc zbuffer, instead of just using a local char array, to
	 * ensure it is reasonably well-aligned; this may save a few cycles
	 * transferring data to the kernel.
2260
	 */
2261 2262
	zbuffer = (char *) palloc0(XLOG_BLCKSZ);
	for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
2263
	{
2264
		errno = 0;
2265
		if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
T
Tom Lane 已提交
2266
		{
B
Bruce Momjian 已提交
2267
			int			save_errno = errno;
T
Tom Lane 已提交
2268

B
Bruce Momjian 已提交
2269
			/*
B
Bruce Momjian 已提交
2270
			 * If we fail to make the file, delete it to release disk space
B
Bruce Momjian 已提交
2271
			 */
2272
			unlink(tmppath);
2273 2274
			/* if write didn't set errno, assume problem is no disk space */
			errno = save_errno ? save_errno : ENOSPC;
T
Tom Lane 已提交
2275

2276
			ereport(ERROR,
2277
					(errcode_for_file_access(),
B
Bruce Momjian 已提交
2278
					 errmsg("could not write to file \"%s\": %m", tmppath)));
T
Tom Lane 已提交
2279
		}
2280
	}
2281
	pfree(zbuffer);
2282

2283
	if (pg_fsync(fd) != 0)
2284
		ereport(ERROR,
2285
				(errcode_for_file_access(),
2286
				 errmsg("could not fsync file \"%s\": %m", tmppath)));
2287

2288
	if (close(fd))
2289
		ereport(ERROR,
2290 2291
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", tmppath)));
T
Tom Lane 已提交
2292

2293
	/*
2294 2295
	 * Now move the segment into place with its final name.
	 *
2296
	 * If caller didn't want to use a pre-existing file, get rid of any
B
Bruce Momjian 已提交
2297 2298 2299
	 * pre-existing file.  Otherwise, cope with possibility that someone else
	 * has created the file while we were filling ours: if so, use ours to
	 * pre-create a future log segment.
2300
	 */
2301 2302 2303 2304 2305
	installed_log = log;
	installed_seg = seg;
	max_advance = XLOGfileslop;
	if (!InstallXLogFileSegment(&installed_log, &installed_seg, tmppath,
								*use_existent, &max_advance,
2306 2307
								use_lock))
	{
2308 2309 2310 2311 2312
		/*
		 * No need for any more future segments, or InstallXLogFileSegment()
		 * failed to rename the file into place. If the rename failed, opening
		 * the file below will fail.
		 */
2313 2314 2315 2316 2317 2318 2319
		unlink(tmppath);
	}

	/* Set flag to tell caller there was no existent file */
	*use_existent = false;

	/* Now open original target segment (might not be file I just made) */
2320
	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2321 2322
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
2323
		ereport(ERROR,
2324
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
2325 2326
		   errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
				  path, log, seg)));
2327

2328 2329
	elog(DEBUG2, "done creating and filling new WAL file");

2330
	return fd;
2331 2332
}

2333 2334 2335 2336 2337 2338 2339 2340 2341
/*
 * Create a new XLOG file segment by copying a pre-existing one.
 *
 * log, seg: identify segment to be created.
 *
 * srcTLI, srclog, srcseg: identify segment to be copied (could be from
 *		a different timeline)
 *
 * The copy is written under a per-process temporary name, fsync'd, and only
 * then moved into place with InstallXLogFileSegment(), so a partially
 * written file never shows up under the final segment name.
 *
 * Currently this is only used during recovery, and so there are no locking
 * considerations.  But we should be just as tense as XLogFileInit to avoid
 * emplacing a bogus file.
 *
 * All failures are reported with ereport(ERROR).  Any descriptor opened here
 * is closed before the error is reported, with errno preserved across the
 * close() calls so that %m still describes the original failure.
 */
static void
XLogFileCopy(uint32 log, uint32 seg,
			 TimeLineID srcTLI, uint32 srclog, uint32 srcseg)
{
	char		path[MAXPGPATH];
	char		tmppath[MAXPGPATH];
	char		buffer[XLOG_BLCKSZ];
	int			srcfd;
	int			fd;
	int			nbytes;
	int			save_errno;

	/*
	 * Open the source file
	 */
	XLogFilePath(path, srcTLI, srclog, srcseg);
	srcfd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
	if (srcfd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));

	/*
	 * Copy into a temp file name.  Remove any leftover temp file first,
	 * since it is created with O_EXCL below.
	 */
	snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());

	unlink(tmppath);

	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
	{
		save_errno = errno;
		close(srcfd);
		errno = save_errno;

		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m", tmppath)));
	}

	/*
	 * Do the data copying, one XLOG_BLCKSZ buffer at a time.
	 */
	for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
	{
		errno = 0;
		if ((int) read(srcfd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
		{
			/* capture errno before close() can disturb it */
			save_errno = errno;
			close(srcfd);
			close(fd);

			if (save_errno != 0)
			{
				errno = save_errno;
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not read file \"%s\": %m", path)));
			}
			else
				ereport(ERROR,
						(errmsg("not enough data in file \"%s\"", path)));
		}
		errno = 0;
		if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
		{
			save_errno = errno;

			/*
			 * If we fail to make the file, delete it to release disk space.
			 * Close it first: the space of an unlinked file is not given
			 * back while a descriptor on it is still open.
			 */
			close(srcfd);
			close(fd);
			unlink(tmppath);
			/* if write didn't set errno, assume problem is no disk space */
			errno = save_errno ? save_errno : ENOSPC;

			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write to file \"%s\": %m", tmppath)));
		}
	}

	if (pg_fsync(fd) != 0)
	{
		save_errno = errno;
		close(srcfd);
		close(fd);
		errno = save_errno;

		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m", tmppath)));
	}

	if (close(fd))
	{
		save_errno = errno;
		close(srcfd);
		errno = save_errno;

		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", tmppath)));
	}

	close(srcfd);

	/*
	 * Now move the segment into place with its final name.  With
	 * find_free = false any pre-existing segment file is replaced.
	 */
	if (!InstallXLogFileSegment(&log, &seg, tmppath, false, NULL, false))
		elog(ERROR, "InstallXLogFileSegment should not have failed");
}

2434 2435 2436 2437 2438 2439
/*
 * Install a new XLOG segment file as a current or future log segment.
 *
 * This is used both to install a newly-created segment (which has a temp
 * filename while it's being created) and to recycle an old segment.
 *
2440 2441 2442
 * *log, *seg: identify segment to install as (or first possible target).
 * When find_free is TRUE, these are modified on return to indicate the
 * actual installation location or last segment searched.
2443 2444 2445 2446 2447 2448 2449
 *
 * tmppath: initial name of file to install.  It will be renamed into place.
 *
 * find_free: if TRUE, install the new segment at the first empty log/seg
 * number at or after the passed numbers.  If FALSE, install the new segment
 * exactly where specified, deleting any existing segment file there.
 *
2450 2451 2452 2453
 * *max_advance: maximum number of log/seg slots to advance past the starting
 * point.  Fail if no free slot is found in this range.  On return, reduced
 * by the number of slots skipped over.  (Irrelevant, and may be NULL,
 * when find_free is FALSE.)
2454
 *
2455
 * use_lock: if TRUE, acquire ControlFileLock while moving file into
2456
 * place.  This should be TRUE except during bootstrap log creation.  The
2457
 * caller must *not* hold the lock at call.
2458
 *
2459 2460 2461
 * Returns TRUE if the file was installed successfully.  FALSE indicates that
 * max_advance limit was exceeded, or an error occurred while renaming the
 * file into place.
2462 2463
 */
static bool
2464 2465
InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
					   bool find_free, int *max_advance,
2466 2467 2468
					   bool use_lock)
{
	char		path[MAXPGPATH];
2469
	struct stat stat_buf;
2470

2471
	XLogFilePath(path, ThisTimeLineID, *log, *seg);
2472 2473 2474 2475 2476

	/*
	 * We want to be sure that only one process does this at a time.
	 */
	if (use_lock)
2477
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2478

2479 2480 2481 2482 2483
	if (!find_free)
	{
		/* Force installation: get rid of any pre-existing segment file */
		unlink(path);
	}
2484 2485
	else
	{
2486
		/* Find a free slot to put it in */
2487
		while (stat(path, &stat_buf) == 0)
2488
		{
2489
			if (*max_advance <= 0)
2490 2491 2492
			{
				/* Failed to find a free slot within specified range */
				if (use_lock)
2493
					LWLockRelease(ControlFileLock);
2494 2495
				return false;
			}
2496 2497 2498
			NextLogSeg(*log, *seg);
			(*max_advance)--;
			XLogFilePath(path, ThisTimeLineID, *log, *seg);
2499 2500 2501 2502 2503 2504 2505
		}
	}

	/*
	 * Prefer link() to rename() here just to be really sure that we don't
	 * overwrite an existing logfile.  However, there shouldn't be one, so
	 * rename() is an acceptable substitute except for the truly paranoid.
2506
	 */
2507
#if HAVE_WORKING_LINK
2508
	if (link(tmppath, path) < 0)
2509 2510 2511 2512
	{
		if (use_lock)
			LWLockRelease(ControlFileLock);
		ereport(LOG,
2513
				(errcode_for_file_access(),
2514
				 errmsg("could not link file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
2515
						tmppath, path, *log, *seg)));
2516 2517
		return false;
	}
2518
	unlink(tmppath);
2519
#else
2520
	if (rename(tmppath, path) < 0)
2521
	{
2522 2523 2524
		if (use_lock)
			LWLockRelease(ControlFileLock);
		ereport(LOG,
2525
				(errcode_for_file_access(),
2526
				 errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
2527
						tmppath, path, *log, *seg)));
2528
		return false;
2529
	}
2530
#endif
V
Vadim B. Mikheev 已提交
2531

2532
	if (use_lock)
2533
		LWLockRelease(ControlFileLock);
2534

2535
	return true;
2536 2537
}

T
Tom Lane 已提交
2538
/*
2539
 * Open a pre-existing logfile segment for writing.
T
Tom Lane 已提交
2540
 */
2541
int
2542
XLogFileOpen(uint32 log, uint32 seg)
2543
{
2544 2545
	char		path[MAXPGPATH];
	int			fd;
2546

2547
	XLogFilePath(path, ThisTimeLineID, log, seg);
2548

2549
	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2550
					   S_IRUSR | S_IWUSR);
2551
	if (fd < 0)
2552 2553
		ereport(PANIC,
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
2554 2555
		   errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
				  path, log, seg)));
2556 2557 2558 2559 2560 2561

	return fd;
}

/*
 * Open a logfile segment for reading (during recovery).
2562 2563 2564
 *
 * If fromArchive is true, the segment is retrieved from archive, otherwise
 * it's read from pg_xlog.
2565 2566
 */
static int
2567 2568
XLogFileRead(uint32 log, uint32 seg, int emode, TimeLineID tli,
			 bool fromArchive, bool notfoundOk)
2569 2570
{
	char		xlogfname[MAXFNAMELEN];
2571
	char		activitymsg[MAXFNAMELEN + 16];
2572
	char		path[MAXPGPATH];
2573
	int			fd;
2574

2575 2576
		XLogFileName(xlogfname, tli, log, seg);

2577
		if (fromArchive)
2578
		{
2579 2580 2581 2582 2583
			/* Report recovery progress in PS display */
			snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
					 xlogfname);
			set_ps_display(activitymsg, false);

2584
			restoredFromArchive = RestoreArchivedFile(path, xlogfname,
2585 2586
													  "RECOVERYXLOG",
													  XLogSegSize);
2587 2588
			if (!restoredFromArchive)
				return -1;
2589 2590
		}
		else
2591
		{
2592
			XLogFilePath(path, tli, log, seg);
2593 2594
			restoredFromArchive = false;
		}
2595 2596 2597 2598 2599 2600

		fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
		if (fd >= 0)
		{
			/* Success! */
			curFileTLI = tli;
2601 2602

			/* Report recovery progress in PS display */
2603 2604
			snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
					 xlogfname);
2605 2606
			set_ps_display(activitymsg, false);

2607 2608
			return fd;
		}
2609
		if (errno != ENOENT || !notfoundOk)	/* unexpected failure? */
2610 2611
			ereport(PANIC,
					(errcode_for_file_access(),
B
Bruce Momjian 已提交
2612 2613
			errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
				   path, log, seg)));
2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664
		return -1;
}

/*
 * Open a logfile segment for reading (during recovery).
 *
 * This version searches for the segment with any TLI listed in expectedTLIs.
 * If not in StandbyMode and fromArchive is true, the segment is also
 * searched in pg_xlog if not found in archive.
 *
 * Returns the file descriptor of the first match.  If no timeline yields
 * the file, a "could not open file" message is reported at level emode and
 * -1 is returned.
 */
static int
XLogFileReadAnyTLI(uint32 log, uint32 seg, int emode, bool fromArchive)
{
	char		path[MAXPGPATH];
	ListCell   *lc;
	int			fd;

	/*
	 * Loop looking for a suitable timeline ID: we might need to read any of
	 * the timelines listed in expectedTLIs.
	 *
	 * We expect curFileTLI on entry to be the TLI of the preceding file in
	 * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
	 * to go backwards; this prevents us from picking up the wrong file when a
	 * parent timeline extends to higher segment numbers than the child we
	 * want to read.
	 */
	foreach(lc, expectedTLIs)
	{
		TimeLineID	candidate = (TimeLineID) lfirst_int(lc);

		if (candidate < curFileTLI)
			break;				/* don't bother looking at too-old TLIs */

		fd = XLogFileRead(log, seg, emode, candidate, fromArchive, true);

		/*
		 * If not in StandbyMode, fall back to searching pg_xlog. In
		 * StandbyMode we're streaming segments from the primary to pg_xlog,
		 * and we mustn't confuse the (possibly partial) segments in pg_xlog
		 * with complete segments ready to be applied. We rather wait for
		 * the records to arrive through streaming.
		 */
		if (fd == -1 && fromArchive && !StandbyMode)
			fd = XLogFileRead(log, seg, emode, candidate, false, true);

		if (fd != -1)
			return fd;
	}

	/* Couldn't find it.  For simplicity, complain about front timeline */
	XLogFilePath(path, recoveryTargetTLI, log, seg);
	errno = ENOENT;
	ereport(emode,
			(errcode_for_file_access(),
			 errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
					path, log, seg)));
	return -1;
}

2677 2678 2679 2680 2681 2682 2683 2684 2685
/*
 * Close the current logfile segment for writing.
 */
static void
XLogFileClose(void)
{
	Assert(openLogFile >= 0);

	/*
2686
	 * WAL segment files will not be re-read in normal operation, so we advise
2687
	 * the OS to release any cached pages.	But do not do so if WAL archiving
2688 2689 2690 2691
	 * or streaming is active, because archiver and walsender process could use
	 * the cache to read the WAL segment.  Also, don't bother with it if we
	 * are using O_DIRECT, since the kernel is presumably not caching in that
	 * case.
2692
	 */
2693
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
2694
	if (!XLogIsNeeded() &&
2695 2696
		(get_sync_bit(sync_method) & PG_O_DIRECT) == 0)
		(void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
2697
#endif
2698

2699 2700
	if (close(openLogFile))
		ereport(PANIC,
B
Bruce Momjian 已提交
2701 2702 2703
				(errcode_for_file_access(),
				 errmsg("could not close log file %u, segment %u: %m",
						openLogId, openLogSeg)));
2704 2705 2706
	openLogFile = -1;
}

2707
/*
2708
 * Attempt to retrieve the specified file from off-line archival storage.
2709
 * If successful, fill "path" with its complete path (note that this will be
2710 2711
 * a temp file name that doesn't follow the normal naming convention), and
 * return TRUE.
2712
 *
2713 2714 2715
 * If not successful, fill "path" with the name of the normal on-line file
 * (which may or may not actually exist, but we'll try to use it), and return
 * FALSE.
2716 2717 2718 2719
 *
 * For fixed-size files, the caller may pass the expected size as an
 * additional crosscheck on successful recovery.  If the file size is not
 * known, set expectedSize = 0.
2720
 */
2721 2722
static bool
RestoreArchivedFile(char *path, const char *xlogfname,
2723
					const char *recovername, off_t expectedSize)
2724
{
B
Bruce Momjian 已提交
2725 2726
	char		xlogpath[MAXPGPATH];
	char		xlogRestoreCmd[MAXPGPATH];
2727
	char		lastRestartPointFname[MAXPGPATH];
B
Bruce Momjian 已提交
2728 2729
	char	   *dp;
	char	   *endp;
2730
	const char *sp;
B
Bruce Momjian 已提交
2731
	int			rc;
2732
	bool		signaled;
2733
	struct stat stat_buf;
B
Bruce Momjian 已提交
2734 2735
	uint32		restartLog;
	uint32		restartSeg;
2736

2737
	/* In standby mode, restore_command might not be supplied */
2738
	if (recoveryRestoreCommand == NULL)
2739 2740
		goto not_available;

2741
	/*
B
Bruce Momjian 已提交
2742 2743 2744 2745
	 * When doing archive recovery, we always prefer an archived log file even
	 * if a file of the same name exists in XLOGDIR.  The reason is that the
	 * file in XLOGDIR could be an old, un-filled or partly-filled version
	 * that was copied and restored as part of backing up $PGDATA.
2746
	 *
B
Bruce Momjian 已提交
2747
	 * We could try to optimize this slightly by checking the local copy
B
Bruce Momjian 已提交
2748 2749 2750 2751
	 * lastchange timestamp against the archived copy, but we have no API to
	 * do this, nor can we guarantee that the lastchange timestamp was
	 * preserved correctly when we copied to archive. Our aim is robustness,
	 * so we elect not to do this.
2752
	 *
2753 2754 2755
	 * If we cannot obtain the log file from the archive, however, we will try
	 * to use the XLOGDIR file if it exists.  This is so that we can make use
	 * of log segments that weren't yet transferred to the archive.
2756
	 *
2757 2758 2759 2760
	 * Notice that we don't actually overwrite any files when we copy back
	 * from archive because the recoveryRestoreCommand may inadvertently
	 * restore inappropriate xlogs, or they may be corrupt, so we may wish to
	 * fallback to the segments remaining in current XLOGDIR later. The
B
Bruce Momjian 已提交
2761 2762
	 * copy-from-archive filename is always the same, ensuring that we don't
	 * run out of disk space on long recoveries.
2763
	 */
2764
	snprintf(xlogpath, MAXPGPATH, XLOGDIR "/%s", recovername);
2765 2766

	/*
2767
	 * Make sure there is no existing file named recovername.
2768 2769 2770 2771 2772 2773
	 */
	if (stat(xlogpath, &stat_buf) != 0)
	{
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
P
Peter Eisentraut 已提交
2774
					 errmsg("could not stat file \"%s\": %m",
2775 2776 2777 2778 2779 2780 2781
							xlogpath)));
	}
	else
	{
		if (unlink(xlogpath) != 0)
			ereport(FATAL,
					(errcode_for_file_access(),
2782
					 errmsg("could not remove file \"%s\": %m",
2783 2784 2785
							xlogpath)));
	}

2786 2787
	/*
	 * Calculate the archive file cutoff point for use during log shipping
2788 2789
	 * replication. All files earlier than this point can be deleted from the
	 * archive, though there is no requirement to do so.
2790 2791
	 *
	 * We initialise this with the filename of an InvalidXLogRecPtr, which
2792 2793
	 * will prevent the deletion of any WAL files from the archive because of
	 * the alphabetic sorting property of WAL filenames.
2794 2795 2796
	 *
	 * Once we have successfully located the redo pointer of the checkpoint
	 * from which we start recovery we never request a file prior to the redo
2797 2798 2799 2800
	 * pointer of the last restartpoint. When redo begins we know that we have
	 * successfully located it, so there is no need for additional status
	 * flags to signify the point when we can begin deleting WAL files from
	 * the archive.
2801 2802 2803 2804 2805 2806 2807 2808 2809
	 */
	if (InRedo)
	{
		XLByteToSeg(ControlFile->checkPointCopy.redo,
					restartLog, restartSeg);
		XLogFileName(lastRestartPointFname,
					 ControlFile->checkPointCopy.ThisTimeLineID,
					 restartLog, restartSeg);
		/* we shouldn't need anything earlier than last restart point */
2810
		Assert(strcmp(lastRestartPointFname, xlogfname) <= 0);
2811 2812 2813 2814
	}
	else
		XLogFileName(lastRestartPointFname, 0, 0, 0);

2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828
	/*
	 * construct the command to be executed
	 */
	dp = xlogRestoreCmd;
	endp = xlogRestoreCmd + MAXPGPATH - 1;
	*endp = '\0';

	for (sp = recoveryRestoreCommand; *sp; sp++)
	{
		if (*sp == '%')
		{
			switch (sp[1])
			{
				case 'p':
2829
					/* %p: relative path of target file */
2830
					sp++;
B
Bruce Momjian 已提交
2831
					StrNCpy(dp, xlogpath, endp - dp);
2832
					make_native_path(dp);
2833 2834 2835 2836 2837
					dp += strlen(dp);
					break;
				case 'f':
					/* %f: filename of desired file */
					sp++;
B
Bruce Momjian 已提交
2838
					StrNCpy(dp, xlogfname, endp - dp);
2839 2840
					dp += strlen(dp);
					break;
2841 2842 2843 2844 2845 2846
				case 'r':
					/* %r: filename of last restartpoint */
					sp++;
					StrNCpy(dp, lastRestartPointFname, endp - dp);
					dp += strlen(dp);
					break;
2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868
				case '%':
					/* convert %% to a single % */
					sp++;
					if (dp < endp)
						*dp++ = *sp;
					break;
				default:
					/* otherwise treat the % as not special */
					if (dp < endp)
						*dp++ = *sp;
					break;
			}
		}
		else
		{
			if (dp < endp)
				*dp++ = *sp;
		}
	}
	*dp = '\0';

	ereport(DEBUG3,
B
Bruce Momjian 已提交
2869
			(errmsg_internal("executing restore command \"%s\"",
2870 2871
							 xlogRestoreCmd)));

2872 2873
	/*
	 * Set in_restore_command to tell the signal handler that we should exit
2874
	 * right away on SIGTERM. We know that we're at a safe point to do that.
2875 2876 2877 2878 2879
	 * Check if we had already received the signal, so that we don't miss a
	 * shutdown request received just before this.
	 */
	in_restore_command = true;
	if (shutdown_requested)
2880
		proc_exit(1);
2881

2882
	/*
2883
	 * Copy xlog from archival storage to XLOGDIR
2884 2885
	 */
	rc = system(xlogRestoreCmd);
2886 2887 2888

	in_restore_command = false;

2889 2890
	if (rc == 0)
	{
2891 2892 2893 2894
		/*
		 * command apparently succeeded, but let's make sure the file is
		 * really there now and has the correct size.
		 *
2895 2896 2897 2898 2899
		 * XXX I made wrong-size a fatal error to ensure the DBA would notice
		 * it, but is that too strong?	We could try to plow ahead with a
		 * local copy of the file ... but the problem is that there probably
		 * isn't one, and we'd incorrectly conclude we've reached the end of
		 * WAL and we're done recovering ...
2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923
		 */
		if (stat(xlogpath, &stat_buf) == 0)
		{
			if (expectedSize > 0 && stat_buf.st_size != expectedSize)
				ereport(FATAL,
						(errmsg("archive file \"%s\" has wrong size: %lu instead of %lu",
								xlogfname,
								(unsigned long) stat_buf.st_size,
								(unsigned long) expectedSize)));
			else
			{
				ereport(LOG,
						(errmsg("restored log file \"%s\" from archive",
								xlogfname)));
				strcpy(path, xlogpath);
				return true;
			}
		}
		else
		{
			/* stat failed */
			if (errno != ENOENT)
				ereport(FATAL,
						(errcode_for_file_access(),
P
Peter Eisentraut 已提交
2924
						 errmsg("could not stat file \"%s\": %m",
2925
								xlogpath)));
2926 2927 2928 2929
		}
	}

	/*
2930
	 * Remember, we rollforward UNTIL the restore fails so failure here is
B
Bruce Momjian 已提交
2931
	 * just part of the process... that makes it difficult to determine
B
Bruce Momjian 已提交
2932 2933 2934
	 * whether the restore failed because there isn't an archive to restore,
	 * or because the administrator has specified the restore program
	 * incorrectly.  We have to assume the former.
2935 2936
	 *
	 * However, if the failure was due to any sort of signal, it's best to
B
Bruce Momjian 已提交
2937 2938 2939
	 * punt and abort recovery.  (If we "return false" here, upper levels will
	 * assume that recovery is complete and start up the database!) It's
	 * essential to abort on child SIGINT and SIGQUIT, because per spec
2940
	 * system() ignores SIGINT and SIGQUIT while waiting; if we see one of
2941 2942 2943 2944 2945
	 * those it's a good bet we should have gotten it too.
	 *
	 * On SIGTERM, assume we have received a fast shutdown request, and exit
	 * cleanly. It's pure chance whether we receive the SIGTERM first, or the
	 * child process. If we receive it first, the signal handler will call
2946 2947 2948
	 * proc_exit, otherwise we do it here. If we or the child process received
	 * SIGTERM for any other reason than a fast shutdown request, postmaster
	 * will perform an immediate shutdown when it sees us exiting
2949
	 * unexpectedly.
2950
	 *
B
Bruce Momjian 已提交
2951 2952 2953 2954
	 * Per the Single Unix Spec, shells report exit status > 128 when a called
	 * command died on a signal.  Also, 126 and 127 are used to report
	 * problems such as an unfindable command; treat those as fatal errors
	 * too.
2955
	 */
2956
	if (WIFSIGNALED(rc) && WTERMSIG(rc) == SIGTERM)
2957
		proc_exit(1);
2958

2959 2960 2961
	signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;

	ereport(signaled ? FATAL : DEBUG2,
B
Bruce Momjian 已提交
2962 2963
		(errmsg("could not restore file \"%s\" from archive: return code %d",
				xlogfname, rc)));
2964

2965
not_available:
2966
	/*
B
Bruce Momjian 已提交
2967 2968
	 * if an archived file is not available, there might still be a version of
	 * this file in XLOGDIR, so return that as the filename to open.
2969
	 *
B
Bruce Momjian 已提交
2970 2971
	 * In many recovery scenarios we expect this to fail also, but if so that
	 * just means we've reached the end of WAL.
2972
	 */
2973
	snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
2974
	return false;
2975 2976
}

2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996
/*
 * Attempt to execute the recovery_end_command.
 *
 * The command text is taken from recoveryEndCommand (which must be set),
 * with %r replaced by the file name of the last restartpoint and %%
 * collapsed to a single %.  A non-zero result from system() is reported at
 * WARNING level, or at FATAL level if the command died on a signal or
 * exited with a status above 125.
 */
static void
ExecuteRecoveryEndCommand(void)
{
	char		xlogRecoveryEndCmd[MAXPGPATH];
	char		lastRestartPointFname[MAXPGPATH];
	char	   *out;
	char	   *outend;
	const char *in;
	int			status;
	bool		fatal_failure;
	uint32		restartLog;
	uint32		restartSeg;

	Assert(recoveryEndCommand);

	/*
	 * Calculate the archive file cutoff point for use during log shipping
	 * replication. All files earlier than this point can be deleted from the
	 * archive, though there is no requirement to do so.
	 *
	 * We initialise this with the filename of an InvalidXLogRecPtr, which
	 * will prevent the deletion of any WAL files from the archive because of
	 * the alphabetic sorting property of WAL filenames.
	 *
	 * Once we have successfully located the redo pointer of the checkpoint
	 * from which we start recovery we never request a file prior to the redo
	 * pointer of the last restartpoint. When redo begins we know that we have
	 * successfully located it, so there is no need for additional status
	 * flags to signify the point when we can begin deleting WAL files from
	 * the archive.
	 */
	if (!InRedo)
		XLogFileName(lastRestartPointFname, 0, 0, 0);
	else
	{
		XLByteToSeg(ControlFile->checkPointCopy.redo,
					restartLog, restartSeg);
		XLogFileName(lastRestartPointFname,
					 ControlFile->checkPointCopy.ThisTimeLineID,
					 restartLog, restartSeg);
	}

	/*
	 * construct the command to be executed
	 *
	 * The last byte of the buffer is reserved for the terminator; output
	 * that does not fit is silently dropped.
	 */
	out = xlogRecoveryEndCmd;
	outend = xlogRecoveryEndCmd + MAXPGPATH - 1;
	*outend = '\0';

	for (in = recoveryEndCommand; *in != '\0'; in++)
	{
		if (*in == '%' && in[1] == 'r')
		{
			/* %r: filename of last restartpoint */
			in++;
			StrNCpy(out, lastRestartPointFname, outend - out);
			out += strlen(out);
			continue;
		}

		/* convert %% to a single %; any other % is not special */
		if (*in == '%' && in[1] == '%')
			in++;

		if (out < outend)
			*out++ = *in;
	}
	*out = '\0';

	ereport(DEBUG3,
			(errmsg_internal("executing recovery end command \"%s\"",
							 xlogRecoveryEndCmd)));

	/*
	 * execute the constructed command
	 */
	status = system(xlogRecoveryEndCmd);
	if (status == 0)
		return;

	/*
	 * If the failure was due to any sort of signal, it's best to punt and
	 * abort recovery. See also detailed comments on signals in
	 * RestoreArchivedFile().
	 */
	fatal_failure = WIFSIGNALED(status) || WEXITSTATUS(status) > 125;

	ereport(fatal_failure ? FATAL : WARNING,
			(errmsg("recovery_end_command \"%s\": return code %d",
					xlogRecoveryEndCmd, status)));
}

V
Vadim B. Mikheev 已提交
3085
/*
3086 3087 3088 3089 3090 3091 3092 3093
 * Preallocate log files beyond the specified log endpoint.
 *
 * XXX this is currently extremely conservative, since it forces only one
 * future log segment to exist, and even that only if we are 75% done with
 * the current one.  This is only appropriate for very low-WAL-volume systems.
 * High-volume systems will be OK once they've built up a sufficient set of
 * recycled log segments, but the startup transient is likely to include
 * a lot of segment creations by foreground processes, which is not so good.
T
Tom Lane 已提交
3094
 */
3095
static void
T
Tom Lane 已提交
3096 3097 3098 3099 3100
PreallocXlogFiles(XLogRecPtr endptr)
{
	uint32		_logId;
	uint32		_logSeg;
	int			lf;
3101
	bool		use_existent;
T
Tom Lane 已提交
3102 3103

	XLByteToPrevSeg(endptr, _logId, _logSeg);
B
Bruce Momjian 已提交
3104
	if ((endptr.xrecoff - 1) % XLogSegSize >=
B
Bruce Momjian 已提交
3105
		(uint32) (0.75 * XLogSegSize))
T
Tom Lane 已提交
3106 3107
	{
		NextLogSeg(_logId, _logSeg);
3108 3109
		use_existent = true;
		lf = XLogFileInit(_logId, _logSeg, &use_existent, true);
T
Tom Lane 已提交
3110
		close(lf);
3111
		if (!use_existent)
3112
			CheckpointStats.ckpt_segs_added++;
T
Tom Lane 已提交
3113 3114 3115 3116
	}
}

/*
3117
 * Recycle or remove all log files older or equal to passed log/seg#
3118 3119 3120
 *
 * endptr is current (or recent) end of xlog; this is used to determine
 * whether we want to recycle rather than delete no-longer-wanted log files.
V
Vadim B. Mikheev 已提交
3121 3122
 */
static void
3123
RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr)
V
Vadim B. Mikheev 已提交
3124
{
3125 3126
	uint32		endlogId;
	uint32		endlogSeg;
3127
	int			max_advance;
B
Bruce Momjian 已提交
3128 3129
	DIR		   *xldir;
	struct dirent *xlde;
3130
	char		lastoff[MAXFNAMELEN];
B
Bruce Momjian 已提交
3131
	char		path[MAXPGPATH];
3132 3133 3134
#ifdef WIN32
	char		newpath[MAXPGPATH];
#endif
3135
	struct stat statbuf;
V
Vadim B. Mikheev 已提交
3136

3137 3138 3139 3140
	/*
	 * Initialize info about where to try to recycle to.  We allow recycling
	 * segments up to XLOGfileslop segments beyond the current XLOG location.
	 */
3141
	XLByteToPrevSeg(endptr, endlogId, endlogSeg);
3142
	max_advance = XLOGfileslop;
V
Vadim B. Mikheev 已提交
3143

3144
	xldir = AllocateDir(XLOGDIR);
V
Vadim B. Mikheev 已提交
3145
	if (xldir == NULL)
3146
		ereport(ERROR,
3147
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
3148 3149
				 errmsg("could not open transaction log directory \"%s\": %m",
						XLOGDIR)));
V
Vadim B. Mikheev 已提交
3150

3151
	XLogFileName(lastoff, ThisTimeLineID, log, seg);
V
Vadim B. Mikheev 已提交
3152

3153
	while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
V
Vadim B. Mikheev 已提交
3154
	{
3155
		/*
3156
		 * We ignore the timeline part of the XLOG segment identifiers in
B
Bruce Momjian 已提交
3157 3158 3159 3160 3161
		 * deciding whether a segment is still needed.	This ensures that we
		 * won't prematurely remove a segment from a parent timeline. We could
		 * probably be a little more proactive about removing segments of
		 * non-parent timelines, but that would be a whole lot more
		 * complicated.
3162
		 *
B
Bruce Momjian 已提交
3163 3164
		 * We use the alphanumeric sorting property of the filenames to decide
		 * which ones are earlier than the lastoff segment.
3165
		 */
3166 3167 3168
		if (strlen(xlde->d_name) == 24 &&
			strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
			strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
V
Vadim B. Mikheev 已提交
3169
		{
3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180
			/*
			 * Normally we don't delete old XLOG files during recovery to
			 * avoid accidentally deleting a file that looks stale due to a
			 * bug or hardware issue, but in fact contains important data.
			 * During streaming recovery, however, we will eventually fill the
			 * disk if we never clean up, so we have to. That's not an issue
			 * with file-based archive recovery because in that case we
			 * restore one XLOG file at a time, on-demand, and with a
			 * different filename that can't be confused with regular XLOG
			 * files.
			 */
3181
			if (WalRcvInProgress() || XLogArchiveCheckDone(xlde->d_name))
3182
			{
3183
				snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3184

3185
				/*
B
Bruce Momjian 已提交
3186
				 * Before deleting the file, see if it can be recycled as a
3187 3188 3189
				 * future log segment. Only recycle normal files, pg_standby
				 * for example can create symbolic links pointing to a
				 * separate archive directory.
3190
				 */
3191 3192 3193
				if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
					InstallXLogFileSegment(&endlogId, &endlogSeg, path,
										   true, &max_advance, true))
3194
				{
3195
					ereport(DEBUG2,
B
Bruce Momjian 已提交
3196 3197
							(errmsg("recycled transaction log file \"%s\"",
									xlde->d_name)));
3198
					CheckpointStats.ckpt_segs_recycled++;
3199 3200 3201 3202 3203 3204
					/* Needn't recheck that slot on future iterations */
					if (max_advance > 0)
					{
						NextLogSeg(endlogId, endlogSeg);
						max_advance--;
					}
3205 3206 3207 3208
				}
				else
				{
					/* No need for any more future segments... */
3209 3210
					int rc;

3211
					ereport(DEBUG2,
B
Bruce Momjian 已提交
3212 3213
							(errmsg("removing transaction log file \"%s\"",
									xlde->d_name)));
3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230

#ifdef WIN32
					/*
					 * On Windows, if another process (e.g another backend)
					 * holds the file open in FILE_SHARE_DELETE mode, unlink
					 * will succeed, but the file will still show up in
					 * directory listing until the last handle is closed.
					 * To avoid confusing the lingering deleted file for a
					 * live WAL file that needs to be archived, rename it
					 * before deleting it.
					 *
					 * If another process holds the file open without
					 * FILE_SHARE_DELETE flag, rename will fail. We'll try
					 * again at the next checkpoint.
					 */
					snprintf(newpath, MAXPGPATH, "%s.deleted", path);
					if (rename(path, newpath) != 0)
3231 3232
					{
						ereport(LOG,
3233
								(errcode_for_file_access(),
3234
								 errmsg("could not rename old transaction log file \"%s\": %m",
3235
										path)));
3236 3237
						continue;
					}
3238 3239 3240 3241 3242
					rc = unlink(newpath);
#else
					rc = unlink(path);
#endif
					if (rc != 0)
3243 3244
					{
						ereport(LOG,
3245 3246 3247
								(errcode_for_file_access(),
								 errmsg("could not remove old transaction log file \"%s\": %m",
										path)));
3248 3249
						continue;
					}
3250
					CheckpointStats.ckpt_segs_removed++;
3251
				}
3252 3253

				XLogArchiveCleanup(xlde->d_name);
3254
			}
V
Vadim B. Mikheev 已提交
3255 3256
		}
	}
B
Bruce Momjian 已提交
3257

3258
	FreeDir(xldir);
V
Vadim B. Mikheev 已提交
3259 3260
}

3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277
/*
 * Verify whether pg_xlog and pg_xlog/archive_status exist.
 * If the latter does not exist, recreate it.
 *
 * It is not the goal of this function to verify the contents of these
 * directories, but to help in cases where someone has performed a cluster
 * copy for PITR purposes but omitted pg_xlog from the copy.
 *
 * We could also recreate pg_xlog if it doesn't exist, but a deliberate
 * policy decision was made not to.  It is fairly common for pg_xlog to be
 * a symlink, and if that was the DBA's intent then automatically making a
 * plain directory would result in degraded performance with no notice.
 */
static void
ValidateXLOGDirectoryStructure(void)
{
	char		path[MAXPGPATH];
3278
	struct stat stat_buf;
3279 3280 3281 3282

	/* Check for pg_xlog; if it doesn't exist, error out */
	if (stat(XLOGDIR, &stat_buf) != 0 ||
		!S_ISDIR(stat_buf.st_mode))
3283
		ereport(FATAL,
3284 3285 3286 3287 3288 3289 3290 3291 3292
				(errmsg("required WAL directory \"%s\" does not exist",
						XLOGDIR)));

	/* Check for archive_status */
	snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
	if (stat(path, &stat_buf) == 0)
	{
		/* Check for weird cases where it exists but isn't a directory */
		if (!S_ISDIR(stat_buf.st_mode))
3293
			ereport(FATAL,
3294 3295 3296 3297 3298 3299 3300 3301
					(errmsg("required WAL directory \"%s\" does not exist",
							path)));
	}
	else
	{
		ereport(LOG,
				(errmsg("creating missing WAL directory \"%s\"", path)));
		if (mkdir(path, 0700) < 0)
3302
			ereport(FATAL,
3303 3304 3305 3306 3307
					(errmsg("could not create missing directory \"%s\": %m",
							path)));
	}
}

3308
/*
3309 3310 3311
 * Remove previous backup history files.  This also retries creation of
 * .ready files for any backup history files for which XLogArchiveNotify
 * failed earlier.
3312 3313
 */
static void
3314
CleanupBackupHistory(void)
3315 3316 3317 3318 3319
{
	DIR		   *xldir;
	struct dirent *xlde;
	char		path[MAXPGPATH];

3320
	xldir = AllocateDir(XLOGDIR);
3321 3322 3323
	if (xldir == NULL)
		ereport(ERROR,
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
3324 3325
				 errmsg("could not open transaction log directory \"%s\": %m",
						XLOGDIR)));
3326

3327
	while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3328 3329 3330 3331 3332 3333
	{
		if (strlen(xlde->d_name) > 24 &&
			strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
			strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
				   ".backup") == 0)
		{
3334
			if (XLogArchiveCheckDone(xlde->d_name))
3335 3336
			{
				ereport(DEBUG2,
B
Bruce Momjian 已提交
3337 3338
				(errmsg("removing transaction log backup history file \"%s\"",
						xlde->d_name)));
3339
				snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3340 3341 3342 3343 3344 3345 3346 3347 3348
				unlink(path);
				XLogArchiveCleanup(xlde->d_name);
			}
		}
	}

	FreeDir(xldir);
}

T
Tom Lane 已提交
3349 3350 3351 3352
/*
 * Restore the backup blocks present in an XLOG record, if any.
 *
 * We assume all of the record has been read into memory at *record.
3353 3354 3355 3356 3357 3358 3359 3360 3361
 *
 * Note: when a backup block is available in XLOG, we restore it
 * unconditionally, even if the page in the database appears newer.
 * This is to protect ourselves against database pages that were partially
 * or incorrectly written during a crash.  We assume that the XLOG data
 * must be good because it has passed a CRC check, while the database
 * page might not be.  This will force us to replay all subsequent
 * modifications of the page that appear in XLOG, rather than possibly
 * ignoring them as already applied, but that's not a huge drawback.
3362 3363
 *
 * If 'cleanup' is true, a cleanup lock is used when restoring blocks.
3364 3365 3366 3367 3368
 * Otherwise, a normal exclusive lock is used.	During crash recovery, that's
 * just pro forma because there can't be any regular backends in the system,
 * but in hot standby mode the distinction is important. The 'cleanup'
 * argument applies to all backup blocks in the WAL record, that suffices for
 * now.
T
Tom Lane 已提交
3369
 */
3370 3371
void
RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup)
3372 3373 3374 3375 3376 3377 3378
{
	Buffer		buffer;
	Page		page;
	BkpBlock	bkpb;
	char	   *blk;
	int			i;

3379 3380 3381
	if (!(record->xl_info & XLR_BKP_BLOCK_MASK))
		return;

B
Bruce Momjian 已提交
3382
	blk = (char *) XLogRecGetData(record) + record->xl_len;
T
Tom Lane 已提交
3383
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
3384
	{
T
Tom Lane 已提交
3385
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
3386 3387
			continue;

3388
		memcpy(&bkpb, blk, sizeof(BkpBlock));
3389 3390
		blk += sizeof(BkpBlock);

3391 3392
		buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
										RBM_ZERO);
3393
		Assert(BufferIsValid(buffer));
3394 3395 3396 3397 3398
		if (cleanup)
			LockBufferForCleanup(buffer);
		else
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

3399
		page = (Page) BufferGetPage(buffer);
3400

3401
		if (bkpb.hole_length == 0)
3402
		{
3403 3404 3405 3406 3407 3408 3409 3410 3411 3412
			memcpy((char *) page, blk, BLCKSZ);
		}
		else
		{
			/* must zero-fill the hole */
			MemSet((char *) page, 0, BLCKSZ);
			memcpy((char *) page, blk, bkpb.hole_offset);
			memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
				   blk + bkpb.hole_offset,
				   BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
3413 3414
		}

3415 3416
		PageSetLSN(page, lsn);
		PageSetTLI(page, ThisTimeLineID);
3417 3418
		MarkBufferDirty(buffer);
		UnlockReleaseBuffer(buffer);
3419

3420
		blk += BLCKSZ - bkpb.hole_length;
3421 3422 3423
	}
}

T
Tom Lane 已提交
3424 3425 3426 3427 3428 3429 3430
/*
 * CRC-check an XLOG record.  We do not believe the contents of an XLOG
 * record (other than to the minimal extent of computing the amount of
 * data to read in) until we've checked the CRCs.
 *
 * We assume all of the record has been read into memory at *record.
 */
3431 3432 3433
static bool
RecordIsValid(XLogRecord *record, XLogRecPtr recptr, int emode)
{
3434
	pg_crc32	crc;
3435 3436
	int			i;
	uint32		len = record->xl_len;
3437
	BkpBlock	bkpb;
3438 3439
	char	   *blk;

3440 3441 3442
	/* First the rmgr data */
	INIT_CRC32(crc);
	COMP_CRC32(crc, XLogRecGetData(record), len);
3443

3444
	/* Add in the backup blocks, if any */
B
Bruce Momjian 已提交
3445
	blk = (char *) XLogRecGetData(record) + len;
T
Tom Lane 已提交
3446
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
3447
	{
B
Bruce Momjian 已提交
3448
		uint32		blen;
3449

T
Tom Lane 已提交
3450
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
3451 3452
			continue;

3453 3454
		memcpy(&bkpb, blk, sizeof(BkpBlock));
		if (bkpb.hole_offset + bkpb.hole_length > BLCKSZ)
3455
		{
3456
			ereport(emode,
3457 3458 3459
					(errmsg("incorrect hole size in record at %X/%X",
							recptr.xlogid, recptr.xrecoff)));
			return false;
3460
		}
3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482
		blen = sizeof(BkpBlock) + BLCKSZ - bkpb.hole_length;
		COMP_CRC32(crc, blk, blen);
		blk += blen;
	}

	/* Check that xl_tot_len agrees with our calculation */
	if (blk != (char *) record + record->xl_tot_len)
	{
		ereport(emode,
				(errmsg("incorrect total length in record at %X/%X",
						recptr.xlogid, recptr.xrecoff)));
		return false;
	}

	/* Finally include the record header */
	COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),
			   SizeOfXLogRecord - sizeof(pg_crc32));
	FIN_CRC32(crc);

	if (!EQ_CRC32(record->xl_crc, crc))
	{
		ereport(emode,
B
Bruce Momjian 已提交
3483 3484
		(errmsg("incorrect resource manager data checksum in record at %X/%X",
				recptr.xlogid, recptr.xrecoff)));
3485
		return false;
3486 3487
	}

3488
	return true;
3489 3490
}

T
Tom Lane 已提交
3491 3492 3493 3494 3495 3496
/*
 * Attempt to read an XLOG record.
 *
 * If RecPtr is not NULL, try to read a record at that position.  Otherwise
 * try to read a record just after the last one previously read.
 *
3497
 * If no valid record is available, returns NULL, or fails if emode is PANIC.
3498
 * (emode must be either PANIC, LOG)
T
Tom Lane 已提交
3499
 *
3500 3501
 * The record is copied into readRecordBuf, so that on successful return,
 * the returned record pointer always points there.
T
Tom Lane 已提交
3502
 */
3503
static XLogRecord *
3504
ReadRecord(XLogRecPtr *RecPtr, int emode_arg, bool fetching_ckpt)
3505
{
3506
	XLogRecord *record;
3507
	char	   *buffer;
3508
	XLogRecPtr	tmpRecPtr = EndRecPtr;
3509
	bool		randAccess = false;
T
Tom Lane 已提交
3510 3511
	uint32		len,
				total_len;
3512 3513
	uint32		targetRecOff;
	uint32		pageHeaderSize;
3514 3515 3516 3517 3518 3519 3520
	int			emode;

	/*
	 * We don't expect any invalid records during streaming recovery: we
	 * should never hit the end of WAL because we wait for it to be streamed.
	 * Therefore treat any broken WAL as PANIC, instead of failing over.
	 */
3521
	if (StandbyMode)
3522 3523 3524
		emode = PANIC;
	else
		emode = emode_arg;
T
Tom Lane 已提交
3525 3526 3527 3528

	if (readBuf == NULL)
	{
		/*
B
Bruce Momjian 已提交
3529 3530 3531 3532 3533
		 * First time through, permanently allocate readBuf.  We do it this
		 * way, rather than just making a static array, for two reasons: (1)
		 * no need to waste the storage in most instantiations of the backend;
		 * (2) a static char array isn't guaranteed to have any particular
		 * alignment, whereas malloc() will provide MAXALIGN'd storage.
T
Tom Lane 已提交
3534
		 */
3535
		readBuf = (char *) malloc(XLOG_BLCKSZ);
T
Tom Lane 已提交
3536 3537
		Assert(readBuf != NULL);
	}
3538

T
Tom Lane 已提交
3539
	if (RecPtr == NULL)
3540
	{
3541
		RecPtr = &tmpRecPtr;
3542 3543

		/*
3544 3545
		 * Align recptr to next page if no more records can fit on the
		 * current page.
3546
		 */
3547 3548
		if (XLOG_BLCKSZ - (RecPtr->xrecoff % XLOG_BLCKSZ) < SizeOfXLogRecord)
		{
3549
			NextLogPage(tmpRecPtr);
3550 3551
			/* We will account for page header size below */
		}
3552 3553 3554 3555 3556 3557

		if (tmpRecPtr.xrecoff >= XLogFileSize)
		{
			(tmpRecPtr.xlogid)++;
			tmpRecPtr.xrecoff = 0;
		}
3558 3559 3560 3561 3562 3563 3564
	}
	else
	{
		if (!XRecOffIsValid(RecPtr->xrecoff))
			ereport(PANIC,
					(errmsg("invalid record offset at %X/%X",
							RecPtr->xlogid, RecPtr->xrecoff)));
B
Bruce Momjian 已提交
3565

3566
		/*
B
Bruce Momjian 已提交
3567 3568 3569 3570 3571
		 * Since we are going to a random position in WAL, forget any prior
		 * state about what timeline we were in, and allow it to be any
		 * timeline in expectedTLIs.  We also set a flag to allow curFileTLI
		 * to go backwards (but we can't reset that variable right here, since
		 * we might not change files at all).
3572 3573 3574
		 */
		lastPageTLI = 0;		/* see comment in ValidXLOGHeader */
		randAccess = true;		/* allow curFileTLI to go backwards too */
3575 3576
	}

3577 3578 3579
	/* Read the page containing the record */
	if (!XLogPageRead(RecPtr, emode, fetching_ckpt, randAccess))
		return NULL;
3580

3581
	pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
3582
	targetRecOff = RecPtr->xrecoff % XLOG_BLCKSZ;
3583 3584 3585
	if (targetRecOff == 0)
	{
		/*
B
Bruce Momjian 已提交
3586 3587 3588
		 * Can only get here in the continuing-from-prev-page case, because
		 * XRecOffIsValid eliminated the zero-page-offset case otherwise. Need
		 * to skip over the new page's header.
3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599
		 */
		tmpRecPtr.xrecoff += pageHeaderSize;
		targetRecOff = pageHeaderSize;
	}
	else if (targetRecOff < pageHeaderSize)
	{
		ereport(emode,
				(errmsg("invalid record offset at %X/%X",
						RecPtr->xlogid, RecPtr->xrecoff)));
		goto next_record_is_invalid;
	}
T
Tom Lane 已提交
3600
	if ((((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) &&
3601
		targetRecOff == pageHeaderSize)
3602
	{
3603 3604 3605
		ereport(emode,
				(errmsg("contrecord is requested by %X/%X",
						RecPtr->xlogid, RecPtr->xrecoff)));
3606 3607
		goto next_record_is_invalid;
	}
3608
	record = (XLogRecord *) ((char *) readBuf + RecPtr->xrecoff % XLOG_BLCKSZ);
3609

T
Tom Lane 已提交
3610
	/*
B
Bruce Momjian 已提交
3611 3612
	 * xl_len == 0 is bad data for everything except XLOG SWITCH, where it is
	 * required.
T
Tom Lane 已提交
3613
	 */
3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624
	if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
	{
		if (record->xl_len != 0)
		{
			ereport(emode,
					(errmsg("invalid xlog switch record at %X/%X",
							RecPtr->xlogid, RecPtr->xrecoff)));
			goto next_record_is_invalid;
		}
	}
	else if (record->xl_len == 0)
3625
	{
3626 3627 3628
		ereport(emode,
				(errmsg("record with zero length at %X/%X",
						RecPtr->xlogid, RecPtr->xrecoff)));
3629 3630
		goto next_record_is_invalid;
	}
3631 3632 3633 3634 3635 3636 3637 3638 3639
	if (record->xl_tot_len < SizeOfXLogRecord + record->xl_len ||
		record->xl_tot_len > SizeOfXLogRecord + record->xl_len +
		XLR_MAX_BKP_BLOCKS * (sizeof(BkpBlock) + BLCKSZ))
	{
		ereport(emode,
				(errmsg("invalid record length at %X/%X",
						RecPtr->xlogid, RecPtr->xrecoff)));
		goto next_record_is_invalid;
	}
3640 3641 3642 3643
	if (record->xl_rmid > RM_MAX_ID)
	{
		ereport(emode,
				(errmsg("invalid resource manager ID %u at %X/%X",
B
Bruce Momjian 已提交
3644
						record->xl_rmid, RecPtr->xlogid, RecPtr->xrecoff)));
3645 3646
		goto next_record_is_invalid;
	}
3647 3648 3649
	if (randAccess)
	{
		/*
B
Bruce Momjian 已提交
3650 3651
		 * We can't exactly verify the prev-link, but surely it should be less
		 * than the record's own address.
3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664
		 */
		if (!XLByteLT(record->xl_prev, *RecPtr))
		{
			ereport(emode,
					(errmsg("record with incorrect prev-link %X/%X at %X/%X",
							record->xl_prev.xlogid, record->xl_prev.xrecoff,
							RecPtr->xlogid, RecPtr->xrecoff)));
			goto next_record_is_invalid;
		}
	}
	else
	{
		/*
B
Bruce Momjian 已提交
3665 3666 3667
		 * Record's prev-link should exactly match our previous location. This
		 * check guards against torn WAL pages where a stale but valid-looking
		 * WAL record starts on a sector boundary.
3668 3669 3670 3671 3672 3673 3674 3675 3676 3677
		 */
		if (!XLByteEQ(record->xl_prev, ReadRecPtr))
		{
			ereport(emode,
					(errmsg("record with incorrect prev-link %X/%X at %X/%X",
							record->xl_prev.xlogid, record->xl_prev.xrecoff,
							RecPtr->xlogid, RecPtr->xrecoff)));
			goto next_record_is_invalid;
		}
	}
B
Bruce Momjian 已提交
3678

T
Tom Lane 已提交
3679
	/*
B
Bruce Momjian 已提交
3680
	 * Allocate or enlarge readRecordBuf as needed.  To avoid useless small
3681 3682 3683 3684
	 * increases, round its size to a multiple of XLOG_BLCKSZ, and make sure
	 * it's at least 4*Max(BLCKSZ, XLOG_BLCKSZ) to start with.  (That is
	 * enough for all "normal" records, but very large commit or abort records
	 * might need more space.)
T
Tom Lane 已提交
3685
	 */
3686
	total_len = record->xl_tot_len;
3687
	if (total_len > readRecordBufSize)
3688
	{
3689 3690
		uint32		newSize = total_len;

3691 3692
		newSize += XLOG_BLCKSZ - (newSize % XLOG_BLCKSZ);
		newSize = Max(newSize, 4 * Max(BLCKSZ, XLOG_BLCKSZ));
3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705
		if (readRecordBuf)
			free(readRecordBuf);
		readRecordBuf = (char *) malloc(newSize);
		if (!readRecordBuf)
		{
			readRecordBufSize = 0;
			/* We treat this as a "bogus data" condition */
			ereport(emode,
					(errmsg("record length %u at %X/%X too long",
							total_len, RecPtr->xlogid, RecPtr->xrecoff)));
			goto next_record_is_invalid;
		}
		readRecordBufSize = newSize;
3706
	}
3707 3708

	buffer = readRecordBuf;
3709
	len = XLOG_BLCKSZ - RecPtr->xrecoff % XLOG_BLCKSZ;
T
Tom Lane 已提交
3710
	if (total_len > len)
3711
	{
T
Tom Lane 已提交
3712 3713
		/* Need to reassemble record */
		XLogContRecord *contrecord;
3714
		XLogRecPtr	pagelsn;
B
Bruce Momjian 已提交
3715
		uint32		gotlen = len;
3716

3717 3718 3719 3720
		/* Initialize pagelsn to the beginning of the page this record is on */
		pagelsn = *RecPtr;
		pagelsn.xrecoff = (pagelsn.xrecoff / XLOG_BLCKSZ) * XLOG_BLCKSZ;

T
Tom Lane 已提交
3721
		memcpy(buffer, record, len);
3722
		record = (XLogRecord *) buffer;
T
Tom Lane 已提交
3723
		buffer += len;
3724
		for (;;)
3725
		{
3726 3727 3728
			/* Calculate pointer to beginning of next page */
			pagelsn.xrecoff += XLOG_BLCKSZ;
			if (pagelsn.xrecoff >= XLogFileSize)
3729
			{
3730 3731
				(pagelsn.xlogid)++;
				pagelsn.xrecoff = 0;
3732
			}
3733 3734 3735
			/* Wait for the next page to become available */
			if (!XLogPageRead(&pagelsn, emode, false, false))
				return NULL;
3736

3737
			/* Check that the continuation record looks valid */
T
Tom Lane 已提交
3738
			if (!(((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD))
3739
			{
3740 3741 3742
				ereport(emode,
						(errmsg("there is no contrecord flag in log file %u, segment %u, offset %u",
								readId, readSeg, readOff)));
3743 3744
				goto next_record_is_invalid;
			}
3745 3746
			pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
			contrecord = (XLogContRecord *) ((char *) readBuf + pageHeaderSize);
B
Bruce Momjian 已提交
3747
			if (contrecord->xl_rem_len == 0 ||
T
Tom Lane 已提交
3748
				total_len != (contrecord->xl_rem_len + gotlen))
3749
			{
3750 3751 3752 3753
				ereport(emode,
						(errmsg("invalid contrecord length %u in log file %u, segment %u, offset %u",
								contrecord->xl_rem_len,
								readId, readSeg, readOff)));
3754 3755
				goto next_record_is_invalid;
			}
3756
			len = XLOG_BLCKSZ - pageHeaderSize - SizeOfXLogContRecord;
T
Tom Lane 已提交
3757
			if (contrecord->xl_rem_len > len)
3758
			{
B
Bruce Momjian 已提交
3759
				memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord, len);
T
Tom Lane 已提交
3760 3761 3762 3763 3764 3765 3766 3767 3768 3769
				gotlen += len;
				buffer += len;
				continue;
			}
			memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord,
				   contrecord->xl_rem_len);
			break;
		}
		if (!RecordIsValid(record, *RecPtr, emode))
			goto next_record_is_invalid;
3770
		pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
T
Tom Lane 已提交
3771 3772
		EndRecPtr.xlogid = readId;
		EndRecPtr.xrecoff = readSeg * XLogSegSize + readOff +
3773 3774
			pageHeaderSize +
			MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len);
3775

T
Tom Lane 已提交
3776
		ReadRecPtr = *RecPtr;
3777
		/* needn't worry about XLOG SWITCH, it can't cross page boundaries */
T
Tom Lane 已提交
3778
		return record;
3779 3780
	}

T
Tom Lane 已提交
3781 3782 3783 3784 3785
	/* Record does not cross a page boundary */
	if (!RecordIsValid(record, *RecPtr, emode))
		goto next_record_is_invalid;
	EndRecPtr.xlogid = RecPtr->xlogid;
	EndRecPtr.xrecoff = RecPtr->xrecoff + MAXALIGN(total_len);
3786

T
Tom Lane 已提交
3787 3788
	ReadRecPtr = *RecPtr;
	memcpy(buffer, record, total_len);
B
Bruce Momjian 已提交
3789

3790 3791 3792 3793 3794 3795 3796 3797
	/*
	 * Special processing if it's an XLOG SWITCH record
	 */
	if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
	{
		/* Pretend it extends to end of segment */
		EndRecPtr.xrecoff += XLogSegSize - 1;
		EndRecPtr.xrecoff -= EndRecPtr.xrecoff % XLogSegSize;
B
Bruce Momjian 已提交
3798

3799
		/*
B
Bruce Momjian 已提交
3800 3801 3802
		 * Pretend that readBuf contains the last page of the segment. This is
		 * just to avoid Assert failure in StartupXLOG if XLOG ends with this
		 * segment.
3803 3804 3805
		 */
		readOff = XLogSegSize - XLOG_BLCKSZ;
	}
T
Tom Lane 已提交
3806
	return (XLogRecord *) buffer;
3807

T
Tom Lane 已提交
3808
next_record_is_invalid:;
3809 3810 3811 3812 3813
	if (readFile >= 0)
	{
		close(readFile);
		readFile = -1;
	}
T
Tom Lane 已提交
3814
	return NULL;
3815 3816
}

3817 3818 3819 3820
/*
 * Check whether the xlog header of a page just read in looks valid.
 *
 * This is just a convenience subroutine to avoid duplicated code in
B
Bruce Momjian 已提交
3821
 * ReadRecord.	It's not intended for use from anywhere else.
3822 3823
 */
static bool
3824
ValidXLOGHeader(XLogPageHeader hdr, int emode)
3825
{
3826 3827
	XLogRecPtr	recaddr;

3828 3829
	if (hdr->xlp_magic != XLOG_PAGE_MAGIC)
	{
3830 3831 3832
		ereport(emode,
				(errmsg("invalid magic number %04X in log file %u, segment %u, offset %u",
						hdr->xlp_magic, readId, readSeg, readOff)));
3833 3834 3835 3836
		return false;
	}
	if ((hdr->xlp_info & ~XLP_ALL_FLAGS) != 0)
	{
3837 3838 3839
		ereport(emode,
				(errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
						hdr->xlp_info, readId, readSeg, readOff)));
3840 3841
		return false;
	}
3842
	if (hdr->xlp_info & XLP_LONG_HEADER)
3843
	{
3844
		XLogLongPageHeader longhdr = (XLogLongPageHeader) hdr;
B
Bruce Momjian 已提交
3845

3846
		if (longhdr->xlp_sysid != ControlFile->system_identifier)
3847
		{
3848 3849
			char		fhdrident_str[32];
			char		sysident_str[32];
3850

3851
			/*
B
Bruce Momjian 已提交
3852 3853
			 * Format sysids separately to keep platform-dependent format code
			 * out of the translatable message string.
3854 3855 3856 3857 3858 3859 3860
			 */
			snprintf(fhdrident_str, sizeof(fhdrident_str), UINT64_FORMAT,
					 longhdr->xlp_sysid);
			snprintf(sysident_str, sizeof(sysident_str), UINT64_FORMAT,
					 ControlFile->system_identifier);
			ereport(emode,
					(errmsg("WAL file is from different system"),
B
Bruce Momjian 已提交
3861 3862
					 errdetail("WAL file SYSID is %s, pg_control SYSID is %s",
							   fhdrident_str, sysident_str)));
3863 3864 3865 3866 3867 3868
			return false;
		}
		if (longhdr->xlp_seg_size != XLogSegSize)
		{
			ereport(emode,
					(errmsg("WAL file is from different system"),
B
Bruce Momjian 已提交
3869
					 errdetail("Incorrect XLOG_SEG_SIZE in page header.")));
3870 3871
			return false;
		}
3872 3873 3874 3875 3876 3877 3878
		if (longhdr->xlp_xlog_blcksz != XLOG_BLCKSZ)
		{
			ereport(emode,
					(errmsg("WAL file is from different system"),
					 errdetail("Incorrect XLOG_BLCKSZ in page header.")));
			return false;
		}
3879
	}
3880 3881 3882 3883 3884 3885 3886 3887 3888
	else if (readOff == 0)
	{
		/* hmm, first page of file doesn't have a long header? */
		ereport(emode,
				(errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
						hdr->xlp_info, readId, readSeg, readOff)));
		return false;
	}

3889 3890 3891 3892 3893 3894
	recaddr.xlogid = readId;
	recaddr.xrecoff = readSeg * XLogSegSize + readOff;
	if (!XLByteEQ(hdr->xlp_pageaddr, recaddr))
	{
		ereport(emode,
				(errmsg("unexpected pageaddr %X/%X in log file %u, segment %u, offset %u",
B
Bruce Momjian 已提交
3895
						hdr->xlp_pageaddr.xlogid, hdr->xlp_pageaddr.xrecoff,
3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916
						readId, readSeg, readOff)));
		return false;
	}

	/*
	 * Check page TLI is one of the expected values.
	 */
	if (!list_member_int(expectedTLIs, (int) hdr->xlp_tli))
	{
		ereport(emode,
				(errmsg("unexpected timeline ID %u in log file %u, segment %u, offset %u",
						hdr->xlp_tli,
						readId, readSeg, readOff)));
		return false;
	}

	/*
	 * Since child timelines are always assigned a TLI greater than their
	 * immediate parent's TLI, we should never see TLI go backwards across
	 * successive pages of a consistent WAL sequence.
	 *
B
Bruce Momjian 已提交
3917 3918 3919
	 * Of course this check should only be applied when advancing sequentially
	 * across pages; therefore ReadRecord resets lastPageTLI to zero when
	 * going to a random page.
3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936
	 */
	if (hdr->xlp_tli < lastPageTLI)
	{
		ereport(emode,
				(errmsg("out-of-sequence timeline ID %u (after %u) in log file %u, segment %u, offset %u",
						hdr->xlp_tli, lastPageTLI,
						readId, readSeg, readOff)));
		return false;
	}
	lastPageTLI = hdr->xlp_tli;
	return true;
}

/*
 * Try to read a timeline's history file.
 *
 * On success, returns the list of component TLIs: the given TLI first,
 * followed by its ancestor TLIs.  If the history file cannot be found, the
 * timeline is assumed to have no parents and a one-element list holding
 * just the specified timeline ID is returned.
 *
 * Failure to open the file for any reason other than ENOENT, or malformed
 * file contents, is reported at FATAL level.
 */
static List *
readTimeLineHistory(TimeLineID targetTLI)
{
	List	   *tlis = NIL;
	char		path[MAXPGPATH];
	char		histfname[MAXFNAMELEN];
	char		fline[MAXPGPATH];
	FILE	   *histfile;

	/* Timeline 1 does not have a history file, so no need to check */
	if (targetTLI == 1)
		return list_make1_int((int) targetTLI);

	if (InArchiveRecovery)
	{
		TLHistoryFileName(histfname, targetTLI);
		RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
	}
	else
	{
		TLHistoryFilePath(path, targetTLI);
	}

	histfile = AllocateFile(path, "r");
	if (histfile == NULL)
	{
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", path)));
		/* Not there, so assume no parents */
		return list_make1_int((int) targetTLI);
	}

	/*
	 * Parse the file, one line at a time.
	 */
	while (fgets(fline, sizeof(fline), histfile) != NULL)
	{
		char	   *ptr = fline;
		char	   *endptr;
		TimeLineID	tli;

		/* skip leading whitespace, then ignore blank lines and # comments */
		while (*ptr != '\0' && isspace((unsigned char) *ptr))
			ptr++;
		if (*ptr == '\0' || *ptr == '#')
			continue;

		/* expect a numeric timeline ID as first field of line */
		tli = (TimeLineID) strtoul(ptr, &endptr, 0);
		if (endptr == ptr)
			ereport(FATAL,
					(errmsg("syntax error in history file: %s", fline),
					 errhint("Expected a numeric timeline ID.")));

		/* each ID must exceed the one read before it (the list head) */
		if (tlis != NIL &&
			tli <= (TimeLineID) linitial_int(tlis))
			ereport(FATAL,
					(errmsg("invalid data in history file: %s", fline),
				   errhint("Timeline IDs must be in increasing sequence.")));

		/* Build list with newest item first */
		tlis = lcons_int((int) tli, tlis);

		/* we ignore the remainder of each line */
	}

	FreeFile(histfile);

	/* The target itself must be newer than every ancestor in the file */
	if (tlis != NIL &&
		targetTLI <= (TimeLineID) linitial_int(tlis))
		ereport(FATAL,
				(errmsg("invalid data in history file \"%s\"", path),
			errhint("Timeline IDs must be less than child timeline's ID.")));

	tlis = lcons_int((int) targetTLI, tlis);

	ereport(DEBUG3,
			(errmsg_internal("history of timeline %u is %s",
							 targetTLI, nodeToString(tlis))));

	return tlis;
}

/*
 * Probe whether a timeline history file exists for the given timeline ID.
 *
 * Returns true if the file could be opened for reading and false if it
 * does not exist (or if probeTLI is 1, which never has a history file).
 * Any other open failure is reported at FATAL level.
 */
static bool
existsTimeLineHistory(TimeLineID probeTLI)
{
	char		path[MAXPGPATH];
	char		histfname[MAXFNAMELEN];
	FILE	   *probe;

	/* Timeline 1 does not have a history file, so no need to check */
	if (probeTLI == 1)
		return false;

	if (InArchiveRecovery)
	{
		TLHistoryFileName(histfname, probeTLI);
		RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
	}
	else
	{
		TLHistoryFilePath(path, probeTLI);
	}

	probe = AllocateFile(path, "r");
	if (probe == NULL)
	{
		/* Only a missing file counts as "does not exist" */
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", path)));
		return false;
	}

	FreeFile(probe);
	return true;
}

/*
 * Find the newest existing timeline, assuming that startTLI exists.
 *
 * Note: while this is somewhat heuristic, it does positively guarantee
 * that (result + 1) is not a known timeline, and therefore it should
 * be safe to assign that ID to a new timeline.
 */
static TimeLineID
findNewestTimeLine(TimeLineID startTLI)
{
	TimeLineID	newest = startTLI;

	/*
	 * Probe for timeline history files one ID at a time, starting just
	 * above startTLI, and stop at the first ID that has none.  XXX is it
	 * useful to allow gaps in the sequence?
	 */
	while (existsTimeLineHistory(newest + 1))
		newest++;

	return newest;
}

/*
 * Create a new timeline history file.
 *
 *	newTLI: ID of the new timeline
 *	parentTLI: ID of its immediate parent
 *	endTLI et al: ID of the last used WAL file, for annotation purposes
 *
 * The file is assembled under a per-process temporary name, fsync'd, and
 * only then linked (or renamed) to its final name; finally it is handed to
 * XLogArchiveNotify().  Any I/O failure is reported with ereport(ERROR).
 *
 * Currently this is only used during recovery, and so there are no locking
 * considerations.  But we should be just as tense as XLogFileInit to avoid
 * emplacing a bogus file.
 */
static void
writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
					 TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
{
	char		parentpath[MAXPGPATH];
	char		finalpath[MAXPGPATH];
	char		tmppath[MAXPGPATH];
	char		histfname[MAXFNAMELEN];
	char		segname[MAXFNAMELEN];
	char		chunk[BLCKSZ];
	int			parentfd;
	int			tmpfd;
	int			len;

	Assert(newTLI > parentTLI); /* else bad selection of newTLI */

	/*
	 * Build the file under a temporary name, clearing away any leftover
	 * file of that name first.
	 */
	snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());

	unlink(tmppath);

	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
	tmpfd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL,
						  S_IRUSR | S_IWUSR);
	if (tmpfd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m", tmppath)));

	/*
	 * Work out where the parent's history file lives; during archive
	 * recovery it is fetched through RestoreArchivedFile().
	 */
	if (!InArchiveRecovery)
	{
		TLHistoryFilePath(parentpath, parentTLI);
	}
	else
	{
		TLHistoryFileName(histfname, parentTLI);
		RestoreArchivedFile(parentpath, histfname, "RECOVERYHISTORY", 0);
	}

	/*
	 * If the parent has a history file, copy it verbatim.  A missing file
	 * (ENOENT) just means the parent has no parents of its own.
	 */
	parentfd = BasicOpenFile(parentpath, O_RDONLY, 0);
	if (parentfd >= 0)
	{
		for (;;)
		{
			errno = 0;
			len = (int) read(parentfd, chunk, sizeof(chunk));
			if (len < 0 || errno != 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not read file \"%s\": %m", parentpath)));
			if (len == 0)
				break;			/* reached end of parent file */

			errno = 0;
			if ((int) write(tmpfd, chunk, len) != len)
			{
				int			save_errno = errno;

				/* Don't leave a partial file behind eating disk space */
				unlink(tmppath);

				/* if write didn't set errno, assume problem is no disk space */
				errno = save_errno ? save_errno : ENOSPC;

				ereport(ERROR,
						(errcode_for_file_access(),
					 errmsg("could not write to file \"%s\": %m", tmppath)));
			}
		}
		close(parentfd);
	}
	else if (errno != ENOENT)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", parentpath)));

	/*
	 * Append one line with the details of this timeline split.
	 *
	 * If we did have a parent file, insert an extra newline just in case the
	 * parent file failed to end with one.
	 */
	XLogFileName(segname, endTLI, endLogId, endLogSeg);

	snprintf(chunk, sizeof(chunk),
			 "%s%u\t%s\t%s transaction %u at %s\n",
			 (parentfd >= 0) ? "\n" : "",
			 parentTLI,
			 segname,
			 recoveryStopAfter ? "after" : "before",
			 recoveryStopXid,
			 timestamptz_to_str(recoveryStopTime));

	len = strlen(chunk);
	errno = 0;
	if ((int) write(tmpfd, chunk, len) != len)
	{
		int			save_errno = errno;

		/* Don't leave a partial file behind eating disk space */
		unlink(tmppath);
		/* if write didn't set errno, assume problem is no disk space */
		errno = save_errno ? save_errno : ENOSPC;

		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write to file \"%s\": %m", tmppath)));
	}

	if (pg_fsync(tmpfd) != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m", tmppath)));

	if (close(tmpfd))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", tmppath)));

	/*
	 * Now move the completed history file into place with its final name.
	 *
	 * Prefer link() to rename() here just to be really sure that we don't
	 * overwrite an existing logfile.  However, there shouldn't be one, so
	 * rename() is an acceptable substitute except for the truly paranoid.
	 */
	TLHistoryFilePath(finalpath, newTLI);

#if HAVE_WORKING_LINK
	if (link(tmppath, finalpath) < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not link file \"%s\" to \"%s\": %m",
						tmppath, finalpath)));
	unlink(tmppath);
#else
	if (rename(tmppath, finalpath) < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						tmppath, finalpath)));
#endif

	/* The history file can be archived immediately. */
	TLHistoryFileName(histfname, newTLI);
	XLogArchiveNotify(histfname);
}

/*
 * I/O routines for pg_control
4277 4278
 *
 * *ControlFile is a buffer in shared memory that holds an image of the
B
Bruce Momjian 已提交
4279
 * contents of pg_control.	WriteControlFile() initializes pg_control
4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292
 * given a preloaded buffer, ReadControlFile() loads the buffer from
 * the pg_control file (during postmaster or standalone-backend startup),
 * and UpdateControlFile() rewrites pg_control after we modify xlog state.
 *
 * For simplicity, WriteControlFile() initializes the fields of pg_control
 * that are related to checking backend/database compatibility, and
 * ReadControlFile() verifies they are correct.  We could split out the
 * I/O and compatibility-check functions, but there seems no need currently.
 */
static void
WriteControlFile(void)
{
	int			fd;
B
Bruce Momjian 已提交
4293
	char		buffer[PG_CONTROL_SIZE];		/* need not be aligned */
4294 4295

	/*
T
Tom Lane 已提交
4296
	 * Initialize version and compatibility-check fields
4297
	 */
T
Tom Lane 已提交
4298 4299
	ControlFile->pg_control_version = PG_CONTROL_VERSION;
	ControlFile->catalog_version_no = CATALOG_VERSION_NO;
4300 4301 4302 4303

	ControlFile->maxAlign = MAXIMUM_ALIGNOF;
	ControlFile->floatFormat = FLOATFORMAT_VALUE;

4304 4305
	ControlFile->blcksz = BLCKSZ;
	ControlFile->relseg_size = RELSEG_SIZE;
4306
	ControlFile->xlog_blcksz = XLOG_BLCKSZ;
4307
	ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
4308 4309

	ControlFile->nameDataLen = NAMEDATALEN;
4310
	ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
4311

4312 4313
	ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;

4314
#ifdef HAVE_INT64_TIMESTAMP
4315
	ControlFile->enableIntTimes = true;
4316
#else
4317
	ControlFile->enableIntTimes = false;
4318
#endif
4319 4320
	ControlFile->float4ByVal = FLOAT4PASSBYVAL;
	ControlFile->float8ByVal = FLOAT8PASSBYVAL;
4321

T
Tom Lane 已提交
4322
	/* Contents are protected with a CRC */
4323 4324 4325 4326 4327
	INIT_CRC32(ControlFile->crc);
	COMP_CRC32(ControlFile->crc,
			   (char *) ControlFile,
			   offsetof(ControlFileData, crc));
	FIN_CRC32(ControlFile->crc);
T
Tom Lane 已提交
4328

4329
	/*
4330 4331 4332 4333 4334
	 * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
	 * excess over sizeof(ControlFileData).  This reduces the odds of
	 * premature-EOF errors when reading pg_control.  We'll still fail when we
	 * check the contents of the file, but hopefully with a more specific
	 * error than "couldn't read pg_control".
4335
	 */
4336 4337
	if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
		elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
4338

4339
	memset(buffer, 0, PG_CONTROL_SIZE);
4340 4341
	memcpy(buffer, ControlFile, sizeof(ControlFileData));

4342 4343
	fd = BasicOpenFile(XLOG_CONTROL_FILE,
					   O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
4344
					   S_IRUSR | S_IWUSR);
4345
	if (fd < 0)
4346 4347 4348
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not create control file \"%s\": %m",
4349
						XLOG_CONTROL_FILE)));
4350

4351
	errno = 0;
4352
	if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
4353 4354 4355 4356
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
4357 4358
		ereport(PANIC,
				(errcode_for_file_access(),
4359
				 errmsg("could not write to control file: %m")));
4360
	}
4361

4362
	if (pg_fsync(fd) != 0)
4363 4364
		ereport(PANIC,
				(errcode_for_file_access(),
4365
				 errmsg("could not fsync control file: %m")));
4366

4367 4368 4369 4370
	if (close(fd))
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not close control file: %m")));
4371 4372 4373 4374 4375
}

static void
ReadControlFile(void)
{
4376
	pg_crc32	crc;
4377 4378 4379 4380 4381
	int			fd;

	/*
	 * Read data...
	 */
4382 4383 4384
	fd = BasicOpenFile(XLOG_CONTROL_FILE,
					   O_RDWR | PG_BINARY,
					   S_IRUSR | S_IWUSR);
4385
	if (fd < 0)
4386 4387 4388
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not open control file \"%s\": %m",
4389
						XLOG_CONTROL_FILE)));
4390 4391

	if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4392 4393
		ereport(PANIC,
				(errcode_for_file_access(),
4394
				 errmsg("could not read from control file: %m")));
4395 4396 4397

	close(fd);

T
Tom Lane 已提交
4398
	/*
B
Bruce Momjian 已提交
4399 4400 4401 4402
	 * Check for expected pg_control format version.  If this is wrong, the
	 * CRC check will likely fail because we'll be checking the wrong number
	 * of bytes.  Complaining about wrong version will probably be more
	 * enlightening than complaining about wrong CRC.
T
Tom Lane 已提交
4403
	 */
4404 4405 4406 4407 4408

	if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4409 4410
		 " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
			ControlFile->pg_control_version, ControlFile->pg_control_version,
4411 4412 4413
						   PG_CONTROL_VERSION, PG_CONTROL_VERSION),
				 errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));

T
Tom Lane 已提交
4414
	if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4415 4416 4417
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
B
Bruce Momjian 已提交
4418 4419
				  " but the server was compiled with PG_CONTROL_VERSION %d.",
						ControlFile->pg_control_version, PG_CONTROL_VERSION),
4420
				 errhint("It looks like you need to initdb.")));
4421

T
Tom Lane 已提交
4422
	/* Now check the CRC. */
4423 4424 4425 4426 4427
	INIT_CRC32(crc);
	COMP_CRC32(crc,
			   (char *) ControlFile,
			   offsetof(ControlFileData, crc));
	FIN_CRC32(crc);
4428

4429
	if (!EQ_CRC32(crc, ControlFile->crc))
4430
		ereport(FATAL,
4431
				(errmsg("incorrect checksum in control file")));
4432

4433
	/*
4434
	 * Do compatibility checking immediately.  If the database isn't
4435 4436
	 * compatible with the backend executable, we want to abort before we can
	 * possibly do any damage.
4437
	 */
T
Tom Lane 已提交
4438
	if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4439 4440 4441
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
B
Bruce Momjian 已提交
4442 4443
				  " but the server was compiled with CATALOG_VERSION_NO %d.",
						ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4444
				 errhint("It looks like you need to initdb.")));
4445 4446 4447
	if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
B
Bruce Momjian 已提交
4448 4449 4450 4451
		   errdetail("The database cluster was initialized with MAXALIGN %d,"
					 " but the server was compiled with MAXALIGN %d.",
					 ControlFile->maxAlign, MAXIMUM_ALIGNOF),
				 errhint("It looks like you need to initdb.")));
4452 4453 4454
	if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
P
Peter Eisentraut 已提交
4455
				 errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4456
				 errhint("It looks like you need to initdb.")));
4457
	if (ControlFile->blcksz != BLCKSZ)
4458 4459
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
B
Bruce Momjian 已提交
4460 4461 4462 4463
			 errdetail("The database cluster was initialized with BLCKSZ %d,"
					   " but the server was compiled with BLCKSZ %d.",
					   ControlFile->blcksz, BLCKSZ),
				 errhint("It looks like you need to recompile or initdb.")));
4464
	if (ControlFile->relseg_size != RELSEG_SIZE)
4465 4466
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
B
Bruce Momjian 已提交
4467 4468 4469 4470
		errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
				  " but the server was compiled with RELSEG_SIZE %d.",
				  ControlFile->relseg_size, RELSEG_SIZE),
				 errhint("It looks like you need to recompile or initdb.")));
4471 4472 4473
	if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
B
Bruce Momjian 已提交
4474 4475 4476
		errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
				  " but the server was compiled with XLOG_BLCKSZ %d.",
				  ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4477
				 errhint("It looks like you need to recompile or initdb.")));
4478 4479 4480 4481
	if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
B
Bruce Momjian 已提交
4482
					   " but the server was compiled with XLOG_SEG_SIZE %d.",
4483
						   ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
B
Bruce Momjian 已提交
4484
				 errhint("It looks like you need to recompile or initdb.")));
4485
	if (ControlFile->nameDataLen != NAMEDATALEN)
4486 4487
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
B
Bruce Momjian 已提交
4488 4489 4490 4491
		errdetail("The database cluster was initialized with NAMEDATALEN %d,"
				  " but the server was compiled with NAMEDATALEN %d.",
				  ControlFile->nameDataLen, NAMEDATALEN),
				 errhint("It looks like you need to recompile or initdb.")));
4492
	if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4493 4494
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
4495
				 errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
B
Bruce Momjian 已提交
4496
					  " but the server was compiled with INDEX_MAX_KEYS %d.",
4497
						   ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
B
Bruce Momjian 已提交
4498
				 errhint("It looks like you need to recompile or initdb.")));
4499 4500 4501 4502
	if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
B
Bruce Momjian 已提交
4503 4504
				" but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
			  ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4505
				 errhint("It looks like you need to recompile or initdb.")));
4506 4507

#ifdef HAVE_INT64_TIMESTAMP
4508
	if (ControlFile->enableIntTimes != true)
4509 4510 4511
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
B
Bruce Momjian 已提交
4512 4513
				  " but the server was compiled with HAVE_INT64_TIMESTAMP."),
				 errhint("It looks like you need to recompile or initdb.")));
4514
#else
4515
	if (ControlFile->enableIntTimes != false)
4516 4517 4518
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
B
Bruce Momjian 已提交
4519 4520
			   " but the server was compiled without HAVE_INT64_TIMESTAMP."),
				 errhint("It looks like you need to recompile or initdb.")));
4521 4522
#endif

4523 4524 4525 4526 4527
#ifdef USE_FLOAT4_BYVAL
	if (ControlFile->float4ByVal != true)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
4528
					  " but the server was compiled with USE_FLOAT4_BYVAL."),
4529 4530 4531 4532 4533
				 errhint("It looks like you need to recompile or initdb.")));
#else
	if (ControlFile->float4ByVal != false)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
4534 4535
		errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
				  " but the server was compiled without USE_FLOAT4_BYVAL."),
4536 4537 4538 4539 4540 4541 4542 4543
				 errhint("It looks like you need to recompile or initdb.")));
#endif

#ifdef USE_FLOAT8_BYVAL
	if (ControlFile->float8ByVal != true)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4544
					  " but the server was compiled with USE_FLOAT8_BYVAL."),
4545 4546 4547 4548 4549
				 errhint("It looks like you need to recompile or initdb.")));
#else
	if (ControlFile->float8ByVal != false)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
4550 4551
		errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
				  " but the server was compiled without USE_FLOAT8_BYVAL."),
4552 4553
				 errhint("It looks like you need to recompile or initdb.")));
#endif
4554 4555
}

4556
void
4557
UpdateControlFile(void)
4558
{
4559
	int			fd;
4560

4561 4562 4563 4564 4565
	INIT_CRC32(ControlFile->crc);
	COMP_CRC32(ControlFile->crc,
			   (char *) ControlFile,
			   offsetof(ControlFileData, crc));
	FIN_CRC32(ControlFile->crc);
4566

4567 4568 4569
	fd = BasicOpenFile(XLOG_CONTROL_FILE,
					   O_RDWR | PG_BINARY,
					   S_IRUSR | S_IWUSR);
4570
	if (fd < 0)
4571 4572 4573
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not open control file \"%s\": %m",
4574
						XLOG_CONTROL_FILE)));
4575

4576
	errno = 0;
4577
	if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4578 4579 4580 4581
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
4582 4583
		ereport(PANIC,
				(errcode_for_file_access(),
4584
				 errmsg("could not write to control file: %m")));
4585
	}
4586

4587
	if (pg_fsync(fd) != 0)
4588 4589
		ereport(PANIC,
				(errcode_for_file_access(),
4590
				 errmsg("could not fsync control file: %m")));
4591

4592 4593 4594 4595
	if (close(fd))
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not close control file: %m")));
4596 4597
}

4598 4599 4600 4601 4602 4603 4604 4605 4606 4607
/*
 * GetSystemIdentifier
 *
 * Return the system_identifier field of the in-memory control file image.
 * The ControlFile pointer must already have been set up (asserted).
 */
uint64
GetSystemIdentifier(void)
{
	uint64		sysid;

	Assert(ControlFile != NULL);
	sysid = ControlFile->system_identifier;

	return sysid;
}

4608
/*
T
Tom Lane 已提交
4609
 * Initialization of shared memory for XLOG
4610
 */
4611
Size
4612
XLOGShmemSize(void)
4613
{
4614
	Size		size;
4615

4616 4617 4618 4619 4620 4621 4622
	/* XLogCtl */
	size = sizeof(XLogCtlData);
	/* xlblocks array */
	size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
	/* extra alignment padding for XLOG I/O buffers */
	size = add_size(size, ALIGNOF_XLOG_BUFFER);
	/* and the buffers themselves */
4623
	size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
4624 4625

	/*
B
Bruce Momjian 已提交
4626 4627 4628
	 * Note: we don't count ControlFileData, it comes out of the "slop factor"
	 * added by CreateSharedMemoryAndSemaphores.  This lets us use this
	 * routine again below to compute the actual allocation size.
4629 4630 4631
	 */

	return size;
4632 4633 4634 4635 4636
}

void
XLOGShmemInit(void)
{
4637 4638
	bool		foundCFile,
				foundXLog;
4639
	char	   *allocptr;
4640

4641
	ControlFile = (ControlFileData *)
4642
		ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
4643 4644
	XLogCtl = (XLogCtlData *)
		ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
4645

4646
	if (foundCFile || foundXLog)
4647 4648
	{
		/* both should be present or neither */
4649
		Assert(foundCFile && foundXLog);
4650 4651
		return;
	}
4652

T
Tom Lane 已提交
4653
	memset(XLogCtl, 0, sizeof(XLogCtlData));
B
Bruce Momjian 已提交
4654

T
Tom Lane 已提交
4655
	/*
B
Bruce Momjian 已提交
4656 4657 4658
	 * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
	 * multiple of the alignment for same, so no extra alignment padding is
	 * needed here.
T
Tom Lane 已提交
4659
	 */
4660 4661
	allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
	XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
T
Tom Lane 已提交
4662
	memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
4663
	allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
B
Bruce Momjian 已提交
4664

T
Tom Lane 已提交
4665
	/*
4666
	 * Align the start of the page buffers to an ALIGNOF_XLOG_BUFFER boundary.
T
Tom Lane 已提交
4667
	 */
4668 4669
	allocptr = (char *) TYPEALIGN(ALIGNOF_XLOG_BUFFER, allocptr);
	XLogCtl->pages = allocptr;
4670
	memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
T
Tom Lane 已提交
4671 4672

	/*
B
Bruce Momjian 已提交
4673 4674
	 * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
	 * in additional info.)
T
Tom Lane 已提交
4675 4676
	 */
	XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
4677
	XLogCtl->SharedRecoveryInProgress = true;
T
Tom Lane 已提交
4678
	XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
4679
	SpinLockInit(&XLogCtl->info_lck);
T
Tom Lane 已提交
4680

4681
	/*
B
Bruce Momjian 已提交
4682 4683 4684
	 * If we are not in bootstrap mode, pg_control should already exist. Read
	 * and validate it immediately (see comments in ReadControlFile() for the
	 * reasons why).
4685 4686 4687
	 */
	if (!IsBootstrapProcessingMode())
		ReadControlFile();
4688 4689 4690
}

/*
T
Tom Lane 已提交
4691 4692
 * This func must be called ONCE on system install.  It creates pg_control
 * and the initial XLOG segment.
4693 4694
 */
void
T
Tom Lane 已提交
4695
BootStrapXLOG(void)
4696
{
4697
	CheckPoint	checkPoint;
T
Tom Lane 已提交
4698 4699
	char	   *buffer;
	XLogPageHeader page;
4700
	XLogLongPageHeader longpage;
4701
	XLogRecord *record;
B
Bruce Momjian 已提交
4702
	bool		use_existent;
4703 4704
	uint64		sysidentifier;
	struct timeval tv;
4705
	pg_crc32	crc;
4706

4707
	/*
B
Bruce Momjian 已提交
4708 4709 4710 4711 4712 4713 4714 4715 4716 4717
	 * Select a hopefully-unique system identifier code for this installation.
	 * We use the result of gettimeofday(), including the fractional seconds
	 * field, as being about as unique as we can easily get.  (Think not to
	 * use random(), since it hasn't been seeded and there's no portable way
	 * to seed it other than the system clock value...)  The upper half of the
	 * uint64 value is just the tv_sec part, while the lower half is the XOR
	 * of tv_sec and tv_usec.  This is to ensure that we don't lose uniqueness
	 * unnecessarily if "uint64" is really only 32 bits wide.  A person
	 * knowing this encoding can determine the initialization time of the
	 * installation, which could perhaps be useful sometimes.
4718 4719 4720 4721 4722
	 */
	gettimeofday(&tv, NULL);
	sysidentifier = ((uint64) tv.tv_sec) << 32;
	sysidentifier |= (uint32) (tv.tv_sec | tv.tv_usec);

4723 4724 4725
	/* First timeline ID is always 1 */
	ThisTimeLineID = 1;

4726
	/* page buffer must be aligned suitably for O_DIRECT */
4727
	buffer = (char *) palloc(XLOG_BLCKSZ + ALIGNOF_XLOG_BUFFER);
4728
	page = (XLogPageHeader) TYPEALIGN(ALIGNOF_XLOG_BUFFER, buffer);
4729
	memset(page, 0, XLOG_BLCKSZ);
T
Tom Lane 已提交
4730

4731
	/* Set up information for the initial checkpoint record */
4732
	checkPoint.redo.xlogid = 0;
4733 4734
	checkPoint.redo.xrecoff = SizeOfXLogLongPHD;
	checkPoint.ThisTimeLineID = ThisTimeLineID;
4735
	checkPoint.nextXidEpoch = 0;
4736
	checkPoint.nextXid = FirstNormalTransactionId;
4737
	checkPoint.nextOid = FirstBootstrapObjectId;
4738
	checkPoint.nextMulti = FirstMultiXactId;
4739
	checkPoint.nextMultiOffset = 0;
4740 4741
	checkPoint.oldestXid = FirstNormalTransactionId;
	checkPoint.oldestXidDB = TemplateDbOid;
4742
	checkPoint.time = (pg_time_t) time(NULL);
4743
	checkPoint.oldestActiveXid = InvalidTransactionId;
4744

4745 4746 4747
	ShmemVariableCache->nextXid = checkPoint.nextXid;
	ShmemVariableCache->nextOid = checkPoint.nextOid;
	ShmemVariableCache->oidCount = 0;
4748
	MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
4749 4750
	ShmemVariableCache->oldestXid = checkPoint.oldestXid;
	ShmemVariableCache->oldestXidDB = checkPoint.oldestXidDB;
4751

4752
	/* Set up the XLOG page header */
4753
	page->xlp_magic = XLOG_PAGE_MAGIC;
4754 4755
	page->xlp_info = XLP_LONG_HEADER;
	page->xlp_tli = ThisTimeLineID;
4756 4757
	page->xlp_pageaddr.xlogid = 0;
	page->xlp_pageaddr.xrecoff = 0;
4758 4759 4760
	longpage = (XLogLongPageHeader) page;
	longpage->xlp_sysid = sysidentifier;
	longpage->xlp_seg_size = XLogSegSize;
4761
	longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
4762 4763

	/* Insert the initial checkpoint record */
4764
	record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD);
4765
	record->xl_prev.xlogid = 0;
4766
	record->xl_prev.xrecoff = 0;
4767
	record->xl_xid = InvalidTransactionId;
4768
	record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
4769
	record->xl_len = sizeof(checkPoint);
T
Tom Lane 已提交
4770
	record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
4771
	record->xl_rmid = RM_XLOG_ID;
T
Tom Lane 已提交
4772
	memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
4773

4774 4775 4776 4777 4778
	INIT_CRC32(crc);
	COMP_CRC32(crc, &checkPoint, sizeof(checkPoint));
	COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),
			   SizeOfXLogRecord - sizeof(pg_crc32));
	FIN_CRC32(crc);
4779 4780
	record->xl_crc = crc;

4781
	/* Create first XLOG segment file */
4782 4783
	use_existent = false;
	openLogFile = XLogFileInit(0, 0, &use_existent, false);
4784

4785
	/* Write the first page with the initial record */
4786
	errno = 0;
4787
	if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
4788 4789 4790 4791
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
4792 4793
		ereport(PANIC,
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
4794
			  errmsg("could not write bootstrap transaction log file: %m")));
4795
	}
4796

T
Tom Lane 已提交
4797
	if (pg_fsync(openLogFile) != 0)
4798 4799
		ereport(PANIC,
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
4800
			  errmsg("could not fsync bootstrap transaction log file: %m")));
4801

4802 4803 4804
	if (close(openLogFile))
		ereport(PANIC,
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
4805
			  errmsg("could not close bootstrap transaction log file: %m")));
4806

T
Tom Lane 已提交
4807
	openLogFile = -1;
4808

4809 4810
	/* Now create pg_control */

4811
	memset(ControlFile, 0, sizeof(ControlFileData));
T
Tom Lane 已提交
4812
	/* Initialize pg_control status fields */
4813
	ControlFile->system_identifier = sysidentifier;
T
Tom Lane 已提交
4814 4815
	ControlFile->state = DB_SHUTDOWNED;
	ControlFile->time = checkPoint.time;
4816
	ControlFile->checkPoint = checkPoint.redo;
T
Tom Lane 已提交
4817
	ControlFile->checkPointCopy = checkPoint;
4818
	/* some additional ControlFile fields are set in WriteControlFile() */
4819

4820
	WriteControlFile();
4821 4822 4823

	/* Bootstrap the commit log, too */
	BootStrapCLOG();
4824
	BootStrapSUBTRANS();
4825
	BootStrapMultiXact();
4826

4827
	pfree(buffer);
4828 4829
}

4830
static char *
4831
str_time(pg_time_t tnow)
4832
{
4833
	static char buf[128];
4834

4835 4836 4837
	pg_strftime(buf, sizeof(buf),
				"%Y-%m-%d %H:%M:%S %Z",
				pg_localtime(&tnow, log_timezone));
4838

4839
	return buf;
4840 4841
}

4842 4843
/*
 * See if there is a recovery command file (recovery.conf), and if so
4844
 * read in parameters for archive recovery and XLOG streaming.
4845 4846 4847 4848 4849 4850 4851 4852
 *
 * XXX longer term intention is to expand this to
 * cater for additional parameters and controls
 * possibly use a flex lexer similar to the GUC one
 */
static void
readRecoveryCommandFile(void)
{
B
Bruce Momjian 已提交
4853 4854 4855 4856 4857 4858
	FILE	   *fd;
	char		cmdline[MAXPGPATH];
	TimeLineID	rtli = 0;
	bool		rtliGiven = false;
	bool		syntaxError = false;

4859
	fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
4860 4861 4862 4863 4864
	if (fd == NULL)
	{
		if (errno == ENOENT)
			return;				/* not there, so no archive recovery */
		ereport(FATAL,
B
Bruce Momjian 已提交
4865
				(errcode_for_file_access(),
4866
				 errmsg("could not open recovery command file \"%s\": %m",
4867
						RECOVERY_COMMAND_FILE)));
4868 4869 4870
	}

	ereport(LOG,
B
Bruce Momjian 已提交
4871
			(errmsg("starting archive recovery")));
4872

B
Bruce Momjian 已提交
4873 4874 4875
	/*
	 * Parse the file...
	 */
4876
	while (fgets(cmdline, sizeof(cmdline), fd) != NULL)
4877 4878
	{
		/* skip leading whitespace and check for # comment */
B
Bruce Momjian 已提交
4879 4880 4881
		char	   *ptr;
		char	   *tok1;
		char	   *tok2;
4882 4883 4884 4885 4886 4887 4888 4889 4890 4891

		for (ptr = cmdline; *ptr; ptr++)
		{
			if (!isspace((unsigned char) *ptr))
				break;
		}
		if (*ptr == '\0' || *ptr == '#')
			continue;

		/* identify the quoted parameter value */
B
Bruce Momjian 已提交
4892
		tok1 = strtok(ptr, "'");
4893 4894 4895 4896 4897
		if (!tok1)
		{
			syntaxError = true;
			break;
		}
B
Bruce Momjian 已提交
4898
		tok2 = strtok(NULL, "'");
4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911
		if (!tok2)
		{
			syntaxError = true;
			break;
		}
		/* reparse to get just the parameter name */
		tok1 = strtok(ptr, " \t=");
		if (!tok1)
		{
			syntaxError = true;
			break;
		}

B
Bruce Momjian 已提交
4912 4913
		if (strcmp(tok1, "restore_command") == 0)
		{
4914
			recoveryRestoreCommand = pstrdup(tok2);
4915
			ereport(LOG,
4916
					(errmsg("restore_command = '%s'",
4917 4918
							recoveryRestoreCommand)));
		}
4919 4920 4921 4922 4923 4924 4925
		else if (strcmp(tok1, "recovery_end_command") == 0)
		{
			recoveryEndCommand = pstrdup(tok2);
			ereport(LOG,
					(errmsg("recovery_end_command = '%s'",
							recoveryEndCommand)));
		}
B
Bruce Momjian 已提交
4926 4927
		else if (strcmp(tok1, "recovery_target_timeline") == 0)
		{
4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946
			rtliGiven = true;
			if (strcmp(tok2, "latest") == 0)
				rtli = 0;
			else
			{
				errno = 0;
				rtli = (TimeLineID) strtoul(tok2, NULL, 0);
				if (errno == EINVAL || errno == ERANGE)
					ereport(FATAL,
							(errmsg("recovery_target_timeline is not a valid number: \"%s\"",
									tok2)));
			}
			if (rtli)
				ereport(LOG,
						(errmsg("recovery_target_timeline = %u", rtli)));
			else
				ereport(LOG,
						(errmsg("recovery_target_timeline = latest")));
		}
B
Bruce Momjian 已提交
4947 4948
		else if (strcmp(tok1, "recovery_target_xid") == 0)
		{
4949 4950 4951 4952
			errno = 0;
			recoveryTargetXid = (TransactionId) strtoul(tok2, NULL, 0);
			if (errno == EINVAL || errno == ERANGE)
				ereport(FATAL,
B
Bruce Momjian 已提交
4953 4954
				 (errmsg("recovery_target_xid is not a valid number: \"%s\"",
						 tok2)));
4955 4956 4957 4958 4959 4960
			ereport(LOG,
					(errmsg("recovery_target_xid = %u",
							recoveryTargetXid)));
			recoveryTarget = true;
			recoveryTargetExact = true;
		}
B
Bruce Momjian 已提交
4961 4962
		else if (strcmp(tok1, "recovery_target_time") == 0)
		{
4963 4964 4965 4966 4967 4968 4969 4970
			/*
			 * if recovery_target_xid specified, then this overrides
			 * recovery_target_time
			 */
			if (recoveryTargetExact)
				continue;
			recoveryTarget = true;
			recoveryTargetExact = false;
B
Bruce Momjian 已提交
4971

4972
			/*
4973
			 * Convert the time string given by the user to TimestampTz form.
4974
			 */
4975 4976
			recoveryTargetTime =
				DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
B
Bruce Momjian 已提交
4977
														CStringGetDatum(tok2),
4978 4979
												ObjectIdGetDatum(InvalidOid),
														Int32GetDatum(-1)));
4980
			ereport(LOG,
4981
					(errmsg("recovery_target_time = '%s'",
4982
							timestamptz_to_str(recoveryTargetTime))));
4983
		}
B
Bruce Momjian 已提交
4984 4985
		else if (strcmp(tok1, "recovery_target_inclusive") == 0)
		{
4986 4987 4988
			/*
			 * does nothing if a recovery_target is not also set
			 */
4989
			if (!parse_bool(tok2, &recoveryTargetInclusive))
4990 4991 4992
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						 errmsg("parameter \"recovery_target_inclusive\" requires a Boolean value")));
4993 4994 4995
			ereport(LOG,
					(errmsg("recovery_target_inclusive = %s", tok2)));
		}
4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018
		else if (strcmp(tok1, "standby_mode") == 0)
		{
			if (!parse_bool(tok2, &StandbyMode))
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						 errmsg("parameter \"standby_mode\" requires a Boolean value")));
			ereport(LOG,
					(errmsg("standby_mode = '%s'", tok2)));
		}
		else if (strcmp(tok1, "primary_conninfo") == 0)
		{
			PrimaryConnInfo = pstrdup(tok2);
			ereport(LOG,
					(errmsg("primary_conninfo = '%s'",
							PrimaryConnInfo)));
		}
		else if (strcmp(tok1, "trigger_file") == 0)
		{
			TriggerFile = pstrdup(tok2);
			ereport(LOG,
					(errmsg("trigger_file = '%s'",
							TriggerFile)));
		}
5019 5020 5021 5022 5023 5024 5025 5026
		else
			ereport(FATAL,
					(errmsg("unrecognized recovery parameter \"%s\"",
							tok1)));
	}

	FreeFile(fd);

B
Bruce Momjian 已提交
5027 5028
	if (syntaxError)
		ereport(FATAL,
5029 5030
				(errmsg("syntax error in recovery command file: %s",
						cmdline),
B
Bruce Momjian 已提交
5031
			  errhint("Lines should have the format parameter = 'value'.")));
5032

5033 5034
	/* If not in standby mode, restore_command must be supplied */
	if (!StandbyMode && recoveryRestoreCommand == NULL)
5035
		ereport(FATAL,
5036
				(errmsg("recovery command file \"%s\" did not specify restore_command nor standby_mode",
5037
						RECOVERY_COMMAND_FILE)));
5038

5039 5040 5041
	/* Enable fetching from archive recovery area */
	InArchiveRecovery = true;

5042
	/*
B
Bruce Momjian 已提交
5043 5044 5045 5046
	 * If user specified recovery_target_timeline, validate it or compute the
	 * "latest" value.	We can't do this until after we've gotten the restore
	 * command and set InArchiveRecovery, because we need to fetch timeline
	 * history files from the archive.
5047
	 */
5048 5049 5050 5051 5052 5053 5054
	if (rtliGiven)
	{
		if (rtli)
		{
			/* Timeline 1 does not have a history file, all else should */
			if (rtli != 1 && !existsTimeLineHistory(rtli))
				ereport(FATAL,
5055
						(errmsg("recovery target timeline %u does not exist",
B
Bruce Momjian 已提交
5056
								rtli)));
5057 5058 5059 5060 5061 5062 5063 5064
			recoveryTargetTLI = rtli;
		}
		else
		{
			/* We start the "latest" search from pg_control's timeline */
			recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
		}
	}
5065 5066 5067 5068 5069 5070
}

/*
 * Exit archive-recovery state
 */
static void
5071
exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
5072
{
B
Bruce Momjian 已提交
5073 5074
	char		recoveryPath[MAXPGPATH];
	char		xlogpath[MAXPGPATH];
5075
	XLogRecPtr	InvalidXLogRecPtr = {0, 0};
5076 5077

	/*
5078
	 * We are no longer in archive recovery state.
5079 5080 5081
	 */
	InArchiveRecovery = false;

5082 5083 5084 5085 5086
	/*
	 * Update min recovery point one last time.
	 */
	UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);

5087
	/*
B
Bruce Momjian 已提交
5088 5089 5090
	 * We should have the ending log segment currently open.  Verify, and then
	 * close it (to avoid problems on Windows with trying to rename or delete
	 * an open file).
5091 5092 5093 5094 5095 5096 5097 5098 5099
	 */
	Assert(readFile >= 0);
	Assert(readId == endLogId);
	Assert(readSeg == endLogSeg);

	close(readFile);
	readFile = -1;

	/*
B
Bruce Momjian 已提交
5100 5101 5102 5103 5104 5105 5106
	 * If the segment was fetched from archival storage, we want to replace
	 * the existing xlog segment (if any) with the archival version.  This is
	 * because whatever is in XLOGDIR is very possibly older than what we have
	 * from the archives, since it could have come from restoring a PGDATA
	 * backup.	In any case, the archival version certainly is more
	 * descriptive of what our current database state is, because that is what
	 * we replayed from.
5107
	 *
5108 5109
	 * Note that if we are establishing a new timeline, ThisTimeLineID is
	 * already set to the new value, and so we will create a new file instead
5110 5111
	 * of overwriting any existing file.  (This is, in fact, always the case
	 * at present.)
5112
	 */
5113
	snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
5114
	XLogFilePath(xlogpath, ThisTimeLineID, endLogId, endLogSeg);
5115 5116 5117 5118 5119 5120 5121 5122 5123 5124

	if (restoredFromArchive)
	{
		ereport(DEBUG3,
				(errmsg_internal("moving last restored xlog to \"%s\"",
								 xlogpath)));
		unlink(xlogpath);		/* might or might not exist */
		if (rename(recoveryPath, xlogpath) != 0)
			ereport(FATAL,
					(errcode_for_file_access(),
5125
					 errmsg("could not rename file \"%s\" to \"%s\": %m",
5126 5127 5128 5129 5130 5131 5132 5133 5134 5135
							recoveryPath, xlogpath)));
		/* XXX might we need to fix permissions on the file? */
	}
	else
	{
		/*
		 * If the latest segment is not archival, but there's still a
		 * RECOVERYXLOG laying about, get rid of it.
		 */
		unlink(recoveryPath);	/* ignore any error */
B
Bruce Momjian 已提交
5136

5137
		/*
B
Bruce Momjian 已提交
5138 5139 5140
		 * If we are establishing a new timeline, we have to copy data from
		 * the last WAL segment of the old timeline to create a starting WAL
		 * segment for the new timeline.
5141 5142 5143 5144
		 *
		 * Notify the archiver that the last WAL segment of the old timeline
		 * is ready to copy to archival storage. Otherwise, it is not archived
		 * for a while.
5145 5146
		 */
		if (endTLI != ThisTimeLineID)
5147
		{
5148 5149
			XLogFileCopy(endLogId, endLogSeg,
						 endTLI, endLogId, endLogSeg);
5150 5151 5152 5153 5154 5155 5156

			if (XLogArchivingActive())
			{
				XLogFileName(xlogpath, endTLI, endLogId, endLogSeg);
				XLogArchiveNotify(xlogpath);
			}
		}
5157 5158 5159
	}

	/*
B
Bruce Momjian 已提交
5160 5161
	 * Let's just make real sure there are not .ready or .done flags posted
	 * for the new segment.
5162
	 */
5163 5164
	XLogFileName(xlogpath, ThisTimeLineID, endLogId, endLogSeg);
	XLogArchiveCleanup(xlogpath);
5165

5166
	/* Get rid of any remaining recovered timeline-history file, too */
5167
	snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
B
Bruce Momjian 已提交
5168
	unlink(recoveryPath);		/* ignore any error */
5169 5170

	/*
B
Bruce Momjian 已提交
5171 5172
	 * Rename the config file out of the way, so that we don't accidentally
	 * re-enter archive recovery mode in a subsequent crash.
5173
	 */
5174 5175
	unlink(RECOVERY_COMMAND_DONE);
	if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
5176 5177
		ereport(FATAL,
				(errcode_for_file_access(),
5178
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
5179
						RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190

	ereport(LOG,
			(errmsg("archive recovery complete")));
}

/*
 * For point-in-time recovery, this function decides whether we want to
 * stop applying the XLOG at or after the current record.
 *
 * record is the WAL record about to be applied.
 *
 * Returns TRUE if we are stopping, FALSE otherwise.  On TRUE return,
 * *includeThis is set TRUE if we should apply this record before stopping.
 * On FALSE return, *includeThis is not modified.
 *
 * We also track the timestamp of the latest applied COMMIT/ABORT record
 * in recoveryLastXTime, for logging purposes.  Checkpoint records update
 * recoveryLastXTime as well, but never cause recovery to stop.
 * Also, some information is saved in recoveryStopXid et al for use in
 * annotating the new timeline's history file.
 */
static bool
recoveryStopsHere(XLogRecord *record, bool *includeThis)
{
	bool		stopsHere;
	uint8		record_info;
	TimestampTz recordXtime;

	/* We only consider stopping at COMMIT or ABORT records */
	if (record->xl_rmid == RM_XACT_ID)
	{
		record_info = record->xl_info & ~XLR_INFO_MASK;
		if (record_info == XLOG_XACT_COMMIT)
		{
			xl_xact_commit *recordXactCommitData;

			recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
			recordXtime = recordXactCommitData->xact_time;
		}
		else if (record_info == XLOG_XACT_ABORT)
		{
			xl_xact_abort *recordXactAbortData;

			recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
			recordXtime = recordXactAbortData->xact_time;
		}
		else
			return false;
	}
	else if (record->xl_rmid == RM_XLOG_ID)
	{
		record_info = record->xl_info & ~XLR_INFO_MASK;
		if (record_info == XLOG_CHECKPOINT_SHUTDOWN ||
			record_info == XLOG_CHECKPOINT_ONLINE)
		{
			CheckPoint	checkPoint;

			/* copy out of the record; the payload may not be aligned */
			memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));

			/*
			 * The checkpoint's time field is a time_t-style value (it is
			 * printed with str_time() elsewhere in this file), whereas
			 * recoveryLastXTime holds TimestampTz values.  Convert instead
			 * of assigning the raw number, which would mix up the units.
			 */
			recoveryLastXTime = time_t_to_timestamptz(checkPoint.time);
		}

		/*
		 * We don't want to stop recovery on a checkpoint record, but we do
		 * want to update recoveryLastXTime. So return is unconditional.
		 */
		return false;
	}
	else
		return false;

	/* Do we have a PITR target at all? */
	if (!recoveryTarget)
	{
		recoveryLastXTime = recordXtime;
		return false;
	}

	if (recoveryTargetExact)
	{
		/*
		 * there can be only one transaction end record with this exact
		 * transactionid
		 *
		 * when testing for an xid, we MUST test for equality only, since
		 * transactions are numbered in the order they start, not the order
		 * they complete. A higher numbered xid will complete before you about
		 * 50% of the time...
		 */
		stopsHere = (record->xl_xid == recoveryTargetXid);
		if (stopsHere)
			*includeThis = recoveryTargetInclusive;
	}
	else
	{
		/*
		 * there can be many transactions that share the same commit time, so
		 * we stop after the last one, if we are inclusive, or stop at the
		 * first one if we are exclusive
		 */
		if (recoveryTargetInclusive)
			stopsHere = (recordXtime > recoveryTargetTime);
		else
			stopsHere = (recordXtime >= recoveryTargetTime);
		if (stopsHere)
			*includeThis = false;
	}

	if (stopsHere)
	{
		/* remember where and why we stopped */
		recoveryStopXid = record->xl_xid;
		recoveryStopTime = recordXtime;
		recoveryStopAfter = *includeThis;

		if (record_info == XLOG_XACT_COMMIT)
		{
			if (recoveryStopAfter)
				ereport(LOG,
						(errmsg("recovery stopping after commit of transaction %u, time %s",
								recoveryStopXid,
								timestamptz_to_str(recoveryStopTime))));
			else
				ereport(LOG,
						(errmsg("recovery stopping before commit of transaction %u, time %s",
								recoveryStopXid,
								timestamptz_to_str(recoveryStopTime))));
		}
		else
		{
			if (recoveryStopAfter)
				ereport(LOG,
						(errmsg("recovery stopping after abort of transaction %u, time %s",
								recoveryStopXid,
								timestamptz_to_str(recoveryStopTime))));
			else
				ereport(LOG,
						(errmsg("recovery stopping before abort of transaction %u, time %s",
								recoveryStopXid,
								timestamptz_to_str(recoveryStopTime))));
		}

		/* the record counts as applied only if we stop after it */
		if (recoveryStopAfter)
			recoveryLastXTime = recordXtime;
	}
	else
		recoveryLastXTime = recordXtime;

	return stopsHere;
}

5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385
/*
 * Returns bool with current recovery mode, a global state.
 */
Datum
pg_is_in_recovery(PG_FUNCTION_ARGS)
{
	PG_RETURN_BOOL(RecoveryInProgress());
}

/*
 * Returns timestamp of last recovered commit/abort record.
 */
TimestampTz
GetLatestXLogTime(void)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

	SpinLockAcquire(&xlogctl->info_lck);
	recoveryLastXTime = xlogctl->recoveryLastXTime;
	SpinLockRelease(&xlogctl->info_lck);

	return recoveryLastXTime;
}

/*
 * RecoveryRequiresIntParameter
 *		Report an ERROR if currValue is lower than checkpointValue.
 *
 * param_name is the name of the parameter being checked.  Note that the
 * text field supplied is a parameter name and does not require translation.
 * currValue is this server's setting and checkpointValue is the value it is
 * compared against.  Both are expected to be ints (as the macro name
 * indicates), hence the %d conversions in the message.
 *
 * The body is wrapped in do { ... } while (0) so that an invocation followed
 * by a semicolon is exactly one statement; this keeps the macro safe inside
 * an unbraced if/else.  The arguments are parenthesized, but each one can be
 * evaluated more than once, so pass only expressions without side effects.
 */
#define RecoveryRequiresIntParameter(param_name, currValue, checkpointValue) \
do { \
	if ((currValue) < (checkpointValue)) \
		ereport(ERROR, \
			(errmsg("recovery connections cannot continue because " \
					"%s = %d is a lower setting than on WAL source server (value was %d)", \
					(param_name), \
					(currValue), \
					(checkpointValue)))); \
} while (0)

/*
 * CheckRequiredParameterValues
 *		Compare this server's settings against the values carried in the
 *		given checkpoint, for various aspects of recovery operation.
 *
 * An ERROR is reported if max_connections, max_prepared_xacts or
 * max_locks_per_xact is lower here than in the checkpoint, or if the
 * checkpoint's XLogStandbyInfoMode flag is not set.  The checks run in that
 * order, so the first failing one determines the message.
 */
static void
CheckRequiredParameterValues(CheckPoint checkPoint)
{
	/* We ignore autovacuum_max_workers when we make this test. */
	RecoveryRequiresIntParameter("max_connections",
								 MaxConnections,
								 checkPoint.MaxConnections);

	RecoveryRequiresIntParameter("max_prepared_xacts",
								 max_prepared_xacts,
								 checkPoint.max_prepared_xacts);

	RecoveryRequiresIntParameter("max_locks_per_xact",
								 max_locks_per_xact,
								 checkPoint.max_locks_per_xact);

	/* Nothing more to complain about if the flag is set in the checkpoint */
	if (checkPoint.XLogStandbyInfoMode)
		return;

	ereport(ERROR,
			(errmsg("recovery connections cannot start because the recovery_connections "
					"parameter is disabled on the WAL source server")));
}

5386
/*
T
Tom Lane 已提交
5387
 * This must be called ONCE during postmaster or standalone-backend startup
5388 5389
 */
void
T
Tom Lane 已提交
5390
StartupXLOG(void)
5391
{
5392 5393
	XLogCtlInsert *Insert;
	CheckPoint	checkPoint;
T
Tom Lane 已提交
5394
	bool		wasShutdown;
5395
	bool		reachedStopPoint = false;
5396
	bool		haveBackupLabel = false;
5397
	XLogRecPtr	RecPtr,
T
Tom Lane 已提交
5398 5399
				checkPointLoc,
				EndOfLog;
5400 5401
	uint32		endLogId;
	uint32		endLogSeg;
5402
	XLogRecord *record;
5403
	uint32		freespace;
5404
	TransactionId oldestActiveXID;
5405
	bool		bgwriterLaunched = false;
5406
	bool		backendsAllowed = false;
5407

5408
	/*
5409 5410
	 * Read control file and check XLOG status looks valid.
	 *
5411 5412
	 * Note: in most control paths, *ControlFile is already valid and we need
	 * not do ReadControlFile() here, but might as well do it to be sure.
5413
	 */
5414
	ReadControlFile();
5415

5416
	if (ControlFile->state < DB_SHUTDOWNED ||
5417
		ControlFile->state > DB_IN_PRODUCTION ||
5418
		!XRecOffIsValid(ControlFile->checkPoint.xrecoff))
5419 5420
		ereport(FATAL,
				(errmsg("control file contains invalid data")));
5421 5422

	if (ControlFile->state == DB_SHUTDOWNED)
5423 5424 5425
		ereport(LOG,
				(errmsg("database system was shut down at %s",
						str_time(ControlFile->time))));
5426
	else if (ControlFile->state == DB_SHUTDOWNING)
5427
		ereport(LOG,
5428
				(errmsg("database system shutdown was interrupted; last known up at %s",
5429
						str_time(ControlFile->time))));
5430
	else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
5431
		ereport(LOG,
B
Bruce Momjian 已提交
5432 5433 5434 5435
		   (errmsg("database system was interrupted while in recovery at %s",
				   str_time(ControlFile->time)),
			errhint("This probably means that some data is corrupted and"
					" you will have to use the last backup for recovery.")));
5436 5437
	else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
		ereport(LOG,
B
Bruce Momjian 已提交
5438 5439
				(errmsg("database system was interrupted while in recovery at log time %s",
						str_time(ControlFile->checkPointCopy.time)),
5440
				 errhint("If this has occurred more than once some data might be corrupted"
B
Bruce Momjian 已提交
5441
			  " and you might need to choose an earlier recovery target.")));
5442
	else if (ControlFile->state == DB_IN_PRODUCTION)
5443
		ereport(LOG,
B
Bruce Momjian 已提交
5444 5445
			  (errmsg("database system was interrupted; last known up at %s",
					  str_time(ControlFile->time))));
5446

5447 5448
	/* This is just to allow attaching to startup process with a debugger */
#ifdef XLOG_REPLAY_DELAY
5449
	if (ControlFile->state != DB_SHUTDOWNED)
5450
		pg_usleep(60000000L);
5451 5452
#endif

5453 5454
	/*
	 * Verify that pg_xlog and pg_xlog/archive_status exist.  In cases where
5455 5456
	 * someone has performed a copy for PITR, these directories may have been
	 * excluded and need to be re-created.
5457 5458 5459
	 */
	ValidateXLOGDirectoryStructure();

5460 5461 5462 5463 5464 5465 5466 5467 5468 5469
	/*
	 * Clear out any old relcache cache files.  This is *necessary* if we
	 * do any WAL replay, since that would probably result in the cache files
	 * being out of sync with database reality.  In theory we could leave
	 * them in place if the database had been cleanly shut down, but it
	 * seems safest to just remove them always and let them be rebuilt
	 * during the first backend startup.
	 */
	RelationCacheInitFileRemove();

5470
	/*
B
Bruce Momjian 已提交
5471 5472
	 * Initialize on the assumption we want to recover to the same timeline
	 * that's active according to pg_control.
5473 5474 5475
	 */
	recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;

5476
	/*
B
Bruce Momjian 已提交
5477 5478
	 * Check for recovery control file, and if so set up state for offline
	 * recovery
5479 5480 5481
	 */
	readRecoveryCommandFile();

5482 5483 5484
	/* Now we can determine the list of expected TLIs */
	expectedTLIs = readTimeLineHistory(recoveryTargetTLI);

5485 5486 5487 5488 5489 5490
	/*
	 * If pg_control's timeline is not in expectedTLIs, then we cannot
	 * proceed: the backup is not part of the history of the requested
	 * timeline.
	 */
	if (!list_member_int(expectedTLIs,
B
Bruce Momjian 已提交
5491
						 (int) ControlFile->checkPointCopy.ThisTimeLineID))
5492 5493 5494 5495 5496
		ereport(FATAL,
				(errmsg("requested timeline %u is not a child of database system timeline %u",
						recoveryTargetTLI,
						ControlFile->checkPointCopy.ThisTimeLineID)));

5497 5498 5499
	/* Save the selected recovery target timeline ID in shared memory */
	XLogCtl->RecoveryTargetTLI = recoveryTargetTLI;

5500
	if (read_backup_label(&checkPointLoc))
T
Tom Lane 已提交
5501
	{
5502
		/*
B
Bruce Momjian 已提交
5503 5504
		 * When a backup_label file is present, we want to roll forward from
		 * the checkpoint it identifies, rather than using pg_control.
5505
		 */
5506
		record = ReadCheckpointRecord(checkPointLoc, 0);
5507 5508
		if (record != NULL)
		{
5509
			ereport(DEBUG1,
5510
					(errmsg("checkpoint record is at %X/%X",
B
Bruce Momjian 已提交
5511
							checkPointLoc.xlogid, checkPointLoc.xrecoff)));
5512 5513 5514 5515 5516
			InRecovery = true;	/* force recovery even if SHUTDOWNED */
		}
		else
		{
			ereport(PANIC,
B
Bruce Momjian 已提交
5517 5518
					(errmsg("could not locate required checkpoint record"),
					 errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
5519
		}
5520 5521
		/* set flag to delete it later */
		haveBackupLabel = true;
T
Tom Lane 已提交
5522 5523 5524
	}
	else
	{
5525
		/*
B
Bruce Momjian 已提交
5526 5527
		 * Get the last valid checkpoint record.  If the latest one according
		 * to pg_control is broken, try the next-to-last one.
5528 5529
		 */
		checkPointLoc = ControlFile->checkPoint;
5530
		RedoStartLSN = ControlFile->checkPointCopy.redo;
5531
		record = ReadCheckpointRecord(checkPointLoc, 1);
T
Tom Lane 已提交
5532 5533
		if (record != NULL)
		{
5534
			ereport(DEBUG1,
5535
					(errmsg("checkpoint record is at %X/%X",
B
Bruce Momjian 已提交
5536
							checkPointLoc.xlogid, checkPointLoc.xrecoff)));
T
Tom Lane 已提交
5537
		}
5538
		else if (StandbyMode)
5539 5540 5541 5542 5543 5544 5545 5546
		{
			/*
			 * The last valid checkpoint record required for a streaming
			 * recovery exists in neither standby nor the primary.
			 */
			ereport(PANIC,
					(errmsg("could not locate a valid checkpoint record")));
		}
T
Tom Lane 已提交
5547
		else
5548 5549
		{
			checkPointLoc = ControlFile->prevCheckPoint;
5550
			record = ReadCheckpointRecord(checkPointLoc, 2);
5551 5552 5553
			if (record != NULL)
			{
				ereport(LOG,
B
Bruce Momjian 已提交
5554 5555 5556
						(errmsg("using previous checkpoint record at %X/%X",
							  checkPointLoc.xlogid, checkPointLoc.xrecoff)));
				InRecovery = true;		/* force recovery even if SHUTDOWNED */
5557 5558 5559
			}
			else
				ereport(PANIC,
B
Bruce Momjian 已提交
5560
					 (errmsg("could not locate a valid checkpoint record")));
5561
		}
T
Tom Lane 已提交
5562
	}
5563

T
Tom Lane 已提交
5564 5565 5566
	LastRec = RecPtr = checkPointLoc;
	memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
	wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
5567

5568
	ereport(DEBUG1,
B
Bruce Momjian 已提交
5569 5570 5571
			(errmsg("redo record is at %X/%X; shutdown %s",
					checkPoint.redo.xlogid, checkPoint.redo.xrecoff,
					wasShutdown ? "TRUE" : "FALSE")));
5572
	ereport(DEBUG1,
5573 5574 5575
			(errmsg("next transaction ID: %u/%u; next OID: %u",
					checkPoint.nextXidEpoch, checkPoint.nextXid,
					checkPoint.nextOid)));
5576
	ereport(DEBUG1,
5577 5578
			(errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
					checkPoint.nextMulti, checkPoint.nextMultiOffset)));
5579 5580 5581
	ereport(DEBUG1,
			(errmsg("oldest unfrozen transaction ID: %u, in database %u",
					checkPoint.oldestXid, checkPoint.oldestXidDB)));
5582
	if (!TransactionIdIsNormal(checkPoint.nextXid))
5583
		ereport(PANIC,
5584
				(errmsg("invalid next transaction ID")));
5585 5586 5587

	ShmemVariableCache->nextXid = checkPoint.nextXid;
	ShmemVariableCache->nextOid = checkPoint.nextOid;
5588
	ShmemVariableCache->oidCount = 0;
5589
	MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5590 5591
	ShmemVariableCache->oldestXid = checkPoint.oldestXid;
	ShmemVariableCache->oldestXidDB = checkPoint.oldestXidDB;
5592

5593
	/*
B
Bruce Momjian 已提交
5594 5595 5596
	 * We must replay WAL entries using the same TimeLineID they were created
	 * under, so temporarily adopt the TLI indicated by the checkpoint (see
	 * also xlog_redo()).
5597
	 */
5598
	ThisTimeLineID = checkPoint.ThisTimeLineID;
5599

5600
	RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
V
WAL  
Vadim B. Mikheev 已提交
5601

5602
	if (XLByteLT(RecPtr, checkPoint.redo))
5603 5604
		ereport(PANIC,
				(errmsg("invalid redo in checkpoint record")));
5605

5606
	/*
B
Bruce Momjian 已提交
5607
	 * Check whether we need to force recovery from WAL.  If it appears to
B
Bruce Momjian 已提交
5608 5609
	 * have been a clean shutdown and we did not have a recovery.conf file,
	 * then assume no recovery needed.
5610
	 */
5611
	if (XLByteLT(checkPoint.redo, RecPtr))
5612
	{
T
Tom Lane 已提交
5613
		if (wasShutdown)
5614
			ereport(PANIC,
B
Bruce Momjian 已提交
5615
					(errmsg("invalid redo record in shutdown checkpoint")));
V
WAL  
Vadim B. Mikheev 已提交
5616
		InRecovery = true;
5617 5618
	}
	else if (ControlFile->state != DB_SHUTDOWNED)
V
WAL  
Vadim B. Mikheev 已提交
5619
		InRecovery = true;
5620 5621 5622 5623 5624
	else if (InArchiveRecovery)
	{
		/* force recovery due to presence of recovery.conf */
		InRecovery = true;
	}
5625

V
WAL  
Vadim B. Mikheev 已提交
5626
	/* REDO */
5627
	if (InRecovery)
5628
	{
B
Bruce Momjian 已提交
5629
		int			rmid;
5630

5631
		/*
B
Bruce Momjian 已提交
5632 5633 5634 5635
		 * Update pg_control to show that we are recovering and to show the
		 * selected checkpoint as the place we are starting from. We also mark
		 * pg_control with any minimum recovery stop point obtained from a
		 * backup history file.
5636
		 */
5637
		if (InArchiveRecovery)
5638
		{
5639
			ereport(LOG,
5640
					(errmsg("automatic recovery in progress")));
5641 5642
			ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
		}
5643
		else
5644
		{
5645
			ereport(LOG,
5646 5647
					(errmsg("database system was not properly shut down; "
							"automatic recovery in progress")));
5648 5649 5650 5651 5652
			ControlFile->state = DB_IN_CRASH_RECOVERY;
		}
		ControlFile->prevCheckPoint = ControlFile->checkPoint;
		ControlFile->checkPoint = checkPointLoc;
		ControlFile->checkPointCopy = checkPoint;
5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664
		if (InArchiveRecovery)
		{
			/* initialize minRecoveryPoint if not set yet */
			if (XLByteLT(ControlFile->minRecoveryPoint, checkPoint.redo))
				ControlFile->minRecoveryPoint = checkPoint.redo;
		}
		/*
		 * set backupStartPoint if we're starting archive recovery from a
		 * base backup
		 */
		if (haveBackupLabel)
			ControlFile->backupStartPoint = checkPoint.redo;
5665
		ControlFile->time = (pg_time_t) time(NULL);
5666
		/* No need to hold ControlFileLock yet, we aren't up far enough */
5667 5668
		UpdateControlFile();

5669
		/* initialize our local copy of minRecoveryPoint */
5670 5671 5672 5673 5674 5675 5676
		minRecoveryPoint = ControlFile->minRecoveryPoint;

		/*
		 * Reset pgstat data, because it may be invalid after recovery.
		 */
		pgstat_reset_all();

5677
		/*
B
Bruce Momjian 已提交
5678 5679 5680 5681 5682 5683
		 * If there was a backup label file, it's done its job and the info
		 * has now been propagated into pg_control.  We must get rid of the
		 * label file so that if we crash during recovery, we'll pick up at
		 * the latest recovery restartpoint instead of going all the way back
		 * to the backup start point.  It seems prudent though to just rename
		 * the file out of the way rather than delete it completely.
5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694
		 */
		if (haveBackupLabel)
		{
			unlink(BACKUP_LABEL_OLD);
			if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
				ereport(FATAL,
						(errcode_for_file_access(),
						 errmsg("could not rename file \"%s\" to \"%s\": %m",
								BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
		}

5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726
		/*
		 * Initialize recovery connections, if enabled. We won't let backends
		 * in yet, not until we've reached the min recovery point specified
		 * in control file and we've established a recovery snapshot from a
		 * running-xacts WAL record.
		 */
		if (InArchiveRecovery && XLogRequestRecoveryConnections)
		{
			TransactionId *xids;
			int nxids;

			CheckRequiredParameterValues(checkPoint);

			ereport(LOG,
				(errmsg("initializing recovery connections")));

			InitRecoveryTransactionEnvironment();

			if (wasShutdown)
				oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
			else
				oldestActiveXID = checkPoint.oldestActiveXid;
			Assert(TransactionIdIsValid(oldestActiveXID));

			/* Startup commit log and related stuff */
			StartupCLOG();
			StartupSUBTRANS(oldestActiveXID);
			StartupMultiXact();

			ProcArrayInitRecoveryInfo(oldestActiveXID);
		}

5727
		/* Initialize resource managers */
5728 5729 5730 5731 5732 5733
		for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
		{
			if (RmgrTable[rmid].rm_startup != NULL)
				RmgrTable[rmid].rm_startup();
		}

5734
		/*
B
Bruce Momjian 已提交
5735 5736
		 * Find the first record that logically follows the checkpoint --- it
		 * might physically precede it, though.
5737
		 */
5738
		if (XLByteLT(checkPoint.redo, RecPtr))
5739 5740
		{
			/* back up to find the record */
5741
			record = ReadRecord(&(checkPoint.redo), PANIC, false);
5742
		}
B
Bruce Momjian 已提交
5743
		else
5744
		{
5745
			/* just have to read next record after CheckPoint */
5746
			record = ReadRecord(NULL, LOG, false);
5747
		}
5748

T
Tom Lane 已提交
5749
		if (record != NULL)
5750
		{
5751 5752
			bool		recoveryContinue = true;
			bool		recoveryApply = true;
5753
			bool		reachedMinRecoveryPoint = false;
B
Bruce Momjian 已提交
5754
			ErrorContextCallback errcontext;
5755

5756 5757 5758
			/* use volatile pointer to prevent code rearrangement */
			volatile XLogCtlData *xlogctl = XLogCtl;

5759
			/* initialize shared replayEndRecPtr and recoveryLastRecPtr */
5760 5761
			SpinLockAcquire(&xlogctl->info_lck);
			xlogctl->replayEndRecPtr = ReadRecPtr;
5762
			xlogctl->recoveryLastRecPtr = ReadRecPtr;
5763
			SpinLockRelease(&xlogctl->info_lck);
5764

V
WAL  
Vadim B. Mikheev 已提交
5765
			InRedo = true;
5766

5767 5768 5769
			ereport(LOG,
					(errmsg("redo starts at %X/%X",
							ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
5770 5771 5772 5773 5774 5775 5776 5777 5778 5779

			/*
			 * Let postmaster know we've started redo now, so that it can
			 * launch bgwriter to perform restartpoints.  We don't bother
			 * during crash recovery as restartpoints can only be performed
			 * during archive recovery.  And we'd like to keep crash recovery
			 * simple, to avoid introducing bugs that could prevent you from
			 * recovering after crash.
			 *
			 * After this point, we can no longer assume that we're the only
5780 5781
			 * process in addition to postmaster!  Also, fsync requests are
			 * subsequently to be handled by the bgwriter, not locally.
5782 5783
			 */
			if (InArchiveRecovery && IsUnderPostmaster)
5784 5785
			{
				SetForwardFsyncRequests();
5786
				SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
5787 5788
				bgwriterLaunched = true;
			}
5789 5790 5791 5792

			/*
			 * main redo apply loop
			 */
5793 5794
			do
			{
5795
#ifdef WAL_DEBUG
5796 5797 5798
				if (XLOG_DEBUG ||
					(rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
					(rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
V
WAL  
Vadim B. Mikheev 已提交
5799
				{
B
Bruce Momjian 已提交
5800
					StringInfoData buf;
V
WAL  
Vadim B. Mikheev 已提交
5801

5802 5803
					initStringInfo(&buf);
					appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
B
Bruce Momjian 已提交
5804 5805
									 ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
									 EndRecPtr.xlogid, EndRecPtr.xrecoff);
5806 5807 5808 5809
					xlog_outrec(&buf, record);
					appendStringInfo(&buf, " - ");
					RmgrTable[record->xl_rmid].rm_desc(&buf,
													   record->xl_info,
B
Bruce Momjian 已提交
5810
													 XLogRecGetData(record));
5811 5812
					elog(LOG, "%s", buf.data);
					pfree(buf.data);
V
WAL  
Vadim B. Mikheev 已提交
5813
				}
5814
#endif
V
WAL  
Vadim B. Mikheev 已提交
5815

5816 5817
				/* Handle interrupt signals of startup process */
				HandleStartupProcInterrupts();
5818

5819
				/*
5820
				 * Have we passed our safe starting point?
5821
				 */
5822
				if (!reachedMinRecoveryPoint &&
5823 5824
					XLByteLE(minRecoveryPoint, EndRecPtr) &&
					XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
5825 5826 5827 5828 5829 5830
				{
					reachedMinRecoveryPoint = true;
					ereport(LOG,
							(errmsg("consistent recovery state reached at %X/%X",
									EndRecPtr.xlogid, EndRecPtr.xrecoff)));
				}
5831 5832

				/*
5833 5834 5835
				 * Have we got a valid starting snapshot that will allow
				 * queries to be run? If so, we can tell postmaster that
				 * the database is consistent now, enabling connections.
5836
				 */
5837 5838 5839 5840
				if (standbyState == STANDBY_SNAPSHOT_READY &&
					!backendsAllowed &&
					reachedMinRecoveryPoint &&
					IsUnderPostmaster)
5841
				{
5842 5843
					backendsAllowed = true;
					SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
5844 5845
				}

5846 5847 5848 5849 5850
				/*
				 * Have we reached our recovery target?
				 */
				if (recoveryStopsHere(record, &recoveryApply))
				{
B
Bruce Momjian 已提交
5851
					reachedStopPoint = true;	/* see below */
5852 5853 5854 5855 5856
					recoveryContinue = false;
					if (!recoveryApply)
						break;
				}

5857 5858 5859 5860 5861 5862
				/* Setup error traceback support for ereport() */
				errcontext.callback = rm_redo_error_callback;
				errcontext.arg = (void *) record;
				errcontext.previous = error_context_stack;
				error_context_stack = &errcontext;

5863 5864
				/* nextXid must be beyond record's xid */
				if (TransactionIdFollowsOrEquals(record->xl_xid,
B
Bruce Momjian 已提交
5865
												 ShmemVariableCache->nextXid))
5866 5867 5868 5869 5870
				{
					ShmemVariableCache->nextXid = record->xl_xid;
					TransactionIdAdvance(ShmemVariableCache->nextXid);
				}

5871
				/*
5872 5873
				 * Update shared replayEndRecPtr before replaying this record,
				 * so that XLogFlush will update minRecoveryPoint correctly.
5874 5875 5876
				 */
				SpinLockAcquire(&xlogctl->info_lck);
				xlogctl->replayEndRecPtr = EndRecPtr;
5877
				xlogctl->recoveryLastXTime = recoveryLastXTime;
5878 5879
				SpinLockRelease(&xlogctl->info_lck);

5880 5881 5882 5883
				/* In Hot Standby mode, keep track of XIDs we've seen */
				if (InHotStandby && TransactionIdIsValid(record->xl_xid))
					RecordKnownAssignedTransactionIds(record->xl_xid);

5884
				RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
5885

5886 5887 5888
				/* Pop the error context stack */
				error_context_stack = errcontext.previous;

5889 5890 5891 5892 5893 5894 5895 5896
				/*
				 * Update shared recoveryLastRecPtr after this record has been
				 * replayed.
				 */
				SpinLockAcquire(&xlogctl->info_lck);
				xlogctl->recoveryLastRecPtr = EndRecPtr;
				SpinLockRelease(&xlogctl->info_lck);

5897 5898
				LastRec = ReadRecPtr;

5899
				record = ReadRecord(NULL, LOG, false);
5900
			} while (record != NULL && recoveryContinue);
B
Bruce Momjian 已提交
5901

5902 5903 5904 5905
			/*
			 * end of main redo apply loop
			 */

5906 5907 5908
			ereport(LOG,
					(errmsg("redo done at %X/%X",
							ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
5909 5910
			if (recoveryLastXTime)
				ereport(LOG,
B
Bruce Momjian 已提交
5911 5912
					 (errmsg("last completed transaction was at log time %s",
							 timestamptz_to_str(recoveryLastXTime))));
V
WAL  
Vadim B. Mikheev 已提交
5913
			InRedo = false;
5914 5915
		}
		else
5916 5917
		{
			/* there are no WAL records following the checkpoint */
5918 5919
			ereport(LOG,
					(errmsg("redo is not required")));
5920
		}
V
WAL  
Vadim B. Mikheev 已提交
5921 5922
	}

5923 5924 5925 5926 5927 5928 5929 5930 5931 5932
	/*
	 * If we launched a WAL receiver, it should be gone by now. It will trump
	 * over the startup checkpoint and subsequent records if it's still alive,
	 * so be extra sure that it's gone.
	 */
	if (WalRcvInProgress())
		elog(PANIC, "wal receiver still active");

	/*
	 * We are now done reading the xlog from stream. Turn off streaming
5933 5934 5935
	 * recovery to force fetching the files (which would be required
	 * at end of recovery, e.g., timeline history file) from archive or
	 * pg_xlog.
5936
	 */
5937
	StandbyMode = false;
5938

T
Tom Lane 已提交
5939
	/*
B
Bruce Momjian 已提交
5940 5941
	 * Re-fetch the last valid or last applied record, so we can identify the
	 * exact endpoint of what we consider the valid portion of WAL.
T
Tom Lane 已提交
5942
	 */
5943
	record = ReadRecord(&LastRec, PANIC, false);
T
Tom Lane 已提交
5944
	EndOfLog = EndRecPtr;
5945 5946
	XLByteToPrevSeg(EndOfLog, endLogId, endLogSeg);

5947 5948
	/*
	 * Complain if we did not roll forward far enough to render the backup
5949 5950 5951 5952
	 * dump consistent.  Note: it is indeed okay to look at the local variable
	 * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
	 * be further ahead --- ControlFile->minRecoveryPoint cannot have been
	 * advanced beyond the WAL we processed.
5953
	 */
5954 5955 5956
	if (InArchiveRecovery &&
		(XLByteLT(EndOfLog, minRecoveryPoint) ||
		 !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
5957
	{
5958
		if (reachedStopPoint)	/* stopped because of stop request */
5959
			ereport(FATAL,
5960
					(errmsg("requested recovery stop point is before consistent recovery point")));
B
Bruce Momjian 已提交
5961
		else	/* ran off end of WAL */
5962
			ereport(FATAL,
5963
					(errmsg("WAL ends before consistent recovery point")));
5964 5965
	}

5966 5967 5968
	/*
	 * Consider whether we need to assign a new timeline ID.
	 *
B
Bruce Momjian 已提交
5969 5970
	 * If we are doing an archive recovery, we always assign a new ID.	This
	 * handles a couple of issues.	If we stopped short of the end of WAL
5971 5972
	 * during recovery, then we are clearly generating a new timeline and must
	 * assign it a unique new ID.  Even if we ran to the end, modifying the
B
Bruce Momjian 已提交
5973 5974
	 * current last segment is problematic because it may result in trying to
	 * overwrite an already-archived copy of that segment, and we encourage
5975 5976 5977 5978
	 * DBAs to make their archive_commands reject that.  We can dodge the
	 * problem by making the new active segment have a new timeline ID.
	 *
	 * In a normal crash recovery, we can just extend the timeline we were in.
5979
	 */
5980
	if (InArchiveRecovery)
5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991
	{
		ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
		ereport(LOG,
				(errmsg("selected new timeline ID: %u", ThisTimeLineID)));
		writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
							 curFileTLI, endLogId, endLogSeg);
	}

	/* Save the selected TimeLineID in shared memory, too */
	XLogCtl->ThisTimeLineID = ThisTimeLineID;

5992
	/*
B
Bruce Momjian 已提交
5993 5994 5995 5996
	 * We are now done reading the old WAL.  Turn off archive fetching if it
	 * was active, and make a writable copy of the last WAL segment. (Note
	 * that we also have a copy of the last block of the old WAL in readBuf;
	 * we will use that below.)
5997 5998
	 */
	if (InArchiveRecovery)
5999
		exitArchiveRecovery(curFileTLI, endLogId, endLogSeg);
6000 6001 6002 6003 6004 6005 6006 6007

	/*
	 * Prepare to write WAL starting at EndOfLog position, and init xlog
	 * buffer cache using the block containing the last record from the
	 * previous incarnation.
	 */
	openLogId = endLogId;
	openLogSeg = endLogSeg;
6008
	openLogFile = XLogFileOpen(openLogId, openLogSeg);
T
Tom Lane 已提交
6009
	openLogOff = 0;
V
WAL  
Vadim B. Mikheev 已提交
6010
	Insert = &XLogCtl->Insert;
6011
	Insert->PrevRecord = LastRec;
6012 6013
	XLogCtl->xlblocks[0].xlogid = openLogId;
	XLogCtl->xlblocks[0].xrecoff =
6014
		((EndOfLog.xrecoff - 1) / XLOG_BLCKSZ + 1) * XLOG_BLCKSZ;
B
Bruce Momjian 已提交
6015 6016

	/*
B
Bruce Momjian 已提交
6017 6018 6019
	 * Tricky point here: readBuf contains the *last* block that the LastRec
	 * record spans, not the one it starts in.	The last block is indeed the
	 * one we want to use.
T
Tom Lane 已提交
6020
	 */
6021 6022
	Assert(readOff == (XLogCtl->xlblocks[0].xrecoff - XLOG_BLCKSZ) % XLogSegSize);
	memcpy((char *) Insert->currpage, readBuf, XLOG_BLCKSZ);
6023
	Insert->currpos = (char *) Insert->currpage +
6024
		(EndOfLog.xrecoff + XLOG_BLCKSZ - XLogCtl->xlblocks[0].xrecoff);
V
WAL  
Vadim B. Mikheev 已提交
6025

T
Tom Lane 已提交
6026
	LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
V
WAL  
Vadim B. Mikheev 已提交
6027

T
Tom Lane 已提交
6028 6029 6030
	XLogCtl->Write.LogwrtResult = LogwrtResult;
	Insert->LogwrtResult = LogwrtResult;
	XLogCtl->LogwrtResult = LogwrtResult;
V
WAL  
Vadim B. Mikheev 已提交
6031

T
Tom Lane 已提交
6032 6033
	XLogCtl->LogwrtRqst.Write = EndOfLog;
	XLogCtl->LogwrtRqst.Flush = EndOfLog;
6034

6035 6036 6037 6038 6039 6040 6041 6042 6043 6044
	freespace = INSERT_FREESPACE(Insert);
	if (freespace > 0)
	{
		/* Make sure rest of page is zero */
		MemSet(Insert->currpos, 0, freespace);
		XLogCtl->Write.curridx = 0;
	}
	else
	{
		/*
B
Bruce Momjian 已提交
6045 6046
		 * Whenever Write.LogwrtResult points to exactly the end of a page,
		 * Write.curridx must point to the *next* page (see XLogWrite()).
6047
		 *
B
Bruce Momjian 已提交
6048
		 * Note: it might seem we should do AdvanceXLInsertBuffer() here, but
B
Bruce Momjian 已提交
6049
		 * this is sufficient.	The first actual attempt to insert a log
6050
		 * record will advance the insert state.
6051 6052 6053 6054
		 */
		XLogCtl->Write.curridx = NextBufIdx(0);
	}

6055
	/* Pre-scan prepared transactions to find out the range of XIDs present */
6056
	oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
6057

V
WAL  
Vadim B. Mikheev 已提交
6058
	if (InRecovery)
6059
	{
B
Bruce Momjian 已提交
6060
		int			rmid;
6061

6062 6063 6064 6065 6066 6067 6068
		/*
		 * Resource managers might need to write WAL records, eg, to record
		 * index cleanup actions.  So temporarily enable XLogInsertAllowed in
		 * this process only.
		 */
		LocalSetXLogInsertAllowed();

6069 6070 6071 6072 6073 6074 6075 6076 6077
		/*
		 * Allow resource managers to do any required cleanup.
		 */
		for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
		{
			if (RmgrTable[rmid].rm_cleanup != NULL)
				RmgrTable[rmid].rm_cleanup();
		}

6078 6079 6080
		/* Disallow XLogInsert again */
		LocalXLogInsertAllowed = -1;

6081 6082 6083 6084 6085 6086
		/*
		 * Check to see if the XLOG sequence contained any unresolved
		 * references to uninitialized pages.
		 */
		XLogCheckInvalidPages();

T
Tom Lane 已提交
6087
		/*
6088
		 * Perform a checkpoint to update all our recovery activity to disk.
6089
		 *
6090 6091 6092 6093 6094
		 * Note that we write a shutdown checkpoint rather than an on-line
		 * one. This is not particularly critical, but since we may be
		 * assigning a new TLI, using a shutdown checkpoint allows us to have
		 * the rule that TLI only changes in shutdown checkpoints, which
		 * allows some extra error checking in xlog_redo.
T
Tom Lane 已提交
6095
		 */
6096 6097 6098 6099 6100 6101
		if (bgwriterLaunched)
			RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
							  CHECKPOINT_IMMEDIATE |
							  CHECKPOINT_WAIT);
		else
			CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
6102

T
Tom Lane 已提交
6103 6104 6105
		/*
		 * And finally, execute the recovery_end_command, if any.
		 */
6106 6107
		if (recoveryEndCommand)
			ExecuteRecoveryEndCommand();
6108
	}
6109

T
Tom Lane 已提交
6110 6111 6112
	/*
	 * Preallocate additional log files, if wanted.
	 */
6113
	PreallocXlogFiles(EndOfLog);
6114

6115 6116 6117
	/*
	 * Okay, we're officially UP.
	 */
V
WAL  
Vadim B. Mikheev 已提交
6118
	InRecovery = false;
6119

6120
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6121
	ControlFile->state = DB_IN_PRODUCTION;
6122
	ControlFile->time = (pg_time_t) time(NULL);
6123
	UpdateControlFile();
6124
	LWLockRelease(ControlFileLock);
6125

6126
	/* start the archive_timeout timer running */
6127
	XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
6128

6129 6130 6131 6132
	/* initialize shared-memory copy of latest checkpoint XID/epoch */
	XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
	XLogCtl->ckptXid = ControlFile->checkPointCopy.nextXid;

6133 6134 6135 6136
	/* also initialize latestCompletedXid, to nextXid - 1 */
	ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
	TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);

6137 6138 6139 6140 6141 6142 6143 6144 6145 6146
	/*
	 * Start up the commit log and related stuff, too. In hot standby mode
	 * we did this already before WAL replay.
	 */
	if (standbyState == STANDBY_DISABLED)
	{
		StartupCLOG();
		StartupSUBTRANS(oldestActiveXID);
		StartupMultiXact();
	}
6147

6148 6149 6150
	/* Reload shared-memory state for prepared transactions */
	RecoverPreparedTransactions();

6151 6152 6153 6154 6155 6156 6157
	/*
	 * Shutdown the recovery environment. This must occur after
	 * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
	 */
	if (standbyState != STANDBY_DISABLED)
		ShutdownRecoveryTransactionEnvironment();

T
Tom Lane 已提交
6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168
	/* Shut down readFile facility, free space */
	if (readFile >= 0)
	{
		close(readFile);
		readFile = -1;
	}
	if (readBuf)
	{
		free(readBuf);
		readBuf = NULL;
	}
6169 6170 6171 6172 6173 6174
	if (readRecordBuf)
	{
		free(readRecordBuf);
		readRecordBuf = NULL;
		readRecordBufSize = 0;
	}
6175 6176

	/*
6177 6178 6179 6180
	 * All done.  Allow backends to write WAL.  (Although the bool flag is
	 * probably atomic in itself, we use the info_lck here to ensure that
	 * there are no race conditions concerning visibility of other recent
	 * updates to shared memory.)
6181
	 */
6182 6183 6184 6185 6186 6187 6188 6189
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		xlogctl->SharedRecoveryInProgress = false;
		SpinLockRelease(&xlogctl->info_lck);
	}
6190 6191 6192 6193 6194
}

/*
 * Is the system still in recovery?
 *
6195 6196 6197
 * Unlike testing InRecovery, this works in any process that's connected to
 * shared memory.
 *
6198 6199 6200 6201 6202 6203 6204
 * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
 * variables the first time we see that recovery is finished.
 */
bool
RecoveryInProgress(void)
{
	/*
6205 6206 6207
	 * We check shared state each time only until we leave recovery mode.
	 * We can't re-enter recovery, so there's no need to keep checking after
	 * the shared variable has once been seen false.
6208 6209 6210 6211 6212 6213 6214 6215
	 */
	if (!LocalRecoveryInProgress)
		return false;
	else
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

6216 6217
		/* spinlock is essential on machines with weak memory ordering! */
		SpinLockAcquire(&xlogctl->info_lck);
6218
		LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
6219
		SpinLockRelease(&xlogctl->info_lck);
6220 6221

		/*
6222
		 * Initialize TimeLineID and RedoRecPtr when we discover that recovery
6223 6224 6225
		 * is finished. InitPostgres() relies upon this behaviour to ensure
		 * that InitXLOGAccess() is called at backend startup.  (If you change
		 * this, see also LocalSetXLogInsertAllowed.)
6226 6227 6228 6229 6230 6231
		 */
		if (!LocalRecoveryInProgress)
			InitXLOGAccess();

		return LocalRecoveryInProgress;
	}
T
Tom Lane 已提交
6232 6233
}

6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267
/*
 * May this process insert new WAL records?
 *
 * Normally the answer is simply !RecoveryInProgress().  However,
 * LocalXLogInsertAllowed can force the answer to "true" or "false" for this
 * process alone, regardless of the global state.
 */
bool
XLogInsertAllowed(void)
{
	if (LocalXLogInsertAllowed < 0)
	{
		/* No forced answer, so it depends on whether recovery is ongoing */
		if (RecoveryInProgress())
			return false;

		/*
		 * Recovery has ended: switch to "unconditionally true" so that
		 * later calls need not check again.
		 */
		LocalXLogInsertAllowed = 1;
		return true;
	}

	/*
	 * The value is "unconditionally true" or "unconditionally false"; this
	 * is the normal fast path once recovery is known done.
	 */
	return (bool) LocalXLogInsertAllowed;
}

/*
 * Force XLogInsertAllowed() to return true, in the current process only.
 *
 * Note: callers may later set LocalXLogInsertAllowed back to -1, and may
 * even call this function again after doing so.
 */
static void
LocalSetXLogInsertAllowed(void)
{
	/* We expect to start from the "not forced" state */
	Assert(LocalXLogInsertAllowed == -1);
	LocalXLogInsertAllowed = 1;

	/* Do the same setup RecoveryInProgress() does when it switches state */
	InitXLOGAccess();
}

6282 6283
/*
 * Subroutine to try to fetch and validate a prior checkpoint record.
6284 6285 6286
 *
 * whichChkpt identifies the checkpoint (merely for reporting purposes).
 * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
6287
 */
T
Tom Lane 已提交
6288
static XLogRecord *
6289
ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt)
T
Tom Lane 已提交
6290 6291 6292 6293 6294
{
	XLogRecord *record;

	if (!XRecOffIsValid(RecPtr.xrecoff))
	{
6295 6296 6297 6298
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
B
Bruce Momjian 已提交
6299
				(errmsg("invalid primary checkpoint link in control file")));
6300 6301 6302 6303 6304 6305 6306
				break;
			case 2:
				ereport(LOG,
						(errmsg("invalid secondary checkpoint link in control file")));
				break;
			default:
				ereport(LOG,
B
Bruce Momjian 已提交
6307
				   (errmsg("invalid checkpoint link in backup_label file")));
6308 6309
				break;
		}
T
Tom Lane 已提交
6310 6311 6312
		return NULL;
	}

6313
	record = ReadRecord(&RecPtr, LOG, true);
T
Tom Lane 已提交
6314 6315 6316

	if (record == NULL)
	{
6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
						(errmsg("invalid primary checkpoint record")));
				break;
			case 2:
				ereport(LOG,
						(errmsg("invalid secondary checkpoint record")));
				break;
			default:
				ereport(LOG,
						(errmsg("invalid checkpoint record")));
				break;
		}
T
Tom Lane 已提交
6332 6333 6334 6335
		return NULL;
	}
	if (record->xl_rmid != RM_XLOG_ID)
	{
6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
						(errmsg("invalid resource manager ID in primary checkpoint record")));
				break;
			case 2:
				ereport(LOG,
						(errmsg("invalid resource manager ID in secondary checkpoint record")));
				break;
			default:
				ereport(LOG,
B
Bruce Momjian 已提交
6348
				(errmsg("invalid resource manager ID in checkpoint record")));
6349 6350
				break;
		}
T
Tom Lane 已提交
6351 6352 6353 6354 6355
		return NULL;
	}
	if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
		record->xl_info != XLOG_CHECKPOINT_ONLINE)
	{
6356 6357 6358 6359
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
B
Bruce Momjian 已提交
6360
				   (errmsg("invalid xl_info in primary checkpoint record")));
6361 6362 6363
				break;
			case 2:
				ereport(LOG,
B
Bruce Momjian 已提交
6364
				 (errmsg("invalid xl_info in secondary checkpoint record")));
6365 6366 6367 6368 6369 6370
				break;
			default:
				ereport(LOG,
						(errmsg("invalid xl_info in checkpoint record")));
				break;
		}
T
Tom Lane 已提交
6371 6372
		return NULL;
	}
6373 6374
	if (record->xl_len != sizeof(CheckPoint) ||
		record->xl_tot_len != SizeOfXLogRecord + sizeof(CheckPoint))
T
Tom Lane 已提交
6375
	{
6376 6377 6378 6379
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
B
Bruce Momjian 已提交
6380
					(errmsg("invalid length of primary checkpoint record")));
6381 6382 6383
				break;
			case 2:
				ereport(LOG,
B
Bruce Momjian 已提交
6384
				  (errmsg("invalid length of secondary checkpoint record")));
6385 6386 6387 6388 6389 6390
				break;
			default:
				ereport(LOG,
						(errmsg("invalid length of checkpoint record")));
				break;
		}
T
Tom Lane 已提交
6391 6392 6393
		return NULL;
	}
	return record;
6394 6395
}

V
WAL  
Vadim B. Mikheev 已提交
6396
/*
6397 6398
 * This must be called during startup of a backend process, except that
 * it need not be called in a standalone backend (which does StartupXLOG
6399
 * instead).  We need to initialize the local copies of ThisTimeLineID and
6400 6401
 * RedoRecPtr.
 *
6402
 * Note: before Postgres 8.0, we went to some effort to keep the postmaster
6403
 * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
6404
 * unnecessary however, since the postmaster itself never touches XLOG anyway.
V
WAL  
Vadim B. Mikheev 已提交
6405 6406
 */
void
6407
InitXLOGAccess(void)
V
WAL  
Vadim B. Mikheev 已提交
6408
{
6409 6410
	/* ThisTimeLineID doesn't change so we need no lock to copy it */
	ThisTimeLineID = XLogCtl->ThisTimeLineID;
6411
	Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
6412

6413 6414
	/* Use GetRedoRecPtr to copy the RedoRecPtr safely */
	(void) GetRedoRecPtr();
6415 6416 6417 6418 6419 6420 6421 6422
}

/*
 * Once spawned, a backend may update its local RedoRecPtr from
 * XLogCtl->Insert.RedoRecPtr; it must hold the insert lock or info_lck
 * to do so.  This is done in XLogInsert() or GetRedoRecPtr().
 */
XLogRecPtr
6423 6424
GetRedoRecPtr(void)
{
6425 6426 6427
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

6428
	SpinLockAcquire(&xlogctl->info_lck);
6429 6430
	Assert(XLByteLE(RedoRecPtr, xlogctl->Insert.RedoRecPtr));
	RedoRecPtr = xlogctl->Insert.RedoRecPtr;
6431
	SpinLockRelease(&xlogctl->info_lck);
6432 6433

	return RedoRecPtr;
V
WAL  
Vadim B. Mikheev 已提交
6434 6435
}

6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449
/*
 * GetInsertRecPtr -- Returns the current insert position.
 *
 * NOTE: The value *actually* returned is the position of the last full
 * xlog page. It lags behind the real insert position by at most 1 page.
 * For that, we don't need to acquire WALInsertLock which can be quite
 * heavily contended, and an approximation is enough for the current
 * usage of this function.
 */
XLogRecPtr
GetInsertRecPtr(void)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
B
Bruce Momjian 已提交
6450
	XLogRecPtr	recptr;
6451 6452 6453 6454 6455 6456 6457 6458

	SpinLockAcquire(&xlogctl->info_lck);
	recptr = xlogctl->LogwrtRqst.Write;
	SpinLockRelease(&xlogctl->info_lck);

	return recptr;
}

6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478
/*
 * GetWriteRecPtr -- Returns the current write position.
 *
 * NOTE: The value returned lags behind the real write position. But,
 * an approximation is enough for the current usage of this function.
 */
XLogRecPtr
GetWriteRecPtr(void)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
	XLogRecPtr	recptr;

	SpinLockAcquire(&xlogctl->info_lck);
	recptr = xlogctl->LogwrtResult.Write;
	SpinLockRelease(&xlogctl->info_lck);

	return recptr;
}

6479 6480 6481
/*
 * Get the time of the last xlog segment switch
 */
6482
pg_time_t
6483 6484
GetLastSegSwitchTime(void)
{
6485
	pg_time_t	result;
6486 6487 6488 6489 6490 6491 6492 6493 6494

	/* Need WALWriteLock, but shared lock is sufficient */
	LWLockAcquire(WALWriteLock, LW_SHARED);
	result = XLogCtl->Write.lastSegSwitchTime;
	LWLockRelease(WALWriteLock);

	return result;
}

6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505
/*
 * GetNextXidAndEpoch - fetch the current nextXid value and its epoch
 *
 * This is exported for use by code that would like to have 64-bit XIDs.
 * We don't really support such things, but all XIDs within the system
 * can be presumed "close to" the result, and thus the epoch associated
 * with them can be determined.
 *
 * The results are stored into *xid and *epoch.
 */
void
GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *ctl = XLogCtl;
	TransactionId savedCkptXid;
	uint32		savedCkptEpoch;
	TransactionId curNextXid;

	/* Must read checkpoint info first, else have race condition */
	SpinLockAcquire(&ctl->info_lck);
	savedCkptEpoch = ctl->ckptXidEpoch;
	savedCkptXid = ctl->ckptXid;
	SpinLockRelease(&ctl->info_lck);

	/* Only afterwards fetch the current nextXid */
	curNextXid = ReadNewTransactionId();

	/*
	 * nextXid is certainly logically later than the checkpoint's XID.  So
	 * if it's numerically less, it must have wrapped into the next epoch.
	 */
	*xid = curNextXid;
	*epoch = (curNextXid < savedCkptXid) ? savedCkptEpoch + 1 : savedCkptEpoch;
}

6535 6536 6537 6538 6539 6540 6541 6542 6543 6544
/*
 * GetRecoveryTargetTLI - return the recovery target timeline ID
 */
TimeLineID
GetRecoveryTargetTLI(void)
{
	TimeLineID	targetTLI;

	/* RecoveryTargetTLI doesn't change so we need no lock to copy it */
	targetTLI = XLogCtl->RecoveryTargetTLI;

	return targetTLI;
}

6545
/*
T
Tom Lane 已提交
6546
 * This must be called ONCE during postmaster or standalone-backend shutdown
6547 6548
 */
void
6549
ShutdownXLOG(int code, Datum arg)
6550
{
6551 6552
	ereport(LOG,
			(errmsg("shutting down")));
6553

6554 6555 6556
	if (RecoveryInProgress())
		CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
	else
6557 6558 6559 6560 6561 6562 6563 6564 6565 6566
	{
		/*
		 * If archiving is enabled, rotate the last XLOG file so that all the
		 * remaining records are archived (postmaster wakes up the archiver
		 * process one more time at the end of shutdown). The checkpoint
		 * record will go to the next XLOG file and won't be archived (yet).
		 */
		if (XLogArchivingActive() && XLogArchiveCommandSet())
			RequestXLogSwitch();

6567
		CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
6568
	}
6569
	ShutdownCLOG();
6570
	ShutdownSUBTRANS();
6571
	ShutdownMultiXact();
6572

6573 6574
	ereport(LOG,
			(errmsg("database system is shut down")));
6575 6576
}

6577
/*
6578 6579 6580
 * Log start of a checkpoint.
 */
static void
6581
LogCheckpointStart(int flags, bool restartpoint)
6582
{
6583
	const char *msg;
6584 6585

	/*
6586 6587
	 * XXX: This is hopelessly untranslatable. We could call gettext_noop for
	 * the main message, but what about all the flags?
6588 6589
	 */
	if (restartpoint)
6590
		msg = "restartpoint starting:%s%s%s%s%s%s%s";
6591
	else
6592
		msg = "checkpoint starting:%s%s%s%s%s%s%s";
6593 6594

	elog(LOG, msg,
6595
		 (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
6596
		 (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
6597 6598 6599 6600 6601 6602 6603
		 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
		 (flags & CHECKPOINT_FORCE) ? " force" : "",
		 (flags & CHECKPOINT_WAIT) ? " wait" : "",
		 (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
		 (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
}

6604
/*
6605 6606 6607
 * Log end of a checkpoint.
 */
static void
6608
LogCheckpointEnd(bool restartpoint)
6609
{
B
Bruce Momjian 已提交
6610 6611 6612 6613 6614 6615
	long		write_secs,
				sync_secs,
				total_secs;
	int			write_usecs,
				sync_usecs,
				total_usecs;
6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630

	CheckpointStats.ckpt_end_t = GetCurrentTimestamp();

	TimestampDifference(CheckpointStats.ckpt_start_t,
						CheckpointStats.ckpt_end_t,
						&total_secs, &total_usecs);

	TimestampDifference(CheckpointStats.ckpt_write_t,
						CheckpointStats.ckpt_sync_t,
						&write_secs, &write_usecs);

	TimestampDifference(CheckpointStats.ckpt_sync_t,
						CheckpointStats.ckpt_sync_end_t,
						&sync_secs, &sync_usecs);

6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650
	if (restartpoint)
		elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
			 CheckpointStats.ckpt_bufs_written,
			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
			 write_secs, write_usecs / 1000,
			 sync_secs, sync_usecs / 1000,
			 total_secs, total_usecs / 1000);
	else
		elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
			 "%d transaction log file(s) added, %d removed, %d recycled; "
			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
			 CheckpointStats.ckpt_bufs_written,
			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
			 CheckpointStats.ckpt_segs_added,
			 CheckpointStats.ckpt_segs_removed,
			 CheckpointStats.ckpt_segs_recycled,
			 write_secs, write_usecs / 1000,
			 sync_secs, sync_usecs / 1000,
			 total_secs, total_usecs / 1000);
6651 6652
}

T
Tom Lane 已提交
6653 6654
/*
 * Perform a checkpoint --- either during shutdown, or on-the-fly
6655
 *
6656 6657
 * flags is a bitwise OR of the following:
 *	CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
6658
 *	CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
6659
 *	CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
6660
 *		ignoring checkpoint_completion_target parameter.
6661
 *	CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occured
6662
 *		since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
6663
 *		CHECKPOINT_END_OF_RECOVERY).
6664
 *
6665
 * Note: flags contains other bits, of interest here only for logging purposes.
6666 6667
 * In particular note that this routine is synchronous and does not pay
 * attention to CHECKPOINT_WAIT.
T
Tom Lane 已提交
6668
 */
6669
void
6670
CreateCheckPoint(int flags)
6671
{
6672
	bool		shutdown;
6673 6674 6675
	CheckPoint	checkPoint;
	XLogRecPtr	recptr;
	XLogCtlInsert *Insert = &XLogCtl->Insert;
B
Bruce Momjian 已提交
6676
	XLogRecData rdata;
6677
	uint32		freespace;
V
Vadim B. Mikheev 已提交
6678 6679
	uint32		_logId;
	uint32		_logSeg;
6680 6681
	TransactionId *inCommitXids;
	int			nInCommit;
V
Vadim B. Mikheev 已提交
6682

6683 6684 6685 6686
	/*
	 * An end-of-recovery checkpoint is really a shutdown checkpoint, just
	 * issued at a different time.
	 */
6687
	if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
6688 6689 6690
		shutdown = true;
	else
		shutdown = false;
6691

6692 6693 6694
	/* sanity check */
	if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
		elog(ERROR, "can't create a checkpoint during recovery");
6695

6696 6697 6698 6699 6700 6701 6702 6703
	/*
	 * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
	 * (This is just pro forma, since in the present system structure there is
	 * only one process that is allowed to issue checkpoints at any given
	 * time.)
	 */
	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);

6704 6705 6706 6707 6708 6709 6710 6711 6712 6713
	/*
	 * Prepare to accumulate statistics.
	 *
	 * Note: because it is possible for log_checkpoints to change while a
	 * checkpoint proceeds, we always accumulate stats, even if
	 * log_checkpoints is currently off.
	 */
	MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
	CheckpointStats.ckpt_start_t = GetCurrentTimestamp();

6714 6715 6716
	/*
	 * Use a critical section to force system panic if we have trouble.
	 */
6717 6718
	START_CRIT_SECTION();

6719 6720
	if (shutdown)
	{
6721
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6722
		ControlFile->state = DB_SHUTDOWNING;
6723
		ControlFile->time = (pg_time_t) time(NULL);
6724
		UpdateControlFile();
6725
		LWLockRelease(ControlFileLock);
6726
	}
T
Tom Lane 已提交
6727

6728
	/*
B
Bruce Momjian 已提交
6729 6730 6731
	 * Let smgr prepare for checkpoint; this has to happen before we determine
	 * the REDO pointer.  Note that smgr must not do anything that'd have to
	 * be undone if we decide no checkpoint is needed.
6732 6733 6734 6735
	 */
	smgrpreckpt();

	/* Begin filling in the checkpoint WAL record */
6736
	MemSet(&checkPoint, 0, sizeof(checkPoint));
6737
	checkPoint.time = (pg_time_t) time(NULL);
6738

6739 6740 6741 6742 6743 6744
	/* Set important parameter values for use when replaying WAL */
	checkPoint.MaxConnections = MaxConnections;
	checkPoint.max_prepared_xacts = max_prepared_xacts;
	checkPoint.max_locks_per_xact = max_locks_per_xact;
	checkPoint.XLogStandbyInfoMode = XLogStandbyInfoActive();

6745
	/*
6746 6747
	 * We must hold WALInsertLock while examining insert state to determine
	 * the checkpoint REDO pointer.
6748
	 */
6749
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
T
Tom Lane 已提交
6750 6751

	/*
B
Bruce Momjian 已提交
6752 6753 6754 6755 6756 6757 6758 6759
	 * If this isn't a shutdown or forced checkpoint, and we have not inserted
	 * any XLOG records since the start of the last checkpoint, skip the
	 * checkpoint.	The idea here is to avoid inserting duplicate checkpoints
	 * when the system is idle. That wastes log space, and more importantly it
	 * exposes us to possible loss of both current and previous checkpoint
	 * records if the machine crashes just as we're writing the update.
	 * (Perhaps it'd make even more sense to checkpoint only when the previous
	 * checkpoint record is in a different xlog page?)
T
Tom Lane 已提交
6760
	 *
6761 6762 6763 6764
	 * We have to make two tests to determine that nothing has happened since
	 * the start of the last checkpoint: current insertion point must match
	 * the end of the last checkpoint record, and its redo pointer must point
	 * to itself.
T
Tom Lane 已提交
6765
	 */
6766 6767
	if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
				  CHECKPOINT_FORCE)) == 0)
T
Tom Lane 已提交
6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779
	{
		XLogRecPtr	curInsert;

		INSERT_RECPTR(curInsert, Insert, Insert->curridx);
		if (curInsert.xlogid == ControlFile->checkPoint.xlogid &&
			curInsert.xrecoff == ControlFile->checkPoint.xrecoff +
			MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
			ControlFile->checkPoint.xlogid ==
			ControlFile->checkPointCopy.redo.xlogid &&
			ControlFile->checkPoint.xrecoff ==
			ControlFile->checkPointCopy.redo.xrecoff)
		{
6780 6781
			LWLockRelease(WALInsertLock);
			LWLockRelease(CheckpointLock);
T
Tom Lane 已提交
6782 6783 6784 6785 6786
			END_CRIT_SECTION();
			return;
		}
	}

6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797
	/*
	 * An end-of-recovery checkpoint is created before anyone is allowed to
	 * write WAL. To allow us to write the checkpoint record, temporarily
	 * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
	 * initialized, which we need here and in AdvanceXLInsertBuffer.)
	 */
	if (flags & CHECKPOINT_END_OF_RECOVERY)
		LocalSetXLogInsertAllowed();

	checkPoint.ThisTimeLineID = ThisTimeLineID;

T
Tom Lane 已提交
6798 6799 6800
	/*
	 * Compute new REDO record ptr = location of next XLOG record.
	 *
B
Bruce Momjian 已提交
6801 6802 6803 6804
	 * NB: this is NOT necessarily where the checkpoint record itself will be,
	 * since other backends may insert more XLOG records while we're off doing
	 * the buffer flush work.  Those XLOG records are logically after the
	 * checkpoint, even though physically before it.  Got that?
T
Tom Lane 已提交
6805 6806
	 */
	freespace = INSERT_FREESPACE(Insert);
6807 6808
	if (freespace < SizeOfXLogRecord)
	{
6809
		(void) AdvanceXLInsertBuffer(false);
T
Tom Lane 已提交
6810
		/* OK to ignore update return flag, since we will do flush anyway */
6811
		freespace = INSERT_FREESPACE(Insert);
6812
	}
T
Tom Lane 已提交
6813
	INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);
B
Bruce Momjian 已提交
6814

T
Tom Lane 已提交
6815
	/*
B
Bruce Momjian 已提交
6816 6817
	 * Here we update the shared RedoRecPtr for future XLogInsert calls; this
	 * must be done while holding the insert lock AND the info_lck.
6818
	 *
B
Bruce Momjian 已提交
6819
	 * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
B
Bruce Momjian 已提交
6820 6821 6822 6823 6824
	 * pointing past where it really needs to point.  This is okay; the only
	 * consequence is that XLogInsert might back up whole buffers that it
	 * didn't really need to.  We can't postpone advancing RedoRecPtr because
	 * XLogInserts that happen while we are dumping buffers must assume that
	 * their buffer changes are not included in the checkpoint.
T
Tom Lane 已提交
6825
	 */
6826 6827 6828 6829
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

6830
		SpinLockAcquire(&xlogctl->info_lck);
6831
		RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
6832
		SpinLockRelease(&xlogctl->info_lck);
6833
	}
B
Bruce Momjian 已提交
6834

T
Tom Lane 已提交
6835
	/*
6836 6837
	 * Now we can release WAL insert lock, allowing other xacts to proceed
	 * while we are flushing disk buffers.
T
Tom Lane 已提交
6838
	 */
6839
	LWLockRelease(WALInsertLock);
6840

6841
	/*
B
Bruce Momjian 已提交
6842 6843
	 * If enabled, log checkpoint start.  We postpone this until now so as not
	 * to log anything if we decided to skip the checkpoint.
6844 6845
	 */
	if (log_checkpoints)
6846
		LogCheckpointStart(flags, false);
6847

6848 6849
	TRACE_POSTGRESQL_CHECKPOINT_START(flags);

6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864
	/*
	 * Before flushing data, we must wait for any transactions that are
	 * currently in their commit critical sections.  If an xact inserted its
	 * commit record into XLOG just before the REDO point, then a crash
	 * restart from the REDO point would not replay that record, which means
	 * that our flushing had better include the xact's update of pg_clog.  So
	 * we wait till he's out of his commit critical section before proceeding.
	 * See notes in RecordTransactionCommit().
	 *
	 * Because we've already released WALInsertLock, this test is a bit fuzzy:
	 * it is possible that we will wait for xacts we didn't really need to
	 * wait for.  But the delay should be short and it seems better to make
	 * checkpoint take a bit longer than to hold locks longer than necessary.
	 * (In fact, the whole reason we have this issue is that xact.c does
	 * commit record XLOG insertion and clog update as two separate steps
B
Bruce Momjian 已提交
6865 6866
	 * protected by different locks, but again that seems best on grounds of
	 * minimizing lock contention.)
6867
	 *
B
Bruce Momjian 已提交
6868 6869
	 * A transaction that has not yet set inCommit when we look cannot be at
	 * risk, since he's not inserted his commit record yet; and one that's
6870 6871 6872 6873 6874 6875 6876
	 * already cleared it is not at risk either, since he's done fixing clog
	 * and we will correctly flush the update below.  So we cannot miss any
	 * xacts we need to wait for.
	 */
	nInCommit = GetTransactionsInCommit(&inCommitXids);
	if (nInCommit > 0)
	{
B
Bruce Momjian 已提交
6877 6878 6879
		do
		{
			pg_usleep(10000L);	/* wait for 10 msec */
6880 6881 6882 6883
		} while (HaveTransactionsInCommit(inCommitXids, nInCommit));
	}
	pfree(inCommitXids);

6884 6885 6886
	/*
	 * Get the other info we need for the checkpoint record.
	 */
6887
	LWLockAcquire(XidGenLock, LW_SHARED);
6888
	checkPoint.nextXid = ShmemVariableCache->nextXid;
6889 6890
	checkPoint.oldestXid = ShmemVariableCache->oldestXid;
	checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
6891
	LWLockRelease(XidGenLock);
T
Tom Lane 已提交
6892

6893 6894 6895 6896 6897
	/* Increase XID epoch if we've wrapped around since last checkpoint */
	checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
	if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
		checkPoint.nextXidEpoch++;

6898
	LWLockAcquire(OidGenLock, LW_SHARED);
6899
	checkPoint.nextOid = ShmemVariableCache->nextOid;
6900 6901
	if (!shutdown)
		checkPoint.nextOid += ShmemVariableCache->oidCount;
6902
	LWLockRelease(OidGenLock);
6903

6904 6905 6906
	MultiXactGetCheckptMulti(shutdown,
							 &checkPoint.nextMulti,
							 &checkPoint.nextMultiOffset);
6907

T
Tom Lane 已提交
6908
	/*
B
Bruce Momjian 已提交
6909 6910
	 * Having constructed the checkpoint record, ensure all shmem disk buffers
	 * and commit-log buffers are flushed to disk.
6911
	 *
6912 6913
	 * This I/O could fail for various reasons.  If so, we will fail to
	 * complete the checkpoint, but there is no reason to force a system
6914
	 * panic. Accordingly, exit critical section while doing it.
T
Tom Lane 已提交
6915
	 */
6916 6917
	END_CRIT_SECTION();

6918
	CheckPointGuts(checkPoint.redo, flags);
6919

6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934
	/*
	 * Take a snapshot of running transactions and write this to WAL.
	 * This allows us to reconstruct the state of running transactions
	 * during archive recovery, if required. Skip, if this info disabled.
	 *
	 * If we are shutting down, or Startup process is completing crash
	 * recovery we don't need to write running xact data.
	 *
	 * Update checkPoint.nextXid since we have a later value
	 */
	if (!shutdown && XLogStandbyInfoActive())
		 LogStandbySnapshot(&checkPoint.oldestActiveXid, &checkPoint.nextXid);
	else
		checkPoint.oldestActiveXid = InvalidTransactionId;

6935 6936
	START_CRIT_SECTION();

T
Tom Lane 已提交
6937 6938 6939
	/*
	 * Now insert the checkpoint record into XLOG.
	 */
B
Bruce Momjian 已提交
6940
	rdata.data = (char *) (&checkPoint);
6941
	rdata.len = sizeof(checkPoint);
6942
	rdata.buffer = InvalidBuffer;
6943 6944
	rdata.next = NULL;

T
Tom Lane 已提交
6945 6946 6947 6948 6949 6950
	recptr = XLogInsert(RM_XLOG_ID,
						shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
						XLOG_CHECKPOINT_ONLINE,
						&rdata);

	XLogFlush(recptr);
6951

6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966
	/*
	 * We mustn't write any new WAL after a shutdown checkpoint, or it will
	 * be overwritten at next startup.  No-one should even try, this just
	 * allows sanity-checking.  In the case of an end-of-recovery checkpoint,
	 * we want to just temporarily disable writing until the system has exited
	 * recovery.
	 */
	if (shutdown)
	{
		if (flags & CHECKPOINT_END_OF_RECOVERY)
			LocalXLogInsertAllowed = -1;	/* return to "check" state */
		else
			LocalXLogInsertAllowed = 0;		/* never again write WAL */
	}

T
Tom Lane 已提交
6967
	/*
B
Bruce Momjian 已提交
6968 6969
	 * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
	 * = end of actual checkpoint record.
T
Tom Lane 已提交
6970 6971
	 */
	if (shutdown && !XLByteEQ(checkPoint.redo, ProcLastRecPtr))
6972 6973
		ereport(PANIC,
				(errmsg("concurrent transaction log activity while database system is shutting down")));
6974

T
Tom Lane 已提交
6975
	/*
6976 6977
	 * Select point at which we can truncate the log, which we base on the
	 * prior checkpoint's earliest info.
T
Tom Lane 已提交
6978
	 */
6979
	XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
6980

T
Tom Lane 已提交
6981 6982 6983
	/*
	 * Update the control file.
	 */
6984
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6985 6986
	if (shutdown)
		ControlFile->state = DB_SHUTDOWNED;
T
Tom Lane 已提交
6987 6988 6989
	ControlFile->prevCheckPoint = ControlFile->checkPoint;
	ControlFile->checkPoint = ProcLastRecPtr;
	ControlFile->checkPointCopy = checkPoint;
6990
	ControlFile->time = (pg_time_t) time(NULL);
6991 6992
	/* crash recovery should always recover to the end of WAL */
	MemSet(&ControlFile->minRecoveryPoint, 0, sizeof(XLogRecPtr));
6993
	UpdateControlFile();
6994
	LWLockRelease(ControlFileLock);
6995

6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006
	/* Update shared-memory copy of checkpoint XID/epoch */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
		xlogctl->ckptXid = checkPoint.nextXid;
		SpinLockRelease(&xlogctl->info_lck);
	}

7007
	/*
B
Bruce Momjian 已提交
7008
	 * We are now done with critical updates; no need for system panic if we
7009
	 * have trouble while fooling with old log segments.
7010 7011 7012
	 */
	END_CRIT_SECTION();

7013 7014 7015 7016 7017
	/*
	 * Let smgr do post-checkpoint cleanup (eg, deleting old files).
	 */
	smgrpostckpt();

V
Vadim B. Mikheev 已提交
7018
	/*
7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046
	 * If there's connected standby servers doing XLOG streaming, don't
	 * delete XLOG files that have not been streamed to all of them yet.
	 * This does nothing to prevent them from being deleted when the
	 * standby is disconnected (e.g because of network problems), but at
	 * least it avoids an open replication connection from failing because
	 * of that.
	 */
	if ((_logId || _logSeg) && MaxWalSenders > 0)
	{
		XLogRecPtr oldest;
		uint32	log;
		uint32	seg;

		oldest = GetOldestWALSendPointer();
		if (oldest.xlogid != 0 || oldest.xrecoff != 0)
		{
			XLByteToSeg(oldest, log, seg);
			if (log < _logId || (log == _logId && seg < _logSeg))
			{
				_logId	= log;
				_logSeg	= seg;
			}
		}
	}

	/*
	 * Delete old log files (those no longer needed even for
	 * previous checkpoint or the standbys in XLOG streaming).
V
Vadim B. Mikheev 已提交
7047 7048 7049
	 */
	if (_logId || _logSeg)
	{
T
Tom Lane 已提交
7050
		PrevLogSeg(_logId, _logSeg);
7051
		RemoveOldXlogFiles(_logId, _logSeg, recptr);
V
Vadim B. Mikheev 已提交
7052 7053
	}

T
Tom Lane 已提交
7054
	/*
7055 7056
	 * Make more log segments if needed.  (Do this after recycling old log
	 * segments, since that may supply some of the needed files.)
T
Tom Lane 已提交
7057 7058
	 */
	if (!shutdown)
7059
		PreallocXlogFiles(recptr);
T
Tom Lane 已提交
7060

7061
	/*
B
Bruce Momjian 已提交
7062 7063 7064 7065 7066
	 * Truncate pg_subtrans if possible.  We can throw away all data before
	 * the oldest XMIN of any running transaction.	No future transaction will
	 * attempt to reference any pg_subtrans entry older than that (see Asserts
	 * in subtrans.c).	During recovery, though, we mustn't do this because
	 * StartupSUBTRANS hasn't been called yet.
7067
	 */
7068
	if (!RecoveryInProgress())
7069
		TruncateSUBTRANS(GetOldestXmin(true, false));
7070

7071 7072
	/* All real work is done, but log before releasing lock. */
	if (log_checkpoints)
7073
		LogCheckpointEnd(false);
7074

7075 7076 7077 7078 7079
	TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
									 NBuffers,
									 CheckpointStats.ckpt_segs_added,
									 CheckpointStats.ckpt_segs_removed,
									 CheckpointStats.ckpt_segs_recycled);
7080

7081
	LWLockRelease(CheckpointLock);
7082
}
V
WAL  
Vadim B. Mikheev 已提交
7083

7084 7085 7086 7087 7088 7089 7090
/*
 * Flush all data in shared memory to disk, and fsync
 *
 * This is the common code shared between regular checkpoints and
 * recovery restartpoints.
 */
static void
7091
CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
7092 7093 7094 7095
{
	CheckPointCLOG();
	CheckPointSUBTRANS();
	CheckPointMultiXact();
7096
	CheckPointRelationMap();
B
Bruce Momjian 已提交
7097
	CheckPointBuffers(flags);	/* performs all required fsyncs */
7098 7099 7100 7101 7102
	/* We deliberately delay 2PC checkpointing as long as possible */
	CheckPointTwoPhase(checkPointRedo);
}

/*
7103 7104 7105 7106 7107 7108 7109 7110
 * Save a checkpoint for recovery restart if appropriate
 *
 * This function is called each time a checkpoint record is read from XLOG.
 * It must determine whether the checkpoint represents a safe restartpoint or
 * not.  If so, the checkpoint record is stashed in shared memory so that
 * CreateRestartPoint can consult it.  (Note that the latter function is
 * executed by the bgwriter, while this one will be executed by the startup
 * process.)
7111 7112 7113 7114
 */
static void
RecoveryRestartPoint(const CheckPoint *checkPoint)
{
B
Bruce Momjian 已提交
7115
	int			rmid;
7116

7117 7118
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129

	/*
	 * Is it safe to checkpoint?  We must ask each of the resource managers
	 * whether they have any partial state information that might prevent a
	 * correct restart from this point.  If so, we skip this opportunity, but
	 * return at the next checkpoint record for another try.
	 */
	for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
	{
		if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
			if (!(RmgrTable[rmid].rm_safe_restartpoint()))
7130
			{
7131
				elog(trace_recovery(DEBUG2), "RM %d not safe to record restart point at %X/%X",
7132 7133 7134
					 rmid,
					 checkPoint->redo.xlogid,
					 checkPoint->redo.xrecoff);
7135
				return;
7136
			}
7137 7138 7139
	}

	/*
7140 7141
	 * Copy the checkpoint record to shared memory, so that bgwriter can use
	 * it the next time it wants to perform a restartpoint.
7142 7143 7144 7145 7146 7147 7148 7149
	 */
	SpinLockAcquire(&xlogctl->info_lck);
	XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
	memcpy(&XLogCtl->lastCheckPoint, checkPoint, sizeof(CheckPoint));
	SpinLockRelease(&xlogctl->info_lck);
}

/*
7150 7151
 * Establish a restartpoint if possible.
 *
7152 7153 7154 7155 7156
 * This is similar to CreateCheckPoint, but is used during WAL recovery
 * to establish a point from which recovery can roll forward without
 * replaying the entire recovery log.
 *
 * Returns true if a new restartpoint was established. We can only establish
7157
 * a restartpoint if we have replayed a safe checkpoint record since last
7158 7159 7160 7161 7162
 * restartpoint.
 */
bool
CreateRestartPoint(int flags)
{
7163 7164
	XLogRecPtr	lastCheckPointRecPtr;
	CheckPoint	lastCheckPoint;
7165 7166
	uint32		_logId;
	uint32		_logSeg;
7167

7168 7169 7170 7171 7172 7173 7174 7175 7176
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

	/*
	 * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
	 * happens at a time.
	 */
	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);

7177
	/* Get a local copy of the last safe checkpoint record. */
7178 7179 7180 7181 7182
	SpinLockAcquire(&xlogctl->info_lck);
	lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
	memcpy(&lastCheckPoint, &XLogCtl->lastCheckPoint, sizeof(CheckPoint));
	SpinLockRelease(&xlogctl->info_lck);

7183
	/*
7184 7185 7186 7187 7188 7189
	 * Check that we're still in recovery mode. It's ok if we exit recovery
	 * mode after this check, the restart point is valid anyway.
	 */
	if (!RecoveryInProgress())
	{
		ereport(DEBUG2,
7190
			  (errmsg("skipping restartpoint, recovery has already ended")));
7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205
		LWLockRelease(CheckpointLock);
		return false;
	}

	/*
	 * If the last checkpoint record we've replayed is already our last
	 * restartpoint, we can't perform a new restart point. We still update
	 * minRecoveryPoint in that case, so that if this is a shutdown restart
	 * point, we won't start up earlier than before. That's not strictly
	 * necessary, but when we get hot standby capability, it would be rather
	 * weird if the database opened up for read-only connections at a
	 * point-in-time before the last shutdown. Such time travel is still
	 * possible in case of immediate shutdown, though.
	 *
	 * We don't explicitly advance minRecoveryPoint when we do create a
7206 7207
	 * restartpoint. It's assumed that flushing the buffers will do that as a
	 * side-effect.
7208
	 */
7209 7210 7211
	if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
		XLByteLE(lastCheckPoint.redo, ControlFile->checkPointCopy.redo))
	{
7212 7213
		XLogRecPtr	InvalidXLogRecPtr = {0, 0};

7214 7215
		ereport(DEBUG2,
				(errmsg("skipping restartpoint, already performed at %X/%X",
7216
				  lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));
7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234

		UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
		LWLockRelease(CheckpointLock);
		return false;
	}

	if (log_checkpoints)
	{
		/*
		 * Prepare to accumulate statistics.
		 */
		MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
		CheckpointStats.ckpt_start_t = GetCurrentTimestamp();

		LogCheckpointStart(flags, true);
	}

	CheckPointGuts(lastCheckPoint.redo, flags);
7235

7236 7237 7238 7239 7240 7241
	/*
	 * Select point at which we can truncate the xlog, which we base on the
	 * prior checkpoint's earliest info.
	 */
	XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);

7242
	/*
7243 7244 7245 7246
	 * Update pg_control, using current time.  Check that it still shows
	 * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
	 * this is a quick hack to make sure nothing really bad happens if
	 * somehow we get here after the end-of-recovery checkpoint.
7247
	 */
7248
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7249 7250 7251 7252 7253 7254 7255 7256 7257
	if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
		XLByteLT(ControlFile->checkPointCopy.redo, lastCheckPoint.redo))
	{
		ControlFile->prevCheckPoint = ControlFile->checkPoint;
		ControlFile->checkPoint = lastCheckPointRecPtr;
		ControlFile->checkPointCopy = lastCheckPoint;
		ControlFile->time = (pg_time_t) time(NULL);
		UpdateControlFile();
	}
7258
	LWLockRelease(ControlFileLock);
7259

7260 7261 7262 7263 7264 7265 7266
	/*
	 * Delete old log files (those no longer needed even for previous
	 * checkpoint/restartpoint) to prevent the disk holding the xlog from
	 * growing full. We don't need do this during normal recovery, but during
	 * streaming recovery we have to or the disk will eventually fill up from
	 * old log files streamed from master.
	 */
7267
	if (WalRcvInProgress() && (_logId || _logSeg))
7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283
	{
		XLogRecPtr	endptr;

		/* Get the current (or recent) end of xlog */
		endptr = GetWalRcvWriteRecPtr();

		PrevLogSeg(_logId, _logSeg);
		RemoveOldXlogFiles(_logId, _logSeg, endptr);

		/*
		 * Make more log segments if needed.  (Do this after recycling old log
		 * segments, since that may supply some of the needed files.)
		 */
		PreallocXlogFiles(endptr);
	}

7284
	/*
7285 7286 7287
	 * Currently, there is no need to truncate pg_subtrans during recovery. If
	 * we did do that, we will need to have called StartupSUBTRANS() already
	 * and then TruncateSUBTRANS() would go here.
7288 7289 7290 7291 7292 7293 7294
	 */

	/* All real work is done, but log before releasing lock. */
	if (log_checkpoints)
		LogCheckpointEnd(true);

	ereport((log_checkpoints ? LOG : DEBUG2),
7295 7296 7297
			(errmsg("recovery restart point at %X/%X with latest known log time %s",
					lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff,
					timestamptz_to_str(GetLatestXLogTime()))));
7298 7299 7300

	LWLockRelease(CheckpointLock);
	return true;
7301 7302
}

T
Tom Lane 已提交
7303 7304 7305
/*
 * Write a NEXTOID log record
 */
7306 7307 7308
void
XLogPutNextOid(Oid nextOid)
{
B
Bruce Momjian 已提交
7309
	XLogRecData rdata;
7310

B
Bruce Momjian 已提交
7311
	rdata.data = (char *) (&nextOid);
7312
	rdata.len = sizeof(Oid);
7313
	rdata.buffer = InvalidBuffer;
7314 7315
	rdata.next = NULL;
	(void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
B
Bruce Momjian 已提交
7316

7317 7318
	/*
	 * We need not flush the NEXTOID record immediately, because any of the
B
Bruce Momjian 已提交
7319 7320 7321 7322 7323
	 * just-allocated OIDs could only reach disk as part of a tuple insert or
	 * update that would have its own XLOG record that must follow the NEXTOID
	 * record.	Therefore, the standard buffer LSN interlock applied to those
	 * records will ensure no such OID reaches disk before the NEXTOID record
	 * does.
7324 7325
	 *
	 * Note, however, that the above statement only covers state "within" the
B
Bruce Momjian 已提交
7326 7327
	 * database.  When we use a generated OID as a file or directory name, we
	 * are in a sense violating the basic WAL rule, because that filesystem
7328
	 * change may reach disk before the NEXTOID WAL record does.  The impact
B
Bruce Momjian 已提交
7329 7330 7331 7332 7333
	 * of this is that if a database crash occurs immediately afterward, we
	 * might after restart re-generate the same OID and find that it conflicts
	 * with the leftover file or directory.  But since for safety's sake we
	 * always loop until finding a nonconflicting filename, this poses no real
	 * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
7334 7335 7336
	 */
}

7337 7338 7339 7340 7341 7342 7343 7344 7345 7346
/*
 * Write an XLOG SWITCH record.
 *
 * Here we just blindly issue an XLogInsert request for the record.
 * All the magic happens inside XLogInsert.
 *
 * The return value is either the end+1 address of the switch record,
 * or the end+1 address of the prior segment if we did not need to
 * write a switch record because we are already at segment start.
 */
7347
XLogRecPtr
7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363
RequestXLogSwitch(void)
{
	XLogRecPtr	RecPtr;
	XLogRecData rdata;

	/* XLOG SWITCH, alone among xlog record types, has no data */
	rdata.buffer = InvalidBuffer;
	rdata.data = NULL;
	rdata.len = 0;
	rdata.next = NULL;

	RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH, &rdata);

	return RecPtr;
}

7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388
/*
 * XLogReportUnloggedStatement
 *
 * Write an XLOG UNLOGGED record, indicating that some operation was
 * performed on data that we fsync()'d directly to disk, skipping
 * WAL-logging.
 *
 * Such operations screw up archive recovery, so we complain if we see
 * these records during archive recovery. That shouldn't happen in a
 * correctly configured server, but you can induce it by temporarily
 * disabling archiving and restarting, so it's good to at least get a
 * warning of silent data loss in such cases. These records serve no
 * other purpose and are simply ignored during crash recovery.
 *
 * reason: NUL-terminated string; it is stored in the record together with
 *		its terminator.
 */
void
XLogReportUnloggedStatement(char *reason)
{
	XLogRecData unloggedRec;

	/* Payload is the reason text, including the trailing NUL. */
	unloggedRec.data = reason;
	unloggedRec.len = strlen(reason) + 1;
	unloggedRec.buffer = InvalidBuffer;
	unloggedRec.next = NULL;

	(void) XLogInsert(RM_XLOG_ID, XLOG_UNLOGGED, &unloggedRec);
}

T
Tom Lane 已提交
7389 7390
/*
 * XLOG resource manager's routines
7391 7392
 *
 * Definitions of info values are in include/catalog/pg_control.h, though
7393
 * not all record types are related to control file updates.
T
Tom Lane 已提交
7394
 */
V
WAL  
Vadim B. Mikheev 已提交
7395 7396 7397
void
xlog_redo(XLogRecPtr lsn, XLogRecord *record)
{
B
Bruce Momjian 已提交
7398
	uint8		info = record->xl_info & ~XLR_INFO_MASK;
7399

7400 7401 7402
	/* Backup blocks are not used in xlog records */
	Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));

7403
	if (info == XLOG_NEXTOID)
7404
	{
B
Bruce Momjian 已提交
7405
		Oid			nextOid;
7406 7407 7408

		memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
		if (ShmemVariableCache->nextOid < nextOid)
T
Tom Lane 已提交
7409
		{
7410
			ShmemVariableCache->nextOid = nextOid;
T
Tom Lane 已提交
7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422
			ShmemVariableCache->oidCount = 0;
		}
	}
	else if (info == XLOG_CHECKPOINT_SHUTDOWN)
	{
		CheckPoint	checkPoint;

		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
		/* In a SHUTDOWN checkpoint, believe the counters exactly */
		ShmemVariableCache->nextXid = checkPoint.nextXid;
		ShmemVariableCache->nextOid = checkPoint.nextOid;
		ShmemVariableCache->oidCount = 0;
7423 7424
		MultiXactSetNextMXact(checkPoint.nextMulti,
							  checkPoint.nextMultiOffset);
7425 7426
		ShmemVariableCache->oldestXid = checkPoint.oldestXid;
		ShmemVariableCache->oldestXidDB = checkPoint.oldestXidDB;
B
Bruce Momjian 已提交
7427

7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440
		/* Check to see if any changes to max_connections give problems */
		if (standbyState != STANDBY_DISABLED)
			CheckRequiredParameterValues(checkPoint);

		if (standbyState >= STANDBY_INITIALIZED)
		{
			/*
			 * Remove stale transactions, if any.
			 */
			ExpireOldKnownAssignedTransactionIds(checkPoint.nextXid);
			StandbyReleaseOldLocks(checkPoint.nextXid);
		}

7441 7442 7443 7444
		/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
		ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;

7445
		/*
B
Bruce Momjian 已提交
7446
		 * TLI may change in a shutdown checkpoint, but it shouldn't decrease
7447 7448 7449 7450 7451 7452 7453 7454
		 */
		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
		{
			if (checkPoint.ThisTimeLineID < ThisTimeLineID ||
				!list_member_int(expectedTLIs,
								 (int) checkPoint.ThisTimeLineID))
				ereport(PANIC,
						(errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
7455 7456 7457
								checkPoint.ThisTimeLineID, ThisTimeLineID)));
			/* Following WAL records should be run with new TLI */
			ThisTimeLineID = checkPoint.ThisTimeLineID;
7458
		}
7459 7460

		RecoveryRestartPoint(&checkPoint);
T
Tom Lane 已提交
7461 7462 7463 7464 7465 7466
	}
	else if (info == XLOG_CHECKPOINT_ONLINE)
	{
		CheckPoint	checkPoint;

		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
7467
		/* In an ONLINE checkpoint, treat the counters like NEXTOID */
7468 7469
		if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
								  checkPoint.nextXid))
T
Tom Lane 已提交
7470 7471 7472 7473 7474 7475
			ShmemVariableCache->nextXid = checkPoint.nextXid;
		if (ShmemVariableCache->nextOid < checkPoint.nextOid)
		{
			ShmemVariableCache->nextOid = checkPoint.nextOid;
			ShmemVariableCache->oidCount = 0;
		}
7476 7477
		MultiXactAdvanceNextMXact(checkPoint.nextMulti,
								  checkPoint.nextMultiOffset);
7478 7479 7480 7481 7482 7483
		if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
								  checkPoint.oldestXid))
		{
			ShmemVariableCache->oldestXid = checkPoint.oldestXid;
			ShmemVariableCache->oldestXidDB = checkPoint.oldestXidDB;
		}
7484 7485 7486 7487 7488

		/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
		ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;

7489 7490
		/* TLI should not change in an on-line checkpoint */
		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
7491
			ereport(PANIC,
7492 7493
					(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
							checkPoint.ThisTimeLineID, ThisTimeLineID)));
7494 7495

		RecoveryRestartPoint(&checkPoint);
7496
	}
7497 7498 7499 7500
	else if (info == XLOG_NOOP)
	{
		/* nothing to do here */
	}
7501 7502 7503 7504
	else if (info == XLOG_SWITCH)
	{
		/* nothing to do here */
	}
7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530
	else if (info == XLOG_BACKUP_END)
	{
		XLogRecPtr	startpoint;
		memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));

		if (XLByteEQ(ControlFile->backupStartPoint, startpoint))
		{
			/*
			 * We have reached the end of base backup, the point where
			 * pg_stop_backup() was done. The data on disk is now consistent.
			 * Reset backupStartPoint, and update minRecoveryPoint to make
			 * sure we don't allow starting up at an earlier point even if
			 * recovery is stopped and restarted soon after this.
			 */
			elog(DEBUG1, "end of backup reached");

			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

			if (XLByteLT(ControlFile->minRecoveryPoint, lsn))
				ControlFile->minRecoveryPoint = lsn;
			MemSet(&ControlFile->backupStartPoint, 0, sizeof(XLogRecPtr));
			UpdateControlFile();

			LWLockRelease(ControlFileLock);
		}
	}
7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543
	else if (info == XLOG_UNLOGGED)
	{
		if (InArchiveRecovery)
		{
			/*
			 * Note: We don't print the reason string from the record,
			 * because that gets added as a line using xlog_desc()
			 */
			ereport(WARNING,
					(errmsg("unlogged operation performed, data may be missing"),
					 errhint("This can happen if you temporarily disable archive_mode without taking a new base backup.")));
		}
	}
V
WAL  
Vadim B. Mikheev 已提交
7544
}
B
Bruce Momjian 已提交
7545

V
WAL  
Vadim B. Mikheev 已提交
7546
void
7547
xlog_desc(StringInfo buf, uint8 xl_info, char *rec)
V
WAL  
Vadim B. Mikheev 已提交
7548
{
B
Bruce Momjian 已提交
7549
	uint8		info = xl_info & ~XLR_INFO_MASK;
V
WAL  
Vadim B. Mikheev 已提交
7550

T
Tom Lane 已提交
7551 7552
	if (info == XLOG_CHECKPOINT_SHUTDOWN ||
		info == XLOG_CHECKPOINT_ONLINE)
V
WAL  
Vadim B. Mikheev 已提交
7553
	{
B
Bruce Momjian 已提交
7554 7555
		CheckPoint *checkpoint = (CheckPoint *) rec;

7556
		appendStringInfo(buf, "checkpoint: redo %X/%X; "
7557
						 "tli %u; xid %u/%u; oid %u; multi %u; offset %u; "
7558
						 "oldest xid %u in DB %u; oldest running xid %u; %s",
B
Bruce Momjian 已提交
7559 7560 7561 7562 7563 7564
						 checkpoint->redo.xlogid, checkpoint->redo.xrecoff,
						 checkpoint->ThisTimeLineID,
						 checkpoint->nextXidEpoch, checkpoint->nextXid,
						 checkpoint->nextOid,
						 checkpoint->nextMulti,
						 checkpoint->nextMultiOffset,
7565 7566
						 checkpoint->oldestXid,
						 checkpoint->oldestXidDB,
7567
						 checkpoint->oldestActiveXid,
B
Bruce Momjian 已提交
7568
				 (info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : "online");
T
Tom Lane 已提交
7569
	}
7570 7571 7572 7573
	else if (info == XLOG_NOOP)
	{
		appendStringInfo(buf, "xlog no-op");
	}
7574 7575
	else if (info == XLOG_NEXTOID)
	{
B
Bruce Momjian 已提交
7576
		Oid			nextOid;
7577 7578

		memcpy(&nextOid, rec, sizeof(Oid));
7579
		appendStringInfo(buf, "nextOid: %u", nextOid);
7580
	}
7581 7582 7583 7584
	else if (info == XLOG_SWITCH)
	{
		appendStringInfo(buf, "xlog switch");
	}
7585 7586 7587 7588 7589 7590 7591 7592
	else if (info == XLOG_BACKUP_END)
	{
		XLogRecPtr startpoint;

		memcpy(&startpoint, rec, sizeof(XLogRecPtr));
		appendStringInfo(buf, "backup end: %X/%X",
						 startpoint.xlogid, startpoint.xrecoff);
	}
7593 7594 7595 7596 7597 7598
	else if (info == XLOG_UNLOGGED)
	{
		char *reason = rec;

		appendStringInfo(buf, "unlogged operation: %s", reason);
	}
V
WAL  
Vadim B. Mikheev 已提交
7599
	else
7600
		appendStringInfo(buf, "UNKNOWN");
V
WAL  
Vadim B. Mikheev 已提交
7601 7602
}

7603
#ifdef WAL_DEBUG
7604

V
WAL  
Vadim B. Mikheev 已提交
7605
static void
7606
xlog_outrec(StringInfo buf, XLogRecord *record)
V
WAL  
Vadim B. Mikheev 已提交
7607
{
B
Bruce Momjian 已提交
7608
	int			i;
7609

7610
	appendStringInfo(buf, "prev %X/%X; xid %u",
7611 7612
					 record->xl_prev.xlogid, record->xl_prev.xrecoff,
					 record->xl_xid);
7613

7614 7615 7616
	appendStringInfo(buf, "; len %u",
					 record->xl_len);

7617
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
7618
	{
7619
		if (record->xl_info & XLR_SET_BKP_BLOCK(i))
B
Bruce Momjian 已提交
7620
			appendStringInfo(buf, "; bkpb%d", i + 1);
7621 7622
	}

7623
	appendStringInfo(buf, ": %s", RmgrTable[record->xl_rmid].rm_name);
V
WAL  
Vadim B. Mikheev 已提交
7624
}
B
Bruce Momjian 已提交
7625
#endif   /* WAL_DEBUG */
7626 7627 7628


/*
7629 7630
 * Return the (possible) sync flag used for opening a file, depending on the
 * value of the GUC wal_sync_method.
7631
 */
7632 7633
static int
get_sync_bit(int method)
7634
{
7635 7636 7637
	/* If fsync is disabled, never open in sync mode */
	if (!enableFsync)
		return 0;
7638

7639
	switch (method)
7640
	{
7641 7642 7643 7644 7645 7646
			/*
			 * enum values for all sync options are defined even if they are
			 * not supported on the current platform.  But if not, they are
			 * not included in the enum option array, and therefore will never
			 * be seen here.
			 */
7647 7648 7649
		case SYNC_METHOD_FSYNC:
		case SYNC_METHOD_FSYNC_WRITETHROUGH:
		case SYNC_METHOD_FDATASYNC:
7650
			return 0;
7651
#ifdef OPEN_SYNC_FLAG
7652
		case SYNC_METHOD_OPEN:
7653
			return OPEN_SYNC_FLAG;
7654 7655
#endif
#ifdef OPEN_DATASYNC_FLAG
7656
		case SYNC_METHOD_OPEN_DSYNC:
7657
			return OPEN_DATASYNC_FLAG;
7658
#endif
7659
		default:
7660 7661
			/* can't happen (unless we are out of sync with option array) */
			elog(ERROR, "unrecognized wal_sync_method: %d", method);
7662
			return 0;			/* silence warning */
7663
	}
7664
}
7665

7666 7667 7668 7669 7670 7671
/*
 * GUC support
 */

/*
 * assign_xlog_sync_method
 *		Assign hook for wal_sync_method.
 *
 * Always accepts the new value.  When 'doit' is set and the method really
 * changes while a log segment is open, the segment is fsync'd first so that
 * no block escapes unsynced.  If the open flag differs between the old and
 * new methods, the file is also closed, so that it gets reopened with the
 * new flag bit at next use.
 */
bool
assign_xlog_sync_method(int new_sync_method, bool doit, GucSource source)
{
	/* Nothing to do on a dry run, an unchanged setting, or no open file */
	if (!doit)
		return true;
	if (sync_method == new_sync_method)
		return true;
	if (openLogFile < 0)
		return true;

	if (pg_fsync(openLogFile) != 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not fsync log file %u, segment %u: %m",
						openLogId, openLogSeg)));

	if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
		XLogFileClose();

	return true;
}


/*
7700 7701 7702 7703
 * Issue appropriate kind of fsync (if any) for an XLOG output file.
 *
 * 'fd' is a file descriptor for the XLOG file to be fsync'd.
 * 'log' and 'seg' are for error reporting purposes.
7704
 */
7705 7706
void
issue_xlog_fsync(int fd, uint32 log, uint32 seg)
7707 7708 7709
{
	switch (sync_method)
	{
7710
		case SYNC_METHOD_FSYNC:
7711
			if (pg_fsync_no_writethrough(fd) != 0)
7712 7713
				ereport(PANIC,
						(errcode_for_file_access(),
B
Bruce Momjian 已提交
7714
						 errmsg("could not fsync log file %u, segment %u: %m",
7715
								log, seg)));
7716
			break;
7717 7718
#ifdef HAVE_FSYNC_WRITETHROUGH
		case SYNC_METHOD_FSYNC_WRITETHROUGH:
7719
			if (pg_fsync_writethrough(fd) != 0)
7720 7721
				ereport(PANIC,
						(errcode_for_file_access(),
B
Bruce Momjian 已提交
7722
						 errmsg("could not fsync write-through log file %u, segment %u: %m",
7723
								log, seg)));
7724 7725
			break;
#endif
7726 7727
#ifdef HAVE_FDATASYNC
		case SYNC_METHOD_FDATASYNC:
7728
			if (pg_fdatasync(fd) != 0)
7729 7730
				ereport(PANIC,
						(errcode_for_file_access(),
B
Bruce Momjian 已提交
7731
					errmsg("could not fdatasync log file %u, segment %u: %m",
7732
						   log, seg)));
7733 7734 7735
			break;
#endif
		case SYNC_METHOD_OPEN:
7736
		case SYNC_METHOD_OPEN_DSYNC:
7737 7738 7739
			/* write synced it already */
			break;
		default:
7740
			elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
7741 7742 7743
			break;
	}
}
7744 7745 7746 7747 7748 7749 7750 7751 7752


/*
 * pg_start_backup: set up for taking an on-line backup dump
 *
 * Essentially what this does is to create a backup label file in $PGDATA,
 * where it will be archived as part of the backup dump.  The label file
 * contains the user-supplied label string (typically this would be used
 * to tell where the backup dump will be stored) and the starting time and
7753
 * starting WAL location for the dump.
7754 7755 7756 7757 7758
 */
Datum
pg_start_backup(PG_FUNCTION_ARGS)
{
	text	   *backupid = PG_GETARG_TEXT_P(0);
7759
	bool		fast = PG_GETARG_BOOL(1);
7760
	char	   *backupidstr;
7761
	XLogRecPtr	checkpointloc;
7762
	XLogRecPtr	startpoint;
7763
	pg_time_t	stamp_time;
7764 7765 7766 7767 7768 7769 7770
	char		strfbuf[128];
	char		xlogfilename[MAXFNAMELEN];
	uint32		_logId;
	uint32		_logSeg;
	struct stat stat_buf;
	FILE	   *fp;

B
Bruce Momjian 已提交
7771
	if (!superuser())
7772 7773
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
7774
				 errmsg("must be superuser to run a backup")));
7775

7776 7777 7778 7779 7780 7781
	if (RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("WAL control functions cannot be executed during recovery.")));

7782 7783 7784
	if (!XLogArchivingActive())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7785 7786 7787 7788 7789 7790 7791 7792 7793
				 errmsg("WAL archiving is not active"),
				 errhint("archive_mode must be enabled at server start.")));

	if (!XLogArchiveCommandSet())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("WAL archiving is not active"),
				 errhint("archive_command must be defined before "
						 "online backups can be made safely.")));
7794

7795
	backupidstr = text_to_cstring(backupid);
B
Bruce Momjian 已提交
7796

7797
	/*
7798 7799 7800
	 * Mark backup active in shared memory.  We must do full-page WAL writes
	 * during an on-line backup even if not doing so at other times, because
	 * it's quite possible for the backup dump to obtain a "torn" (partially
B
Bruce Momjian 已提交
7801 7802 7803 7804 7805 7806 7807 7808 7809
	 * written) copy of a database page if it reads the page concurrently with
	 * our write to the same page.	This can be fixed as long as the first
	 * write to the page in the WAL sequence is a full-page write. Hence, we
	 * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
	 * are no dirty pages in shared memory that might get dumped while the
	 * backup is in progress without having a corresponding WAL record.  (Once
	 * the backup is complete, we need not force full-page writes anymore,
	 * since we expect that any pages not modified during the backup interval
	 * must have been correctly captured by the backup.)
7810
	 *
B
Bruce Momjian 已提交
7811 7812
	 * We must hold WALInsertLock to change the value of forcePageWrites, to
	 * ensure adequate interlocking against XLogInsert().
7813
	 */
7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
	if (XLogCtl->Insert.forcePageWrites)
	{
		LWLockRelease(WALInsertLock);
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("a backup is already in progress"),
				 errhint("Run pg_stop_backup() and try again.")));
	}
	XLogCtl->Insert.forcePageWrites = true;
	LWLockRelease(WALInsertLock);
B
Bruce Momjian 已提交
7825

7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838
	/*
	 * Force an XLOG file switch before the checkpoint, to ensure that the WAL
	 * segment the checkpoint is written to doesn't contain pages with old
	 * timeline IDs. That would otherwise happen if you called
	 * pg_start_backup() right after restoring from a PITR archive: the first
	 * WAL segment containing the startup checkpoint has pages in the
	 * beginning with the old timeline ID. That can cause trouble at recovery:
	 * we won't have a history file covering the old timeline if pg_xlog
	 * directory was not included in the base backup and the WAL archive was
	 * cleared too before starting the backup.
	 */
	RequestXLogSwitch();

7839 7840
	/* Ensure we release forcePageWrites if fail below */
	PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) 0);
7841 7842
	{
		/*
B
Bruce Momjian 已提交
7843
		 * Force a CHECKPOINT.	Aside from being necessary to prevent torn
7844 7845 7846
		 * page problems, this guarantees that two successive backup runs will
		 * have different checkpoint positions and hence different history
		 * file names, even if nothing happened in between.
7847
		 *
7848 7849
		 * We use CHECKPOINT_IMMEDIATE only if requested by user (via passing
		 * fast = true).  Otherwise this can take awhile.
7850
		 */
7851 7852
		RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
						  (fast ? CHECKPOINT_IMMEDIATE : 0));
7853

7854 7855 7856 7857 7858 7859 7860 7861 7862
		/*
		 * Now we need to fetch the checkpoint record location, and also its
		 * REDO pointer.  The oldest point in WAL that would be needed to
		 * restore starting from the checkpoint is precisely the REDO pointer.
		 */
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
		checkpointloc = ControlFile->checkPoint;
		startpoint = ControlFile->checkPointCopy.redo;
		LWLockRelease(ControlFileLock);
B
Bruce Momjian 已提交
7863

7864 7865
		XLByteToSeg(startpoint, _logId, _logSeg);
		XLogFileName(xlogfilename, ThisTimeLineID, _logId, _logSeg);
B
Bruce Momjian 已提交
7866

7867 7868 7869 7870 7871
		/* Use the log timezone here, not the session timezone */
		stamp_time = (pg_time_t) time(NULL);
		pg_strftime(strfbuf, sizeof(strfbuf),
					"%Y-%m-%d %H:%M:%S %Z",
					pg_localtime(&stamp_time, log_timezone));
7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897

		/*
		 * Check for existing backup label --- implies a backup is already
		 * running.  (XXX given that we checked forcePageWrites above, maybe
		 * it would be OK to just unlink any such label file?)
		 */
		if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
		{
			if (errno != ENOENT)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not stat file \"%s\": %m",
								BACKUP_LABEL_FILE)));
		}
		else
			ereport(ERROR,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("a backup is already in progress"),
					 errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
							 BACKUP_LABEL_FILE)));

		/*
		 * Okay, write the file
		 */
		fp = AllocateFile(BACKUP_LABEL_FILE, "w");
		if (!fp)
7898 7899
			ereport(ERROR,
					(errcode_for_file_access(),
7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911
					 errmsg("could not create file \"%s\": %m",
							BACKUP_LABEL_FILE)));
		fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
				startpoint.xlogid, startpoint.xrecoff, xlogfilename);
		fprintf(fp, "CHECKPOINT LOCATION: %X/%X\n",
				checkpointloc.xlogid, checkpointloc.xrecoff);
		fprintf(fp, "START TIME: %s\n", strfbuf);
		fprintf(fp, "LABEL: %s\n", backupidstr);
		if (fflush(fp) || ferror(fp) || FreeFile(fp))
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write file \"%s\": %m",
7912
							BACKUP_LABEL_FILE)));
7913
	}
7914
	PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) 0);
B
Bruce Momjian 已提交
7915

7916
	/*
7917
	 * We're done.  As a convenience, return the starting WAL location.
7918 7919 7920
	 */
	snprintf(xlogfilename, sizeof(xlogfilename), "%X/%X",
			 startpoint.xlogid, startpoint.xrecoff);
7921
	PG_RETURN_TEXT_P(cstring_to_text(xlogfilename));
7922 7923
}

7924 7925 7926 7927 7928 7929 7930 7931 7932 7933
/*
 * pg_start_backup_callback
 *		Error-cleanup hook for pg_start_backup: if the backup setup fails,
 *		switch forced full-page writes back off.  Both arguments are unused.
 */
static void
pg_start_backup_callback(int code, Datum arg)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;

	/* the flag may only be changed while holding WALInsertLock */
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
	Insert->forcePageWrites = false;
	LWLockRelease(WALInsertLock);
}

7934 7935 7936
/*
 * pg_stop_backup: finish taking an on-line backup dump
 *
7937 7938 7939 7940 7941 7942 7943 7944
 * We write an end-of-backup WAL record, and remove the backup label file
 * created by pg_start_backup, creating a backup history file in pg_xlog
 * instead (whence it will immediately be archived). The backup history file
 * contains the same info found in the label file, plus the backup-end time
 * and WAL location. Before 8.5, the backup-end time was read from the backup
 * history file at the beginning of archive recovery, but we now use the WAL
 * record for that and the file is for informational and debug purposes only.
 *
7945
 * Note: different from CancelBackup which just cancels online backup mode.
7946 7947 7948 7949 7950 7951
 */
Datum
pg_stop_backup(PG_FUNCTION_ARGS)
{
	XLogRecPtr	startpoint;
	XLogRecPtr	stoppoint;
7952
	XLogRecData	rdata;
7953
	pg_time_t	stamp_time;
7954
	char		strfbuf[128];
7955
	char		histfilepath[MAXPGPATH];
7956 7957
	char		startxlogfilename[MAXFNAMELEN];
	char		stopxlogfilename[MAXFNAMELEN];
7958 7959
	char		lastxlogfilename[MAXFNAMELEN];
	char		histfilename[MAXFNAMELEN];
7960 7961 7962 7963 7964 7965
	uint32		_logId;
	uint32		_logSeg;
	FILE	   *lfp;
	FILE	   *fp;
	char		ch;
	int			ich;
7966 7967
	int			seconds_before_warning;
	int			waits = 0;
7968

B
Bruce Momjian 已提交
7969
	if (!superuser())
7970 7971 7972
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
				 (errmsg("must be superuser to run a backup"))));
B
Bruce Momjian 已提交
7973

7974 7975 7976 7977 7978 7979
	if (RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("WAL control functions cannot be executed during recovery.")));

7980 7981 7982 7983 7984 7985
	if (!XLogArchivingActive())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("WAL archiving is not active"),
				 errhint("archive_mode must be enabled at server start.")));

7986
	/*
7987
	 * OK to clear forcePageWrites
7988 7989
	 */
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
7990
	XLogCtl->Insert.forcePageWrites = false;
7991 7992 7993 7994 7995
	LWLockRelease(WALInsertLock);

	/*
	 * Open the existing label file
	 */
7996
	lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
7997 7998 7999 8000 8001 8002
	if (!lfp)
	{
		if (errno != ENOENT)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
8003
							BACKUP_LABEL_FILE)));
8004 8005 8006 8007
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("a backup is not in progress")));
	}
B
Bruce Momjian 已提交
8008

8009
	/*
B
Bruce Momjian 已提交
8010 8011
	 * Read and parse the START WAL LOCATION line (this code is pretty crude,
	 * but we are not expecting any variability in the file format).
8012 8013 8014 8015 8016 8017
	 */
	if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %24s)%c",
			   &startpoint.xlogid, &startpoint.xrecoff, startxlogfilename,
			   &ch) != 4 || ch != '\n')
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8018
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
B
Bruce Momjian 已提交
8019

8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043
	/*
	 * Write the backup-end xlog record
	 */
	rdata.data = (char *) (&startpoint);
	rdata.len = sizeof(startpoint);
	rdata.buffer = InvalidBuffer;
	rdata.next = NULL;
	stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END, &rdata);

	/*
	 * Force a switch to a new xlog segment file, so that the backup is valid
	 * as soon as archiver moves out the current segment file.
	 */
	RequestXLogSwitch();

	XLByteToSeg(stoppoint, _logId, _logSeg);
	XLogFileName(stopxlogfilename, ThisTimeLineID, _logId, _logSeg);

	/* Use the log timezone here, not the session timezone */
	stamp_time = (pg_time_t) time(NULL);
	pg_strftime(strfbuf, sizeof(strfbuf),
				"%Y-%m-%d %H:%M:%S %Z",
				pg_localtime(&stamp_time, log_timezone));

8044 8045 8046 8047
	/*
	 * Write the backup history file
	 */
	XLByteToSeg(startpoint, _logId, _logSeg);
8048
	BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logId, _logSeg,
8049
						  startpoint.xrecoff % XLogSegSize);
8050
	fp = AllocateFile(histfilepath, "w");
8051 8052 8053 8054
	if (!fp)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m",
8055
						histfilepath)));
8056 8057 8058 8059
	fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
			startpoint.xlogid, startpoint.xrecoff, startxlogfilename);
	fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
			stoppoint.xlogid, stoppoint.xrecoff, stopxlogfilename);
8060
	/* transfer remaining lines from label to history file */
8061 8062 8063 8064 8065 8066 8067
	while ((ich = fgetc(lfp)) != EOF)
		fputc(ich, fp);
	fprintf(fp, "STOP TIME: %s\n", strfbuf);
	if (fflush(fp) || ferror(fp) || FreeFile(fp))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write file \"%s\": %m",
8068
						histfilepath)));
B
Bruce Momjian 已提交
8069

8070 8071 8072 8073 8074 8075 8076
	/*
	 * Close and remove the backup label file
	 */
	if (ferror(lfp) || FreeFile(lfp))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not read file \"%s\": %m",
8077 8078
						BACKUP_LABEL_FILE)));
	if (unlink(BACKUP_LABEL_FILE) != 0)
8079 8080 8081
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
8082
						BACKUP_LABEL_FILE)));
B
Bruce Momjian 已提交
8083

8084
	/*
B
Bruce Momjian 已提交
8085 8086 8087
	 * Clean out any no-longer-needed history files.  As a side effect, this
	 * will post a .ready file for the newly created history file, notifying
	 * the archiver that history file may be archived immediately.
8088
	 */
8089
	CleanupBackupHistory();
B
Bruce Momjian 已提交
8090

8091
	/*
8092 8093 8094 8095
	 * Wait until both the last WAL file filled during backup and the history
	 * file have been archived.  We assume that the alphabetic sorting
	 * property of the WAL files ensures any earlier WAL files are safely
	 * archived as well.
8096
	 *
8097 8098 8099
	 * We wait forever, since archive_command is supposed to work and we
	 * assume the admin wanted his backup to work completely. If you don't
	 * wish to wait, you can set statement_timeout.
8100
	 */
8101 8102 8103 8104 8105
	XLByteToPrevSeg(stoppoint, _logId, _logSeg);
	XLogFileName(lastxlogfilename, ThisTimeLineID, _logId, _logSeg);

	XLByteToSeg(startpoint, _logId, _logSeg);
	BackupHistoryFileName(histfilename, ThisTimeLineID, _logId, _logSeg,
8106 8107 8108 8109 8110
						  startpoint.xrecoff % XLogSegSize);

	seconds_before_warning = 60;
	waits = 0;

8111 8112
	while (XLogArchiveIsBusy(lastxlogfilename) ||
		   XLogArchiveIsBusy(histfilename))
8113 8114 8115 8116 8117 8118 8119
	{
		CHECK_FOR_INTERRUPTS();

		pg_usleep(1000000L);

		if (++waits >= seconds_before_warning)
		{
8120
			seconds_before_warning *= 2;		/* This wraps in >10 years... */
8121 8122 8123
			ereport(WARNING,
					(errmsg("pg_stop_backup still waiting for archive to complete (%d seconds elapsed)",
							waits)));
8124 8125 8126
		}
	}

8127
	/*
8128
	 * We're done.  As a convenience, return the ending WAL location.
8129 8130 8131
	 */
	snprintf(stopxlogfilename, sizeof(stopxlogfilename), "%X/%X",
			 stoppoint.xlogid, stoppoint.xrecoff);
8132
	PG_RETURN_TEXT_P(cstring_to_text(stopxlogfilename));
8133
}
8134

8135 8136 8137 8138 8139 8140
/*
 * pg_switch_xlog: switch to next xlog file
 */
Datum
pg_switch_xlog(PG_FUNCTION_ARGS)
{
B
Bruce Momjian 已提交
8141
	XLogRecPtr	switchpoint;
8142 8143 8144 8145 8146
	char		location[MAXFNAMELEN];

	if (!superuser())
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
B
Bruce Momjian 已提交
8147
			 (errmsg("must be superuser to switch transaction log files"))));
8148

8149 8150 8151 8152 8153 8154
	if (RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("WAL control functions cannot be executed during recovery.")));

8155 8156 8157 8158 8159 8160 8161
	switchpoint = RequestXLogSwitch();

	/*
	 * As a convenience, return the WAL location of the switch record
	 */
	snprintf(location, sizeof(location), "%X/%X",
			 switchpoint.xlogid, switchpoint.xrecoff);
8162
	PG_RETURN_TEXT_P(cstring_to_text(location));
8163 8164 8165
}

/*
8166 8167 8168 8169 8170
 * Report the current WAL write location (same format as pg_start_backup etc)
 *
 * This is useful for determining how much of WAL is visible to an external
 * archiving process.  Note that the data before this point is written out
 * to the kernel, but is not necessarily synced to disk.
8171 8172 8173
 */
Datum
pg_current_xlog_location(PG_FUNCTION_ARGS)
8174 8175 8176
{
	char		location[MAXFNAMELEN];

8177 8178 8179 8180 8181 8182
	if (RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("WAL control functions cannot be executed during recovery.")));

8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194
	/* Make sure we have an up-to-date local LogwrtResult */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		LogwrtResult = xlogctl->LogwrtResult;
		SpinLockRelease(&xlogctl->info_lck);
	}

	snprintf(location, sizeof(location), "%X/%X",
			 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff);
8195
	PG_RETURN_TEXT_P(cstring_to_text(location));
8196 8197 8198 8199 8200 8201 8202 8203 8204
}

/*
 * Report the current WAL insert location (same format as pg_start_backup etc)
 *
 * This function is mostly for debugging purposes.
 */
Datum
pg_current_xlog_insert_location(PG_FUNCTION_ARGS)
8205 8206 8207 8208 8209
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogRecPtr	current_recptr;
	char		location[MAXFNAMELEN];

8210 8211 8212 8213 8214 8215
	if (RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("WAL control functions cannot be executed during recovery.")));

8216 8217 8218 8219 8220 8221 8222 8223 8224
	/*
	 * Get the current end-of-WAL position ... shared lock is sufficient
	 */
	LWLockAcquire(WALInsertLock, LW_SHARED);
	INSERT_RECPTR(current_recptr, Insert, Insert->curridx);
	LWLockRelease(WALInsertLock);

	snprintf(location, sizeof(location), "%X/%X",
			 current_recptr.xlogid, current_recptr.xrecoff);
8225
	PG_RETURN_TEXT_P(cstring_to_text(location));
8226 8227
}

8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269
/*
 * Report the last WAL receive location (same format as pg_start_backup etc)
 *
 * Tells how much of WAL is guaranteed to have been received and synced to
 * disk by walreceiver.
 */
Datum
pg_last_xlog_receive_location(PG_FUNCTION_ARGS)
{
	XLogRecPtr	received = GetWalRcvWriteRecPtr();
	char		buf[MAXFNAMELEN];

	snprintf(buf, sizeof(buf), "%X/%X",
			 received.xlogid, received.xrecoff);
	PG_RETURN_TEXT_P(cstring_to_text(buf));
}

/*
 * Report the last WAL replay location (same format as pg_start_backup etc)
 *
 * Tells how much of WAL is visible to read-only connections during
 * recovery.
 */
Datum
pg_last_xlog_replay_location(PG_FUNCTION_ARGS)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
	XLogRecPtr	replayed;
	char		buf[MAXFNAMELEN];

	/* copy the shared value out while holding the spinlock */
	SpinLockAcquire(&xlogctl->info_lck);
	replayed = xlogctl->recoveryLastRecPtr;
	SpinLockRelease(&xlogctl->info_lck);

	snprintf(buf, sizeof(buf), "%X/%X",
			 replayed.xlogid, replayed.xrecoff);
	PG_RETURN_TEXT_P(cstring_to_text(buf));
}

8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289
/*
 * Compute an xlog file name and decimal byte offset given a WAL location,
 * such as is returned by pg_stop_backup() or pg_xlog_switch().
 *
 * Note that a location exactly at a segment boundary is taken to be in
 * the previous segment.  This is usually the right thing, since the
 * expected usage is to determine which xlog file(s) are ready to archive.
 */
Datum
pg_xlogfile_name_offset(PG_FUNCTION_ARGS)
{
	text	   *location = PG_GETARG_TEXT_P(0);
	char	   *locationstr;
	unsigned int uxlogid;
	unsigned int uxrecoff;
	uint32		xlogid;
	uint32		xlogseg;
	uint32		xrecoff;
	XLogRecPtr	locationpoint;
	char		xlogfilename[MAXFNAMELEN];
B
Bruce Momjian 已提交
8290 8291 8292 8293 8294
	Datum		values[2];
	bool		isnull[2];
	TupleDesc	resultTupleDesc;
	HeapTuple	resultHeapTuple;
	Datum		result;
8295

8296 8297 8298
	/*
	 * Read input and parse
	 */
8299
	locationstr = text_to_cstring(location);
8300 8301 8302 8303

	if (sscanf(locationstr, "%X/%X", &uxlogid, &uxrecoff) != 2)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
P
Peter Eisentraut 已提交
8304
				 errmsg("could not parse transaction log location \"%s\"",
8305 8306 8307 8308 8309
						locationstr)));

	locationpoint.xlogid = uxlogid;
	locationpoint.xrecoff = uxrecoff;

8310
	/*
B
Bruce Momjian 已提交
8311 8312
	 * Construct a tuple descriptor for the result row.  This must match this
	 * function's pg_proc entry!
8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324
	 */
	resultTupleDesc = CreateTemplateTupleDesc(2, false);
	TupleDescInitEntry(resultTupleDesc, (AttrNumber) 1, "file_name",
					   TEXTOID, -1, 0);
	TupleDescInitEntry(resultTupleDesc, (AttrNumber) 2, "file_offset",
					   INT4OID, -1, 0);

	resultTupleDesc = BlessTupleDesc(resultTupleDesc);

	/*
	 * xlogfilename
	 */
8325 8326 8327
	XLByteToPrevSeg(locationpoint, xlogid, xlogseg);
	XLogFileName(xlogfilename, ThisTimeLineID, xlogid, xlogseg);

8328
	values[0] = CStringGetTextDatum(xlogfilename);
8329 8330 8331 8332 8333
	isnull[0] = false;

	/*
	 * offset
	 */
8334 8335
	xrecoff = locationpoint.xrecoff - xlogseg * XLogSegSize;

8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346
	values[1] = UInt32GetDatum(xrecoff);
	isnull[1] = false;

	/*
	 * Tuple jam: Having first prepared your Datums, then squash together
	 */
	resultHeapTuple = heap_form_tuple(resultTupleDesc, values, isnull);

	result = HeapTupleGetDatum(resultHeapTuple);

	PG_RETURN_DATUM(result);
8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364
}

/*
 * Compute an xlog file name given a WAL location,
 * such as is returned by pg_stop_backup() or pg_xlog_switch().
 */
Datum
pg_xlogfile_name(PG_FUNCTION_ARGS)
{
	text	   *location = PG_GETARG_TEXT_P(0);
	char	   *locationstr;
	unsigned int uxlogid;
	unsigned int uxrecoff;
	uint32		xlogid;
	uint32		xlogseg;
	XLogRecPtr	locationpoint;
	char		xlogfilename[MAXFNAMELEN];

8365
	locationstr = text_to_cstring(location);
8366 8367 8368 8369

	if (sscanf(locationstr, "%X/%X", &uxlogid, &uxrecoff) != 2)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
P
Peter Eisentraut 已提交
8370
				 errmsg("could not parse transaction log location \"%s\"",
8371 8372 8373 8374 8375 8376 8377 8378
						locationstr)));

	locationpoint.xlogid = uxlogid;
	locationpoint.xrecoff = uxrecoff;

	XLByteToPrevSeg(locationpoint, xlogid, xlogseg);
	XLogFileName(xlogfilename, ThisTimeLineID, xlogid, xlogseg);

8379
	PG_RETURN_TEXT_P(cstring_to_text(xlogfilename));
8380 8381
}

8382 8383 8384 8385 8386
/*
 * read_backup_label: check to see if a backup_label file is present
 *
 * If we see a backup_label during recovery, we assume that we are recovering
 * from a backup dump file, and we therefore roll forward from the checkpoint
B
Bruce Momjian 已提交
8387
 * identified by the label file, NOT what pg_control says.	This avoids the
8388 8389 8390 8391 8392
 * problem that pg_control might have been archived one or more checkpoints
 * later than the start of the dump, and so if we rely on it as the start
 * point, we will fail to restore a consistent database state.
 *
 * Returns TRUE if a backup_label was found (and fills the checkpoint
8393 8394
 * location and its REDO location into *checkPointLoc and RedoStartLSN,
 * respectively); returns FALSE if not.
8395 8396
 */
static bool
8397
read_backup_label(XLogRecPtr *checkPointLoc)
8398 8399 8400 8401 8402 8403 8404 8405 8406
{
	char		startxlogfilename[MAXFNAMELEN];
	TimeLineID	tli;
	FILE	   *lfp;
	char		ch;

	/*
	 * See if label file is present
	 */
8407
	lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
8408 8409 8410 8411 8412 8413
	if (!lfp)
	{
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
8414
							BACKUP_LABEL_FILE)));
8415 8416
		return false;			/* it's not there, all is fine */
	}
B
Bruce Momjian 已提交
8417

8418
	/*
B
Bruce Momjian 已提交
8419 8420 8421
	 * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
	 * is pretty crude, but we are not expecting any variability in the file
	 * format).
8422 8423
	 */
	if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
8424
			   &RedoStartLSN.xlogid, &RedoStartLSN.xrecoff, &tli,
8425 8426 8427
			   startxlogfilename, &ch) != 5 || ch != '\n')
		ereport(FATAL,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8428
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
8429 8430 8431 8432 8433
	if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
			   &checkPointLoc->xlogid, &checkPointLoc->xrecoff,
			   &ch) != 3 || ch != '\n')
		ereport(FATAL,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8434
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
8435 8436 8437 8438
	if (ferror(lfp) || FreeFile(lfp))
		ereport(FATAL,
				(errcode_for_file_access(),
				 errmsg("could not read file \"%s\": %m",
8439
						BACKUP_LABEL_FILE)));
B
Bruce Momjian 已提交
8440

8441 8442 8443
	return true;
}

/*
 * Error context callback for errors occurring during rm_redo().
 */
static void
rm_redo_error_callback(void *arg)
{
B
Bruce Momjian 已提交
8450 8451
	XLogRecord *record = (XLogRecord *) arg;
	StringInfoData buf;
8452 8453

	initStringInfo(&buf);
8454 8455
	RmgrTable[record->xl_rmid].rm_desc(&buf,
									   record->xl_info,
8456 8457 8458 8459 8460 8461 8462 8463
									   XLogRecGetData(record));

	/* don't bother emitting empty description */
	if (buf.len > 0)
		errcontext("xlog redo %s", buf.data);

	pfree(buf.data);
}

/*
 * BackupInProgress: report whether online backup mode is active
 *
 * Online backup mode is signalled by the existence of the "backup_label"
 * file, so this simply stat()s it.  Any stat() failure, whatever the reason,
 * is reported as "no backup in progress".
 */
bool
BackupInProgress(void)
{
	struct stat labelStat;

	if (stat(BACKUP_LABEL_FILE, &labelStat) != 0)
		return false;

	return true;
}

/*
 * CancelBackup: leave online backup mode by renaming "backup_label"
 *
 * When a "backup_label" file exists it is renamed to "backup_label.old".
 * Doing so renders an online backup that is in progress useless; the proper
 * way to finish an online backup is to call pg_stop_backup.
 *
 * Nothing is done (and nothing is logged) when stat() on the label file
 * fails.  A successful rename is logged at LOG level; a failed rename only
 * produces a WARNING.
 */
void
CancelBackup(void)
{
	struct stat labelStat;

	/* No label file means there is no backup mode to cancel. */
	if (stat(BACKUP_LABEL_FILE, &labelStat) < 0)
		return;

	/*
	 * Clear away any file left over from a previously cancelled backup.  The
	 * result is not checked.
	 */
	unlink(BACKUP_LABEL_OLD);

	if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
	{
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("online backup mode was not cancelled"),
				 errdetail("Could not rename \"%s\" to \"%s\": %m.",
						   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
		return;
	}

	ereport(LOG,
			(errmsg("online backup mode cancelled"),
			 errdetail("\"%s\" was renamed to \"%s\".",
					   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
}

/* ------------------------------------------------------
 *	Startup Process main entry point and signal handlers
 * ------------------------------------------------------
 */

/*
 * startupproc_quickdie() occurs when signalled SIGQUIT by the postmaster.
 *
 * Some backend has bought the farm,
 * so we need to stop what we're doing and exit.
 *
 * The handler never returns: it installs the BlockSig signal mask, discards
 * all registered exit callbacks, and terminates the process with status 2.
 */
static void
startupproc_quickdie(SIGNAL_ARGS)
{
	/* Switch to the BlockSig mask for the remainder of the exit sequence. */
	PG_SETMASK(&BlockSig);

	/*
	 * We DO NOT want to run proc_exit() callbacks -- we're here because
	 * shared memory may be corrupted, so we don't want to try to clean up our
	 * transaction.  Just nail the windows shut and get out of town.  Now that
	 * there's an atexit callback to prevent third-party code from breaking
	 * things by calling exit() directly, we have to reset the callbacks
	 * explicitly to make this work as intended.
	 */
	on_exit_reset();

	/*
	 * Note we do exit(2) not exit(0).  This is to force the postmaster into a
	 * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
	 * backend.  This is necessary precisely because we don't clean up our
	 * shared memory state.  (The "dead man switch" mechanism in pmsignal.c
	 * should ensure the postmaster sees this as a crash, too, but no harm in
	 * being doubly sure.)
	 */
	exit(2);
}


/*
 * SIGHUP: set flag to re-read config file at next convenient time.
 *
 * The handler only records the request in got_SIGHUP.  The reload itself is
 * performed by HandleStartupProcInterrupts(), which calls ProcessConfigFile()
 * when it finds the flag set.
 */
static void
StartupProcSigHupHandler(SIGNAL_ARGS)
{
	got_SIGHUP = true;
}

/* SIGTERM: set flag to abort redo and exit */
static void
StartupProcShutdownHandler(SIGNAL_ARGS)
{
	if (in_restore_command)
8564
		proc_exit(1);
8565 8566 8567 8568
	else
		shutdown_requested = true;
}

/* Handle SIGHUP and SIGTERM signals of startup process */
void
HandleStartupProcInterrupts(void)
{
	/*
	 * Check if we were requested to re-read config file.
	 */
	if (got_SIGHUP)
	{
		got_SIGHUP = false;
		ProcessConfigFile(PGC_SIGHUP);
	}
	/*
	 * Check if we were requested to exit without finishing recovery.
	 */
	if (shutdown_requested)
		proc_exit(1);
8586 8587 8588 8589 8590 8591 8592

	/*
	 * Emergency bailout if postmaster has died.  This is to avoid the
	 * necessity for manual cleanup of all postmaster children.
	 */
	if (IsUnderPostmaster && !PostmasterIsAlive(true))
		exit(1);
8593 8594
}

/* Main entry point for startup process */
void
StartupProcessMain(void)
{
	/*
	 * If possible, make this process a group leader, so that the postmaster
	 * can signal any child processes too.
	 */
#ifdef HAVE_SETSID
	if (setsid() < 0)
		elog(FATAL, "setsid() failed: %m");
#endif

	/*
	 * Properly accept or ignore signals the postmaster might send us
	 */
8611 8612
	pqsignal(SIGHUP, StartupProcSigHupHandler); /* reload config file */
	pqsignal(SIGINT, SIG_IGN);	/* ignore query cancel */
8613 8614 8615 8616 8617 8618
	pqsignal(SIGTERM, StartupProcShutdownHandler);	/* request shutdown */
	pqsignal(SIGQUIT, startupproc_quickdie);		/* hard crash time */
	if (XLogRequestRecoveryConnections)
		pqsignal(SIGALRM, handle_standby_sig_alarm); /* ignored unless InHotStandby */
	else
		pqsignal(SIGALRM, SIG_IGN);
8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636
	pqsignal(SIGPIPE, SIG_IGN);
	pqsignal(SIGUSR1, SIG_IGN);
	pqsignal(SIGUSR2, SIG_IGN);

	/*
	 * Reset some signals that are accepted by postmaster but not here
	 */
	pqsignal(SIGCHLD, SIG_DFL);
	pqsignal(SIGTTIN, SIG_DFL);
	pqsignal(SIGTTOU, SIG_DFL);
	pqsignal(SIGCONT, SIG_DFL);
	pqsignal(SIGWINCH, SIG_DFL);

	/*
	 * Unblock signals (they were blocked when the postmaster forked us)
	 */
	PG_SETMASK(&UnBlockSig);

8637
	StartupXLOG();
8638

8639
	/*
8640 8641
	 * Exit normally. Exit code 0 tells postmaster that we completed recovery
	 * successfully.
8642
	 */
8643 8644
	proc_exit(0);
}

/*
 * Read the XLOG page containing RecPtr into readBuf (if not read already).
 * Returns true if successful, false otherwise or fails if emode is PANIC.
 *
 * This is responsible for restoring files from archive as needed, as well
 * as for waiting for the requested WAL record to arrive in standby mode.
 *
 * Parameters:
 *	RecPtr		  - location of the record whose page is wanted
 *	emode		  - elevel used when reporting seek/read failures here; it is
 *					also handed to ValidXLOGHeader() and, outside standby
 *					mode, to XLogFileReadAnyTLI()
 *	fetching_ckpt - if true, a streaming request starts at RedoStartLSN
 *					instead of *RecPtr (see the comment at the
 *					RequestXLogStreaming() call)
 *	randAccess	  - if true, curFileTLI is reset to 0 before the segment is
 *					looked up with XLogFileReadAnyTLI()
 *
 * The function works on read-state variables declared outside of it
 * (readFile, readId, readSeg, readOff, readLen, readStreamed, readBuf).  On
 * success they describe the page now held in readBuf.  Failure paths that
 * go through next_record_is_invalid close readFile and reset readStreamed
 * and readLen.
 */
static bool
XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
			 bool randAccess)
{
	/* static: keeps the last value fetched from walreceiver across calls */
	static XLogRecPtr receivedUpto = {0, 0};
	bool		switched_segment = false;
	uint32		targetPageOff;
	uint32		targetRecOff;
	uint32		targetId;
	uint32		targetSeg;

	XLByteToSeg(*RecPtr, targetId, targetSeg);
	/* offset, within the segment, of the start of the page holding RecPtr */
	targetPageOff = ((RecPtr->xrecoff % XLogSegSize) / XLOG_BLCKSZ) * XLOG_BLCKSZ;
	/* offset of RecPtr within that page */
	targetRecOff = RecPtr->xrecoff % XLOG_BLCKSZ;

	/* Fast exit if we have read the record in the current buffer already */
	if (targetId == readId && targetSeg == readSeg &&
		targetPageOff == readOff && targetRecOff < readLen)
		return true;

	/*
	 * See if we need to switch to a new segment because the requested record
	 * is not in the currently open one.
	 */
	if (readFile >= 0 && !XLByteInSeg(*RecPtr, readId, readSeg))
	{
		close(readFile);
		readFile = -1;
	}

	XLByteToSeg(*RecPtr, readId, readSeg);

	/* See if we need to retrieve more data */
	if (readFile < 0 ||
		(readStreamed && !XLByteLT(*RecPtr, receivedUpto)))
	{
		if (StandbyMode)
		{
			bool last_restore_failed = false;

			/*
			 * In standby mode, wait for the requested record to become
			 * available, either via restore_command succeeding to restore
			 * the segment, or via walreceiver having streamed the record.
			 */
			for (;;)
			{
				if (WalRcvInProgress())
				{
					/*
					 * While walreceiver is active, wait for new WAL to
					 * arrive from primary.
					 */
					receivedUpto = GetWalRcvWriteRecPtr();
					if (XLByteLT(*RecPtr, receivedUpto))
					{
						/*
						 * Great, streamed far enough. Open the file if it's
						 * not open already.
						 */
						if (readFile < 0)
						{
							readFile =
								XLogFileRead(readId, readSeg, PANIC,
											 recoveryTargetTLI, false, false);
							switched_segment = true;
							readStreamed = true;
						}
						break;
					}

					/* Not received yet; give up if failover was triggered. */
					if (CheckForStandbyTrigger())
						goto next_record_is_invalid;

					/*
					 * When streaming is active, we want to react quickly when
					 * the next WAL record arrives, so sleep only a bit.
					 */
					pg_usleep(100000L); /* 100ms */
				}
				else
				{
					/*
					 * Until walreceiver manages to reconnect, poll the
					 * archive.
					 */
					if (readFile >= 0)
					{
						close(readFile);
						readFile = -1;
					}
					/* Reset curFileTLI if random fetch. */
					if (randAccess)
						curFileTLI = 0;
					readFile = XLogFileReadAnyTLI(readId, readSeg, DEBUG2, true);
					switched_segment = true;
					readStreamed = false;
					if (readFile != -1)
					{
						elog(DEBUG1, "got WAL segment from archive");
						break;
					}

					/*
					 * If we succeeded restoring some segments from archive
					 * since the last connection attempt (or we haven't
					 * tried streaming yet), retry immediately. But if we
					 * haven't, assume the problem is persistent, so be
					 * less aggressive.
					 */
					if (last_restore_failed)
					{
						/*
						 * Check to see if the trigger file exists. Note that
						 * we do this only after failure, so when you create
						 * the trigger file, we still finish replaying as much
						 * as we can before failover.
						 */
						if (CheckForStandbyTrigger())
							goto next_record_is_invalid;
						pg_usleep(5000000L); /* 5 seconds */
					}
					last_restore_failed = true;

					/*
					 * Nope, not found in archive. Try to stream it.
					 *
					 * If fetching_ckpt is TRUE, RecPtr points to the initial
					 * checkpoint location. In that case, we use RedoStartLSN
					 * as the streaming start position instead of RecPtr, so
					 * that when we later jump backwards to start redo at
					 * RedoStartLSN, we will have the logs streamed already.
					 */
					RequestXLogStreaming(fetching_ckpt ? RedoStartLSN : *RecPtr,
										 PrimaryConnInfo);
				}

				/*
				 * This possibly-long loop needs to handle interrupts of startup
				 * process.
				 */
				HandleStartupProcInterrupts();
			}
		}
		else
		{
			/* In archive or crash recovery. */
			if (readFile < 0)
			{
				/* Reset curFileTLI if random fetch. */
				if (randAccess)
					curFileTLI = 0;
				readFile = XLogFileReadAnyTLI(readId, readSeg, emode,
											  InArchiveRecovery);
				switched_segment = true;
				readStreamed = false;
				/* Segment unavailable: fail without touching other state. */
				if (readFile < 0)
					return false;
			}
		}
	}

	/*
	 * At this point, we have the right segment open and we know the
	 * requested record is in it.
	 */
	Assert(readFile != -1);

	/*
	 * If the current segment is being streamed from master, calculate
	 * how much of the current page we have received already. We know the
	 * requested record has been received, but this is for the benefit
	 * of future calls, to allow quick exit at the top of this function.
	 */
	if (readStreamed)
	{
		if (RecPtr->xlogid != receivedUpto.xlogid ||
			(RecPtr->xrecoff / XLOG_BLCKSZ) != (receivedUpto.xrecoff / XLOG_BLCKSZ))
		{
			/* receivedUpto is on a different page: the whole page counts */
			readLen = XLOG_BLCKSZ;
		}
		else
			/* same page as receivedUpto: count only the bytes up to it */
			readLen = receivedUpto.xrecoff % XLogSegSize - targetPageOff;
	}
	else
		readLen = XLOG_BLCKSZ;

	if (switched_segment && targetPageOff != 0)
	{
		/*
		 * Whenever switching to a new WAL segment, we read the first page of
		 * the file and validate its header, even if that's not where the
		 * target record is.  This is so that we can check the additional
		 * identification info that is present in the first page's "long"
		 * header.
		 *
		 * No seek is issued before this read; presumably the file was just
		 * opened and is still positioned at offset 0.
		 */
		readOff = 0;
		if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
		{
			ereport(emode,
					(errcode_for_file_access(),
					 errmsg("could not read from log file %u, segment %u, offset %u: %m",
							readId, readSeg, readOff)));
			goto next_record_is_invalid;
		}
		if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
			goto next_record_is_invalid;
	}

	/* Read the requested page */
	readOff = targetPageOff;
	if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
	{
		ereport(emode,
				(errcode_for_file_access(),
				 errmsg("could not seek in log file %u, segment %u to offset %u: %m",
						readId, readSeg, readOff)));
		goto next_record_is_invalid;
	}
	if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
	{
		ereport(emode,
				(errcode_for_file_access(),
				 errmsg("could not read from log file %u, segment %u, offset %u: %m",
						readId, readSeg, readOff)));
		goto next_record_is_invalid;
	}
	if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
		goto next_record_is_invalid;

	/* The read state must now satisfy the fast-exit test at the top. */
	Assert(targetId == readId);
	Assert(targetSeg == readSeg);
	Assert(targetPageOff == readOff);
	Assert(targetRecOff < readLen);

	return true;

	/*
	 * Common failure exit: close the segment and reset readStreamed and
	 * readLen, so the fast-exit test at the top cannot succeed on a later
	 * call (it requires targetRecOff < readLen).
	 */
next_record_is_invalid:
	if (readFile >= 0)
		close(readFile);
	readFile = -1;
	readStreamed = false;
	readLen = 0;

	return false;
}

/*
 * CheckForStandbyTrigger: test whether the standby trigger file exists.
 *
 * Returns false when no trigger file is configured (TriggerFile is NULL) or
 * when stat() on it fails.  Otherwise the discovery is logged,
 * ShutdownWalRcv() is called (to have walreceiver shut down and wait for it
 * to exit), the trigger file is removed, and true is returned.
 */
static bool
CheckForStandbyTrigger(void)
{
	struct stat triggerStat;

	/* Not configured, or not there: nothing has been triggered. */
	if (TriggerFile == NULL || stat(TriggerFile, &triggerStat) != 0)
		return false;

	ereport(LOG,
			(errmsg("trigger file found: %s", TriggerFile)));

	ShutdownWalRcv();

	/* Remove the file; the result of unlink() is not checked. */
	unlink(TriggerFile);

	return true;
}