xlog.c 309.7 KB
Newer Older
1
/*-------------------------------------------------------------------------
2 3
 *
 * xlog.c
4
 *		PostgreSQL transaction log manager
5 6
 *
 *
B
Bruce Momjian 已提交
7
 * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
B
Add:  
Bruce Momjian 已提交
8
 * Portions Copyright (c) 1994, Regents of the University of California
9
 *
10
 * src/backend/access/transam/xlog.c
11 12 13
 *
 *-------------------------------------------------------------------------
 */
14

15 16
#include "postgres.h"

17
#include <ctype.h>
T
Tom Lane 已提交
18
#include <signal.h>
19
#include <time.h>
20
#include <fcntl.h>
21
#include <sys/stat.h>
22
#include <sys/time.h>
23 24
#include <sys/wait.h>
#include <unistd.h>
25

26
#include "access/clog.h"
27
#include "access/multixact.h"
28
#include "access/subtrans.h"
29
#include "access/transam.h"
30
#include "access/tuptoaster.h"
31
#include "access/twophase.h"
32
#include "access/xact.h"
33
#include "access/xlog_internal.h"
34
#include "access/xlogutils.h"
35
#include "catalog/catversion.h"
T
Tom Lane 已提交
36
#include "catalog/pg_control.h"
37
#include "catalog/pg_database.h"
38 39
#include "catalog/pg_type.h"
#include "funcapi.h"
40
#include "libpq/pqsignal.h"
41
#include "miscadmin.h"
42
#include "pgstat.h"
43
#include "postmaster/bgwriter.h"
44 45
#include "replication/walreceiver.h"
#include "replication/walsender.h"
46
#include "storage/bufmgr.h"
47
#include "storage/fd.h"
48
#include "storage/ipc.h"
49
#include "storage/latch.h"
50
#include "storage/pmsignal.h"
51
#include "storage/predicate.h"
52
#include "storage/proc.h"
53
#include "storage/procarray.h"
R
Robert Haas 已提交
54
#include "storage/reinit.h"
55
#include "storage/smgr.h"
56
#include "storage/spin.h"
57
#include "utils/builtins.h"
58
#include "utils/guc.h"
59
#include "utils/ps_status.h"
60
#include "utils/relmapper.h"
61
#include "pg_trace.h"
62

63

64 65 66
/* File path names (all relative to $PGDATA) */
#define RECOVERY_COMMAND_FILE	"recovery.conf"
#define RECOVERY_COMMAND_DONE	"recovery.done"
R
Robert Haas 已提交
67
#define PROMOTE_SIGNAL_FILE	"promote"
68 69


T
Tom Lane 已提交
70 71
/* User-settable parameters */
int			CheckPointSegments = 3;
72
int			wal_keep_segments = 0;
73
int			XLOGbuffers = -1;
74
int			XLogArchiveTimeout = 0;
75
bool		XLogArchiveMode = false;
76
char	   *XLogArchiveCommand = NULL;
77
bool		EnableHotStandby = false;
78
bool		fullPageWrites = true;
79
bool		log_checkpoints = false;
80
int			sync_method = DEFAULT_SYNC_METHOD;
81
int			wal_level = WAL_LEVEL_MINIMAL;
T
Tom Lane 已提交
82

83 84 85 86
#ifdef WAL_DEBUG
bool		XLOG_DEBUG = false;
#endif

87
/*
88 89 90 91 92
 * XLOGfileslop is the maximum number of preallocated future XLOG segments.
 * When we are done with an old XLOG segment file, we will recycle it as a
 * future XLOG segment as long as there aren't already XLOGfileslop future
 * segments; else we'll delete it.  This could be made a separate GUC
 * variable, but at present I think it's sufficient to hardwire it as
B
Bruce Momjian 已提交
93
 * 2*CheckPointSegments+1.	Under normal conditions, a checkpoint will free
94 95 96
 * no more than 2*CheckPointSegments log segments, and we want to recycle all
 * of them; the +1 allows boundary cases to happen without wasting a
 * delete/create-segment cycle.
97 98 99
 */
#define XLOGfileslop	(1 + CheckPointSegments * 2)

100 101 102
/*
 * GUC support
 */
103 104 105 106 107 108 109
/*
 * Name-to-value table for the wal_level setting: each row pairs a
 * user-visible name with its WAL_LEVEL_* constant.  The table ends with a
 * NULL-name row, which looks like the usual end-of-table sentinel (the
 * consumer of this table is defined outside this file -- confirm there).
 */
const struct config_enum_entry wal_level_options[] = {
	{"minimal", WAL_LEVEL_MINIMAL, false},
	{"archive", WAL_LEVEL_ARCHIVE, false},
	{"hot_standby", WAL_LEVEL_HOT_STANDBY, false},
	{NULL, 0, false}
};

110
const struct config_enum_entry sync_method_options[] = {
111
	{"fsync", SYNC_METHOD_FSYNC, false},
112
#ifdef HAVE_FSYNC_WRITETHROUGH
113
	{"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
114 115
#endif
#ifdef HAVE_FDATASYNC
116
	{"fdatasync", SYNC_METHOD_FDATASYNC, false},
117 118
#endif
#ifdef OPEN_SYNC_FLAG
119
	{"open_sync", SYNC_METHOD_OPEN, false},
120 121
#endif
#ifdef OPEN_DATASYNC_FLAG
122
	{"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
123
#endif
124
	{NULL, 0, false}
125
};
T
Tom Lane 已提交
126

127 128 129 130 131 132 133
/*
 * Statistics for current checkpoint are collected in this global struct.
 * Because only the background writer or a stand-alone backend can perform
 * checkpoints, this will be unused in normal backends.
 */
CheckpointStatsData CheckpointStats;

T
Tom Lane 已提交
134
/*
135 136
 * ThisTimeLineID will be same in all backends --- it identifies current
 * WAL timeline for the database system.
T
Tom Lane 已提交
137
 */
138
TimeLineID	ThisTimeLineID = 0;
V
WAL  
Vadim B. Mikheev 已提交
139

140
/*
141
 * Are we doing recovery from XLOG?
142
 *
143 144 145 146 147
 * This is only ever true in the startup process; it should be read as meaning
 * "this process is replaying WAL records", rather than "the system is in
 * recovery mode".  It should be examined primarily by functions that need
 * to act differently when called from a WAL redo function (e.g., to skip WAL
 * logging).  To check whether the system is in recovery regardless of which
148 149
 * process you're running in, use RecoveryInProgress() but only after shared
 * memory startup and lock initialization.
150
 */
T
Tom Lane 已提交
151
bool		InRecovery = false;
B
Bruce Momjian 已提交
152

153
/* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
B
Bruce Momjian 已提交
154
HotStandbyState standbyState = STANDBY_DISABLED;
155

B
Bruce Momjian 已提交
156
static XLogRecPtr LastRec;
157

158 159
/*
 * Local copy of SharedRecoveryInProgress variable. True actually means "not
160
 * known, need to check the shared state".
161 162
 */
static bool LocalRecoveryInProgress = true;
163 164 165 166 167
/*
 * Local copy of SharedHotStandbyActive variable. False actually means "not
 * known, need to check the shared state".
 */
static bool LocalHotStandbyActive = false;
168

169 170 171 172 173 174
/*
 * Local state for XLogInsertAllowed():
 *		1: unconditionally allowed to insert XLOG
 *		0: unconditionally not allowed to insert XLOG
 *		-1: must check RecoveryInProgress(); disallow until it is false
 * Most processes start with -1 and transition to 1 after seeing that recovery
B
Bruce Momjian 已提交
175
 * is not in progress.	But we can also force the value for special cases.
176 177 178 179 180 181 182 183
 * The coding in XLogInsertAllowed() depends on the first two of these states
 * being numerically the same as bool true and false.
 */
static int	LocalXLogInsertAllowed = -1;

/* Are we recovering using offline XLOG archives? */
static bool InArchiveRecovery = false;

184
/* Was the last xlog file restored from archive, or local? */
B
Bruce Momjian 已提交
185
static bool restoredFromArchive = false;
186

187
/* options taken from recovery.conf for archive recovery */
188
static char *recoveryRestoreCommand = NULL;
189
static char *recoveryEndCommand = NULL;
190
static char *archiveCleanupCommand = NULL;
191
static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
192
static bool recoveryTargetInclusive = true;
193
static bool recoveryPauseAtTarget = true;
B
Bruce Momjian 已提交
194
static TransactionId recoveryTargetXid;
195
static TimestampTz recoveryTargetTime;
196
static char *recoveryTargetName;
197

198 199 200
/* options taken from recovery.conf for XLOG streaming */
static bool StandbyMode = false;
static char *PrimaryConnInfo = NULL;
201
static char *TriggerFile = NULL;
202

203
/* if recoveryStopsHere returns true, it saves actual stop xid/time/name here */
B
Bruce Momjian 已提交
204
static TransactionId recoveryStopXid;
205
static TimestampTz recoveryStopTime;
206
static char recoveryStopName[MAXFNAMELEN];
B
Bruce Momjian 已提交
207
static bool recoveryStopAfter;
208 209 210 211 212 213 214 215 216 217 218

/*
 * During normal operation, the only timeline we care about is ThisTimeLineID.
 * During recovery, however, things are more complicated.  To simplify life
 * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
 * scan through the WAL history (that is, it is the line that was active when
 * the currently-scanned WAL record was generated).  We also need these
 * timeline values:
 *
 * recoveryTargetTLI: the desired timeline that we want to end in.
 *
219 220
 * recoveryTargetIsLatest: was the requested target timeline 'latest'?
 *
221 222
 * expectedTLIs: an integer list of recoveryTargetTLI and the TLIs of
 * its known parents, newest first (so recoveryTargetTLI is always the
B
Bruce Momjian 已提交
223
 * first list member).	Only these TLIs are expected to be seen in the WAL
224 225 226 227 228 229 230 231 232
 * segments we read, and indeed only these TLIs will be considered as
 * candidate WAL files to open at all.
 *
 * curFileTLI: the TLI appearing in the name of the current input WAL file.
 * (This is not necessarily the same as ThisTimeLineID, because we could
 * be scanning data that was copied from an ancestor timeline when the current
 * file was created.)  During a sequential scan we do not allow this value
 * to decrease.
 */
B
Bruce Momjian 已提交
233
static TimeLineID recoveryTargetTLI;
234
static bool recoveryTargetIsLatest = false;
B
Bruce Momjian 已提交
235 236
static List *expectedTLIs;
static TimeLineID curFileTLI;
237

T
Tom Lane 已提交
238 239
/*
 * ProcLastRecPtr points to the start of the last XLOG record inserted by the
240 241 242 243
 * current backend.  It is updated for all inserts.  XactLastRecEnd points to
 * end+1 of the last record, and is reset when we end a top-level transaction,
 * or start a new one; so it can be used to tell if the current transaction has
 * created any XLOG records.
T
Tom Lane 已提交
244 245
 */
static XLogRecPtr ProcLastRecPtr = {0, 0};
246

247
XLogRecPtr	XactLastRecEnd = {0, 0};
248

T
Tom Lane 已提交
249 250 251
/*
 * RedoRecPtr is this backend's local copy of the REDO record pointer
 * (which is almost but not quite the same as a pointer to the most recent
B
Bruce Momjian 已提交
252
 * CHECKPOINT record).	We update this from the shared-memory copy,
T
Tom Lane 已提交
253
 * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
B
Bruce Momjian 已提交
254
 * hold the Insert lock).  See XLogInsert for details.	We are also allowed
255
 * to update from XLogCtl->Insert.RedoRecPtr if we hold the info_lck;
256 257
 * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
 * InitXLOGAccess.
T
Tom Lane 已提交
258
 */
259
static XLogRecPtr RedoRecPtr;
260

261 262 263 264 265 266 267 268 269 270 271 272
/*
 * RedoStartLSN points to the checkpoint's REDO location which is specified
 * in a backup label file, backup history file or control file. In standby
 * mode, XLOG streaming usually starts from the position where an invalid
 * record was found. But if we fail to read even the initial checkpoint
 * record, we use the REDO location instead of the checkpoint location as
 * the start position of XLOG streaming. Otherwise we would have to jump
 * backwards to the REDO location after reading the checkpoint record,
 * because the REDO record can precede the checkpoint record.
 */
static XLogRecPtr RedoStartLSN = {0, 0};

T
Tom Lane 已提交
273 274 275 276 277 278 279 280 281
/*----------
 * Shared-memory data structures for XLOG control
 *
 * LogwrtRqst indicates a byte position that we need to write and/or fsync
 * the log up to (all records before that point must be written or fsynced).
 * LogwrtResult indicates the byte positions we have already written/fsynced.
 * These structs are identical but are declared separately to indicate their
 * slightly different functions.
 *
282
 * We do a lot of pushups to minimize the amount of access to lockable
T
Tom Lane 已提交
283 284 285
 * shared memory values.  There are actually three shared-memory copies of
 * LogwrtResult, plus one unshared copy in each backend.  Here's how it works:
 *		XLogCtl->LogwrtResult is protected by info_lck
286 287 288 289
 *		XLogCtl->Write.LogwrtResult is protected by WALWriteLock
 *		XLogCtl->Insert.LogwrtResult is protected by WALInsertLock
 * One must hold the associated lock to read or write any of these, but
 * of course no lock is needed to read/write the unshared LogwrtResult.
T
Tom Lane 已提交
290 291 292
 *
 * XLogCtl->LogwrtResult and XLogCtl->Write.LogwrtResult are both "always
 * right", since both are updated by a write or flush operation before
293 294
 * it releases WALWriteLock.  The point of keeping XLogCtl->Write.LogwrtResult
 * is that it can be examined/modified by code that already holds WALWriteLock
T
Tom Lane 已提交
295 296 297
 * without needing to grab info_lck as well.
 *
 * XLogCtl->Insert.LogwrtResult may lag behind the reality of the other two,
B
Bruce Momjian 已提交
298
 * but is updated when convenient.	Again, it exists for the convenience of
299
 * code that is already holding WALInsertLock but not the other locks.
T
Tom Lane 已提交
300 301 302 303 304 305 306 307 308 309
 *
 * The unshared LogwrtResult may lag behind any or all of these, and again
 * is updated when convenient.
 *
 * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 * (protected by info_lck), but we don't need to cache any copies of it.
 *
 * Note that this all works because the request and result positions can only
 * advance forward, never back up, and so we can easily determine which of two
 * values is "more up to date".
310 311 312 313 314 315 316 317 318 319 320 321 322
 *
 * info_lck is only held long enough to read/update the protected variables,
 * so it's a plain spinlock.  The other locks are held longer (potentially
 * over I/O operations), so we use LWLocks for them.  These locks are:
 *
 * WALInsertLock: must be held to insert a record into the WAL buffers.
 *
 * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 * XLogFlush).
 *
 * ControlFileLock: must be held to read/update control file or create
 * new log file.
 *
323
 * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
324 325
 * only one checkpointer at a time; currently, with all checkpoints done by
 * the bgwriter, this is just pro forma).
326
 *
T
Tom Lane 已提交
327 328
 *----------
 */
329

T
Tom Lane 已提交
330
typedef struct XLogwrtRqst
331
{
T
Tom Lane 已提交
332 333
	XLogRecPtr	Write;			/* last byte + 1 to write out */
	XLogRecPtr	Flush;			/* last byte + 1 to flush */
334
} XLogwrtRqst;
335

336 337 338 339 340 341
/*
 * Byte positions up to which the log has already been written out and
 * flushed.  Has the same layout as XLogwrtRqst, but is declared as a
 * separate type because the two serve slightly different purposes
 * (request vs. result).
 */
typedef struct XLogwrtResult
{
	XLogRecPtr	Write;			/* last byte + 1 written out */
	XLogRecPtr	Flush;			/* last byte + 1 flushed */
} XLogwrtResult;

T
Tom Lane 已提交
342 343 344
/*
 * Shared state data for XLogInsert.
 */
345 346
typedef struct XLogCtlInsert
{
B
Bruce Momjian 已提交
347 348
	XLogwrtResult LogwrtResult; /* a recent value of LogwrtResult */
	XLogRecPtr	PrevRecord;		/* start of previously-inserted record */
349
	int			curridx;		/* current block index in cache */
B
Bruce Momjian 已提交
350 351 352
	XLogPageHeader currpage;	/* points to header of block in cache */
	char	   *currpos;		/* current insertion point in cache */
	XLogRecPtr	RedoRecPtr;		/* current redo point for insertions */
353
	bool		forcePageWrites;	/* forcing full-page writes for PITR? */
354 355 356 357 358

	/*
	 * exclusiveBackup is true if a backup started with pg_start_backup() is
	 * in progress, and nonExclusiveBackups is a counter indicating the number
	 * of streaming base backups currently in progress. forcePageWrites is
359 360 361
	 * set to true when either of these is non-zero. lastBackupStart is the
	 * latest checkpoint redo location used as a starting point for an online
	 * backup.
362 363 364
	 */
	bool		exclusiveBackup;
	int			nonExclusiveBackups;
365
	XLogRecPtr	lastBackupStart;
366 367
} XLogCtlInsert;

T
Tom Lane 已提交
368 369 370
/*
 * Shared state data for XLogWrite/XLogFlush.
 */
371 372
typedef struct XLogCtlWrite
{
B
Bruce Momjian 已提交
373 374
	XLogwrtResult LogwrtResult; /* current value of LogwrtResult */
	int			curridx;		/* cache index of next block to write */
375
	pg_time_t	lastSegSwitchTime;		/* time of last xlog segment switch */
376 377
} XLogCtlWrite;

T
Tom Lane 已提交
378 379 380
/*
 * Total shared-memory state for XLOG.
 */
381 382
typedef struct XLogCtlData
{
383
	/* Protected by WALInsertLock: */
B
Bruce Momjian 已提交
384
	XLogCtlInsert Insert;
385

T
Tom Lane 已提交
386
	/* Protected by info_lck: */
B
Bruce Momjian 已提交
387 388
	XLogwrtRqst LogwrtRqst;
	XLogwrtResult LogwrtResult;
389 390
	uint32		ckptXidEpoch;	/* nextXID & epoch of latest checkpoint */
	TransactionId ckptXid;
391
	XLogRecPtr	asyncXactLSN; /* LSN of newest async commit/abort */
392
	uint32		lastRemovedLog; /* latest removed/recycled XLOG segment */
393
	uint32		lastRemovedSeg;
394

395
	/* Protected by WALWriteLock: */
B
Bruce Momjian 已提交
396 397
	XLogCtlWrite Write;

T
Tom Lane 已提交
398
	/*
B
Bruce Momjian 已提交
399 400 401
	 * These values do not change after startup, although the pointed-to pages
	 * and xlblocks values certainly do.  Permission to read/write the pages
	 * and xlblocks values depends on WALInsertLock and WALWriteLock.
T
Tom Lane 已提交
402
	 */
B
Bruce Momjian 已提交
403
	char	   *pages;			/* buffers for unwritten XLOG pages */
404
	XLogRecPtr *xlblocks;		/* 1st byte ptr-s + XLOG_BLCKSZ */
405
	int			XLogCacheBlck;	/* highest allocated xlog buffer index */
406
	TimeLineID	ThisTimeLineID;
407
	TimeLineID	RecoveryTargetTLI;
B
Bruce Momjian 已提交
408

409
	/*
410
	 * archiveCleanupCommand is read from recovery.conf but needs to be in
411 412
	 * shared memory so that the bgwriter process can access it.
	 */
413
	char		archiveCleanupCommand[MAXPGPATH];
T
Tom Lane 已提交
414

415 416
	/*
	 * SharedRecoveryInProgress indicates if we're still in crash or archive
417
	 * recovery.  Protected by info_lck.
418 419 420
	 */
	bool		SharedRecoveryInProgress;

421 422 423 424 425 426
	/*
	 * SharedHotStandbyActive indicates if we allow hot standby queries to be
	 * run.  Protected by info_lck.
	 */
	bool		SharedHotStandbyActive;

427 428 429 430 431 432 433
	/*
	 * recoveryWakeupLatch is used to wake up the startup process to
	 * continue WAL replay, if it is waiting for WAL to arrive or failover
	 * trigger file to appear.
	 */
	Latch		recoveryWakeupLatch;

434
	/*
435 436
	 * During recovery, we keep a copy of the latest checkpoint record here.
	 * Used by the background writer when it wants to create a restartpoint.
437 438 439 440 441 442 443 444
	 *
	 * Protected by info_lck.
	 */
	XLogRecPtr	lastCheckPointRecPtr;
	CheckPoint	lastCheckPoint;

	/* end+1 of the last record replayed (or being replayed) */
	XLogRecPtr	replayEndRecPtr;
445 446
	/* end+1 of the last record replayed */
	XLogRecPtr	recoveryLastRecPtr;
447 448
	/* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
	TimestampTz recoveryLastXTime;
449 450
	/* Are we requested to pause recovery? */
	bool		recoveryPause;
451

452
	slock_t		info_lck;		/* locks shared variables shown above */
453 454
} XLogCtlData;

455
static XLogCtlData *XLogCtl = NULL;
456

457
/*
T
Tom Lane 已提交
458
 * We maintain an image of pg_control in shared memory.
459
 */
460
static ControlFileData *ControlFile = NULL;
461

T
Tom Lane 已提交
462 463 464 465 466
/*
 * Macros for managing XLogInsert state.  In most cases, the calling routine
 * has local copies of XLogCtl->Insert and/or XLogCtl->Insert->curridx,
 * so these are passed as parameters instead of being fetched via XLogCtl.
 */
467

T
Tom Lane 已提交
468 469
/* Free space remaining in the current xlog page buffer */
#define INSERT_FREESPACE(Insert)  \
470
	(XLOG_BLCKSZ - ((Insert)->currpos - (char *) (Insert)->currpage))
T
Tom Lane 已提交
471 472 473 474 475 476

/* Construct XLogRecPtr value for current insertion point */
#define INSERT_RECPTR(recptr,Insert,curridx)  \
	( \
	  (recptr).xlogid = XLogCtl->xlblocks[curridx].xlogid, \
	  (recptr).xrecoff = \
B
Bruce Momjian 已提交
477
		XLogCtl->xlblocks[curridx].xrecoff - INSERT_FREESPACE(Insert) \
T
Tom Lane 已提交
478 479 480 481 482 483 484
	)

/* Buffer index just before idx; index 0 wraps around to XLogCacheBlck */
#define PrevBufIdx(idx)		\
		(((idx) != 0) ? ((idx) - 1) : XLogCtl->XLogCacheBlck)

/* Buffer index just after idx; XLogCacheBlck wraps around to index 0 */
#define NextBufIdx(idx)		\
		(((idx) != XLogCtl->XLogCacheBlck) ? ((idx) + 1) : 0)
485

T
Tom Lane 已提交
486 487 488 489
/*
 * Private, possibly out-of-date copy of shared LogwrtResult.
 * See discussion above.
 */
490
static XLogwrtResult LogwrtResult = {{0, 0}, {0, 0}};
491

492 493 494 495 496 497 498 499 500
/*
 * Codes indicating where we got a WAL file from during recovery, or where
 * to attempt to get one.  These are chosen so that they can be OR'd together
 * in a bitmask state variable.
 */
#define XLOG_FROM_ARCHIVE		0x01	/* fetched via restore_command */
#define XLOG_FROM_PG_XLOG		0x02	/* file already present in pg_xlog */
#define XLOG_FROM_STREAM		0x04	/* received by streaming from master */

T
Tom Lane 已提交
501 502 503 504 505 506 507 508 509 510
/*
 * openLogFile is -1 or a kernel FD for an open log file segment.
 * When it's open, openLogOff is the current seek offset in the file.
 * openLogId/openLogSeg identify the segment.  These variables are only
 * used to write the XLOG, and so will normally refer to the active segment.
 */
static int	openLogFile = -1;	/* kernel FD of the open segment, or -1 */
static uint32 openLogId = 0;	/* with openLogSeg, identifies the segment */
static uint32 openLogSeg = 0;	/* with openLogId, identifies the segment */
static uint32 openLogOff = 0;	/* current seek offset in the open file */
511

T
Tom Lane 已提交
512 513 514 515
/*
 * These variables are used similarly to the ones above, but for reading
 * the XLOG.  Note, however, that readOff generally represents the offset
 * of the page just read, not the seek position of the FD itself, which
516
 * will be just past that page. readLen indicates how much of the current
517 518
 * page has been read into readBuf, and readSource indicates where we got
 * the currently open file from.
T
Tom Lane 已提交
519
 */
520 521 522 523
static int	readFile = -1;		/* read-side counterpart of openLogFile */
static uint32 readId = 0;		/* read-side counterpart of openLogId */
static uint32 readSeg = 0;		/* read-side counterpart of openLogSeg */
static uint32 readOff = 0;		/* offset of the page just read, not the
								 * FD's own seek position */
524
static uint32 readLen = 0;
B
Bruce Momjian 已提交
525
static int	readSource = 0;		/* XLOG_FROM_* code */
B
Bruce Momjian 已提交
526

527 528 529 530
/*
 * Keeps track of which sources we've tried to read the current WAL
 * record from and failed.
 */
B
Bruce Momjian 已提交
531
static int	failedSources = 0;	/* OR of XLOG_FROM_* codes */
532 533 534 535 536 537 538 539

/*
 * These variables track when we last obtained some WAL data to process,
 * and where we got it from.  (XLogReceiptSource is initially the same as
 * readSource, but readSource gets reset to zero when we don't have data
 * to process right now.)
 */
static TimestampTz XLogReceiptTime = 0;
B
Bruce Momjian 已提交
540
static int	XLogReceiptSource = 0;		/* XLOG_FROM_* code */
B
Bruce Momjian 已提交
541

542
/* Buffer for currently read page (XLOG_BLCKSZ bytes) */
T
Tom Lane 已提交
543
static char *readBuf = NULL;
B
Bruce Momjian 已提交
544

545 546 547 548
/* Buffer for current ReadRecord result (expandable) */
static char *readRecordBuf = NULL;
static uint32 readRecordBufSize = 0;

T
Tom Lane 已提交
549
/* State information for XLOG reading */
B
Bruce Momjian 已提交
550 551
static XLogRecPtr ReadRecPtr;	/* start of last record read */
static XLogRecPtr EndRecPtr;	/* end+1 of last record read */
552
static TimeLineID lastPageTLI = 0;
553

554 555 556
static XLogRecPtr minRecoveryPoint;		/* local copy of
										 * ControlFile->minRecoveryPoint */
static bool updateMinRecoveryPoint = true;
557
static bool reachedMinRecoveryPoint = false;
558

V
WAL  
Vadim B. Mikheev 已提交
559 560
static bool InRedo = false;

561 562 563
/* Have we launched bgwriter during recovery? */
static bool bgwriterLaunched = false;

564 565 566 567 568 569 570 571 572 573 574 575
/*
 * Information logged when we detect a change in one of the parameters
 * important for Hot Standby.
 */
typedef struct xl_parameter_change
{
	int			MaxConnections;
	int			max_prepared_xacts;
	int			max_locks_per_xact;
	int			wal_level;
} xl_parameter_change;

576 577 578 579 580 581 582
/*
 * Information logged for a restore point: a timestamp (presumably when the
 * restore point was created -- confirm at the call site) and the restore
 * point's name, held in a fixed buffer of MAXFNAMELEN bytes.
 */
typedef struct xl_restore_point
{
	TimestampTz	rp_time;
	char		rp_name[MAXFNAMELEN];
} xl_restore_point;

583
/*
584
 * Flags set by interrupt handlers for later service in the redo loop.
585
 */
586
static volatile sig_atomic_t got_SIGHUP = false;
587
static volatile sig_atomic_t shutdown_requested = false;
R
Robert Haas 已提交
588
static volatile sig_atomic_t promote_triggered = false;
589

590 591
/*
 * Flag set when executing a restore command, to tell SIGTERM signal handler
592
 * that it's safe to just proc_exit.
593 594 595
 */
static volatile sig_atomic_t in_restore_command = false;

596

597 598
static void XLogArchiveNotify(const char *xlog);
static void XLogArchiveNotifySeg(uint32 log, uint32 seg);
599 600
static bool XLogArchiveCheckDone(const char *xlog);
static bool XLogArchiveIsBusy(const char *xlog);
601 602
static void XLogArchiveCleanup(const char *xlog);
static void readRecoveryCommandFile(void);
603
static void exitArchiveRecovery(TimeLineID endTLI,
B
Bruce Momjian 已提交
604
					uint32 endLogId, uint32 endLogSeg);
605
static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
606 607 608
static void recoveryPausesHere(void);
static bool RecoveryIsPaused(void);
static void SetRecoveryPause(bool recoveryPause);
609 610
static void SetLatestXTime(TimestampTz xtime);
static TimestampTz GetLatestXTime(void);
611 612
static void CheckRequiredParameterValues(void);
static void XLogReportParameters(void);
613
static void LocalSetXLogInsertAllowed(void);
614
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
T
Tom Lane 已提交
615

616
static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
B
Bruce Momjian 已提交
617
				XLogRecPtr *lsn, BkpBlock *bkpb);
618
static bool AdvanceXLInsertBuffer(bool new_segment);
619
static bool XLogCheckpointNeeded(uint32 logid, uint32 logseg);
620
static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch);
621 622
static bool InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
					   bool find_free, int *max_advance,
623
					   bool use_lock);
B
Bruce Momjian 已提交
624
static int XLogFileRead(uint32 log, uint32 seg, int emode, TimeLineID tli,
625
			 int source, bool notexistOk);
B
Bruce Momjian 已提交
626
static int XLogFileReadAnyTLI(uint32 log, uint32 seg, int emode,
627
				   int sources);
628 629
static bool XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
			 bool randAccess);
B
Bruce Momjian 已提交
630
static int	emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
B
Bruce Momjian 已提交
631
static void XLogFileClose(void);
632
static bool RestoreArchivedFile(char *path, const char *xlogfname,
B
Bruce Momjian 已提交
633
					const char *recovername, off_t expectedSize);
634 635
static void ExecuteRecoveryCommand(char *command, char *commandName,
					   bool failOnerror);
636 637
static void PreallocXlogFiles(XLogRecPtr endptr);
static void RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr);
638
static void UpdateLastRemovedPtr(char *filename);
639
static void ValidateXLOGDirectoryStructure(void);
640
static void CleanupBackupHistory(void);
641
static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
642
static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt);
643
static void CheckRecoveryConsistency(void);
644
static bool ValidXLOGHeader(XLogPageHeader hdr, int emode);
645
static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
646 647
static List *readTimeLineHistory(TimeLineID targetTLI);
static bool existsTimeLineHistory(TimeLineID probeTLI);
648
static bool rescanLatestTimeLine(void);
649 650
static TimeLineID findNewestTimeLine(TimeLineID startTLI);
static void writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
B
Bruce Momjian 已提交
651 652
					 TimeLineID endTLI,
					 uint32 endLogId, uint32 endLogSeg);
T
Tom Lane 已提交
653 654
static void WriteControlFile(void);
static void ReadControlFile(void);
655
static char *str_time(pg_time_t tnow);
656
static bool CheckForStandbyTrigger(void);
657

658
#ifdef WAL_DEBUG
659
static void xlog_outrec(StringInfo buf, XLogRecord *record);
660
#endif
661
static void pg_start_backup_callback(int code, Datum arg);
662
static bool read_backup_label(XLogRecPtr *checkPointLoc);
663
static void rm_redo_error_callback(void *arg);
664
static int	get_sync_bit(int method);
T
Tom Lane 已提交
665 666 667 668 669


/*
 * Insert an XLOG record having the specified RMID and info bytes,
 * with the body of the record being the data chunk(s) described by
670
 * the rdata chain (see xlog.h for notes about rdata).
T
Tom Lane 已提交
671 672 673 674 675 676 677 678 679 680 681
 *
 * Returns XLOG pointer to end of record (beginning of next record).
 * This can be used as LSN for data pages affected by the logged action.
 * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 * before the data page can be written out.  This implements the basic
 * WAL rule "write the log before the data".)
 *
 * NB: this routine feels free to scribble on the XLogRecData structs,
 * though not on the data they reference.  This is OK since the XLogRecData
 * structs are always just temporaries in the calling code.
 */
682
XLogRecPtr
683
XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
684
{
B
Bruce Momjian 已提交
685 686
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogRecord *record;
T
Tom Lane 已提交
687
	XLogContRecord *contrecord;
B
Bruce Momjian 已提交
688 689 690
	XLogRecPtr	RecPtr;
	XLogRecPtr	WriteRqst;
	uint32		freespace;
691
	int			curridx;
B
Bruce Momjian 已提交
692 693 694 695 696
	XLogRecData *rdt;
	Buffer		dtbuf[XLR_MAX_BKP_BLOCKS];
	bool		dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
	BkpBlock	dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
	XLogRecPtr	dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
697 698 699 700
	XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS];
	XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS];
	XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS];
	pg_crc32	rdata_crc;
B
Bruce Momjian 已提交
701 702 703 704
	uint32		len,
				write_len;
	unsigned	i;
	bool		updrqst;
705
	bool		doPageWrites;
706
	bool		isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
V
Vadim B. Mikheev 已提交
707

708
	/* cross-check on whether we should be here or not */
709 710
	if (!XLogInsertAllowed())
		elog(ERROR, "cannot make new WAL entries during recovery");
711

712
	/* info's high bits are reserved for use by me */
V
Vadim B. Mikheev 已提交
713
	if (info & XLR_INFO_MASK)
714
		elog(PANIC, "invalid xlog info mask %02X", info);
V
Vadim B. Mikheev 已提交
715

716 717
	TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);

T
Tom Lane 已提交
718
	/*
B
Bruce Momjian 已提交
719 720
	 * In bootstrap mode, we don't actually log anything but XLOG resources;
	 * return a phony record pointer.
T
Tom Lane 已提交
721
	 */
V
Vadim B. Mikheev 已提交
722
	if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
V
WAL  
Vadim B. Mikheev 已提交
723 724
	{
		RecPtr.xlogid = 0;
B
Bruce Momjian 已提交
725
		RecPtr.xrecoff = SizeOfXLogLongPHD;		/* start of 1st chkpt record */
726
		return RecPtr;
V
WAL  
Vadim B. Mikheev 已提交
727 728
	}

T
Tom Lane 已提交
729
	/*
730
	 * Here we scan the rdata chain, determine which buffers must be backed
T
Tom Lane 已提交
731
	 * up, and compute the CRC values for the data.  Note that the record
B
Bruce Momjian 已提交
732 733 734 735
	 * header isn't added into the CRC initially since we don't know the final
	 * length or info bits quite yet.  Thus, the CRC will represent the CRC of
	 * the whole record in the order "rdata, then backup blocks, then record
	 * header".
T
Tom Lane 已提交
736
	 *
737 738 739 740 741
	 * We may have to loop back to here if a race condition is detected below.
	 * We could prevent the race by doing all this work while holding the
	 * insert lock, but it seems better to avoid doing CRC calculations while
	 * holding the lock.  This means we have to be careful about modifying the
	 * rdata chain until we know we aren't going to loop back again.  The only
B
Bruce Momjian 已提交
742 743 744 745 746
	 * change we allow ourselves to make earlier is to set rdt->data = NULL in
	 * chain items we have decided we will have to back up the whole buffer
	 * for.  This is OK because we will certainly decide the same thing again
	 * for those items if we do it over; doing it here saves an extra pass
	 * over the chain later.
T
Tom Lane 已提交
747
	 */
748
begin:;
T
Tom Lane 已提交
749 750 751 752 753 754
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		dtbuf[i] = InvalidBuffer;
		dtbuf_bkp[i] = false;
	}

755 756 757 758 759 760 761 762
	/*
	 * Decide if we need to do full-page writes in this XLOG record: true if
	 * full_page_writes is on or we have a PITR request for it.  Since we
	 * don't yet have the insert lock, forcePageWrites could change under us,
	 * but we'll recheck it once we have the lock.
	 */
	doPageWrites = fullPageWrites || Insert->forcePageWrites;

763
	INIT_CRC32(rdata_crc);
T
Tom Lane 已提交
764
	len = 0;
B
Bruce Momjian 已提交
765
	for (rdt = rdata;;)
766 767 768
	{
		if (rdt->buffer == InvalidBuffer)
		{
T
Tom Lane 已提交
769
			/* Simple data, just include it */
770
			len += rdt->len;
771
			COMP_CRC32(rdata_crc, rdt->data, rdt->len);
772
		}
T
Tom Lane 已提交
773
		else
774
		{
T
Tom Lane 已提交
775 776
			/* Find info for buffer */
			for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
777
			{
T
Tom Lane 已提交
778
				if (rdt->buffer == dtbuf[i])
779
				{
780
					/* Buffer already referenced by earlier chain item */
T
Tom Lane 已提交
781 782 783 784 785
					if (dtbuf_bkp[i])
						rdt->data = NULL;
					else if (rdt->data)
					{
						len += rdt->len;
786
						COMP_CRC32(rdata_crc, rdt->data, rdt->len);
T
Tom Lane 已提交
787 788
					}
					break;
789
				}
T
Tom Lane 已提交
790
				if (dtbuf[i] == InvalidBuffer)
791
				{
T
Tom Lane 已提交
792 793
					/* OK, put it in this slot */
					dtbuf[i] = rdt->buffer;
794 795
					if (XLogCheckBuffer(rdt, doPageWrites,
										&(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
T
Tom Lane 已提交
796 797 798 799 800 801 802
					{
						dtbuf_bkp[i] = true;
						rdt->data = NULL;
					}
					else if (rdt->data)
					{
						len += rdt->len;
803
						COMP_CRC32(rdata_crc, rdt->data, rdt->len);
T
Tom Lane 已提交
804 805
					}
					break;
806 807
				}
			}
T
Tom Lane 已提交
808
			if (i >= XLR_MAX_BKP_BLOCKS)
809
				elog(PANIC, "can backup at most %d blocks per xlog record",
T
Tom Lane 已提交
810
					 XLR_MAX_BKP_BLOCKS);
811
		}
812
		/* Break out of loop when rdt points to last chain item */
813 814 815 816 817
		if (rdt->next == NULL)
			break;
		rdt = rdt->next;
	}

818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850
	/*
	 * Now add the backup block headers and data into the CRC
	 */
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		if (dtbuf_bkp[i])
		{
			BkpBlock   *bkpb = &(dtbuf_xlg[i]);
			char	   *page;

			COMP_CRC32(rdata_crc,
					   (char *) bkpb,
					   sizeof(BkpBlock));
			page = (char *) BufferGetBlock(dtbuf[i]);
			if (bkpb->hole_length == 0)
			{
				COMP_CRC32(rdata_crc,
						   page,
						   BLCKSZ);
			}
			else
			{
				/* must skip the hole */
				COMP_CRC32(rdata_crc,
						   page,
						   bkpb->hole_offset);
				COMP_CRC32(rdata_crc,
						   page + (bkpb->hole_offset + bkpb->hole_length),
						   BLCKSZ - (bkpb->hole_offset + bkpb->hole_length));
			}
		}
	}

T
Tom Lane 已提交
851
	/*
852 853
	 * NOTE: We disallow len == 0 because it provides a useful bit of extra
	 * error checking in ReadRecord.  This means that all callers of
B
Bruce Momjian 已提交
854 855 856
	 * XLogInsert must supply at least some not-in-a-buffer data.  However, we
	 * make an exception for XLOG SWITCH records because we don't want them to
	 * ever cross a segment boundary.
T
Tom Lane 已提交
857
	 */
858
	if (len == 0 && !isLogSwitch)
859
		elog(PANIC, "invalid xlog record length %u", len);
860

861
	START_CRIT_SECTION();
862

863 864 865
	/* Now wait to get insert lock */
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);

T
Tom Lane 已提交
866
	/*
B
Bruce Momjian 已提交
867 868 869
	 * Check to see if my RedoRecPtr is out of date.  If so, may have to go
	 * back and recompute everything.  This can only happen just after a
	 * checkpoint, so it's better to be slow in this case and fast otherwise.
870 871
	 *
	 * If we aren't doing full-page writes then RedoRecPtr doesn't actually
B
Bruce Momjian 已提交
872 873
	 * affect the contents of the XLOG record, so we'll update our local copy
	 * but not force a recomputation.
T
Tom Lane 已提交
874 875
	 */
	if (!XLByteEQ(RedoRecPtr, Insert->RedoRecPtr))
876
	{
T
Tom Lane 已提交
877 878 879
		Assert(XLByteLT(RedoRecPtr, Insert->RedoRecPtr));
		RedoRecPtr = Insert->RedoRecPtr;

880
		if (doPageWrites)
881
		{
882
			for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
T
Tom Lane 已提交
883
			{
884 885 886 887 888 889 890 891 892 893 894 895 896
				if (dtbuf[i] == InvalidBuffer)
					continue;
				if (dtbuf_bkp[i] == false &&
					XLByteLE(dtbuf_lsn[i], RedoRecPtr))
				{
					/*
					 * Oops, this buffer now needs to be backed up, but we
					 * didn't think so above.  Start over.
					 */
					LWLockRelease(WALInsertLock);
					END_CRIT_SECTION();
					goto begin;
				}
T
Tom Lane 已提交
897
			}
898 899 900
		}
	}

901
	/*
B
Bruce Momjian 已提交
902 903 904 905
	 * Also check to see if forcePageWrites was just turned on; if we weren't
	 * already doing full-page writes then go back and recompute. (If it was
	 * just turned off, we could recompute the record without full pages, but
	 * we choose not to bother.)
906 907 908 909 910 911 912 913 914
	 */
	if (Insert->forcePageWrites && !doPageWrites)
	{
		/* Oops, must redo it with full-page data */
		LWLockRelease(WALInsertLock);
		END_CRIT_SECTION();
		goto begin;
	}

T
Tom Lane 已提交
915
	/*
B
Bruce Momjian 已提交
916 917 918 919
	 * Make additional rdata chain entries for the backup blocks, so that we
	 * don't need to special-case them in the write loop.  Note that we have
	 * now irrevocably changed the input rdata chain.  At the exit of this
	 * loop, write_len includes the backup block data.
T
Tom Lane 已提交
920
	 *
921 922 923
	 * Also set the appropriate info bits to show which buffers were backed
	 * up. The i'th XLR_SET_BKP_BLOCK bit corresponds to the i'th distinct
	 * buffer value (ignoring InvalidBuffer) appearing in the rdata chain.
T
Tom Lane 已提交
924 925 926
	 */
	write_len = len;
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
927
	{
928 929 930
		BkpBlock   *bkpb;
		char	   *page;

931
		if (!dtbuf_bkp[i])
932 933
			continue;

T
Tom Lane 已提交
934
		info |= XLR_SET_BKP_BLOCK(i);
935

936 937 938 939 940
		bkpb = &(dtbuf_xlg[i]);
		page = (char *) BufferGetBlock(dtbuf[i]);

		rdt->next = &(dtbuf_rdt1[i]);
		rdt = rdt->next;
941

942 943
		rdt->data = (char *) bkpb;
		rdt->len = sizeof(BkpBlock);
T
Tom Lane 已提交
944
		write_len += sizeof(BkpBlock);
945

946 947
		rdt->next = &(dtbuf_rdt2[i]);
		rdt = rdt->next;
948

949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970
		if (bkpb->hole_length == 0)
		{
			rdt->data = page;
			rdt->len = BLCKSZ;
			write_len += BLCKSZ;
			rdt->next = NULL;
		}
		else
		{
			/* must skip the hole */
			rdt->data = page;
			rdt->len = bkpb->hole_offset;
			write_len += bkpb->hole_offset;

			rdt->next = &(dtbuf_rdt3[i]);
			rdt = rdt->next;

			rdt->data = page + (bkpb->hole_offset + bkpb->hole_length);
			rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length);
			write_len += rdt->len;
			rdt->next = NULL;
		}
971 972
	}

973 974 975 976 977 978 979
	/*
	 * If we backed up any full blocks and online backup is not in progress,
	 * mark the backup blocks as removable.  This allows the WAL archiver to
	 * know whether it is safe to compress archived WAL data by transforming
	 * full-block records into the non-full-block format.
	 *
	 * Note: we could just set the flag whenever !forcePageWrites, but
B
Bruce Momjian 已提交
980 981
	 * defining it like this leaves the info bit free for some potential other
	 * use in records without any backup blocks.
982 983 984 985
	 */
	if ((info & XLR_BKP_BLOCK_MASK) && !Insert->forcePageWrites)
		info |= XLR_BKP_REMOVABLE;

986
	/*
987
	 * If there isn't enough space on the current XLOG page for a record
B
Bruce Momjian 已提交
988
	 * header, advance to the next page (leaving the unused space as zeroes).
989
	 */
T
Tom Lane 已提交
990 991
	updrqst = false;
	freespace = INSERT_FREESPACE(Insert);
992 993
	if (freespace < SizeOfXLogRecord)
	{
994
		updrqst = AdvanceXLInsertBuffer(false);
995 996 997
		freespace = INSERT_FREESPACE(Insert);
	}

998
	/* Compute record's XLOG location */
T
Tom Lane 已提交
999
	curridx = Insert->curridx;
1000 1001 1002
	INSERT_RECPTR(RecPtr, Insert, curridx);

	/*
B
Bruce Momjian 已提交
1003 1004 1005 1006 1007
	 * If the record is an XLOG_SWITCH, and we are exactly at the start of a
	 * segment, we need not insert it (and don't want to because we'd like
	 * consecutive switch requests to be no-ops).  Instead, make sure
	 * everything is written and flushed through the end of the prior segment,
	 * and return the prior segment's end address.
1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038
	 */
	if (isLogSwitch &&
		(RecPtr.xrecoff % XLogSegSize) == SizeOfXLogLongPHD)
	{
		/* We can release insert lock immediately */
		LWLockRelease(WALInsertLock);

		RecPtr.xrecoff -= SizeOfXLogLongPHD;
		if (RecPtr.xrecoff == 0)
		{
			/* crossing a logid boundary */
			RecPtr.xlogid -= 1;
			RecPtr.xrecoff = XLogFileSize;
		}

		LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
		LogwrtResult = XLogCtl->Write.LogwrtResult;
		if (!XLByteLE(RecPtr, LogwrtResult.Flush))
		{
			XLogwrtRqst FlushRqst;

			FlushRqst.Write = RecPtr;
			FlushRqst.Flush = RecPtr;
			XLogWrite(FlushRqst, false, false);
		}
		LWLockRelease(WALWriteLock);

		END_CRIT_SECTION();

		return RecPtr;
	}
T
Tom Lane 已提交
1039

1040 1041
	/* Insert record header */

1042
	record = (XLogRecord *) Insert->currpos;
1043
	record->xl_prev = Insert->PrevRecord;
1044
	record->xl_xid = GetCurrentTransactionIdIfAny();
1045
	record->xl_tot_len = SizeOfXLogRecord + write_len;
T
Tom Lane 已提交
1046
	record->xl_len = len;		/* doesn't include backup blocks */
1047
	record->xl_info = info;
1048
	record->xl_rmid = rmid;
1049

1050 1051 1052 1053
	/* Now we can finish computing the record's CRC */
	COMP_CRC32(rdata_crc, (char *) record + sizeof(pg_crc32),
			   SizeOfXLogRecord - sizeof(pg_crc32));
	FIN_CRC32(rdata_crc);
1054 1055
	record->xl_crc = rdata_crc;

1056
#ifdef WAL_DEBUG
V
WAL  
Vadim B. Mikheev 已提交
1057 1058
	if (XLOG_DEBUG)
	{
B
Bruce Momjian 已提交
1059
		StringInfoData buf;
V
WAL  
Vadim B. Mikheev 已提交
1060

1061
		initStringInfo(&buf);
1062 1063
		appendStringInfo(&buf, "INSERT @ %X/%X: ",
						 RecPtr.xlogid, RecPtr.xrecoff);
1064
		xlog_outrec(&buf, record);
1065
		if (rdata->data != NULL)
V
WAL  
Vadim B. Mikheev 已提交
1066
		{
1067 1068
			appendStringInfo(&buf, " - ");
			RmgrTable[record->xl_rmid].rm_desc(&buf, record->xl_info, rdata->data);
V
WAL  
Vadim B. Mikheev 已提交
1069
		}
1070 1071
		elog(LOG, "%s", buf.data);
		pfree(buf.data);
V
WAL  
Vadim B. Mikheev 已提交
1072
	}
1073
#endif
V
WAL  
Vadim B. Mikheev 已提交
1074

T
Tom Lane 已提交
1075 1076 1077 1078
	/* Record begin of record in appropriate places */
	ProcLastRecPtr = RecPtr;
	Insert->PrevRecord = RecPtr;

1079
	Insert->currpos += SizeOfXLogRecord;
T
Tom Lane 已提交
1080
	freespace -= SizeOfXLogRecord;
1081

T
Tom Lane 已提交
1082 1083 1084 1085
	/*
	 * Append the data, including backup blocks if any
	 */
	while (write_len)
1086
	{
1087 1088 1089 1090
		while (rdata->data == NULL)
			rdata = rdata->next;

		if (freespace > 0)
1091
		{
1092 1093 1094 1095 1096
			if (rdata->len > freespace)
			{
				memcpy(Insert->currpos, rdata->data, freespace);
				rdata->data += freespace;
				rdata->len -= freespace;
T
Tom Lane 已提交
1097
				write_len -= freespace;
1098 1099 1100 1101 1102
			}
			else
			{
				memcpy(Insert->currpos, rdata->data, rdata->len);
				freespace -= rdata->len;
T
Tom Lane 已提交
1103
				write_len -= rdata->len;
1104 1105 1106 1107
				Insert->currpos += rdata->len;
				rdata = rdata->next;
				continue;
			}
1108 1109
		}

1110
		/* Use next buffer */
1111
		updrqst = AdvanceXLInsertBuffer(false);
T
Tom Lane 已提交
1112 1113 1114 1115 1116 1117
		curridx = Insert->curridx;
		/* Insert cont-record header */
		Insert->currpage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
		contrecord = (XLogContRecord *) Insert->currpos;
		contrecord->xl_rem_len = write_len;
		Insert->currpos += SizeOfXLogContRecord;
1118
		freespace = INSERT_FREESPACE(Insert);
1119
	}
1120

T
Tom Lane 已提交
1121 1122
	/* Ensure next record will be properly aligned */
	Insert->currpos = (char *) Insert->currpage +
B
Bruce Momjian 已提交
1123
		MAXALIGN(Insert->currpos - (char *) Insert->currpage);
T
Tom Lane 已提交
1124
	freespace = INSERT_FREESPACE(Insert);
1125

V
Vadim B. Mikheev 已提交
1126
	/*
B
Bruce Momjian 已提交
1127 1128
	 * The recptr I return is the beginning of the *next* record. This will be
	 * stored as LSN for changed data pages...
V
Vadim B. Mikheev 已提交
1129
	 */
T
Tom Lane 已提交
1130
	INSERT_RECPTR(RecPtr, Insert, curridx);
V
Vadim B. Mikheev 已提交
1131

1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145
	/*
	 * If the record is an XLOG_SWITCH, we must now write and flush all the
	 * existing data, and then forcibly advance to the start of the next
	 * segment.  It's not good to do this I/O while holding the insert lock,
	 * but there seems too much risk of confusion if we try to release the
	 * lock sooner.  Fortunately xlog switch needn't be a high-performance
	 * operation anyway...
	 */
	if (isLogSwitch)
	{
		XLogCtlWrite *Write = &XLogCtl->Write;
		XLogwrtRqst FlushRqst;
		XLogRecPtr	OldSegEnd;

1146 1147
		TRACE_POSTGRESQL_XLOG_SWITCH();

1148 1149 1150
		LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);

		/*
B
Bruce Momjian 已提交
1151 1152
		 * Flush through the end of the page containing XLOG_SWITCH, and
		 * perform end-of-segment actions (eg, notifying archiver).
1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202
		 */
		WriteRqst = XLogCtl->xlblocks[curridx];
		FlushRqst.Write = WriteRqst;
		FlushRqst.Flush = WriteRqst;
		XLogWrite(FlushRqst, false, true);

		/* Set up the next buffer as first page of next segment */
		/* Note: AdvanceXLInsertBuffer cannot need to do I/O here */
		(void) AdvanceXLInsertBuffer(true);

		/* There should be no unwritten data */
		curridx = Insert->curridx;
		Assert(curridx == Write->curridx);

		/* Compute end address of old segment */
		OldSegEnd = XLogCtl->xlblocks[curridx];
		OldSegEnd.xrecoff -= XLOG_BLCKSZ;
		if (OldSegEnd.xrecoff == 0)
		{
			/* crossing a logid boundary */
			OldSegEnd.xlogid -= 1;
			OldSegEnd.xrecoff = XLogFileSize;
		}

		/* Make it look like we've written and synced all of old segment */
		LogwrtResult.Write = OldSegEnd;
		LogwrtResult.Flush = OldSegEnd;

		/*
		 * Update shared-memory status --- this code should match XLogWrite
		 */
		{
			/* use volatile pointer to prevent code rearrangement */
			volatile XLogCtlData *xlogctl = XLogCtl;

			SpinLockAcquire(&xlogctl->info_lck);
			xlogctl->LogwrtResult = LogwrtResult;
			if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
				xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
			if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
				xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
			SpinLockRelease(&xlogctl->info_lck);
		}

		Write->LogwrtResult = LogwrtResult;

		LWLockRelease(WALWriteLock);

		updrqst = false;		/* done already */
	}
1203
	else
1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219
	{
		/* normal case, ie not xlog switch */

		/* Need to update shared LogwrtRqst if some block was filled up */
		if (freespace < SizeOfXLogRecord)
		{
			/* curridx is filled and available for writing out */
			updrqst = true;
		}
		else
		{
			/* if updrqst already set, write through end of previous buf */
			curridx = PrevBufIdx(curridx);
		}
		WriteRqst = XLogCtl->xlblocks[curridx];
	}
1220

1221
	LWLockRelease(WALInsertLock);
1222 1223 1224

	if (updrqst)
	{
1225 1226 1227
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

1228
		SpinLockAcquire(&xlogctl->info_lck);
T
Tom Lane 已提交
1229
		/* advance global request to include new block(s) */
1230 1231
		if (XLByteLT(xlogctl->LogwrtRqst.Write, WriteRqst))
			xlogctl->LogwrtRqst.Write = WriteRqst;
T
Tom Lane 已提交
1232
		/* update local result copy while I have the chance */
1233
		LogwrtResult = xlogctl->LogwrtResult;
1234
		SpinLockRelease(&xlogctl->info_lck);
1235 1236
	}

1237
	XactLastRecEnd = RecPtr;
1238

1239
	END_CRIT_SECTION();
1240

1241
	return RecPtr;
1242
}
1243

1244
/*
1245 1246 1247
 * Determine whether the buffer referenced by an XLogRecData item has to
 * be backed up, and if so fill a BkpBlock struct for it.  In any case
 * save the buffer's LSN at *lsn.
1248
 */
1249
static bool
1250
XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
1251
				XLogRecPtr *lsn, BkpBlock *bkpb)
1252
{
1253
	Page		page;
1254

1255
	page = BufferGetPage(rdata->buffer);
1256 1257

	/*
B
Bruce Momjian 已提交
1258 1259 1260
	 * XXX We assume page LSN is first data on *every* page that can be passed
	 * to XLogInsert, whether it otherwise has the standard page layout or
	 * not.
1261
	 */
1262
	*lsn = PageGetLSN(page);
1263

1264
	if (doPageWrites &&
1265
		XLByteLE(PageGetLSN(page), RedoRecPtr))
1266
	{
1267 1268 1269
		/*
		 * The page needs to be backed up, so set up *bkpb
		 */
1270
		BufferGetTag(rdata->buffer, &bkpb->node, &bkpb->fork, &bkpb->block);
1271

1272 1273 1274
		if (rdata->buffer_std)
		{
			/* Assume we can omit data between pd_lower and pd_upper */
1275 1276
			uint16		lower = ((PageHeader) page)->pd_lower;
			uint16		upper = ((PageHeader) page)->pd_upper;
1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297

			if (lower >= SizeOfPageHeaderData &&
				upper > lower &&
				upper <= BLCKSZ)
			{
				bkpb->hole_offset = lower;
				bkpb->hole_length = upper - lower;
			}
			else
			{
				/* No "hole" to compress out */
				bkpb->hole_offset = 0;
				bkpb->hole_length = 0;
			}
		}
		else
		{
			/* Not a standard page header, don't try to eliminate "hole" */
			bkpb->hole_offset = 0;
			bkpb->hole_length = 0;
		}
1298

1299
		return true;			/* buffer requires backup */
1300
	}
1301 1302

	return false;				/* buffer does not need to be backed up */
1303 1304
}

1305 1306 1307 1308 1309 1310
/*
 * XLogArchiveNotify
 *
 * Create an archive notification file
 *
 * The name of the notification file is the message that will be picked up
1311
 * by the archiver, e.g. we write 0000000100000001000000C6.ready
1312
 * and the archiver then knows to archive XLOGDIR/0000000100000001000000C6,
1313
 * then when complete, rename it to 0000000100000001000000C6.done
1314 1315 1316 1317 1318
 */
static void
XLogArchiveNotify(const char *xlog)
{
	char		archiveStatusPath[MAXPGPATH];
B
Bruce Momjian 已提交
1319
	FILE	   *fd;
1320 1321 1322 1323

	/* insert an otherwise empty file called <XLOG>.ready */
	StatusFilePath(archiveStatusPath, xlog, ".ready");
	fd = AllocateFile(archiveStatusPath, "w");
B
Bruce Momjian 已提交
1324 1325
	if (fd == NULL)
	{
1326 1327 1328 1329 1330 1331
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not create archive status file \"%s\": %m",
						archiveStatusPath)));
		return;
	}
B
Bruce Momjian 已提交
1332 1333
	if (FreeFile(fd))
	{
1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not write archive status file \"%s\": %m",
						archiveStatusPath)));
		return;
	}

	/* Notify archiver that it's got something to do */
	if (IsUnderPostmaster)
		SendPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER);
}

/*
 * Convenience routine to notify using log/seg representation of filename
 *
 * Builds the segment file name from ThisTimeLineID, log and seg, then
 * hands it to XLogArchiveNotify.
 */
static void
XLogArchiveNotifySeg(uint32 log, uint32 seg)
{
	char		xlog[MAXFNAMELEN];

	XLogFileName(xlog, ThisTimeLineID, log, seg);
	XLogArchiveNotify(xlog);
}

/*
1359
 * XLogArchiveCheckDone
1360
 *
1361 1362 1363 1364
 * This is called when we are ready to delete or recycle an old XLOG segment
 * file or backup history file.  If it is okay to delete it then return true.
 * If it is not time to delete it, make sure a .ready file exists, and return
 * false.
1365 1366
 *
 * If <XLOG>.done exists, then return true; else if <XLOG>.ready exists,
1367 1368 1369 1370
 * then return false; else create <XLOG>.ready and return false.
 *
 * The reason we do things this way is so that if the original attempt to
 * create <XLOG>.ready fails, we'll retry during subsequent checkpoints.
1371 1372
 */
static bool
1373
XLogArchiveCheckDone(const char *xlog)
1374 1375 1376 1377
{
	char		archiveStatusPath[MAXPGPATH];
	struct stat stat_buf;

1378 1379 1380 1381 1382
	/* Always deletable if archiving is off */
	if (!XLogArchivingActive())
		return true;

	/* First check for .done --- this means archiver is done with it */
1383 1384 1385 1386 1387 1388 1389
	StatusFilePath(archiveStatusPath, xlog, ".done");
	if (stat(archiveStatusPath, &stat_buf) == 0)
		return true;

	/* check for .ready --- this means archiver is still busy with it */
	StatusFilePath(archiveStatusPath, xlog, ".ready");
	if (stat(archiveStatusPath, &stat_buf) == 0)
B
Bruce Momjian 已提交
1390
		return false;
1391 1392 1393 1394 1395 1396 1397

	/* Race condition --- maybe archiver just finished, so recheck */
	StatusFilePath(archiveStatusPath, xlog, ".done");
	if (stat(archiveStatusPath, &stat_buf) == 0)
		return true;

	/* Retry creation of the .ready file */
1398
	XLogArchiveNotify(xlog);
1399 1400 1401
	return false;
}

1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433
/*
 * XLogArchiveIsBusy
 *
 * Check to see if an XLOG segment file is still unarchived.
 * This is almost but not quite the inverse of XLogArchiveCheckDone: in
 * the first place we aren't chartered to recreate the .ready file, and
 * in the second place we should consider that if the file is already gone
 * then it's not busy.  (This check is needed to handle the race condition
 * that a checkpoint already deleted the no-longer-needed file.)
 */
static bool
XLogArchiveIsBusy(const char *xlog)
{
	char		archiveStatusPath[MAXPGPATH];
	struct stat stat_buf;

	/* First check for .done --- this means archiver is done with it */
	StatusFilePath(archiveStatusPath, xlog, ".done");
	if (stat(archiveStatusPath, &stat_buf) == 0)
		return false;

	/* check for .ready --- this means archiver is still busy with it */
	StatusFilePath(archiveStatusPath, xlog, ".ready");
	if (stat(archiveStatusPath, &stat_buf) == 0)
		return true;

	/* Race condition --- maybe archiver just finished, so recheck */
	StatusFilePath(archiveStatusPath, xlog, ".done");
	if (stat(archiveStatusPath, &stat_buf) == 0)
		return false;

	/*
1434 1435 1436
	 * Check to see if the WAL file has been removed by checkpoint, which
	 * implies it has already been archived, and explains why we can't see a
	 * status file for it.
1437 1438 1439 1440 1441 1442 1443 1444 1445
	 */
	snprintf(archiveStatusPath, MAXPGPATH, XLOGDIR "/%s", xlog);
	if (stat(archiveStatusPath, &stat_buf) != 0 &&
		errno == ENOENT)
		return false;

	return true;
}

1446 1447 1448
/*
 * XLogArchiveCleanup
 *
1449
 * Cleanup archive notification file(s) for a particular xlog segment
1450 1451 1452 1453
 */
static void
XLogArchiveCleanup(const char *xlog)
{
B
Bruce Momjian 已提交
1454
	char		archiveStatusPath[MAXPGPATH];
1455

1456
	/* Remove the .done file */
1457 1458 1459
	StatusFilePath(archiveStatusPath, xlog, ".done");
	unlink(archiveStatusPath);
	/* should we complain about failure? */
1460 1461 1462 1463 1464

	/* Remove the .ready file if present --- normally it shouldn't be */
	StatusFilePath(archiveStatusPath, xlog, ".ready");
	unlink(archiveStatusPath);
	/* should we complain about failure? */
1465 1466
}

T
Tom Lane 已提交
1467 1468 1469 1470
/*
 * Advance the Insert state to the next buffer page, writing out the next
 * buffer if it still contains unwritten data.
 *
1471 1472 1473 1474
 * If new_segment is TRUE then we set up the next buffer page as the first
 * page of the next xlog segment file, possibly but not usually the next
 * consecutive file page.
 *
T
Tom Lane 已提交
1475
 * The global LogwrtRqst.Write pointer needs to be advanced to include the
1476
 * just-filled page.  If we can do this for free (without an extra lock),
T
Tom Lane 已提交
1477 1478 1479
 * we do so here.  Otherwise the caller must do it.  We return TRUE if the
 * request update still needs to be done, FALSE if we did it internally.
 *
1480
 * Must be called with WALInsertLock held.
T
Tom Lane 已提交
1481 1482
 */
static bool
1483
AdvanceXLInsertBuffer(bool new_segment)
1484
{
T
Tom Lane 已提交
1485 1486
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogCtlWrite *Write = &XLogCtl->Write;
1487
	int			nextidx = NextBufIdx(Insert->curridx);
T
Tom Lane 已提交
1488 1489 1490
	bool		update_needed = true;
	XLogRecPtr	OldPageRqstPtr;
	XLogwrtRqst WriteRqst;
1491 1492
	XLogRecPtr	NewPageEndPtr;
	XLogPageHeader NewPage;
1493

T
Tom Lane 已提交
1494 1495 1496
	/* Use Insert->LogwrtResult copy if it's more fresh */
	if (XLByteLT(LogwrtResult.Write, Insert->LogwrtResult.Write))
		LogwrtResult = Insert->LogwrtResult;
V
WAL  
Vadim B. Mikheev 已提交
1497

T
Tom Lane 已提交
1498
	/*
B
Bruce Momjian 已提交
1499 1500 1501
	 * Get ending-offset of the buffer page we need to replace (this may be
	 * zero if the buffer hasn't been used yet).  Fall through if it's already
	 * written out.
T
Tom Lane 已提交
1502 1503 1504 1505 1506 1507
	 */
	OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
	if (!XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
	{
		/* nope, got work to do... */
		XLogRecPtr	FinishedPageRqstPtr;
1508

T
Tom Lane 已提交
1509
		FinishedPageRqstPtr = XLogCtl->xlblocks[Insert->curridx];
1510

1511
		/* Before waiting, get info_lck and update LogwrtResult */
1512 1513 1514 1515
		{
			/* use volatile pointer to prevent code rearrangement */
			volatile XLogCtlData *xlogctl = XLogCtl;

1516
			SpinLockAcquire(&xlogctl->info_lck);
1517 1518 1519
			if (XLByteLT(xlogctl->LogwrtRqst.Write, FinishedPageRqstPtr))
				xlogctl->LogwrtRqst.Write = FinishedPageRqstPtr;
			LogwrtResult = xlogctl->LogwrtResult;
1520
			SpinLockRelease(&xlogctl->info_lck);
1521
		}
1522 1523 1524 1525 1526 1527 1528 1529 1530

		update_needed = false;	/* Did the shared-request update */

		if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
		{
			/* OK, someone wrote it already */
			Insert->LogwrtResult = LogwrtResult;
		}
		else
1531
		{
1532 1533 1534 1535
			/* Must acquire write lock */
			LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
			LogwrtResult = Write->LogwrtResult;
			if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
1536
			{
1537 1538 1539
				/* OK, someone wrote it already */
				LWLockRelease(WALWriteLock);
				Insert->LogwrtResult = LogwrtResult;
T
Tom Lane 已提交
1540
			}
1541
			else
T
Tom Lane 已提交
1542 1543
			{
				/*
B
Bruce Momjian 已提交
1544 1545
				 * Have to write buffers while holding insert lock. This is
				 * not good, so only write as much as we absolutely must.
T
Tom Lane 已提交
1546
				 */
1547
				TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
T
Tom Lane 已提交
1548 1549 1550
				WriteRqst.Write = OldPageRqstPtr;
				WriteRqst.Flush.xlogid = 0;
				WriteRqst.Flush.xrecoff = 0;
1551
				XLogWrite(WriteRqst, false, false);
1552
				LWLockRelease(WALWriteLock);
T
Tom Lane 已提交
1553
				Insert->LogwrtResult = LogwrtResult;
1554
				TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
1555 1556 1557 1558
			}
		}
	}

T
Tom Lane 已提交
1559
	/*
B
Bruce Momjian 已提交
1560 1561
	 * Now the next buffer slot is free and we can set it up to be the next
	 * output page.
T
Tom Lane 已提交
1562
	 */
1563
	NewPageEndPtr = XLogCtl->xlblocks[Insert->curridx];
1564 1565 1566 1567 1568 1569 1570 1571

	if (new_segment)
	{
		/* force it to a segment start point */
		NewPageEndPtr.xrecoff += XLogSegSize - 1;
		NewPageEndPtr.xrecoff -= NewPageEndPtr.xrecoff % XLogSegSize;
	}

1572
	if (NewPageEndPtr.xrecoff >= XLogFileSize)
1573
	{
T
Tom Lane 已提交
1574
		/* crossing a logid boundary */
1575
		NewPageEndPtr.xlogid += 1;
1576
		NewPageEndPtr.xrecoff = XLOG_BLCKSZ;
1577
	}
T
Tom Lane 已提交
1578
	else
1579
		NewPageEndPtr.xrecoff += XLOG_BLCKSZ;
1580
	XLogCtl->xlblocks[nextidx] = NewPageEndPtr;
1581
	NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
B
Bruce Momjian 已提交
1582

T
Tom Lane 已提交
1583
	Insert->curridx = nextidx;
1584
	Insert->currpage = NewPage;
B
Bruce Momjian 已提交
1585 1586

	Insert->currpos = ((char *) NewPage) +SizeOfXLogShortPHD;
B
Bruce Momjian 已提交
1587

T
Tom Lane 已提交
1588
	/*
B
Bruce Momjian 已提交
1589 1590
	 * Be sure to re-zero the buffer so that bytes beyond what we've written
	 * will look like zeroes and not valid XLOG records...
T
Tom Lane 已提交
1591
	 */
1592
	MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
1593

1594 1595 1596
	/*
	 * Fill the new page's header
	 */
B
Bruce Momjian 已提交
1597 1598
	NewPage   ->xlp_magic = XLOG_PAGE_MAGIC;

1599
	/* NewPage->xlp_info = 0; */	/* done by memset */
B
Bruce Momjian 已提交
1600 1601
	NewPage   ->xlp_tli = ThisTimeLineID;
	NewPage   ->xlp_pageaddr.xlogid = NewPageEndPtr.xlogid;
1602
	NewPage   ->xlp_pageaddr.xrecoff = NewPageEndPtr.xrecoff - XLOG_BLCKSZ;
T
Tom Lane 已提交
1603

1604
	/*
1605
	 * If first page of an XLOG segment file, make it a long header.
1606 1607 1608
	 */
	if ((NewPage->xlp_pageaddr.xrecoff % XLogSegSize) == 0)
	{
1609
		XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
1610

1611 1612
		NewLongPage->xlp_sysid = ControlFile->system_identifier;
		NewLongPage->xlp_seg_size = XLogSegSize;
1613
		NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
B
Bruce Momjian 已提交
1614 1615 1616
		NewPage   ->xlp_info |= XLP_LONG_HEADER;

		Insert->currpos = ((char *) NewPage) +SizeOfXLogLongPHD;
1617 1618
	}

T
Tom Lane 已提交
1619
	return update_needed;
1620 1621
}

1622 1623 1624
/*
 * Check whether we've consumed enough xlog space that a checkpoint is needed.
 *
1625 1626 1627
 * logid/logseg indicate a log file that has just been filled up (or read
 * during recovery). We measure the distance from RedoRecPtr to logid/logseg
 * and see if that exceeds CheckPointSegments.
1628 1629 1630 1631
 *
 * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
 */
static bool
1632
XLogCheckpointNeeded(uint32 logid, uint32 logseg)
1633 1634
{
	/*
1635 1636
	 * A straight computation of segment number could overflow 32 bits. Rather
	 * than assuming we have working 64-bit arithmetic, we compare the
B
Bruce Momjian 已提交
1637 1638
	 * highest-order bits separately, and force a checkpoint immediately when
	 * they change.
1639 1640 1641 1642 1643 1644 1645 1646 1647
	 */
	uint32		old_segno,
				new_segno;
	uint32		old_highbits,
				new_highbits;

	old_segno = (RedoRecPtr.xlogid % XLogSegSize) * XLogSegsPerFile +
		(RedoRecPtr.xrecoff / XLogSegSize);
	old_highbits = RedoRecPtr.xlogid / XLogSegSize;
1648 1649
	new_segno = (logid % XLogSegSize) * XLogSegsPerFile + logseg;
	new_highbits = logid / XLogSegSize;
1650
	if (new_highbits != old_highbits ||
B
Bruce Momjian 已提交
1651
		new_segno >= old_segno + (uint32) (CheckPointSegments - 1))
1652 1653 1654 1655
		return true;
	return false;
}

T
Tom Lane 已提交
1656 1657 1658
/*
 * Write and/or fsync the log at least as far as WriteRqst indicates.
 *
1659 1660 1661 1662 1663
 * If flexible == TRUE, we don't have to write as far as WriteRqst, but
 * may stop at any convenient boundary (such as a cache or logfile boundary).
 * This option allows us to avoid uselessly issuing multiple writes when a
 * single one would do.
 *
1664 1665 1666 1667 1668 1669
 * If xlog_switch == TRUE, we are intending an xlog segment switch, so
 * perform end-of-segment actions after writing the last page, even if
 * it's not physically the end of its segment.  (NB: this will work properly
 * only if caller specifies WriteRqst == page-end and flexible == false,
 * and there is some data to write.)
 *
1670
 * Must be called with WALWriteLock held.
T
Tom Lane 已提交
1671
 */
1672
static void
1673
XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch)
1674
{
1675
	XLogCtlWrite *Write = &XLogCtl->Write;
T
Tom Lane 已提交
1676
	bool		ispartialpage;
1677
	bool		last_iteration;
1678
	bool		finishing_seg;
1679
	bool		use_existent;
1680 1681 1682 1683
	int			curridx;
	int			npages;
	int			startidx;
	uint32		startoffset;
1684

1685 1686 1687
	/* We should always be inside a critical section here */
	Assert(CritSectionCount > 0);

B
Bruce Momjian 已提交
1688
	/*
B
Bruce Momjian 已提交
1689
	 * Update local LogwrtResult (caller probably did this already, but...)
B
Bruce Momjian 已提交
1690
	 */
T
Tom Lane 已提交
1691 1692
	LogwrtResult = Write->LogwrtResult;

1693 1694 1695
	/*
	 * Since successive pages in the xlog cache are consecutively allocated,
	 * we can usually gather multiple pages together and issue just one
B
Bruce Momjian 已提交
1696 1697 1698 1699 1700
	 * write() call.  npages is the number of pages we have determined can be
	 * written together; startidx is the cache block index of the first one,
	 * and startoffset is the file offset at which it should go. The latter
	 * two variables are only valid when npages > 0, but we must initialize
	 * all of them to keep the compiler quiet.
1701 1702 1703 1704 1705 1706 1707 1708 1709
	 */
	npages = 0;
	startidx = 0;
	startoffset = 0;

	/*
	 * Within the loop, curridx is the cache block index of the page to
	 * consider writing.  We advance Write->curridx only after successfully
	 * writing pages.  (Right now, this refinement is useless since we are
B
Bruce Momjian 已提交
1710 1711
	 * going to PANIC if any error occurs anyway; but someday it may come in
	 * useful.)
1712 1713
	 */
	curridx = Write->curridx;
B
 
Bruce Momjian 已提交
1714

T
Tom Lane 已提交
1715
	while (XLByteLT(LogwrtResult.Write, WriteRqst.Write))
1716
	{
1717
		/*
B
Bruce Momjian 已提交
1718 1719 1720
		 * Make sure we're not ahead of the insert process.  This could happen
		 * if we're passed a bogus WriteRqst.Write that is past the end of the
		 * last page that's been initialized by AdvanceXLInsertBuffer.
1721
		 */
1722
		if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[curridx]))
1723
			elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
1724
				 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
1725 1726
				 XLogCtl->xlblocks[curridx].xlogid,
				 XLogCtl->xlblocks[curridx].xrecoff);
1727

T
Tom Lane 已提交
1728
		/* Advance LogwrtResult.Write to end of current buffer page */
1729
		LogwrtResult.Write = XLogCtl->xlblocks[curridx];
T
Tom Lane 已提交
1730 1731 1732
		ispartialpage = XLByteLT(WriteRqst.Write, LogwrtResult.Write);

		if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1733
		{
T
Tom Lane 已提交
1734
			/*
1735 1736
			 * Switch to new logfile segment.  We cannot have any pending
			 * pages here (since we dump what we have at segment end).
T
Tom Lane 已提交
1737
			 */
1738
			Assert(npages == 0);
T
Tom Lane 已提交
1739
			if (openLogFile >= 0)
1740
				XLogFileClose();
T
Tom Lane 已提交
1741 1742
			XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);

1743 1744 1745 1746
			/* create/use new log file */
			use_existent = true;
			openLogFile = XLogFileInit(openLogId, openLogSeg,
									   &use_existent, true);
T
Tom Lane 已提交
1747
			openLogOff = 0;
1748 1749
		}

1750
		/* Make sure we have the current logfile open */
T
Tom Lane 已提交
1751
		if (openLogFile < 0)
1752
		{
T
Tom Lane 已提交
1753
			XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1754
			openLogFile = XLogFileOpen(openLogId, openLogSeg);
T
Tom Lane 已提交
1755
			openLogOff = 0;
1756 1757
		}

1758 1759 1760 1761 1762
		/* Add current page to the set of pending pages-to-dump */
		if (npages == 0)
		{
			/* first of group */
			startidx = curridx;
1763
			startoffset = (LogwrtResult.Write.xrecoff - XLOG_BLCKSZ) % XLogSegSize;
1764 1765
		}
		npages++;
1766

T
Tom Lane 已提交
1767
		/*
B
Bruce Momjian 已提交
1768 1769 1770 1771
		 * Dump the set if this will be the last loop iteration, or if we are
		 * at the last page of the cache area (since the next page won't be
		 * contiguous in memory), or if we are at the end of the logfile
		 * segment.
T
Tom Lane 已提交
1772
		 */
1773 1774
		last_iteration = !XLByteLT(LogwrtResult.Write, WriteRqst.Write);

1775
		finishing_seg = !ispartialpage &&
1776
			(startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
1777

1778
		if (last_iteration ||
1779 1780
			curridx == XLogCtl->XLogCacheBlck ||
			finishing_seg)
T
Tom Lane 已提交
1781
		{
1782 1783
			char	   *from;
			Size		nbytes;
1784

1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797
			/* Need to seek in the file? */
			if (openLogOff != startoffset)
			{
				if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
					ereport(PANIC,
							(errcode_for_file_access(),
							 errmsg("could not seek in log file %u, "
									"segment %u to offset %u: %m",
									openLogId, openLogSeg, startoffset)));
				openLogOff = startoffset;
			}

			/* OK to write the page(s) */
1798 1799
			from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
			nbytes = npages * (Size) XLOG_BLCKSZ;
1800 1801 1802 1803 1804 1805 1806 1807 1808
			errno = 0;
			if (write(openLogFile, from, nbytes) != nbytes)
			{
				/* if write didn't set errno, assume no disk space */
				if (errno == 0)
					errno = ENOSPC;
				ereport(PANIC,
						(errcode_for_file_access(),
						 errmsg("could not write to log file %u, segment %u "
P
Peter Eisentraut 已提交
1809
								"at offset %u, length %lu: %m",
1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825
								openLogId, openLogSeg,
								openLogOff, (unsigned long) nbytes)));
			}

			/* Update state for write */
			openLogOff += nbytes;
			Write->curridx = ispartialpage ? curridx : NextBufIdx(curridx);
			npages = 0;

			/*
			 * If we just wrote the whole last page of a logfile segment,
			 * fsync the segment immediately.  This avoids having to go back
			 * and re-open prior segments when an fsync request comes along
			 * later. Doing it here ensures that one and only one backend will
			 * perform this fsync.
			 *
1826 1827 1828
			 * We also do this if this is the last page written for an xlog
			 * switch.
			 *
1829
			 * This is also the right place to notify the Archiver that the
B
Bruce Momjian 已提交
1830
			 * segment is ready to copy to archival storage, and to update the
1831 1832 1833
			 * timer for archive_timeout, and to signal for a checkpoint if
			 * too many logfile segments have been used since the last
			 * checkpoint.
1834
			 */
1835
			if (finishing_seg || (xlog_switch && last_iteration))
1836
			{
1837
				issue_xlog_fsync(openLogFile, openLogId, openLogSeg);
B
Bruce Momjian 已提交
1838
				LogwrtResult.Flush = LogwrtResult.Write;		/* end of page */
1839 1840 1841

				if (XLogArchivingActive())
					XLogArchiveNotifySeg(openLogId, openLogSeg);
1842

1843
				Write->lastSegSwitchTime = (pg_time_t) time(NULL);
1844 1845

				/*
1846
				 * Signal bgwriter to start a checkpoint if we've consumed too
1847
				 * much xlog since the last one.  For speed, we first check
B
Bruce Momjian 已提交
1848 1849 1850
				 * using the local copy of RedoRecPtr, which might be out of
				 * date; if it looks like a checkpoint is needed, forcibly
				 * update RedoRecPtr and recheck.
1851
				 */
1852
				if (IsUnderPostmaster &&
1853
					XLogCheckpointNeeded(openLogId, openLogSeg))
1854
				{
1855
					(void) GetRedoRecPtr();
1856
					if (XLogCheckpointNeeded(openLogId, openLogSeg))
1857
						RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
1858
				}
1859
			}
T
Tom Lane 已提交
1860
		}
1861

T
Tom Lane 已提交
1862 1863 1864 1865 1866 1867
		if (ispartialpage)
		{
			/* Only asked to write a partial page */
			LogwrtResult.Write = WriteRqst.Write;
			break;
		}
1868 1869 1870 1871 1872
		curridx = NextBufIdx(curridx);

		/* If flexible, break out of loop as soon as we wrote something */
		if (flexible && npages == 0)
			break;
1873
	}
1874 1875 1876

	Assert(npages == 0);
	Assert(curridx == Write->curridx);
1877

T
Tom Lane 已提交
1878 1879 1880 1881 1882
	/*
	 * If asked to flush, do so
	 */
	if (XLByteLT(LogwrtResult.Flush, WriteRqst.Flush) &&
		XLByteLT(LogwrtResult.Flush, LogwrtResult.Write))
1883
	{
T
Tom Lane 已提交
1884
		/*
B
Bruce Momjian 已提交
1885 1886 1887
		 * Could get here without iterating above loop, in which case we might
		 * have no open file or the wrong one.	However, we do not need to
		 * fsync more than one file.
T
Tom Lane 已提交
1888
		 */
1889 1890
		if (sync_method != SYNC_METHOD_OPEN &&
			sync_method != SYNC_METHOD_OPEN_DSYNC)
T
Tom Lane 已提交
1891
		{
1892
			if (openLogFile >= 0 &&
B
Bruce Momjian 已提交
1893
				!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1894
				XLogFileClose();
1895 1896 1897
			if (openLogFile < 0)
			{
				XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1898
				openLogFile = XLogFileOpen(openLogId, openLogSeg);
1899 1900
				openLogOff = 0;
			}
1901
			issue_xlog_fsync(openLogFile, openLogId, openLogSeg);
T
Tom Lane 已提交
1902 1903
		}
		LogwrtResult.Flush = LogwrtResult.Write;
1904 1905
	}

T
Tom Lane 已提交
1906 1907 1908
	/*
	 * Update shared-memory status
	 *
B
Bruce Momjian 已提交
1909
	 * We make sure that the shared 'request' values do not fall behind the
B
Bruce Momjian 已提交
1910 1911
	 * 'result' values.  This is not absolutely essential, but it saves some
	 * code in a couple of places.
T
Tom Lane 已提交
1912
	 */
1913 1914 1915 1916
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

1917
		SpinLockAcquire(&xlogctl->info_lck);
1918 1919 1920 1921 1922
		xlogctl->LogwrtResult = LogwrtResult;
		if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
			xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
		if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
			xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
1923
		SpinLockRelease(&xlogctl->info_lck);
1924
	}
1925

T
Tom Lane 已提交
1926 1927 1928
	Write->LogwrtResult = LogwrtResult;
}

1929
/*
1930 1931
 * Record the LSN for an asynchronous transaction commit/abort.
 * (This should not be called for for synchronous commits.)
1932 1933
 */
void
1934
XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
1935 1936 1937 1938 1939
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

	SpinLockAcquire(&xlogctl->info_lck);
1940 1941
	if (XLByteLT(xlogctl->asyncXactLSN, asyncXactLSN))
		xlogctl->asyncXactLSN = asyncXactLSN;
1942 1943 1944
	SpinLockRelease(&xlogctl->info_lck);
}

1945 1946 1947 1948
/*
 * Advance minRecoveryPoint in the control file.
 *
 * If we crash during recovery, this point must be reached again before the
 * database is consistent.
 *
 * When 'force' is true the 'lsn' argument is ignored.  Otherwise
 * minRecoveryPoint is only updated if it is not already greater than or
 * equal to 'lsn'.
 */
static void
UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
{
	/* Cheap checks against our local copies, without taking any lock */
	if (!updateMinRecoveryPoint)
		return;
	if (!force && XLByteLE(lsn, minRecoveryPoint))
		return;

	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

	/* refresh the local copy from the control file */
	minRecoveryPoint = ControlFile->minRecoveryPoint;

	if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
	{
		/*
		 * An invalid minRecoveryPoint means all the WAL must be recovered,
		 * i.e., this is crash recovery.  The control file's value is never
		 * modified in that case, so future checks can be short-circuited
		 * here as well.
		 */
		updateMinRecoveryPoint = false;
	}
	else if (force || XLByteLT(minRecoveryPoint, lsn))
	{
		/* volatile pointer keeps the compiler from rearranging the accesses */
		volatile XLogCtlData *shared = XLogCtl;
		XLogRecPtr	replayEnd;

		/*
		 * To avoid touching the control file too often, advance all the way
		 * to the last record being replayed, even though 'lsn' would suffice
		 * for correctness.  This also means the 'force' case needs no valid
		 * 'lsn' value.
		 *
		 * A further reason: the passed 'lsn' could be bogus, i.e., past the
		 * end of available WAL, if the caller took it from a corrupted heap
		 * page.  Accepting such a value as the min recovery point would keep
		 * us from coming up at all.  Instead, just log a warning and carry on
		 * with recovery.  (See also the comments about corrupt LSNs in
		 * XLogFlush.)
		 */
		SpinLockAcquire(&shared->info_lck);
		replayEnd = shared->replayEndRecPtr;
		SpinLockRelease(&shared->info_lck);

		if (!force && XLByteLT(replayEnd, lsn))
			elog(WARNING,
			   "xlog min recovery request %X/%X is past current point %X/%X",
				 lsn.xlogid, lsn.xrecoff,
				 replayEnd.xlogid, replayEnd.xrecoff);

		/* write the control file only if the point actually moves forward */
		if (XLByteLT(ControlFile->minRecoveryPoint, replayEnd))
		{
			ControlFile->minRecoveryPoint = replayEnd;
			UpdateControlFile();
			minRecoveryPoint = replayEnd;

			ereport(DEBUG2,
					(errmsg("updated min recovery point to %X/%X",
						minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff)));
		}
	}

	LWLockRelease(ControlFileLock);
}

T
Tom Lane 已提交
2017 2018 2019
/*
 * Make sure all XLOG data through the given position is flushed to disk.
 *
 * NOTE: unlike XLogWrite, the WALWriteLock is not already held on entry,
 * and we try to avoid acquiring it if possible.
 */
void
XLogFlush(XLogRecPtr record)
{
	/* volatile pointer keeps the compiler from rearranging the accesses */
	volatile XLogCtlData *shared = XLogCtl;
	XLogRecPtr	target;
	XLogwrtRqst rqst;

	/*
	 * During REDO we are reading WAL, not writing it, so rather than trying
	 * to flush, update minRecoveryPoint.  The test is XLogInsertAllowed(),
	 * not InRecovery, because the bgwriter must behave this way too, and
	 * because when the bgwriter writes the end-of-recovery checkpoint it
	 * should indeed flush.
	 */
	if (!XLogInsertAllowed())
	{
		UpdateMinRecoveryPoint(record, false);
		return;
	}

	/* Nothing to do if our local copy already says it's flushed */
	if (XLByteLE(record, LogwrtResult.Flush))
		return;

#ifdef WAL_DEBUG
	if (XLOG_DEBUG)
		elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
			 record.xlogid, record.xrecoff,
			 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
#endif

	START_CRIT_SECTION();

	/*
	 * fsync is usually horribly expensive, so piggyback as much data as
	 * possible on each one: any further data seen in the xlog buffer is
	 * written and fsync'd too, making the final LogwrtResult.Flush as large
	 * as possible.  That gives some chance of avoiding another fsync
	 * immediately afterwards.
	 */

	/* start from the given target; it may be raised below */
	target = record;

	/* read the shared request/result state and refresh the local copy */
	SpinLockAcquire(&shared->info_lck);
	if (XLByteLT(target, shared->LogwrtRqst.Write))
		target = shared->LogwrtRqst.Write;
	LogwrtResult = shared->LogwrtResult;
	SpinLockRelease(&shared->info_lck);

	/* still not flushed far enough? */
	if (!XLByteLE(record, LogwrtResult.Flush))
	{
		/* now wait for the write lock, then recheck under it */
		LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
		LogwrtResult = XLogCtl->Write.LogwrtResult;

		if (!XLByteLE(record, LogwrtResult.Flush))
		{
			/* try to write/flush later additions to XLOG as well */
			if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
			{
				XLogCtlInsert *ins = &XLogCtl->Insert;
				uint32		freespace = INSERT_FREESPACE(ins);

				/*
				 * Start from the end of the current insert page; unless the
				 * buffer is full, back up over the unused tail.
				 */
				target = XLogCtl->xlblocks[ins->curridx];
				if (freespace >= SizeOfXLogRecord)
					target.xrecoff -= freespace;

				LWLockRelease(WALInsertLock);
				rqst.Flush = target;
			}
			else
			{
				/* insert lock busy: flush only as far as we were asked */
				rqst.Flush = record;
			}
			rqst.Write = target;

			XLogWrite(rqst, false, false);
		}

		LWLockRelease(WALWriteLock);
	}

	END_CRIT_SECTION();

	/*
	 * If the request point still isn't flushed, something is wrong; most
	 * likely the requested flush point is past end of XLOG.  This has been
	 * seen to occur when a disk page has a corrupted LSN.
	 *
	 * This used to be a PANIC condition, but that hurts the system's
	 * robustness rather than helping it: corruption on one data page should
	 * not take down the whole system.  In particular, if the bad page were
	 * encountered again during recovery the database could not be restarted
	 * at all!  (This scenario actually happened in the field several times
	 * with 7.1 releases.)  As of 8.4, bad LSNs encountered during recovery
	 * are UpdateMinRecoveryPoint's problem; the only time we can reach here
	 * during recovery is while flushing the end-of-recovery checkpoint
	 * record, and that is not expected to have a bad LSN.
	 *
	 * Note that for calls from xact.c, the ERROR will be promoted to PANIC
	 * since xact.c calls this routine inside a critical section.  However,
	 * calls from bufmgr.c are not within critical sections and so a bad LSN
	 * on a data page will not force a restart.
	 */
	if (XLByteLT(LogwrtResult.Flush, record))
		elog(ERROR,
		"xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
			 record.xlogid, record.xrecoff,
			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
}

2144 2145 2146 2147 2148 2149
/*
 * Flush xlog, but without specifying exactly where to flush to.
 *
 * We normally flush only completed blocks; but if there is nothing to do on
 * that basis, we check for unflushed async commits in the current incomplete
 * block, and flush through the latest one of those.  Thus, if async commits
B
Bruce Momjian 已提交
2150
 * are not being used, we will flush complete blocks only.	We can guarantee
2151
 * that async commits reach disk after at most three cycles; normally only
B
Bruce Momjian 已提交
2152
 * one or two.	(We allow XLogWrite to write "flexibly", meaning it can stop
2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164
 * at the end of the buffer ring; this makes a difference only with very high
 * load or long wal_writer_delay, but imposes one extra cycle for the worst
 * case for async commits.)
 *
 * This routine is invoked periodically by the background walwriter process.
 */
void
XLogBackgroundFlush(void)
{
	XLogRecPtr	WriteRqstPtr;
	bool		flexible = true;

2165 2166 2167 2168
	/* XLOG doesn't need flushing during recovery */
	if (RecoveryInProgress())
		return;

2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188
	/* read LogwrtResult and update local state */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		LogwrtResult = xlogctl->LogwrtResult;
		WriteRqstPtr = xlogctl->LogwrtRqst.Write;
		SpinLockRelease(&xlogctl->info_lck);
	}

	/* back off to last completed page boundary */
	WriteRqstPtr.xrecoff -= WriteRqstPtr.xrecoff % XLOG_BLCKSZ;

	/* if we have already flushed that far, consider async commit records */
	if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

2189
		SpinLockAcquire(&xlogctl->info_lck);
2190
		WriteRqstPtr = xlogctl->asyncXactLSN;
2191
		SpinLockRelease(&xlogctl->info_lck);
2192 2193 2194
		flexible = false;		/* ensure it all gets written */
	}

2195
	/*
B
Bruce Momjian 已提交
2196 2197 2198
	 * If already known flushed, we're done. Just need to check if we are
	 * holding an open file handle to a logfile that's no longer in use,
	 * preventing the file from being deleted.
2199
	 */
2200
	if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
2201
	{
B
Bruce Momjian 已提交
2202 2203
		if (openLogFile >= 0)
		{
2204 2205 2206 2207 2208
			if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
			{
				XLogFileClose();
			}
		}
2209
		return;
2210
	}
2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237

#ifdef WAL_DEBUG
	if (XLOG_DEBUG)
		elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
			 WriteRqstPtr.xlogid, WriteRqstPtr.xrecoff,
			 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
#endif

	START_CRIT_SECTION();

	/* now wait for the write lock */
	LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
	LogwrtResult = XLogCtl->Write.LogwrtResult;
	if (!XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
	{
		XLogwrtRqst WriteRqst;

		WriteRqst.Write = WriteRqstPtr;
		WriteRqst.Flush = WriteRqstPtr;
		XLogWrite(WriteRqst, flexible, false);
	}
	LWLockRelease(WALWriteLock);

	END_CRIT_SECTION();
}

2238 2239 2240 2241 2242 2243 2244 2245 2246
/*
 * Report whether XLOG data still needs flushing up to the given position.
 *
 * Returns true if a flush is still needed.  (It may be that someone else
 * is already in process of flushing that far, however.)
 */
bool
XLogNeedsFlush(XLogRecPtr record)
{
	/*
	 * During recovery WAL is not flushed; minRecoveryPoint is updated
	 * instead.  So "needs flush" means "minRecoveryPoint would need to be
	 * updated".
	 */
	if (RecoveryInProgress())
	{
		/* Cheap answer from the local copies first */
		if (XLByteLE(record, minRecoveryPoint) || !updateMinRecoveryPoint)
			return false;

		/*
		 * Refresh the local copy of minRecoveryPoint.  If the lock is busy,
		 * don't wait; just return a conservative guess.
		 */
		if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
			return true;
		minRecoveryPoint = ControlFile->minRecoveryPoint;
		LWLockRelease(ControlFileLock);

		/*
		 * An invalid minRecoveryPoint means all the WAL must be recovered,
		 * i.e., this is crash recovery.  The control file's value is never
		 * modified in that case, so future checks can be short-circuited
		 * here too.
		 */
		if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
			updateMinRecoveryPoint = false;

		/* recheck with the refreshed values */
		return updateMinRecoveryPoint && !XLByteLE(record, minRecoveryPoint);
	}

	/* Cheap answer from the local copy of LogwrtResult */
	if (XLByteLE(record, LogwrtResult.Flush))
		return false;

	/* refresh the local copy of LogwrtResult from shared memory */
	{
		/* volatile pointer keeps the compiler from rearranging the accesses */
		volatile XLogCtlData *shared = XLogCtl;

		SpinLockAcquire(&shared->info_lck);
		LogwrtResult = shared->LogwrtResult;
		SpinLockRelease(&shared->info_lck);
	}

	/* recheck with the refreshed value */
	return !XLByteLE(record, LogwrtResult.Flush);
}

T
Tom Lane 已提交
2304 2305 2306
/*
 * Create a new XLOG file segment, or open a pre-existing one.
 *
2307 2308 2309
 * log, seg: identify segment to be created/opened.
 *
 * *use_existent: if TRUE, OK to use a pre-existing file (else, any
B
Bruce Momjian 已提交
2310
 * pre-existing file will be deleted).	On return, TRUE if a pre-existing
2311 2312
 * file was used.
 *
2313
 * use_lock: if TRUE, acquire ControlFileLock while moving file into
2314
 * place.  This should be TRUE except during bootstrap log creation.  The
2315
 * caller must *not* hold the lock at call.
2316
 *
T
Tom Lane 已提交
2317
 * Returns FD of opened file.
2318 2319 2320 2321 2322
 *
 * Note: errors here are ERROR not PANIC because we might or might not be
 * inside a critical section (eg, during checkpoint there is no reason to
 * take down the system on failure).  They will promote to PANIC if we are
 * in a critical section.
T
Tom Lane 已提交
2323
 */
2324
int
2325 2326
XLogFileInit(uint32 log, uint32 seg,
			 bool *use_existent, bool use_lock)
2327
{
2328
	char		path[MAXPGPATH];
2329
	char		tmppath[MAXPGPATH];
2330
	char	   *zbuffer;
2331 2332 2333
	uint32		installed_log;
	uint32		installed_seg;
	int			max_advance;
2334
	int			fd;
2335
	int			nbytes;
2336

2337
	XLogFilePath(path, ThisTimeLineID, log, seg);
V
Vadim B. Mikheev 已提交
2338 2339

	/*
B
Bruce Momjian 已提交
2340
	 * Try to use existent file (checkpoint maker may have created it already)
V
Vadim B. Mikheev 已提交
2341
	 */
2342
	if (*use_existent)
V
Vadim B. Mikheev 已提交
2343
	{
2344
		fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2345
						   S_IRUSR | S_IWUSR);
V
Vadim B. Mikheev 已提交
2346 2347 2348
		if (fd < 0)
		{
			if (errno != ENOENT)
2349
				ereport(ERROR,
2350
						(errcode_for_file_access(),
2351
						 errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2352
								path, log, seg)));
V
Vadim B. Mikheev 已提交
2353 2354
		}
		else
2355
			return fd;
V
Vadim B. Mikheev 已提交
2356 2357
	}

2358
	/*
B
Bruce Momjian 已提交
2359 2360 2361 2362
	 * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
	 * another process is doing the same thing.  If so, we will end up
	 * pre-creating an extra log segment.  That seems OK, and better than
	 * holding the lock throughout this lengthy process.
2363
	 */
2364 2365
	elog(DEBUG2, "creating and filling new WAL file");

2366
	snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2367 2368

	unlink(tmppath);
2369

2370
	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
2371
	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
T
Tom Lane 已提交
2372
					   S_IRUSR | S_IWUSR);
2373
	if (fd < 0)
2374
		ereport(ERROR,
2375
				(errcode_for_file_access(),
2376
				 errmsg("could not create file \"%s\": %m", tmppath)));
2377

2378
	/*
B
Bruce Momjian 已提交
2379 2380 2381 2382 2383 2384 2385
	 * Zero-fill the file.	We have to do this the hard way to ensure that all
	 * the file space has really been allocated --- on platforms that allow
	 * "holes" in files, just seeking to the end doesn't allocate intermediate
	 * space.  This way, we know that we have all the space and (after the
	 * fsync below) that all the indirect blocks are down on disk.	Therefore,
	 * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
	 * log file.
2386 2387 2388 2389
	 *
	 * Note: palloc zbuffer, instead of just using a local char array, to
	 * ensure it is reasonably well-aligned; this may save a few cycles
	 * transferring data to the kernel.
2390
	 */
2391 2392
	zbuffer = (char *) palloc0(XLOG_BLCKSZ);
	for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
2393
	{
2394
		errno = 0;
2395
		if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
T
Tom Lane 已提交
2396
		{
B
Bruce Momjian 已提交
2397
			int			save_errno = errno;
T
Tom Lane 已提交
2398

B
Bruce Momjian 已提交
2399
			/*
B
Bruce Momjian 已提交
2400
			 * If we fail to make the file, delete it to release disk space
B
Bruce Momjian 已提交
2401
			 */
2402
			unlink(tmppath);
2403 2404
			/* if write didn't set errno, assume problem is no disk space */
			errno = save_errno ? save_errno : ENOSPC;
T
Tom Lane 已提交
2405

2406
			ereport(ERROR,
2407
					(errcode_for_file_access(),
B
Bruce Momjian 已提交
2408
					 errmsg("could not write to file \"%s\": %m", tmppath)));
T
Tom Lane 已提交
2409
		}
2410
	}
2411
	pfree(zbuffer);
2412

2413
	if (pg_fsync(fd) != 0)
2414
		ereport(ERROR,
2415
				(errcode_for_file_access(),
2416
				 errmsg("could not fsync file \"%s\": %m", tmppath)));
2417

2418
	if (close(fd))
2419
		ereport(ERROR,
2420 2421
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", tmppath)));
T
Tom Lane 已提交
2422

2423
	/*
2424 2425
	 * Now move the segment into place with its final name.
	 *
2426
	 * If caller didn't want to use a pre-existing file, get rid of any
B
Bruce Momjian 已提交
2427 2428 2429
	 * pre-existing file.  Otherwise, cope with possibility that someone else
	 * has created the file while we were filling ours: if so, use ours to
	 * pre-create a future log segment.
2430
	 */
2431 2432 2433 2434 2435
	installed_log = log;
	installed_seg = seg;
	max_advance = XLOGfileslop;
	if (!InstallXLogFileSegment(&installed_log, &installed_seg, tmppath,
								*use_existent, &max_advance,
2436 2437
								use_lock))
	{
2438 2439 2440 2441 2442
		/*
		 * No need for any more future segments, or InstallXLogFileSegment()
		 * failed to rename the file into place. If the rename failed, opening
		 * the file below will fail.
		 */
2443 2444 2445 2446 2447 2448 2449
		unlink(tmppath);
	}

	/* Set flag to tell caller there was no existent file */
	*use_existent = false;

	/* Now open original target segment (might not be file I just made) */
2450
	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2451 2452
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
2453
		ereport(ERROR,
2454
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
2455 2456
		   errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
				  path, log, seg)));
2457

2458 2459
	elog(DEBUG2, "done creating and filling new WAL file");

2460
	return fd;
2461 2462
}

2463 2464 2465 2466 2467 2468 2469 2470 2471
/*
 * Create a new XLOG file segment by copying a pre-existing one.
 *
 * log, seg: identify segment to be created.
 *
 * srcTLI, srclog, srcseg: identify segment to be copied (could be from
 *		a different timeline)
 *
 * Currently this is only used during recovery, and so there are no locking
 * considerations.  But we should be just as tense as XLogFileInit to avoid
 * emplacing a bogus file.
 */
 */
static void
XLogFileCopy(uint32 log, uint32 seg,
			 TimeLineID srcTLI, uint32 srclog, uint32 srcseg)
{
	char		path[MAXPGPATH];
	char		tmppath[MAXPGPATH];
2481
	char		buffer[XLOG_BLCKSZ];
2482 2483 2484 2485 2486 2487 2488 2489 2490 2491
	int			srcfd;
	int			fd;
	int			nbytes;

	/*
	 * Open the source file
	 */
	XLogFilePath(path, srcTLI, srclog, srcseg);
	srcfd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
	if (srcfd < 0)
2492
		ereport(ERROR,
2493 2494 2495 2496 2497 2498
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));

	/*
	 * Copy into a temp file name.
	 */
2499
	snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2500 2501 2502

	unlink(tmppath);

2503
	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
2504 2505 2506
	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
2507
		ereport(ERROR,
2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m", tmppath)));

	/*
	 * Do the data copying.
	 */
	for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
	{
		errno = 0;
		if ((int) read(srcfd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
		{
			if (errno != 0)
2520
				ereport(ERROR,
2521 2522 2523
						(errcode_for_file_access(),
						 errmsg("could not read file \"%s\": %m", path)));
			else
2524
				ereport(ERROR,
B
Bruce Momjian 已提交
2525
						(errmsg("not enough data in file \"%s\"", path)));
2526 2527 2528 2529 2530 2531 2532
		}
		errno = 0;
		if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
		{
			int			save_errno = errno;

			/*
B
Bruce Momjian 已提交
2533
			 * If we fail to make the file, delete it to release disk space
2534 2535 2536 2537 2538
			 */
			unlink(tmppath);
			/* if write didn't set errno, assume problem is no disk space */
			errno = save_errno ? save_errno : ENOSPC;

2539
			ereport(ERROR,
2540
					(errcode_for_file_access(),
B
Bruce Momjian 已提交
2541
					 errmsg("could not write to file \"%s\": %m", tmppath)));
2542 2543 2544 2545
		}
	}

	if (pg_fsync(fd) != 0)
2546
		ereport(ERROR,
2547 2548 2549 2550
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m", tmppath)));

	if (close(fd))
2551
		ereport(ERROR,
2552 2553 2554 2555 2556 2557 2558 2559
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", tmppath)));

	close(srcfd);

	/*
	 * Now move the segment into place with its final name.
	 */
2560
	if (!InstallXLogFileSegment(&log, &seg, tmppath, false, NULL, false))
2561
		elog(ERROR, "InstallXLogFileSegment should not have failed");
2562 2563
}

2564 2565 2566 2567 2568 2569
/*
 * Install a new XLOG segment file as a current or future log segment.
 *
 * This is used both to install a newly-created segment (which has a temp
 * filename while it's being created) and to recycle an old segment.
 *
2570 2571 2572
 * *log, *seg: identify segment to install as (or first possible target).
 * When find_free is TRUE, these are modified on return to indicate the
 * actual installation location or last segment searched.
2573 2574 2575 2576 2577 2578 2579
 *
 * tmppath: initial name of file to install.  It will be renamed into place.
 *
 * find_free: if TRUE, install the new segment at the first empty log/seg
 * number at or after the passed numbers.  If FALSE, install the new segment
 * exactly where specified, deleting any existing segment file there.
 *
2580 2581 2582 2583
 * *max_advance: maximum number of log/seg slots to advance past the starting
 * point.  Fail if no free slot is found in this range.  On return, reduced
 * by the number of slots skipped over.  (Irrelevant, and may be NULL,
 * when find_free is FALSE.)
2584
 *
2585
 * use_lock: if TRUE, acquire ControlFileLock while moving file into
2586
 * place.  This should be TRUE except during bootstrap log creation.  The
2587
 * caller must *not* hold the lock at call.
2588
 *
2589 2590 2591
 * Returns TRUE if the file was installed successfully.  FALSE indicates that
 * max_advance limit was exceeded, or an error occurred while renaming the
 * file into place.
2592 2593
 */
static bool
2594 2595
InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
					   bool find_free, int *max_advance,
2596 2597 2598
					   bool use_lock)
{
	char		path[MAXPGPATH];
2599
	struct stat stat_buf;
2600

2601
	XLogFilePath(path, ThisTimeLineID, *log, *seg);
2602 2603 2604 2605 2606

	/*
	 * We want to be sure that only one process does this at a time.
	 */
	if (use_lock)
2607
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2608

2609 2610 2611 2612 2613
	if (!find_free)
	{
		/* Force installation: get rid of any pre-existing segment file */
		unlink(path);
	}
2614 2615
	else
	{
2616
		/* Find a free slot to put it in */
2617
		while (stat(path, &stat_buf) == 0)
2618
		{
2619
			if (*max_advance <= 0)
2620 2621 2622
			{
				/* Failed to find a free slot within specified range */
				if (use_lock)
2623
					LWLockRelease(ControlFileLock);
2624 2625
				return false;
			}
2626 2627 2628
			NextLogSeg(*log, *seg);
			(*max_advance)--;
			XLogFilePath(path, ThisTimeLineID, *log, *seg);
2629 2630 2631 2632 2633 2634 2635
		}
	}

	/*
	 * Prefer link() to rename() here just to be really sure that we don't
	 * overwrite an existing logfile.  However, there shouldn't be one, so
	 * rename() is an acceptable substitute except for the truly paranoid.
2636
	 */
2637
#if HAVE_WORKING_LINK
2638
	if (link(tmppath, path) < 0)
2639 2640 2641 2642
	{
		if (use_lock)
			LWLockRelease(ControlFileLock);
		ereport(LOG,
2643
				(errcode_for_file_access(),
2644
				 errmsg("could not link file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
2645
						tmppath, path, *log, *seg)));
2646 2647
		return false;
	}
2648
	unlink(tmppath);
2649
#else
2650
	if (rename(tmppath, path) < 0)
2651
	{
2652 2653 2654
		if (use_lock)
			LWLockRelease(ControlFileLock);
		ereport(LOG,
2655
				(errcode_for_file_access(),
2656
				 errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
2657
						tmppath, path, *log, *seg)));
2658
		return false;
2659
	}
2660
#endif
V
Vadim B. Mikheev 已提交
2661

2662
	if (use_lock)
2663
		LWLockRelease(ControlFileLock);
2664

2665
	return true;
2666 2667
}

T
Tom Lane 已提交
2668
/*
2669
 * Open a pre-existing logfile segment for writing.
T
Tom Lane 已提交
2670
 */
2671
int
2672
XLogFileOpen(uint32 log, uint32 seg)
2673
{
2674 2675
	char		path[MAXPGPATH];
	int			fd;
2676

2677
	XLogFilePath(path, ThisTimeLineID, log, seg);
2678

2679
	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2680
					   S_IRUSR | S_IWUSR);
2681
	if (fd < 0)
2682 2683
		ereport(PANIC,
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
2684 2685
		   errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
				  path, log, seg)));
2686 2687 2688 2689 2690 2691

	return fd;
}

/*
 * Open a logfile segment for reading (during recovery).
2692
 *
2693
 * If source = XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
2694
 * Otherwise, it's assumed to be already available in pg_xlog.
2695 2696
 */
static int
2697
XLogFileRead(uint32 log, uint32 seg, int emode, TimeLineID tli,
2698
			 int source, bool notfoundOk)
2699 2700
{
	char		xlogfname[MAXFNAMELEN];
2701
	char		activitymsg[MAXFNAMELEN + 16];
2702
	char		path[MAXPGPATH];
2703
	int			fd;
2704

B
Bruce Momjian 已提交
2705
	XLogFileName(xlogfname, tli, log, seg);
2706

2707
	switch (source)
B
Bruce Momjian 已提交
2708
	{
2709 2710 2711 2712 2713
		case XLOG_FROM_ARCHIVE:
			/* Report recovery progress in PS display */
			snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
					 xlogfname);
			set_ps_display(activitymsg, false);
2714

2715 2716 2717 2718 2719 2720 2721 2722
			restoredFromArchive = RestoreArchivedFile(path, xlogfname,
													  "RECOVERYXLOG",
													  XLogSegSize);
			if (!restoredFromArchive)
				return -1;
			break;

		case XLOG_FROM_PG_XLOG:
2723
		case XLOG_FROM_STREAM:
2724 2725 2726 2727 2728 2729
			XLogFilePath(path, tli, log, seg);
			restoredFromArchive = false;
			break;

		default:
			elog(ERROR, "invalid XLogFileRead source %d", source);
B
Bruce Momjian 已提交
2730
	}
2731

B
Bruce Momjian 已提交
2732 2733 2734 2735 2736
	fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
	if (fd >= 0)
	{
		/* Success! */
		curFileTLI = tli;
2737

B
Bruce Momjian 已提交
2738 2739 2740 2741 2742
		/* Report recovery progress in PS display */
		snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
				 xlogfname);
		set_ps_display(activitymsg, false);

2743
		/* Track source of data in assorted state variables */
2744
		readSource = source;
2745 2746 2747 2748 2749
		XLogReceiptSource = source;
		/* In FROM_STREAM case, caller tracks receipt time, not me */
		if (source != XLOG_FROM_STREAM)
			XLogReceiptTime = GetCurrentTimestamp();

B
Bruce Momjian 已提交
2750 2751 2752 2753 2754 2755 2756 2757
		return fd;
	}
	if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
		ereport(PANIC,
				(errcode_for_file_access(),
		   errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
				  path, log, seg)));
	return -1;
2758 2759 2760 2761 2762 2763 2764 2765
}

/*
 * Open a logfile segment for reading (during recovery).
 *
 * This version searches for the segment with any TLI listed in expectedTLIs.
 */
static int
2766
XLogFileReadAnyTLI(uint32 log, uint32 seg, int emode, int sources)
2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788
{
	char		path[MAXPGPATH];
	ListCell   *cell;
	int			fd;

	/*
	 * Loop looking for a suitable timeline ID: we might need to read any of
	 * the timelines listed in expectedTLIs.
	 *
	 * We expect curFileTLI on entry to be the TLI of the preceding file in
	 * sequence, or 0 if there was no predecessor.	We do not allow curFileTLI
	 * to go backwards; this prevents us from picking up the wrong file when a
	 * parent timeline extends to higher segment numbers than the child we
	 * want to read.
	 */
	foreach(cell, expectedTLIs)
	{
		TimeLineID	tli = (TimeLineID) lfirst_int(cell);

		if (tli < curFileTLI)
			break;				/* don't bother looking at too-old TLIs */

2789 2790 2791 2792 2793 2794 2795 2796 2797
		if (sources & XLOG_FROM_ARCHIVE)
		{
			fd = XLogFileRead(log, seg, emode, tli, XLOG_FROM_ARCHIVE, true);
			if (fd != -1)
			{
				elog(DEBUG1, "got WAL segment from archive");
				return fd;
			}
		}
2798

2799
		if (sources & XLOG_FROM_PG_XLOG)
2800
		{
2801
			fd = XLogFileRead(log, seg, emode, tli, XLOG_FROM_PG_XLOG, true);
2802 2803 2804
			if (fd != -1)
				return fd;
		}
2805 2806 2807 2808 2809 2810 2811
	}

	/* Couldn't find it.  For simplicity, complain about front timeline */
	XLogFilePath(path, recoveryTargetTLI, log, seg);
	errno = ENOENT;
	ereport(emode,
			(errcode_for_file_access(),
B
Bruce Momjian 已提交
2812 2813
		   errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
				  path, log, seg)));
2814
	return -1;
2815 2816
}

2817 2818 2819 2820 2821 2822 2823 2824 2825
/*
 * Close the current logfile segment for writing.
 */
static void
XLogFileClose(void)
{
	Assert(openLogFile >= 0);

	/*
2826
	 * WAL segment files will not be re-read in normal operation, so we advise
2827
	 * the OS to release any cached pages.	But do not do so if WAL archiving
B
Bruce Momjian 已提交
2828 2829
	 * or streaming is active, because archiver and walsender process could
	 * use the cache to read the WAL segment.
2830
	 */
2831
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
2832
	if (!XLogIsNeeded())
2833
		(void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
2834
#endif
2835

2836 2837
	if (close(openLogFile))
		ereport(PANIC,
B
Bruce Momjian 已提交
2838 2839 2840
				(errcode_for_file_access(),
				 errmsg("could not close log file %u, segment %u: %m",
						openLogId, openLogSeg)));
2841 2842 2843
	openLogFile = -1;
}

2844
/*
2845
 * Attempt to retrieve the specified file from off-line archival storage.
2846
 * If successful, fill "path" with its complete path (note that this will be
2847 2848
 * a temp file name that doesn't follow the normal naming convention), and
 * return TRUE.
2849
 *
2850 2851 2852
 * If not successful, fill "path" with the name of the normal on-line file
 * (which may or may not actually exist, but we'll try to use it), and return
 * FALSE.
2853 2854 2855 2856
 *
 * For fixed-size files, the caller may pass the expected size as an
 * additional crosscheck on successful recovery.  If the file size is not
 * known, set expectedSize = 0.
2857
 */
2858 2859
static bool
RestoreArchivedFile(char *path, const char *xlogfname,
2860
					const char *recovername, off_t expectedSize)
2861
{
B
Bruce Momjian 已提交
2862 2863
	char		xlogpath[MAXPGPATH];
	char		xlogRestoreCmd[MAXPGPATH];
2864
	char		lastRestartPointFname[MAXPGPATH];
B
Bruce Momjian 已提交
2865 2866
	char	   *dp;
	char	   *endp;
2867
	const char *sp;
B
Bruce Momjian 已提交
2868
	int			rc;
2869
	bool		signaled;
2870
	struct stat stat_buf;
B
Bruce Momjian 已提交
2871 2872
	uint32		restartLog;
	uint32		restartSeg;
2873

2874
	/* In standby mode, restore_command might not be supplied */
2875
	if (recoveryRestoreCommand == NULL)
2876 2877
		goto not_available;

2878
	/*
B
Bruce Momjian 已提交
2879 2880 2881 2882
	 * When doing archive recovery, we always prefer an archived log file even
	 * if a file of the same name exists in XLOGDIR.  The reason is that the
	 * file in XLOGDIR could be an old, un-filled or partly-filled version
	 * that was copied and restored as part of backing up $PGDATA.
2883
	 *
B
Bruce Momjian 已提交
2884
	 * We could try to optimize this slightly by checking the local copy
B
Bruce Momjian 已提交
2885 2886 2887 2888
	 * lastchange timestamp against the archived copy, but we have no API to
	 * do this, nor can we guarantee that the lastchange timestamp was
	 * preserved correctly when we copied to archive. Our aim is robustness,
	 * so we elect not to do this.
2889
	 *
2890 2891 2892
	 * If we cannot obtain the log file from the archive, however, we will try
	 * to use the XLOGDIR file if it exists.  This is so that we can make use
	 * of log segments that weren't yet transferred to the archive.
2893
	 *
2894 2895 2896 2897
	 * Notice that we don't actually overwrite any files when we copy back
	 * from archive because the recoveryRestoreCommand may inadvertently
	 * restore inappropriate xlogs, or they may be corrupt, so we may wish to
	 * fallback to the segments remaining in current XLOGDIR later. The
B
Bruce Momjian 已提交
2898 2899
	 * copy-from-archive filename is always the same, ensuring that we don't
	 * run out of disk space on long recoveries.
2900
	 */
2901
	snprintf(xlogpath, MAXPGPATH, XLOGDIR "/%s", recovername);
2902 2903

	/*
2904
	 * Make sure there is no existing file named recovername.
2905 2906 2907 2908 2909 2910
	 */
	if (stat(xlogpath, &stat_buf) != 0)
	{
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
P
Peter Eisentraut 已提交
2911
					 errmsg("could not stat file \"%s\": %m",
2912 2913 2914 2915 2916 2917 2918
							xlogpath)));
	}
	else
	{
		if (unlink(xlogpath) != 0)
			ereport(FATAL,
					(errcode_for_file_access(),
2919
					 errmsg("could not remove file \"%s\": %m",
2920 2921 2922
							xlogpath)));
	}

2923 2924
	/*
	 * Calculate the archive file cutoff point for use during log shipping
2925 2926
	 * replication. All files earlier than this point can be deleted from the
	 * archive, though there is no requirement to do so.
2927 2928
	 *
	 * We initialise this with the filename of an InvalidXLogRecPtr, which
2929 2930
	 * will prevent the deletion of any WAL files from the archive because of
	 * the alphabetic sorting property of WAL filenames.
2931 2932 2933
	 *
	 * Once we have successfully located the redo pointer of the checkpoint
	 * from which we start recovery we never request a file prior to the redo
2934 2935 2936 2937
	 * pointer of the last restartpoint. When redo begins we know that we have
	 * successfully located it, so there is no need for additional status
	 * flags to signify the point when we can begin deleting WAL files from
	 * the archive.
2938 2939 2940 2941 2942 2943 2944 2945 2946
	 */
	if (InRedo)
	{
		XLByteToSeg(ControlFile->checkPointCopy.redo,
					restartLog, restartSeg);
		XLogFileName(lastRestartPointFname,
					 ControlFile->checkPointCopy.ThisTimeLineID,
					 restartLog, restartSeg);
		/* we shouldn't need anything earlier than last restart point */
2947
		Assert(strcmp(lastRestartPointFname, xlogfname) <= 0);
2948 2949 2950 2951
	}
	else
		XLogFileName(lastRestartPointFname, 0, 0, 0);

2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965
	/*
	 * construct the command to be executed
	 */
	dp = xlogRestoreCmd;
	endp = xlogRestoreCmd + MAXPGPATH - 1;
	*endp = '\0';

	for (sp = recoveryRestoreCommand; *sp; sp++)
	{
		if (*sp == '%')
		{
			switch (sp[1])
			{
				case 'p':
2966
					/* %p: relative path of target file */
2967
					sp++;
B
Bruce Momjian 已提交
2968
					StrNCpy(dp, xlogpath, endp - dp);
2969
					make_native_path(dp);
2970 2971 2972 2973 2974
					dp += strlen(dp);
					break;
				case 'f':
					/* %f: filename of desired file */
					sp++;
B
Bruce Momjian 已提交
2975
					StrNCpy(dp, xlogfname, endp - dp);
2976 2977
					dp += strlen(dp);
					break;
2978 2979 2980 2981 2982 2983
				case 'r':
					/* %r: filename of last restartpoint */
					sp++;
					StrNCpy(dp, lastRestartPointFname, endp - dp);
					dp += strlen(dp);
					break;
2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005
				case '%':
					/* convert %% to a single % */
					sp++;
					if (dp < endp)
						*dp++ = *sp;
					break;
				default:
					/* otherwise treat the % as not special */
					if (dp < endp)
						*dp++ = *sp;
					break;
			}
		}
		else
		{
			if (dp < endp)
				*dp++ = *sp;
		}
	}
	*dp = '\0';

	ereport(DEBUG3,
B
Bruce Momjian 已提交
3006
			(errmsg_internal("executing restore command \"%s\"",
3007 3008
							 xlogRestoreCmd)));

3009 3010
	/*
	 * Set in_restore_command to tell the signal handler that we should exit
3011
	 * right away on SIGTERM. We know that we're at a safe point to do that.
3012 3013 3014 3015 3016
	 * Check if we had already received the signal, so that we don't miss a
	 * shutdown request received just before this.
	 */
	in_restore_command = true;
	if (shutdown_requested)
3017
		proc_exit(1);
3018

3019
	/*
3020
	 * Copy xlog from archival storage to XLOGDIR
3021 3022
	 */
	rc = system(xlogRestoreCmd);
3023 3024 3025

	in_restore_command = false;

3026 3027
	if (rc == 0)
	{
3028 3029 3030 3031 3032 3033 3034
		/*
		 * command apparently succeeded, but let's make sure the file is
		 * really there now and has the correct size.
		 */
		if (stat(xlogpath, &stat_buf) == 0)
		{
			if (expectedSize > 0 && stat_buf.st_size != expectedSize)
3035
			{
B
Bruce Momjian 已提交
3036
				int			elevel;
3037 3038 3039 3040 3041 3042 3043

				/*
				 * If we find a partial file in standby mode, we assume it's
				 * because it's just being copied to the archive, and keep
				 * trying.
				 *
				 * Otherwise treat a wrong-sized file as FATAL to ensure the
B
Bruce Momjian 已提交
3044
				 * DBA would notice it, but is that too strong? We could try
3045 3046
				 * to plow ahead with a local copy of the file ... but the
				 * problem is that there probably isn't one, and we'd
B
Bruce Momjian 已提交
3047 3048
				 * incorrectly conclude we've reached the end of WAL and we're
				 * done recovering ...
3049 3050 3051 3052 3053 3054
				 */
				if (StandbyMode && stat_buf.st_size < expectedSize)
					elevel = DEBUG1;
				else
					elevel = FATAL;
				ereport(elevel,
3055 3056 3057 3058
						(errmsg("archive file \"%s\" has wrong size: %lu instead of %lu",
								xlogfname,
								(unsigned long) stat_buf.st_size,
								(unsigned long) expectedSize)));
3059 3060
				return false;
			}
3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075
			else
			{
				ereport(LOG,
						(errmsg("restored log file \"%s\" from archive",
								xlogfname)));
				strcpy(path, xlogpath);
				return true;
			}
		}
		else
		{
			/* stat failed */
			if (errno != ENOENT)
				ereport(FATAL,
						(errcode_for_file_access(),
P
Peter Eisentraut 已提交
3076
						 errmsg("could not stat file \"%s\": %m",
3077
								xlogpath)));
3078 3079 3080 3081
		}
	}

	/*
3082
	 * Remember, we rollforward UNTIL the restore fails so failure here is
B
Bruce Momjian 已提交
3083
	 * just part of the process... that makes it difficult to determine
B
Bruce Momjian 已提交
3084 3085 3086
	 * whether the restore failed because there isn't an archive to restore,
	 * or because the administrator has specified the restore program
	 * incorrectly.  We have to assume the former.
3087 3088
	 *
	 * However, if the failure was due to any sort of signal, it's best to
B
Bruce Momjian 已提交
3089 3090 3091
	 * punt and abort recovery.  (If we "return false" here, upper levels will
	 * assume that recovery is complete and start up the database!) It's
	 * essential to abort on child SIGINT and SIGQUIT, because per spec
3092
	 * system() ignores SIGINT and SIGQUIT while waiting; if we see one of
3093 3094 3095 3096 3097
	 * those it's a good bet we should have gotten it too.
	 *
	 * On SIGTERM, assume we have received a fast shutdown request, and exit
	 * cleanly. It's pure chance whether we receive the SIGTERM first, or the
	 * child process. If we receive it first, the signal handler will call
3098 3099 3100
	 * proc_exit, otherwise we do it here. If we or the child process received
	 * SIGTERM for any other reason than a fast shutdown request, postmaster
	 * will perform an immediate shutdown when it sees us exiting
3101
	 * unexpectedly.
3102
	 *
B
Bruce Momjian 已提交
3103 3104 3105 3106
	 * Per the Single Unix Spec, shells report exit status > 128 when a called
	 * command died on a signal.  Also, 126 and 127 are used to report
	 * problems such as an unfindable command; treat those as fatal errors
	 * too.
3107
	 */
3108
	if (WIFSIGNALED(rc) && WTERMSIG(rc) == SIGTERM)
3109
		proc_exit(1);
3110

3111 3112 3113
	signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;

	ereport(signaled ? FATAL : DEBUG2,
B
Bruce Momjian 已提交
3114 3115
		(errmsg("could not restore file \"%s\" from archive: return code %d",
				xlogfname, rc)));
3116

3117
not_available:
B
Bruce Momjian 已提交
3118

3119
	/*
B
Bruce Momjian 已提交
3120 3121
	 * if an archived file is not available, there might still be a version of
	 * this file in XLOGDIR, so return that as the filename to open.
3122
	 *
B
Bruce Momjian 已提交
3123 3124
	 * In many recovery scenarios we expect this to fail also, but if so that
	 * just means we've reached the end of WAL.
3125
	 */
3126
	snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
3127
	return false;
3128 3129
}

3130
/*
3131 3132 3133 3134
 * Attempt to execute an external shell command during recovery.
 *
 * 'command' is the shell command to be executed, 'commandName' is a
 * human-readable name describing the command emitted in the logs. If
3135
 * 'failOnSignal' is true and the command is killed by a signal, a FATAL
3136 3137
 * error is thrown. Otherwise a WARNING is emitted.
 *
3138
 * This is currently used for recovery_end_command and archive_cleanup_command.
3139 3140
 */
static void
3141
ExecuteRecoveryCommand(char *command, char *commandName, bool failOnSignal)
3142
{
3143
	char		xlogRecoveryCmd[MAXPGPATH];
3144 3145 3146 3147 3148 3149 3150 3151 3152
	char		lastRestartPointFname[MAXPGPATH];
	char	   *dp;
	char	   *endp;
	const char *sp;
	int			rc;
	bool		signaled;
	uint32		restartLog;
	uint32		restartSeg;

3153
	Assert(command && commandName);
3154 3155 3156

	/*
	 * Calculate the archive file cutoff point for use during log shipping
3157 3158
	 * replication. All files earlier than this point can be deleted from the
	 * archive, though there is no requirement to do so.
3159
	 */
3160 3161 3162 3163 3164 3165 3166
	LWLockAcquire(ControlFileLock, LW_SHARED);
	XLByteToSeg(ControlFile->checkPointCopy.redo,
				restartLog, restartSeg);
	XLogFileName(lastRestartPointFname,
				 ControlFile->checkPointCopy.ThisTimeLineID,
				 restartLog, restartSeg);
	LWLockRelease(ControlFileLock);
3167 3168 3169 3170

	/*
	 * construct the command to be executed
	 */
3171 3172
	dp = xlogRecoveryCmd;
	endp = xlogRecoveryCmd + MAXPGPATH - 1;
3173 3174
	*endp = '\0';

3175
	for (sp = command; *sp; sp++)
3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208
	{
		if (*sp == '%')
		{
			switch (sp[1])
			{
				case 'r':
					/* %r: filename of last restartpoint */
					sp++;
					StrNCpy(dp, lastRestartPointFname, endp - dp);
					dp += strlen(dp);
					break;
				case '%':
					/* convert %% to a single % */
					sp++;
					if (dp < endp)
						*dp++ = *sp;
					break;
				default:
					/* otherwise treat the % as not special */
					if (dp < endp)
						*dp++ = *sp;
					break;
			}
		}
		else
		{
			if (dp < endp)
				*dp++ = *sp;
		}
	}
	*dp = '\0';

	ereport(DEBUG3,
3209
			(errmsg_internal("executing %s \"%s\"", commandName, command)));
3210 3211

	/*
T
Tom Lane 已提交
3212
	 * execute the constructed command
3213
	 */
3214
	rc = system(xlogRecoveryCmd);
3215 3216 3217 3218
	if (rc != 0)
	{
		/*
		 * If the failure was due to any sort of signal, it's best to punt and
3219
		 * abort recovery. See also detailed comments on signals in
3220 3221 3222 3223
		 * RestoreArchivedFile().
		 */
		signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;

3224 3225 3226 3227 3228 3229 3230
		/*
		 * translator: First %s represents a recovery.conf parameter name like
		 * "recovery_end_command", and the 2nd is the value of that parameter.
		 */
		ereport((signaled && failOnSignal) ? FATAL : WARNING,
				(errmsg("%s \"%s\": return code %d", commandName,
						command, rc)));
3231 3232 3233
	}
}

V
Vadim B. Mikheev 已提交
3234
/*
3235 3236 3237 3238 3239 3240 3241 3242
 * Preallocate log files beyond the specified log endpoint.
 *
 * XXX this is currently extremely conservative, since it forces only one
 * future log segment to exist, and even that only if we are 75% done with
 * the current one.  This is only appropriate for very low-WAL-volume systems.
 * High-volume systems will be OK once they've built up a sufficient set of
 * recycled log segments, but the startup transient is likely to include
 * a lot of segment creations by foreground processes, which is not so good.
T
Tom Lane 已提交
3243
 */
3244
static void
T
Tom Lane 已提交
3245 3246 3247 3248 3249
PreallocXlogFiles(XLogRecPtr endptr)
{
	uint32		_logId;
	uint32		_logSeg;
	int			lf;
3250
	bool		use_existent;
T
Tom Lane 已提交
3251 3252

	XLByteToPrevSeg(endptr, _logId, _logSeg);
B
Bruce Momjian 已提交
3253
	if ((endptr.xrecoff - 1) % XLogSegSize >=
B
Bruce Momjian 已提交
3254
		(uint32) (0.75 * XLogSegSize))
T
Tom Lane 已提交
3255 3256
	{
		NextLogSeg(_logId, _logSeg);
3257 3258
		use_existent = true;
		lf = XLogFileInit(_logId, _logSeg, &use_existent, true);
T
Tom Lane 已提交
3259
		close(lf);
3260
		if (!use_existent)
3261
			CheckpointStats.ckpt_segs_added++;
T
Tom Lane 已提交
3262 3263 3264
	}
}

3265
/*
 * Get the log/seg of the latest removed or recycled WAL segment.
 * Returns 0/0 if no WAL segments have been removed since startup.
 *
 * *log, *seg: output; set from XLogCtl's lastRemovedLog / lastRemovedSeg,
 * read together while holding info_lck.
 */
void
XLogGetLastRemoved(uint32 *log, uint32 *seg)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

	SpinLockAcquire(&xlogctl->info_lck);
	*log = xlogctl->lastRemovedLog;
	*seg = xlogctl->lastRemovedSeg;
	SpinLockRelease(&xlogctl->info_lck);
}

3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305
/*
 * Update the last removed log/seg pointer in shared memory, to reflect
 * that the given XLOG file has been removed.
 *
 * filename: name of the removed segment file; its log and seg numbers are
 * extracted with XLogFromFileName (the timeline part is not used).
 *
 * The shared pointer only ever moves forward: it is left alone unless the
 * given segment is later than the one already recorded.
 */
static void
UpdateLastRemovedPtr(char *filename)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
	uint32		tli,
				log,
				seg;

	XLogFromFileName(filename, &tli, &log, &seg);

	SpinLockAcquire(&xlogctl->info_lck);
	if (log > xlogctl->lastRemovedLog ||
		(log == xlogctl->lastRemovedLog && seg > xlogctl->lastRemovedSeg))
	{
		xlogctl->lastRemovedLog = log;
		xlogctl->lastRemovedSeg = seg;
	}
	SpinLockRelease(&xlogctl->info_lck);
}

T
Tom Lane 已提交
3306
/*
3307
 * Recycle or remove all log files older or equal to passed log/seg#
3308 3309 3310
 *
 * endptr is current (or recent) end of xlog; this is used to determine
 * whether we want to recycle rather than delete no-longer-wanted log files.
V
Vadim B. Mikheev 已提交
3311 3312
 */
static void
3313
RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr)
V
Vadim B. Mikheev 已提交
3314
{
3315 3316
	uint32		endlogId;
	uint32		endlogSeg;
3317
	int			max_advance;
B
Bruce Momjian 已提交
3318 3319
	DIR		   *xldir;
	struct dirent *xlde;
3320
	char		lastoff[MAXFNAMELEN];
B
Bruce Momjian 已提交
3321
	char		path[MAXPGPATH];
B
Bruce Momjian 已提交
3322

3323 3324 3325
#ifdef WIN32
	char		newpath[MAXPGPATH];
#endif
3326
	struct stat statbuf;
3327

3328 3329 3330 3331
	/*
	 * Initialize info about where to try to recycle to.  We allow recycling
	 * segments up to XLOGfileslop segments beyond the current XLOG location.
	 */
3332
	XLByteToPrevSeg(endptr, endlogId, endlogSeg);
3333
	max_advance = XLOGfileslop;
V
Vadim B. Mikheev 已提交
3334

3335
	xldir = AllocateDir(XLOGDIR);
V
Vadim B. Mikheev 已提交
3336
	if (xldir == NULL)
3337
		ereport(ERROR,
3338
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
3339 3340
				 errmsg("could not open transaction log directory \"%s\": %m",
						XLOGDIR)));
V
Vadim B. Mikheev 已提交
3341

3342
	XLogFileName(lastoff, ThisTimeLineID, log, seg);
V
Vadim B. Mikheev 已提交
3343

3344 3345 3346
	elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
		 lastoff);

3347
	while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
V
Vadim B. Mikheev 已提交
3348
	{
3349
		/*
3350
		 * We ignore the timeline part of the XLOG segment identifiers in
B
Bruce Momjian 已提交
3351 3352 3353 3354 3355
		 * deciding whether a segment is still needed.	This ensures that we
		 * won't prematurely remove a segment from a parent timeline. We could
		 * probably be a little more proactive about removing segments of
		 * non-parent timelines, but that would be a whole lot more
		 * complicated.
3356
		 *
B
Bruce Momjian 已提交
3357 3358
		 * We use the alphanumeric sorting property of the filenames to decide
		 * which ones are earlier than the lastoff segment.
3359
		 */
3360 3361 3362
		if (strlen(xlde->d_name) == 24 &&
			strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
			strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
V
Vadim B. Mikheev 已提交
3363
		{
3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374
			/*
			 * Normally we don't delete old XLOG files during recovery to
			 * avoid accidentally deleting a file that looks stale due to a
			 * bug or hardware issue, but in fact contains important data.
			 * During streaming recovery, however, we will eventually fill the
			 * disk if we never clean up, so we have to. That's not an issue
			 * with file-based archive recovery because in that case we
			 * restore one XLOG file at a time, on-demand, and with a
			 * different filename that can't be confused with regular XLOG
			 * files.
			 */
3375
			if (WalRcvInProgress() || XLogArchiveCheckDone(xlde->d_name))
3376
			{
3377
				snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3378

3379 3380 3381
				/* Update the last removed location in shared memory first */
				UpdateLastRemovedPtr(xlde->d_name);

3382
				/*
B
Bruce Momjian 已提交
3383
				 * Before deleting the file, see if it can be recycled as a
3384 3385 3386
				 * future log segment. Only recycle normal files, pg_standby
				 * for example can create symbolic links pointing to a
				 * separate archive directory.
3387
				 */
3388 3389 3390
				if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
					InstallXLogFileSegment(&endlogId, &endlogSeg, path,
										   true, &max_advance, true))
3391
				{
3392
					ereport(DEBUG2,
B
Bruce Momjian 已提交
3393 3394
							(errmsg("recycled transaction log file \"%s\"",
									xlde->d_name)));
3395
					CheckpointStats.ckpt_segs_recycled++;
3396 3397 3398 3399 3400 3401
					/* Needn't recheck that slot on future iterations */
					if (max_advance > 0)
					{
						NextLogSeg(endlogId, endlogSeg);
						max_advance--;
					}
3402 3403 3404 3405
				}
				else
				{
					/* No need for any more future segments... */
B
Bruce Momjian 已提交
3406
					int			rc;
3407

3408
					ereport(DEBUG2,
B
Bruce Momjian 已提交
3409 3410
							(errmsg("removing transaction log file \"%s\"",
									xlde->d_name)));
3411 3412

#ifdef WIN32
B
Bruce Momjian 已提交
3413

3414 3415 3416 3417
					/*
					 * On Windows, if another process (e.g another backend)
					 * holds the file open in FILE_SHARE_DELETE mode, unlink
					 * will succeed, but the file will still show up in
B
Bruce Momjian 已提交
3418 3419 3420 3421
					 * directory listing until the last handle is closed. To
					 * avoid confusing the lingering deleted file for a live
					 * WAL file that needs to be archived, rename it before
					 * deleting it.
3422 3423 3424 3425 3426 3427 3428
					 *
					 * If another process holds the file open without
					 * FILE_SHARE_DELETE flag, rename will fail. We'll try
					 * again at the next checkpoint.
					 */
					snprintf(newpath, MAXPGPATH, "%s.deleted", path);
					if (rename(path, newpath) != 0)
3429 3430
					{
						ereport(LOG,
3431
								(errcode_for_file_access(),
3432
								 errmsg("could not rename old transaction log file \"%s\": %m",
3433
										path)));
3434 3435
						continue;
					}
3436 3437 3438 3439 3440
					rc = unlink(newpath);
#else
					rc = unlink(path);
#endif
					if (rc != 0)
3441 3442
					{
						ereport(LOG,
3443 3444 3445
								(errcode_for_file_access(),
								 errmsg("could not remove old transaction log file \"%s\": %m",
										path)));
3446 3447
						continue;
					}
3448
					CheckpointStats.ckpt_segs_removed++;
3449
				}
3450 3451

				XLogArchiveCleanup(xlde->d_name);
3452
			}
V
Vadim B. Mikheev 已提交
3453 3454
		}
	}
B
Bruce Momjian 已提交
3455

3456
	FreeDir(xldir);
V
Vadim B. Mikheev 已提交
3457 3458
}

3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475
/*
 * Verify whether pg_xlog and pg_xlog/archive_status exist.
 * If the latter does not exist, recreate it.
 *
 * It is not the goal of this function to verify the contents of these
 * directories, but to help in cases where someone has performed a cluster
 * copy for PITR purposes but omitted pg_xlog from the copy.
 *
 * We could also recreate pg_xlog if it doesn't exist, but a deliberate
 * policy decision was made not to.  It is fairly common for pg_xlog to be
 * a symlink, and if that was the DBA's intent then automatically making a
 * plain directory would result in degraded performance with no notice.
 */
static void
ValidateXLOGDirectoryStructure(void)
{
	char		path[MAXPGPATH];
3476
	struct stat stat_buf;
3477 3478 3479 3480

	/* Check for pg_xlog; if it doesn't exist, error out */
	if (stat(XLOGDIR, &stat_buf) != 0 ||
		!S_ISDIR(stat_buf.st_mode))
3481
		ereport(FATAL,
3482 3483 3484 3485 3486 3487 3488 3489 3490
				(errmsg("required WAL directory \"%s\" does not exist",
						XLOGDIR)));

	/* Check for archive_status */
	snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
	if (stat(path, &stat_buf) == 0)
	{
		/* Check for weird cases where it exists but isn't a directory */
		if (!S_ISDIR(stat_buf.st_mode))
3491
			ereport(FATAL,
3492 3493 3494 3495 3496 3497 3498
					(errmsg("required WAL directory \"%s\" does not exist",
							path)));
	}
	else
	{
		ereport(LOG,
				(errmsg("creating missing WAL directory \"%s\"", path)));
3499
		if (mkdir(path, S_IRWXU) < 0)
3500
			ereport(FATAL,
3501 3502 3503 3504 3505
					(errmsg("could not create missing directory \"%s\": %m",
							path)));
	}
}

3506
/*
3507 3508 3509
 * Remove previous backup history files.  This also retries creation of
 * .ready files for any backup history files for which XLogArchiveNotify
 * failed earlier.
3510 3511
 */
static void
3512
CleanupBackupHistory(void)
3513 3514 3515 3516 3517
{
	DIR		   *xldir;
	struct dirent *xlde;
	char		path[MAXPGPATH];

3518
	xldir = AllocateDir(XLOGDIR);
3519 3520 3521
	if (xldir == NULL)
		ereport(ERROR,
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
3522 3523
				 errmsg("could not open transaction log directory \"%s\": %m",
						XLOGDIR)));
3524

3525
	while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3526 3527 3528 3529 3530 3531
	{
		if (strlen(xlde->d_name) > 24 &&
			strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
			strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
				   ".backup") == 0)
		{
3532
			if (XLogArchiveCheckDone(xlde->d_name))
3533 3534
			{
				ereport(DEBUG2,
B
Bruce Momjian 已提交
3535 3536
				(errmsg("removing transaction log backup history file \"%s\"",
						xlde->d_name)));
3537
				snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3538 3539 3540 3541 3542 3543 3544 3545 3546
				unlink(path);
				XLogArchiveCleanup(xlde->d_name);
			}
		}
	}

	FreeDir(xldir);
}

T
Tom Lane 已提交
3547 3548 3549 3550
/*
 * Restore the backup blocks present in an XLOG record, if any.
 *
 * We assume all of the record has been read into memory at *record.
3551 3552 3553 3554 3555 3556 3557 3558 3559
 *
 * Note: when a backup block is available in XLOG, we restore it
 * unconditionally, even if the page in the database appears newer.
 * This is to protect ourselves against database pages that were partially
 * or incorrectly written during a crash.  We assume that the XLOG data
 * must be good because it has passed a CRC check, while the database
 * page might not be.  This will force us to replay all subsequent
 * modifications of the page that appear in XLOG, rather than possibly
 * ignoring them as already applied, but that's not a huge drawback.
3560 3561
 *
 * If 'cleanup' is true, a cleanup lock is used when restoring blocks.
3562 3563 3564 3565 3566
 * Otherwise, a normal exclusive lock is used.	During crash recovery, that's
 * just pro forma because there can't be any regular backends in the system,
 * but in hot standby mode the distinction is important. The 'cleanup'
 * argument applies to all backup blocks in the WAL record, that suffices for
 * now.
T
Tom Lane 已提交
3567
 */
3568 3569
void
RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup)
3570 3571 3572 3573 3574 3575 3576
{
	Buffer		buffer;
	Page		page;
	BkpBlock	bkpb;
	char	   *blk;
	int			i;

3577 3578 3579
	if (!(record->xl_info & XLR_BKP_BLOCK_MASK))
		return;

B
Bruce Momjian 已提交
3580
	blk = (char *) XLogRecGetData(record) + record->xl_len;
T
Tom Lane 已提交
3581
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
3582
	{
T
Tom Lane 已提交
3583
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
3584 3585
			continue;

3586
		memcpy(&bkpb, blk, sizeof(BkpBlock));
3587 3588
		blk += sizeof(BkpBlock);

3589 3590
		buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
										RBM_ZERO);
3591
		Assert(BufferIsValid(buffer));
3592 3593 3594 3595 3596
		if (cleanup)
			LockBufferForCleanup(buffer);
		else
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

3597
		page = (Page) BufferGetPage(buffer);
3598

3599
		if (bkpb.hole_length == 0)
3600
		{
3601 3602 3603 3604 3605 3606 3607 3608 3609 3610
			memcpy((char *) page, blk, BLCKSZ);
		}
		else
		{
			/* must zero-fill the hole */
			MemSet((char *) page, 0, BLCKSZ);
			memcpy((char *) page, blk, bkpb.hole_offset);
			memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
				   blk + bkpb.hole_offset,
				   BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
3611 3612
		}

3613 3614
		PageSetLSN(page, lsn);
		PageSetTLI(page, ThisTimeLineID);
3615 3616
		MarkBufferDirty(buffer);
		UnlockReleaseBuffer(buffer);
3617

3618
		blk += BLCKSZ - bkpb.hole_length;
3619 3620 3621
	}
}

T
Tom Lane 已提交
3622 3623 3624 3625 3626 3627 3628
/*
 * CRC-check an XLOG record.  We do not believe the contents of an XLOG
 * record (other than to the minimal extent of computing the amount of
 * data to read in) until we've checked the CRCs.
 *
 * We assume all of the record has been read into memory at *record.
 */
3629 3630 3631
static bool
RecordIsValid(XLogRecord *record, XLogRecPtr recptr, int emode)
{
3632
	pg_crc32	crc;
3633 3634
	int			i;
	uint32		len = record->xl_len;
3635
	BkpBlock	bkpb;
3636 3637
	char	   *blk;

3638 3639 3640
	/* First the rmgr data */
	INIT_CRC32(crc);
	COMP_CRC32(crc, XLogRecGetData(record), len);
3641

3642
	/* Add in the backup blocks, if any */
B
Bruce Momjian 已提交
3643
	blk = (char *) XLogRecGetData(record) + len;
T
Tom Lane 已提交
3644
	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
3645
	{
B
Bruce Momjian 已提交
3646
		uint32		blen;
3647

T
Tom Lane 已提交
3648
		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
3649 3650
			continue;

3651 3652
		memcpy(&bkpb, blk, sizeof(BkpBlock));
		if (bkpb.hole_offset + bkpb.hole_length > BLCKSZ)
3653
		{
3654
			ereport(emode_for_corrupt_record(emode, recptr),
3655 3656 3657
					(errmsg("incorrect hole size in record at %X/%X",
							recptr.xlogid, recptr.xrecoff)));
			return false;
3658
		}
3659 3660 3661 3662 3663 3664 3665 3666
		blen = sizeof(BkpBlock) + BLCKSZ - bkpb.hole_length;
		COMP_CRC32(crc, blk, blen);
		blk += blen;
	}

	/* Check that xl_tot_len agrees with our calculation */
	if (blk != (char *) record + record->xl_tot_len)
	{
3667
		ereport(emode_for_corrupt_record(emode, recptr),
3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679
				(errmsg("incorrect total length in record at %X/%X",
						recptr.xlogid, recptr.xrecoff)));
		return false;
	}

	/* Finally include the record header */
	COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),
			   SizeOfXLogRecord - sizeof(pg_crc32));
	FIN_CRC32(crc);

	if (!EQ_CRC32(record->xl_crc, crc))
	{
3680
		ereport(emode_for_corrupt_record(emode, recptr),
B
Bruce Momjian 已提交
3681 3682
		(errmsg("incorrect resource manager data checksum in record at %X/%X",
				recptr.xlogid, recptr.xrecoff)));
3683
		return false;
3684 3685
	}

3686
	return true;
3687 3688
}

T
Tom Lane 已提交
3689 3690 3691 3692 3693 3694
/*
 * Attempt to read an XLOG record.
 *
 * If RecPtr is not NULL, try to read a record at that position.  Otherwise
 * try to read a record just after the last one previously read.
 *
3695
 * If no valid record is available, returns NULL, or fails if emode is PANIC.
3696
 * (emode must be either PANIC, LOG)
T
Tom Lane 已提交
3697
 *
3698 3699
 * The record is copied into readRecordBuf, so that on successful return,
 * the returned record pointer always points there.
T
Tom Lane 已提交
3700
 */
3701
static XLogRecord *
3702
ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt)
3703
{
3704
	XLogRecord *record;
3705
	char	   *buffer;
3706
	XLogRecPtr	tmpRecPtr = EndRecPtr;
3707
	bool		randAccess = false;
T
Tom Lane 已提交
3708 3709
	uint32		len,
				total_len;
3710 3711
	uint32		targetRecOff;
	uint32		pageHeaderSize;
T
Tom Lane 已提交
3712 3713 3714 3715

	if (readBuf == NULL)
	{
		/*
B
Bruce Momjian 已提交
3716 3717 3718 3719 3720
		 * First time through, permanently allocate readBuf.  We do it this
		 * way, rather than just making a static array, for two reasons: (1)
		 * no need to waste the storage in most instantiations of the backend;
		 * (2) a static char array isn't guaranteed to have any particular
		 * alignment, whereas malloc() will provide MAXALIGN'd storage.
T
Tom Lane 已提交
3721
		 */
3722
		readBuf = (char *) malloc(XLOG_BLCKSZ);
T
Tom Lane 已提交
3723 3724
		Assert(readBuf != NULL);
	}
3725

T
Tom Lane 已提交
3726
	if (RecPtr == NULL)
3727
	{
3728
		RecPtr = &tmpRecPtr;
3729 3730

		/*
B
Bruce Momjian 已提交
3731 3732
		 * Align recptr to next page if no more records can fit on the current
		 * page.
3733
		 */
3734 3735
		if (XLOG_BLCKSZ - (RecPtr->xrecoff % XLOG_BLCKSZ) < SizeOfXLogRecord)
		{
3736
			NextLogPage(tmpRecPtr);
3737 3738
			/* We will account for page header size below */
		}
3739 3740 3741 3742 3743 3744

		if (tmpRecPtr.xrecoff >= XLogFileSize)
		{
			(tmpRecPtr.xlogid)++;
			tmpRecPtr.xrecoff = 0;
		}
3745 3746 3747 3748 3749 3750 3751
	}
	else
	{
		if (!XRecOffIsValid(RecPtr->xrecoff))
			ereport(PANIC,
					(errmsg("invalid record offset at %X/%X",
							RecPtr->xlogid, RecPtr->xrecoff)));
B
Bruce Momjian 已提交
3752

3753
		/*
B
Bruce Momjian 已提交
3754 3755 3756 3757 3758
		 * Since we are going to a random position in WAL, forget any prior
		 * state about what timeline we were in, and allow it to be any
		 * timeline in expectedTLIs.  We also set a flag to allow curFileTLI
		 * to go backwards (but we can't reset that variable right here, since
		 * we might not change files at all).
3759 3760 3761
		 */
		lastPageTLI = 0;		/* see comment in ValidXLOGHeader */
		randAccess = true;		/* allow curFileTLI to go backwards too */
3762 3763
	}

3764 3765 3766
	/* This is the first try to read this page. */
	failedSources = 0;
retry:
3767 3768 3769
	/* Read the page containing the record */
	if (!XLogPageRead(RecPtr, emode, fetching_ckpt, randAccess))
		return NULL;
3770

3771
	pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
3772
	targetRecOff = RecPtr->xrecoff % XLOG_BLCKSZ;
3773 3774 3775
	if (targetRecOff == 0)
	{
		/*
B
Bruce Momjian 已提交
3776 3777 3778
		 * Can only get here in the continuing-from-prev-page case, because
		 * XRecOffIsValid eliminated the zero-page-offset case otherwise. Need
		 * to skip over the new page's header.
3779 3780 3781 3782 3783 3784
		 */
		tmpRecPtr.xrecoff += pageHeaderSize;
		targetRecOff = pageHeaderSize;
	}
	else if (targetRecOff < pageHeaderSize)
	{
3785
		ereport(emode_for_corrupt_record(emode, *RecPtr),
3786 3787 3788 3789
				(errmsg("invalid record offset at %X/%X",
						RecPtr->xlogid, RecPtr->xrecoff)));
		goto next_record_is_invalid;
	}
T
Tom Lane 已提交
3790
	if ((((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) &&
3791
		targetRecOff == pageHeaderSize)
3792
	{
3793
		ereport(emode_for_corrupt_record(emode, *RecPtr),
3794 3795
				(errmsg("contrecord is requested by %X/%X",
						RecPtr->xlogid, RecPtr->xrecoff)));
3796 3797
		goto next_record_is_invalid;
	}
3798
	record = (XLogRecord *) ((char *) readBuf + RecPtr->xrecoff % XLOG_BLCKSZ);
3799

T
Tom Lane 已提交
3800
	/*
B
Bruce Momjian 已提交
3801 3802
	 * xl_len == 0 is bad data for everything except XLOG SWITCH, where it is
	 * required.
T
Tom Lane 已提交
3803
	 */
3804 3805 3806 3807
	if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
	{
		if (record->xl_len != 0)
		{
3808
			ereport(emode_for_corrupt_record(emode, *RecPtr),
3809 3810 3811 3812 3813 3814
					(errmsg("invalid xlog switch record at %X/%X",
							RecPtr->xlogid, RecPtr->xrecoff)));
			goto next_record_is_invalid;
		}
	}
	else if (record->xl_len == 0)
3815
	{
3816
		ereport(emode_for_corrupt_record(emode, *RecPtr),
3817 3818
				(errmsg("record with zero length at %X/%X",
						RecPtr->xlogid, RecPtr->xrecoff)));
3819 3820
		goto next_record_is_invalid;
	}
3821 3822 3823 3824
	if (record->xl_tot_len < SizeOfXLogRecord + record->xl_len ||
		record->xl_tot_len > SizeOfXLogRecord + record->xl_len +
		XLR_MAX_BKP_BLOCKS * (sizeof(BkpBlock) + BLCKSZ))
	{
3825
		ereport(emode_for_corrupt_record(emode, *RecPtr),
3826 3827 3828 3829
				(errmsg("invalid record length at %X/%X",
						RecPtr->xlogid, RecPtr->xrecoff)));
		goto next_record_is_invalid;
	}
3830 3831
	if (record->xl_rmid > RM_MAX_ID)
	{
3832
		ereport(emode_for_corrupt_record(emode, *RecPtr),
3833
				(errmsg("invalid resource manager ID %u at %X/%X",
B
Bruce Momjian 已提交
3834
						record->xl_rmid, RecPtr->xlogid, RecPtr->xrecoff)));
3835 3836
		goto next_record_is_invalid;
	}
3837 3838 3839
	if (randAccess)
	{
		/*
B
Bruce Momjian 已提交
3840 3841
		 * We can't exactly verify the prev-link, but surely it should be less
		 * than the record's own address.
3842 3843 3844
		 */
		if (!XLByteLT(record->xl_prev, *RecPtr))
		{
3845
			ereport(emode_for_corrupt_record(emode, *RecPtr),
3846 3847 3848 3849 3850 3851 3852 3853 3854
					(errmsg("record with incorrect prev-link %X/%X at %X/%X",
							record->xl_prev.xlogid, record->xl_prev.xrecoff,
							RecPtr->xlogid, RecPtr->xrecoff)));
			goto next_record_is_invalid;
		}
	}
	else
	{
		/*
B
Bruce Momjian 已提交
3855 3856 3857
		 * Record's prev-link should exactly match our previous location. This
		 * check guards against torn WAL pages where a stale but valid-looking
		 * WAL record starts on a sector boundary.
3858 3859 3860
		 */
		if (!XLByteEQ(record->xl_prev, ReadRecPtr))
		{
3861
			ereport(emode_for_corrupt_record(emode, *RecPtr),
3862 3863 3864 3865 3866 3867
					(errmsg("record with incorrect prev-link %X/%X at %X/%X",
							record->xl_prev.xlogid, record->xl_prev.xrecoff,
							RecPtr->xlogid, RecPtr->xrecoff)));
			goto next_record_is_invalid;
		}
	}
B
Bruce Momjian 已提交
3868

T
Tom Lane 已提交
3869
	/*
B
Bruce Momjian 已提交
3870
	 * Allocate or enlarge readRecordBuf as needed.  To avoid useless small
3871 3872 3873 3874
	 * increases, round its size to a multiple of XLOG_BLCKSZ, and make sure
	 * it's at least 4*Max(BLCKSZ, XLOG_BLCKSZ) to start with.  (That is
	 * enough for all "normal" records, but very large commit or abort records
	 * might need more space.)
T
Tom Lane 已提交
3875
	 */
3876
	total_len = record->xl_tot_len;
3877
	if (total_len > readRecordBufSize)
3878
	{
3879 3880
		uint32		newSize = total_len;

3881 3882
		newSize += XLOG_BLCKSZ - (newSize % XLOG_BLCKSZ);
		newSize = Max(newSize, 4 * Max(BLCKSZ, XLOG_BLCKSZ));
3883 3884 3885 3886 3887 3888 3889
		if (readRecordBuf)
			free(readRecordBuf);
		readRecordBuf = (char *) malloc(newSize);
		if (!readRecordBuf)
		{
			readRecordBufSize = 0;
			/* We treat this as a "bogus data" condition */
3890
			ereport(emode_for_corrupt_record(emode, *RecPtr),
3891 3892 3893 3894 3895
					(errmsg("record length %u at %X/%X too long",
							total_len, RecPtr->xlogid, RecPtr->xrecoff)));
			goto next_record_is_invalid;
		}
		readRecordBufSize = newSize;
3896
	}
3897 3898

	buffer = readRecordBuf;
3899
	len = XLOG_BLCKSZ - RecPtr->xrecoff % XLOG_BLCKSZ;
T
Tom Lane 已提交
3900
	if (total_len > len)
3901
	{
T
Tom Lane 已提交
3902 3903
		/* Need to reassemble record */
		XLogContRecord *contrecord;
3904
		XLogRecPtr	pagelsn;
B
Bruce Momjian 已提交
3905
		uint32		gotlen = len;
3906

3907 3908 3909 3910
		/* Initialize pagelsn to the beginning of the page this record is on */
		pagelsn = *RecPtr;
		pagelsn.xrecoff = (pagelsn.xrecoff / XLOG_BLCKSZ) * XLOG_BLCKSZ;

T
Tom Lane 已提交
3911
		memcpy(buffer, record, len);
3912
		record = (XLogRecord *) buffer;
T
Tom Lane 已提交
3913
		buffer += len;
3914
		for (;;)
3915
		{
3916 3917 3918
			/* Calculate pointer to beginning of next page */
			pagelsn.xrecoff += XLOG_BLCKSZ;
			if (pagelsn.xrecoff >= XLogFileSize)
3919
			{
3920 3921
				(pagelsn.xlogid)++;
				pagelsn.xrecoff = 0;
3922
			}
3923 3924 3925
			/* Wait for the next page to become available */
			if (!XLogPageRead(&pagelsn, emode, false, false))
				return NULL;
3926

3927
			/* Check that the continuation record looks valid */
T
Tom Lane 已提交
3928
			if (!(((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD))
3929
			{
3930
				ereport(emode_for_corrupt_record(emode, *RecPtr),
3931 3932
						(errmsg("there is no contrecord flag in log file %u, segment %u, offset %u",
								readId, readSeg, readOff)));
3933 3934
				goto next_record_is_invalid;
			}
3935 3936
			pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
			contrecord = (XLogContRecord *) ((char *) readBuf + pageHeaderSize);
B
Bruce Momjian 已提交
3937
			if (contrecord->xl_rem_len == 0 ||
T
Tom Lane 已提交
3938
				total_len != (contrecord->xl_rem_len + gotlen))
3939
			{
3940
				ereport(emode_for_corrupt_record(emode, *RecPtr),
3941 3942 3943
						(errmsg("invalid contrecord length %u in log file %u, segment %u, offset %u",
								contrecord->xl_rem_len,
								readId, readSeg, readOff)));
3944 3945
				goto next_record_is_invalid;
			}
3946
			len = XLOG_BLCKSZ - pageHeaderSize - SizeOfXLogContRecord;
T
Tom Lane 已提交
3947
			if (contrecord->xl_rem_len > len)
3948
			{
B
Bruce Momjian 已提交
3949
				memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord, len);
T
Tom Lane 已提交
3950 3951 3952 3953 3954 3955 3956 3957
				gotlen += len;
				buffer += len;
				continue;
			}
			memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord,
				   contrecord->xl_rem_len);
			break;
		}
3958
		if (!RecordIsValid(record, *RecPtr, emode))
T
Tom Lane 已提交
3959
			goto next_record_is_invalid;
3960
		pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
T
Tom Lane 已提交
3961 3962
		EndRecPtr.xlogid = readId;
		EndRecPtr.xrecoff = readSeg * XLogSegSize + readOff +
3963 3964
			pageHeaderSize +
			MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len);
3965

T
Tom Lane 已提交
3966
		ReadRecPtr = *RecPtr;
3967
		/* needn't worry about XLOG SWITCH, it can't cross page boundaries */
T
Tom Lane 已提交
3968
		return record;
3969 3970
	}

T
Tom Lane 已提交
3971
	/* Record does not cross a page boundary */
3972
	if (!RecordIsValid(record, *RecPtr, emode))
T
Tom Lane 已提交
3973 3974 3975
		goto next_record_is_invalid;
	EndRecPtr.xlogid = RecPtr->xlogid;
	EndRecPtr.xrecoff = RecPtr->xrecoff + MAXALIGN(total_len);
3976

T
Tom Lane 已提交
3977 3978
	ReadRecPtr = *RecPtr;
	memcpy(buffer, record, total_len);
B
Bruce Momjian 已提交
3979

3980 3981 3982 3983 3984 3985 3986 3987
	/*
	 * Special processing if it's an XLOG SWITCH record
	 */
	if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
	{
		/* Pretend it extends to end of segment */
		EndRecPtr.xrecoff += XLogSegSize - 1;
		EndRecPtr.xrecoff -= EndRecPtr.xrecoff % XLogSegSize;
B
Bruce Momjian 已提交
3988

3989
		/*
B
Bruce Momjian 已提交
3990 3991 3992
		 * Pretend that readBuf contains the last page of the segment. This is
		 * just to avoid Assert failure in StartupXLOG if XLOG ends with this
		 * segment.
3993 3994 3995
		 */
		readOff = XLogSegSize - XLOG_BLCKSZ;
	}
T
Tom Lane 已提交
3996
	return (XLogRecord *) buffer;
3997

3998 3999 4000
next_record_is_invalid:
	failedSources |= readSource;

4001 4002 4003 4004 4005
	if (readFile >= 0)
	{
		close(readFile);
		readFile = -1;
	}
4006 4007 4008 4009 4010 4011

	/* In standby-mode, keep trying */
	if (StandbyMode)
		goto retry;
	else
		return NULL;
4012 4013
}

4014 4015 4016 4017
/*
 * Check whether the xlog header of a page just read in looks valid.
 *
 * This is just a convenience subroutine to avoid duplicated code in
B
Bruce Momjian 已提交
4018
 * ReadRecord.	It's not intended for use from anywhere else.
4019 4020
 */
static bool
4021
ValidXLOGHeader(XLogPageHeader hdr, int emode)
4022
{
4023 4024
	XLogRecPtr	recaddr;

4025 4026 4027
	recaddr.xlogid = readId;
	recaddr.xrecoff = readSeg * XLogSegSize + readOff;

4028 4029
	if (hdr->xlp_magic != XLOG_PAGE_MAGIC)
	{
4030
		ereport(emode_for_corrupt_record(emode, recaddr),
4031 4032
				(errmsg("invalid magic number %04X in log file %u, segment %u, offset %u",
						hdr->xlp_magic, readId, readSeg, readOff)));
4033 4034 4035 4036
		return false;
	}
	if ((hdr->xlp_info & ~XLP_ALL_FLAGS) != 0)
	{
4037
		ereport(emode_for_corrupt_record(emode, recaddr),
4038 4039
				(errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
						hdr->xlp_info, readId, readSeg, readOff)));
4040 4041
		return false;
	}
4042
	if (hdr->xlp_info & XLP_LONG_HEADER)
4043
	{
4044
		XLogLongPageHeader longhdr = (XLogLongPageHeader) hdr;
B
Bruce Momjian 已提交
4045

4046
		if (longhdr->xlp_sysid != ControlFile->system_identifier)
4047
		{
4048 4049
			char		fhdrident_str[32];
			char		sysident_str[32];
4050

4051
			/*
B
Bruce Momjian 已提交
4052 4053
			 * Format sysids separately to keep platform-dependent format code
			 * out of the translatable message string.
4054 4055 4056 4057 4058
			 */
			snprintf(fhdrident_str, sizeof(fhdrident_str), UINT64_FORMAT,
					 longhdr->xlp_sysid);
			snprintf(sysident_str, sizeof(sysident_str), UINT64_FORMAT,
					 ControlFile->system_identifier);
4059
			ereport(emode_for_corrupt_record(emode, recaddr),
P
Peter Eisentraut 已提交
4060 4061
					(errmsg("WAL file is from different database system"),
					 errdetail("WAL file database system identifier is %s, pg_control database system identifier is %s.",
B
Bruce Momjian 已提交
4062
							   fhdrident_str, sysident_str)));
4063 4064 4065 4066
			return false;
		}
		if (longhdr->xlp_seg_size != XLogSegSize)
		{
4067
			ereport(emode_for_corrupt_record(emode, recaddr),
P
Peter Eisentraut 已提交
4068
					(errmsg("WAL file is from different database system"),
B
Bruce Momjian 已提交
4069
					 errdetail("Incorrect XLOG_SEG_SIZE in page header.")));
4070 4071
			return false;
		}
4072 4073
		if (longhdr->xlp_xlog_blcksz != XLOG_BLCKSZ)
		{
4074
			ereport(emode_for_corrupt_record(emode, recaddr),
P
Peter Eisentraut 已提交
4075
					(errmsg("WAL file is from different database system"),
4076 4077 4078
					 errdetail("Incorrect XLOG_BLCKSZ in page header.")));
			return false;
		}
4079
	}
4080 4081 4082
	else if (readOff == 0)
	{
		/* hmm, first page of file doesn't have a long header? */
4083
		ereport(emode_for_corrupt_record(emode, recaddr),
4084 4085 4086 4087 4088
				(errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
						hdr->xlp_info, readId, readSeg, readOff)));
		return false;
	}

4089 4090
	if (!XLByteEQ(hdr->xlp_pageaddr, recaddr))
	{
4091
		ereport(emode_for_corrupt_record(emode, recaddr),
4092
				(errmsg("unexpected pageaddr %X/%X in log file %u, segment %u, offset %u",
B
Bruce Momjian 已提交
4093
						hdr->xlp_pageaddr.xlogid, hdr->xlp_pageaddr.xrecoff,
4094 4095 4096 4097 4098 4099 4100 4101 4102
						readId, readSeg, readOff)));
		return false;
	}

	/*
	 * Check page TLI is one of the expected values.
	 */
	if (!list_member_int(expectedTLIs, (int) hdr->xlp_tli))
	{
4103
		ereport(emode_for_corrupt_record(emode, recaddr),
4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114
				(errmsg("unexpected timeline ID %u in log file %u, segment %u, offset %u",
						hdr->xlp_tli,
						readId, readSeg, readOff)));
		return false;
	}

	/*
	 * Since child timelines are always assigned a TLI greater than their
	 * immediate parent's TLI, we should never see TLI go backwards across
	 * successive pages of a consistent WAL sequence.
	 *
B
Bruce Momjian 已提交
4115 4116 4117
	 * Of course this check should only be applied when advancing sequentially
	 * across pages; therefore ReadRecord resets lastPageTLI to zero when
	 * going to a random page.
4118 4119 4120
	 */
	if (hdr->xlp_tli < lastPageTLI)
	{
4121
		ereport(emode_for_corrupt_record(emode, recaddr),
4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134
				(errmsg("out-of-sequence timeline ID %u (after %u) in log file %u, segment %u, offset %u",
						hdr->xlp_tli, lastPageTLI,
						readId, readSeg, readOff)));
		return false;
	}
	lastPageTLI = hdr->xlp_tli;
	return true;
}

/*
 * Try to read a timeline's history file.
 *
 * If successful, return the list of component TLIs (the given TLI followed by
 * its ancestor TLIs).  If we can't find the history file, assume that the
 * timeline has no parents, and return a list of just the specified timeline
 * ID.
 *
 * A history file that exists but cannot be opened, or whose contents are
 * malformed (non-numeric first field, or timeline IDs not in increasing
 * order and below targetTLI), is reported as FATAL.
 */
static List *
readTimeLineHistory(TimeLineID targetTLI)
{
	List	   *tlis = NIL;
	char		histpath[MAXPGPATH];
	char		histname[MAXFNAMELEN];
	char		linebuf[MAXPGPATH];
	FILE	   *histfile;

	/* Timeline 1 does not have a history file, so no need to check */
	if (targetTLI == 1)
		return list_make1_int((int) targetTLI);

	if (!InArchiveRecovery)
	{
		TLHistoryFilePath(histpath, targetTLI);
	}
	else
	{
		TLHistoryFileName(histname, targetTLI);
		RestoreArchivedFile(histpath, histname, "RECOVERYHISTORY", 0);
	}

	histfile = AllocateFile(histpath, "r");
	if (histfile == NULL)
	{
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", histpath)));
		/* Not there, so assume no parents */
		return list_make1_int((int) targetTLI);
	}

	/*
	 * Parse the file, one line per ancestor timeline.
	 */
	while (fgets(linebuf, sizeof(linebuf), histfile) != NULL)
	{
		char	   *cp = linebuf;
		char	   *numend;
		TimeLineID	parsed;

		/* skip leading whitespace and check for # comment */
		while (*cp && isspace((unsigned char) *cp))
			cp++;
		if (*cp == '\0' || *cp == '#')
			continue;

		/* expect a numeric timeline ID as first field of line */
		parsed = (TimeLineID) strtoul(cp, &numend, 0);
		if (numend == cp)
			ereport(FATAL,
					(errmsg("syntax error in history file: %s", linebuf),
					 errhint("Expected a numeric timeline ID.")));

		/* each entry must exceed the one read before it */
		if (tlis &&
			parsed <= (TimeLineID) linitial_int(tlis))
			ereport(FATAL,
					(errmsg("invalid data in history file: %s", linebuf),
				   errhint("Timeline IDs must be in increasing sequence.")));

		/* Build list with newest item first */
		tlis = lcons_int((int) parsed, tlis);

		/* we ignore the remainder of each line */
	}

	FreeFile(histfile);

	/* The newest ancestor must still be older than the target itself */
	if (tlis &&
		targetTLI <= (TimeLineID) linitial_int(tlis))
		ereport(FATAL,
				(errmsg("invalid data in history file \"%s\"", histpath),
			errhint("Timeline IDs must be less than child timeline's ID.")));

	tlis = lcons_int((int) targetTLI, tlis);

	ereport(DEBUG3,
			(errmsg_internal("history of timeline %u is %s",
							 targetTLI, nodeToString(tlis))));

	return tlis;
}

/*
 * Probe whether a timeline history file exists for the given timeline ID.
 *
 * Returns true if the file could be opened for reading, false if it does
 * not exist (or if probeTLI is 1, which never has a history file).  Any
 * other failure to open the file is reported as FATAL.
 */
static bool
existsTimeLineHistory(TimeLineID probeTLI)
{
	char		histpath[MAXPGPATH];
	char		histname[MAXFNAMELEN];
	FILE	   *histfile;

	/* Timeline 1 does not have a history file, so no need to check */
	if (probeTLI == 1)
		return false;

	if (!InArchiveRecovery)
	{
		TLHistoryFilePath(histpath, probeTLI);
	}
	else
	{
		TLHistoryFileName(histname, probeTLI);
		RestoreArchivedFile(histpath, histname, "RECOVERYHISTORY", 0);
	}

	histfile = AllocateFile(histpath, "r");
	if (histfile == NULL)
	{
		/* Only "no such file" counts as a clean negative answer */
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", histpath)));
		return false;
	}

	FreeFile(histfile);
	return true;
}

4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320
/*
 * Scan for new timelines that might have appeared in the archive since we
 * started recovery.
 *
 * If there are any, the function changes recovery target TLI to the latest
 * one and returns 'true'.  Returns 'false' if no newer timeline exists, or
 * if the newest one found does not list the current recovery target
 * timeline in its history (that case is logged).
 *
 * On a successful switch, recoveryTargetTLI, expectedTLIs and
 * XLogCtl->RecoveryTargetTLI are all updated.
 */
static bool
rescanLatestTimeLine(void)
{
	TimeLineID	newtarget;
	List	   *newExpectedTLIs;

	newtarget = findNewestTimeLine(recoveryTargetTLI);
	if (newtarget == recoveryTargetTLI)
		return false;			/* nothing newer has appeared */

	/*
	 * Determine the list of expected TLIs for the new TLI
	 */
	newExpectedTLIs = readTimeLineHistory(newtarget);

	/*
	 * If the current timeline is not part of the history of the
	 * new timeline, we cannot proceed to it.
	 *
	 * XXX This isn't foolproof: The new timeline might have forked from
	 * the current one, but before the current recovery location. In that
	 * case we will still switch to the new timeline and proceed replaying
	 * from it even though the history doesn't match what we already
	 * replayed. That's not good. We will likely notice at the next online
	 * checkpoint, as the TLI won't match what we expected, but it's
	 * not guaranteed. The admin needs to make sure that doesn't happen.
	 */
	if (!list_member_int(newExpectedTLIs,
						 (int) recoveryTargetTLI))
	{
		ereport(LOG,
				(errmsg("new timeline %u is not a child of database system timeline %u",
						newtarget,
						ThisTimeLineID)));

		/*
		 * We are not adopting this history, so release it.  This function
		 * can be called repeatedly while the same unrelated timeline sits
		 * in the archive, and each call would otherwise leak a list.
		 */
		list_free(newExpectedTLIs);
		return false;
	}

	/* Switch target */
	recoveryTargetTLI = newtarget;
	list_free(expectedTLIs);
	expectedTLIs = newExpectedTLIs;

	XLogCtl->RecoveryTargetTLI = recoveryTargetTLI;

	ereport(LOG,
			(errmsg("new target timeline is %u",
					recoveryTargetTLI)));
	return true;
}

4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334
/*
 * Find the newest existing timeline, assuming that startTLI exists.
 *
 * Note: while this is somewhat heuristic, it does positively guarantee
 * that (result + 1) is not a known timeline, and therefore it should
 * be safe to assign that ID to a new timeline.
 *
 * Returns startTLI itself when no history file exists for startTLI + 1.
 */
static TimeLineID
findNewestTimeLine(TimeLineID startTLI)
{
	TimeLineID	newestTLI = startTLI;
	TimeLineID	probeTLI = startTLI + 1;

	/*
	 * The algorithm is just to probe for the existence of timeline history
	 * files, stopping at the first ID that has none.  XXX is it useful to
	 * allow gaps in the sequence?
	 */
	while (existsTimeLineHistory(probeTLI))
	{
		newestTLI = probeTLI;	/* probeTLI exists */
		probeTLI++;
	}

	return newestTLI;
}

/*
 * Create a new timeline history file.
 *
 *	newTLI: ID of the new timeline
 *	parentTLI: ID of its immediate parent
 *	endTLI et al: ID of the last used WAL file, for annotation purposes
 *
 * Currently this is only used during recovery, and so there are no locking
B
Bruce Momjian 已提交
4364
 * considerations.	But we should be just as tense as XLogFileInit to avoid
4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379
 * emplacing a bogus file.
 */
static void
writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
					 TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
{
	char		path[MAXPGPATH];
	char		tmppath[MAXPGPATH];
	char		histfname[MAXFNAMELEN];
	char		xlogfname[MAXFNAMELEN];
	char		buffer[BLCKSZ];
	int			srcfd;
	int			fd;
	int			nbytes;

B
Bruce Momjian 已提交
4380
	Assert(newTLI > parentTLI); /* else bad selection of newTLI */
4381 4382 4383 4384

	/*
	 * Write into a temp file name.
	 */
4385
	snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
4386 4387 4388

	unlink(tmppath);

4389
	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
4390 4391 4392
	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
4393
		ereport(ERROR,
4394 4395 4396 4397 4398 4399 4400 4401 4402
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m", tmppath)));

	/*
	 * If a history file exists for the parent, copy it verbatim
	 */
	if (InArchiveRecovery)
	{
		TLHistoryFileName(histfname, parentTLI);
4403
		RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
4404 4405 4406 4407 4408 4409 4410 4411
	}
	else
		TLHistoryFilePath(path, parentTLI);

	srcfd = BasicOpenFile(path, O_RDONLY, 0);
	if (srcfd < 0)
	{
		if (errno != ENOENT)
4412
			ereport(ERROR,
4413
					(errcode_for_file_access(),
P
Peter Eisentraut 已提交
4414
					 errmsg("could not open file \"%s\": %m", path)));
4415 4416 4417 4418 4419 4420 4421 4422 4423
		/* Not there, so assume parent has no parents */
	}
	else
	{
		for (;;)
		{
			errno = 0;
			nbytes = (int) read(srcfd, buffer, sizeof(buffer));
			if (nbytes < 0 || errno != 0)
4424
				ereport(ERROR,
4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438
						(errcode_for_file_access(),
						 errmsg("could not read file \"%s\": %m", path)));
			if (nbytes == 0)
				break;
			errno = 0;
			if ((int) write(fd, buffer, nbytes) != nbytes)
			{
				int			save_errno = errno;

				/*
				 * If we fail to make the file, delete it to release disk
				 * space
				 */
				unlink(tmppath);
B
Bruce Momjian 已提交
4439 4440

				/*
B
Bruce Momjian 已提交
4441
				 * if write didn't set errno, assume problem is no disk space
B
Bruce Momjian 已提交
4442
				 */
4443 4444
				errno = save_errno ? save_errno : ENOSPC;

4445
				ereport(ERROR,
4446
						(errcode_for_file_access(),
B
Bruce Momjian 已提交
4447
					 errmsg("could not write to file \"%s\": %m", tmppath)));
4448 4449 4450 4451 4452 4453 4454 4455
			}
		}
		close(srcfd);
	}

	/*
	 * Append one line with the details of this timeline split.
	 *
B
Bruce Momjian 已提交
4456 4457
	 * If we did have a parent file, insert an extra newline just in case the
	 * parent file failed to end with one.
4458 4459 4460
	 */
	XLogFileName(xlogfname, endTLI, endLogId, endLogSeg);

4461
	/*
B
Bruce Momjian 已提交
4462 4463
	 * Write comment to history file to explain why and where timeline
	 * changed. Comment varies according to the recovery target used.
4464 4465 4466 4467 4468 4469 4470 4471 4472
	 */
	if (recoveryTarget == RECOVERY_TARGET_XID)
		snprintf(buffer, sizeof(buffer),
				 "%s%u\t%s\t%s transaction %u\n",
				 (srcfd < 0) ? "" : "\n",
				 parentTLI,
				 xlogfname,
				 recoveryStopAfter ? "after" : "before",
				 recoveryStopXid);
4473
	else if (recoveryTarget == RECOVERY_TARGET_TIME)
4474 4475 4476 4477 4478 4479 4480
		snprintf(buffer, sizeof(buffer),
				 "%s%u\t%s\t%s %s\n",
				 (srcfd < 0) ? "" : "\n",
				 parentTLI,
				 xlogfname,
				 recoveryStopAfter ? "after" : "before",
				 timestamptz_to_str(recoveryStopTime));
4481 4482 4483 4484 4485 4486 4487
	else if (recoveryTarget == RECOVERY_TARGET_NAME)
		snprintf(buffer, sizeof(buffer),
				"%s%u\t%s\tat restore point \"%s\"\n",
				 (srcfd < 0) ? "" : "\n",
				 parentTLI,
				 xlogfname,
				 recoveryStopName);
4488 4489 4490 4491 4492 4493
	else
		snprintf(buffer, sizeof(buffer),
				 "%s%u\t%s\tno recovery target specified\n",
				 (srcfd < 0) ? "" : "\n",
				 parentTLI,
				 xlogfname);
4494 4495 4496 4497 4498 4499 4500 4501

	nbytes = strlen(buffer);
	errno = 0;
	if ((int) write(fd, buffer, nbytes) != nbytes)
	{
		int			save_errno = errno;

		/*
B
Bruce Momjian 已提交
4502
		 * If we fail to make the file, delete it to release disk space
4503 4504 4505 4506 4507
		 */
		unlink(tmppath);
		/* if write didn't set errno, assume problem is no disk space */
		errno = save_errno ? save_errno : ENOSPC;

4508
		ereport(ERROR,
4509 4510 4511 4512 4513
				(errcode_for_file_access(),
				 errmsg("could not write to file \"%s\": %m", tmppath)));
	}

	if (pg_fsync(fd) != 0)
4514
		ereport(ERROR,
4515 4516 4517 4518
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m", tmppath)));

	if (close(fd))
4519
		ereport(ERROR,
4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", tmppath)));


	/*
	 * Now move the completed history file into place with its final name.
	 */
	TLHistoryFilePath(path, newTLI);

	/*
	 * Prefer link() to rename() here just to be really sure that we don't
	 * overwrite an existing logfile.  However, there shouldn't be one, so
	 * rename() is an acceptable substitute except for the truly paranoid.
	 */
#if HAVE_WORKING_LINK
	if (link(tmppath, path) < 0)
4536
		ereport(ERROR,
4537 4538 4539 4540 4541 4542
				(errcode_for_file_access(),
				 errmsg("could not link file \"%s\" to \"%s\": %m",
						tmppath, path)));
	unlink(tmppath);
#else
	if (rename(tmppath, path) < 0)
4543
		ereport(ERROR,
4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						tmppath, path)));
#endif

	/* The history file can be archived immediately. */
	TLHistoryFileName(histfname, newTLI);
	XLogArchiveNotify(histfname);
}

/*
 * I/O routines for pg_control
4556 4557
 *
 * *ControlFile is a buffer in shared memory that holds an image of the
B
Bruce Momjian 已提交
4558
 * contents of pg_control.	WriteControlFile() initializes pg_control
4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571
 * given a preloaded buffer, ReadControlFile() loads the buffer from
 * the pg_control file (during postmaster or standalone-backend startup),
 * and UpdateControlFile() rewrites pg_control after we modify xlog state.
 *
 * For simplicity, WriteControlFile() initializes the fields of pg_control
 * that are related to checking backend/database compatibility, and
 * ReadControlFile() verifies they are correct.  We could split out the
 * I/O and compatibility-check functions, but there seems no need currently.
 */
static void
WriteControlFile(void)
{
	int			fd;
B
Bruce Momjian 已提交
4572
	char		buffer[PG_CONTROL_SIZE];		/* need not be aligned */
4573 4574

	/*
T
Tom Lane 已提交
4575
	 * Initialize version and compatibility-check fields
4576
	 */
T
Tom Lane 已提交
4577 4578
	ControlFile->pg_control_version = PG_CONTROL_VERSION;
	ControlFile->catalog_version_no = CATALOG_VERSION_NO;
4579 4580 4581 4582

	ControlFile->maxAlign = MAXIMUM_ALIGNOF;
	ControlFile->floatFormat = FLOATFORMAT_VALUE;

4583 4584
	ControlFile->blcksz = BLCKSZ;
	ControlFile->relseg_size = RELSEG_SIZE;
4585
	ControlFile->xlog_blcksz = XLOG_BLCKSZ;
4586
	ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
4587 4588

	ControlFile->nameDataLen = NAMEDATALEN;
4589
	ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
4590

4591 4592
	ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;

4593
#ifdef HAVE_INT64_TIMESTAMP
4594
	ControlFile->enableIntTimes = true;
4595
#else
4596
	ControlFile->enableIntTimes = false;
4597
#endif
4598 4599
	ControlFile->float4ByVal = FLOAT4PASSBYVAL;
	ControlFile->float8ByVal = FLOAT8PASSBYVAL;
4600

T
Tom Lane 已提交
4601
	/* Contents are protected with a CRC */
4602 4603 4604 4605 4606
	INIT_CRC32(ControlFile->crc);
	COMP_CRC32(ControlFile->crc,
			   (char *) ControlFile,
			   offsetof(ControlFileData, crc));
	FIN_CRC32(ControlFile->crc);
T
Tom Lane 已提交
4607

4608
	/*
4609 4610 4611 4612 4613
	 * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
	 * excess over sizeof(ControlFileData).  This reduces the odds of
	 * premature-EOF errors when reading pg_control.  We'll still fail when we
	 * check the contents of the file, but hopefully with a more specific
	 * error than "couldn't read pg_control".
4614
	 */
4615 4616
	if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
		elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
4617

4618
	memset(buffer, 0, PG_CONTROL_SIZE);
4619 4620
	memcpy(buffer, ControlFile, sizeof(ControlFileData));

4621 4622
	fd = BasicOpenFile(XLOG_CONTROL_FILE,
					   O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
4623
					   S_IRUSR | S_IWUSR);
4624
	if (fd < 0)
4625 4626 4627
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not create control file \"%s\": %m",
4628
						XLOG_CONTROL_FILE)));
4629

4630
	errno = 0;
4631
	if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
4632 4633 4634 4635
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
4636 4637
		ereport(PANIC,
				(errcode_for_file_access(),
4638
				 errmsg("could not write to control file: %m")));
4639
	}
4640

4641
	if (pg_fsync(fd) != 0)
4642 4643
		ereport(PANIC,
				(errcode_for_file_access(),
4644
				 errmsg("could not fsync control file: %m")));
4645

4646 4647 4648 4649
	if (close(fd))
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not close control file: %m")));
4650 4651 4652 4653 4654
}

static void
ReadControlFile(void)
{
4655
	pg_crc32	crc;
4656 4657 4658 4659 4660
	int			fd;

	/*
	 * Read data...
	 */
4661 4662 4663
	fd = BasicOpenFile(XLOG_CONTROL_FILE,
					   O_RDWR | PG_BINARY,
					   S_IRUSR | S_IWUSR);
4664
	if (fd < 0)
4665 4666 4667
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not open control file \"%s\": %m",
4668
						XLOG_CONTROL_FILE)));
4669 4670

	if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4671 4672
		ereport(PANIC,
				(errcode_for_file_access(),
4673
				 errmsg("could not read from control file: %m")));
4674 4675 4676

	close(fd);

T
Tom Lane 已提交
4677
	/*
B
Bruce Momjian 已提交
4678 4679 4680 4681
	 * Check for expected pg_control format version.  If this is wrong, the
	 * CRC check will likely fail because we'll be checking the wrong number
	 * of bytes.  Complaining about wrong version will probably be more
	 * enlightening than complaining about wrong CRC.
T
Tom Lane 已提交
4682
	 */
4683 4684 4685 4686 4687

	if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4688 4689
		 " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
			ControlFile->pg_control_version, ControlFile->pg_control_version,
4690 4691 4692
						   PG_CONTROL_VERSION, PG_CONTROL_VERSION),
				 errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));

T
Tom Lane 已提交
4693
	if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4694 4695 4696
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
B
Bruce Momjian 已提交
4697 4698
				  " but the server was compiled with PG_CONTROL_VERSION %d.",
						ControlFile->pg_control_version, PG_CONTROL_VERSION),
4699
				 errhint("It looks like you need to initdb.")));
4700

T
Tom Lane 已提交
4701
	/* Now check the CRC. */
4702 4703 4704 4705 4706
	INIT_CRC32(crc);
	COMP_CRC32(crc,
			   (char *) ControlFile,
			   offsetof(ControlFileData, crc));
	FIN_CRC32(crc);
4707

4708
	if (!EQ_CRC32(crc, ControlFile->crc))
4709
		ereport(FATAL,
4710
				(errmsg("incorrect checksum in control file")));
4711

4712
	/*
4713
	 * Do compatibility checking immediately.  If the database isn't
4714 4715
	 * compatible with the backend executable, we want to abort before we can
	 * possibly do any damage.
4716
	 */
T
Tom Lane 已提交
4717
	if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4718 4719 4720
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
B
Bruce Momjian 已提交
4721 4722
				  " but the server was compiled with CATALOG_VERSION_NO %d.",
						ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4723
				 errhint("It looks like you need to initdb.")));
4724 4725 4726
	if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
B
Bruce Momjian 已提交
4727 4728 4729 4730
		   errdetail("The database cluster was initialized with MAXALIGN %d,"
					 " but the server was compiled with MAXALIGN %d.",
					 ControlFile->maxAlign, MAXIMUM_ALIGNOF),
				 errhint("It looks like you need to initdb.")));
4731 4732 4733
	if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
P
Peter Eisentraut 已提交
4734
				 errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4735
				 errhint("It looks like you need to initdb.")));
4736
	if (ControlFile->blcksz != BLCKSZ)
4737 4738
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
B
Bruce Momjian 已提交
4739 4740 4741 4742
			 errdetail("The database cluster was initialized with BLCKSZ %d,"
					   " but the server was compiled with BLCKSZ %d.",
					   ControlFile->blcksz, BLCKSZ),
				 errhint("It looks like you need to recompile or initdb.")));
4743
	if (ControlFile->relseg_size != RELSEG_SIZE)
4744 4745
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
B
Bruce Momjian 已提交
4746 4747 4748 4749
		errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
				  " but the server was compiled with RELSEG_SIZE %d.",
				  ControlFile->relseg_size, RELSEG_SIZE),
				 errhint("It looks like you need to recompile or initdb.")));
4750 4751 4752
	if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
B
Bruce Momjian 已提交
4753 4754 4755
		errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
				  " but the server was compiled with XLOG_BLCKSZ %d.",
				  ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4756
				 errhint("It looks like you need to recompile or initdb.")));
4757 4758 4759 4760
	if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
B
Bruce Momjian 已提交
4761
					   " but the server was compiled with XLOG_SEG_SIZE %d.",
4762
						   ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
B
Bruce Momjian 已提交
4763
				 errhint("It looks like you need to recompile or initdb.")));
4764
	if (ControlFile->nameDataLen != NAMEDATALEN)
4765 4766
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
B
Bruce Momjian 已提交
4767 4768 4769 4770
		errdetail("The database cluster was initialized with NAMEDATALEN %d,"
				  " but the server was compiled with NAMEDATALEN %d.",
				  ControlFile->nameDataLen, NAMEDATALEN),
				 errhint("It looks like you need to recompile or initdb.")));
4771
	if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4772 4773
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
4774
				 errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
B
Bruce Momjian 已提交
4775
					  " but the server was compiled with INDEX_MAX_KEYS %d.",
4776
						   ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
B
Bruce Momjian 已提交
4777
				 errhint("It looks like you need to recompile or initdb.")));
4778 4779 4780 4781
	if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
B
Bruce Momjian 已提交
4782 4783
				" but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
			  ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4784
				 errhint("It looks like you need to recompile or initdb.")));
4785 4786

#ifdef HAVE_INT64_TIMESTAMP
4787
	if (ControlFile->enableIntTimes != true)
4788 4789 4790
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
B
Bruce Momjian 已提交
4791 4792
				  " but the server was compiled with HAVE_INT64_TIMESTAMP."),
				 errhint("It looks like you need to recompile or initdb.")));
4793
#else
4794
	if (ControlFile->enableIntTimes != false)
4795 4796 4797
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
B
Bruce Momjian 已提交
4798 4799
			   " but the server was compiled without HAVE_INT64_TIMESTAMP."),
				 errhint("It looks like you need to recompile or initdb.")));
4800 4801
#endif

4802 4803 4804 4805 4806
#ifdef USE_FLOAT4_BYVAL
	if (ControlFile->float4ByVal != true)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
4807
					  " but the server was compiled with USE_FLOAT4_BYVAL."),
4808 4809 4810 4811 4812
				 errhint("It looks like you need to recompile or initdb.")));
#else
	if (ControlFile->float4ByVal != false)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
4813 4814
		errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
				  " but the server was compiled without USE_FLOAT4_BYVAL."),
4815 4816 4817 4818 4819 4820 4821 4822
				 errhint("It looks like you need to recompile or initdb.")));
#endif

#ifdef USE_FLOAT8_BYVAL
	if (ControlFile->float8ByVal != true)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4823
					  " but the server was compiled with USE_FLOAT8_BYVAL."),
4824 4825 4826 4827 4828
				 errhint("It looks like you need to recompile or initdb.")));
#else
	if (ControlFile->float8ByVal != false)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
4829 4830
		errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
				  " but the server was compiled without USE_FLOAT8_BYVAL."),
4831 4832
				 errhint("It looks like you need to recompile or initdb.")));
#endif
4833 4834
}

4835
void
4836
UpdateControlFile(void)
4837
{
4838
	int			fd;
4839

4840 4841 4842 4843 4844
	INIT_CRC32(ControlFile->crc);
	COMP_CRC32(ControlFile->crc,
			   (char *) ControlFile,
			   offsetof(ControlFileData, crc));
	FIN_CRC32(ControlFile->crc);
4845

4846 4847 4848
	fd = BasicOpenFile(XLOG_CONTROL_FILE,
					   O_RDWR | PG_BINARY,
					   S_IRUSR | S_IWUSR);
4849
	if (fd < 0)
4850 4851 4852
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not open control file \"%s\": %m",
4853
						XLOG_CONTROL_FILE)));
4854

4855
	errno = 0;
4856
	if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4857 4858 4859 4860
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
4861 4862
		ereport(PANIC,
				(errcode_for_file_access(),
4863
				 errmsg("could not write to control file: %m")));
4864
	}
4865

4866
	if (pg_fsync(fd) != 0)
4867 4868
		ereport(PANIC,
				(errcode_for_file_access(),
4869
				 errmsg("could not fsync control file: %m")));
4870

4871 4872 4873 4874
	if (close(fd))
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not close control file: %m")));
4875 4876
}

4877 4878 4879 4880 4881 4882 4883 4884 4885 4886
/*
 * Report the system identifier stored in the in-memory control file image.
 *
 * *ControlFile must already be set up; this is only checked by an Assert.
 */
uint64
GetSystemIdentifier(void)
{
	uint64		sysid;

	Assert(ControlFile != NULL);

	sysid = ControlFile->system_identifier;
	return sysid;
}

4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921
/*
 * Auto-tune the number of XLOG buffers.
 *
 * If the user-set value of wal_buffers is -1, we auto-tune to about 3% of
 * shared_buffers, with a maximum of one XLOG segment and a minimum of 8
 * blocks (8 was the default value prior to PostgreSQL 9.1, when auto-tuning
 * was added).  We also clamp manually-set values to at least 4 blocks; prior
 * to PostgreSQL 9.1, a minimum of 4 was enforced by guc.c, but since that
 * is no longer possible, we just silently treat such values as a request for
 * the minimum.
 */
static void
XLOGTuneNumBuffers(void)
{
	int			xbuffers = XLOGbuffers;
	char		buf[32];

	if (xbuffers == -1)
	{
		xbuffers = NBuffers / 32;
		if (xbuffers > XLOG_SEG_SIZE / XLOG_BLCKSZ)
			xbuffers = XLOG_SEG_SIZE / XLOG_BLCKSZ;
		if (xbuffers < 8)
			xbuffers = 8;
	}
	else if (xbuffers < 4)
		xbuffers = 4;

	if (xbuffers != XLOGbuffers)
	{
		snprintf(buf, sizeof(buf), "%d", xbuffers);
		SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
	}
}

4922
/*
T
Tom Lane 已提交
4923
 * Initialization of shared memory for XLOG
4924
 */
4925
Size
4926
XLOGShmemSize(void)
4927
{
4928
	Size		size;
4929

4930 4931 4932 4933
	/* Figure out how many XLOG buffers we need. */
	XLOGTuneNumBuffers();
	Assert(XLOGbuffers > 0);

4934 4935 4936 4937 4938 4939 4940
	/* XLogCtl */
	size = sizeof(XLogCtlData);
	/* xlblocks array */
	size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
	/* extra alignment padding for XLOG I/O buffers */
	size = add_size(size, ALIGNOF_XLOG_BUFFER);
	/* and the buffers themselves */
4941
	size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
4942 4943

	/*
B
Bruce Momjian 已提交
4944 4945 4946
	 * Note: we don't count ControlFileData, it comes out of the "slop factor"
	 * added by CreateSharedMemoryAndSemaphores.  This lets us use this
	 * routine again below to compute the actual allocation size.
4947 4948 4949
	 */

	return size;
4950 4951 4952 4953 4954
}

void
XLOGShmemInit(void)
{
4955 4956
	bool		foundCFile,
				foundXLog;
4957
	char	   *allocptr;
4958

4959
	ControlFile = (ControlFileData *)
4960
		ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
4961 4962
	XLogCtl = (XLogCtlData *)
		ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
4963

4964
	if (foundCFile || foundXLog)
4965 4966
	{
		/* both should be present or neither */
4967
		Assert(foundCFile && foundXLog);
4968 4969
		return;
	}
4970

T
Tom Lane 已提交
4971
	memset(XLogCtl, 0, sizeof(XLogCtlData));
B
Bruce Momjian 已提交
4972

T
Tom Lane 已提交
4973
	/*
B
Bruce Momjian 已提交
4974 4975 4976
	 * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
	 * multiple of the alignment for same, so no extra alignment padding is
	 * needed here.
T
Tom Lane 已提交
4977
	 */
4978 4979
	allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
	XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
T
Tom Lane 已提交
4980
	memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
4981
	allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
B
Bruce Momjian 已提交
4982

T
Tom Lane 已提交
4983
	/*
4984
	 * Align the start of the page buffers to an ALIGNOF_XLOG_BUFFER boundary.
T
Tom Lane 已提交
4985
	 */
4986 4987
	allocptr = (char *) TYPEALIGN(ALIGNOF_XLOG_BUFFER, allocptr);
	XLogCtl->pages = allocptr;
4988
	memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
T
Tom Lane 已提交
4989 4990

	/*
B
Bruce Momjian 已提交
4991 4992
	 * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
	 * in additional info.)
T
Tom Lane 已提交
4993 4994
	 */
	XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
4995
	XLogCtl->SharedRecoveryInProgress = true;
4996
	XLogCtl->SharedHotStandbyActive = false;
T
Tom Lane 已提交
4997
	XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
4998
	SpinLockInit(&XLogCtl->info_lck);
4999
	InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
T
Tom Lane 已提交
5000

5001
	/*
B
Bruce Momjian 已提交
5002 5003 5004
	 * If we are not in bootstrap mode, pg_control should already exist. Read
	 * and validate it immediately (see comments in ReadControlFile() for the
	 * reasons why).
5005 5006 5007
	 */
	if (!IsBootstrapProcessingMode())
		ReadControlFile();
5008 5009 5010
}

/*
T
Tom Lane 已提交
5011 5012
 * This func must be called ONCE on system install.  It creates pg_control
 * and the initial XLOG segment.
5013 5014
 */
void
T
Tom Lane 已提交
5015
BootStrapXLOG(void)
5016
{
5017
	CheckPoint	checkPoint;
T
Tom Lane 已提交
5018 5019
	char	   *buffer;
	XLogPageHeader page;
5020
	XLogLongPageHeader longpage;
5021
	XLogRecord *record;
B
Bruce Momjian 已提交
5022
	bool		use_existent;
5023 5024
	uint64		sysidentifier;
	struct timeval tv;
5025
	pg_crc32	crc;
5026

5027
	/*
B
Bruce Momjian 已提交
5028 5029 5030 5031 5032 5033 5034 5035 5036 5037
	 * Select a hopefully-unique system identifier code for this installation.
	 * We use the result of gettimeofday(), including the fractional seconds
	 * field, as being about as unique as we can easily get.  (Think not to
	 * use random(), since it hasn't been seeded and there's no portable way
	 * to seed it other than the system clock value...)  The upper half of the
	 * uint64 value is just the tv_sec part, while the lower half is the XOR
	 * of tv_sec and tv_usec.  This is to ensure that we don't lose uniqueness
	 * unnecessarily if "uint64" is really only 32 bits wide.  A person
	 * knowing this encoding can determine the initialization time of the
	 * installation, which could perhaps be useful sometimes.
5038 5039 5040 5041 5042
	 */
	gettimeofday(&tv, NULL);
	sysidentifier = ((uint64) tv.tv_sec) << 32;
	sysidentifier |= (uint32) (tv.tv_sec | tv.tv_usec);

5043 5044 5045
	/* First timeline ID is always 1 */
	ThisTimeLineID = 1;

5046
	/* page buffer must be aligned suitably for O_DIRECT */
5047
	buffer = (char *) palloc(XLOG_BLCKSZ + ALIGNOF_XLOG_BUFFER);
5048
	page = (XLogPageHeader) TYPEALIGN(ALIGNOF_XLOG_BUFFER, buffer);
5049
	memset(page, 0, XLOG_BLCKSZ);
T
Tom Lane 已提交
5050

5051 5052 5053 5054 5055 5056 5057
	/*
	 * Set up information for the initial checkpoint record
	 *
	 * The initial checkpoint record is written to the beginning of the
	 * WAL segment with logid=0 logseg=1. The very first WAL segment, 0/0, is
	 * not used, so that we can use 0/0 to mean "before any valid WAL segment".
	 */
5058
	checkPoint.redo.xlogid = 0;
5059
	checkPoint.redo.xrecoff = XLogSegSize + SizeOfXLogLongPHD;
5060
	checkPoint.ThisTimeLineID = ThisTimeLineID;
5061
	checkPoint.nextXidEpoch = 0;
5062
	checkPoint.nextXid = FirstNormalTransactionId;
5063
	checkPoint.nextOid = FirstBootstrapObjectId;
5064
	checkPoint.nextMulti = FirstMultiXactId;
5065
	checkPoint.nextMultiOffset = 0;
5066 5067
	checkPoint.oldestXid = FirstNormalTransactionId;
	checkPoint.oldestXidDB = TemplateDbOid;
5068
	checkPoint.time = (pg_time_t) time(NULL);
5069
	checkPoint.oldestActiveXid = InvalidTransactionId;
5070

5071 5072 5073
	ShmemVariableCache->nextXid = checkPoint.nextXid;
	ShmemVariableCache->nextOid = checkPoint.nextOid;
	ShmemVariableCache->oidCount = 0;
5074
	MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5075
	SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
5076

5077
	/* Set up the XLOG page header */
5078
	page->xlp_magic = XLOG_PAGE_MAGIC;
5079 5080
	page->xlp_info = XLP_LONG_HEADER;
	page->xlp_tli = ThisTimeLineID;
5081
	page->xlp_pageaddr.xlogid = 0;
5082
	page->xlp_pageaddr.xrecoff = XLogSegSize;
5083 5084 5085
	longpage = (XLogLongPageHeader) page;
	longpage->xlp_sysid = sysidentifier;
	longpage->xlp_seg_size = XLogSegSize;
5086
	longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
5087 5088

	/* Insert the initial checkpoint record */
5089
	record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD);
5090
	record->xl_prev.xlogid = 0;
5091
	record->xl_prev.xrecoff = 0;
5092
	record->xl_xid = InvalidTransactionId;
5093
	record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
5094
	record->xl_len = sizeof(checkPoint);
T
Tom Lane 已提交
5095
	record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
5096
	record->xl_rmid = RM_XLOG_ID;
T
Tom Lane 已提交
5097
	memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
5098

5099 5100 5101 5102 5103
	INIT_CRC32(crc);
	COMP_CRC32(crc, &checkPoint, sizeof(checkPoint));
	COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),
			   SizeOfXLogRecord - sizeof(pg_crc32));
	FIN_CRC32(crc);
5104 5105
	record->xl_crc = crc;

5106
	/* Create first XLOG segment file */
5107
	use_existent = false;
5108
	openLogFile = XLogFileInit(0, 1, &use_existent, false);
5109

5110
	/* Write the first page with the initial record */
5111
	errno = 0;
5112
	if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
5113 5114 5115 5116
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
5117 5118
		ereport(PANIC,
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
5119
			  errmsg("could not write bootstrap transaction log file: %m")));
5120
	}
5121

T
Tom Lane 已提交
5122
	if (pg_fsync(openLogFile) != 0)
5123 5124
		ereport(PANIC,
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
5125
			  errmsg("could not fsync bootstrap transaction log file: %m")));
5126

5127 5128 5129
	if (close(openLogFile))
		ereport(PANIC,
				(errcode_for_file_access(),
B
Bruce Momjian 已提交
5130
			  errmsg("could not close bootstrap transaction log file: %m")));
5131

T
Tom Lane 已提交
5132
	openLogFile = -1;
5133

5134 5135
	/* Now create pg_control */

5136
	memset(ControlFile, 0, sizeof(ControlFileData));
T
Tom Lane 已提交
5137
	/* Initialize pg_control status fields */
5138
	ControlFile->system_identifier = sysidentifier;
T
Tom Lane 已提交
5139 5140
	ControlFile->state = DB_SHUTDOWNED;
	ControlFile->time = checkPoint.time;
5141
	ControlFile->checkPoint = checkPoint.redo;
T
Tom Lane 已提交
5142
	ControlFile->checkPointCopy = checkPoint;
5143 5144 5145 5146 5147 5148 5149

	/* Set important parameter values for use when replaying WAL */
	ControlFile->MaxConnections = MaxConnections;
	ControlFile->max_prepared_xacts = max_prepared_xacts;
	ControlFile->max_locks_per_xact = max_locks_per_xact;
	ControlFile->wal_level = wal_level;

5150
	/* some additional ControlFile fields are set in WriteControlFile() */
5151

5152
	WriteControlFile();
5153 5154 5155

	/* Bootstrap the commit log, too */
	BootStrapCLOG();
5156
	BootStrapSUBTRANS();
5157
	BootStrapMultiXact();
5158

5159
	pfree(buffer);
5160 5161
}

5162
static char *
5163
str_time(pg_time_t tnow)
5164
{
5165
	static char buf[128];
5166

5167 5168 5169
	pg_strftime(buf, sizeof(buf),
				"%Y-%m-%d %H:%M:%S %Z",
				pg_localtime(&tnow, log_timezone));
5170

5171
	return buf;
5172 5173
}

5174 5175
/*
 * See if there is a recovery command file (recovery.conf), and if so
5176
 * read in parameters for archive recovery and XLOG streaming.
5177
 *
5178
 * The file is parsed using the main configuration parser.
5179 5180 5181 5182
 */
static void
readRecoveryCommandFile(void)
{
B
Bruce Momjian 已提交
5183 5184 5185
	FILE	   *fd;
	TimeLineID	rtli = 0;
	bool		rtliGiven = false;
5186 5187 5188
	ConfigVariable *item,
				   *head = NULL,
				   *tail = NULL;
B
Bruce Momjian 已提交
5189

5190
	fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
5191 5192 5193 5194 5195
	if (fd == NULL)
	{
		if (errno == ENOENT)
			return;				/* not there, so no archive recovery */
		ereport(FATAL,
B
Bruce Momjian 已提交
5196
				(errcode_for_file_access(),
5197
				 errmsg("could not open recovery command file \"%s\": %m",
5198
						RECOVERY_COMMAND_FILE)));
5199 5200
	}

B
Bruce Momjian 已提交
5201
	/*
5202 5203 5204 5205
	 * Since we're asking ParseConfigFp() to error out at FATAL, there's no
	 * need to check the return value.
	 */ 
	ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail);
5206

5207 5208 5209
	for (item = head; item; item = item->next)
	{
		if (strcmp(item->name, "restore_command") == 0)
B
Bruce Momjian 已提交
5210
		{
5211
			recoveryRestoreCommand = pstrdup(item->value);
5212
			ereport(DEBUG2,
5213
					(errmsg("restore_command = '%s'",
5214 5215
							recoveryRestoreCommand)));
		}
5216
		else if (strcmp(item->name, "recovery_end_command") == 0)
5217
		{
5218
			recoveryEndCommand = pstrdup(item->value);
5219
			ereport(DEBUG2,
5220 5221 5222
					(errmsg("recovery_end_command = '%s'",
							recoveryEndCommand)));
		}
5223
		else if (strcmp(item->name, "archive_cleanup_command") == 0)
5224
		{
5225
			archiveCleanupCommand = pstrdup(item->value);
5226
			ereport(DEBUG2,
5227 5228
					(errmsg("archive_cleanup_command = '%s'",
							archiveCleanupCommand)));
5229
		}
5230 5231 5232 5233 5234 5235 5236 5237 5238
		else if (strcmp(item->name, "pause_at_recovery_target") == 0)
		{
			if (!parse_bool(item->value, &recoveryPauseAtTarget))
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						 errmsg("parameter \"%s\" requires a Boolean value", "pause_at_recovery_target")));
			ereport(DEBUG2,
					(errmsg("pause_at_recovery_target = '%s'", item->value)));
		}
5239
		else if (strcmp(item->name, "recovery_target_timeline") == 0)
B
Bruce Momjian 已提交
5240
		{
5241
			rtliGiven = true;
5242
			if (strcmp(item->value, "latest") == 0)
5243 5244 5245 5246
				rtli = 0;
			else
			{
				errno = 0;
5247
				rtli = (TimeLineID) strtoul(item->value, NULL, 0);
5248 5249 5250
				if (errno == EINVAL || errno == ERANGE)
					ereport(FATAL,
							(errmsg("recovery_target_timeline is not a valid number: \"%s\"",
5251
									item->value)));
5252 5253
			}
			if (rtli)
5254
				ereport(DEBUG2,
5255 5256
						(errmsg("recovery_target_timeline = %u", rtli)));
			else
5257
				ereport(DEBUG2,
5258 5259
						(errmsg("recovery_target_timeline = latest")));
		}
5260
		else if (strcmp(item->name, "recovery_target_xid") == 0)
B
Bruce Momjian 已提交
5261
		{
5262
			errno = 0;
5263
			recoveryTargetXid = (TransactionId) strtoul(item->value, NULL, 0);
5264 5265
			if (errno == EINVAL || errno == ERANGE)
				ereport(FATAL,
B
Bruce Momjian 已提交
5266
				 (errmsg("recovery_target_xid is not a valid number: \"%s\"",
5267
						 item->value)));
5268
			ereport(DEBUG2,
5269 5270
					(errmsg("recovery_target_xid = %u",
							recoveryTargetXid)));
5271
			recoveryTarget = RECOVERY_TARGET_XID;
5272
		}
5273
		else if (strcmp(item->name, "recovery_target_time") == 0)
B
Bruce Momjian 已提交
5274
		{
5275
			/*
5276 5277
			 * if recovery_target_xid or recovery_target_name specified, then
			 * this overrides recovery_target_time
5278
			 */
5279 5280
			if (recoveryTarget == RECOVERY_TARGET_XID ||
					recoveryTarget == RECOVERY_TARGET_NAME)
5281
				continue;
5282
			recoveryTarget = RECOVERY_TARGET_TIME;
B
Bruce Momjian 已提交
5283

5284
			/*
5285
			 * Convert the time string given by the user to TimestampTz form.
5286
			 */
5287 5288
			recoveryTargetTime =
				DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
5289
														CStringGetDatum(item->value),
5290 5291
												ObjectIdGetDatum(InvalidOid),
														Int32GetDatum(-1)));
5292
			ereport(DEBUG2,
5293
					(errmsg("recovery_target_time = '%s'",
5294
							timestamptz_to_str(recoveryTargetTime))));
5295
		}
5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309
		else if (strcmp(item->name, "recovery_target_name") == 0)
		{
			/*
			 * if recovery_target_xid specified, then this overrides
			 * recovery_target_name
			 */
			if (recoveryTarget == RECOVERY_TARGET_XID)
				continue;
			recoveryTarget = RECOVERY_TARGET_NAME;

			recoveryTargetName = pstrdup(item->value);
			if (strlen(recoveryTargetName) >= MAXFNAMELEN)
				ereport(FATAL,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5310
						 errmsg("recovery_target_name is too long (maximum %d characters)", MAXFNAMELEN - 1)));
5311 5312 5313 5314 5315

			ereport(DEBUG2,
					(errmsg("recovery_target_name = '%s'",
							recoveryTargetName)));
		}
5316
		else if (strcmp(item->name, "recovery_target_inclusive") == 0)
B
Bruce Momjian 已提交
5317
		{
5318 5319 5320
			/*
			 * does nothing if a recovery_target is not also set
			 */
5321
			if (!parse_bool(item->value, &recoveryTargetInclusive))
5322 5323
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5324
						 errmsg("parameter \"%s\" requires a Boolean value", "recovery_target_inclusive")));
5325
			ereport(DEBUG2,
5326
					(errmsg("recovery_target_inclusive = %s", item->value)));
5327
		}
5328
		else if (strcmp(item->name, "standby_mode") == 0)
5329
		{
5330
			if (!parse_bool(item->value, &StandbyMode))
5331 5332
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5333
						 errmsg("parameter \"%s\" requires a Boolean value", "standby_mode")));
5334
			ereport(DEBUG2,
5335
					(errmsg("standby_mode = '%s'", item->value)));
5336
		}
5337
		else if (strcmp(item->name, "primary_conninfo") == 0)
5338
		{
5339
			PrimaryConnInfo = pstrdup(item->value);
5340
			ereport(DEBUG2,
5341 5342 5343
					(errmsg("primary_conninfo = '%s'",
							PrimaryConnInfo)));
		}
5344
		else if (strcmp(item->name, "trigger_file") == 0)
5345
		{
5346
			TriggerFile = pstrdup(item->value);
5347
			ereport(DEBUG2,
5348 5349 5350
					(errmsg("trigger_file = '%s'",
							TriggerFile)));
		}
5351 5352 5353
		else
			ereport(FATAL,
					(errmsg("unrecognized recovery parameter \"%s\"",
5354
							item->name)));
5355 5356
	}

5357 5358 5359 5360 5361 5362
	/*
	 * Check for compulsory parameters
	 */
	if (StandbyMode)
	{
		if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
5363
			ereport(WARNING,
5364
					(errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
5365 5366
							RECOVERY_COMMAND_FILE),
					 errhint("The database server will regularly poll the pg_xlog subdirectory to check for files placed there.")));
5367 5368 5369 5370 5371
	}
	else
	{
		if (recoveryRestoreCommand == NULL)
			ereport(FATAL,
5372
					(errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
5373 5374
							RECOVERY_COMMAND_FILE)));
	}
5375

5376 5377 5378
	/* Enable fetching from archive recovery area */
	InArchiveRecovery = true;

5379
	/*
B
Bruce Momjian 已提交
5380 5381 5382 5383
	 * If user specified recovery_target_timeline, validate it or compute the
	 * "latest" value.	We can't do this until after we've gotten the restore
	 * command and set InArchiveRecovery, because we need to fetch timeline
	 * history files from the archive.
5384
	 */
5385 5386 5387 5388 5389 5390 5391
	if (rtliGiven)
	{
		if (rtli)
		{
			/* Timeline 1 does not have a history file, all else should */
			if (rtli != 1 && !existsTimeLineHistory(rtli))
				ereport(FATAL,
5392
						(errmsg("recovery target timeline %u does not exist",
B
Bruce Momjian 已提交
5393
								rtli)));
5394
			recoveryTargetTLI = rtli;
5395
			recoveryTargetIsLatest = false;
5396 5397 5398 5399 5400
		}
		else
		{
			/* We start the "latest" search from pg_control's timeline */
			recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
5401
			recoveryTargetIsLatest = true;
5402 5403
		}
	}
5404 5405 5406

	FreeConfigVariables(head);
	FreeFile(fd);
5407 5408 5409 5410 5411 5412
}

/*
 * Exit archive-recovery state
 */
static void
5413
exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
5414
{
B
Bruce Momjian 已提交
5415 5416
	char		recoveryPath[MAXPGPATH];
	char		xlogpath[MAXPGPATH];
5417
	XLogRecPtr	InvalidXLogRecPtr = {0, 0};
5418 5419

	/*
5420
	 * We are no longer in archive recovery state.
5421 5422 5423
	 */
	InArchiveRecovery = false;

5424 5425 5426 5427 5428
	/*
	 * Update min recovery point one last time.
	 */
	UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);

5429
	/*
B
Bruce Momjian 已提交
5430 5431
	 * If the ending log segment is still open, close it (to avoid problems on
	 * Windows with trying to rename or delete an open file).
5432
	 */
5433 5434 5435 5436 5437
	if (readFile >= 0)
	{
		close(readFile);
		readFile = -1;
	}
5438 5439

	/*
B
Bruce Momjian 已提交
5440 5441 5442 5443 5444 5445 5446
	 * If the segment was fetched from archival storage, we want to replace
	 * the existing xlog segment (if any) with the archival version.  This is
	 * because whatever is in XLOGDIR is very possibly older than what we have
	 * from the archives, since it could have come from restoring a PGDATA
	 * backup.	In any case, the archival version certainly is more
	 * descriptive of what our current database state is, because that is what
	 * we replayed from.
5447
	 *
5448 5449
	 * Note that if we are establishing a new timeline, ThisTimeLineID is
	 * already set to the new value, and so we will create a new file instead
5450 5451
	 * of overwriting any existing file.  (This is, in fact, always the case
	 * at present.)
5452
	 */
5453
	snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
5454
	XLogFilePath(xlogpath, ThisTimeLineID, endLogId, endLogSeg);
5455 5456 5457 5458 5459 5460 5461 5462 5463 5464

	if (restoredFromArchive)
	{
		ereport(DEBUG3,
				(errmsg_internal("moving last restored xlog to \"%s\"",
								 xlogpath)));
		unlink(xlogpath);		/* might or might not exist */
		if (rename(recoveryPath, xlogpath) != 0)
			ereport(FATAL,
					(errcode_for_file_access(),
5465
					 errmsg("could not rename file \"%s\" to \"%s\": %m",
5466 5467 5468 5469 5470 5471 5472 5473 5474 5475
							recoveryPath, xlogpath)));
		/* XXX might we need to fix permissions on the file? */
	}
	else
	{
		/*
		 * If the latest segment is not archival, but there's still a
		 * RECOVERYXLOG laying about, get rid of it.
		 */
		unlink(recoveryPath);	/* ignore any error */
B
Bruce Momjian 已提交
5476

5477
		/*
B
Bruce Momjian 已提交
5478 5479 5480
		 * If we are establishing a new timeline, we have to copy data from
		 * the last WAL segment of the old timeline to create a starting WAL
		 * segment for the new timeline.
5481 5482 5483 5484
		 *
		 * Notify the archiver that the last WAL segment of the old timeline
		 * is ready to copy to archival storage. Otherwise, it is not archived
		 * for a while.
5485 5486
		 */
		if (endTLI != ThisTimeLineID)
5487
		{
5488 5489
			XLogFileCopy(endLogId, endLogSeg,
						 endTLI, endLogId, endLogSeg);
5490 5491 5492 5493 5494 5495 5496

			if (XLogArchivingActive())
			{
				XLogFileName(xlogpath, endTLI, endLogId, endLogSeg);
				XLogArchiveNotify(xlogpath);
			}
		}
5497 5498 5499
	}

	/*
B
Bruce Momjian 已提交
5500 5501
	 * Let's just make real sure there are not .ready or .done flags posted
	 * for the new segment.
5502
	 */
5503 5504
	XLogFileName(xlogpath, ThisTimeLineID, endLogId, endLogSeg);
	XLogArchiveCleanup(xlogpath);
5505

5506
	/* Get rid of any remaining recovered timeline-history file, too */
5507
	snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
B
Bruce Momjian 已提交
5508
	unlink(recoveryPath);		/* ignore any error */
5509 5510

	/*
B
Bruce Momjian 已提交
5511 5512
	 * Rename the config file out of the way, so that we don't accidentally
	 * re-enter archive recovery mode in a subsequent crash.
5513
	 */
5514 5515
	unlink(RECOVERY_COMMAND_DONE);
	if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
5516 5517
		ereport(FATAL,
				(errcode_for_file_access(),
5518
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
5519
						RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530

	ereport(LOG,
			(errmsg("archive recovery complete")));
}

/*
 * For point-in-time recovery, this function decides whether we want to
 * stop applying the XLOG at or after the current record.
 *
 * Returns TRUE if we are stopping, FALSE otherwise.  On TRUE return,
 * *includeThis is set TRUE if we should apply this record before stopping.
 *
 * We also track the timestamp of the latest applied COMMIT/ABORT
 * record in XLogCtl->recoveryLastXTime, for logging purposes.
 * Also, some information is saved in recoveryStopXid et al for use in
 * annotating the new timeline's history file.
 *
 * Only transaction COMMIT/ABORT records and RESTORE POINT records are ever
 * considered; every other record yields FALSE without touching any state.
 */
static bool
recoveryStopsHere(XLogRecord *record, bool *includeThis)
{
	bool		stopsHere;
	uint8		record_info;
	TimestampTz recordXtime;
	char		recordRPName[MAXFNAMELEN];
	enum
	{
		STOP_REC_COMMIT,
		STOP_REC_ABORT,
		STOP_REC_RESTORE_POINT
	}			recordKind;

	/* Never leave the name buffer indeterminate for non-restore-point records */
	recordRPName[0] = '\0';

	/* We only consider stopping at COMMIT, ABORT or RESTORE POINT records */
	if (record->xl_rmid != RM_XACT_ID && record->xl_rmid != RM_XLOG_ID)
		return false;
	record_info = record->xl_info & ~XLR_INFO_MASK;

	/*
	 * The info code is only meaningful together with the resource manager
	 * that wrote the record, so each test is qualified by xl_rmid.  Without
	 * that, an XLOG-rmgr record whose info code happens to match a
	 * transaction code would be misread as a commit or abort.
	 */
	if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT)
	{
		xl_xact_commit *recordXactCommitData;

		recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
		recordXtime = recordXactCommitData->xact_time;
		recordKind = STOP_REC_COMMIT;
	}
	else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT)
	{
		xl_xact_abort *recordXactAbortData;

		recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
		recordXtime = recordXactAbortData->xact_time;
		recordKind = STOP_REC_ABORT;
	}
	else if (record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
	{
		xl_restore_point *recordRestorePointData;

		recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
		recordXtime = recordRestorePointData->rp_time;
		strncpy(recordRPName, recordRestorePointData->rp_name, MAXFNAMELEN);
		/* strncpy does not terminate when the source fills the buffer */
		recordRPName[MAXFNAMELEN - 1] = '\0';
		recordKind = STOP_REC_RESTORE_POINT;
	}
	else
		return false;

	/* Do we have a PITR target at all? */
	if (recoveryTarget == RECOVERY_TARGET_UNSET)
	{
		/*
		 * Save timestamp of latest transaction commit/abort if this is a
		 * transaction record
		 */
		if (record->xl_rmid == RM_XACT_ID)
			SetLatestXTime(recordXtime);
		return false;
	}

	if (recoveryTarget == RECOVERY_TARGET_XID)
	{
		/*
		 * There can be only one transaction end record with this exact
		 * transactionid
		 *
		 * when testing for an xid, we MUST test for equality only, since
		 * transactions are numbered in the order they start, not the order
		 * they complete. A higher numbered xid will complete before you about
		 * 50% of the time...
		 */
		stopsHere = (record->xl_xid == recoveryTargetXid);
		if (stopsHere)
			*includeThis = recoveryTargetInclusive;
	}
	else if (recoveryTarget == RECOVERY_TARGET_NAME)
	{
		/*
		 * There can be many restore points that share the same name, so we
		 * stop at the first one.  Only a restore point record carries a
		 * name, so nothing else can match.
		 */
		stopsHere = (recordKind == STOP_REC_RESTORE_POINT &&
					 strcmp(recordRPName, recoveryTargetName) == 0);

		/*
		 * Ignore recoveryTargetInclusive because this is not a transaction
		 * record
		 */
		*includeThis = false;
	}
	else
	{
		/*
		 * There can be many transactions that share the same commit time, so
		 * we stop after the last one, if we are inclusive, or stop at the
		 * first one if we are exclusive
		 */
		if (recoveryTargetInclusive)
			stopsHere = (recordXtime > recoveryTargetTime);
		else
			stopsHere = (recordXtime >= recoveryTargetTime);
		if (stopsHere)
			*includeThis = false;
	}

	if (stopsHere)
	{
		recoveryStopXid = record->xl_xid;
		recoveryStopTime = recordXtime;
		recoveryStopAfter = *includeThis;

		if (recordKind == STOP_REC_COMMIT)
		{
			if (recoveryStopAfter)
				ereport(LOG,
						(errmsg("recovery stopping after commit of transaction %u, time %s",
								recoveryStopXid,
								timestamptz_to_str(recoveryStopTime))));
			else
				ereport(LOG,
						(errmsg("recovery stopping before commit of transaction %u, time %s",
								recoveryStopXid,
								timestamptz_to_str(recoveryStopTime))));
		}
		else if (recordKind == STOP_REC_ABORT)
		{
			if (recoveryStopAfter)
				ereport(LOG,
						(errmsg("recovery stopping after abort of transaction %u, time %s",
								recoveryStopXid,
								timestamptz_to_str(recoveryStopTime))));
			else
				ereport(LOG,
						(errmsg("recovery stopping before abort of transaction %u, time %s",
								recoveryStopXid,
								timestamptz_to_str(recoveryStopTime))));
		}
		else
		{
			/* recordRPName is NUL-terminated, so the copy is terminated too */
			strncpy(recoveryStopName, recordRPName, MAXFNAMELEN);

			ereport(LOG,
					(errmsg("recovery stopping at restore point \"%s\", time %s",
							recoveryStopName,
							timestamptz_to_str(recoveryStopTime))));
		}

		/*
		 * Note that if we use a RECOVERY_TARGET_TIME then we can stop at a
		 * restore point since they are timestamped, though the latest
		 * transaction time is not updated.
		 */
		if (record->xl_rmid == RM_XACT_ID && recoveryStopAfter)
			SetLatestXTime(recordXtime);
	}
	else if (record->xl_rmid == RM_XACT_ID)
		SetLatestXTime(recordXtime);

	return stopsHere;
}

5686 5687 5688 5689 5690 5691 5692 5693
/*
 * Recheck shared recoveryPause by polling.
 *
 * XXX Can also be done with shared latch.
 */
static void
recoveryPausesHere(void)
{
5694 5695 5696 5697
	ereport(LOG,
			(errmsg("recovery has paused"),
			 errhint("Execute pg_xlog_replay_resume() to continue.")));

5698
	while (RecoveryIsPaused())
5699 5700 5701
	{
		pg_usleep(1000000L);		/* 1000 ms */
		HandleStartupProcInterrupts();
5702
	}
5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793
}

static bool
RecoveryIsPaused(void)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
	bool recoveryPause;

	SpinLockAcquire(&xlogctl->info_lck);
	recoveryPause = xlogctl->recoveryPause;
	SpinLockRelease(&xlogctl->info_lck);

	return recoveryPause;
}

/*
 * Store a new value in the shared recoveryPause flag under XLogCtl's
 * info_lck spinlock.
 */
static void
SetRecoveryPause(bool recoveryPause)
{
	/* volatile pointer keeps the access from being moved around the lock */
	volatile XLogCtlData *ctl = XLogCtl;

	SpinLockAcquire(&ctl->info_lck);
	ctl->recoveryPause = recoveryPause;
	SpinLockRelease(&ctl->info_lck);
}

/*
 * Common precondition check for the SQL-callable recovery control functions.
 *
 * Raises ERROR unless the caller is a superuser and recovery is currently in
 * progress.  The superuser test is made first, so a non-superuser always
 * gets the privilege error regardless of recovery state.
 */
static void
CheckRecoveryControlAllowed(void)
{
	if (!superuser())
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
				 (errmsg("must be superuser to control recovery"))));

	if (!RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is not in progress"),
				 errhint("Recovery control functions can only be executed during recovery.")));
}

/*
 * pg_xlog_replay_pause - pause recovery now
 *
 * Sets the shared recoveryPause flag; returns void.
 */
Datum
pg_xlog_replay_pause(PG_FUNCTION_ARGS)
{
	CheckRecoveryControlAllowed();

	SetRecoveryPause(true);

	PG_RETURN_VOID();
}

/*
 * pg_xlog_replay_resume - resume recovery now
 *
 * Clears the shared recoveryPause flag; returns void.
 */
Datum
pg_xlog_replay_resume(PG_FUNCTION_ARGS)
{
	CheckRecoveryControlAllowed();

	SetRecoveryPause(false);

	PG_RETURN_VOID();
}

/*
 * pg_is_xlog_replay_paused
 *
 * Returns the current value of the shared recoveryPause flag.  The same
 * superuser and in-recovery preconditions apply as for pause/resume.
 */
Datum
pg_is_xlog_replay_paused(PG_FUNCTION_ARGS)
{
	CheckRecoveryControlAllowed();

	PG_RETURN_BOOL(RecoveryIsPaused());
}

5794
/*
5795 5796 5797 5798 5799
 * Save timestamp of latest processed commit/abort record.
 *
 * We keep this in XLogCtl, not a simple static variable, so that it can be
 * seen by processes other than the startup process.  Note in particular
 * that CreateRestartPoint is executed in the bgwriter.
5800
 */
5801 5802
static void
SetLatestXTime(TimestampTz xtime)
5803
{
5804 5805 5806 5807 5808 5809
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

	SpinLockAcquire(&xlogctl->info_lck);
	xlogctl->recoveryLastXTime = xtime;
	SpinLockRelease(&xlogctl->info_lck);
5810 5811 5812
}

/*
5813
 * Fetch timestamp of latest processed commit/abort record.
5814
 */
5815
static TimestampTz
5816
GetLatestXTime(void)
5817 5818 5819
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
5820
	TimestampTz xtime;
5821 5822

	SpinLockAcquire(&xlogctl->info_lck);
5823
	xtime = xlogctl->recoveryLastXTime;
5824 5825
	SpinLockRelease(&xlogctl->info_lck);

5826 5827 5828
	return xtime;
}

5829 5830 5831 5832
/*
 * SQL-callable: timestamp of the latest processed commit/abort record.
 *
 * Yields SQL NULL when the stored timestamp is zero, which is the case when
 * the server has been started normally without recovery.
 */
Datum
pg_last_xact_replay_timestamp(PG_FUNCTION_ARGS)
{
	TimestampTz	latest = GetLatestXTime();

	if (latest != 0)
		PG_RETURN_TIMESTAMPTZ(latest);

	PG_RETURN_NULL();
}

5847 5848 5849 5850 5851 5852 5853
/*
 * Returns bool with current recovery mode, a global state.
 */
Datum
pg_is_in_recovery(PG_FUNCTION_ARGS)
{
	PG_RETURN_BOOL(RecoveryInProgress());
5854 5855
}

5856 5857 5858 5859 5860 5861 5862 5863
/*
 * Report when the current chunk of XLOG data was received and whether it
 * arrived via streaming replication (as opposed to any other source).
 *
 * *rtime receives XLogReceiptTime; *fromStream is set true only when
 * XLogReceiptSource is XLOG_FROM_STREAM.
 */
void
GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
{
	/*
	 * The state read here is not exported to shared memory, so this has to
	 * run in the startup process.
	 */
	Assert(InRecovery);

	*fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
	*rtime = XLogReceiptTime;
}

5873
/*
 * Raise ERROR if currValue is lower than minValue, naming the offending
 * parameter in the message.
 *
 * Note that text field supplied is a parameter name and does not require
 * translation
 *
 * Every argument is parenthesized in the expansion so that an expression
 * argument cannot re-associate with the "<" comparison.  currValue and
 * minValue are each evaluated twice when the check fails, so arguments
 * should be free of side effects.
 */
#define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
do { \
	if ((currValue) < (minValue)) \
		ereport(ERROR, \
				(errmsg("hot standby is not possible because " \
						"%s = %d is a lower setting than on the master server " \
						"(its value was %d)", \
						(param_name), \
						(currValue), \
						(minValue)))); \
} while(0)
5888 5889 5890 5891 5892 5893

/*
 * Check to see if required parameters are set high enough on this server
 * for various aspects of recovery operation.
 */
static void
5894
CheckRequiredParameterValues(void)
5895
{
5896
	/*
B
Bruce Momjian 已提交
5897 5898
	 * For archive recovery, the WAL must be generated with at least 'archive'
	 * wal_level.
5899 5900 5901 5902
	 */
	if (InArchiveRecovery && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
	{
		ereport(WARNING,
5903 5904
				(errmsg("WAL was generated with wal_level=minimal, data may be missing"),
				 errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
5905
	}
5906

5907
	/*
B
Bruce Momjian 已提交
5908 5909
	 * For Hot Standby, the WAL must be generated with 'hot_standby' mode, and
	 * we must have at least as many backend slots as the primary.
5910
	 */
5911
	if (InArchiveRecovery && EnableHotStandby)
5912 5913 5914
	{
		if (ControlFile->wal_level < WAL_LEVEL_HOT_STANDBY)
			ereport(ERROR,
5915 5916
					(errmsg("hot standby is not possible because wal_level was not set to \"hot_standby\" on the master server"),
					 errhint("Either set wal_level to \"hot_standby\" on the master, or turn off hot_standby here.")));
5917

5918 5919
		/* We ignore autovacuum_max_workers when we make this test. */
		RecoveryRequiresIntParameter("max_connections",
5920 5921
									 MaxConnections,
									 ControlFile->MaxConnections);
5922
		RecoveryRequiresIntParameter("max_prepared_xacts",
5923 5924
									 max_prepared_xacts,
									 ControlFile->max_prepared_xacts);
5925
		RecoveryRequiresIntParameter("max_locks_per_xact",
5926 5927
									 max_locks_per_xact,
									 ControlFile->max_locks_per_xact);
5928
	}
5929 5930
}

5931
/*
T
Tom Lane 已提交
5932
 * This must be called ONCE during postmaster or standalone-backend startup
5933 5934
 */
void
T
Tom Lane 已提交
5935
StartupXLOG(void)
5936
{
5937 5938
	XLogCtlInsert *Insert;
	CheckPoint	checkPoint;
T
Tom Lane 已提交
5939
	bool		wasShutdown;
5940
	bool		reachedStopPoint = false;
5941
	bool		haveBackupLabel = false;
5942
	XLogRecPtr	RecPtr,
T
Tom Lane 已提交
5943 5944
				checkPointLoc,
				EndOfLog;
5945 5946
	uint32		endLogId;
	uint32		endLogSeg;
5947
	XLogRecord *record;
5948
	uint32		freespace;
5949
	TransactionId oldestActiveXID;
5950

5951
	/*
5952 5953
	 * Read control file and check XLOG status looks valid.
	 *
5954 5955
	 * Note: in most control paths, *ControlFile is already valid and we need
	 * not do ReadControlFile() here, but might as well do it to be sure.
5956
	 */
5957
	ReadControlFile();
5958

5959
	if (ControlFile->state < DB_SHUTDOWNED ||
5960
		ControlFile->state > DB_IN_PRODUCTION ||
5961
		!XRecOffIsValid(ControlFile->checkPoint.xrecoff))
5962 5963
		ereport(FATAL,
				(errmsg("control file contains invalid data")));
5964 5965

	if (ControlFile->state == DB_SHUTDOWNED)
5966 5967 5968
		ereport(LOG,
				(errmsg("database system was shut down at %s",
						str_time(ControlFile->time))));
5969 5970 5971 5972
	else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
		ereport(LOG,
				(errmsg("database system was shut down in recovery at %s",
						str_time(ControlFile->time))));
5973
	else if (ControlFile->state == DB_SHUTDOWNING)
5974
		ereport(LOG,
5975
				(errmsg("database system shutdown was interrupted; last known up at %s",
5976
						str_time(ControlFile->time))));
5977
	else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
5978
		ereport(LOG,
B
Bruce Momjian 已提交
5979 5980 5981 5982
		   (errmsg("database system was interrupted while in recovery at %s",
				   str_time(ControlFile->time)),
			errhint("This probably means that some data is corrupted and"
					" you will have to use the last backup for recovery.")));
5983 5984
	else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
		ereport(LOG,
B
Bruce Momjian 已提交
5985 5986
				(errmsg("database system was interrupted while in recovery at log time %s",
						str_time(ControlFile->checkPointCopy.time)),
5987
				 errhint("If this has occurred more than once some data might be corrupted"
B
Bruce Momjian 已提交
5988
			  " and you might need to choose an earlier recovery target.")));
5989
	else if (ControlFile->state == DB_IN_PRODUCTION)
5990
		ereport(LOG,
B
Bruce Momjian 已提交
5991 5992
			  (errmsg("database system was interrupted; last known up at %s",
					  str_time(ControlFile->time))));
5993

5994 5995
	/* This is just to allow attaching to startup process with a debugger */
#ifdef XLOG_REPLAY_DELAY
5996
	if (ControlFile->state != DB_SHUTDOWNED)
5997
		pg_usleep(60000000L);
5998 5999
#endif

6000 6001
	/*
	 * Verify that pg_xlog and pg_xlog/archive_status exist.  In cases where
6002 6003
	 * someone has performed a copy for PITR, these directories may have been
	 * excluded and need to be re-created.
6004 6005 6006
	 */
	ValidateXLOGDirectoryStructure();

6007
	/*
B
Bruce Momjian 已提交
6008 6009 6010 6011 6012 6013
	 * Clear out any old relcache cache files.	This is *necessary* if we do
	 * any WAL replay, since that would probably result in the cache files
	 * being out of sync with database reality.  In theory we could leave them
	 * in place if the database had been cleanly shut down, but it seems
	 * safest to just remove them always and let them be rebuilt during the
	 * first backend startup.
6014 6015 6016
	 */
	RelationCacheInitFileRemove();

6017
	/*
B
Bruce Momjian 已提交
6018 6019
	 * Initialize on the assumption we want to recover to the same timeline
	 * that's active according to pg_control.
6020 6021 6022
	 */
	recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;

6023
	/*
B
Bruce Momjian 已提交
6024 6025
	 * Check for recovery control file, and if so set up state for offline
	 * recovery
6026 6027 6028
	 */
	readRecoveryCommandFile();

6029 6030 6031
	/* Now we can determine the list of expected TLIs */
	expectedTLIs = readTimeLineHistory(recoveryTargetTLI);

6032 6033 6034 6035 6036 6037
	/*
	 * If pg_control's timeline is not in expectedTLIs, then we cannot
	 * proceed: the backup is not part of the history of the requested
	 * timeline.
	 */
	if (!list_member_int(expectedTLIs,
B
Bruce Momjian 已提交
6038
						 (int) ControlFile->checkPointCopy.ThisTimeLineID))
6039 6040 6041 6042 6043
		ereport(FATAL,
				(errmsg("requested timeline %u is not a child of database system timeline %u",
						recoveryTargetTLI,
						ControlFile->checkPointCopy.ThisTimeLineID)));

6044
	/*
B
Bruce Momjian 已提交
6045 6046 6047
	 * Save the selected recovery target timeline ID and
	 * archive_cleanup_command in shared memory so that other processes can
	 * see them
6048
	 */
6049
	XLogCtl->RecoveryTargetTLI = recoveryTargetTLI;
6050 6051 6052
	strncpy(XLogCtl->archiveCleanupCommand,
			archiveCleanupCommand ? archiveCleanupCommand : "",
			sizeof(XLogCtl->archiveCleanupCommand));
6053

6054 6055 6056 6057 6058 6059 6060
	if (InArchiveRecovery)
	{
		if (StandbyMode)
			ereport(LOG,
					(errmsg("entering standby mode")));
		else if (recoveryTarget == RECOVERY_TARGET_XID)
			ereport(LOG,
B
Bruce Momjian 已提交
6061 6062
					(errmsg("starting point-in-time recovery to XID %u",
							recoveryTargetXid)));
6063 6064 6065 6066
		else if (recoveryTarget == RECOVERY_TARGET_TIME)
			ereport(LOG,
					(errmsg("starting point-in-time recovery to %s",
							timestamptz_to_str(recoveryTargetTime))));
6067 6068 6069 6070
		else if (recoveryTarget == RECOVERY_TARGET_NAME)
			ereport(LOG,
					(errmsg("starting point-in-time recovery to \"%s\"",
							recoveryTargetName)));
6071 6072 6073 6074 6075
		else
			ereport(LOG,
					(errmsg("starting archive recovery")));
	}

6076
	/*
6077
	 * Take ownership of the wakeup latch if we're going to sleep during
6078 6079 6080 6081 6082
	 * recovery.
	 */
	if (StandbyMode)
		OwnLatch(&XLogCtl->recoveryWakeupLatch);

6083
	if (read_backup_label(&checkPointLoc))
T
Tom Lane 已提交
6084
	{
6085
		/*
B
Bruce Momjian 已提交
6086 6087
		 * When a backup_label file is present, we want to roll forward from
		 * the checkpoint it identifies, rather than using pg_control.
6088
		 */
6089
		record = ReadCheckpointRecord(checkPointLoc, 0);
6090 6091
		if (record != NULL)
		{
6092
			memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6093
			wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
6094
			ereport(DEBUG1,
6095
					(errmsg("checkpoint record is at %X/%X",
B
Bruce Momjian 已提交
6096
							checkPointLoc.xlogid, checkPointLoc.xrecoff)));
6097
			InRecovery = true;	/* force recovery even if SHUTDOWNED */
6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111

			/*
			 * Make sure that REDO location exists. This may not be
			 * the case if there was a crash during an online backup,
			 * which left a backup_label around that references a WAL
			 * segment that's already been archived.
			 */
			if (XLByteLT(checkPoint.redo, checkPointLoc))
			{
				if (!ReadRecord(&(checkPoint.redo), LOG, false))
					ereport(FATAL,
							(errmsg("could not find redo location referenced by checkpoint record"),
							 errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
			}
6112 6113 6114
		}
		else
		{
6115
			ereport(FATAL,
B
Bruce Momjian 已提交
6116 6117
					(errmsg("could not locate required checkpoint record"),
					 errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6118
			wasShutdown = false; /* keep compiler quiet */
6119
		}
6120 6121
		/* set flag to delete it later */
		haveBackupLabel = true;
T
Tom Lane 已提交
6122 6123 6124
	}
	else
	{
6125
		/*
B
Bruce Momjian 已提交
6126 6127
		 * Get the last valid checkpoint record.  If the latest one according
		 * to pg_control is broken, try the next-to-last one.
6128 6129
		 */
		checkPointLoc = ControlFile->checkPoint;
6130
		RedoStartLSN = ControlFile->checkPointCopy.redo;
6131
		record = ReadCheckpointRecord(checkPointLoc, 1);
T
Tom Lane 已提交
6132 6133
		if (record != NULL)
		{
6134
			ereport(DEBUG1,
6135
					(errmsg("checkpoint record is at %X/%X",
B
Bruce Momjian 已提交
6136
							checkPointLoc.xlogid, checkPointLoc.xrecoff)));
T
Tom Lane 已提交
6137
		}
6138
		else if (StandbyMode)
6139 6140 6141 6142 6143 6144 6145 6146
		{
			/*
			 * The last valid checkpoint record required for a streaming
			 * recovery exists in neither standby nor the primary.
			 */
			ereport(PANIC,
					(errmsg("could not locate a valid checkpoint record")));
		}
T
Tom Lane 已提交
6147
		else
6148 6149
		{
			checkPointLoc = ControlFile->prevCheckPoint;
6150
			record = ReadCheckpointRecord(checkPointLoc, 2);
6151 6152 6153
			if (record != NULL)
			{
				ereport(LOG,
B
Bruce Momjian 已提交
6154 6155 6156
						(errmsg("using previous checkpoint record at %X/%X",
							  checkPointLoc.xlogid, checkPointLoc.xrecoff)));
				InRecovery = true;		/* force recovery even if SHUTDOWNED */
6157 6158 6159
			}
			else
				ereport(PANIC,
B
Bruce Momjian 已提交
6160
					 (errmsg("could not locate a valid checkpoint record")));
6161
		}
6162
		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6163
		wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
T
Tom Lane 已提交
6164
	}
6165

T
Tom Lane 已提交
6166
	LastRec = RecPtr = checkPointLoc;
6167

6168
	ereport(DEBUG1,
B
Bruce Momjian 已提交
6169 6170 6171
			(errmsg("redo record is at %X/%X; shutdown %s",
					checkPoint.redo.xlogid, checkPoint.redo.xrecoff,
					wasShutdown ? "TRUE" : "FALSE")));
6172
	ereport(DEBUG1,
6173 6174 6175
			(errmsg("next transaction ID: %u/%u; next OID: %u",
					checkPoint.nextXidEpoch, checkPoint.nextXid,
					checkPoint.nextOid)));
6176
	ereport(DEBUG1,
6177 6178
			(errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
					checkPoint.nextMulti, checkPoint.nextMultiOffset)));
6179 6180 6181
	ereport(DEBUG1,
			(errmsg("oldest unfrozen transaction ID: %u, in database %u",
					checkPoint.oldestXid, checkPoint.oldestXidDB)));
6182
	if (!TransactionIdIsNormal(checkPoint.nextXid))
6183
		ereport(PANIC,
6184
				(errmsg("invalid next transaction ID")));
6185 6186 6187

	ShmemVariableCache->nextXid = checkPoint.nextXid;
	ShmemVariableCache->nextOid = checkPoint.nextOid;
6188
	ShmemVariableCache->oidCount = 0;
6189
	MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
6190
	SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
6191

6192
	/*
B
Bruce Momjian 已提交
6193 6194 6195
	 * We must replay WAL entries using the same TimeLineID they were created
	 * under, so temporarily adopt the TLI indicated by the checkpoint (see
	 * also xlog_redo()).
6196
	 */
6197
	ThisTimeLineID = checkPoint.ThisTimeLineID;
6198

6199
	RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
V
WAL  
Vadim B. Mikheev 已提交
6200

6201
	if (XLByteLT(RecPtr, checkPoint.redo))
6202 6203
		ereport(PANIC,
				(errmsg("invalid redo in checkpoint record")));
6204

6205
	/*
B
Bruce Momjian 已提交
6206
	 * Check whether we need to force recovery from WAL.  If it appears to
B
Bruce Momjian 已提交
6207 6208
	 * have been a clean shutdown and we did not have a recovery.conf file,
	 * then assume no recovery needed.
6209
	 */
6210
	if (XLByteLT(checkPoint.redo, RecPtr))
6211
	{
T
Tom Lane 已提交
6212
		if (wasShutdown)
6213
			ereport(PANIC,
B
Bruce Momjian 已提交
6214
					(errmsg("invalid redo record in shutdown checkpoint")));
V
WAL  
Vadim B. Mikheev 已提交
6215
		InRecovery = true;
6216 6217
	}
	else if (ControlFile->state != DB_SHUTDOWNED)
V
WAL  
Vadim B. Mikheev 已提交
6218
		InRecovery = true;
6219 6220 6221 6222 6223
	else if (InArchiveRecovery)
	{
		/* force recovery due to presence of recovery.conf */
		InRecovery = true;
	}
6224

V
WAL  
Vadim B. Mikheev 已提交
6225
	/* REDO */
6226
	if (InRecovery)
6227
	{
B
Bruce Momjian 已提交
6228
		int			rmid;
B
Bruce Momjian 已提交
6229

6230 6231
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;
6232

6233
		/*
B
Bruce Momjian 已提交
6234 6235 6236 6237
		 * Update pg_control to show that we are recovering and to show the
		 * selected checkpoint as the place we are starting from. We also mark
		 * pg_control with any minimum recovery stop point obtained from a
		 * backup history file.
6238
		 */
6239
		if (InArchiveRecovery)
6240
			ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
6241
		else
6242
		{
6243
			ereport(LOG,
6244 6245
					(errmsg("database system was not properly shut down; "
							"automatic recovery in progress")));
6246 6247 6248 6249 6250
			ControlFile->state = DB_IN_CRASH_RECOVERY;
		}
		ControlFile->prevCheckPoint = ControlFile->checkPoint;
		ControlFile->checkPoint = checkPointLoc;
		ControlFile->checkPointCopy = checkPoint;
6251 6252 6253 6254 6255 6256
		if (InArchiveRecovery)
		{
			/* initialize minRecoveryPoint if not set yet */
			if (XLByteLT(ControlFile->minRecoveryPoint, checkPoint.redo))
				ControlFile->minRecoveryPoint = checkPoint.redo;
		}
B
Bruce Momjian 已提交
6257

6258
		/*
6259
		 * set backupStartPoint if we're starting recovery from a base backup
6260 6261 6262
		 */
		if (haveBackupLabel)
			ControlFile->backupStartPoint = checkPoint.redo;
6263
		ControlFile->time = (pg_time_t) time(NULL);
6264
		/* No need to hold ControlFileLock yet, we aren't up far enough */
6265 6266
		UpdateControlFile();

6267
		/* initialize our local copy of minRecoveryPoint */
6268 6269 6270 6271 6272 6273 6274
		minRecoveryPoint = ControlFile->minRecoveryPoint;

		/*
		 * Reset pgstat data, because it may be invalid after recovery.
		 */
		pgstat_reset_all();

6275
		/*
B
Bruce Momjian 已提交
6276 6277 6278 6279 6280 6281
		 * If there was a backup label file, it's done its job and the info
		 * has now been propagated into pg_control.  We must get rid of the
		 * label file so that if we crash during recovery, we'll pick up at
		 * the latest recovery restartpoint instead of going all the way back
		 * to the backup start point.  It seems prudent though to just rename
		 * the file out of the way rather than delete it completely.
6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292
		 */
		if (haveBackupLabel)
		{
			unlink(BACKUP_LABEL_OLD);
			if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
				ereport(FATAL,
						(errcode_for_file_access(),
						 errmsg("could not rename file \"%s\" to \"%s\": %m",
								BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
		}

6293 6294 6295
		/* Check that the GUCs used to generate the WAL allow recovery */
		CheckRequiredParameterValues();

R
Robert Haas 已提交
6296 6297 6298 6299 6300 6301 6302 6303
		/*
		 * We're in recovery, so unlogged relations may be trashed
		 * and must be reset.  This should be done BEFORE allowing Hot
		 * Standby connections, so that read-only backends don't try to
		 * read whatever garbage is left over from before.
		 */
		ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);

6304
		/*
B
Bruce Momjian 已提交
6305 6306
		 * Initialize for Hot Standby, if enabled. We won't let backends in
		 * yet, not until we've reached the min recovery point specified in
B
Bruce Momjian 已提交
6307
		 * control file and we've established a recovery snapshot from a
6308 6309
		 * running-xacts WAL record.
		 */
6310
		if (InArchiveRecovery && EnableHotStandby)
6311 6312
		{
			TransactionId *xids;
B
Bruce Momjian 已提交
6313
			int			nxids;
6314

6315
			ereport(DEBUG1,
6316
					(errmsg("initializing for hot standby")));
6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330

			InitRecoveryTransactionEnvironment();

			if (wasShutdown)
				oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
			else
				oldestActiveXID = checkPoint.oldestActiveXid;
			Assert(TransactionIdIsValid(oldestActiveXID));

			/* Startup commit log and related stuff */
			StartupCLOG();
			StartupSUBTRANS(oldestActiveXID);
			StartupMultiXact();

6331 6332
			/*
			 * If we're beginning at a shutdown checkpoint, we know that
B
Bruce Momjian 已提交
6333 6334 6335
			 * nothing was running on the master at this point. So fake-up an
			 * empty running-xacts record and use that here and now. Recover
			 * additional standby state for prepared transactions.
6336 6337 6338 6339
			 */
			if (wasShutdown)
			{
				RunningTransactionsData running;
6340
				TransactionId latestCompletedXid;
6341 6342

				/*
B
Bruce Momjian 已提交
6343 6344 6345 6346
				 * Construct a RunningTransactions snapshot representing a
				 * shut down server, with only prepared transactions still
				 * alive. We're never overflowed at this point because all
				 * subxids are listed with their parent prepared transactions.
6347 6348 6349 6350 6351
				 */
				running.xcnt = nxids;
				running.subxid_overflow = false;
				running.nextXid = checkPoint.nextXid;
				running.oldestRunningXid = oldestActiveXID;
6352 6353
				latestCompletedXid = checkPoint.nextXid;
				TransactionIdRetreat(latestCompletedXid);
6354
				Assert(TransactionIdIsNormal(latestCompletedXid));
6355
				running.latestCompletedXid = latestCompletedXid;
6356 6357 6358 6359 6360 6361
				running.xids = xids;

				ProcArrayApplyRecoveryInfo(&running);

				StandbyRecoverPreparedTransactions(false);
			}
6362 6363
		}

6364
		/* Initialize resource managers */
6365 6366 6367 6368 6369 6370
		for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
		{
			if (RmgrTable[rmid].rm_startup != NULL)
				RmgrTable[rmid].rm_startup();
		}

6371
		/*
6372 6373
		 * Initialize shared replayEndRecPtr, recoveryLastRecPtr, and
		 * recoveryLastXTime.
6374 6375
		 *
		 * This is slightly confusing if we're starting from an online
B
Bruce Momjian 已提交
6376 6377 6378 6379 6380 6381
		 * checkpoint; we've just read and replayed the checkpoint record, but
		 * we're going to start replay from its redo pointer, which precedes
		 * the location of the checkpoint record itself. So even though the
		 * last record we've replayed is indeed ReadRecPtr, we haven't
		 * replayed all the preceding records yet. That's OK for the current
		 * use of these variables.
6382 6383 6384 6385
		 */
		SpinLockAcquire(&xlogctl->info_lck);
		xlogctl->replayEndRecPtr = ReadRecPtr;
		xlogctl->recoveryLastRecPtr = ReadRecPtr;
6386
		xlogctl->recoveryLastXTime = 0;
6387
		xlogctl->recoveryPause = false;
6388 6389
		SpinLockRelease(&xlogctl->info_lck);

6390 6391 6392
		/* Also ensure XLogReceiptTime has a sane value */
		XLogReceiptTime = GetCurrentTimestamp();

6393
		/*
B
Bruce Momjian 已提交
6394 6395 6396 6397 6398
		 * Let postmaster know we've started redo now, so that it can launch
		 * bgwriter to perform restartpoints.  We don't bother during crash
		 * recovery as restartpoints can only be performed during archive
		 * recovery.  And we'd like to keep crash recovery simple, to avoid
		 * introducing bugs that could affect you when recovering after crash.
6399 6400 6401 6402 6403 6404 6405
		 *
		 * After this point, we can no longer assume that we're the only
		 * process in addition to postmaster!  Also, fsync requests are
		 * subsequently to be handled by the bgwriter, not locally.
		 */
		if (InArchiveRecovery && IsUnderPostmaster)
		{
6406
			PublishStartupProcessInformation();
6407 6408 6409 6410 6411 6412
			SetForwardFsyncRequests();
			SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
			bgwriterLaunched = true;
		}

		/*
B
Bruce Momjian 已提交
6413 6414
		 * Allow read-only connections immediately if we're consistent
		 * already.
6415 6416 6417
		 */
		CheckRecoveryConsistency();

6418
		/*
B
Bruce Momjian 已提交
6419 6420
		 * Find the first record that logically follows the checkpoint --- it
		 * might physically precede it, though.
6421
		 */
6422
		if (XLByteLT(checkPoint.redo, RecPtr))
6423 6424
		{
			/* back up to find the record */
6425
			record = ReadRecord(&(checkPoint.redo), PANIC, false);
6426
		}
B
Bruce Momjian 已提交
6427
		else
6428
		{
6429
			/* just have to read next record after CheckPoint */
6430
			record = ReadRecord(NULL, LOG, false);
6431
		}
6432

T
Tom Lane 已提交
6433
		if (record != NULL)
6434
		{
6435 6436
			bool		recoveryContinue = true;
			bool		recoveryApply = true;
6437
			bool		recoveryPause = false;
B
Bruce Momjian 已提交
6438
			ErrorContextCallback errcontext;
6439
			TimestampTz xtime;
6440

V
WAL  
Vadim B. Mikheev 已提交
6441
			InRedo = true;
6442

6443 6444 6445
			ereport(LOG,
					(errmsg("redo starts at %X/%X",
							ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
6446

6447 6448 6449
			/*
			 * main redo apply loop
			 */
6450 6451
			do
			{
6452
#ifdef WAL_DEBUG
6453
				if (XLOG_DEBUG ||
B
Bruce Momjian 已提交
6454
				 (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
6455
					(rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
V
WAL  
Vadim B. Mikheev 已提交
6456
				{
B
Bruce Momjian 已提交
6457
					StringInfoData buf;
V
WAL  
Vadim B. Mikheev 已提交
6458

6459 6460
					initStringInfo(&buf);
					appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
B
Bruce Momjian 已提交
6461 6462
									 ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
									 EndRecPtr.xlogid, EndRecPtr.xrecoff);
6463 6464 6465 6466
					xlog_outrec(&buf, record);
					appendStringInfo(&buf, " - ");
					RmgrTable[record->xl_rmid].rm_desc(&buf,
													   record->xl_info,
B
Bruce Momjian 已提交
6467
													 XLogRecGetData(record));
6468 6469
					elog(LOG, "%s", buf.data);
					pfree(buf.data);
V
WAL  
Vadim B. Mikheev 已提交
6470
				}
6471
#endif
V
WAL  
Vadim B. Mikheev 已提交
6472

6473 6474
				/* Handle interrupt signals of startup process */
				HandleStartupProcInterrupts();
6475

6476 6477
				/* Allow read-only connections if we're consistent now */
				CheckRecoveryConsistency();
6478

6479 6480 6481 6482 6483
				/*
				 * Have we reached our recovery target?
				 */
				if (recoveryStopsHere(record, &recoveryApply))
				{
6484 6485 6486 6487
					/*
					 * Pause only if users can connect to send a resume message
					 */
					if (recoveryPauseAtTarget && standbyState == STANDBY_SNAPSHOT_READY)
6488 6489 6490 6491
					{
						SetRecoveryPause(true);
						recoveryPausesHere();
					}
B
Bruce Momjian 已提交
6492
					reachedStopPoint = true;	/* see below */
6493 6494 6495 6496 6497
					recoveryContinue = false;
					if (!recoveryApply)
						break;
				}

6498 6499 6500 6501 6502 6503
				/* Setup error traceback support for ereport() */
				errcontext.callback = rm_redo_error_callback;
				errcontext.arg = (void *) record;
				errcontext.previous = error_context_stack;
				error_context_stack = &errcontext;

6504 6505
				/* nextXid must be beyond record's xid */
				if (TransactionIdFollowsOrEquals(record->xl_xid,
B
Bruce Momjian 已提交
6506
												 ShmemVariableCache->nextXid))
6507 6508 6509 6510 6511
				{
					ShmemVariableCache->nextXid = record->xl_xid;
					TransactionIdAdvance(ShmemVariableCache->nextXid);
				}

6512
				/*
6513 6514
				 * Update shared replayEndRecPtr before replaying this record,
				 * so that XLogFlush will update minRecoveryPoint correctly.
6515 6516 6517
				 */
				SpinLockAcquire(&xlogctl->info_lck);
				xlogctl->replayEndRecPtr = EndRecPtr;
6518
				recoveryPause = xlogctl->recoveryPause;
6519 6520
				SpinLockRelease(&xlogctl->info_lck);

6521 6522 6523 6524
				/*
				 * Pause only if users can connect to send a resume message
				 */
				if (recoveryPause && standbyState == STANDBY_SNAPSHOT_READY)
6525 6526
					recoveryPausesHere();

B
Bruce Momjian 已提交
6527 6528 6529 6530
				/*
				 * If we are attempting to enter Hot Standby mode, process
				 * XIDs we see
				 */
6531 6532
				if (standbyState >= STANDBY_INITIALIZED &&
					TransactionIdIsValid(record->xl_xid))
6533 6534
					RecordKnownAssignedTransactionIds(record->xl_xid);

6535
				RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
6536

6537 6538 6539
				/* Pop the error context stack */
				error_context_stack = errcontext.previous;

6540 6541 6542 6543 6544 6545 6546 6547
				/*
				 * Update shared recoveryLastRecPtr after this record has been
				 * replayed.
				 */
				SpinLockAcquire(&xlogctl->info_lck);
				xlogctl->recoveryLastRecPtr = EndRecPtr;
				SpinLockRelease(&xlogctl->info_lck);

6548 6549
				LastRec = ReadRecPtr;

6550
				record = ReadRecord(NULL, LOG, false);
6551
			} while (record != NULL && recoveryContinue);
B
Bruce Momjian 已提交
6552

6553 6554 6555 6556
			/*
			 * end of main redo apply loop
			 */

6557 6558 6559
			ereport(LOG,
					(errmsg("redo done at %X/%X",
							ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
6560 6561
			xtime = GetLatestXTime();
			if (xtime)
6562
				ereport(LOG,
B
Bruce Momjian 已提交
6563
					 (errmsg("last completed transaction was at log time %s",
6564
							 timestamptz_to_str(xtime))));
V
WAL  
Vadim B. Mikheev 已提交
6565
			InRedo = false;
6566 6567
		}
		else
6568 6569
		{
			/* there are no WAL records following the checkpoint */
6570 6571
			ereport(LOG,
					(errmsg("redo is not required")));
6572
		}
V
WAL  
Vadim B. Mikheev 已提交
6573 6574
	}

6575 6576 6577 6578 6579 6580 6581 6582
	/*
	 * If we launched a WAL receiver, it should be gone by now. It will trump
	 * over the startup checkpoint and subsequent records if it's still alive,
	 * so be extra sure that it's gone.
	 */
	if (WalRcvInProgress())
		elog(PANIC, "wal receiver still active");

6583 6584
	/*
	 * We don't need the latch anymore. It's not strictly necessary to disown
6585
	 * it, but let's do it for the sake of tidiness.
6586 6587 6588 6589
	 */
	if (StandbyMode)
		DisownLatch(&XLogCtl->recoveryWakeupLatch);

6590 6591
	/*
	 * We are now done reading the xlog from stream. Turn off streaming
B
Bruce Momjian 已提交
6592 6593
	 * recovery to force fetching the files (which would be required at end of
	 * recovery, e.g., timeline history file) from archive or pg_xlog.
6594
	 */
6595
	StandbyMode = false;
6596

T
Tom Lane 已提交
6597
	/*
B
Bruce Momjian 已提交
6598 6599
	 * Re-fetch the last valid or last applied record, so we can identify the
	 * exact endpoint of what we consider the valid portion of WAL.
T
Tom Lane 已提交
6600
	 */
6601
	record = ReadRecord(&LastRec, PANIC, false);
T
Tom Lane 已提交
6602
	EndOfLog = EndRecPtr;
6603 6604
	XLByteToPrevSeg(EndOfLog, endLogId, endLogSeg);

6605 6606
	/*
	 * Complain if we did not roll forward far enough to render the backup
6607 6608 6609 6610
	 * dump consistent.  Note: it is indeed okay to look at the local variable
	 * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
	 * be further ahead --- ControlFile->minRecoveryPoint cannot have been
	 * advanced beyond the WAL we processed.
6611
	 */
6612
	if (InRecovery &&
6613 6614
		(XLByteLT(EndOfLog, minRecoveryPoint) ||
		 !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
6615
	{
6616
		if (reachedStopPoint)	/* stopped because of stop request */
6617
			ereport(FATAL,
6618
					(errmsg("requested recovery stop point is before consistent recovery point")));
B
Bruce Momjian 已提交
6619
		else	/* ran off end of WAL */
6620
			ereport(FATAL,
6621
					(errmsg("WAL ends before consistent recovery point")));
6622 6623
	}

6624 6625 6626
	/*
	 * Consider whether we need to assign a new timeline ID.
	 *
B
Bruce Momjian 已提交
6627 6628
	 * If we are doing an archive recovery, we always assign a new ID.	This
	 * handles a couple of issues.	If we stopped short of the end of WAL
6629 6630
	 * during recovery, then we are clearly generating a new timeline and must
	 * assign it a unique new ID.  Even if we ran to the end, modifying the
B
Bruce Momjian 已提交
6631 6632
	 * current last segment is problematic because it may result in trying to
	 * overwrite an already-archived copy of that segment, and we encourage
6633 6634 6635 6636
	 * DBAs to make their archive_commands reject that.  We can dodge the
	 * problem by making the new active segment have a new timeline ID.
	 *
	 * In a normal crash recovery, we can just extend the timeline we were in.
6637
	 */
6638
	if (InArchiveRecovery)
6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649
	{
		ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
		ereport(LOG,
				(errmsg("selected new timeline ID: %u", ThisTimeLineID)));
		writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
							 curFileTLI, endLogId, endLogSeg);
	}

	/* Save the selected TimeLineID in shared memory, too */
	XLogCtl->ThisTimeLineID = ThisTimeLineID;

6650
	/*
B
Bruce Momjian 已提交
6651 6652 6653 6654
	 * We are now done reading the old WAL.  Turn off archive fetching if it
	 * was active, and make a writable copy of the last WAL segment. (Note
	 * that we also have a copy of the last block of the old WAL in readBuf;
	 * we will use that below.)
6655 6656
	 */
	if (InArchiveRecovery)
6657
		exitArchiveRecovery(curFileTLI, endLogId, endLogSeg);
6658 6659 6660 6661 6662 6663 6664 6665

	/*
	 * Prepare to write WAL starting at EndOfLog position, and init xlog
	 * buffer cache using the block containing the last record from the
	 * previous incarnation.
	 */
	openLogId = endLogId;
	openLogSeg = endLogSeg;
6666
	openLogFile = XLogFileOpen(openLogId, openLogSeg);
T
Tom Lane 已提交
6667
	openLogOff = 0;
V
WAL  
Vadim B. Mikheev 已提交
6668
	Insert = &XLogCtl->Insert;
6669
	Insert->PrevRecord = LastRec;
6670 6671
	XLogCtl->xlblocks[0].xlogid = openLogId;
	XLogCtl->xlblocks[0].xrecoff =
6672
		((EndOfLog.xrecoff - 1) / XLOG_BLCKSZ + 1) * XLOG_BLCKSZ;
B
Bruce Momjian 已提交
6673 6674

	/*
B
Bruce Momjian 已提交
6675 6676 6677
	 * Tricky point here: readBuf contains the *last* block that the LastRec
	 * record spans, not the one it starts in.	The last block is indeed the
	 * one we want to use.
T
Tom Lane 已提交
6678
	 */
6679 6680
	Assert(readOff == (XLogCtl->xlblocks[0].xrecoff - XLOG_BLCKSZ) % XLogSegSize);
	memcpy((char *) Insert->currpage, readBuf, XLOG_BLCKSZ);
6681
	Insert->currpos = (char *) Insert->currpage +
6682
		(EndOfLog.xrecoff + XLOG_BLCKSZ - XLogCtl->xlblocks[0].xrecoff);
V
WAL  
Vadim B. Mikheev 已提交
6683

T
Tom Lane 已提交
6684
	LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
V
WAL  
Vadim B. Mikheev 已提交
6685

T
Tom Lane 已提交
6686 6687 6688
	XLogCtl->Write.LogwrtResult = LogwrtResult;
	Insert->LogwrtResult = LogwrtResult;
	XLogCtl->LogwrtResult = LogwrtResult;
V
WAL  
Vadim B. Mikheev 已提交
6689

T
Tom Lane 已提交
6690 6691
	XLogCtl->LogwrtRqst.Write = EndOfLog;
	XLogCtl->LogwrtRqst.Flush = EndOfLog;
6692

6693 6694 6695 6696 6697 6698 6699 6700 6701 6702
	freespace = INSERT_FREESPACE(Insert);
	if (freespace > 0)
	{
		/* Make sure rest of page is zero */
		MemSet(Insert->currpos, 0, freespace);
		XLogCtl->Write.curridx = 0;
	}
	else
	{
		/*
B
Bruce Momjian 已提交
6703 6704
		 * Whenever Write.LogwrtResult points to exactly the end of a page,
		 * Write.curridx must point to the *next* page (see XLogWrite()).
6705
		 *
B
Bruce Momjian 已提交
6706
		 * Note: it might seem we should do AdvanceXLInsertBuffer() here, but
B
Bruce Momjian 已提交
6707
		 * this is sufficient.	The first actual attempt to insert a log
6708
		 * record will advance the insert state.
6709 6710 6711 6712
		 */
		XLogCtl->Write.curridx = NextBufIdx(0);
	}

6713
	/* Pre-scan prepared transactions to find out the range of XIDs present */
6714
	oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
6715

V
WAL  
Vadim B. Mikheev 已提交
6716
	if (InRecovery)
6717
	{
B
Bruce Momjian 已提交
6718
		int			rmid;
6719

6720 6721 6722 6723 6724 6725 6726
		/*
		 * Resource managers might need to write WAL records, eg, to record
		 * index cleanup actions.  So temporarily enable XLogInsertAllowed in
		 * this process only.
		 */
		LocalSetXLogInsertAllowed();

6727 6728 6729 6730 6731 6732 6733 6734 6735
		/*
		 * Allow resource managers to do any required cleanup.
		 */
		for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
		{
			if (RmgrTable[rmid].rm_cleanup != NULL)
				RmgrTable[rmid].rm_cleanup();
		}

6736 6737 6738
		/* Disallow XLogInsert again */
		LocalXLogInsertAllowed = -1;

6739 6740 6741 6742 6743 6744
		/*
		 * Check to see if the XLOG sequence contained any unresolved
		 * references to uninitialized pages.
		 */
		XLogCheckInvalidPages();

T
Tom Lane 已提交
6745
		/*
6746
		 * Perform a checkpoint to update all our recovery activity to disk.
6747
		 *
6748 6749 6750 6751 6752
		 * Note that we write a shutdown checkpoint rather than an on-line
		 * one. This is not particularly critical, but since we may be
		 * assigning a new TLI, using a shutdown checkpoint allows us to have
		 * the rule that TLI only changes in shutdown checkpoints, which
		 * allows some extra error checking in xlog_redo.
T
Tom Lane 已提交
6753
		 */
6754 6755 6756 6757 6758 6759
		if (bgwriterLaunched)
			RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
							  CHECKPOINT_IMMEDIATE |
							  CHECKPOINT_WAIT);
		else
			CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
6760

T
Tom Lane 已提交
6761 6762 6763
		/*
		 * And finally, execute the recovery_end_command, if any.
		 */
6764
		if (recoveryEndCommand)
6765 6766 6767
			ExecuteRecoveryCommand(recoveryEndCommand,
								   "recovery_end_command",
								   true);
6768
	}
6769

T
Tom Lane 已提交
6770 6771 6772
	/*
	 * Preallocate additional log files, if wanted.
	 */
6773
	PreallocXlogFiles(EndOfLog);
6774

R
Robert Haas 已提交
6775 6776 6777 6778 6779 6780 6781 6782
	/*
	 * Reset initial contents of unlogged relations.  This has to be done
	 * AFTER recovery is complete so that any unlogged relations created
	 * during recovery also get picked up.
	 */
	if (InRecovery)
		ResetUnloggedRelations(UNLOGGED_RELATION_INIT);

6783 6784 6785
	/*
	 * Okay, we're officially UP.
	 */
V
WAL  
Vadim B. Mikheev 已提交
6786
	InRecovery = false;
6787

6788
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6789
	ControlFile->state = DB_IN_PRODUCTION;
6790
	ControlFile->time = (pg_time_t) time(NULL);
6791
	UpdateControlFile();
6792
	LWLockRelease(ControlFileLock);
6793

6794
	/* start the archive_timeout timer running */
6795
	XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
6796

6797 6798 6799 6800
	/* initialize shared-memory copy of latest checkpoint XID/epoch */
	XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
	XLogCtl->ckptXid = ControlFile->checkPointCopy.nextXid;

6801 6802 6803 6804
	/* also initialize latestCompletedXid, to nextXid - 1 */
	ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
	TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);

6805
	/*
B
Bruce Momjian 已提交
6806 6807
	 * Start up the commit log and related stuff, too. In hot standby mode we
	 * did this already before WAL replay.
6808 6809 6810 6811 6812 6813 6814
	 */
	if (standbyState == STANDBY_DISABLED)
	{
		StartupCLOG();
		StartupSUBTRANS(oldestActiveXID);
		StartupMultiXact();
	}
6815

6816 6817 6818
	/* Reload shared-memory state for prepared transactions */
	RecoverPreparedTransactions();

6819 6820 6821 6822 6823 6824 6825
	/*
	 * Shutdown the recovery environment. This must occur after
	 * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
	 */
	if (standbyState != STANDBY_DISABLED)
		ShutdownRecoveryTransactionEnvironment();

T
Tom Lane 已提交
6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836
	/* Shut down readFile facility, free space */
	if (readFile >= 0)
	{
		close(readFile);
		readFile = -1;
	}
	if (readBuf)
	{
		free(readBuf);
		readBuf = NULL;
	}
6837 6838 6839 6840 6841 6842
	if (readRecordBuf)
	{
		free(readRecordBuf);
		readRecordBuf = NULL;
		readRecordBufSize = 0;
	}
6843

6844 6845 6846 6847 6848 6849 6850
	/*
	 * If any of the critical GUCs have changed, log them before we allow
	 * backends to write WAL.
	 */
	LocalSetXLogInsertAllowed();
	XLogReportParameters();

6851
	/*
B
Bruce Momjian 已提交
6852
	 * All done.  Allow backends to write WAL.	(Although the bool flag is
6853 6854 6855
	 * probably atomic in itself, we use the info_lck here to ensure that
	 * there are no race conditions concerning visibility of other recent
	 * updates to shared memory.)
6856
	 */
6857 6858 6859 6860 6861 6862 6863 6864
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		xlogctl->SharedRecoveryInProgress = false;
		SpinLockRelease(&xlogctl->info_lck);
	}
6865 6866
}

6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888
/*
 * Checks if recovery has reached a consistent state. When consistency is
 * reached and we have a valid starting standby snapshot, tell postmaster
 * that it can start accepting read-only connections.
 */
static void
CheckRecoveryConsistency(void)
{
	/*
	 * Have we passed our safe starting point?
	 */
	if (!reachedMinRecoveryPoint &&
		XLByteLE(minRecoveryPoint, EndRecPtr) &&
		XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
	{
		reachedMinRecoveryPoint = true;
		ereport(LOG,
				(errmsg("consistent recovery state reached at %X/%X",
						EndRecPtr.xlogid, EndRecPtr.xrecoff)));
	}

	/*
B
Bruce Momjian 已提交
6889 6890 6891
	 * Have we got a valid starting snapshot that will allow queries to be
	 * run? If so, we can tell postmaster that the database is consistent now,
	 * enabling connections.
6892 6893
	 */
	if (standbyState == STANDBY_SNAPSHOT_READY &&
6894
		!LocalHotStandbyActive &&
6895 6896 6897
		reachedMinRecoveryPoint &&
		IsUnderPostmaster)
	{
6898 6899 6900 6901 6902 6903 6904 6905 6906
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		xlogctl->SharedHotStandbyActive = true;
		SpinLockRelease(&xlogctl->info_lck);

		LocalHotStandbyActive = true;

6907
		SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
6908 6909 6910
	}
}

6911 6912 6913
/*
 * Is the system still in recovery?
 *
6914 6915 6916
 * Unlike testing InRecovery, this works in any process that's connected to
 * shared memory.
 *
6917 6918 6919 6920 6921 6922 6923
 * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
 * variables the first time we see that recovery is finished.
 */
bool
RecoveryInProgress(void)
{
	/*
B
Bruce Momjian 已提交
6924 6925 6926
	 * We check shared state each time only until we leave recovery mode. We
	 * can't re-enter recovery, so there's no need to keep checking after the
	 * shared variable has once been seen false.
6927 6928 6929 6930 6931 6932 6933 6934
	 */
	if (!LocalRecoveryInProgress)
		return false;
	else
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

6935 6936
		/* spinlock is essential on machines with weak memory ordering! */
		SpinLockAcquire(&xlogctl->info_lck);
6937
		LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
6938
		SpinLockRelease(&xlogctl->info_lck);
6939 6940

		/*
6941
		 * Initialize TimeLineID and RedoRecPtr when we discover that recovery
6942
		 * is finished. InitPostgres() relies upon this behaviour to ensure
B
Bruce Momjian 已提交
6943
		 * that InitXLOGAccess() is called at backend startup.	(If you change
6944
		 * this, see also LocalSetXLogInsertAllowed.)
6945 6946 6947 6948 6949 6950
		 */
		if (!LocalRecoveryInProgress)
			InitXLOGAccess();

		return LocalRecoveryInProgress;
	}
T
Tom Lane 已提交
6951 6952
}

6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984
/*
 * Is HotStandby active yet? This is only important in special backends
 * since normal backends won't ever be able to connect until this returns
 * true. Postmaster knows this by way of signal, not via shared memory.
 *
 * Unlike testing standbyState, this works in any process that's connected to
 * shared memory.
 */
bool
HotStandbyActive(void)
{
	/*
	 * We check shared state each time only until Hot Standby is active. We
	 * can't de-activate Hot Standby, so there's no need to keep checking after
	 * the shared variable has once been seen true.
	 */
	if (LocalHotStandbyActive)
		return true;
	else
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		/* spinlock is essential on machines with weak memory ordering! */
		SpinLockAcquire(&xlogctl->info_lck);
		LocalHotStandbyActive = xlogctl->SharedHotStandbyActive;
		SpinLockRelease(&xlogctl->info_lck);

		return LocalHotStandbyActive;
	}
}

6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995
/*
 * XLogInsertAllowed -- may this process insert new WAL records?
 *
 * Ordinarily this is essentially equivalent to !RecoveryInProgress().
 * However, LocalXLogInsertAllowed can force the answer to "true" or "false"
 * within a specific process regardless of the global state; a negative
 * value means "not forced, go check".
 */
bool
XLogInsertAllowed(void)
{
	if (LocalXLogInsertAllowed < 0)
	{
		/* Not forced either way: the answer depends on recovery state. */
		if (RecoveryInProgress())
			return false;

		/*
		 * Recovery is over.  Switch to "unconditionally true" so later
		 * calls take the fast path below.
		 */
		LocalXLogInsertAllowed = 1;
		return true;
	}

	/*
	 * Value is "unconditionally true" or "unconditionally false"; just
	 * return it.  This is the normal fast path once recovery is known done.
	 */
	return (bool) LocalXLogInsertAllowed;
}

/*
 * LocalSetXLogInsertAllowed
 *
 * Force XLogInsertAllowed() to return true in the current process only.
 * The caller must currently be in the "check" state (-1); this is asserted.
 *
 * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
 * and even call LocalSetXLogInsertAllowed() again after that.
 */
static void
LocalSetXLogInsertAllowed(void)
{
	Assert(LocalXLogInsertAllowed == -1);
	LocalXLogInsertAllowed = 1;

	/*
	 * Do the same initialization RecoveryInProgress() performs when it
	 * notices the end of recovery.
	 */
	InitXLOGAccess();
}

7033 7034
/*
 * Subroutine to try to fetch and validate a prior checkpoint record.
7035 7036 7037
 *
 * whichChkpt identifies the checkpoint (merely for reporting purposes).
 * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
7038
 */
T
Tom Lane 已提交
7039
static XLogRecord *
7040
ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt)
T
Tom Lane 已提交
7041 7042 7043 7044 7045
{
	XLogRecord *record;

	if (!XRecOffIsValid(RecPtr.xrecoff))
	{
7046 7047 7048 7049
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
B
Bruce Momjian 已提交
7050
				(errmsg("invalid primary checkpoint link in control file")));
7051 7052 7053 7054 7055 7056 7057
				break;
			case 2:
				ereport(LOG,
						(errmsg("invalid secondary checkpoint link in control file")));
				break;
			default:
				ereport(LOG,
B
Bruce Momjian 已提交
7058
				   (errmsg("invalid checkpoint link in backup_label file")));
7059 7060
				break;
		}
T
Tom Lane 已提交
7061 7062 7063
		return NULL;
	}

7064
	record = ReadRecord(&RecPtr, LOG, true);
T
Tom Lane 已提交
7065 7066 7067

	if (record == NULL)
	{
7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
						(errmsg("invalid primary checkpoint record")));
				break;
			case 2:
				ereport(LOG,
						(errmsg("invalid secondary checkpoint record")));
				break;
			default:
				ereport(LOG,
						(errmsg("invalid checkpoint record")));
				break;
		}
T
Tom Lane 已提交
7083 7084 7085 7086
		return NULL;
	}
	if (record->xl_rmid != RM_XLOG_ID)
	{
7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
						(errmsg("invalid resource manager ID in primary checkpoint record")));
				break;
			case 2:
				ereport(LOG,
						(errmsg("invalid resource manager ID in secondary checkpoint record")));
				break;
			default:
				ereport(LOG,
B
Bruce Momjian 已提交
7099
				(errmsg("invalid resource manager ID in checkpoint record")));
7100 7101
				break;
		}
T
Tom Lane 已提交
7102 7103 7104 7105 7106
		return NULL;
	}
	if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
		record->xl_info != XLOG_CHECKPOINT_ONLINE)
	{
7107 7108 7109 7110
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
B
Bruce Momjian 已提交
7111
				   (errmsg("invalid xl_info in primary checkpoint record")));
7112 7113 7114
				break;
			case 2:
				ereport(LOG,
B
Bruce Momjian 已提交
7115
				 (errmsg("invalid xl_info in secondary checkpoint record")));
7116 7117 7118 7119 7120 7121
				break;
			default:
				ereport(LOG,
						(errmsg("invalid xl_info in checkpoint record")));
				break;
		}
T
Tom Lane 已提交
7122 7123
		return NULL;
	}
7124 7125
	if (record->xl_len != sizeof(CheckPoint) ||
		record->xl_tot_len != SizeOfXLogRecord + sizeof(CheckPoint))
T
Tom Lane 已提交
7126
	{
7127 7128 7129 7130
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
B
Bruce Momjian 已提交
7131
					(errmsg("invalid length of primary checkpoint record")));
7132 7133 7134
				break;
			case 2:
				ereport(LOG,
B
Bruce Momjian 已提交
7135
				  (errmsg("invalid length of secondary checkpoint record")));
7136 7137 7138 7139 7140 7141
				break;
			default:
				ereport(LOG,
						(errmsg("invalid length of checkpoint record")));
				break;
		}
T
Tom Lane 已提交
7142 7143 7144
		return NULL;
	}
	return record;
7145 7146
}

V
WAL  
Vadim B. Mikheev 已提交
7147
/*
7148 7149
 * This must be called during startup of a backend process, except that
 * it need not be called in a standalone backend (which does StartupXLOG
7150
 * instead).  We need to initialize the local copies of ThisTimeLineID and
7151 7152
 * RedoRecPtr.
 *
7153
 * Note: before Postgres 8.0, we went to some effort to keep the postmaster
7154
 * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
7155
 * unnecessary however, since the postmaster itself never touches XLOG anyway.
V
WAL  
Vadim B. Mikheev 已提交
7156 7157
 */
void
7158
InitXLOGAccess(void)
V
WAL  
Vadim B. Mikheev 已提交
7159
{
7160 7161
	/* ThisTimeLineID doesn't change so we need no lock to copy it */
	ThisTimeLineID = XLogCtl->ThisTimeLineID;
7162
	Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
7163

7164 7165
	/* Use GetRedoRecPtr to copy the RedoRecPtr safely */
	(void) GetRedoRecPtr();
7166 7167 7168 7169 7170 7171 7172 7173
}

/*
 * Once spawned, a backend may update its local RedoRecPtr from
 * XLogCtl->Insert.RedoRecPtr; it must hold the insert lock or info_lck
 * to do so.  This is done in XLogInsert() or GetRedoRecPtr().
 */
XLogRecPtr
7174 7175
GetRedoRecPtr(void)
{
7176 7177 7178
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

7179
	SpinLockAcquire(&xlogctl->info_lck);
7180 7181
	Assert(XLByteLE(RedoRecPtr, xlogctl->Insert.RedoRecPtr));
	RedoRecPtr = xlogctl->Insert.RedoRecPtr;
7182
	SpinLockRelease(&xlogctl->info_lck);
7183 7184

	return RedoRecPtr;
V
WAL  
Vadim B. Mikheev 已提交
7185 7186
}

7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200
/*
 * GetInsertRecPtr -- Returns the current insert position.
 *
 * NOTE: The value *actually* returned is the position of the last full
 * xlog page. It lags behind the real insert position by at most 1 page.
 * For that, we don't need to acquire WALInsertLock which can be quite
 * heavily contended, and an approximation is enough for the current
 * usage of this function.
 */
XLogRecPtr
GetInsertRecPtr(void)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
B
Bruce Momjian 已提交
7201
	XLogRecPtr	recptr;
7202 7203 7204 7205 7206 7207 7208 7209

	SpinLockAcquire(&xlogctl->info_lck);
	recptr = xlogctl->LogwrtRqst.Write;
	SpinLockRelease(&xlogctl->info_lck);

	return recptr;
}

7210
/*
7211 7212
 * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
 * position known to be fsync'd to disk.
7213 7214
 */
XLogRecPtr
7215
GetFlushRecPtr(void)
7216 7217 7218 7219 7220 7221
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
	XLogRecPtr	recptr;

	SpinLockAcquire(&xlogctl->info_lck);
7222
	recptr = xlogctl->LogwrtResult.Flush;
7223 7224 7225 7226 7227
	SpinLockRelease(&xlogctl->info_lck);

	return recptr;
}

7228 7229 7230
/*
 * Get the time of the last xlog segment switch
 */
7231
pg_time_t
7232 7233
GetLastSegSwitchTime(void)
{
7234
	pg_time_t	result;
7235 7236 7237 7238 7239 7240 7241 7242 7243

	/* Need WALWriteLock, but shared lock is sufficient */
	LWLockAcquire(WALWriteLock, LW_SHARED);
	result = XLogCtl->Write.lastSegSwitchTime;
	LWLockRelease(WALWriteLock);

	return result;
}

7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254
/*
 * GetNextXidAndEpoch - get the current nextXid value and associated epoch
 *
 * This is exported for use by code that would like to have 64-bit XIDs.
 * We don't really support such things, but all XIDs within the system
 * can be presumed "close to" the result, and thus the epoch associated
 * with them can be determined.
 *
 * Outputs: *xid receives the current nextXid, *epoch the epoch that goes
 * with it.
 */
void
GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *shared = XLogCtl;
	TransactionId lastCkptXid;
	TransactionId currentXid;
	uint32		resultEpoch;

	/* Must read checkpoint info first, else have race condition */
	SpinLockAcquire(&shared->info_lck);
	resultEpoch = shared->ckptXidEpoch;
	lastCkptXid = shared->ckptXid;
	SpinLockRelease(&shared->info_lck);

	/* Only now fetch the current nextXid */
	currentXid = ReadNewTransactionId();

	/*
	 * The current nextXid is certainly logically later than the checkpoint's
	 * XID.  So if it's numerically less, it must have wrapped into the next
	 * epoch.
	 */
	if (currentXid < lastCkptXid)
		resultEpoch++;

	*xid = currentXid;
	*epoch = resultEpoch;
}

7284 7285 7286 7287 7288 7289 7290 7291 7292 7293
/*
 * GetRecoveryTargetTLI - get the recovery target timeline ID
 *
 * Returns the value stored in shared memory (XLogCtl->RecoveryTargetTLI).
 */
TimeLineID
GetRecoveryTargetTLI(void)
{
	TimeLineID	targetTLI;

	/* RecoveryTargetTLI doesn't change so we need no lock to copy it */
	targetTLI = XLogCtl->RecoveryTargetTLI;

	return targetTLI;
}

7294
/*
T
Tom Lane 已提交
7295
 * This must be called ONCE during postmaster or standalone-backend shutdown
7296 7297
 */
void
7298
ShutdownXLOG(int code, Datum arg)
7299
{
7300 7301
	ereport(LOG,
			(errmsg("shutting down")));
7302

7303 7304 7305
	if (RecoveryInProgress())
		CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
	else
7306 7307 7308 7309 7310 7311 7312 7313 7314 7315
	{
		/*
		 * If archiving is enabled, rotate the last XLOG file so that all the
		 * remaining records are archived (postmaster wakes up the archiver
		 * process one more time at the end of shutdown). The checkpoint
		 * record will go to the next XLOG file and won't be archived (yet).
		 */
		if (XLogArchivingActive() && XLogArchiveCommandSet())
			RequestXLogSwitch();

7316
		CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
7317
	}
7318
	ShutdownCLOG();
7319
	ShutdownSUBTRANS();
7320
	ShutdownMultiXact();
7321

7322 7323
	ereport(LOG,
			(errmsg("database system is shut down")));
7324 7325
}

7326
/*
7327 7328 7329
 * Log start of a checkpoint.
 */
static void
7330
LogCheckpointStart(int flags, bool restartpoint)
7331
{
7332
	const char *msg;
7333 7334

	/*
7335 7336
	 * XXX: This is hopelessly untranslatable. We could call gettext_noop for
	 * the main message, but what about all the flags?
7337 7338
	 */
	if (restartpoint)
7339
		msg = "restartpoint starting:%s%s%s%s%s%s%s";
7340
	else
7341
		msg = "checkpoint starting:%s%s%s%s%s%s%s";
7342 7343

	elog(LOG, msg,
7344
		 (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
7345
		 (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
7346 7347 7348 7349 7350 7351 7352
		 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
		 (flags & CHECKPOINT_FORCE) ? " force" : "",
		 (flags & CHECKPOINT_WAIT) ? " wait" : "",
		 (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
		 (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
}

7353
/*
7354 7355 7356
 * Log end of a checkpoint.
 */
static void
7357
LogCheckpointEnd(bool restartpoint)
7358
{
B
Bruce Momjian 已提交
7359 7360
	long		write_secs,
				sync_secs,
R
Robert Haas 已提交
7361 7362 7363
				total_secs,
				longest_secs,
				average_secs;
B
Bruce Momjian 已提交
7364 7365
	int			write_usecs,
				sync_usecs,
R
Robert Haas 已提交
7366 7367 7368 7369
				total_usecs,
				longest_usecs,
				average_usecs;
	uint64		average_sync_time;
7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384

	CheckpointStats.ckpt_end_t = GetCurrentTimestamp();

	TimestampDifference(CheckpointStats.ckpt_start_t,
						CheckpointStats.ckpt_end_t,
						&total_secs, &total_usecs);

	TimestampDifference(CheckpointStats.ckpt_write_t,
						CheckpointStats.ckpt_sync_t,
						&write_secs, &write_usecs);

	TimestampDifference(CheckpointStats.ckpt_sync_t,
						CheckpointStats.ckpt_sync_end_t,
						&sync_secs, &sync_usecs);

R
Robert Haas 已提交
7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400
	/*
	 * Timing values returned from CheckpointStats are in microseconds.
	 * Convert to the second plus microsecond form that TimestampDifference
	 * returns for homogeneous printing.
	 */
	longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000000);
	longest_usecs = CheckpointStats.ckpt_longest_sync -
		(uint64) longest_secs * 1000000;

	average_sync_time = 0;
	if (CheckpointStats.ckpt_sync_rels > 0) 
		average_sync_time = CheckpointStats.ckpt_agg_sync_time /
			CheckpointStats.ckpt_sync_rels;
	average_secs = (long) (average_sync_time / 1000000);
	average_usecs = average_sync_time - (uint64) average_secs * 1000000;

7401 7402
	if (restartpoint)
		elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
7403
			 "%d transaction log file(s) added, %d removed, %d recycled; "
R
Robert Haas 已提交
7404 7405
			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
			 "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
7406 7407
			 CheckpointStats.ckpt_bufs_written,
			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
7408 7409 7410
			 CheckpointStats.ckpt_segs_added,
			 CheckpointStats.ckpt_segs_removed,
			 CheckpointStats.ckpt_segs_recycled,
7411 7412
			 write_secs, write_usecs / 1000,
			 sync_secs, sync_usecs / 1000,
R
Robert Haas 已提交
7413 7414 7415 7416
			 total_secs, total_usecs / 1000,
			 CheckpointStats.ckpt_sync_rels,
			 longest_secs, longest_usecs / 1000,
			 average_secs, average_usecs / 1000);
7417 7418 7419
	else
		elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
			 "%d transaction log file(s) added, %d removed, %d recycled; "
R
Robert Haas 已提交
7420 7421
			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
			 "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
7422 7423 7424 7425 7426 7427 7428
			 CheckpointStats.ckpt_bufs_written,
			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
			 CheckpointStats.ckpt_segs_added,
			 CheckpointStats.ckpt_segs_removed,
			 CheckpointStats.ckpt_segs_recycled,
			 write_secs, write_usecs / 1000,
			 sync_secs, sync_usecs / 1000,
R
Robert Haas 已提交
7429 7430 7431 7432
			 total_secs, total_usecs / 1000,
			 CheckpointStats.ckpt_sync_rels,
			 longest_secs, longest_usecs / 1000,
			 average_secs, average_usecs / 1000);
7433 7434
}

T
Tom Lane 已提交
7435 7436
/*
 * Perform a checkpoint --- either during shutdown, or on-the-fly
7437
 *
7438 7439
 * flags is a bitwise OR of the following:
 *	CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
7440
 *	CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
7441
 *	CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
7442
 *		ignoring checkpoint_completion_target parameter.
7443
 *	CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
7444
 *		since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
7445
 *		CHECKPOINT_END_OF_RECOVERY).
7446
 *
7447
 * Note: flags contains other bits, of interest here only for logging purposes.
7448 7449
 * In particular note that this routine is synchronous and does not pay
 * attention to CHECKPOINT_WAIT.
T
Tom Lane 已提交
7450
 */
7451
void
7452
CreateCheckPoint(int flags)
7453
{
7454
	bool		shutdown;
7455 7456 7457
	CheckPoint	checkPoint;
	XLogRecPtr	recptr;
	XLogCtlInsert *Insert = &XLogCtl->Insert;
B
Bruce Momjian 已提交
7458
	XLogRecData rdata;
7459
	uint32		freespace;
V
Vadim B. Mikheev 已提交
7460 7461
	uint32		_logId;
	uint32		_logSeg;
7462 7463
	TransactionId *inCommitXids;
	int			nInCommit;
V
Vadim B. Mikheev 已提交
7464

7465 7466 7467 7468
	/*
	 * An end-of-recovery checkpoint is really a shutdown checkpoint, just
	 * issued at a different time.
	 */
7469
	if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
7470 7471 7472
		shutdown = true;
	else
		shutdown = false;
7473

7474 7475 7476
	/* sanity check */
	if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
		elog(ERROR, "can't create a checkpoint during recovery");
7477

7478 7479 7480 7481 7482 7483 7484 7485
	/*
	 * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
	 * (This is just pro forma, since in the present system structure there is
	 * only one process that is allowed to issue checkpoints at any given
	 * time.)
	 */
	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);

7486 7487 7488 7489 7490 7491 7492 7493 7494 7495
	/*
	 * Prepare to accumulate statistics.
	 *
	 * Note: because it is possible for log_checkpoints to change while a
	 * checkpoint proceeds, we always accumulate stats, even if
	 * log_checkpoints is currently off.
	 */
	MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
	CheckpointStats.ckpt_start_t = GetCurrentTimestamp();

7496 7497 7498
	/*
	 * Use a critical section to force system panic if we have trouble.
	 */
7499 7500
	START_CRIT_SECTION();

7501 7502
	if (shutdown)
	{
7503
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7504
		ControlFile->state = DB_SHUTDOWNING;
7505
		ControlFile->time = (pg_time_t) time(NULL);
7506
		UpdateControlFile();
7507
		LWLockRelease(ControlFileLock);
7508
	}
T
Tom Lane 已提交
7509

7510
	/*
B
Bruce Momjian 已提交
7511 7512 7513
	 * Let smgr prepare for checkpoint; this has to happen before we determine
	 * the REDO pointer.  Note that smgr must not do anything that'd have to
	 * be undone if we decide no checkpoint is needed.
7514 7515 7516 7517
	 */
	smgrpreckpt();

	/* Begin filling in the checkpoint WAL record */
7518
	MemSet(&checkPoint, 0, sizeof(checkPoint));
7519
	checkPoint.time = (pg_time_t) time(NULL);
7520

7521
	/*
7522 7523
	 * We must hold WALInsertLock while examining insert state to determine
	 * the checkpoint REDO pointer.
7524
	 */
7525
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
T
Tom Lane 已提交
7526 7527

	/*
B
Bruce Momjian 已提交
7528 7529 7530 7531 7532 7533 7534 7535
	 * If this isn't a shutdown or forced checkpoint, and we have not inserted
	 * any XLOG records since the start of the last checkpoint, skip the
	 * checkpoint.	The idea here is to avoid inserting duplicate checkpoints
	 * when the system is idle. That wastes log space, and more importantly it
	 * exposes us to possible loss of both current and previous checkpoint
	 * records if the machine crashes just as we're writing the update.
	 * (Perhaps it'd make even more sense to checkpoint only when the previous
	 * checkpoint record is in a different xlog page?)
T
Tom Lane 已提交
7536
	 *
7537 7538 7539 7540
	 * We have to make two tests to determine that nothing has happened since
	 * the start of the last checkpoint: current insertion point must match
	 * the end of the last checkpoint record, and its redo pointer must point
	 * to itself.
T
Tom Lane 已提交
7541
	 */
7542 7543
	if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
				  CHECKPOINT_FORCE)) == 0)
T
Tom Lane 已提交
7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555
	{
		XLogRecPtr	curInsert;

		INSERT_RECPTR(curInsert, Insert, Insert->curridx);
		if (curInsert.xlogid == ControlFile->checkPoint.xlogid &&
			curInsert.xrecoff == ControlFile->checkPoint.xrecoff +
			MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
			ControlFile->checkPoint.xlogid ==
			ControlFile->checkPointCopy.redo.xlogid &&
			ControlFile->checkPoint.xrecoff ==
			ControlFile->checkPointCopy.redo.xrecoff)
		{
7556 7557
			LWLockRelease(WALInsertLock);
			LWLockRelease(CheckpointLock);
T
Tom Lane 已提交
7558 7559 7560 7561 7562
			END_CRIT_SECTION();
			return;
		}
	}

7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573
	/*
	 * An end-of-recovery checkpoint is created before anyone is allowed to
	 * write WAL. To allow us to write the checkpoint record, temporarily
	 * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
	 * initialized, which we need here and in AdvanceXLInsertBuffer.)
	 */
	if (flags & CHECKPOINT_END_OF_RECOVERY)
		LocalSetXLogInsertAllowed();

	checkPoint.ThisTimeLineID = ThisTimeLineID;

T
Tom Lane 已提交
7574 7575 7576
	/*
	 * Compute new REDO record ptr = location of next XLOG record.
	 *
B
Bruce Momjian 已提交
7577 7578 7579 7580
	 * NB: this is NOT necessarily where the checkpoint record itself will be,
	 * since other backends may insert more XLOG records while we're off doing
	 * the buffer flush work.  Those XLOG records are logically after the
	 * checkpoint, even though physically before it.  Got that?
T
Tom Lane 已提交
7581 7582
	 */
	freespace = INSERT_FREESPACE(Insert);
7583 7584
	if (freespace < SizeOfXLogRecord)
	{
7585
		(void) AdvanceXLInsertBuffer(false);
T
Tom Lane 已提交
7586
		/* OK to ignore update return flag, since we will do flush anyway */
7587
		freespace = INSERT_FREESPACE(Insert);
7588
	}
T
Tom Lane 已提交
7589
	INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);
B
Bruce Momjian 已提交
7590

T
Tom Lane 已提交
7591
	/*
B
Bruce Momjian 已提交
7592 7593
	 * Here we update the shared RedoRecPtr for future XLogInsert calls; this
	 * must be done while holding the insert lock AND the info_lck.
7594
	 *
B
Bruce Momjian 已提交
7595
	 * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
B
Bruce Momjian 已提交
7596 7597 7598 7599 7600
	 * pointing past where it really needs to point.  This is okay; the only
	 * consequence is that XLogInsert might back up whole buffers that it
	 * didn't really need to.  We can't postpone advancing RedoRecPtr because
	 * XLogInserts that happen while we are dumping buffers must assume that
	 * their buffer changes are not included in the checkpoint.
T
Tom Lane 已提交
7601
	 */
7602 7603 7604 7605
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

7606
		SpinLockAcquire(&xlogctl->info_lck);
7607
		RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
7608
		SpinLockRelease(&xlogctl->info_lck);
7609
	}
B
Bruce Momjian 已提交
7610

T
Tom Lane 已提交
7611
	/*
7612 7613
	 * Now we can release WAL insert lock, allowing other xacts to proceed
	 * while we are flushing disk buffers.
T
Tom Lane 已提交
7614
	 */
7615
	LWLockRelease(WALInsertLock);
7616

7617
	/*
B
Bruce Momjian 已提交
7618 7619
	 * If enabled, log checkpoint start.  We postpone this until now so as not
	 * to log anything if we decided to skip the checkpoint.
7620 7621
	 */
	if (log_checkpoints)
7622
		LogCheckpointStart(flags, false);
7623

7624 7625
	TRACE_POSTGRESQL_CHECKPOINT_START(flags);

7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640
	/*
	 * Before flushing data, we must wait for any transactions that are
	 * currently in their commit critical sections.  If an xact inserted its
	 * commit record into XLOG just before the REDO point, then a crash
	 * restart from the REDO point would not replay that record, which means
	 * that our flushing had better include the xact's update of pg_clog.  So
	 * we wait till he's out of his commit critical section before proceeding.
	 * See notes in RecordTransactionCommit().
	 *
	 * Because we've already released WALInsertLock, this test is a bit fuzzy:
	 * it is possible that we will wait for xacts we didn't really need to
	 * wait for.  But the delay should be short and it seems better to make
	 * checkpoint take a bit longer than to hold locks longer than necessary.
	 * (In fact, the whole reason we have this issue is that xact.c does
	 * commit record XLOG insertion and clog update as two separate steps
B
Bruce Momjian 已提交
7641 7642
	 * protected by different locks, but again that seems best on grounds of
	 * minimizing lock contention.)
7643
	 *
B
Bruce Momjian 已提交
7644 7645
	 * A transaction that has not yet set inCommit when we look cannot be at
	 * risk, since he's not inserted his commit record yet; and one that's
7646 7647 7648 7649 7650 7651 7652
	 * already cleared it is not at risk either, since he's done fixing clog
	 * and we will correctly flush the update below.  So we cannot miss any
	 * xacts we need to wait for.
	 */
	nInCommit = GetTransactionsInCommit(&inCommitXids);
	if (nInCommit > 0)
	{
B
Bruce Momjian 已提交
7653 7654 7655
		do
		{
			pg_usleep(10000L);	/* wait for 10 msec */
7656 7657 7658 7659
		} while (HaveTransactionsInCommit(inCommitXids, nInCommit));
	}
	pfree(inCommitXids);

7660 7661 7662
	/*
	 * Get the other info we need for the checkpoint record.
	 */
7663
	LWLockAcquire(XidGenLock, LW_SHARED);
7664
	checkPoint.nextXid = ShmemVariableCache->nextXid;
7665 7666
	checkPoint.oldestXid = ShmemVariableCache->oldestXid;
	checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
7667
	LWLockRelease(XidGenLock);
T
Tom Lane 已提交
7668

7669 7670 7671 7672 7673
	/* Increase XID epoch if we've wrapped around since last checkpoint */
	checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
	if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
		checkPoint.nextXidEpoch++;

7674
	LWLockAcquire(OidGenLock, LW_SHARED);
7675
	checkPoint.nextOid = ShmemVariableCache->nextOid;
7676 7677
	if (!shutdown)
		checkPoint.nextOid += ShmemVariableCache->oidCount;
7678
	LWLockRelease(OidGenLock);
7679

7680 7681 7682
	MultiXactGetCheckptMulti(shutdown,
							 &checkPoint.nextMulti,
							 &checkPoint.nextMultiOffset);
7683

T
Tom Lane 已提交
7684
	/*
B
Bruce Momjian 已提交
7685 7686
	 * Having constructed the checkpoint record, ensure all shmem disk buffers
	 * and commit-log buffers are flushed to disk.
7687
	 *
7688 7689
	 * This I/O could fail for various reasons.  If so, we will fail to
	 * complete the checkpoint, but there is no reason to force a system
7690
	 * panic. Accordingly, exit critical section while doing it.
T
Tom Lane 已提交
7691
	 */
7692 7693
	END_CRIT_SECTION();

7694
	CheckPointGuts(checkPoint.redo, flags);
7695

7696
	/*
B
Bruce Momjian 已提交
7697 7698 7699
	 * Take a snapshot of running transactions and write this to WAL. This
	 * allows us to reconstruct the state of running transactions during
	 * archive recovery, if required. Skip, if this info disabled.
7700 7701 7702 7703 7704 7705 7706
	 *
	 * If we are shutting down, or Startup process is completing crash
	 * recovery we don't need to write running xact data.
	 *
	 * Update checkPoint.nextXid since we have a later value
	 */
	if (!shutdown && XLogStandbyInfoActive())
B
Bruce Momjian 已提交
7707
		LogStandbySnapshot(&checkPoint.oldestActiveXid, &checkPoint.nextXid);
7708 7709 7710
	else
		checkPoint.oldestActiveXid = InvalidTransactionId;

7711 7712
	START_CRIT_SECTION();

T
Tom Lane 已提交
7713 7714 7715
	/*
	 * Now insert the checkpoint record into XLOG.
	 */
B
Bruce Momjian 已提交
7716
	rdata.data = (char *) (&checkPoint);
7717
	rdata.len = sizeof(checkPoint);
7718
	rdata.buffer = InvalidBuffer;
7719 7720
	rdata.next = NULL;

T
Tom Lane 已提交
7721 7722 7723 7724 7725 7726
	recptr = XLogInsert(RM_XLOG_ID,
						shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
						XLOG_CHECKPOINT_ONLINE,
						&rdata);

	XLogFlush(recptr);
7727

7728
	/*
B
Bruce Momjian 已提交
7729 7730 7731 7732
	 * We mustn't write any new WAL after a shutdown checkpoint, or it will be
	 * overwritten at next startup.  No-one should even try, this just allows
	 * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
	 * to just temporarily disable writing until the system has exited
7733 7734 7735 7736 7737
	 * recovery.
	 */
	if (shutdown)
	{
		if (flags & CHECKPOINT_END_OF_RECOVERY)
B
Bruce Momjian 已提交
7738
			LocalXLogInsertAllowed = -1;		/* return to "check" state */
7739
		else
B
Bruce Momjian 已提交
7740
			LocalXLogInsertAllowed = 0; /* never again write WAL */
7741 7742
	}

T
Tom Lane 已提交
7743
	/*
B
Bruce Momjian 已提交
7744 7745
	 * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
	 * = end of actual checkpoint record.
T
Tom Lane 已提交
7746 7747
	 */
	if (shutdown && !XLByteEQ(checkPoint.redo, ProcLastRecPtr))
7748 7749
		ereport(PANIC,
				(errmsg("concurrent transaction log activity while database system is shutting down")));
7750

T
Tom Lane 已提交
7751
	/*
7752 7753
	 * Select point at which we can truncate the log, which we base on the
	 * prior checkpoint's earliest info.
T
Tom Lane 已提交
7754
	 */
7755
	XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
7756

T
Tom Lane 已提交
7757 7758 7759
	/*
	 * Update the control file.
	 */
7760
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7761 7762
	if (shutdown)
		ControlFile->state = DB_SHUTDOWNED;
T
Tom Lane 已提交
7763 7764 7765
	ControlFile->prevCheckPoint = ControlFile->checkPoint;
	ControlFile->checkPoint = ProcLastRecPtr;
	ControlFile->checkPointCopy = checkPoint;
7766
	ControlFile->time = (pg_time_t) time(NULL);
7767 7768
	/* crash recovery should always recover to the end of WAL */
	MemSet(&ControlFile->minRecoveryPoint, 0, sizeof(XLogRecPtr));
7769
	UpdateControlFile();
7770
	LWLockRelease(ControlFileLock);
7771

7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782
	/* Update shared-memory copy of checkpoint XID/epoch */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
		xlogctl->ckptXid = checkPoint.nextXid;
		SpinLockRelease(&xlogctl->info_lck);
	}

7783
	/*
B
Bruce Momjian 已提交
7784
	 * We are now done with critical updates; no need for system panic if we
7785
	 * have trouble while fooling with old log segments.
7786 7787 7788
	 */
	END_CRIT_SECTION();

7789 7790 7791 7792 7793
	/*
	 * Let smgr do post-checkpoint cleanup (eg, deleting old files).
	 */
	smgrpostckpt();

V
Vadim B. Mikheev 已提交
7794
	/*
7795 7796
	 * Delete old log files (those no longer needed even for previous
	 * checkpoint or the standbys in XLOG streaming).
7797
	 */
7798
	if (_logId || _logSeg)
7799
	{
7800 7801
		/*
		 * Calculate the last segment that we need to retain because of
B
Bruce Momjian 已提交
7802 7803
		 * wal_keep_segments, by subtracting wal_keep_segments from the new
		 * checkpoint location.
7804
		 */
7805
		if (wal_keep_segments > 0)
7806
		{
7807 7808 7809 7810 7811 7812 7813
			uint32		log;
			uint32		seg;
			int			d_log;
			int			d_seg;

			XLByteToSeg(recptr, log, seg);

7814 7815
			d_seg = wal_keep_segments % XLogSegsPerFile;
			d_log = wal_keep_segments / XLogSegsPerFile;
7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832
			if (seg < d_seg)
			{
				d_log += 1;
				seg = seg - d_seg + XLogSegsPerFile;
			}
			else
				seg = seg - d_seg;
			/* avoid underflow, don't go below (0,1) */
			if (log < d_log || (log == d_log && seg == 0))
			{
				log = 0;
				seg = 1;
			}
			else
				log = log - d_log;

			/* don't delete WAL segments newer than the calculated segment */
7833 7834
			if (log < _logId || (log == _logId && seg < _logSeg))
			{
B
Bruce Momjian 已提交
7835 7836
				_logId = log;
				_logSeg = seg;
7837 7838 7839
			}
		}

T
Tom Lane 已提交
7840
		PrevLogSeg(_logId, _logSeg);
7841
		RemoveOldXlogFiles(_logId, _logSeg, recptr);
V
Vadim B. Mikheev 已提交
7842 7843
	}

T
Tom Lane 已提交
7844
	/*
7845 7846
	 * Make more log segments if needed.  (Do this after recycling old log
	 * segments, since that may supply some of the needed files.)
T
Tom Lane 已提交
7847 7848
	 */
	if (!shutdown)
7849
		PreallocXlogFiles(recptr);
T
Tom Lane 已提交
7850

7851
	/*
B
Bruce Momjian 已提交
7852 7853 7854 7855 7856
	 * Truncate pg_subtrans if possible.  We can throw away all data before
	 * the oldest XMIN of any running transaction.	No future transaction will
	 * attempt to reference any pg_subtrans entry older than that (see Asserts
	 * in subtrans.c).	During recovery, though, we mustn't do this because
	 * StartupSUBTRANS hasn't been called yet.
7857
	 */
7858
	if (!RecoveryInProgress())
7859
		TruncateSUBTRANS(GetOldestXmin(true, false));
7860

7861 7862
	/* All real work is done, but log before releasing lock. */
	if (log_checkpoints)
7863
		LogCheckpointEnd(false);
7864

7865 7866 7867 7868 7869
	TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
									 NBuffers,
									 CheckpointStats.ckpt_segs_added,
									 CheckpointStats.ckpt_segs_removed,
									 CheckpointStats.ckpt_segs_recycled);
7870

7871
	LWLockRelease(CheckpointLock);
7872
}
V
WAL  
Vadim B. Mikheev 已提交
7873

7874 7875 7876 7877 7878 7879 7880
/*
 * Flush all data in shared memory to disk, and fsync
 *
 * This is the common code shared between regular checkpoints and
 * recovery restartpoints.
 */
static void
7881
CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
7882 7883 7884 7885
{
	CheckPointCLOG();
	CheckPointSUBTRANS();
	CheckPointMultiXact();
7886
	CheckPointPredicate();
7887
	CheckPointRelationMap();
B
Bruce Momjian 已提交
7888
	CheckPointBuffers(flags);	/* performs all required fsyncs */
7889 7890 7891 7892 7893
	/* We deliberately delay 2PC checkpointing as long as possible */
	CheckPointTwoPhase(checkPointRedo);
}

/*
7894 7895 7896 7897 7898 7899 7900 7901
 * Save a checkpoint for recovery restart if appropriate
 *
 * This function is called each time a checkpoint record is read from XLOG.
 * It must determine whether the checkpoint represents a safe restartpoint or
 * not.  If so, the checkpoint record is stashed in shared memory so that
 * CreateRestartPoint can consult it.  (Note that the latter function is
 * executed by the bgwriter, while this one will be executed by the startup
 * process.)
7902 7903 7904 7905
 */
static void
RecoveryRestartPoint(const CheckPoint *checkPoint)
{
B
Bruce Momjian 已提交
7906
	int			rmid;
7907

7908 7909
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920

	/*
	 * Is it safe to checkpoint?  We must ask each of the resource managers
	 * whether they have any partial state information that might prevent a
	 * correct restart from this point.  If so, we skip this opportunity, but
	 * return at the next checkpoint record for another try.
	 */
	for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
	{
		if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
			if (!(RmgrTable[rmid].rm_safe_restartpoint()))
7921
			{
7922
				elog(trace_recovery(DEBUG2), "RM %d not safe to record restart point at %X/%X",
7923 7924 7925
					 rmid,
					 checkPoint->redo.xlogid,
					 checkPoint->redo.xrecoff);
7926
				return;
7927
			}
7928 7929 7930
	}

	/*
7931 7932
	 * Copy the checkpoint record to shared memory, so that bgwriter can use
	 * it the next time it wants to perform a restartpoint.
7933 7934 7935 7936 7937 7938 7939 7940
	 */
	SpinLockAcquire(&xlogctl->info_lck);
	XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
	memcpy(&XLogCtl->lastCheckPoint, checkPoint, sizeof(CheckPoint));
	SpinLockRelease(&xlogctl->info_lck);
}

/*
7941 7942
 * Establish a restartpoint if possible.
 *
7943 7944 7945 7946 7947
 * This is similar to CreateCheckPoint, but is used during WAL recovery
 * to establish a point from which recovery can roll forward without
 * replaying the entire recovery log.
 *
 * Returns true if a new restartpoint was established. We can only establish
7948
 * a restartpoint if we have replayed a safe checkpoint record since last
7949 7950 7951 7952 7953
 * restartpoint.
 */
bool
CreateRestartPoint(int flags)
{
7954 7955
	XLogRecPtr	lastCheckPointRecPtr;
	CheckPoint	lastCheckPoint;
7956 7957
	uint32		_logId;
	uint32		_logSeg;
B
Bruce Momjian 已提交
7958
	TimestampTz xtime;
7959

7960 7961 7962 7963 7964 7965 7966 7967 7968
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;

	/*
	 * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
	 * happens at a time.
	 */
	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);

7969
	/* Get a local copy of the last safe checkpoint record. */
7970 7971 7972 7973 7974
	SpinLockAcquire(&xlogctl->info_lck);
	lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
	memcpy(&lastCheckPoint, &XLogCtl->lastCheckPoint, sizeof(CheckPoint));
	SpinLockRelease(&xlogctl->info_lck);

7975
	/*
7976 7977 7978 7979 7980 7981
	 * Check that we're still in recovery mode. It's ok if we exit recovery
	 * mode after this check, the restart point is valid anyway.
	 */
	if (!RecoveryInProgress())
	{
		ereport(DEBUG2,
7982
			  (errmsg("skipping restartpoint, recovery has already ended")));
7983 7984 7985 7986 7987 7988 7989 7990 7991
		LWLockRelease(CheckpointLock);
		return false;
	}

	/*
	 * If the last checkpoint record we've replayed is already our last
	 * restartpoint, we can't perform a new restart point. We still update
	 * minRecoveryPoint in that case, so that if this is a shutdown restart
	 * point, we won't start up earlier than before. That's not strictly
B
Bruce Momjian 已提交
7992 7993 7994 7995
	 * necessary, but when hot standby is enabled, it would be rather weird if
	 * the database opened up for read-only connections at a point-in-time
	 * before the last shutdown. Such time travel is still possible in case of
	 * immediate shutdown, though.
7996 7997
	 *
	 * We don't explicitly advance minRecoveryPoint when we do create a
7998 7999
	 * restartpoint. It's assumed that flushing the buffers will do that as a
	 * side-effect.
8000
	 */
8001 8002 8003
	if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
		XLByteLE(lastCheckPoint.redo, ControlFile->checkPointCopy.redo))
	{
8004 8005
		XLogRecPtr	InvalidXLogRecPtr = {0, 0};

8006 8007
		ereport(DEBUG2,
				(errmsg("skipping restartpoint, already performed at %X/%X",
8008
				  lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));
8009 8010

		UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
8011 8012 8013 8014 8015 8016 8017 8018
		if (flags & CHECKPOINT_IS_SHUTDOWN)
		{
			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
			ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
			ControlFile->time = (pg_time_t) time(NULL);
			UpdateControlFile();
			LWLockRelease(ControlFileLock);
		}
8019 8020 8021 8022
		LWLockRelease(CheckpointLock);
		return false;
	}

8023
	/*
B
Bruce Momjian 已提交
8024 8025 8026
	 * Update the shared RedoRecPtr so that the startup process can calculate
	 * the number of segments replayed since last restartpoint, and request a
	 * restartpoint if it exceeds checkpoint_segments.
8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037
	 *
	 * You need to hold WALInsertLock and info_lck to update it, although
	 * during recovery acquiring WALInsertLock is just pro forma, because
	 * there is no other processes updating Insert.RedoRecPtr.
	 */
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
	SpinLockAcquire(&xlogctl->info_lck);
	xlogctl->Insert.RedoRecPtr = lastCheckPoint.redo;
	SpinLockRelease(&xlogctl->info_lck);
	LWLockRelease(WALInsertLock);

8038 8039 8040 8041 8042 8043 8044 8045 8046
	/*
	 * Prepare to accumulate statistics.
	 *
	 * Note: because it is possible for log_checkpoints to change while a
	 * checkpoint proceeds, we always accumulate stats, even if
	 * log_checkpoints is currently off.
	 */
	MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
	CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
8047

8048
	if (log_checkpoints)
8049 8050 8051
		LogCheckpointStart(flags, true);

	CheckPointGuts(lastCheckPoint.redo, flags);
8052

8053 8054 8055 8056 8057 8058
	/*
	 * Select point at which we can truncate the xlog, which we base on the
	 * prior checkpoint's earliest info.
	 */
	XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);

8059
	/*
8060 8061
	 * Update pg_control, using current time.  Check that it still shows
	 * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
B
Bruce Momjian 已提交
8062 8063
	 * this is a quick hack to make sure nothing really bad happens if somehow
	 * we get here after the end-of-recovery checkpoint.
8064
	 */
8065
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8066 8067 8068 8069 8070 8071 8072
	if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
		XLByteLT(ControlFile->checkPointCopy.redo, lastCheckPoint.redo))
	{
		ControlFile->prevCheckPoint = ControlFile->checkPoint;
		ControlFile->checkPoint = lastCheckPointRecPtr;
		ControlFile->checkPointCopy = lastCheckPoint;
		ControlFile->time = (pg_time_t) time(NULL);
8073 8074
		if (flags & CHECKPOINT_IS_SHUTDOWN)
			ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
8075 8076
		UpdateControlFile();
	}
8077
	LWLockRelease(ControlFileLock);
8078

8079 8080 8081 8082 8083 8084 8085
	/*
	 * Delete old log files (those no longer needed even for previous
	 * checkpoint/restartpoint) to prevent the disk holding the xlog from
	 * growing full. We don't need do this during normal recovery, but during
	 * streaming recovery we have to or the disk will eventually fill up from
	 * old log files streamed from master.
	 */
8086
	if (WalRcvInProgress() && (_logId || _logSeg))
8087 8088 8089 8090
	{
		XLogRecPtr	endptr;

		/* Get the current (or recent) end of xlog */
8091
		endptr = GetWalRcvWriteRecPtr(NULL);
8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102

		PrevLogSeg(_logId, _logSeg);
		RemoveOldXlogFiles(_logId, _logSeg, endptr);

		/*
		 * Make more log segments if needed.  (Do this after recycling old log
		 * segments, since that may supply some of the needed files.)
		 */
		PreallocXlogFiles(endptr);
	}

8103
	/*
8104 8105 8106 8107 8108
	 * Truncate pg_subtrans if possible.  We can throw away all data before
	 * the oldest XMIN of any running transaction.	No future transaction will
	 * attempt to reference any pg_subtrans entry older than that (see Asserts
	 * in subtrans.c).	When hot standby is disabled, though, we mustn't do
	 * this because StartupSUBTRANS hasn't been called yet.
8109
	 */
8110 8111
	if (EnableHotStandby)
		TruncateSUBTRANS(GetOldestXmin(true, false));
8112 8113 8114 8115 8116

	/* All real work is done, but log before releasing lock. */
	if (log_checkpoints)
		LogCheckpointEnd(true);

8117
	xtime = GetLatestXTime();
8118
	ereport((log_checkpoints ? LOG : DEBUG2),
8119 8120
			(errmsg("recovery restart point at %X/%X",
					lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff),
B
Bruce Momjian 已提交
8121 8122
		   xtime ? errdetail("last completed transaction was at log time %s",
							 timestamptz_to_str(xtime)) : 0));
8123 8124

	LWLockRelease(CheckpointLock);
8125 8126

	/*
8127
	 * Finally, execute archive_cleanup_command, if any.
8128
	 */
8129 8130 8131
	if (XLogCtl->archiveCleanupCommand[0])
		ExecuteRecoveryCommand(XLogCtl->archiveCleanupCommand,
							   "archive_cleanup_command",
8132 8133
							   false);

8134
	return true;
8135 8136
}

T
Tom Lane 已提交
8137 8138 8139
/*
 * Write a NEXTOID log record
 */
8140 8141 8142
void
XLogPutNextOid(Oid nextOid)
{
B
Bruce Momjian 已提交
8143
	XLogRecData rdata;
8144

B
Bruce Momjian 已提交
8145
	rdata.data = (char *) (&nextOid);
8146
	rdata.len = sizeof(Oid);
8147
	rdata.buffer = InvalidBuffer;
8148 8149
	rdata.next = NULL;
	(void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
B
Bruce Momjian 已提交
8150

8151 8152
	/*
	 * We need not flush the NEXTOID record immediately, because any of the
B
Bruce Momjian 已提交
8153 8154 8155 8156 8157
	 * just-allocated OIDs could only reach disk as part of a tuple insert or
	 * update that would have its own XLOG record that must follow the NEXTOID
	 * record.	Therefore, the standard buffer LSN interlock applied to those
	 * records will ensure no such OID reaches disk before the NEXTOID record
	 * does.
8158 8159
	 *
	 * Note, however, that the above statement only covers state "within" the
B
Bruce Momjian 已提交
8160 8161
	 * database.  When we use a generated OID as a file or directory name, we
	 * are in a sense violating the basic WAL rule, because that filesystem
8162
	 * change may reach disk before the NEXTOID WAL record does.  The impact
B
Bruce Momjian 已提交
8163 8164 8165 8166 8167
	 * of this is that if a database crash occurs immediately afterward, we
	 * might after restart re-generate the same OID and find that it conflicts
	 * with the leftover file or directory.  But since for safety's sake we
	 * always loop until finding a nonconflicting filename, this poses no real
	 * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
8168 8169 8170
	 */
}

8171 8172 8173 8174 8175 8176 8177 8178 8179 8180
/*
 * Write an XLOG SWITCH record.
 *
 * Here we just blindly issue an XLogInsert request for the record.
 * All the magic happens inside XLogInsert.
 *
 * The return value is either the end+1 address of the switch record,
 * or the end+1 address of the prior segment if we did not need to
 * write a switch record because we are already at segment start.
 */
8181
XLogRecPtr
8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197
RequestXLogSwitch(void)
{
	XLogRecPtr	RecPtr;
	XLogRecData rdata;

	/* XLOG SWITCH, alone among xlog record types, has no data */
	rdata.buffer = InvalidBuffer;
	rdata.data = NULL;
	rdata.len = 0;
	rdata.next = NULL;

	RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH, &rdata);

	return RecPtr;
}

8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217
/*
 * Write a RESTORE POINT record
 */
XLogRecPtr
XLogRestorePoint(const char *rpName)
{
	XLogRecPtr				RecPtr;
	XLogRecData				rdata;
	xl_restore_point		xlrec;

	xlrec.rp_time = GetCurrentTimestamp();
	strncpy(xlrec.rp_name, rpName, MAXFNAMELEN);

	rdata.buffer = InvalidBuffer;
	rdata.data = (char *) &xlrec;
	rdata.len = sizeof(xl_restore_point);
	rdata.next = NULL;

	RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT, &rdata);

R
Robert Haas 已提交
8218 8219 8220 8221
	ereport(LOG,
			(errmsg("restore point \"%s\" created at %X/%X",
					rpName,	RecPtr.xlogid, RecPtr.xrecoff)));

8222 8223 8224
	return RecPtr;
}

8225
/*
8226 8227
 * Check if any of the GUC parameters that are critical for hot standby
 * have changed, and update the value in pg_control file if necessary.
8228
 */
8229 8230
static void
XLogReportParameters(void)
8231
{
8232 8233 8234
	if (wal_level != ControlFile->wal_level ||
		MaxConnections != ControlFile->MaxConnections ||
		max_prepared_xacts != ControlFile->max_prepared_xacts ||
8235
		max_locks_per_xact != ControlFile->max_locks_per_xact)
8236 8237
	{
		/*
B
Bruce Momjian 已提交
8238 8239 8240 8241 8242
		 * The change in number of backend slots doesn't need to be WAL-logged
		 * if archiving is not enabled, as you can't start archive recovery
		 * with wal_level=minimal anyway. We don't really care about the
		 * values in pg_control either if wal_level=minimal, but seems better
		 * to keep them up-to-date to avoid confusion.
8243 8244 8245 8246 8247
		 */
		if (wal_level != ControlFile->wal_level || XLogIsNeeded())
		{
			XLogRecData rdata;
			xl_parameter_change xlrec;
8248

8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260
			xlrec.MaxConnections = MaxConnections;
			xlrec.max_prepared_xacts = max_prepared_xacts;
			xlrec.max_locks_per_xact = max_locks_per_xact;
			xlrec.wal_level = wal_level;

			rdata.buffer = InvalidBuffer;
			rdata.data = (char *) &xlrec;
			rdata.len = sizeof(xlrec);
			rdata.next = NULL;

			XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE, &rdata);
		}
8261

8262 8263 8264 8265 8266 8267
		ControlFile->MaxConnections = MaxConnections;
		ControlFile->max_prepared_xacts = max_prepared_xacts;
		ControlFile->max_locks_per_xact = max_locks_per_xact;
		ControlFile->wal_level = wal_level;
		UpdateControlFile();
	}
8268 8269
}

T
Tom Lane 已提交
8270 8271
/*
 * XLOG resource manager's routines
8272 8273
 *
 * Definitions of info values are in include/catalog/pg_control.h, though
8274
 * not all record types are related to control file updates.
T
Tom Lane 已提交
8275
 */
V
WAL  
Vadim B. Mikheev 已提交
8276 8277 8278
void
xlog_redo(XLogRecPtr lsn, XLogRecord *record)
{
B
Bruce Momjian 已提交
8279
	uint8		info = record->xl_info & ~XLR_INFO_MASK;
8280

8281 8282 8283
	/* Backup blocks are not used in xlog records */
	Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));

8284
	if (info == XLOG_NEXTOID)
8285
	{
B
Bruce Momjian 已提交
8286
		Oid			nextOid;
8287 8288 8289

		memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
		if (ShmemVariableCache->nextOid < nextOid)
T
Tom Lane 已提交
8290
		{
8291
			ShmemVariableCache->nextOid = nextOid;
T
Tom Lane 已提交
8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303
			ShmemVariableCache->oidCount = 0;
		}
	}
	else if (info == XLOG_CHECKPOINT_SHUTDOWN)
	{
		CheckPoint	checkPoint;

		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
		/* In a SHUTDOWN checkpoint, believe the counters exactly */
		ShmemVariableCache->nextXid = checkPoint.nextXid;
		ShmemVariableCache->nextOid = checkPoint.nextOid;
		ShmemVariableCache->oidCount = 0;
8304 8305
		MultiXactSetNextMXact(checkPoint.nextMulti,
							  checkPoint.nextMultiOffset);
8306
		SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
B
Bruce Momjian 已提交
8307

8308
		/*
B
Bruce Momjian 已提交
8309 8310 8311
		 * If we see a shutdown checkpoint while waiting for an end-of-backup
		 * record, the backup was cancelled and the end-of-backup record will
		 * never arrive.
8312
		 */
8313
		if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
8314 8315 8316
			ereport(ERROR,
					(errmsg("online backup was cancelled, recovery cannot continue")));

8317
		/*
B
Bruce Momjian 已提交
8318 8319 8320 8321
		 * If we see a shutdown checkpoint, we know that nothing was running
		 * on the master at this point. So fake-up an empty running-xacts
		 * record and use that here and now. Recover additional standby state
		 * for prepared transactions.
8322
		 */
8323 8324
		if (standbyState >= STANDBY_INITIALIZED)
		{
8325 8326 8327
			TransactionId *xids;
			int			nxids;
			TransactionId oldestActiveXID;
8328
			TransactionId latestCompletedXid;
8329 8330 8331 8332
			RunningTransactionsData running;

			oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);

8333
			/*
8334
			 * Construct a RunningTransactions snapshot representing a shut
B
Bruce Momjian 已提交
8335 8336 8337
			 * down server, with only prepared transactions still alive. We're
			 * never overflowed at this point because all subxids are listed
			 * with their parent prepared transactions.
8338
			 */
8339 8340 8341 8342
			running.xcnt = nxids;
			running.subxid_overflow = false;
			running.nextXid = checkPoint.nextXid;
			running.oldestRunningXid = oldestActiveXID;
8343 8344
			latestCompletedXid = checkPoint.nextXid;
			TransactionIdRetreat(latestCompletedXid);
8345
			Assert(TransactionIdIsNormal(latestCompletedXid));
8346
			running.latestCompletedXid = latestCompletedXid;
8347 8348 8349 8350 8351
			running.xids = xids;

			ProcArrayApplyRecoveryInfo(&running);

			StandbyRecoverPreparedTransactions(true);
8352 8353
		}

8354 8355 8356 8357
		/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
		ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;

8358
		/*
B
Bruce Momjian 已提交
8359
		 * TLI may change in a shutdown checkpoint, but it shouldn't decrease
8360 8361 8362 8363 8364 8365 8366 8367
		 */
		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
		{
			if (checkPoint.ThisTimeLineID < ThisTimeLineID ||
				!list_member_int(expectedTLIs,
								 (int) checkPoint.ThisTimeLineID))
				ereport(PANIC,
						(errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
8368 8369 8370
								checkPoint.ThisTimeLineID, ThisTimeLineID)));
			/* Following WAL records should be run with new TLI */
			ThisTimeLineID = checkPoint.ThisTimeLineID;
8371
		}
8372 8373

		RecoveryRestartPoint(&checkPoint);
T
Tom Lane 已提交
8374 8375 8376 8377 8378 8379
	}
	else if (info == XLOG_CHECKPOINT_ONLINE)
	{
		CheckPoint	checkPoint;

		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
8380
		/* In an ONLINE checkpoint, treat the counters like NEXTOID */
8381 8382
		if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
								  checkPoint.nextXid))
T
Tom Lane 已提交
8383 8384 8385 8386 8387 8388
			ShmemVariableCache->nextXid = checkPoint.nextXid;
		if (ShmemVariableCache->nextOid < checkPoint.nextOid)
		{
			ShmemVariableCache->nextOid = checkPoint.nextOid;
			ShmemVariableCache->oidCount = 0;
		}
8389 8390
		MultiXactAdvanceNextMXact(checkPoint.nextMulti,
								  checkPoint.nextMultiOffset);
8391 8392
		if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
								  checkPoint.oldestXid))
8393 8394
			SetTransactionIdLimit(checkPoint.oldestXid,
								  checkPoint.oldestXidDB);
8395 8396 8397 8398 8399

		/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
		ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;

8400 8401
		/* TLI should not change in an on-line checkpoint */
		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
8402
			ereport(PANIC,
8403 8404
					(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
							checkPoint.ThisTimeLineID, ThisTimeLineID)));
8405 8406

		RecoveryRestartPoint(&checkPoint);
8407
	}
8408 8409 8410 8411
	else if (info == XLOG_NOOP)
	{
		/* nothing to do here */
	}
8412 8413 8414 8415
	else if (info == XLOG_SWITCH)
	{
		/* nothing to do here */
	}
8416 8417 8418 8419
	else if (info == XLOG_RESTORE_POINT)
	{
		/* nothing to do here */
	}
8420 8421 8422
	else if (info == XLOG_BACKUP_END)
	{
		XLogRecPtr	startpoint;
B
Bruce Momjian 已提交
8423

8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446
		memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));

		if (XLByteEQ(ControlFile->backupStartPoint, startpoint))
		{
			/*
			 * We have reached the end of base backup, the point where
			 * pg_stop_backup() was done. The data on disk is now consistent.
			 * Reset backupStartPoint, and update minRecoveryPoint to make
			 * sure we don't allow starting up at an earlier point even if
			 * recovery is stopped and restarted soon after this.
			 */
			elog(DEBUG1, "end of backup reached");

			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

			if (XLByteLT(ControlFile->minRecoveryPoint, lsn))
				ControlFile->minRecoveryPoint = lsn;
			MemSet(&ControlFile->backupStartPoint, 0, sizeof(XLogRecPtr));
			UpdateControlFile();

			LWLockRelease(ControlFileLock);
		}
	}
8447
	else if (info == XLOG_PARAMETER_CHANGE)
8448
	{
8449 8450 8451 8452 8453
		xl_parameter_change xlrec;

		/* Update our copy of the parameters in pg_control */
		memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));

8454
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8455 8456 8457 8458
		ControlFile->MaxConnections = xlrec.MaxConnections;
		ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
		ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
		ControlFile->wal_level = xlrec.wal_level;
B
Bruce Momjian 已提交
8459

8460
		/*
B
Bruce Momjian 已提交
8461 8462 8463 8464 8465 8466
		 * Update minRecoveryPoint to ensure that if recovery is aborted, we
		 * recover back up to this point before allowing hot standby again.
		 * This is particularly important if wal_level was set to 'archive'
		 * before, and is now 'hot_standby', to ensure you don't run queries
		 * against the WAL preceding the wal_level change. Same applies to
		 * decreasing max_* settings.
8467 8468 8469 8470 8471 8472 8473 8474
		 */
		minRecoveryPoint = ControlFile->minRecoveryPoint;
		if ((minRecoveryPoint.xlogid != 0 || minRecoveryPoint.xrecoff != 0)
			&& XLByteLT(minRecoveryPoint, lsn))
		{
			ControlFile->minRecoveryPoint = lsn;
		}

8475
		UpdateControlFile();
8476
		LWLockRelease(ControlFileLock);
8477 8478 8479

		/* Check to see if any changes to max_connections give problems */
		CheckRequiredParameterValues();
8480
	}
V
WAL  
Vadim B. Mikheev 已提交
8481
}
B
Bruce Momjian 已提交
8482

V
WAL  
Vadim B. Mikheev 已提交
8483
void
8484
xlog_desc(StringInfo buf, uint8 xl_info, char *rec)
V
WAL  
Vadim B. Mikheev 已提交
8485
{
B
Bruce Momjian 已提交
8486
	uint8		info = xl_info & ~XLR_INFO_MASK;
V
WAL  
Vadim B. Mikheev 已提交
8487

T
Tom Lane 已提交
8488 8489
	if (info == XLOG_CHECKPOINT_SHUTDOWN ||
		info == XLOG_CHECKPOINT_ONLINE)
V
WAL  
Vadim B. Mikheev 已提交
8490
	{
B
Bruce Momjian 已提交
8491 8492
		CheckPoint *checkpoint = (CheckPoint *) rec;

8493
		appendStringInfo(buf, "checkpoint: redo %X/%X; "
8494
						 "tli %u; xid %u/%u; oid %u; multi %u; offset %u; "
8495
						 "oldest xid %u in DB %u; oldest running xid %u; %s",
B
Bruce Momjian 已提交
8496 8497 8498 8499 8500 8501
						 checkpoint->redo.xlogid, checkpoint->redo.xrecoff,
						 checkpoint->ThisTimeLineID,
						 checkpoint->nextXidEpoch, checkpoint->nextXid,
						 checkpoint->nextOid,
						 checkpoint->nextMulti,
						 checkpoint->nextMultiOffset,
8502 8503
						 checkpoint->oldestXid,
						 checkpoint->oldestXidDB,
8504
						 checkpoint->oldestActiveXid,
B
Bruce Momjian 已提交
8505
				 (info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : "online");
T
Tom Lane 已提交
8506
	}
8507 8508 8509 8510
	else if (info == XLOG_NOOP)
	{
		appendStringInfo(buf, "xlog no-op");
	}
8511 8512
	else if (info == XLOG_NEXTOID)
	{
B
Bruce Momjian 已提交
8513
		Oid			nextOid;
8514 8515

		memcpy(&nextOid, rec, sizeof(Oid));
8516
		appendStringInfo(buf, "nextOid: %u", nextOid);
8517
	}
8518 8519 8520 8521
	else if (info == XLOG_SWITCH)
	{
		appendStringInfo(buf, "xlog switch");
	}
8522 8523 8524 8525 8526 8527 8528
	else if (info == XLOG_RESTORE_POINT)
	{
		xl_restore_point *xlrec = (xl_restore_point *) rec;

		appendStringInfo(buf, "restore point: %s", xlrec->rp_name);

	}
8529 8530
	else if (info == XLOG_BACKUP_END)
	{
B
Bruce Momjian 已提交
8531
		XLogRecPtr	startpoint;
8532 8533 8534 8535 8536

		memcpy(&startpoint, rec, sizeof(XLogRecPtr));
		appendStringInfo(buf, "backup end: %X/%X",
						 startpoint.xlogid, startpoint.xrecoff);
	}
8537
	else if (info == XLOG_PARAMETER_CHANGE)
8538
	{
8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554
		xl_parameter_change xlrec;
		const char *wal_level_str;
		const struct config_enum_entry *entry;

		memcpy(&xlrec, rec, sizeof(xl_parameter_change));

		/* Find a string representation for wal_level */
		wal_level_str = "?";
		for (entry = wal_level_options; entry->name; entry++)
		{
			if (entry->val == xlrec.wal_level)
			{
				wal_level_str = entry->name;
				break;
			}
		}
8555

8556 8557 8558 8559 8560
		appendStringInfo(buf, "parameter change: max_connections=%d max_prepared_xacts=%d max_locks_per_xact=%d wal_level=%s",
						 xlrec.MaxConnections,
						 xlrec.max_prepared_xacts,
						 xlrec.max_locks_per_xact,
						 wal_level_str);
8561
	}
V
WAL  
Vadim B. Mikheev 已提交
8562
	else
8563
		appendStringInfo(buf, "UNKNOWN");
V
WAL  
Vadim B. Mikheev 已提交
8564 8565
}

8566
#ifdef WAL_DEBUG

/*
 * Append a one-line summary of a record header to buf: previous-record
 * pointer, xid, length, one "bkpbN" marker (1-based) for each backup-block
 * bit set in xl_info, and the owning resource manager's name.
 *
 * Compiled only when WAL_DEBUG is defined.
 */
static void
xlog_outrec(StringInfo buf, XLogRecord *record)
{
	int			i;

	appendStringInfo(buf, "prev %X/%X; xid %u",
					 record->xl_prev.xlogid, record->xl_prev.xrecoff,
					 record->xl_xid);

	appendStringInfo(buf, "; len %u",
					 record->xl_len);

	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
	{
		if (record->xl_info & XLR_SET_BKP_BLOCK(i))
			appendStringInfo(buf, "; bkpb%d", i + 1);
	}

	appendStringInfo(buf, ": %s", RmgrTable[record->xl_rmid].rm_name);
}
#endif   /* WAL_DEBUG */
8589 8590 8591


/*
8592 8593
 * Return the (possible) sync flag used for opening a file, depending on the
 * value of the GUC wal_sync_method.
8594
 */
8595 8596
static int
get_sync_bit(int method)
8597
{
B
Bruce Momjian 已提交
8598
	int			o_direct_flag = 0;
8599

8600 8601 8602
	/* If fsync is disabled, never open in sync mode */
	if (!enableFsync)
		return 0;
8603

8604 8605
	/*
	 * Optimize writes by bypassing kernel cache with O_DIRECT when using
8606
	 * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
B
Bruce Momjian 已提交
8607 8608 8609 8610 8611
	 * disabled, otherwise the archive command or walsender process will read
	 * the WAL soon after writing it, which is guaranteed to cause a physical
	 * read if we bypassed the kernel cache. We also skip the
	 * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
	 * reason.
8612 8613 8614 8615 8616 8617 8618 8619 8620
	 *
	 * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
	 * written by walreceiver is normally read by the startup process soon
	 * after its written. Also, walreceiver performs unaligned writes, which
	 * don't work with O_DIRECT, so it is required for correctness too.
	 */
	if (!XLogIsNeeded() && !am_walreceiver)
		o_direct_flag = PG_O_DIRECT;

8621
	switch (method)
8622
	{
8623 8624 8625 8626 8627 8628
			/*
			 * enum values for all sync options are defined even if they are
			 * not supported on the current platform.  But if not, they are
			 * not included in the enum option array, and therefore will never
			 * be seen here.
			 */
8629 8630 8631
		case SYNC_METHOD_FSYNC:
		case SYNC_METHOD_FSYNC_WRITETHROUGH:
		case SYNC_METHOD_FDATASYNC:
8632
			return 0;
8633
#ifdef OPEN_SYNC_FLAG
8634
		case SYNC_METHOD_OPEN:
8635
			return OPEN_SYNC_FLAG | o_direct_flag;
8636 8637
#endif
#ifdef OPEN_DATASYNC_FLAG
8638
		case SYNC_METHOD_OPEN_DSYNC:
8639
			return OPEN_DATASYNC_FLAG | o_direct_flag;
8640
#endif
8641
		default:
8642 8643
			/* can't happen (unless we are out of sync with option array) */
			elog(ERROR, "unrecognized wal_sync_method: %d", method);
8644
			return 0;			/* silence warning */
8645
	}
8646
}
8647

8648 8649 8650 8651 8652 8653
/*
 * GUC support
 */
/*
 * Assign hook for the WAL sync method setting.
 *
 * When 'doit' is set and the method actually changes, the currently open
 * log segment (if any) is fsync'd so that no blocks escape unsynced.  If the
 * open flag bits differ between the old and new method, the segment is also
 * closed so that it gets reopened with the new flags at next use.
 *
 * Always returns true.
 */
bool
assign_xlog_sync_method(int new_sync_method, bool doit, GucSource source)
{
	/* Nothing to do for a dry run or an unchanged setting */
	if (!doit || sync_method == new_sync_method)
		return true;

	/* Without an open segment there is nothing to flush or reopen */
	if (openLogFile < 0)
		return true;

	if (pg_fsync(openLogFile) != 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not fsync log file %u, segment %u: %m",
						openLogId, openLogSeg)));

	if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
		XLogFileClose();

	return true;
}


/*
8682 8683 8684 8685
 * Issue appropriate kind of fsync (if any) for an XLOG output file.
 *
 * 'fd' is a file descriptor for the XLOG file to be fsync'd.
 * 'log' and 'seg' are for error reporting purposes.
8686
 */
8687 8688
void
issue_xlog_fsync(int fd, uint32 log, uint32 seg)
8689 8690 8691
{
	switch (sync_method)
	{
8692
		case SYNC_METHOD_FSYNC:
8693
			if (pg_fsync_no_writethrough(fd) != 0)
8694 8695
				ereport(PANIC,
						(errcode_for_file_access(),
B
Bruce Momjian 已提交
8696
						 errmsg("could not fsync log file %u, segment %u: %m",
8697
								log, seg)));
8698
			break;
8699 8700
#ifdef HAVE_FSYNC_WRITETHROUGH
		case SYNC_METHOD_FSYNC_WRITETHROUGH:
8701
			if (pg_fsync_writethrough(fd) != 0)
8702 8703
				ereport(PANIC,
						(errcode_for_file_access(),
B
Bruce Momjian 已提交
8704
						 errmsg("could not fsync write-through log file %u, segment %u: %m",
8705
								log, seg)));
8706 8707
			break;
#endif
8708 8709
#ifdef HAVE_FDATASYNC
		case SYNC_METHOD_FDATASYNC:
8710
			if (pg_fdatasync(fd) != 0)
8711 8712
				ereport(PANIC,
						(errcode_for_file_access(),
B
Bruce Momjian 已提交
8713
					errmsg("could not fdatasync log file %u, segment %u: %m",
8714
						   log, seg)));
8715 8716 8717
			break;
#endif
		case SYNC_METHOD_OPEN:
8718
		case SYNC_METHOD_OPEN_DSYNC:
8719 8720 8721
			/* write synced it already */
			break;
		default:
8722
			elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
8723 8724 8725
			break;
	}
}
8726 8727 8728 8729 8730 8731 8732 8733 8734


/*
 * pg_start_backup: set up for taking an on-line backup dump
 *
 * Essentially what this does is to create a backup label file in $PGDATA,
 * where it will be archived as part of the backup dump.  The label file
 * contains the user-supplied label string (typically this would be used
 * to tell where the backup dump will be stored) and the starting time and
8735
 * starting WAL location for the dump.
8736 8737 8738 8739 8740
 */
Datum
pg_start_backup(PG_FUNCTION_ARGS)
{
	text	   *backupid = PG_GETARG_TEXT_P(0);
8741
	bool		fast = PG_GETARG_BOOL(1);
8742
	char	   *backupidstr;
8743 8744 8745 8746 8747
	XLogRecPtr  startpoint;
	char		startxlogstr[MAXFNAMELEN];

	backupidstr = text_to_cstring(backupid);

8748
	startpoint = do_pg_start_backup(backupidstr, fast, NULL);
8749 8750 8751 8752 8753 8754

	snprintf(startxlogstr, sizeof(startxlogstr), "%X/%X",
			 startpoint.xlogid, startpoint.xrecoff);
	PG_RETURN_TEXT_P(cstring_to_text(startxlogstr));
}

8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775
/*
 * do_pg_start_backup is the workhorse of the user-visible pg_start_backup()
 * function. It creates the necessary starting checkpoint and constructs the
 * backup label file.
 * 
 * There are two kind of backups: exclusive and non-exclusive. An exclusive
 * backup is started with pg_start_backup(), and there can be only one active
 * at a time. The backup label file of an exclusive backup is written to
 * $PGDATA/backup_label, and it is removed by pg_stop_backup().
 *
 * A non-exclusive backup is used for the streaming base backups (see
 * src/backend/replication/basebackup.c). The difference to exclusive backups
 * is that the backup label file is not written to disk. Instead, its would-be
 * contents are returned in *labelfile, and the caller is responsible for
 * including it in the backup archive as 'backup_label'. There can be many
 * non-exclusive backups active at the same time, and they don't conflict
 * with an exclusive backup either.
 *
 * Every successfully started non-exclusive backup must be stopped by calling
 * do_pg_stop_backup() or do_pg_abort_backup().
 */
8776
XLogRecPtr
8777
do_pg_start_backup(const char *backupidstr, bool fast, char **labelfile)
8778
{
8779
	bool		exclusive = (labelfile == NULL);
8780
	XLogRecPtr	checkpointloc;
8781
	XLogRecPtr	startpoint;
8782
	pg_time_t	stamp_time;
8783 8784 8785 8786 8787 8788
	char		strfbuf[128];
	char		xlogfilename[MAXFNAMELEN];
	uint32		_logId;
	uint32		_logSeg;
	struct stat stat_buf;
	FILE	   *fp;
8789
	StringInfoData labelfbuf;
8790

8791
	if (!superuser() && !is_authenticated_user_replication_role())
8792 8793
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
8794
				 errmsg("must be superuser or replication role to run a backup")));
8795

8796 8797 8798 8799 8800 8801
	if (RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("WAL control functions cannot be executed during recovery.")));

8802
	if (!XLogIsNeeded())
8803 8804
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
B
Bruce Momjian 已提交
8805
			  errmsg("WAL level not sufficient for making an online backup"),
8806
				 errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
8807

8808 8809 8810 8811 8812 8813
	if (strlen(backupidstr) > MAXPGPATH)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("backup label too long (max %d bytes)",
						MAXPGPATH)));

8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826
	/*
	 * Force an XLOG file switch before the checkpoint, to ensure that the WAL
	 * segment the checkpoint is written to doesn't contain pages with old
	 * timeline IDs. That would otherwise happen if you called
	 * pg_start_backup() right after restoring from a PITR archive: the first
	 * WAL segment containing the startup checkpoint has pages in the
	 * beginning with the old timeline ID. That can cause trouble at recovery:
	 * we won't have a history file covering the old timeline if pg_xlog
	 * directory was not included in the base backup and the WAL archive was
	 * cleared too before starting the backup.
	 */
	RequestXLogSwitch();

8827
	/*
8828 8829 8830
	 * Mark backup active in shared memory.  We must do full-page WAL writes
	 * during an on-line backup even if not doing so at other times, because
	 * it's quite possible for the backup dump to obtain a "torn" (partially
B
Bruce Momjian 已提交
8831 8832 8833 8834 8835 8836 8837 8838 8839
	 * written) copy of a database page if it reads the page concurrently with
	 * our write to the same page.	This can be fixed as long as the first
	 * write to the page in the WAL sequence is a full-page write. Hence, we
	 * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
	 * are no dirty pages in shared memory that might get dumped while the
	 * backup is in progress without having a corresponding WAL record.  (Once
	 * the backup is complete, we need not force full-page writes anymore,
	 * since we expect that any pages not modified during the backup interval
	 * must have been correctly captured by the backup.)
8840
	 *
B
Bruce Momjian 已提交
8841 8842
	 * We must hold WALInsertLock to change the value of forcePageWrites, to
	 * ensure adequate interlocking against XLogInsert().
8843
	 */
8844
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
8845
	if (exclusive)
8846
	{
8847 8848 8849 8850 8851 8852 8853 8854 8855
		if (XLogCtl->Insert.exclusiveBackup)
		{
			LWLockRelease(WALInsertLock);
			ereport(ERROR,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("a backup is already in progress"),
					 errhint("Run pg_stop_backup() and try again.")));
		}
		XLogCtl->Insert.exclusiveBackup = true;
8856
	}
8857 8858
	else
		XLogCtl->Insert.nonExclusiveBackups++;
8859 8860
	XLogCtl->Insert.forcePageWrites = true;
	LWLockRelease(WALInsertLock);
B
Bruce Momjian 已提交
8861

8862
	/* Ensure we release forcePageWrites if fail below */
8863
	PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
8864
	{
8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878
		bool gotUniqueStartpoint = false;
		do
		{
			/*
			 * Force a CHECKPOINT.	Aside from being necessary to prevent torn
			 * page problems, this guarantees that two successive backup runs will
			 * have different checkpoint positions and hence different history
			 * file names, even if nothing happened in between.
			 *
			 * We use CHECKPOINT_IMMEDIATE only if requested by user (via passing
			 * fast = true).  Otherwise this can take awhile.
			 */
			RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
							  (fast ? CHECKPOINT_IMMEDIATE : 0));
8879

8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909
			/*
			 * Now we need to fetch the checkpoint record location, and also its
			 * REDO pointer.  The oldest point in WAL that would be needed to
			 * restore starting from the checkpoint is precisely the REDO pointer.
			 */
			LWLockAcquire(ControlFileLock, LW_SHARED);
			checkpointloc = ControlFile->checkPoint;
			startpoint = ControlFile->checkPointCopy.redo;
			LWLockRelease(ControlFileLock);

			/*
			 * If two base backups are started at the same time (in WAL
			 * sender processes), we need to make sure that they use
			 * different checkpoints as starting locations, because we use
			 * the starting WAL location as a unique identifier for the base
			 * backup in the end-of-backup WAL record and when we write the
			 * backup history file. Perhaps it would be better generate a
			 * separate unique ID for each backup instead of forcing another
			 * checkpoint, but taking a checkpoint right after another is
			 * not that expensive either because only few buffers have been
			 * dirtied yet.
			 */
			LWLockAcquire(WALInsertLock, LW_SHARED);
			if (XLByteLT(XLogCtl->Insert.lastBackupStart, startpoint))
			{
				XLogCtl->Insert.lastBackupStart = startpoint;
				gotUniqueStartpoint = true;
			}
			LWLockRelease(WALInsertLock);
		} while(!gotUniqueStartpoint);
B
Bruce Momjian 已提交
8910

8911 8912
		XLByteToSeg(startpoint, _logId, _logSeg);
		XLogFileName(xlogfilename, ThisTimeLineID, _logId, _logSeg);
B
Bruce Momjian 已提交
8913

8914 8915 8916 8917 8918
		/*
		 * Construct backup label file 
		 */
		initStringInfo(&labelfbuf);

8919 8920 8921 8922 8923
		/* Use the log timezone here, not the session timezone */
		stamp_time = (pg_time_t) time(NULL);
		pg_strftime(strfbuf, sizeof(strfbuf),
					"%Y-%m-%d %H:%M:%S %Z",
					pg_localtime(&stamp_time, log_timezone));
8924 8925 8926 8927 8928 8929
		appendStringInfo(&labelfbuf, "START WAL LOCATION: %X/%X (file %s)\n",
						 startpoint.xlogid, startpoint.xrecoff, xlogfilename);
		appendStringInfo(&labelfbuf, "CHECKPOINT LOCATION: %X/%X\n",
						 checkpointloc.xlogid, checkpointloc.xrecoff);
		appendStringInfo(&labelfbuf, "START TIME: %s\n", strfbuf);
		appendStringInfo(&labelfbuf, "LABEL: %s\n", backupidstr);
8930 8931

		/*
8932
		 * Okay, write the file, or return its contents to caller.
8933
		 */
8934
		if (exclusive)
8935
		{
8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958
			/*
			 * Check for existing backup label --- implies a backup is already
			 * running.  (XXX given that we checked exclusiveBackup above, maybe
			 * it would be OK to just unlink any such label file?)
			 */
			if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
			{
				if (errno != ENOENT)
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not stat file \"%s\": %m",
									BACKUP_LABEL_FILE)));
			}
			else
				ereport(ERROR,
						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						 errmsg("a backup is already in progress"),
						 errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
								 BACKUP_LABEL_FILE)));

			fp = AllocateFile(BACKUP_LABEL_FILE, "w");

			if (!fp)
8959 8960
				ereport(ERROR,
						(errcode_for_file_access(),
8961 8962 8963 8964 8965 8966 8967
						 errmsg("could not create file \"%s\": %m",
								BACKUP_LABEL_FILE)));
			fwrite(labelfbuf.data, labelfbuf.len, 1, fp);
			if (fflush(fp) || ferror(fp) || FreeFile(fp))
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not write file \"%s\": %m",
8968
								BACKUP_LABEL_FILE)));
8969
			pfree(labelfbuf.data);
8970 8971
		}
		else
8972
			*labelfile = labelfbuf.data;
8973
	}
8974
	PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
B
Bruce Momjian 已提交
8975

8976
	/*
8977
	 * We're done.  As a convenience, return the starting WAL location.
8978
	 */
8979
	return startpoint;
8980 8981
}

8982 8983 8984 8985
/*
 * Error cleanup callback for pg_start_backup.
 *
 * 'arg' carries the "exclusive" flag of the backup that failed to start.
 * The matching backup marker in shared memory is undone, and
 * forcePageWrites is switched off once no backup of either kind remains.
 */
static void
pg_start_backup_callback(int code, Datum arg)
{
	bool		was_exclusive = DatumGetBool(arg);
	bool		backup_still_active;

	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);

	if (!was_exclusive)
	{
		Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
		XLogCtl->Insert.nonExclusiveBackups--;
	}
	else
	{
		Assert(XLogCtl->Insert.exclusiveBackup);
		XLogCtl->Insert.exclusiveBackup = false;
	}

	backup_still_active = XLogCtl->Insert.exclusiveBackup ||
		XLogCtl->Insert.nonExclusiveBackups != 0;
	if (!backup_still_active)
		XLogCtl->Insert.forcePageWrites = false;

	LWLockRelease(WALInsertLock);
}

9009 9010 9011
/*
 * pg_stop_backup: finish taking an on-line backup dump
 *
9012 9013 9014 9015
 * We write an end-of-backup WAL record, and remove the backup label file
 * created by pg_start_backup, creating a backup history file in pg_xlog
 * instead (whence it will immediately be archived). The backup history file
 * contains the same info found in the label file, plus the backup-end time
9016
 * and WAL location. Before 9.0, the backup-end time was read from the backup
9017 9018 9019
 * history file at the beginning of archive recovery, but we now use the WAL
 * record for that and the file is for informational and debug purposes only.
 *
9020
 * Note: different from CancelBackup which just cancels online backup mode.
9021 9022 9023
 */
Datum
pg_stop_backup(PG_FUNCTION_ARGS)
9024 9025 9026 9027
{
	XLogRecPtr	stoppoint;
	char		stopxlogstr[MAXFNAMELEN];

9028
	stoppoint = do_pg_stop_backup(NULL, true);
9029 9030 9031 9032 9033 9034

	snprintf(stopxlogstr, sizeof(stopxlogstr), "%X/%X",
			 stoppoint.xlogid, stoppoint.xrecoff);
	PG_RETURN_TEXT_P(cstring_to_text(stopxlogstr));
}

9035 9036 9037 9038 9039 9040 9041
/*
 * do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
 * function.

 * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
 * the non-exclusive backup specified by 'labelfile'.
 */
9042
XLogRecPtr
9043
do_pg_stop_backup(char *labelfile, bool waitforarchive)
9044
{
9045
	bool		exclusive = (labelfile == NULL);
9046 9047
	XLogRecPtr	startpoint;
	XLogRecPtr	stoppoint;
B
Bruce Momjian 已提交
9048
	XLogRecData rdata;
9049
	pg_time_t	stamp_time;
9050
	char		strfbuf[128];
9051
	char		histfilepath[MAXPGPATH];
9052 9053
	char		startxlogfilename[MAXFNAMELEN];
	char		stopxlogfilename[MAXFNAMELEN];
9054 9055
	char		lastxlogfilename[MAXFNAMELEN];
	char		histfilename[MAXFNAMELEN];
9056 9057 9058 9059 9060
	uint32		_logId;
	uint32		_logSeg;
	FILE	   *lfp;
	FILE	   *fp;
	char		ch;
9061 9062
	int			seconds_before_warning;
	int			waits = 0;
9063
	bool		reported_waiting = false;
9064
	char	   *remaining;
9065

9066
	if (!superuser() && !is_authenticated_user_replication_role())
9067 9068
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
9069
				 (errmsg("must be superuser or replication role to run a backup"))));
B
Bruce Momjian 已提交
9070

9071 9072 9073 9074 9075 9076
	if (RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("WAL control functions cannot be executed during recovery.")));

9077
	if (!XLogIsNeeded())
9078 9079
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
B
Bruce Momjian 已提交
9080
			  errmsg("WAL level not sufficient for making an online backup"),
9081
				 errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
9082

9083
	/*
9084
	 * OK to update backup counters and forcePageWrites
9085 9086
	 */
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105
	if (exclusive)
		XLogCtl->Insert.exclusiveBackup = false;
	else
	{
		/*
		 * The user-visible pg_start/stop_backup() functions that operate on
		 * exclusive backups can be called at any time, but for non-exclusive
		 * backups, it is expected that each do_pg_start_backup() call is
		 * matched by exactly one do_pg_stop_backup() call.
		 */
		Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
		XLogCtl->Insert.nonExclusiveBackups--;
	}

	if (!XLogCtl->Insert.exclusiveBackup &&
		XLogCtl->Insert.nonExclusiveBackups == 0)
	{
		XLogCtl->Insert.forcePageWrites = false;
	}
9106 9107
	LWLockRelease(WALInsertLock);

9108
	if (exclusive)
9109
	{
9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130
		/*
		 * Read the existing label file into memory.
		 */
		struct	stat statbuf;
		int		r;

		if (stat(BACKUP_LABEL_FILE, &statbuf))
		{
			if (errno != ENOENT)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not stat file \"%s\": %m",
								BACKUP_LABEL_FILE)));
			ereport(ERROR,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("a backup is not in progress")));
		}

		lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
		if (!lfp)
		{
9131 9132 9133
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
9134
							BACKUP_LABEL_FILE)));
9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152
		}
		labelfile = palloc(statbuf.st_size + 1);
		r = fread(labelfile, statbuf.st_size, 1, lfp);
		labelfile[statbuf.st_size] = '\0';

		/*
		 * Close and remove the backup label file
		 */
		if (r != 1 || ferror(lfp) || FreeFile(lfp))
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
							BACKUP_LABEL_FILE)));
		if (unlink(BACKUP_LABEL_FILE) != 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not remove file \"%s\": %m",
							BACKUP_LABEL_FILE)));
9153
	}
B
Bruce Momjian 已提交
9154

9155
	/*
B
Bruce Momjian 已提交
9156 9157
	 * Read and parse the START WAL LOCATION line (this code is pretty crude,
	 * but we are not expecting any variability in the file format).
9158
	 */
9159
	if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
9160 9161 9162 9163
			   &startpoint.xlogid, &startpoint.xrecoff, startxlogfilename,
			   &ch) != 4 || ch != '\n')
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9164
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
9165
	remaining = strchr(labelfile, '\n') + 1; /* %n is not portable enough */
B
Bruce Momjian 已提交
9166

9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181
	/*
	 * Write the backup-end xlog record
	 */
	rdata.data = (char *) (&startpoint);
	rdata.len = sizeof(startpoint);
	rdata.buffer = InvalidBuffer;
	rdata.next = NULL;
	stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END, &rdata);

	/*
	 * Force a switch to a new xlog segment file, so that the backup is valid
	 * as soon as archiver moves out the current segment file.
	 */
	RequestXLogSwitch();

9182
	XLByteToPrevSeg(stoppoint, _logId, _logSeg);
9183 9184 9185 9186 9187 9188 9189 9190
	XLogFileName(stopxlogfilename, ThisTimeLineID, _logId, _logSeg);

	/* Use the log timezone here, not the session timezone */
	stamp_time = (pg_time_t) time(NULL);
	pg_strftime(strfbuf, sizeof(strfbuf),
				"%Y-%m-%d %H:%M:%S %Z",
				pg_localtime(&stamp_time, log_timezone));

9191 9192 9193 9194
	/*
	 * Write the backup history file
	 */
	XLByteToSeg(startpoint, _logId, _logSeg);
9195
	BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logId, _logSeg,
9196
						  startpoint.xrecoff % XLogSegSize);
9197
	fp = AllocateFile(histfilepath, "w");
9198 9199 9200 9201
	if (!fp)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m",
9202
						histfilepath)));
9203 9204 9205 9206
	fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
			startpoint.xlogid, startpoint.xrecoff, startxlogfilename);
	fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
			stoppoint.xlogid, stoppoint.xrecoff, stopxlogfilename);
9207
	/* transfer remaining lines from label to history file */
9208
	fprintf(fp, "%s", remaining);
9209 9210 9211 9212 9213
	fprintf(fp, "STOP TIME: %s\n", strfbuf);
	if (fflush(fp) || ferror(fp) || FreeFile(fp))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write file \"%s\": %m",
9214
						histfilepath)));
B
Bruce Momjian 已提交
9215

9216
	/*
B
Bruce Momjian 已提交
9217 9218 9219
	 * Clean out any no-longer-needed history files.  As a side effect, this
	 * will post a .ready file for the newly created history file, notifying
	 * the archiver that history file may be archived immediately.
9220
	 */
9221
	CleanupBackupHistory();
B
Bruce Momjian 已提交
9222

9223
	/*
9224
	 * If archiving is enabled, wait for all the required WAL files to be
B
Bruce Momjian 已提交
9225 9226 9227 9228 9229 9230
	 * archived before returning. If archiving isn't enabled, the required WAL
	 * needs to be transported via streaming replication (hopefully with
	 * wal_keep_segments set high enough), or some more exotic mechanism like
	 * polling and copying files from pg_xlog with script. We have no
	 * knowledge of those mechanisms, so it's up to the user to ensure that he
	 * gets all the required WAL.
9231 9232
	 *
	 * We wait until both the last WAL file filled during backup and the
B
Bruce Momjian 已提交
9233 9234 9235
	 * history file have been archived, and assume that the alphabetic sorting
	 * property of the WAL files ensures any earlier WAL files are safely
	 * archived as well.
9236
	 *
9237 9238
	 * We wait forever, since archive_command is supposed to work and we
	 * assume the admin wanted his backup to work completely. If you don't
B
Bruce Momjian 已提交
9239 9240
	 * wish to wait, you can set statement_timeout.  Also, some notices are
	 * issued to clue in anyone who might be doing this interactively.
9241
	 */
9242
	if (waitforarchive && XLogArchivingActive())
9243
	{
B
Bruce Momjian 已提交
9244 9245
		XLByteToPrevSeg(stoppoint, _logId, _logSeg);
		XLogFileName(lastxlogfilename, ThisTimeLineID, _logId, _logSeg);
9246

B
Bruce Momjian 已提交
9247 9248 9249
		XLByteToSeg(startpoint, _logId, _logSeg);
		BackupHistoryFileName(histfilename, ThisTimeLineID, _logId, _logSeg,
							  startpoint.xrecoff % XLogSegSize);
9250

B
Bruce Momjian 已提交
9251 9252
		seconds_before_warning = 60;
		waits = 0;
9253

B
Bruce Momjian 已提交
9254 9255
		while (XLogArchiveIsBusy(lastxlogfilename) ||
			   XLogArchiveIsBusy(histfilename))
9256
		{
B
Bruce Momjian 已提交
9257
			CHECK_FOR_INTERRUPTS();
9258

B
Bruce Momjian 已提交
9259 9260 9261 9262 9263 9264
			if (!reported_waiting && waits > 5)
			{
				ereport(NOTICE,
						(errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
				reported_waiting = true;
			}
9265

B
Bruce Momjian 已提交
9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277
			pg_usleep(1000000L);

			if (++waits >= seconds_before_warning)
			{
				seconds_before_warning *= 2;	/* This wraps in >10 years... */
				ereport(WARNING,
						(errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds elapsed)",
								waits),
						 errhint("Check that your archive_command is executing properly.  "
								 "pg_stop_backup can be cancelled safely, "
								 "but the database backup will not be usable without all the WAL segments.")));
			}
9278 9279
		}

B
Bruce Momjian 已提交
9280 9281
		ereport(NOTICE,
				(errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
9282
	}
9283
	else if (waitforarchive)
9284 9285
		ereport(NOTICE,
				(errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
9286

9287
	/*
9288
	 * We're done.  As a convenience, return the ending WAL location.
9289
	 */
9290 9291 9292 9293 9294 9295 9296
	return stoppoint;
}


/*
 * do_pg_abort_backup: abort a running backup
 *
9297
 * This does just the most basic steps of do_pg_stop_backup(), by taking the
9298 9299
 * system out of backup mode, thus making it a lot more safe to call from
 * an error handler.
9300 9301 9302 9303
 *
 * NB: This is only for aborting a non-exclusive backup that doesn't write
 * backup_label. A backup started with pg_stop_backup() needs to be finished
 * with pg_stop_backup().
9304 9305 9306 9307 9308
 */
void
do_pg_abort_backup(void)
{
	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
9309 9310
	Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
	XLogCtl->Insert.nonExclusiveBackups--;
9311

9312 9313 9314 9315 9316 9317
	if (!XLogCtl->Insert.exclusiveBackup &&
		XLogCtl->Insert.nonExclusiveBackups == 0)
	{
		XLogCtl->Insert.forcePageWrites = false;
	}
	LWLockRelease(WALInsertLock);
9318
}
9319

9320 9321 9322 9323 9324 9325
/*
 * pg_switch_xlog: switch to next xlog file
 */
Datum
pg_switch_xlog(PG_FUNCTION_ARGS)
{
B
Bruce Momjian 已提交
9326
	XLogRecPtr	switchpoint;
9327 9328 9329 9330 9331
	char		location[MAXFNAMELEN];

	if (!superuser())
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
B
Bruce Momjian 已提交
9332
			 (errmsg("must be superuser to switch transaction log files"))));
9333

9334 9335 9336 9337 9338 9339
	if (RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("WAL control functions cannot be executed during recovery.")));

9340 9341 9342 9343 9344 9345 9346
	switchpoint = RequestXLogSwitch();

	/*
	 * As a convenience, return the WAL location of the switch record
	 */
	snprintf(location, sizeof(location), "%X/%X",
			 switchpoint.xlogid, switchpoint.xrecoff);
9347
	PG_RETURN_TEXT_P(cstring_to_text(location));
9348 9349
}

9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382
/*
 * pg_create_restore_point: a named point for restore
 *
 * Superuser only, not allowed during recovery, and only available when
 * XLogIsNeeded() holds.  The name must be shorter than MAXFNAMELEN bytes.
 * The result is the WAL location of the restore point record, formatted as
 * text.
 */
Datum
pg_create_restore_point(PG_FUNCTION_ARGS)
{
	text	   *name_arg = PG_GETARG_TEXT_P(0);
	char	   *name_cstr;
	char		lsn_text[MAXFNAMELEN];
	XLogRecPtr	restore_lsn;

	if (!superuser())
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
				 errmsg("must be superuser to create a restore point")));

	if (RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("WAL control functions cannot be executed during recovery.")));

	if (!XLogIsNeeded())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("WAL level not sufficient for creating a restore point"),
				 errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));

	name_cstr = text_to_cstring(name_arg);

	if (strlen(name_cstr) >= MAXFNAMELEN)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("value too long for restore point (maximum %d characters)", MAXFNAMELEN - 1)));

	restore_lsn = XLogRestorePoint(name_cstr);

	/* As a convenience, hand back the location of the restore point record */
	snprintf(lsn_text, sizeof(lsn_text), "%X/%X",
			 restore_lsn.xlogid, restore_lsn.xrecoff);

	PG_RETURN_TEXT_P(cstring_to_text(lsn_text));
}

9395
/*
9396 9397 9398 9399 9400
 * Report the current WAL write location (same format as pg_start_backup etc)
 *
 * This is useful for determining how much of WAL is visible to an external
 * archiving process.  Note that the data before this point is written out
 * to the kernel, but is not necessarily synced to disk.
9401 9402 9403
 */
Datum
pg_current_xlog_location(PG_FUNCTION_ARGS)
9404 9405 9406
{
	char		location[MAXFNAMELEN];

9407 9408 9409 9410 9411 9412
	if (RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("WAL control functions cannot be executed during recovery.")));

9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424
	/* Make sure we have an up-to-date local LogwrtResult */
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile XLogCtlData *xlogctl = XLogCtl;

		SpinLockAcquire(&xlogctl->info_lck);
		LogwrtResult = xlogctl->LogwrtResult;
		SpinLockRelease(&xlogctl->info_lck);
	}

	snprintf(location, sizeof(location), "%X/%X",
			 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff);
9425
	PG_RETURN_TEXT_P(cstring_to_text(location));
9426 9427 9428 9429 9430 9431 9432 9433 9434
}

/*
 * Report the current WAL insert location (same format as pg_start_backup etc)
 *
 * This function is mostly for debugging purposes.
 */
Datum
pg_current_xlog_insert_location(PG_FUNCTION_ARGS)
9435 9436 9437 9438 9439
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	XLogRecPtr	current_recptr;
	char		location[MAXFNAMELEN];

9440 9441 9442 9443 9444 9445
	if (RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("WAL control functions cannot be executed during recovery.")));

9446 9447 9448 9449 9450 9451 9452 9453 9454
	/*
	 * Get the current end-of-WAL position ... shared lock is sufficient
	 */
	LWLockAcquire(WALInsertLock, LW_SHARED);
	INSERT_RECPTR(current_recptr, Insert, Insert->curridx);
	LWLockRelease(WALInsertLock);

	snprintf(location, sizeof(location), "%X/%X",
			 current_recptr.xlogid, current_recptr.xrecoff);
9455
	PG_RETURN_TEXT_P(cstring_to_text(location));
9456 9457
}

9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469
/*
 * Report the last WAL receive location (same format as pg_start_backup etc)
 *
 * This is useful for determining how much of WAL is guaranteed to be received
 * and synced to disk by walreceiver.
 */
Datum
pg_last_xlog_receive_location(PG_FUNCTION_ARGS)
{
	XLogRecPtr	recptr;
	char		location[MAXFNAMELEN];

9470
	recptr = GetWalRcvWriteRecPtr(NULL);
9471

9472 9473 9474
	if (recptr.xlogid == 0 && recptr.xrecoff == 0)
		PG_RETURN_NULL();

9475 9476 9477 9478 9479
	snprintf(location, sizeof(location), "%X/%X",
			 recptr.xlogid, recptr.xrecoff);
	PG_RETURN_TEXT_P(cstring_to_text(location));
}

9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498
/*
 * GetXLogReplayRecPtr
 *
 * Return the latest redo apply position, i.e. the shared
 * recoveryLastRecPtr value, read while holding info_lck.
 *
 * Exported to allow WALReceiver to read the pointer directly.
 */
XLogRecPtr
GetXLogReplayRecPtr(void)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile XLogCtlData *xlogctl = XLogCtl;
	XLogRecPtr	replayed;

	SpinLockAcquire(&xlogctl->info_lck);
	replayed = xlogctl->recoveryLastRecPtr;
	SpinLockRelease(&xlogctl->info_lck);

	return replayed;
}

9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510
/*
 * Report the last WAL replay location (same format as pg_start_backup etc)
 *
 * This is useful for determining how much of WAL is visible to read-only
 * connections during recovery.
 */
Datum
pg_last_xlog_replay_location(PG_FUNCTION_ARGS)
{
	XLogRecPtr	recptr;
	char		location[MAXFNAMELEN];

9511
	recptr = GetXLogReplayRecPtr();
9512

9513 9514 9515
	if (recptr.xlogid == 0 && recptr.xrecoff == 0)
		PG_RETURN_NULL();

9516 9517 9518 9519 9520
	snprintf(location, sizeof(location), "%X/%X",
			 recptr.xlogid, recptr.xrecoff);
	PG_RETURN_TEXT_P(cstring_to_text(location));
}

9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540
/*
 * Compute an xlog file name and decimal byte offset given a WAL location,
 * such as is returned by pg_stop_backup() or pg_xlog_switch().
 *
 * Note that a location exactly at a segment boundary is taken to be in
 * the previous segment.  This is usually the right thing, since the
 * expected usage is to determine which xlog file(s) are ready to archive.
 */
Datum
pg_xlogfile_name_offset(PG_FUNCTION_ARGS)
{
	text	   *location = PG_GETARG_TEXT_P(0);
	char	   *locationstr;
	unsigned int uxlogid;
	unsigned int uxrecoff;
	uint32		xlogid;
	uint32		xlogseg;
	uint32		xrecoff;
	XLogRecPtr	locationpoint;
	char		xlogfilename[MAXFNAMELEN];
B
Bruce Momjian 已提交
9541 9542 9543 9544 9545
	Datum		values[2];
	bool		isnull[2];
	TupleDesc	resultTupleDesc;
	HeapTuple	resultHeapTuple;
	Datum		result;
9546

9547 9548 9549 9550 9551 9552
	if (RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("pg_xlogfile_name_offset() cannot be executed during recovery.")));

9553 9554 9555
	/*
	 * Read input and parse
	 */
9556
	locationstr = text_to_cstring(location);
9557 9558 9559 9560

	if (sscanf(locationstr, "%X/%X", &uxlogid, &uxrecoff) != 2)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
P
Peter Eisentraut 已提交
9561
				 errmsg("could not parse transaction log location \"%s\"",
9562 9563 9564 9565 9566
						locationstr)));

	locationpoint.xlogid = uxlogid;
	locationpoint.xrecoff = uxrecoff;

9567
	/*
B
Bruce Momjian 已提交
9568 9569
	 * Construct a tuple descriptor for the result row.  This must match this
	 * function's pg_proc entry!
9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581
	 */
	resultTupleDesc = CreateTemplateTupleDesc(2, false);
	TupleDescInitEntry(resultTupleDesc, (AttrNumber) 1, "file_name",
					   TEXTOID, -1, 0);
	TupleDescInitEntry(resultTupleDesc, (AttrNumber) 2, "file_offset",
					   INT4OID, -1, 0);

	resultTupleDesc = BlessTupleDesc(resultTupleDesc);

	/*
	 * xlogfilename
	 */
9582 9583 9584
	XLByteToPrevSeg(locationpoint, xlogid, xlogseg);
	XLogFileName(xlogfilename, ThisTimeLineID, xlogid, xlogseg);

9585
	values[0] = CStringGetTextDatum(xlogfilename);
9586 9587 9588 9589 9590
	isnull[0] = false;

	/*
	 * offset
	 */
9591 9592
	xrecoff = locationpoint.xrecoff - xlogseg * XLogSegSize;

9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603
	values[1] = UInt32GetDatum(xrecoff);
	isnull[1] = false;

	/*
	 * Tuple jam: Having first prepared your Datums, then squash together
	 */
	resultHeapTuple = heap_form_tuple(resultTupleDesc, values, isnull);

	result = HeapTupleGetDatum(resultHeapTuple);

	PG_RETURN_DATUM(result);
9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621
}

/*
 * Compute an xlog file name given a WAL location,
 * such as is returned by pg_stop_backup() or pg_xlog_switch().
 */
Datum
pg_xlogfile_name(PG_FUNCTION_ARGS)
{
	text	   *location = PG_GETARG_TEXT_P(0);
	char	   *locationstr;
	unsigned int uxlogid;
	unsigned int uxrecoff;
	uint32		xlogid;
	uint32		xlogseg;
	XLogRecPtr	locationpoint;
	char		xlogfilename[MAXFNAMELEN];

9622 9623 9624 9625
	if (RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
B
Bruce Momjian 已提交
9626
		 errhint("pg_xlogfile_name() cannot be executed during recovery.")));
9627

9628
	locationstr = text_to_cstring(location);
9629 9630 9631 9632

	if (sscanf(locationstr, "%X/%X", &uxlogid, &uxrecoff) != 2)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
P
Peter Eisentraut 已提交
9633
				 errmsg("could not parse transaction log location \"%s\"",
9634 9635 9636 9637 9638 9639 9640 9641
						locationstr)));

	locationpoint.xlogid = uxlogid;
	locationpoint.xrecoff = uxrecoff;

	XLByteToPrevSeg(locationpoint, xlogid, xlogseg);
	XLogFileName(xlogfilename, ThisTimeLineID, xlogid, xlogseg);

9642
	PG_RETURN_TEXT_P(cstring_to_text(xlogfilename));
9643 9644
}

9645 9646 9647 9648 9649
/*
 * read_backup_label: check to see if a backup_label file is present
 *
 * If we see a backup_label during recovery, we assume that we are recovering
 * from a backup dump file, and we therefore roll forward from the checkpoint
B
Bruce Momjian 已提交
9650
 * identified by the label file, NOT what pg_control says.	This avoids the
9651 9652 9653 9654 9655
 * problem that pg_control might have been archived one or more checkpoints
 * later than the start of the dump, and so if we rely on it as the start
 * point, we will fail to restore a consistent database state.
 *
 * Returns TRUE if a backup_label was found (and fills the checkpoint
9656 9657
 * location and its REDO location into *checkPointLoc and RedoStartLSN,
 * respectively); returns FALSE if not.
9658 9659
 */
static bool
9660
read_backup_label(XLogRecPtr *checkPointLoc)
9661 9662 9663 9664 9665 9666 9667 9668 9669
{
	char		startxlogfilename[MAXFNAMELEN];
	TimeLineID	tli;
	FILE	   *lfp;
	char		ch;

	/*
	 * See if label file is present
	 */
9670
	lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
9671 9672 9673 9674 9675 9676
	if (!lfp)
	{
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
9677
							BACKUP_LABEL_FILE)));
9678 9679
		return false;			/* it's not there, all is fine */
	}
B
Bruce Momjian 已提交
9680

9681
	/*
B
Bruce Momjian 已提交
9682 9683 9684
	 * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
	 * is pretty crude, but we are not expecting any variability in the file
	 * format).
9685 9686
	 */
	if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
9687
			   &RedoStartLSN.xlogid, &RedoStartLSN.xrecoff, &tli,
9688 9689 9690
			   startxlogfilename, &ch) != 5 || ch != '\n')
		ereport(FATAL,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9691
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
9692 9693 9694 9695 9696
	if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
			   &checkPointLoc->xlogid, &checkPointLoc->xrecoff,
			   &ch) != 3 || ch != '\n')
		ereport(FATAL,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9697
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
9698 9699 9700 9701
	if (ferror(lfp) || FreeFile(lfp))
		ereport(FATAL,
				(errcode_for_file_access(),
				 errmsg("could not read file \"%s\": %m",
9702
						BACKUP_LABEL_FILE)));
B
Bruce Momjian 已提交
9703

9704 9705 9706
	return true;
}

9707 9708 9709 9710 9711 9712
/*
 * Error context callback for errors occurring during rm_redo().
 */
static void
rm_redo_error_callback(void *arg)
{
B
Bruce Momjian 已提交
9713 9714
	XLogRecord *record = (XLogRecord *) arg;
	StringInfoData buf;
9715 9716

	initStringInfo(&buf);
9717 9718
	RmgrTable[record->xl_rmid].rm_desc(&buf,
									   record->xl_info,
9719 9720 9721 9722 9723 9724 9725 9726
									   XLogRecGetData(record));

	/* don't bother emitting empty description */
	if (buf.len > 0)
		errcontext("xlog redo %s", buf.data);

	pfree(buf.data);
}
9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763

/*
 * BackupInProgress: check if online backup mode is active
 *
 * This is done by checking for existence of the "backup_label" file.
 */
bool
BackupInProgress(void)
{
	struct stat stat_buf;

	return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
}

/*
 * CancelBackup: rename the "backup_label" file to cancel backup mode
 *
 * If the "backup_label" file exists, it will be renamed to "backup_label.old".
 * Note that this will render an online backup in progress useless.
 * To correctly finish an online backup, pg_stop_backup must be called.
 *
 * Success is logged at LOG level; a failed rename only produces a WARNING.
 */
void
CancelBackup(void)
{
	struct stat st;

	/* nothing to do unless the label file is there */
	if (stat(BACKUP_LABEL_FILE, &st) < 0)
		return;

	/*
	 * Get rid of a leftover file from a previously cancelled backup, if
	 * any.  The result is deliberately not checked.
	 */
	unlink(BACKUP_LABEL_OLD);

	if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
	{
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("online backup mode was not cancelled"),
				 errdetail("Could not rename \"%s\" to \"%s\": %m.",
						   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
		return;
	}

	ereport(LOG,
			(errmsg("online backup mode cancelled"),
			 errdetail("\"%s\" was renamed to \"%s\".",
					   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
}

9777
/* ------------------------------------------------------
 *	Startup Process main entry point and signal handlers
 * ------------------------------------------------------
 */

/*
 * startupproc_quickdie() occurs when signalled SIGQUIT by the postmaster.
 *
 * Some backend has bought the farm,
 * so we need to stop what we're doing and exit.
 *
 * Blocks further signals, discards all registered exit callbacks, and
 * terminates the process with exit status 2.
 */
static void
startupproc_quickdie(SIGNAL_ARGS)
{
	PG_SETMASK(&BlockSig);

	/*
	 * We DO NOT want to run proc_exit() callbacks -- we're here because
	 * shared memory may be corrupted, so we don't want to try to clean up our
	 * transaction.  Just nail the windows shut and get out of town.  Now that
	 * there's an atexit callback to prevent third-party code from breaking
	 * things by calling exit() directly, we have to reset the callbacks
	 * explicitly to make this work as intended.
	 */
	on_exit_reset();

	/*
	 * Note we do exit(2) not exit(0).  This is to force the postmaster into a
	 * system reset cycle if someone sends a manual SIGQUIT to a random
	 * backend.  This is necessary precisely because we don't clean up our
	 * shared memory state.  (The "dead man switch" mechanism in pmsignal.c
	 * should ensure the postmaster sees this as a crash, too, but no harm in
	 * being doubly sure.)
	 */
	exit(2);
}


9815 9816 9817 9818 9819 9820 9821
/*
 * SIGUSR1: let latch facility handle the signal
 *
 * errno is saved on entry and restored on exit, so that whatever the latch
 * code does cannot disturb the errno value seen by the interrupted code.
 */
static void
StartupProcSigUsr1Handler(SIGNAL_ARGS)
{
	int			save_errno = errno;

	latch_sigusr1_handler();

	errno = save_errno;
}

R
Robert Haas 已提交
9822 9823 9824 9825 9826 9827 9828 9829
/*
 * SIGUSR2: set flag to finish recovery
 *
 * Sets promote_triggered and wakes up the recovery loop.  errno is saved on
 * entry and restored on exit, so that the wakeup call cannot disturb the
 * errno value seen by the interrupted code.
 */
static void
StartupProcTriggerHandler(SIGNAL_ARGS)
{
	int			save_errno = errno;

	promote_triggered = true;
	WakeupRecovery();

	errno = save_errno;
}

9830 9831 9832 9833 9834
/*
 * SIGHUP: set flag to re-read config file at next convenient time
 *
 * Sets got_SIGHUP and wakes up the recovery loop.  errno is saved on entry
 * and restored on exit, so that the wakeup call cannot disturb the errno
 * value seen by the interrupted code.
 */
static void
StartupProcSigHupHandler(SIGNAL_ARGS)
{
	int			save_errno = errno;

	got_SIGHUP = true;
	WakeupRecovery();

	errno = save_errno;
}

9838 9839 9840 9841 9842
/* SIGTERM: set flag to abort redo and exit */
static void
StartupProcShutdownHandler(SIGNAL_ARGS)
{
	if (in_restore_command)
9843
		proc_exit(1);
9844 9845
	else
		shutdown_requested = true;
9846
	WakeupRecovery();
9847 9848
}

9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860
/* Handle SIGHUP and SIGTERM signals of startup process */
void
HandleStartupProcInterrupts(void)
{
	/*
	 * Check if we were requested to re-read config file.
	 */
	if (got_SIGHUP)
	{
		got_SIGHUP = false;
		ProcessConfigFile(PGC_SIGHUP);
	}
B
Bruce Momjian 已提交
9861

9862 9863 9864 9865 9866
	/*
	 * Check if we were requested to exit without finishing recovery.
	 */
	if (shutdown_requested)
		proc_exit(1);
9867 9868 9869 9870 9871 9872 9873

	/*
	 * Emergency bailout if postmaster has died.  This is to avoid the
	 * necessity for manual cleanup of all postmaster children.
	 */
	if (IsUnderPostmaster && !PostmasterIsAlive(true))
		exit(1);
9874 9875
}

9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889
/* Main entry point for startup process */
void
StartupProcessMain(void)
{
	/*
	 * If possible, make this process a group leader, so that the postmaster
	 * can signal any child processes too.
	 */
#ifdef HAVE_SETSID
	if (setsid() < 0)
		elog(FATAL, "setsid() failed: %m");
#endif

	/*
9890 9891 9892 9893 9894
	 * Properly accept or ignore signals the postmaster might send us.
	 *
	 * Note: ideally we'd not enable handle_standby_sig_alarm unless actually
	 * doing hot standby, but we don't know that yet.  Rely on it to not do
	 * anything if it shouldn't.
9895
	 */
9896 9897
	pqsignal(SIGHUP, StartupProcSigHupHandler); /* reload config file */
	pqsignal(SIGINT, SIG_IGN);	/* ignore query cancel */
B
Bruce Momjian 已提交
9898 9899
	pqsignal(SIGTERM, StartupProcShutdownHandler);		/* request shutdown */
	pqsignal(SIGQUIT, startupproc_quickdie);	/* hard crash time */
9900
	if (EnableHotStandby)
B
Bruce Momjian 已提交
9901 9902
		pqsignal(SIGALRM, handle_standby_sig_alarm);	/* ignored unless
														 * InHotStandby */
9903 9904
	else
		pqsignal(SIGALRM, SIG_IGN);
9905
	pqsignal(SIGPIPE, SIG_IGN);
9906
	pqsignal(SIGUSR1, StartupProcSigUsr1Handler);
R
Robert Haas 已提交
9907
	pqsignal(SIGUSR2, StartupProcTriggerHandler);
9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922

	/*
	 * Reset some signals that are accepted by postmaster but not here
	 */
	pqsignal(SIGCHLD, SIG_DFL);
	pqsignal(SIGTTIN, SIG_DFL);
	pqsignal(SIGTTOU, SIG_DFL);
	pqsignal(SIGCONT, SIG_DFL);
	pqsignal(SIGWINCH, SIG_DFL);

	/*
	 * Unblock signals (they were blocked when the postmaster forked us)
	 */
	PG_SETMASK(&UnBlockSig);

9923
	StartupXLOG();
9924

9925
	/*
9926 9927
	 * Exit normally. Exit code 0 tells postmaster that we completed recovery
	 * successfully.
9928
	 */
9929 9930
	proc_exit(0);
}
9931 9932 9933

/*
 * Read the XLOG page containing RecPtr into readBuf (if not read already).
 * Returns true if the page is read successfully.
 *
 * This is responsible for restoring files from archive as needed, as well
 * as for waiting for the requested WAL record to arrive in standby mode.
 *
 * 'emode' specifies the log level used for reporting "file not found" or
 * "end of WAL" situations in archive recovery, or in standby mode when a
 * trigger file is found. If set to WARNING or below, XLogPageRead() returns
 * false in those situations, on higher log levels the ereport() won't
 * return.
 *
 * 'fetching_ckpt' selects RedoStartLSN rather than *RecPtr as the position
 * from which streaming is requested.  'randAccess' causes curFileTLI to be
 * reset before a segment file is looked up.
 *
 * In standby mode, if after a successful return of XLogPageRead() the
 * caller finds the record it's interested in to be broken, it should
 * ereport the error with the level determined by
 * emode_for_corrupt_record(), and then set "failedSources |= readSource"
 * and call XLogPageRead() again with the same arguments. This lets
 * XLogPageRead() try fetching the record from another source, or
 * sleep and retry.
 */
static bool
XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
			 bool randAccess)
{
	static XLogRecPtr receivedUpto = {0, 0};
	bool		switched_segment = false;
	uint32		targetPageOff;
	uint32		targetRecOff;
	uint32		targetId;
	uint32		targetSeg;
	static pg_time_t last_fail_time = 0;

	XLByteToSeg(*RecPtr, targetId, targetSeg);
	targetPageOff = ((RecPtr->xrecoff % XLogSegSize) / XLOG_BLCKSZ) * XLOG_BLCKSZ;
	targetRecOff = RecPtr->xrecoff % XLOG_BLCKSZ;

	/* Fast exit if we have read the record in the current buffer already */
	if (failedSources == 0 && targetId == readId && targetSeg == readSeg &&
		targetPageOff == readOff && targetRecOff < readLen)
		return true;

	/*
	 * See if we need to switch to a new segment because the requested record
	 * is not in the currently open one.
	 */
	if (readFile >= 0 && !XLByteInSeg(*RecPtr, readId, readSeg))
	{
		/*
		 * Signal bgwriter to start a restartpoint if we've replayed too much
		 * xlog since the last one.
		 */
		if (StandbyMode && bgwriterLaunched)
		{
			if (XLogCheckpointNeeded(readId, readSeg))
			{
				(void) GetRedoRecPtr();
				if (XLogCheckpointNeeded(readId, readSeg))
					RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
			}
		}

		close(readFile);
		readFile = -1;
		readSource = 0;
	}

	XLByteToSeg(*RecPtr, readId, readSeg);

retry:
	/* See if we need to retrieve more data */
	if (readFile < 0 ||
		(readSource == XLOG_FROM_STREAM && !XLByteLT(*RecPtr, receivedUpto)))
	{
		if (StandbyMode)
		{
			/*
			 * In standby mode, wait for the requested record to become
			 * available, either via restore_command succeeding to restore the
			 * segment, or via walreceiver having streamed the record.
			 */
			for (;;)
			{
				if (WalRcvInProgress())
				{
					bool		havedata;

					/*
					 * If we find an invalid record in the WAL streamed from
					 * master, something is seriously wrong. There's little
					 * chance that the problem will just go away, but PANIC is
					 * not good for availability either, especially in hot
					 * standby mode. Disconnect, and retry from
					 * archive/pg_xlog again. The WAL in the archive should be
					 * identical to what was streamed, so it's unlikely that
					 * it helps, but one can hope...
					 */
					if (failedSources & XLOG_FROM_STREAM)
					{
						ShutdownWalRcv();
						continue;
					}

					/*
					 * Walreceiver is active, so see if new data has arrived.
					 *
					 * We only advance XLogReceiptTime when we obtain fresh
					 * WAL from walreceiver and observe that we had already
					 * processed everything before the most recent "chunk"
					 * that it flushed to disk.  In steady state where we are
					 * keeping up with the incoming data, XLogReceiptTime will
					 * be updated on each cycle.  When we are behind,
					 * XLogReceiptTime will not advance, so the grace time
					 * allotted to conflicting queries will decrease.
					 */
					if (XLByteLT(*RecPtr, receivedUpto))
						havedata = true;
					else
					{
						XLogRecPtr	latestChunkStart;

						receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart);
						if (XLByteLT(*RecPtr, receivedUpto))
						{
							havedata = true;
							if (!XLByteLT(*RecPtr, latestChunkStart))
								XLogReceiptTime = GetCurrentTimestamp();
						}
						else
							havedata = false;
					}
					if (havedata)
					{
						/*
						 * Great, streamed far enough. Open the file if it's
						 * not open already.  Use XLOG_FROM_STREAM so that
						 * source info is set correctly and XLogReceiptTime
						 * isn't changed.
						 */
						if (readFile < 0)
						{
							readFile =
								XLogFileRead(readId, readSeg, PANIC,
											 recoveryTargetTLI,
											 XLOG_FROM_STREAM, false);
							Assert(readFile >= 0);
							switched_segment = true;
						}
						else
						{
							/* just make sure source info is correct... */
							readSource = XLOG_FROM_STREAM;
							XLogReceiptSource = XLOG_FROM_STREAM;
						}
						break;
					}

					/*
					 * Data not here yet, so check for trigger then sleep for
					 * five seconds like in the WAL file polling case below.
					 */
					if (CheckForStandbyTrigger())
						goto retry;

					/*
					 * Wait for more WAL to arrive, or timeout to be reached
					 */
					WaitLatch(&XLogCtl->recoveryWakeupLatch, 5000000L);
					ResetLatch(&XLogCtl->recoveryWakeupLatch);
				}
				else
				{
					int			sources;
					pg_time_t	now;
					pg_time_t	elapsed;

					/*
					 * Until walreceiver manages to reconnect, poll the
					 * archive.
					 */
					if (readFile >= 0)
					{
						close(readFile);
						readFile = -1;
					}
					/* Reset curFileTLI if random fetch. */
					if (randAccess)
						curFileTLI = 0;

					/*
					 * Try to restore the file from archive, or read an
					 * existing file from pg_xlog.
					 */
					sources = XLOG_FROM_ARCHIVE | XLOG_FROM_PG_XLOG;
					if (!(sources & ~failedSources))
					{
						/*
						 * We've exhausted all options for retrieving the
						 * file. Retry.
						 */
						failedSources = 0;

						/*
						 * Before we sleep, re-scan for possible new timelines
						 * if we were requested to recover to the latest
						 * timeline.
						 */
						if (recoveryTargetIsLatest)
						{
							if (rescanLatestTimeLine())
								continue;
						}

						/*
						 * If it hasn't been long since last attempt, sleep
						 * to avoid busy-waiting.
						 *
						 * If the system clock was stepped backwards since
						 * the last attempt, the elapsed time comes out
						 * negative; treat that as zero so that we never
						 * sleep for more than five seconds.
						 */
						now = (pg_time_t) time(NULL);
						elapsed = now - last_fail_time;
						if (elapsed < 0)
							elapsed = 0;
						if (elapsed < 5)
						{
							pg_usleep(1000000L * (5 - elapsed));
							now = (pg_time_t) time(NULL);
						}
						last_fail_time = now;

						/*
						 * If primary_conninfo is set, launch walreceiver to
						 * try to stream the missing WAL, before retrying to
						 * restore from archive/pg_xlog.
						 *
						 * If fetching_ckpt is TRUE, RecPtr points to the
						 * initial checkpoint location. In that case, we use
						 * RedoStartLSN as the streaming start position
						 * instead of RecPtr, so that when we later jump
						 * backwards to start redo at RedoStartLSN, we will
						 * have the logs streamed already.
						 */
						if (PrimaryConnInfo)
						{
							RequestXLogStreaming(
									  fetching_ckpt ? RedoStartLSN : *RecPtr,
												 PrimaryConnInfo);
							continue;
						}
					}
					/* Don't try to read from a source that just failed */
					sources &= ~failedSources;
					readFile = XLogFileReadAnyTLI(readId, readSeg, DEBUG2,
												  sources);
					switched_segment = true;
					if (readFile >= 0)
						break;

					/*
					 * Nope, not found in archive and/or pg_xlog.
					 */
					failedSources |= sources;

					/*
					 * Check to see if the trigger file exists. Note that we
					 * do this only after failure, so when you create the
					 * trigger file, we still finish replaying as much as we
					 * can from archive and pg_xlog before failover.
					 */
					if (CheckForStandbyTrigger())
						goto triggered;
				}

				/*
				 * This possibly-long loop needs to handle interrupts of
				 * startup process.
				 */
				HandleStartupProcInterrupts();
			}
		}
		else
		{
			/* In archive or crash recovery. */
			if (readFile < 0)
			{
				int			sources;

				/* Reset curFileTLI if random fetch. */
				if (randAccess)
					curFileTLI = 0;

				sources = XLOG_FROM_PG_XLOG;
				if (InArchiveRecovery)
					sources |= XLOG_FROM_ARCHIVE;

				readFile = XLogFileReadAnyTLI(readId, readSeg, emode,
											  sources);
				switched_segment = true;
				if (readFile < 0)
					return false;
			}
		}
	}

	/*
	 * At this point, we have the right segment open and if we're streaming we
	 * know the requested record is in it.
	 */
	Assert(readFile != -1);

	/*
	 * If the current segment is being streamed from master, calculate how
	 * much of the current page we have received already. We know the
	 * requested record has been received, but this is for the benefit of
	 * future calls, to allow quick exit at the top of this function.
	 */
	if (readSource == XLOG_FROM_STREAM)
	{
		if (RecPtr->xlogid != receivedUpto.xlogid ||
			(RecPtr->xrecoff / XLOG_BLCKSZ) != (receivedUpto.xrecoff / XLOG_BLCKSZ))
		{
			readLen = XLOG_BLCKSZ;
		}
		else
			readLen = receivedUpto.xrecoff % XLogSegSize - targetPageOff;
	}
	else
		readLen = XLOG_BLCKSZ;

	if (switched_segment && targetPageOff != 0)
	{
		/*
		 * Whenever switching to a new WAL segment, we read the first page of
		 * the file and validate its header, even if that's not where the
		 * target record is.  This is so that we can check the additional
		 * identification info that is present in the first page's "long"
		 * header.
		 */
		readOff = 0;
		if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
		{
			ereport(emode_for_corrupt_record(emode, *RecPtr),
					(errcode_for_file_access(),
					 errmsg("could not read from log file %u, segment %u, offset %u: %m",
							readId, readSeg, readOff)));
			goto next_record_is_invalid;
		}
		if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
			goto next_record_is_invalid;
	}

	/* Read the requested page */
	readOff = targetPageOff;
	if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
	{
		ereport(emode_for_corrupt_record(emode, *RecPtr),
				(errcode_for_file_access(),
		 errmsg("could not seek in log file %u, segment %u to offset %u: %m",
				readId, readSeg, readOff)));
		goto next_record_is_invalid;
	}
	if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
	{
		ereport(emode_for_corrupt_record(emode, *RecPtr),
				(errcode_for_file_access(),
		 errmsg("could not read from log file %u, segment %u, offset %u: %m",
				readId, readSeg, readOff)));
		goto next_record_is_invalid;
	}
	if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
		goto next_record_is_invalid;

	Assert(targetId == readId);
	Assert(targetSeg == readSeg);
	Assert(targetPageOff == readOff);
	Assert(targetRecOff < readLen);

	return true;

next_record_is_invalid:
	/* Remember that the current source failed, and forget the open file */
	failedSources |= readSource;

	if (readFile >= 0)
		close(readFile);
	readFile = -1;
	readLen = 0;
	readSource = 0;

	/* In standby-mode, keep trying */
	if (StandbyMode)
		goto retry;
	else
		return false;

triggered:
	/* Standby trigger seen: forget the open file and report failure */
	if (readFile >= 0)
		close(readFile);
	readFile = -1;
	readLen = 0;
	readSource = 0;

	return false;
}

10330 10331 10332 10333 10334
/*
 * Determine what log level should be used to report a corrupt WAL record
 * in the current WAL page, previously read by XLogPageRead().
 *
 * 'emode' is the error mode that would be used to report a file-not-found
B
Bruce Momjian 已提交
10335
 * or legitimate end-of-WAL situation.	 Generally, we use it as-is, but if
10336
 * we're retrying the exact same record that we've tried previously, only
B
Bruce Momjian 已提交
10337
 * complain the first time to keep the noise down.	However, we only do when
10338 10339 10340
 * reading from pg_xlog, because we don't expect any invalid records in archive
 * or in records streamed from master. Files in the archive should be complete,
 * and we should never hit the end of WAL because we stop and wait for more WAL
B
Bruce Momjian 已提交
10341
 * to arrive before replaying it.
10342 10343 10344 10345 10346
 *
 * NOTE: This function remembers the RecPtr value it was last called with,
 * to suppress repeated messages about the same record. Only call this when
 * you are about to ereport(), or you might cause a later message to be
 * erroneously suppressed.
10347 10348
 */
static int
10349
emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
10350
{
10351 10352
	static XLogRecPtr lastComplaint = {0, 0};

10353
	if (readSource == XLOG_FROM_PG_XLOG && emode == LOG)
10354 10355 10356 10357 10358 10359
	{
		if (XLByteEQ(RecPtr, lastComplaint))
			emode = DEBUG1;
		else
			lastComplaint = RecPtr;
	}
10360 10361 10362
	return emode;
}

10363
/*
R
Robert Haas 已提交
10364 10365 10366
 * Check to see whether the user-specified trigger file exists and whether a
 * promote request has arrived.  If either condition holds, request postmaster
 * to shut down walreceiver, wait for it to exit, and return true.
10367 10368 10369 10370 10371
 */
static bool
CheckForStandbyTrigger(void)
{
	struct stat stat_buf;
10372 10373 10374 10375
	static bool	triggered = false;

	if (triggered)
		return true;
10376

R
Robert Haas 已提交
10377 10378 10379 10380 10381 10382 10383 10384 10385 10386
	if (promote_triggered)
	{
		ereport(LOG,
				(errmsg("received promote request")));
		ShutdownWalRcv();
		promote_triggered = false;
		triggered = true;
		return true;
	}

10387 10388 10389 10390 10391 10392 10393 10394 10395
	if (TriggerFile == NULL)
		return false;

	if (stat(TriggerFile, &stat_buf) == 0)
	{
		ereport(LOG,
				(errmsg("trigger file found: %s", TriggerFile)));
		ShutdownWalRcv();
		unlink(TriggerFile);
10396
		triggered = true;
10397 10398 10399 10400
		return true;
	}
	return false;
}
10401

R
Robert Haas 已提交
10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422
/*
 * Check to see if a promote request has arrived. Should be
 * called by postmaster after receiving SIGUSR1.
 *
 * Returns true, after removing the promote signal file, if that file
 * exists; returns false otherwise.
 */
bool
CheckPromoteSignal(void)
{
	struct stat st;

	if (stat(PROMOTE_SIGNAL_FILE, &st) != 0)
		return false;

	/*
	 * Since we are in a signal handler, it's not safe
	 * to elog. We silently ignore any error from unlink.
	 */
	unlink(PROMOTE_SIGNAL_FILE);
	return true;
}

10423 10424 10425 10426 10427 10428 10429 10430 10431
/*
 * Wake up startup process to replay newly arrived WAL, or to notice that
 * failover has been requested.
 *
 * This simply sets the shared recoveryWakeupLatch, which XLogPageRead()
 * waits on while no new WAL is available.
 */
void
WakeupRecovery(void)
{
	SetLatch(&XLogCtl->recoveryWakeupLatch);
}